All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC] nfsd: serialize layout stateid morphing operations
@ 2015-09-17 11:58 Jeff Layton
  2015-10-11 13:15 ` Christoph Hellwig
  2015-11-29  4:07 ` Kinglong Mee
  0 siblings, 2 replies; 26+ messages in thread
From: Jeff Layton @ 2015-09-17 11:58 UTC (permalink / raw)
  To: bfields; +Cc: Christoph Hellwig, linux-nfs

In order to allow the client to make a sane determination of what
happened with racing LAYOUTGET/LAYOUTRETURN/CB_LAYOUTRECALL calls, we
must ensure that the seqids returned accurately represent the order of
operations. The simplest way to do that is to ensure that operations on
a single stateid are serialized.

This patch adds a mutex to the layout stateid, and locks it when
checking the layout stateid's seqid. The mutex is held over the entire
operation and released after the seqid is bumped.

Note that in the case of CB_LAYOUTRECALL we must move the increment of
the seqid and setting into a new cb "prepare" operation. The lease
infrastructure will call the lm_break callback with a spinlock held, so
we can't take the mutex in that codepath.

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/nfsd/nfs4layouts.c | 25 +++++++++++++++++++++----
 fs/nfsd/nfs4proc.c    |  4 ++++
 fs/nfsd/state.h       |  1 +
 3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index ebf90e487c75..4a68ab901b4b 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -201,6 +201,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
 	INIT_LIST_HEAD(&ls->ls_perfile);
 	spin_lock_init(&ls->ls_lock);
 	INIT_LIST_HEAD(&ls->ls_layouts);
+	mutex_init(&ls->ls_mutex);
 	ls->ls_layout_type = layout_type;
 	nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
 			NFSPROC4_CLNT_CB_LAYOUT);
@@ -262,19 +263,23 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
 		status = nfserr_jukebox;
 		if (!ls)
 			goto out;
+		mutex_lock(&ls->ls_mutex);
 	} else {
 		ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
 
 		status = nfserr_bad_stateid;
+		mutex_lock(&ls->ls_mutex);
 		if (stateid->si_generation > stid->sc_stateid.si_generation)
-			goto out_put_stid;
+			goto out_unlock_stid;
 		if (layout_type != ls->ls_layout_type)
-			goto out_put_stid;
+			goto out_unlock_stid;
 	}
 
 	*lsp = ls;
 	return 0;
 
+out_unlock_stid:
+	mutex_unlock(&ls->ls_mutex);
 out_put_stid:
 	nfs4_put_stid(stid);
 out:
@@ -296,8 +301,6 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
 	trace_layout_recall(&ls->ls_stid.sc_stateid);
 
 	atomic_inc(&ls->ls_stid.sc_count);
-	update_stateid(&ls->ls_stid.sc_stateid);
-	memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
 	nfsd4_run_cb(&ls->ls_recall);
 
 out_unlock:
@@ -494,6 +497,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
 	}
 	spin_unlock(&ls->ls_lock);
 
+	mutex_unlock(&ls->ls_mutex);
 	nfs4_put_stid(&ls->ls_stid);
 	nfsd4_free_layouts(&reaplist);
 	return nfs_ok;
@@ -608,6 +612,17 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
 	}
 }
 
+static void
+nfsd4_cb_layout_prepare(struct nfsd4_callback *cb)
+{
+	struct nfs4_layout_stateid *ls =
+		container_of(cb, struct nfs4_layout_stateid, ls_recall);
+
+	mutex_lock(&ls->ls_mutex);
+	update_stateid(&ls->ls_stid.sc_stateid);
+	memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+}
+
 static int
 nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
 {
@@ -649,12 +664,14 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb)
 
 	trace_layout_recall_release(&ls->ls_stid.sc_stateid);
 
+	mutex_unlock(&ls->ls_mutex);
 	nfsd4_return_all_layouts(ls, &reaplist);
 	nfsd4_free_layouts(&reaplist);
 	nfs4_put_stid(&ls->ls_stid);
 }
 
 static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
+	.prepare	= nfsd4_cb_layout_prepare,
 	.done		= nfsd4_cb_layout_done,
 	.release	= nfsd4_cb_layout_release,
 };
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 4ce6b97b31ad..a9f096c7e99f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1309,6 +1309,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp,
 	nfserr = nfsd4_insert_layout(lgp, ls);
 
 out_put_stid:
+	mutex_unlock(&ls->ls_mutex);
 	nfs4_put_stid(&ls->ls_stid);
 out:
 	return nfserr;
@@ -1362,6 +1363,9 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
 		goto out;
 	}
 
+	/* LAYOUTCOMMIT does not require any serialization */
+	mutex_unlock(&ls->ls_mutex);
+
 	if (new_size > i_size_read(inode)) {
 		lcp->lc_size_chg = 1;
 		lcp->lc_newsize = new_size;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 31bde12feefe..1fa0f3848d4e 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -562,6 +562,7 @@ struct nfs4_layout_stateid {
 	struct nfsd4_callback		ls_recall;
 	stateid_t			ls_recall_sid;
 	bool				ls_recalled;
+	struct mutex			ls_mutex;
 };
 
 static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
-- 
2.4.3


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-09-17 11:58 [PATCH RFC] nfsd: serialize layout stateid morphing operations Jeff Layton
@ 2015-10-11 13:15 ` Christoph Hellwig
  2015-10-11 20:51   ` Jeff Layton
  2015-11-29  4:07 ` Kinglong Mee
  1 sibling, 1 reply; 26+ messages in thread
From: Christoph Hellwig @ 2015-10-11 13:15 UTC (permalink / raw)
  To: Jeff Layton; +Cc: bfields, Christoph Hellwig, linux-nfs

On Thu, Sep 17, 2015 at 07:58:24AM -0400, Jeff Layton wrote:
> In order to allow the client to make a sane determination of what
> happened with racing LAYOUTGET/LAYOUTRETURN/CB_LAYOUTRECALL calls, we
> must ensure that the seqids returned accurately represent the order of
> operations. The simplest way to do that is to ensure that operations on
> a single stateid are serialized.
> 
> This patch adds a mutex to the layout stateid, and locks it when
> checking the layout stateid's seqid. The mutex is held over the entire
> operation and released after the seqid is bumped.
> 
> Note that in the case of CB_LAYOUTRECALL we must move the increment of
> the seqid and setting into a new cb "prepare" operation. The lease
> infrastructure will call the lm_break callback with a spinlock held, so
> we can't take the mutex in that codepath.

I can't say I like the long running mutex all that much.  What kinds
of reproducers do you have where the current behavior causes problems?

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-10-11 13:15 ` Christoph Hellwig
@ 2015-10-11 20:51   ` Jeff Layton
  2015-10-23 19:35     ` J. Bruce Fields
  0 siblings, 1 reply; 26+ messages in thread
From: Jeff Layton @ 2015-10-11 20:51 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: bfields, linux-nfs

On Sun, 11 Oct 2015 15:15:27 +0200
Christoph Hellwig <hch@lst.de> wrote:

> On Thu, Sep 17, 2015 at 07:58:24AM -0400, Jeff Layton wrote:
> > In order to allow the client to make a sane determination of what
> > happened with racing LAYOUTGET/LAYOUTRETURN/CB_LAYOUTRECALL calls, we
> > must ensure that the seqids returned accurately represent the order of
> > operations. The simplest way to do that is to ensure that operations on
> > a single stateid are serialized.
> > 
> > This patch adds a mutex to the layout stateid, and locks it when
> > checking the layout stateid's seqid. The mutex is held over the entire
> > operation and released after the seqid is bumped.
> > 
> > Note that in the case of CB_LAYOUTRECALL we must move the increment of
> > the seqid and setting into a new cb "prepare" operation. The lease
> > infrastructure will call the lm_break callback with a spinlock held, so
> > we can't take the mutex in that codepath.
> 
> I can't say I like the long running mutex all that much.  What kinds
> of reproducers do you have where the current behavior causes problems?

I'm not thrilled with it either, though it is per-stateid. It should
only ever be contended when there are concurrent operations that specify
the same stateid.

We did have a report of this problem with open stateids that came about
after the client started parallelizing opens more aggressively. It was
pretty clear that you could hit similar races with lock stateids as
well, given a client that didn't serialize things properly.

I don't have any reports of this problem with layout stateids though.
It may be that the client is good enough at serializing these
operations (or slow enough) that it's never an issue. But, if that's
the case then the mutex is harmless anyway since it'd never be
contended.

In hindsight I probably should have sent this as a RFC patch, I'm fine
with dropping this for now if you think it's potentially harmful.

-- 
Jeff Layton <jlayton@poochiereds.net>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-10-11 20:51   ` Jeff Layton
@ 2015-10-23 19:35     ` J. Bruce Fields
  0 siblings, 0 replies; 26+ messages in thread
From: J. Bruce Fields @ 2015-10-23 19:35 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Christoph Hellwig, linux-nfs

On Sun, Oct 11, 2015 at 04:51:21PM -0400, Jeff Layton wrote:
> On Sun, 11 Oct 2015 15:15:27 +0200
> Christoph Hellwig <hch@lst.de> wrote:
> 
> > On Thu, Sep 17, 2015 at 07:58:24AM -0400, Jeff Layton wrote:
> > > In order to allow the client to make a sane determination of what
> > > happened with racing LAYOUTGET/LAYOUTRETURN/CB_LAYOUTRECALL calls, we
> > > must ensure that the seqids returned accurately represent the order of
> > > operations. The simplest way to do that is to ensure that operations on
> > > a single stateid are serialized.
> > > 
> > > This patch adds a mutex to the layout stateid, and locks it when
> > > checking the layout stateid's seqid. The mutex is held over the entire
> > > operation and released after the seqid is bumped.
> > > 
> > > Note that in the case of CB_LAYOUTRECALL we must move the increment of
> > > the seqid and setting into a new cb "prepare" operation. The lease
> > > infrastructure will call the lm_break callback with a spinlock held, so
> > > we can't take the mutex in that codepath.
> > 
> > I can't say I like the long running mutex all that much.  What kinds
> > of reproducers do you have where the current behavior causes problems?
> 
> I'm not thrilled with it either, though it is per-stateid. It should
> only ever be contended when there are concurrent operations that specify
> the same stateid.
> 
> We did have a report of this problem with open stateids that came about
> after the client started parallelizing opens more aggressively. It was
> pretty clear that you could hit similar races with lock stateids as
> well, given a client that didn't serialize things properly.
> 
> I don't have any reports of this problem with layout stateids though.
> It may be that the client is good enough at serializing these
> operations (or slow enough) that it's never an issue. But, if that's
> the case then the mutex is harmless anyway since it'd never be
> contended.
> 
> In hindsight I probably should have sent this as a RFC patch, I'm fine
> with dropping this for now if you think it's potentially harmful.

Still looks to me like the code's incorrect without your patch, so I
intend to apply it.

I'll fully admit that I haven't looked at how layout stateid's are
supposed to work closely enough so it's possible Christoph can persuade
me otherwise.

--b.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-09-17 11:58 [PATCH RFC] nfsd: serialize layout stateid morphing operations Jeff Layton
  2015-10-11 13:15 ` Christoph Hellwig
@ 2015-11-29  4:07 ` Kinglong Mee
  2015-11-29 13:46   ` Jeff Layton
  1 sibling, 1 reply; 26+ messages in thread
From: Kinglong Mee @ 2015-11-29  4:07 UTC (permalink / raw)
  To: Jeff Layton, bfields; +Cc: Christoph Hellwig, linux-nfs

I meet two problems with this patch,

1. a dump_stack messages printed in process_one_work() at kernel/workqueue.c.

BUG: workqueue leaked lock or atomic: kworker/u2:4/0x00000000/98#012     last function: nfsd4_run_cb_work [nfsd]
1 lock held by kworker/u2:4/98:
#0:  (&ls->ls_mutex){......}, at: [<ffffffffa0250d34>] nfsd4_cb_layout_prepare+0x24/0x40 [nfsd]
CPU: 0 PID: 98 Comm: kworker/u2:4 Not tainted 4.4.0-rc2+ #333
Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
Workqueue: nfsd4_callbacks nfsd4_run_cb_work [nfsd]
ffff8800362b9e40 000000007fe9394f ffff880036353d58 ffffffff8136dc64
ffff880036353dd8 ffffffff810a3f12 ffffffff810a3cbd 000000000000000a
ffffffffa0261d78 ffffffff82902e20 0000000000000000 ffffffffa0259241
Call Trace:
[<ffffffff8136dc64>] dump_stack+0x19/0x25
[<ffffffff810a3f12>] process_one_work+0x3c2/0x4c0
[<ffffffff810a3cbd>] ? process_one_work+0x16d/0x4c0
[<ffffffff810a405a>] worker_thread+0x4a/0x440
[<ffffffff810a4010>] ? process_one_work+0x4c0/0x4c0
[<ffffffff810a4010>] ? process_one_work+0x4c0/0x4c0
[<ffffffff810a91e5>] kthread+0xf5/0x110
[<ffffffff810a90f0>] ? kthread_create_on_node+0x240/0x240
[<ffffffff81738d0f>] ret_from_fork+0x3f/0x70
[<ffffffff810a90f0>] ? kthread_create_on_node+0x240/0x240

2. a mutex race between layoutrecall and layoutcommit, 

callback thread,
nfsd4_cb_layout_prepare 
  --->mutex_lock(&ls->ls_mutex);

layoutcommit thread,
nfsd4_layoutcommit
  ---> nfsd4_preprocess_layout_stateid
     --> mutex_lock(&ls->ls_mutex);   <----------------  hang 

[  600.645142] INFO: task nfsd:11623 blocked for more than 120 seconds.
[  600.646337]       Not tainted 4.4.0-rc2+ #332
[  600.647404] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  600.648546] nfsd            D ffff880064277b80     0 11623      2 0x00000000
[  600.649803]  ffff880064277b80 ffff880064278000 00000000ffffffff ffff88005dd241a8
[  600.651021]  ffffffffa025e77c 0000000000000246 ffff880064277b98 ffffffff81733103
[  600.652255]  ffff880063d7e100 ffff880064277ba8 ffffffff8173330e ffff880064277c28
[  600.653512] Call Trace:
[  600.654765]  [<ffffffffa025e77c>] ? nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
[  600.656084]  [<ffffffff81733103>] schedule+0x33/0x80
[  600.657405]  [<ffffffff8173330e>] schedule_preempt_disabled+0xe/0x10
[  600.658741]  [<ffffffff817357f5>] mutex_lock_nested+0x145/0x330
[  600.660094]  [<ffffffffa025e77c>] ? nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
[  600.661696]  [<ffffffffa025e77c>] nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
[  600.663129]  [<ffffffffa025e405>] ? nfsd4_preprocess_layout_stateid+0x5/0x400 [nfsd]
[  600.664558]  [<ffffffff81173b8f>] ? printk+0x56/0x72
[  600.665990]  [<ffffffffa023e3ec>] nfsd4_layoutcommit+0x13c/0x200 [nfsd]
[  600.667365]  [<ffffffffa023fb98>] nfsd4_proc_compound+0x388/0x660 [nfsd]
[  600.668835]  [<ffffffffa022c148>] nfsd_dispatch+0xb8/0x200 [nfsd]
[  600.670323]  [<ffffffffa0093d89>] svc_process_common+0x409/0x650 [sunrpc]
[  600.671836]  [<ffffffffa0094e04>] svc_process+0xf4/0x190 [sunrpc]
[  600.673328]  [<ffffffffa022bb05>] nfsd+0x135/0x1a0 [nfsd]
[  600.674825]  [<ffffffffa022b9d5>] ? nfsd+0x5/0x1a0 [nfsd]
[  600.676388]  [<ffffffffa022b9d0>] ? nfsd_destroy+0xb0/0xb0 [nfsd]
[  600.677884]  [<ffffffff810a9175>] kthread+0xf5/0x110
[  600.679373]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
[  600.680874]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
[  600.682398]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
[  600.683893] 1 lock held by nfsd/11623:
[  600.685449]  #0:  (&ls->ls_mutex){......}, at: [<ffffffffa025e77c>] nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
[  600.688778] Sending NMI to all CPUs:
[  600.690854] NMI backtrace for cpu 0
[  600.691909] CPU: 0 PID: 11 Comm: khungtaskd Not tainted 4.4.0-rc2+ #332
[  600.692523] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
[  600.693821] task: ffff88007b900000 ti: ffff88007b8fc000 task.ti: ffff88007b8fc000
[  600.694496] RIP: 0010:[<ffffffff81053aca>]  [<ffffffff81053aca>] native_write_msr_safe+0xa/0x10
[  600.695185] RSP: 0018:ffff88007b8ffd70  EFLAGS: 00000046
[  600.695861] RAX: 0000000000000400 RBX: 0000000000000286 RCX: 0000000000000830
[  600.696539] RDX: 0000000000000000 RSI: 0000000000000400 RDI: 0000000000000830
[  600.697204] RBP: ffff88007b8ffd70 R08: 0000000000000400 R09: 0000000000000000
[  600.697862] R10: 00000000000000e4 R11: 0000000000000001 R12: ffff880063d7e100
[  600.698513] R13: 00000000003fff3c R14: ffff880063d7e308 R15: 0000000000000004
[  600.699156] FS:  0000000000000000(0000) GS:ffffffff81c27000(0000) knlGS:0000000000000000
[  600.699823] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  600.700459] CR2: 00007f366afd4000 CR3: 000000005dd56000 CR4: 00000000001406f0
[  600.701106] Stack:
[  600.701745]  ffff88007b8ffd88 ffffffff8104a860 ffffffff81047340 ffff88007b8ffd98
[  600.702404]  ffffffff8104a885 ffff88007b8ffda8 ffffffff8104735b ffff88007b8ffdd8
[  600.703058]  ffffffff813723ad 0000000000000078 ffff880063d7e100 00000000003fff3c
[  600.703712] Call Trace:
[  600.704355]  [<ffffffff8104a860>] __x2apic_send_IPI_mask.isra.2+0x60/0x70
[  600.705017]  [<ffffffff81047340>] ? setup_vector_irq+0x130/0x130
[  600.705676]  [<ffffffff8104a885>] x2apic_send_IPI_mask+0x15/0x20
[  600.706335]  [<ffffffff8104735b>] nmi_raise_cpu_backtrace+0x1b/0x20
[  600.706989]  [<ffffffff813723ad>] nmi_trigger_all_cpu_backtrace+0x14d/0x1c0
[  600.707693]  [<ffffffff810473b9>] arch_trigger_all_cpu_backtrace+0x19/0x20
[  600.708362]  [<ffffffff8112c4cf>] watchdog+0x32f/0x370
[  600.709031]  [<ffffffff8112c221>] ? watchdog+0x81/0x370
[  600.709725]  [<ffffffff8112c1a0>] ? reset_hung_task_detector+0x20/0x20
[  600.710398]  [<ffffffff810a9175>] kthread+0xf5/0x110
[  600.711067]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
[  600.711739]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
[  600.712405]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
[  600.713073] Code: 00 55 89 f9 48 89 e5 0f 32 45 31 c0 48 c1 e2 20 44 89 06 48 09 d0 5d c3 66 0f 1f 84 00 00 00 00 00 55 89 f0 89 f9 48 89 e5 0f 30 <31> c0 5d c3 66 90 55 89 f9 48 89 e5 0f 33 48 c1 e2 20 48 09 d0
[  600.715196] Kernel panic - not syncing: hung_task: blocked tasks
[  600.715889] CPU: 0 PID: 11 Comm: khungtaskd Not tainted 4.4.0-rc2+ #332
[  600.716540] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
[  600.717910]  ffffffff81a4d19c 00000000480750be ffff88007b8ffd60 ffffffff8136dbf4
[  600.718610]  ffff88007b8ffde8 ffffffff81173559 ffff880000000008 ffff88007b8ffdf8
[  600.719302]  ffff88007b8ffd90 00000000480750be 0000000000000001 0000000000000001
[  600.719984] Call Trace:
[  600.720646]  [<ffffffff8136dbf4>] dump_stack+0x19/0x25
[  600.721330]  [<ffffffff81173559>] panic+0xd3/0x212
[  600.722009]  [<ffffffff8112c4db>] watchdog+0x33b/0x370
[  600.722686]  [<ffffffff8112c221>] ? watchdog+0x81/0x370
[  600.723213]  [<ffffffff8112c1a0>] ? reset_hung_task_detector+0x20/0x20
[  600.723674]  [<ffffffff810a9175>] kthread+0xf5/0x110
[  600.724107]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
[  600.724509]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
[  600.724903]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240

thanks,
Kinglong Mee

On 9/17/2015 19:58, Jeff Layton wrote:
> In order to allow the client to make a sane determination of what
> happened with racing LAYOUTGET/LAYOUTRETURN/CB_LAYOUTRECALL calls, we
> must ensure that the seqids return accurately represent the order of
> operations. The simplest way to do that is to ensure that operations on
> a single stateid are serialized.
> 
> This patch adds a mutex to the layout stateid, and locks it when
> checking the layout stateid's seqid. The mutex is held over the entire
> operation and released after the seqid is bumped.
> 
> Note that in the case of CB_LAYOUTRECALL we must move the increment of
> the seqid and setting into a new cb "prepare" operation. The lease
> infrastructure will call the lm_break callback with a spinlock held, so
> we can't take the mutex in that codepath.
> 
> Cc: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
> ---
>  fs/nfsd/nfs4layouts.c | 25 +++++++++++++++++++++----
>  fs/nfsd/nfs4proc.c    |  4 ++++
>  fs/nfsd/state.h       |  1 +
>  3 files changed, 26 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
> index ebf90e487c75..4a68ab901b4b 100644
> --- a/fs/nfsd/nfs4layouts.c
> +++ b/fs/nfsd/nfs4layouts.c
> @@ -201,6 +201,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
>  	INIT_LIST_HEAD(&ls->ls_perfile);
>  	spin_lock_init(&ls->ls_lock);
>  	INIT_LIST_HEAD(&ls->ls_layouts);
> +	mutex_init(&ls->ls_mutex);
>  	ls->ls_layout_type = layout_type;
>  	nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
>  			NFSPROC4_CLNT_CB_LAYOUT);
> @@ -262,19 +263,23 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
>  		status = nfserr_jukebox;
>  		if (!ls)
>  			goto out;
> +		mutex_lock(&ls->ls_mutex);
>  	} else {
>  		ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
>  
>  		status = nfserr_bad_stateid;
> +		mutex_lock(&ls->ls_mutex);
>  		if (stateid->si_generation > stid->sc_stateid.si_generation)
> -			goto out_put_stid;
> +			goto out_unlock_stid;
>  		if (layout_type != ls->ls_layout_type)
> -			goto out_put_stid;
> +			goto out_unlock_stid;
>  	}
>  
>  	*lsp = ls;
>  	return 0;
>  
> +out_unlock_stid:
> +	mutex_unlock(&ls->ls_mutex);
>  out_put_stid:
>  	nfs4_put_stid(stid);
>  out:
> @@ -296,8 +301,6 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
>  	trace_layout_recall(&ls->ls_stid.sc_stateid);
>  
>  	atomic_inc(&ls->ls_stid.sc_count);
> -	update_stateid(&ls->ls_stid.sc_stateid);
> -	memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
>  	nfsd4_run_cb(&ls->ls_recall);
>  
>  out_unlock:
> @@ -494,6 +497,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
>  	}
>  	spin_unlock(&ls->ls_lock);
>  
> +	mutex_unlock(&ls->ls_mutex);
>  	nfs4_put_stid(&ls->ls_stid);
>  	nfsd4_free_layouts(&reaplist);
>  	return nfs_ok;
> @@ -608,6 +612,17 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
>  	}
>  }
>  
> +static void
> +nfsd4_cb_layout_prepare(struct nfsd4_callback *cb)
> +{
> +	struct nfs4_layout_stateid *ls =
> +		container_of(cb, struct nfs4_layout_stateid, ls_recall);
> +
> +	mutex_lock(&ls->ls_mutex);
> +	update_stateid(&ls->ls_stid.sc_stateid);
> +	memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
> +}
> +
>  static int
>  nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
>  {
> @@ -649,12 +664,14 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb)
>  
>  	trace_layout_recall_release(&ls->ls_stid.sc_stateid);
>  
> +	mutex_unlock(&ls->ls_mutex);
>  	nfsd4_return_all_layouts(ls, &reaplist);
>  	nfsd4_free_layouts(&reaplist);
>  	nfs4_put_stid(&ls->ls_stid);
>  }
>  
>  static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
> +	.prepare	= nfsd4_cb_layout_prepare,
>  	.done		= nfsd4_cb_layout_done,
>  	.release	= nfsd4_cb_layout_release,
>  };
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index 4ce6b97b31ad..a9f096c7e99f 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -1309,6 +1309,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp,
>  	nfserr = nfsd4_insert_layout(lgp, ls);
>  
>  out_put_stid:
> +	mutex_unlock(&ls->ls_mutex);
>  	nfs4_put_stid(&ls->ls_stid);
>  out:
>  	return nfserr;
> @@ -1362,6 +1363,9 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
>  		goto out;
>  	}
>  
> +	/* LAYOUTCOMMIT does not require any serialization */
> +	mutex_unlock(&ls->ls_mutex);
> +
>  	if (new_size > i_size_read(inode)) {
>  		lcp->lc_size_chg = 1;
>  		lcp->lc_newsize = new_size;
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index 31bde12feefe..1fa0f3848d4e 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -562,6 +562,7 @@ struct nfs4_layout_stateid {
>  	struct nfsd4_callback		ls_recall;
>  	stateid_t			ls_recall_sid;
>  	bool				ls_recalled;
> +	struct mutex			ls_mutex;
>  };
>  
>  static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
> 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-11-29  4:07 ` Kinglong Mee
@ 2015-11-29 13:46   ` Jeff Layton
  2015-11-30  2:57     ` Kinglong Mee
  0 siblings, 1 reply; 26+ messages in thread
From: Jeff Layton @ 2015-11-29 13:46 UTC (permalink / raw)
  To: Kinglong Mee; +Cc: bfields, Christoph Hellwig, linux-nfs

On Sun, 29 Nov 2015 12:07:48 +0800
Kinglong Mee <kinglongmee@gmail.com> wrote:

> I meet two problems with this patch,
> 

Ok, I was less sure of this patch than the other one. I think we will
need to serialize these operations, but this may not be the best way to
do it.

Bruce if you want to back this one for now, then I'm fine with that. It
may be a bit before I can get to it.


> 1. a dump_stack messages printed in process_one_work() at kernel/workqueue.c.
> 
> BUG: workqueue leaked lock or atomic: kworker/u2:4/0x00000000/98#012     last function: nfsd4_run_cb_work [nfsd]
> 1 lock held by kworker/u2:4/98:
> #0:  (&ls->ls_mutex){......}, at: [<ffffffffa0250d34>] nfsd4_cb_layout_prepare+0x24/0x40 [nfsd]
> CPU: 0 PID: 98 Comm: kworker/u2:4 Not tainted 4.4.0-rc2+ #333
> Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> Workqueue: nfsd4_callbacks nfsd4_run_cb_work [nfsd]
> ffff8800362b9e40 000000007fe9394f ffff880036353d58 ffffffff8136dc64
> ffff880036353dd8 ffffffff810a3f12 ffffffff810a3cbd 000000000000000a
> ffffffffa0261d78 ffffffff82902e20 0000000000000000 ffffffffa0259241
> Call Trace:
> [<ffffffff8136dc64>] dump_stack+0x19/0x25
> [<ffffffff810a3f12>] process_one_work+0x3c2/0x4c0
> [<ffffffff810a3cbd>] ? process_one_work+0x16d/0x4c0
> [<ffffffff810a405a>] worker_thread+0x4a/0x440
> [<ffffffff810a4010>] ? process_one_work+0x4c0/0x4c0
> [<ffffffff810a4010>] ? process_one_work+0x4c0/0x4c0
> [<ffffffff810a91e5>] kthread+0xf5/0x110
> [<ffffffff810a90f0>] ? kthread_create_on_node+0x240/0x240
> [<ffffffff81738d0f>] ret_from_fork+0x3f/0x70
> [<ffffffff810a90f0>] ? kthread_create_on_node+0x240/0x240
> 


I guess this is just a general "hey you left a mutex locked" warning
when finishing a workqueue job? This patch actually does want to do
that, but trying to tell that to lockdep may be tricky...


> 2. a mutex race between layoutrecall and layoutcommit, 
> 
> callback thread,
> nfsd4_cb_layout_prepare 
>   --->mutex_lock(&ls->ls_mutex);
> 
> layoutcommit thread,
> nfsd4_layoutcommit
>   ---> nfsd4_preprocess_layout_stateid
>      --> mutex_lock(&ls->ls_mutex);   <----------------  hang 
> 
> [  600.645142] INFO: task nfsd:11623 blocked for more than 120 seconds.
> [  600.646337]       Not tainted 4.4.0-rc2+ #332
> [  600.647404] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  600.648546] nfsd            D ffff880064277b80     0 11623      2 0x00000000
> [  600.649803]  ffff880064277b80 ffff880064278000 00000000ffffffff ffff88005dd241a8
> [  600.651021]  ffffffffa025e77c 0000000000000246 ffff880064277b98 ffffffff81733103
> [  600.652255]  ffff880063d7e100 ffff880064277ba8 ffffffff8173330e ffff880064277c28
> [  600.653512] Call Trace:
> [  600.654765]  [<ffffffffa025e77c>] ? nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> [  600.656084]  [<ffffffff81733103>] schedule+0x33/0x80
> [  600.657405]  [<ffffffff8173330e>] schedule_preempt_disabled+0xe/0x10
> [  600.658741]  [<ffffffff817357f5>] mutex_lock_nested+0x145/0x330
> [  600.660094]  [<ffffffffa025e77c>] ? nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> [  600.661696]  [<ffffffffa025e77c>] nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> [  600.663129]  [<ffffffffa025e405>] ? nfsd4_preprocess_layout_stateid+0x5/0x400 [nfsd]
> [  600.664558]  [<ffffffff81173b8f>] ? printk+0x56/0x72
> [  600.665990]  [<ffffffffa023e3ec>] nfsd4_layoutcommit+0x13c/0x200 [nfsd]
> [  600.667365]  [<ffffffffa023fb98>] nfsd4_proc_compound+0x388/0x660 [nfsd]
> [  600.668835]  [<ffffffffa022c148>] nfsd_dispatch+0xb8/0x200 [nfsd]
> [  600.670323]  [<ffffffffa0093d89>] svc_process_common+0x409/0x650 [sunrpc]
> [  600.671836]  [<ffffffffa0094e04>] svc_process+0xf4/0x190 [sunrpc]
> [  600.673328]  [<ffffffffa022bb05>] nfsd+0x135/0x1a0 [nfsd]
> [  600.674825]  [<ffffffffa022b9d5>] ? nfsd+0x5/0x1a0 [nfsd]
> [  600.676388]  [<ffffffffa022b9d0>] ? nfsd_destroy+0xb0/0xb0 [nfsd]
> [  600.677884]  [<ffffffff810a9175>] kthread+0xf5/0x110
> [  600.679373]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> [  600.680874]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
> [  600.682398]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> [  600.683893] 1 lock held by nfsd/11623:
> [  600.685449]  #0:  (&ls->ls_mutex){......}, at: [<ffffffffa025e77c>] nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> [  600.688778] Sending NMI to all CPUs:
> [  600.690854] NMI backtrace for cpu 0
> [  600.691909] CPU: 0 PID: 11 Comm: khungtaskd Not tainted 4.4.0-rc2+ #332
> [  600.692523] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> [  600.693821] task: ffff88007b900000 ti: ffff88007b8fc000 task.ti: ffff88007b8fc000
> [  600.694496] RIP: 0010:[<ffffffff81053aca>]  [<ffffffff81053aca>] native_write_msr_safe+0xa/0x10
> [  600.695185] RSP: 0018:ffff88007b8ffd70  EFLAGS: 00000046
> [  600.695861] RAX: 0000000000000400 RBX: 0000000000000286 RCX: 0000000000000830
> [  600.696539] RDX: 0000000000000000 RSI: 0000000000000400 RDI: 0000000000000830
> [  600.697204] RBP: ffff88007b8ffd70 R08: 0000000000000400 R09: 0000000000000000
> [  600.697862] R10: 00000000000000e4 R11: 0000000000000001 R12: ffff880063d7e100
> [  600.698513] R13: 00000000003fff3c R14: ffff880063d7e308 R15: 0000000000000004
> [  600.699156] FS:  0000000000000000(0000) GS:ffffffff81c27000(0000) knlGS:0000000000000000
> [  600.699823] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  600.700459] CR2: 00007f366afd4000 CR3: 000000005dd56000 CR4: 00000000001406f0
> [  600.701106] Stack:
> [  600.701745]  ffff88007b8ffd88 ffffffff8104a860 ffffffff81047340 ffff88007b8ffd98
> [  600.702404]  ffffffff8104a885 ffff88007b8ffda8 ffffffff8104735b ffff88007b8ffdd8
> [  600.703058]  ffffffff813723ad 0000000000000078 ffff880063d7e100 00000000003fff3c
> [  600.703712] Call Trace:
> [  600.704355]  [<ffffffff8104a860>] __x2apic_send_IPI_mask.isra.2+0x60/0x70
> [  600.705017]  [<ffffffff81047340>] ? setup_vector_irq+0x130/0x130
> [  600.705676]  [<ffffffff8104a885>] x2apic_send_IPI_mask+0x15/0x20
> [  600.706335]  [<ffffffff8104735b>] nmi_raise_cpu_backtrace+0x1b/0x20
> [  600.706989]  [<ffffffff813723ad>] nmi_trigger_all_cpu_backtrace+0x14d/0x1c0
> [  600.707693]  [<ffffffff810473b9>] arch_trigger_all_cpu_backtrace+0x19/0x20
> [  600.708362]  [<ffffffff8112c4cf>] watchdog+0x32f/0x370
> [  600.709031]  [<ffffffff8112c221>] ? watchdog+0x81/0x370
> [  600.709725]  [<ffffffff8112c1a0>] ? reset_hung_task_detector+0x20/0x20
> [  600.710398]  [<ffffffff810a9175>] kthread+0xf5/0x110
> [  600.711067]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> [  600.711739]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
> [  600.712405]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> [  600.713073] Code: 00 55 89 f9 48 89 e5 0f 32 45 31 c0 48 c1 e2 20 44 89 06 48 09 d0 5d c3 66 0f 1f 84 00 00 00 00 00 55 89 f0 89 f9 48 89 e5 0f 30 <31> c0 5d c3 66 90 55 89 f9 48 89 e5 0f 33 48 c1 e2 20 48 09 d0
> [  600.715196] Kernel panic - not syncing: hung_task: blocked tasks
> [  600.715889] CPU: 0 PID: 11 Comm: khungtaskd Not tainted 4.4.0-rc2+ #332
> [  600.716540] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> [  600.717910]  ffffffff81a4d19c 00000000480750be ffff88007b8ffd60 ffffffff8136dbf4
> [  600.718610]  ffff88007b8ffde8 ffffffff81173559 ffff880000000008 ffff88007b8ffdf8
> [  600.719302]  ffff88007b8ffd90 00000000480750be 0000000000000001 0000000000000001
> [  600.719984] Call Trace:
> [  600.720646]  [<ffffffff8136dbf4>] dump_stack+0x19/0x25
> [  600.721330]  [<ffffffff81173559>] panic+0xd3/0x212
> [  600.722009]  [<ffffffff8112c4db>] watchdog+0x33b/0x370
> [  600.722686]  [<ffffffff8112c221>] ? watchdog+0x81/0x370
> [  600.723213]  [<ffffffff8112c1a0>] ? reset_hung_task_detector+0x20/0x20
> [  600.723674]  [<ffffffff810a9175>] kthread+0xf5/0x110
> [  600.724107]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> [  600.724509]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
> [  600.724903]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> 
> thanks,
> Kinglong Mee
> 

This is the bigger problem, I think. The question of course is why the
client didn't respond to the cb request? Still, holding a mutex across
the callback RPC is pretty ugly and now that I think about it, I don't
think it's really necessary anyway. Once we've copied the stateid, we
aren't really changing anything else so we can let other layout ops
proceed.

Would a patch like this fix it?

-----------------------8<--------------------------

[PATCH] nfsd: don't hold ls_mutex across a layout recall

We do need to serialize layout stateid morphing operations, but we
currently hold the ls_mutex across a layout recall which is pretty
ugly. It's also unnecessary -- once we've bumped the seqid and
copied it, we don't need to serialize the rest of the CB_LAYOUTRECALL
vs. anything else. Just drop the mutex once the copy is done.

Fixes: cc8a55320b5f "nfsd: serialize layout stateid morphing operations"
Cc: stable@vger.kernel.org
Reported-by: Kinglong Mee <kinglongmee@gmail.com>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/nfsd/nfs4layouts.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 9ffef06b30d5..c9d6c715c0fb 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -616,6 +616,7 @@ nfsd4_cb_layout_prepare(struct nfsd4_callback *cb)
 
 	mutex_lock(&ls->ls_mutex);
 	nfs4_inc_and_copy_stateid(&ls->ls_recall_sid, &ls->ls_stid);
+	mutex_unlock(&ls->ls_mutex);
 }
 
 static int
@@ -659,7 +660,6 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb)
 
 	trace_layout_recall_release(&ls->ls_stid.sc_stateid);
 
-	mutex_unlock(&ls->ls_mutex);
 	nfsd4_return_all_layouts(ls, &reaplist);
 	nfsd4_free_layouts(&reaplist);
 	nfs4_put_stid(&ls->ls_stid);
-- 
2.4.3

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-11-29 13:46   ` Jeff Layton
@ 2015-11-30  2:57     ` Kinglong Mee
  2015-11-30 21:34       ` J. Bruce Fields
  0 siblings, 1 reply; 26+ messages in thread
From: Kinglong Mee @ 2015-11-30  2:57 UTC (permalink / raw)
  To: Jeff Layton; +Cc: bfields, Christoph Hellwig, linux-nfs, kinglongmee

On 11/29/2015 21:46, Jeff Layton wrote:
> On Sun, 29 Nov 2015 12:07:48 +0800
> Kinglong Mee <kinglongmee@gmail.com> wrote:
> 
>> I meet two problems with this patch,
>>
> 
> Ok, I was less sure of this patch than the other one. I think we will
> need to serialize these operations, but this may not be the best way to
> do it.
> 
Bruce, if you want to back this one out for now, then I'm fine with that. It
> may be a bit before I can get to it.
> 
> 
>> 1. a dump_stack messages printed in process_one_work() at kernel/workqueue.c.
>>
>> BUG: workqueue leaked lock or atomic: kworker/u2:4/0x00000000/98#012     last function: nfsd4_run_cb_work [nfsd]
>> 1 lock held by kworker/u2:4/98:
>> #0:  (&ls->ls_mutex){......}, at: [<ffffffffa0250d34>] nfsd4_cb_layout_prepare+0x24/0x40 [nfsd]
>> CPU: 0 PID: 98 Comm: kworker/u2:4 Not tainted 4.4.0-rc2+ #333
>> Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
>> Workqueue: nfsd4_callbacks nfsd4_run_cb_work [nfsd]
>> ffff8800362b9e40 000000007fe9394f ffff880036353d58 ffffffff8136dc64
>> ffff880036353dd8 ffffffff810a3f12 ffffffff810a3cbd 000000000000000a
>> ffffffffa0261d78 ffffffff82902e20 0000000000000000 ffffffffa0259241
>> Call Trace:
>> [<ffffffff8136dc64>] dump_stack+0x19/0x25
>> [<ffffffff810a3f12>] process_one_work+0x3c2/0x4c0
>> [<ffffffff810a3cbd>] ? process_one_work+0x16d/0x4c0
>> [<ffffffff810a405a>] worker_thread+0x4a/0x440
>> [<ffffffff810a4010>] ? process_one_work+0x4c0/0x4c0
>> [<ffffffff810a4010>] ? process_one_work+0x4c0/0x4c0
>> [<ffffffff810a91e5>] kthread+0xf5/0x110
>> [<ffffffff810a90f0>] ? kthread_create_on_node+0x240/0x240
>> [<ffffffff81738d0f>] ret_from_fork+0x3f/0x70
>> [<ffffffff810a90f0>] ? kthread_create_on_node+0x240/0x240
>>
> 
> 
> I guess this is just a general "hey you left a mutex locked" warning
> when finishing a workqueue job? This patch actually does want to do
> that, but trying to tell that to lockdep may be tricky...

Yes, it's just a warning.
But, it's terrible that the kernel prints it every time
nfsd4_cb_layout_prepare is called.

> 
> 
>> 2. a mutex race between layoutrecall and layoutcommit, 
>>
>> callback thread,
>> nfsd4_cb_layout_prepare 
>>   --->mutex_lock(&ls->ls_mutex);
>>
>> layoutcommit thread,
>> nfsd4_layoutcommit
>>   ---> nfsd4_preprocess_layout_stateid
>>      --> mutex_lock(&ls->ls_mutex);   <----------------  hang 
>>
>> [  600.645142] INFO: task nfsd:11623 blocked for more than 120 seconds.
>> [  600.646337]       Not tainted 4.4.0-rc2+ #332
>> [  600.647404] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [  600.648546] nfsd            D ffff880064277b80     0 11623      2 0x00000000
>> [  600.649803]  ffff880064277b80 ffff880064278000 00000000ffffffff ffff88005dd241a8
>> [  600.651021]  ffffffffa025e77c 0000000000000246 ffff880064277b98 ffffffff81733103
>> [  600.652255]  ffff880063d7e100 ffff880064277ba8 ffffffff8173330e ffff880064277c28
>> [  600.653512] Call Trace:
>> [  600.654765]  [<ffffffffa025e77c>] ? nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
>> [  600.656084]  [<ffffffff81733103>] schedule+0x33/0x80
>> [  600.657405]  [<ffffffff8173330e>] schedule_preempt_disabled+0xe/0x10
>> [  600.658741]  [<ffffffff817357f5>] mutex_lock_nested+0x145/0x330
>> [  600.660094]  [<ffffffffa025e77c>] ? nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
>> [  600.661696]  [<ffffffffa025e77c>] nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
>> [  600.663129]  [<ffffffffa025e405>] ? nfsd4_preprocess_layout_stateid+0x5/0x400 [nfsd]
>> [  600.664558]  [<ffffffff81173b8f>] ? printk+0x56/0x72
>> [  600.665990]  [<ffffffffa023e3ec>] nfsd4_layoutcommit+0x13c/0x200 [nfsd]
>> [  600.667365]  [<ffffffffa023fb98>] nfsd4_proc_compound+0x388/0x660 [nfsd]
>> [  600.668835]  [<ffffffffa022c148>] nfsd_dispatch+0xb8/0x200 [nfsd]
>> [  600.670323]  [<ffffffffa0093d89>] svc_process_common+0x409/0x650 [sunrpc]
>> [  600.671836]  [<ffffffffa0094e04>] svc_process+0xf4/0x190 [sunrpc]
>> [  600.673328]  [<ffffffffa022bb05>] nfsd+0x135/0x1a0 [nfsd]
>> [  600.674825]  [<ffffffffa022b9d5>] ? nfsd+0x5/0x1a0 [nfsd]
>> [  600.676388]  [<ffffffffa022b9d0>] ? nfsd_destroy+0xb0/0xb0 [nfsd]
>> [  600.677884]  [<ffffffff810a9175>] kthread+0xf5/0x110
>> [  600.679373]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
>> [  600.680874]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
>> [  600.682398]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
>> [  600.683893] 1 lock held by nfsd/11623:
>> [  600.685449]  #0:  (&ls->ls_mutex){......}, at: [<ffffffffa025e77c>] nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
>> [  600.688778] Sending NMI to all CPUs:
>> [  600.690854] NMI backtrace for cpu 0
>> [  600.691909] CPU: 0 PID: 11 Comm: khungtaskd Not tainted 4.4.0-rc2+ #332
>> [  600.692523] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
>> [  600.693821] task: ffff88007b900000 ti: ffff88007b8fc000 task.ti: ffff88007b8fc000
>> [  600.694496] RIP: 0010:[<ffffffff81053aca>]  [<ffffffff81053aca>] native_write_msr_safe+0xa/0x10
>> [  600.695185] RSP: 0018:ffff88007b8ffd70  EFLAGS: 00000046
>> [  600.695861] RAX: 0000000000000400 RBX: 0000000000000286 RCX: 0000000000000830
>> [  600.696539] RDX: 0000000000000000 RSI: 0000000000000400 RDI: 0000000000000830
>> [  600.697204] RBP: ffff88007b8ffd70 R08: 0000000000000400 R09: 0000000000000000
>> [  600.697862] R10: 00000000000000e4 R11: 0000000000000001 R12: ffff880063d7e100
>> [  600.698513] R13: 00000000003fff3c R14: ffff880063d7e308 R15: 0000000000000004
>> [  600.699156] FS:  0000000000000000(0000) GS:ffffffff81c27000(0000) knlGS:0000000000000000
>> [  600.699823] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [  600.700459] CR2: 00007f366afd4000 CR3: 000000005dd56000 CR4: 00000000001406f0
>> [  600.701106] Stack:
>> [  600.701745]  ffff88007b8ffd88 ffffffff8104a860 ffffffff81047340 ffff88007b8ffd98
>> [  600.702404]  ffffffff8104a885 ffff88007b8ffda8 ffffffff8104735b ffff88007b8ffdd8
>> [  600.703058]  ffffffff813723ad 0000000000000078 ffff880063d7e100 00000000003fff3c
>> [  600.703712] Call Trace:
>> [  600.704355]  [<ffffffff8104a860>] __x2apic_send_IPI_mask.isra.2+0x60/0x70
>> [  600.705017]  [<ffffffff81047340>] ? setup_vector_irq+0x130/0x130
>> [  600.705676]  [<ffffffff8104a885>] x2apic_send_IPI_mask+0x15/0x20
>> [  600.706335]  [<ffffffff8104735b>] nmi_raise_cpu_backtrace+0x1b/0x20
>> [  600.706989]  [<ffffffff813723ad>] nmi_trigger_all_cpu_backtrace+0x14d/0x1c0
>> [  600.707693]  [<ffffffff810473b9>] arch_trigger_all_cpu_backtrace+0x19/0x20
>> [  600.708362]  [<ffffffff8112c4cf>] watchdog+0x32f/0x370
>> [  600.709031]  [<ffffffff8112c221>] ? watchdog+0x81/0x370
>> [  600.709725]  [<ffffffff8112c1a0>] ? reset_hung_task_detector+0x20/0x20
>> [  600.710398]  [<ffffffff810a9175>] kthread+0xf5/0x110
>> [  600.711067]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
>> [  600.711739]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
>> [  600.712405]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
>> [  600.713073] Code: 00 55 89 f9 48 89 e5 0f 32 45 31 c0 48 c1 e2 20 44 89 06 48 09 d0 5d c3 66 0f 1f 84 00 00 00 00 00 55 89 f0 89 f9 48 89 e5 0f 30 <31> c0 5d c3 66 90 55 89 f9 48 89 e5 0f 33 48 c1 e2 20 48 09 d0
>> [  600.715196] Kernel panic - not syncing: hung_task: blocked tasks
>> [  600.715889] CPU: 0 PID: 11 Comm: khungtaskd Not tainted 4.4.0-rc2+ #332
>> [  600.716540] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
>> [  600.717910]  ffffffff81a4d19c 00000000480750be ffff88007b8ffd60 ffffffff8136dbf4
>> [  600.718610]  ffff88007b8ffde8 ffffffff81173559 ffff880000000008 ffff88007b8ffdf8
>> [  600.719302]  ffff88007b8ffd90 00000000480750be 0000000000000001 0000000000000001
>> [  600.719984] Call Trace:
>> [  600.720646]  [<ffffffff8136dbf4>] dump_stack+0x19/0x25
>> [  600.721330]  [<ffffffff81173559>] panic+0xd3/0x212
>> [  600.722009]  [<ffffffff8112c4db>] watchdog+0x33b/0x370
>> [  600.722686]  [<ffffffff8112c221>] ? watchdog+0x81/0x370
>> [  600.723213]  [<ffffffff8112c1a0>] ? reset_hung_task_detector+0x20/0x20
>> [  600.723674]  [<ffffffff810a9175>] kthread+0xf5/0x110
>> [  600.724107]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
>> [  600.724509]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
>> [  600.724903]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
>>
>> thanks,
>> Kinglong Mee
>>
> 
> This is the bigger problem, I think. The question of course is why the
> client didn't respond to the cb request? Still, holding a mutex across
> the callback RPC is pretty ugly and now that I think about it, I don't
> think it's really necessary anyway. Once we've copied the stateid, we
> aren't really changing anything else so we can let other layout ops
> proceed.
> 
> Would a patch like this fix it?

Yes, it's great.
With this patch, everything is okay.

> 
> -----------------------8<--------------------------
> 
> [PATCH] nfsd: don't hold ls_mutex across a layout recall
> 
> We do need to serialize layout stateid morphing operations, but we
> currently hold the ls_mutex across a layout recall which is pretty
> ugly. It's also unnecessary -- once we've bumped the seqid and
> copied it, we don't need to serialize the rest of the CB_LAYOUTRECALL
> vs. anything else. Just drop the mutex once the copy is done.
> 
> Fixes: cc8a55320b5f "nfsd: serialize layout stateid morphing operations"
> Cc: stable@vger.kernel.org
> Reported-by: Kinglong Mee <kinglongmee@gmail.com>
> Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>

Tested-by: Kinglong Mee <kinglongmee@gmail.com>

thanks,
Kinglong Mee

> ---
>  fs/nfsd/nfs4layouts.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
> index 9ffef06b30d5..c9d6c715c0fb 100644
> --- a/fs/nfsd/nfs4layouts.c
> +++ b/fs/nfsd/nfs4layouts.c
> @@ -616,6 +616,7 @@ nfsd4_cb_layout_prepare(struct nfsd4_callback *cb)
>  
>  	mutex_lock(&ls->ls_mutex);
>  	nfs4_inc_and_copy_stateid(&ls->ls_recall_sid, &ls->ls_stid);
> +	mutex_unlock(&ls->ls_mutex);
>  }
>  
>  static int
> @@ -659,7 +660,6 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb)
>  
>  	trace_layout_recall_release(&ls->ls_stid.sc_stateid);
>  
> -	mutex_unlock(&ls->ls_mutex);
>  	nfsd4_return_all_layouts(ls, &reaplist);
>  	nfsd4_free_layouts(&reaplist);
>  	nfs4_put_stid(&ls->ls_stid);
> 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-11-30  2:57     ` Kinglong Mee
@ 2015-11-30 21:34       ` J. Bruce Fields
  2015-12-01  0:33         ` Jeff Layton
  0 siblings, 1 reply; 26+ messages in thread
From: J. Bruce Fields @ 2015-11-30 21:34 UTC (permalink / raw)
  To: Kinglong Mee; +Cc: Jeff Layton, Christoph Hellwig, linux-nfs

On Mon, Nov 30, 2015 at 10:57:07AM +0800, Kinglong Mee wrote:
> On 11/29/2015 21:46, Jeff Layton wrote:
> > On Sun, 29 Nov 2015 12:07:48 +0800
> > Kinglong Mee <kinglongmee@gmail.com> wrote:
> > 
> >> I meet two problems with this patch,
> >>
> > 
> > Ok, I was less sure of this patch than the other one. I think we will
> > need to serialize these operations, but this may not be the best way to
> > do it.
> > 
> > Bruce, if you want to back this one out for now, then I'm fine with that. It
> > may be a bit before I can get to it.
> > 
> > 
> >> 1. a dump_stack messages printed in process_one_work() at kernel/workqueue.c.
> >>
> >> BUG: workqueue leaked lock or atomic: kworker/u2:4/0x00000000/98#012     last function: nfsd4_run_cb_work [nfsd]
> >> 1 lock held by kworker/u2:4/98:
> >> #0:  (&ls->ls_mutex){......}, at: [<ffffffffa0250d34>] nfsd4_cb_layout_prepare+0x24/0x40 [nfsd]
> >> CPU: 0 PID: 98 Comm: kworker/u2:4 Not tainted 4.4.0-rc2+ #333
> >> Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> >> Workqueue: nfsd4_callbacks nfsd4_run_cb_work [nfsd]
> >> ffff8800362b9e40 000000007fe9394f ffff880036353d58 ffffffff8136dc64
> >> ffff880036353dd8 ffffffff810a3f12 ffffffff810a3cbd 000000000000000a
> >> ffffffffa0261d78 ffffffff82902e20 0000000000000000 ffffffffa0259241
> >> Call Trace:
> >> [<ffffffff8136dc64>] dump_stack+0x19/0x25
> >> [<ffffffff810a3f12>] process_one_work+0x3c2/0x4c0
> >> [<ffffffff810a3cbd>] ? process_one_work+0x16d/0x4c0
> >> [<ffffffff810a405a>] worker_thread+0x4a/0x440
> >> [<ffffffff810a4010>] ? process_one_work+0x4c0/0x4c0
> >> [<ffffffff810a4010>] ? process_one_work+0x4c0/0x4c0
> >> [<ffffffff810a91e5>] kthread+0xf5/0x110
> >> [<ffffffff810a90f0>] ? kthread_create_on_node+0x240/0x240
> >> [<ffffffff81738d0f>] ret_from_fork+0x3f/0x70
> >> [<ffffffff810a90f0>] ? kthread_create_on_node+0x240/0x240
> >>
> > 
> > 
> > > I guess this is just a general "hey you left a mutex locked" warning
> > > when finishing a workqueue job? This patch actually does want to do
> > > that, but trying to tell that to lockdep may be tricky...
> 
> Yes, it's just a warning.
> > But, it's terrible that the kernel prints it every time
> > nfsd4_cb_layout_prepare is called.
> 
> > 
> > 
> >> 2. a mutex race between layoutrecall and layoutcommit, 
> >>
> >> callback thread,
> >> nfsd4_cb_layout_prepare 
> >>   --->mutex_lock(&ls->ls_mutex);
> >>
> >> layoutcommit thread,
> >> nfsd4_layoutcommit
> >>   ---> nfsd4_preprocess_layout_stateid
> >>      --> mutex_lock(&ls->ls_mutex);   <----------------  hang 
> >>
> >> [  600.645142] INFO: task nfsd:11623 blocked for more than 120 seconds.
> >> [  600.646337]       Not tainted 4.4.0-rc2+ #332
> >> [  600.647404] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> >> [  600.648546] nfsd            D ffff880064277b80     0 11623      2 0x00000000
> >> [  600.649803]  ffff880064277b80 ffff880064278000 00000000ffffffff ffff88005dd241a8
> >> [  600.651021]  ffffffffa025e77c 0000000000000246 ffff880064277b98 ffffffff81733103
> >> [  600.652255]  ffff880063d7e100 ffff880064277ba8 ffffffff8173330e ffff880064277c28
> >> [  600.653512] Call Trace:
> >> [  600.654765]  [<ffffffffa025e77c>] ? nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> >> [  600.656084]  [<ffffffff81733103>] schedule+0x33/0x80
> >> [  600.657405]  [<ffffffff8173330e>] schedule_preempt_disabled+0xe/0x10
> >> [  600.658741]  [<ffffffff817357f5>] mutex_lock_nested+0x145/0x330
> >> [  600.660094]  [<ffffffffa025e77c>] ? nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> >> [  600.661696]  [<ffffffffa025e77c>] nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> >> [  600.663129]  [<ffffffffa025e405>] ? nfsd4_preprocess_layout_stateid+0x5/0x400 [nfsd]
> >> [  600.664558]  [<ffffffff81173b8f>] ? printk+0x56/0x72
> >> [  600.665990]  [<ffffffffa023e3ec>] nfsd4_layoutcommit+0x13c/0x200 [nfsd]
> >> [  600.667365]  [<ffffffffa023fb98>] nfsd4_proc_compound+0x388/0x660 [nfsd]
> >> [  600.668835]  [<ffffffffa022c148>] nfsd_dispatch+0xb8/0x200 [nfsd]
> >> [  600.670323]  [<ffffffffa0093d89>] svc_process_common+0x409/0x650 [sunrpc]
> >> [  600.671836]  [<ffffffffa0094e04>] svc_process+0xf4/0x190 [sunrpc]
> >> [  600.673328]  [<ffffffffa022bb05>] nfsd+0x135/0x1a0 [nfsd]
> >> [  600.674825]  [<ffffffffa022b9d5>] ? nfsd+0x5/0x1a0 [nfsd]
> >> [  600.676388]  [<ffffffffa022b9d0>] ? nfsd_destroy+0xb0/0xb0 [nfsd]
> >> [  600.677884]  [<ffffffff810a9175>] kthread+0xf5/0x110
> >> [  600.679373]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> >> [  600.680874]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
> >> [  600.682398]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> >> [  600.683893] 1 lock held by nfsd/11623:
> >> [  600.685449]  #0:  (&ls->ls_mutex){......}, at: [<ffffffffa025e77c>] nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> >> [  600.688778] Sending NMI to all CPUs:
> >> [  600.690854] NMI backtrace for cpu 0
> >> [  600.691909] CPU: 0 PID: 11 Comm: khungtaskd Not tainted 4.4.0-rc2+ #332
> >> [  600.692523] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> >> [  600.693821] task: ffff88007b900000 ti: ffff88007b8fc000 task.ti: ffff88007b8fc000
> >> [  600.694496] RIP: 0010:[<ffffffff81053aca>]  [<ffffffff81053aca>] native_write_msr_safe+0xa/0x10
> >> [  600.695185] RSP: 0018:ffff88007b8ffd70  EFLAGS: 00000046
> >> [  600.695861] RAX: 0000000000000400 RBX: 0000000000000286 RCX: 0000000000000830
> >> [  600.696539] RDX: 0000000000000000 RSI: 0000000000000400 RDI: 0000000000000830
> >> [  600.697204] RBP: ffff88007b8ffd70 R08: 0000000000000400 R09: 0000000000000000
> >> [  600.697862] R10: 00000000000000e4 R11: 0000000000000001 R12: ffff880063d7e100
> >> [  600.698513] R13: 00000000003fff3c R14: ffff880063d7e308 R15: 0000000000000004
> >> [  600.699156] FS:  0000000000000000(0000) GS:ffffffff81c27000(0000) knlGS:0000000000000000
> >> [  600.699823] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> >> [  600.700459] CR2: 00007f366afd4000 CR3: 000000005dd56000 CR4: 00000000001406f0
> >> [  600.701106] Stack:
> >> [  600.701745]  ffff88007b8ffd88 ffffffff8104a860 ffffffff81047340 ffff88007b8ffd98
> >> [  600.702404]  ffffffff8104a885 ffff88007b8ffda8 ffffffff8104735b ffff88007b8ffdd8
> >> [  600.703058]  ffffffff813723ad 0000000000000078 ffff880063d7e100 00000000003fff3c
> >> [  600.703712] Call Trace:
> >> [  600.704355]  [<ffffffff8104a860>] __x2apic_send_IPI_mask.isra.2+0x60/0x70
> >> [  600.705017]  [<ffffffff81047340>] ? setup_vector_irq+0x130/0x130
> >> [  600.705676]  [<ffffffff8104a885>] x2apic_send_IPI_mask+0x15/0x20
> >> [  600.706335]  [<ffffffff8104735b>] nmi_raise_cpu_backtrace+0x1b/0x20
> >> [  600.706989]  [<ffffffff813723ad>] nmi_trigger_all_cpu_backtrace+0x14d/0x1c0
> >> [  600.707693]  [<ffffffff810473b9>] arch_trigger_all_cpu_backtrace+0x19/0x20
> >> [  600.708362]  [<ffffffff8112c4cf>] watchdog+0x32f/0x370
> >> [  600.709031]  [<ffffffff8112c221>] ? watchdog+0x81/0x370
> >> [  600.709725]  [<ffffffff8112c1a0>] ? reset_hung_task_detector+0x20/0x20
> >> [  600.710398]  [<ffffffff810a9175>] kthread+0xf5/0x110
> >> [  600.711067]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> >> [  600.711739]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
> >> [  600.712405]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> >> [  600.713073] Code: 00 55 89 f9 48 89 e5 0f 32 45 31 c0 48 c1 e2 20 44 89 06 48 09 d0 5d c3 66 0f 1f 84 00 00 00 00 00 55 89 f0 89 f9 48 89 e5 0f 30 <31> c0 5d c3 66 90 55 89 f9 48 89 e5 0f 33 48 c1 e2 20 48 09 d0
> >> [  600.715196] Kernel panic - not syncing: hung_task: blocked tasks
> >> [  600.715889] CPU: 0 PID: 11 Comm: khungtaskd Not tainted 4.4.0-rc2+ #332
> >> [  600.716540] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> >> [  600.717910]  ffffffff81a4d19c 00000000480750be ffff88007b8ffd60 ffffffff8136dbf4
> >> [  600.718610]  ffff88007b8ffde8 ffffffff81173559 ffff880000000008 ffff88007b8ffdf8
> >> [  600.719302]  ffff88007b8ffd90 00000000480750be 0000000000000001 0000000000000001
> >> [  600.719984] Call Trace:
> >> [  600.720646]  [<ffffffff8136dbf4>] dump_stack+0x19/0x25
> >> [  600.721330]  [<ffffffff81173559>] panic+0xd3/0x212
> >> [  600.722009]  [<ffffffff8112c4db>] watchdog+0x33b/0x370
> >> [  600.722686]  [<ffffffff8112c221>] ? watchdog+0x81/0x370
> >> [  600.723213]  [<ffffffff8112c1a0>] ? reset_hung_task_detector+0x20/0x20
> >> [  600.723674]  [<ffffffff810a9175>] kthread+0xf5/0x110
> >> [  600.724107]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> >> [  600.724509]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
> >> [  600.724903]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> >>
> >> thanks,
> >> Kinglong Mee
> >>
> > 
> > This is the bigger problem, I think. The question of course is why the
> > client didn't respond to the cb request? Still, holding a mutex across
> > the callback RPC is pretty ugly and now that I think about it, I don't
> > think it's really necessary anyway. Once we've copied the stateid, we
> > aren't really changing anything else so we can let other layout ops
> > proceed.
> > 
> > Would a patch like this fix it?
> 
> Yes, it's great.
> > With this patch, everything is okay.
> 
> > 
> > -----------------------8<--------------------------
> > 
> > [PATCH] nfsd: don't hold ls_mutex across a layout recall
> > 
> > We do need to serialize layout stateid morphing operations, but we
> > currently hold the ls_mutex across a layout recall which is pretty
> > ugly. It's also unnecessary -- once we've bumped the seqid and
> > copied it, we don't need to serialize the rest of the CB_LAYOUTRECALL
> > vs. anything else. Just drop the mutex once the copy is done.
> > 
> > Fixes: cc8a55320b5f "nfsd: serialize layout stateid morphing operations"
> > Cc: stable@vger.kernel.org
> > Reported-by: Kinglong Mee <kinglongmee@gmail.com>
> > Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
> 
> Tested-by: Kinglong Mee <kinglongmee@gmail.com>

Thanks for the report and the testing.

It may be worth applying just as a stopgap, but I'm not convinced this
is right yet....

I guess I'd expect the stateid bump to be atomic with the actual change
to the layout state.  We're bumping the layout stateid on sending the
callback, and modifying the actual layout state on receive.  So after
the below patch we've left a window during which nfsd threads see the
new layout stateid but the old layout state.  I wonder what the
practical consequences of that are?

My temptation is to argue that the layout processing currently done in
the release method should be done on sending (doesn't look like it's
conditional on the result of the callback, so what are we waiting for?).
But I'm not sure that's right.  I'll go read the spec....

--b.

> 
> thanks,
> Kinglong Mee
> 
> > ---
> >  fs/nfsd/nfs4layouts.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
> > index 9ffef06b30d5..c9d6c715c0fb 100644
> > --- a/fs/nfsd/nfs4layouts.c
> > +++ b/fs/nfsd/nfs4layouts.c
> > @@ -616,6 +616,7 @@ nfsd4_cb_layout_prepare(struct nfsd4_callback *cb)
> >  
> >  	mutex_lock(&ls->ls_mutex);
> >  	nfs4_inc_and_copy_stateid(&ls->ls_recall_sid, &ls->ls_stid);
> > +	mutex_unlock(&ls->ls_mutex);
> >  }
> >  
> >  static int
> > @@ -659,7 +660,6 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb)
> >  
> >  	trace_layout_recall_release(&ls->ls_stid.sc_stateid);
> >  
> > -	mutex_unlock(&ls->ls_mutex);
> >  	nfsd4_return_all_layouts(ls, &reaplist);
> >  	nfsd4_free_layouts(&reaplist);
> >  	nfs4_put_stid(&ls->ls_stid);
> > 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-11-30 21:34       ` J. Bruce Fields
@ 2015-12-01  0:33         ` Jeff Layton
  2015-12-01  0:55           ` Trond Myklebust
  2015-12-01 11:56           ` Christoph Hellwig
  0 siblings, 2 replies; 26+ messages in thread
From: Jeff Layton @ 2015-12-01  0:33 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Kinglong Mee, Christoph Hellwig, linux-nfs

On Mon, 30 Nov 2015 16:34:20 -0500
"J. Bruce Fields" <bfields@fieldses.org> wrote:

> On Mon, Nov 30, 2015 at 10:57:07AM +0800, Kinglong Mee wrote:
> > On 11/29/2015 21:46, Jeff Layton wrote:
> > > On Sun, 29 Nov 2015 12:07:48 +0800
> > > Kinglong Mee <kinglongmee@gmail.com> wrote:
> > > 
> > >> I meet two problems with this patch,
> > >>
> > > 
> > > Ok, I was less sure of this patch than the other one. I think we will
> > > need to serialize these operations, but this may not be the best way to
> > > do it.
> > > 
> > > Bruce, if you want to back this one out for now, then I'm fine with that. It
> > > may be a bit before I can get to it.
> > > 
> > > 
> > >> 1. a dump_stack messages printed in process_one_work() at kernel/workqueue.c.
> > >>
> > >> BUG: workqueue leaked lock or atomic: kworker/u2:4/0x00000000/98#012     last function: nfsd4_run_cb_work [nfsd]
> > >> 1 lock held by kworker/u2:4/98:
> > >> #0:  (&ls->ls_mutex){......}, at: [<ffffffffa0250d34>] nfsd4_cb_layout_prepare+0x24/0x40 [nfsd]
> > >> CPU: 0 PID: 98 Comm: kworker/u2:4 Not tainted 4.4.0-rc2+ #333
> > >> Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> > >> Workqueue: nfsd4_callbacks nfsd4_run_cb_work [nfsd]
> > >> ffff8800362b9e40 000000007fe9394f ffff880036353d58 ffffffff8136dc64
> > >> ffff880036353dd8 ffffffff810a3f12 ffffffff810a3cbd 000000000000000a
> > >> ffffffffa0261d78 ffffffff82902e20 0000000000000000 ffffffffa0259241
> > >> Call Trace:
> > >> [<ffffffff8136dc64>] dump_stack+0x19/0x25
> > >> [<ffffffff810a3f12>] process_one_work+0x3c2/0x4c0
> > >> [<ffffffff810a3cbd>] ? process_one_work+0x16d/0x4c0
> > >> [<ffffffff810a405a>] worker_thread+0x4a/0x440
> > >> [<ffffffff810a4010>] ? process_one_work+0x4c0/0x4c0
> > >> [<ffffffff810a4010>] ? process_one_work+0x4c0/0x4c0
> > >> [<ffffffff810a91e5>] kthread+0xf5/0x110
> > >> [<ffffffff810a90f0>] ? kthread_create_on_node+0x240/0x240
> > >> [<ffffffff81738d0f>] ret_from_fork+0x3f/0x70
> > >> [<ffffffff810a90f0>] ? kthread_create_on_node+0x240/0x240
> > >>
> > > 
> > > 
> > > I guess this is just a general "hey you left a mutex locked" warning
> > > when finishing a workqueue job? This patch actually does want to do
> > > that, but trying to tell that to lockdep may be tricky...
> > 
> > Yes, it's just a warning.
> > But, it's terrible that the kernel prints it every time
> > nfsd4_cb_layout_prepare is called.
> > 
> > > 
> > > 
> > >> 2. a mutex race between layoutrecall and layoutcommit, 
> > >>
> > >> callback thread,
> > >> nfsd4_cb_layout_prepare 
> > >>   --->mutex_lock(&ls->ls_mutex);
> > >>
> > >> layoutcommit thread,
> > >> nfsd4_layoutcommit
> > >>   ---> nfsd4_preprocess_layout_stateid
> > >>      --> mutex_lock(&ls->ls_mutex);   <----------------  hang 
> > >>
> > >> [  600.645142] INFO: task nfsd:11623 blocked for more than 120 seconds.
> > >> [  600.646337]       Not tainted 4.4.0-rc2+ #332
> > >> [  600.647404] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > >> [  600.648546] nfsd            D ffff880064277b80     0 11623      2 0x00000000
> > >> [  600.649803]  ffff880064277b80 ffff880064278000 00000000ffffffff ffff88005dd241a8
> > >> [  600.651021]  ffffffffa025e77c 0000000000000246 ffff880064277b98 ffffffff81733103
> > >> [  600.652255]  ffff880063d7e100 ffff880064277ba8 ffffffff8173330e ffff880064277c28
> > >> [  600.653512] Call Trace:
> > >> [  600.654765]  [<ffffffffa025e77c>] ? nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> > >> [  600.656084]  [<ffffffff81733103>] schedule+0x33/0x80
> > >> [  600.657405]  [<ffffffff8173330e>] schedule_preempt_disabled+0xe/0x10
> > >> [  600.658741]  [<ffffffff817357f5>] mutex_lock_nested+0x145/0x330
> > >> [  600.660094]  [<ffffffffa025e77c>] ? nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> > >> [  600.661696]  [<ffffffffa025e77c>] nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> > >> [  600.663129]  [<ffffffffa025e405>] ? nfsd4_preprocess_layout_stateid+0x5/0x400 [nfsd]
> > >> [  600.664558]  [<ffffffff81173b8f>] ? printk+0x56/0x72
> > >> [  600.665990]  [<ffffffffa023e3ec>] nfsd4_layoutcommit+0x13c/0x200 [nfsd]
> > >> [  600.667365]  [<ffffffffa023fb98>] nfsd4_proc_compound+0x388/0x660 [nfsd]
> > >> [  600.668835]  [<ffffffffa022c148>] nfsd_dispatch+0xb8/0x200 [nfsd]
> > >> [  600.670323]  [<ffffffffa0093d89>] svc_process_common+0x409/0x650 [sunrpc]
> > >> [  600.671836]  [<ffffffffa0094e04>] svc_process+0xf4/0x190 [sunrpc]
> > >> [  600.673328]  [<ffffffffa022bb05>] nfsd+0x135/0x1a0 [nfsd]
> > >> [  600.674825]  [<ffffffffa022b9d5>] ? nfsd+0x5/0x1a0 [nfsd]
> > >> [  600.676388]  [<ffffffffa022b9d0>] ? nfsd_destroy+0xb0/0xb0 [nfsd]
> > >> [  600.677884]  [<ffffffff810a9175>] kthread+0xf5/0x110
> > >> [  600.679373]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > >> [  600.680874]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
> > >> [  600.682398]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > >> [  600.683893] 1 lock held by nfsd/11623:
> > >> [  600.685449]  #0:  (&ls->ls_mutex){......}, at: [<ffffffffa025e77c>] nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> > >> [  600.688778] Sending NMI to all CPUs:
> > >> [  600.690854] NMI backtrace for cpu 0
> > >> [  600.691909] CPU: 0 PID: 11 Comm: khungtaskd Not tainted 4.4.0-rc2+ #332
> > >> [  600.692523] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> > >> [  600.693821] task: ffff88007b900000 ti: ffff88007b8fc000 task.ti: ffff88007b8fc000
> > >> [  600.694496] RIP: 0010:[<ffffffff81053aca>]  [<ffffffff81053aca>] native_write_msr_safe+0xa/0x10
> > >> [  600.695185] RSP: 0018:ffff88007b8ffd70  EFLAGS: 00000046
> > >> [  600.695861] RAX: 0000000000000400 RBX: 0000000000000286 RCX: 0000000000000830
> > >> [  600.696539] RDX: 0000000000000000 RSI: 0000000000000400 RDI: 0000000000000830
> > >> [  600.697204] RBP: ffff88007b8ffd70 R08: 0000000000000400 R09: 0000000000000000
> > >> [  600.697862] R10: 00000000000000e4 R11: 0000000000000001 R12: ffff880063d7e100
> > >> [  600.698513] R13: 00000000003fff3c R14: ffff880063d7e308 R15: 0000000000000004
> > >> [  600.699156] FS:  0000000000000000(0000) GS:ffffffff81c27000(0000) knlGS:0000000000000000
> > >> [  600.699823] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > >> [  600.700459] CR2: 00007f366afd4000 CR3: 000000005dd56000 CR4: 00000000001406f0
> > >> [  600.701106] Stack:
> > >> [  600.701745]  ffff88007b8ffd88 ffffffff8104a860 ffffffff81047340 ffff88007b8ffd98
> > >> [  600.702404]  ffffffff8104a885 ffff88007b8ffda8 ffffffff8104735b ffff88007b8ffdd8
> > >> [  600.703058]  ffffffff813723ad 0000000000000078 ffff880063d7e100 00000000003fff3c
> > >> [  600.703712] Call Trace:
> > >> [  600.704355]  [<ffffffff8104a860>] __x2apic_send_IPI_mask.isra.2+0x60/0x70
> > >> [  600.705017]  [<ffffffff81047340>] ? setup_vector_irq+0x130/0x130
> > >> [  600.705676]  [<ffffffff8104a885>] x2apic_send_IPI_mask+0x15/0x20
> > >> [  600.706335]  [<ffffffff8104735b>] nmi_raise_cpu_backtrace+0x1b/0x20
> > >> [  600.706989]  [<ffffffff813723ad>] nmi_trigger_all_cpu_backtrace+0x14d/0x1c0
> > >> [  600.707693]  [<ffffffff810473b9>] arch_trigger_all_cpu_backtrace+0x19/0x20
> > >> [  600.708362]  [<ffffffff8112c4cf>] watchdog+0x32f/0x370
> > >> [  600.709031]  [<ffffffff8112c221>] ? watchdog+0x81/0x370
> > >> [  600.709725]  [<ffffffff8112c1a0>] ? reset_hung_task_detector+0x20/0x20
> > >> [  600.710398]  [<ffffffff810a9175>] kthread+0xf5/0x110
> > >> [  600.711067]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > >> [  600.711739]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
> > >> [  600.712405]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > >> [  600.713073] Code: 00 55 89 f9 48 89 e5 0f 32 45 31 c0 48 c1 e2 20 44 89 06 48 09 d0 5d c3 66 0f 1f 84 00 00 00 00 00 55 89 f0 89 f9 48 89 e5 0f 30 <31> c0 5d c3 66 90 55 89 f9 48 89 e5 0f 33 48 c1 e2 20 48 09 d0
> > >> [  600.715196] Kernel panic - not syncing: hung_task: blocked tasks
> > >> [  600.715889] CPU: 0 PID: 11 Comm: khungtaskd Not tainted 4.4.0-rc2+ #332
> > >> [  600.716540] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> > >> [  600.717910]  ffffffff81a4d19c 00000000480750be ffff88007b8ffd60 ffffffff8136dbf4
> > >> [  600.718610]  ffff88007b8ffde8 ffffffff81173559 ffff880000000008 ffff88007b8ffdf8
> > >> [  600.719302]  ffff88007b8ffd90 00000000480750be 0000000000000001 0000000000000001
> > >> [  600.719984] Call Trace:
> > >> [  600.720646]  [<ffffffff8136dbf4>] dump_stack+0x19/0x25
> > >> [  600.721330]  [<ffffffff81173559>] panic+0xd3/0x212
> > >> [  600.722009]  [<ffffffff8112c4db>] watchdog+0x33b/0x370
> > >> [  600.722686]  [<ffffffff8112c221>] ? watchdog+0x81/0x370
> > >> [  600.723213]  [<ffffffff8112c1a0>] ? reset_hung_task_detector+0x20/0x20
> > >> [  600.723674]  [<ffffffff810a9175>] kthread+0xf5/0x110
> > >> [  600.724107]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > >> [  600.724509]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
> > >> [  600.724903]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > >>
> > >> thanks,
> > >> Kinglong Mee
> > >>
> > > 
> > > This is the bigger problem, I think. The question of course is why the
> > > client didn't respond to the cb request? Still, holding a mutex across
> > > the callback RPC is pretty ugly and now that I think about it, I don't
> > > think it's really necessary anyway. Once we've copied the stateid, we
> > > aren't really changing anything else so we can let other layout ops
> > > proceed.
> > > 
> > > Would a patch like this fix it?
> > 
> > Yes, it's great.
> > With this patch, every thing is okay.
> > 
> > > 
> > > -----------------------8<--------------------------
> > > 
> > > [PATCH] nfsd: don't hold ls_mutex across a layout recall
> > > 
> > > We do need to serialize layout stateid morphing operations, but we
> > > currently hold the ls_mutex across a layout recall which is pretty
> > > ugly. It's also unnecessary -- once we've bumped the seqid and
> > > copied it, we don't need to serialize the rest of the CB_LAYOUTRECALL
> > > vs. anything else. Just drop the mutex once the copy is done.
> > > 
> > > Fixes: cc8a55320b5f "nfsd: serialize layout stateid morphing operations"
> > > Cc: stable@vger.kernel.org
> > > Reported-by: Kinglong Mee <kinglongmee@gmail.com>
> > > Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
> > 
> > Tested-by: Kinglong Mee <kinglongmee@gmail.com>
> 
> Thanks for the report and the testing.
> 
> It may be worth applying just as a stopgap, but I'm not convinced this
> is right yet....
> 
> I guess I'd expect the stateid bump to be atomic with the actual change
> to the layout state.  We're bumping the layout stateid on sending the
> callback, and modifying the actual layout state on receive.  So after
> the below patch we've left a window during which nfsd threads see the
> new layout stateid but the old layout state.  I wonder what the
> practical consequences of that are?
> 
> My temptation is to argue that the layout processing currently done in
> the release method should be done on sending (doesn't look like it's
> conditional on the result of the callback, so what are we waiting for?).
> But I'm not sure that's right.  I'll go read the spec....
> 
> --b.
> 

Agreed. The basic rule (AIUI) is that if you change anything in the
layout then you should bump the seqid.

But... is the layout processing in that release method supposed to be
done at all in CB_LAYOUTRECALL? The spec says:

-------------------8<-----------------------
   The client's processing for CB_LAYOUTRECALL is similar to CB_RECALL
   (recall of file delegations) in that the client responds to the
   request before actually returning layouts via the LAYOUTRETURN
   operation.  While the client responds to the CB_LAYOUTRECALL
   immediately, the operation is not considered complete (i.e.,
   considered pending) until all affected layouts are returned to the
   server via the LAYOUTRETURN operation
-------------------8<-----------------------

It doesn't seem like we ought to be tearing down layouts at that
point, but rather just notifying the client that they should be
returned. Revoking the layout while the client still might have it in
use seems like it could be problematic. Is there something I'm missing
there?

Oh, but I note that we do always set clora_changed to true...is that
what gives us carte blanche to go ahead and return the layouts? If so,
then doing that in the "prepare" op does seem like the right thing to
do...

> > 
> > thanks,
> > Kinglong Mee
> > 
> > > ---
> > >  fs/nfsd/nfs4layouts.c | 2 +-
> > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > 
> > > diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
> > > index 9ffef06b30d5..c9d6c715c0fb 100644
> > > --- a/fs/nfsd/nfs4layouts.c
> > > +++ b/fs/nfsd/nfs4layouts.c
> > > @@ -616,6 +616,7 @@ nfsd4_cb_layout_prepare(struct nfsd4_callback *cb)
> > >  
> > >  	mutex_lock(&ls->ls_mutex);
> > >  	nfs4_inc_and_copy_stateid(&ls->ls_recall_sid, &ls->ls_stid);
> > > +	mutex_unlock(&ls->ls_mutex);
> > >  }
> > >  
> > >  static int
> > > @@ -659,7 +660,6 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb)
> > >  
> > >  	trace_layout_recall_release(&ls->ls_stid.sc_stateid);
> > >  
> > > -	mutex_unlock(&ls->ls_mutex);
> > >  	nfsd4_return_all_layouts(ls, &reaplist);
> > >  	nfsd4_free_layouts(&reaplist);
> > >  	nfs4_put_stid(&ls->ls_stid);
> > > 


-- 
Jeff Layton <jlayton@poochiereds.net>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-12-01  0:33         ` Jeff Layton
@ 2015-12-01  0:55           ` Trond Myklebust
  2015-12-01 11:56           ` Christoph Hellwig
  1 sibling, 0 replies; 26+ messages in thread
From: Trond Myklebust @ 2015-12-01  0:55 UTC (permalink / raw)
  To: Jeff Layton
  Cc: J. Bruce Fields, Kinglong Mee, Christoph Hellwig, Linux NFS Mailing List

On Mon, Nov 30, 2015 at 7:33 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
>
> On Mon, 30 Nov 2015 16:34:20 -0500
> "J. Bruce Fields" <bfields@fieldses.org> wrote:
>
> > On Mon, Nov 30, 2015 at 10:57:07AM +0800, Kinglong Mee wrote:
> > > On 11/29/2015 21:46, Jeff Layton wrote:
> > > > On Sun, 29 Nov 2015 12:07:48 +0800
> > > > Kinglong Mee <kinglongmee@gmail.com> wrote:
> > > >
> > > >> I meet two problems with this patch,
> > > >>
> > > >
> > > > Ok, I was less sure of this patch than the other one. I think we will
> > > > need to serialize these operations, but this may not be the best way to
> > > > do it.
> > > >
> > > > Bruce if you want to back this one for now, then I'm fine with that. It
> > > > may be a bit before I can get to it.
> > > >
> > > >
> > > >> 1. a dump_stack messages printed in process_one_work() at kernel/workqueue.c.
> > > >>
> > > >> BUG: workqueue leaked lock or atomic: kworker/u2:4/0x00000000/98#012     last function: nfsd4_run_cb_work [nfsd]
> > > >> 1 lock held by kworker/u2:4/98:
> > > >> #0:  (&ls->ls_mutex){......}, at: [<ffffffffa0250d34>] nfsd4_cb_layout_prepare+0x24/0x40 [nfsd]
> > > >> CPU: 0 PID: 98 Comm: kworker/u2:4 Not tainted 4.4.0-rc2+ #333
> > > >> Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> > > >> Workqueue: nfsd4_callbacks nfsd4_run_cb_work [nfsd]
> > > >> ffff8800362b9e40 000000007fe9394f ffff880036353d58 ffffffff8136dc64
> > > >> ffff880036353dd8 ffffffff810a3f12 ffffffff810a3cbd 000000000000000a
> > > >> ffffffffa0261d78 ffffffff82902e20 0000000000000000 ffffffffa0259241
> > > >> Call Trace:
> > > >> [<ffffffff8136dc64>] dump_stack+0x19/0x25
> > > >> [<ffffffff810a3f12>] process_one_work+0x3c2/0x4c0
> > > >> [<ffffffff810a3cbd>] ? process_one_work+0x16d/0x4c0
> > > >> [<ffffffff810a405a>] worker_thread+0x4a/0x440
> > > >> [<ffffffff810a4010>] ? process_one_work+0x4c0/0x4c0
> > > >> [<ffffffff810a4010>] ? process_one_work+0x4c0/0x4c0
> > > >> [<ffffffff810a91e5>] kthread+0xf5/0x110
> > > >> [<ffffffff810a90f0>] ? kthread_create_on_node+0x240/0x240
> > > >> [<ffffffff81738d0f>] ret_from_fork+0x3f/0x70
> > > >> [<ffffffff810a90f0>] ? kthread_create_on_node+0x240/0x240
> > > >>
> > > >
> > > >
> > > > I guess this is just a general "hey you left a mutex locked" warning
> > > > when finishing a workqueue job? This patch actually does want to do
> > > > that, but I trying to tell that to lockdep may be tricky...
> > >
> > > Yes, it's just a warning.
> > > But, it's terrible that kernel prints it every time
> > > when calling nfsd4_cb_layout_prepare.
> > >
> > > >
> > > >
> > > >> 2. a mutex race between layoutrecall and layoutcommit,
> > > >>
> > > >> callback thread,
> > > >> nfsd4_cb_layout_prepare
> > > >>   --->mutex_lock(&ls->ls_mutex);
> > > >>
> > > >> layoutcommit thread,
> > > >> nfsd4_layoutcommit
> > > >>   ---> nfsd4_preprocess_layout_stateid
> > > >>      --> mutex_lock(&ls->ls_mutex);   <----------------  hang
> > > >>
> > > >> [  600.645142] INFO: task nfsd:11623 blocked for more than 120 seconds.
> > > >> [  600.646337]       Not tainted 4.4.0-rc2+ #332
> > > >> [  600.647404] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > >> [  600.648546] nfsd            D ffff880064277b80     0 11623      2 0x00000000
> > > >> [  600.649803]  ffff880064277b80 ffff880064278000 00000000ffffffff ffff88005dd241a8
> > > >> [  600.651021]  ffffffffa025e77c 0000000000000246 ffff880064277b98 ffffffff81733103
> > > >> [  600.652255]  ffff880063d7e100 ffff880064277ba8 ffffffff8173330e ffff880064277c28
> > > >> [  600.653512] Call Trace:
> > > >> [  600.654765]  [<ffffffffa025e77c>] ? nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> > > >> [  600.656084]  [<ffffffff81733103>] schedule+0x33/0x80
> > > >> [  600.657405]  [<ffffffff8173330e>] schedule_preempt_disabled+0xe/0x10
> > > >> [  600.658741]  [<ffffffff817357f5>] mutex_lock_nested+0x145/0x330
> > > >> [  600.660094]  [<ffffffffa025e77c>] ? nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> > > >> [  600.661696]  [<ffffffffa025e77c>] nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> > > >> [  600.663129]  [<ffffffffa025e405>] ? nfsd4_preprocess_layout_stateid+0x5/0x400 [nfsd]
> > > >> [  600.664558]  [<ffffffff81173b8f>] ? printk+0x56/0x72
> > > >> [  600.665990]  [<ffffffffa023e3ec>] nfsd4_layoutcommit+0x13c/0x200 [nfsd]
> > > >> [  600.667365]  [<ffffffffa023fb98>] nfsd4_proc_compound+0x388/0x660 [nfsd]
> > > >> [  600.668835]  [<ffffffffa022c148>] nfsd_dispatch+0xb8/0x200 [nfsd]
> > > >> [  600.670323]  [<ffffffffa0093d89>] svc_process_common+0x409/0x650 [sunrpc]
> > > >> [  600.671836]  [<ffffffffa0094e04>] svc_process+0xf4/0x190 [sunrpc]
> > > >> [  600.673328]  [<ffffffffa022bb05>] nfsd+0x135/0x1a0 [nfsd]
> > > >> [  600.674825]  [<ffffffffa022b9d5>] ? nfsd+0x5/0x1a0 [nfsd]
> > > >> [  600.676388]  [<ffffffffa022b9d0>] ? nfsd_destroy+0xb0/0xb0 [nfsd]
> > > >> [  600.677884]  [<ffffffff810a9175>] kthread+0xf5/0x110
> > > >> [  600.679373]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > > >> [  600.680874]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
> > > >> [  600.682398]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > > >> [  600.683893] 1 lock held by nfsd/11623:
> > > >> [  600.685449]  #0:  (&ls->ls_mutex){......}, at: [<ffffffffa025e77c>] nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> > > >> [  600.688778] Sending NMI to all CPUs:
> > > >> [  600.690854] NMI backtrace for cpu 0
> > > >> [  600.691909] CPU: 0 PID: 11 Comm: khungtaskd Not tainted 4.4.0-rc2+ #332
> > > >> [  600.692523] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> > > >> [  600.693821] task: ffff88007b900000 ti: ffff88007b8fc000 task.ti: ffff88007b8fc000
> > > >> [  600.694496] RIP: 0010:[<ffffffff81053aca>]  [<ffffffff81053aca>] native_write_msr_safe+0xa/0x10
> > > >> [  600.695185] RSP: 0018:ffff88007b8ffd70  EFLAGS: 00000046
> > > >> [  600.695861] RAX: 0000000000000400 RBX: 0000000000000286 RCX: 0000000000000830
> > > >> [  600.696539] RDX: 0000000000000000 RSI: 0000000000000400 RDI: 0000000000000830
> > > >> [  600.697204] RBP: ffff88007b8ffd70 R08: 0000000000000400 R09: 0000000000000000
> > > >> [  600.697862] R10: 00000000000000e4 R11: 0000000000000001 R12: ffff880063d7e100
> > > >> [  600.698513] R13: 00000000003fff3c R14: ffff880063d7e308 R15: 0000000000000004
> > > >> [  600.699156] FS:  0000000000000000(0000) GS:ffffffff81c27000(0000) knlGS:0000000000000000
> > > >> [  600.699823] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > >> [  600.700459] CR2: 00007f366afd4000 CR3: 000000005dd56000 CR4: 00000000001406f0
> > > >> [  600.701106] Stack:
> > > >> [  600.701745]  ffff88007b8ffd88 ffffffff8104a860 ffffffff81047340 ffff88007b8ffd98
> > > >> [  600.702404]  ffffffff8104a885 ffff88007b8ffda8 ffffffff8104735b ffff88007b8ffdd8
> > > >> [  600.703058]  ffffffff813723ad 0000000000000078 ffff880063d7e100 00000000003fff3c
> > > >> [  600.703712] Call Trace:
> > > >> [  600.704355]  [<ffffffff8104a860>] __x2apic_send_IPI_mask.isra.2+0x60/0x70
> > > >> [  600.705017]  [<ffffffff81047340>] ? setup_vector_irq+0x130/0x130
> > > >> [  600.705676]  [<ffffffff8104a885>] x2apic_send_IPI_mask+0x15/0x20
> > > >> [  600.706335]  [<ffffffff8104735b>] nmi_raise_cpu_backtrace+0x1b/0x20
> > > >> [  600.706989]  [<ffffffff813723ad>] nmi_trigger_all_cpu_backtrace+0x14d/0x1c0
> > > >> [  600.707693]  [<ffffffff810473b9>] arch_trigger_all_cpu_backtrace+0x19/0x20
> > > >> [  600.708362]  [<ffffffff8112c4cf>] watchdog+0x32f/0x370
> > > >> [  600.709031]  [<ffffffff8112c221>] ? watchdog+0x81/0x370
> > > >> [  600.709725]  [<ffffffff8112c1a0>] ? reset_hung_task_detector+0x20/0x20
> > > >> [  600.710398]  [<ffffffff810a9175>] kthread+0xf5/0x110
> > > >> [  600.711067]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > > >> [  600.711739]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
> > > >> [  600.712405]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > > >> [  600.713073] Code: 00 55 89 f9 48 89 e5 0f 32 45 31 c0 48 c1 e2 20 44 89 06 48 09 d0 5d c3 66 0f 1f 84 00 00 00 00 00 55 89 f0 89 f9 48 89 e5 0f 30 <31> c0 5d c3 66 90 55 89 f9 48 89 e5 0f 33 48 c1 e2 20 48 09 d0
> > > >> [  600.715196] Kernel panic - not syncing: hung_task: blocked tasks
> > > >> [  600.715889] CPU: 0 PID: 11 Comm: khungtaskd Not tainted 4.4.0-rc2+ #332
> > > >> [  600.716540] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> > > >> [  600.717910]  ffffffff81a4d19c 00000000480750be ffff88007b8ffd60 ffffffff8136dbf4
> > > >> [  600.718610]  ffff88007b8ffde8 ffffffff81173559 ffff880000000008 ffff88007b8ffdf8
> > > >> [  600.719302]  ffff88007b8ffd90 00000000480750be 0000000000000001 0000000000000001
> > > >> [  600.719984] Call Trace:
> > > >> [  600.720646]  [<ffffffff8136dbf4>] dump_stack+0x19/0x25
> > > >> [  600.721330]  [<ffffffff81173559>] panic+0xd3/0x212
> > > >> [  600.722009]  [<ffffffff8112c4db>] watchdog+0x33b/0x370
> > > >> [  600.722686]  [<ffffffff8112c221>] ? watchdog+0x81/0x370
> > > >> [  600.723213]  [<ffffffff8112c1a0>] ? reset_hung_task_detector+0x20/0x20
> > > >> [  600.723674]  [<ffffffff810a9175>] kthread+0xf5/0x110
> > > >> [  600.724107]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > > >> [  600.724509]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
> > > >> [  600.724903]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > > >>
> > > >> thanks,
> > > >> Kinglong Mee
> > > >>
> > > >
> > > > This is the bigger problem, I think. The question of course is why the
> > > > client didn't respond to the cb request? Still, holding a mutex across
> > > > the callback RPC is pretty ugly and now that I think about it, I don't
> > > > think it's really necessary anyway. Once we've copied the stateid, we
> > > > aren't really changing anything else so we can let other layout ops
> > > > proceed.
> > > >
> > > > Would a patch like this fix it?
> > >
> > > Yes, it's great.
> > > With this patch, every thing is okay.
> > >
> > > >
> > > > -----------------------8<--------------------------
> > > >
> > > > [PATCH] nfsd: don't hold ls_mutex across a layout recall
> > > >
> > > > We do need to serialize layout stateid morphing operations, but we
> > > > currently hold the ls_mutex across a layout recall which is pretty
> > > > ugly. It's also unnecessary -- once we've bumped the seqid and
> > > > copied it, we don't need to serialize the rest of the CB_LAYOUTRECALL
> > > > vs. anything else. Just drop the mutex once the copy is done.
> > > >
> > > > Fixes: cc8a55320b5f "nfsd: serialize layout stateid morphing operations"
> > > > Cc: stable@vger.kernel.org
> > > > Reported-by: Kinglong Mee <kinglongmee@gmail.com>
> > > > Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
> > >
> > > Tested-by: Kinglong Mee <kinglongmee@gmail.com>
> >
> > Thanks for the report and the testing.
> >
> > It may be worth applying just as a stopgap, but I'm not convinced this
> > is right yet....
> >
> > I guess I'd expect the stateid bump to be atomic with the actual change
> > to the layout state.  We're bumping the layout stateid on sending the
> > callback, and modifying the actual layout state on receive.  So after
> > the below patch we've left a window during which nfsd threads see the
> > new layout stateid but the old layout state.  I wonder what the
> > practical consequences of that are?
> >
> > My temptation is to argue that the layout processing currently done in
> > the release method should be done on sending (doesn't look like it's
> > conditional on the result of the callback, so what are we waiting for?).
> > But I'm not sure that's right.  I'll go read the spec....
> >
> > --b.
> >
>
> Agreed. The basic rule (AIUI) is that if you change anything in the
> layout then you should bump the seqid.
>
> But... is the layout processing in that release method supposed to be
> done at all in CB_LAYOUTRECALL? The spec says:
>
> -------------------8<-----------------------
>    The client's processing for CB_LAYOUTRECALL is similar to CB_RECALL
>    (recall of file delegations) in that the client responds to the
>    request before actually returning layouts via the LAYOUTRETURN
>    operation.  While the client responds to the CB_LAYOUTRECALL
>    immediately, the operation is not considered complete (i.e.,
>    considered pending) until all affected layouts are returned to the
>    server via the LAYOUTRETURN operation
> -------------------8<-----------------------
>
> It doesn't seem like we ought to be tearing down layouts at that
> point, but rather just notifying the client that they should be
> returned. Revoking the layout while the client still might have it in
> use seems like it could be problematic. Is there something I'm missing
> there?
>
> Oh, but I note that we do always set clora_changed to true...is that
> what gives us carte blanche to go ahead and return the layouts? If so,
> then doing that in the "prepare" op does seem like the right thing to
> do...
>

Nope... You're not allowed to just pull the plug on the client like
that. That can, and will trigger a TEST_STATEID storm for no good
reason.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-12-01  0:33         ` Jeff Layton
  2015-12-01  0:55           ` Trond Myklebust
@ 2015-12-01 11:56           ` Christoph Hellwig
  2015-12-01 22:48             ` Jeff Layton
  1 sibling, 1 reply; 26+ messages in thread
From: Christoph Hellwig @ 2015-12-01 11:56 UTC (permalink / raw)
  To: Jeff Layton; +Cc: J. Bruce Fields, Kinglong Mee, linux-nfs

On Mon, Nov 30, 2015 at 07:33:13PM -0500, Jeff Layton wrote:
> On Mon, 30 Nov 2015 16:34:20 -0500
> "J. Bruce Fields" <bfields@fieldses.org> wrote:
> 
> > On Mon, Nov 30, 2015 at 10:57:07AM +0800, Kinglong Mee wrote:
> > > On 11/29/2015 21:46, Jeff Layton wrote:
> > > > On Sun, 29 Nov 2015 12:07:48 +0800
> > > > Kinglong Mee <kinglongmee@gmail.com> wrote:
> > > > 
> > > >> I meet two problems with this patch,
> > > >>
> > > > 
> > > > Ok, I was less sure of this patch than the other one. I think we will
> > > > need to serialize these operations, but this may not be the best way to
> > > > do it.
> > > > 
> > > > Bruce if you want to back this one for now, then I'm fine with that. It
> > > > may be a bit before I can get to it.
> > > > 
> > > > 
> > > >> 1. a dump_stack messages printed in process_one_work() at kernel/workqueue.c.
> > > >>
> > > >> BUG: workqueue leaked lock or atomic: kworker/u2:4/0x00000000/98#012     last function: nfsd4_run_cb_work [nfsd]
> > > >> 1 lock held by kworker/u2:4/98:
> > > >> #0:  (&ls->ls_mutex){......}, at: [<ffffffffa0250d34>] nfsd4_cb_layout_prepare+0x24/0x40 [nfsd]
> > > >> CPU: 0 PID: 98 Comm: kworker/u2:4 Not tainted 4.4.0-rc2+ #333
> > > >> Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> > > >> Workqueue: nfsd4_callbacks nfsd4_run_cb_work [nfsd]
> > > >> ffff8800362b9e40 000000007fe9394f ffff880036353d58 ffffffff8136dc64
> > > >> ffff880036353dd8 ffffffff810a3f12 ffffffff810a3cbd 000000000000000a
> > > >> ffffffffa0261d78 ffffffff82902e20 0000000000000000 ffffffffa0259241
> > > >> Call Trace:
> > > >> [<ffffffff8136dc64>] dump_stack+0x19/0x25
> > > >> [<ffffffff810a3f12>] process_one_work+0x3c2/0x4c0
> > > >> [<ffffffff810a3cbd>] ? process_one_work+0x16d/0x4c0
> > > >> [<ffffffff810a405a>] worker_thread+0x4a/0x440
> > > >> [<ffffffff810a4010>] ? process_one_work+0x4c0/0x4c0
> > > >> [<ffffffff810a4010>] ? process_one_work+0x4c0/0x4c0
> > > >> [<ffffffff810a91e5>] kthread+0xf5/0x110
> > > >> [<ffffffff810a90f0>] ? kthread_create_on_node+0x240/0x240
> > > >> [<ffffffff81738d0f>] ret_from_fork+0x3f/0x70
> > > >> [<ffffffff810a90f0>] ? kthread_create_on_node+0x240/0x240
> > > >>
> > > > 
> > > > 
> > > > I guess this is just a general "hey you left a mutex locked" warning
> > > > when finishing a workqueue job? This patch actually does want to do
> > > > that, but I trying to tell that to lockdep may be tricky...
> > > 
> > > Yes, it's just a warning.
> > > But, it's terrible that kernel prints it every time
> > > when calling nfsd4_cb_layout_prepare.
> > > 
> > > > 
> > > > 
> > > >> 2. a mutex race between layoutrecall and layoutcommit, 
> > > >>
> > > >> callback thread,
> > > >> nfsd4_cb_layout_prepare 
> > > >>   --->mutex_lock(&ls->ls_mutex);
> > > >>
> > > >> layoutcommit thread,
> > > >> nfsd4_layoutcommit
> > > >>   ---> nfsd4_preprocess_layout_stateid
> > > >>      --> mutex_lock(&ls->ls_mutex);   <----------------  hang 
> > > >>
> > > >> [  600.645142] INFO: task nfsd:11623 blocked for more than 120 seconds.
> > > >> [  600.646337]       Not tainted 4.4.0-rc2+ #332
> > > >> [  600.647404] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > >> [  600.648546] nfsd            D ffff880064277b80     0 11623      2 0x00000000
> > > >> [  600.649803]  ffff880064277b80 ffff880064278000 00000000ffffffff ffff88005dd241a8
> > > >> [  600.651021]  ffffffffa025e77c 0000000000000246 ffff880064277b98 ffffffff81733103
> > > >> [  600.652255]  ffff880063d7e100 ffff880064277ba8 ffffffff8173330e ffff880064277c28
> > > >> [  600.653512] Call Trace:
> > > >> [  600.654765]  [<ffffffffa025e77c>] ? nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> > > >> [  600.656084]  [<ffffffff81733103>] schedule+0x33/0x80
> > > >> [  600.657405]  [<ffffffff8173330e>] schedule_preempt_disabled+0xe/0x10
> > > >> [  600.658741]  [<ffffffff817357f5>] mutex_lock_nested+0x145/0x330
> > > >> [  600.660094]  [<ffffffffa025e77c>] ? nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> > > >> [  600.661696]  [<ffffffffa025e77c>] nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> > > >> [  600.663129]  [<ffffffffa025e405>] ? nfsd4_preprocess_layout_stateid+0x5/0x400 [nfsd]
> > > >> [  600.664558]  [<ffffffff81173b8f>] ? printk+0x56/0x72
> > > >> [  600.665990]  [<ffffffffa023e3ec>] nfsd4_layoutcommit+0x13c/0x200 [nfsd]
> > > >> [  600.667365]  [<ffffffffa023fb98>] nfsd4_proc_compound+0x388/0x660 [nfsd]
> > > >> [  600.668835]  [<ffffffffa022c148>] nfsd_dispatch+0xb8/0x200 [nfsd]
> > > >> [  600.670323]  [<ffffffffa0093d89>] svc_process_common+0x409/0x650 [sunrpc]
> > > >> [  600.671836]  [<ffffffffa0094e04>] svc_process+0xf4/0x190 [sunrpc]
> > > >> [  600.673328]  [<ffffffffa022bb05>] nfsd+0x135/0x1a0 [nfsd]
> > > >> [  600.674825]  [<ffffffffa022b9d5>] ? nfsd+0x5/0x1a0 [nfsd]
> > > >> [  600.676388]  [<ffffffffa022b9d0>] ? nfsd_destroy+0xb0/0xb0 [nfsd]
> > > >> [  600.677884]  [<ffffffff810a9175>] kthread+0xf5/0x110
> > > >> [  600.679373]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > > >> [  600.680874]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
> > > >> [  600.682398]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > > >> [  600.683893] 1 lock held by nfsd/11623:
> > > >> [  600.685449]  #0:  (&ls->ls_mutex){......}, at: [<ffffffffa025e77c>] nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> > > >> [  600.688778] Sending NMI to all CPUs:
> > > >> [  600.690854] NMI backtrace for cpu 0
> > > >> [  600.691909] CPU: 0 PID: 11 Comm: khungtaskd Not tainted 4.4.0-rc2+ #332
> > > >> [  600.692523] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> > > >> [  600.693821] task: ffff88007b900000 ti: ffff88007b8fc000 task.ti: ffff88007b8fc000
> > > >> [  600.694496] RIP: 0010:[<ffffffff81053aca>]  [<ffffffff81053aca>] native_write_msr_safe+0xa/0x10
> > > >> [  600.695185] RSP: 0018:ffff88007b8ffd70  EFLAGS: 00000046
> > > >> [  600.695861] RAX: 0000000000000400 RBX: 0000000000000286 RCX: 0000000000000830
> > > >> [  600.696539] RDX: 0000000000000000 RSI: 0000000000000400 RDI: 0000000000000830
> > > >> [  600.697204] RBP: ffff88007b8ffd70 R08: 0000000000000400 R09: 0000000000000000
> > > >> [  600.697862] R10: 00000000000000e4 R11: 0000000000000001 R12: ffff880063d7e100
> > > >> [  600.698513] R13: 00000000003fff3c R14: ffff880063d7e308 R15: 0000000000000004
> > > >> [  600.699156] FS:  0000000000000000(0000) GS:ffffffff81c27000(0000) knlGS:0000000000000000
> > > >> [  600.699823] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > >> [  600.700459] CR2: 00007f366afd4000 CR3: 000000005dd56000 CR4: 00000000001406f0
> > > >> [  600.701106] Stack:
> > > >> [  600.701745]  ffff88007b8ffd88 ffffffff8104a860 ffffffff81047340 ffff88007b8ffd98
> > > >> [  600.702404]  ffffffff8104a885 ffff88007b8ffda8 ffffffff8104735b ffff88007b8ffdd8
> > > >> [  600.703058]  ffffffff813723ad 0000000000000078 ffff880063d7e100 00000000003fff3c
> > > >> [  600.703712] Call Trace:
> > > >> [  600.704355]  [<ffffffff8104a860>] __x2apic_send_IPI_mask.isra.2+0x60/0x70
> > > >> [  600.705017]  [<ffffffff81047340>] ? setup_vector_irq+0x130/0x130
> > > >> [  600.705676]  [<ffffffff8104a885>] x2apic_send_IPI_mask+0x15/0x20
> > > >> [  600.706335]  [<ffffffff8104735b>] nmi_raise_cpu_backtrace+0x1b/0x20
> > > >> [  600.706989]  [<ffffffff813723ad>] nmi_trigger_all_cpu_backtrace+0x14d/0x1c0
> > > >> [  600.707693]  [<ffffffff810473b9>] arch_trigger_all_cpu_backtrace+0x19/0x20
> > > >> [  600.708362]  [<ffffffff8112c4cf>] watchdog+0x32f/0x370
> > > >> [  600.709031]  [<ffffffff8112c221>] ? watchdog+0x81/0x370
> > > >> [  600.709725]  [<ffffffff8112c1a0>] ? reset_hung_task_detector+0x20/0x20
> > > >> [  600.710398]  [<ffffffff810a9175>] kthread+0xf5/0x110
> > > >> [  600.711067]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > > >> [  600.711739]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
> > > >> [  600.712405]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > > >> [  600.713073] Code: 00 55 89 f9 48 89 e5 0f 32 45 31 c0 48 c1 e2 20 44 89 06 48 09 d0 5d c3 66 0f 1f 84 00 00 00 00 00 55 89 f0 89 f9 48 89 e5 0f 30 <31> c0 5d c3 66 90 55 89 f9 48 89 e5 0f 33 48 c1 e2 20 48 09 d0
> > > >> [  600.715196] Kernel panic - not syncing: hung_task: blocked tasks
> > > >> [  600.715889] CPU: 0 PID: 11 Comm: khungtaskd Not tainted 4.4.0-rc2+ #332
> > > >> [  600.716540] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> > > >> [  600.717910]  ffffffff81a4d19c 00000000480750be ffff88007b8ffd60 ffffffff8136dbf4
> > > >> [  600.718610]  ffff88007b8ffde8 ffffffff81173559 ffff880000000008 ffff88007b8ffdf8
> > > >> [  600.719302]  ffff88007b8ffd90 00000000480750be 0000000000000001 0000000000000001
> > > >> [  600.719984] Call Trace:
> > > >> [  600.720646]  [<ffffffff8136dbf4>] dump_stack+0x19/0x25
> > > >> [  600.721330]  [<ffffffff81173559>] panic+0xd3/0x212
> > > >> [  600.722009]  [<ffffffff8112c4db>] watchdog+0x33b/0x370
> > > >> [  600.722686]  [<ffffffff8112c221>] ? watchdog+0x81/0x370
> > > >> [  600.723213]  [<ffffffff8112c1a0>] ? reset_hung_task_detector+0x20/0x20
> > > >> [  600.723674]  [<ffffffff810a9175>] kthread+0xf5/0x110
> > > >> [  600.724107]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > > >> [  600.724509]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
> > > >> [  600.724903]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > > >>
> > > >> thanks,
> > > >> Kinglong Mee
> > > >>
> > > > 
> > > > This is the bigger problem, I think. The question of course is why the
> > > > client didn't respond to the cb request? Still, holding a mutex across
> > > > the callback RPC is pretty ugly and now that I think about it, I don't
> > > > think it's really necessary anyway. Once we've copied the stateid, we
> > > > aren't really changing anything else so we can let other layout ops
> > > > proceed.
> > > > 
> > > > Would a patch like this fix it?
> > > 
> > > Yes, it's great.
> > > With this patch, every thing is okay.
> > > 
> > > > 
> > > > -----------------------8<--------------------------
> > > > 
> > > > [PATCH] nfsd: don't hold ls_mutex across a layout recall
> > > > 
> > > > We do need to serialize layout stateid morphing operations, but we
> > > > currently hold the ls_mutex across a layout recall which is pretty
> > > > ugly. It's also unnecessary -- once we've bumped the seqid and
> > > > copied it, we don't need to serialize the rest of the CB_LAYOUTRECALL
> > > > vs. anything else. Just drop the mutex once the copy is done.
> > > > 
> > > > Fixes: cc8a55320b5f "nfsd: serialize layout stateid morphing operations"
> > > > Cc: stable@vger.kernel.org
> > > > Reported-by: Kinglong Mee <kinglongmee@gmail.com>
> > > > Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
> > > 
> > > Tested-by: Kinglong Mee <kinglongmee@gmail.com>
> > 
> > Thanks for the report and the testing.
> > 
> > It may be worth applying just as a stopgap, but I'm not convinced this
> > is right yet....
> > 
> > I guess I'd expect the stateid bump to be atomic with the actual change
> > to the layout state.  We're bumping the layout stateid on sending the
> > callback, and modifying the actual layout state on receive.  So after
> > the below patch we've left a window during which nfsd threads see the
> > new layout stateid but the old layout state.  I wonder what the
> > practical consequences of that are?
> > 
> > My temptation is to argue that the layout processing currently done in
> > the release method should be done on sending (doesn't look like it's
> > conditional on the result of the callback, so what are we waiting for?).
> > But I'm not sure that's right.  I'll go read the spec....
> > 
> > --b.
> > 
> 
> Agreed. The basic rule (AIUI) is that if you change anything in the
> layout then you should bump the seqid.
> 
> But... is the layout processing in that release method supposed to be
> done at all in CB_LAYOUTRECALL? The spec says:
> 
> -------------------8<-----------------------
>    The client's processing for CB_LAYOUTRECALL is similar to CB_RECALL
>    (recall of file delegations) in that the client responds to the
>    request before actually returning layouts via the LAYOUTRETURN
>    operation.  While the client responds to the CB_LAYOUTRECALL
>    immediately, the operation is not considered complete (i.e.,
>    considered pending) until all affected layouts are returned to the
>    server via the LAYOUTRETURN operation
> -------------------8<-----------------------
> 
> It doesn't seem like we ought to be tearing down layouts at that
> point, but rather just notifying the client that they should be
> returned. Revoking the layout while the client still might have it in
> use seems like it could be problematic. Is there something I'm missing
> there?

Yes.  My impression based on the Linux client and the delegation
code was that we'd expect the client to either return
NFS4ERR_NOMATCHING_LAYOUT, in which case the server can reclaim the
layouts, or NFS4ERR_DELAY if it needs more time to process the layouts.

But for non-forgetful clients I wonder if returning 0 should be
interpreted the same as NFS4ERR_DELAY?  Note that we still need to
time out the client if it doesn't respond in time, so NFS4ERR_DELAY
seems better than 0, but the standard doesn't really talk about
return values other than NFS4ERR_NOMATCHING_LAYOUT.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-12-01 11:56           ` Christoph Hellwig
@ 2015-12-01 22:48             ` Jeff Layton
  2015-12-02  7:25               ` Christoph Hellwig
  0 siblings, 1 reply; 26+ messages in thread
From: Jeff Layton @ 2015-12-01 22:48 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: J. Bruce Fields, Kinglong Mee, linux-nfs

On Tue, 1 Dec 2015 12:56:00 +0100
Christoph Hellwig <hch@lst.de> wrote:

> On Mon, Nov 30, 2015 at 07:33:13PM -0500, Jeff Layton wrote:
> > On Mon, 30 Nov 2015 16:34:20 -0500
> > "J. Bruce Fields" <bfields@fieldses.org> wrote:
> > 
> > > On Mon, Nov 30, 2015 at 10:57:07AM +0800, Kinglong Mee wrote:
> > > > On 11/29/2015 21:46, Jeff Layton wrote:
> > > > > On Sun, 29 Nov 2015 12:07:48 +0800
> > > > > Kinglong Mee <kinglongmee@gmail.com> wrote:
> > > > > 
> > > > >> I meet two problems with this patch,
> > > > >>
> > > > > 
> > > > > Ok, I was less sure of this patch than the other one. I think we will
> > > > > need to serialize these operations, but this may not be the best way to
> > > > > do it.
> > > > > 
> > > > > Bruce if you want to back this one for now, then I'm fine with that. It
> > > > > may be a bit before I can get to it.
> > > > > 
> > > > > 
> > > > >> 1. a dump_stack messages printed in process_one_work() at kernel/workqueue.c.
> > > > >>
> > > > >> BUG: workqueue leaked lock or atomic: kworker/u2:4/0x00000000/98#012     last function: nfsd4_run_cb_work [nfsd]
> > > > >> 1 lock held by kworker/u2:4/98:
> > > > >> #0:  (&ls->ls_mutex){......}, at: [<ffffffffa0250d34>] nfsd4_cb_layout_prepare+0x24/0x40 [nfsd]
> > > > >> CPU: 0 PID: 98 Comm: kworker/u2:4 Not tainted 4.4.0-rc2+ #333
> > > > >> Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> > > > >> Workqueue: nfsd4_callbacks nfsd4_run_cb_work [nfsd]
> > > > >> ffff8800362b9e40 000000007fe9394f ffff880036353d58 ffffffff8136dc64
> > > > >> ffff880036353dd8 ffffffff810a3f12 ffffffff810a3cbd 000000000000000a
> > > > >> ffffffffa0261d78 ffffffff82902e20 0000000000000000 ffffffffa0259241
> > > > >> Call Trace:
> > > > >> [<ffffffff8136dc64>] dump_stack+0x19/0x25
> > > > >> [<ffffffff810a3f12>] process_one_work+0x3c2/0x4c0
> > > > >> [<ffffffff810a3cbd>] ? process_one_work+0x16d/0x4c0
> > > > >> [<ffffffff810a405a>] worker_thread+0x4a/0x440
> > > > >> [<ffffffff810a4010>] ? process_one_work+0x4c0/0x4c0
> > > > >> [<ffffffff810a4010>] ? process_one_work+0x4c0/0x4c0
> > > > >> [<ffffffff810a91e5>] kthread+0xf5/0x110
> > > > >> [<ffffffff810a90f0>] ? kthread_create_on_node+0x240/0x240
> > > > >> [<ffffffff81738d0f>] ret_from_fork+0x3f/0x70
> > > > >> [<ffffffff810a90f0>] ? kthread_create_on_node+0x240/0x240
> > > > >>
> > > > > 
> > > > > 
> > > > > I guess this is just a general "hey you left a mutex locked" warning
> > > > > when finishing a workqueue job? This patch actually does want to do
> > > > > that, but I trying to tell that to lockdep may be tricky...
> > > > 
> > > > Yes, it's just a warning.
> > > > But, it's terrible that kernel prints it every time
> > > > when calling nfsd4_cb_layout_prepare.
> > > > 
> > > > > 
> > > > > 
> > > > >> 2. a mutex race between layoutrecall and layoutcommit, 
> > > > >>
> > > > >> callback thread,
> > > > >> nfsd4_cb_layout_prepare 
> > > > >>   --->mutex_lock(&ls->ls_mutex);
> > > > >>
> > > > >> layoutcommit thread,
> > > > >> nfsd4_layoutcommit
> > > > >>   ---> nfsd4_preprocess_layout_stateid
> > > > >>      --> mutex_lock(&ls->ls_mutex);   <----------------  hang 
> > > > >>
> > > > >> [  600.645142] INFO: task nfsd:11623 blocked for more than 120 seconds.
> > > > >> [  600.646337]       Not tainted 4.4.0-rc2+ #332
> > > > >> [  600.647404] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > > > >> [  600.648546] nfsd            D ffff880064277b80     0 11623      2 0x00000000
> > > > >> [  600.649803]  ffff880064277b80 ffff880064278000 00000000ffffffff ffff88005dd241a8
> > > > >> [  600.651021]  ffffffffa025e77c 0000000000000246 ffff880064277b98 ffffffff81733103
> > > > >> [  600.652255]  ffff880063d7e100 ffff880064277ba8 ffffffff8173330e ffff880064277c28
> > > > >> [  600.653512] Call Trace:
> > > > >> [  600.654765]  [<ffffffffa025e77c>] ? nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> > > > >> [  600.656084]  [<ffffffff81733103>] schedule+0x33/0x80
> > > > >> [  600.657405]  [<ffffffff8173330e>] schedule_preempt_disabled+0xe/0x10
> > > > >> [  600.658741]  [<ffffffff817357f5>] mutex_lock_nested+0x145/0x330
> > > > >> [  600.660094]  [<ffffffffa025e77c>] ? nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> > > > >> [  600.661696]  [<ffffffffa025e77c>] nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> > > > >> [  600.663129]  [<ffffffffa025e405>] ? nfsd4_preprocess_layout_stateid+0x5/0x400 [nfsd]
> > > > >> [  600.664558]  [<ffffffff81173b8f>] ? printk+0x56/0x72
> > > > >> [  600.665990]  [<ffffffffa023e3ec>] nfsd4_layoutcommit+0x13c/0x200 [nfsd]
> > > > >> [  600.667365]  [<ffffffffa023fb98>] nfsd4_proc_compound+0x388/0x660 [nfsd]
> > > > >> [  600.668835]  [<ffffffffa022c148>] nfsd_dispatch+0xb8/0x200 [nfsd]
> > > > >> [  600.670323]  [<ffffffffa0093d89>] svc_process_common+0x409/0x650 [sunrpc]
> > > > >> [  600.671836]  [<ffffffffa0094e04>] svc_process+0xf4/0x190 [sunrpc]
> > > > >> [  600.673328]  [<ffffffffa022bb05>] nfsd+0x135/0x1a0 [nfsd]
> > > > >> [  600.674825]  [<ffffffffa022b9d5>] ? nfsd+0x5/0x1a0 [nfsd]
> > > > >> [  600.676388]  [<ffffffffa022b9d0>] ? nfsd_destroy+0xb0/0xb0 [nfsd]
> > > > >> [  600.677884]  [<ffffffff810a9175>] kthread+0xf5/0x110
> > > > >> [  600.679373]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > > > >> [  600.680874]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
> > > > >> [  600.682398]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > > > >> [  600.683893] 1 lock held by nfsd/11623:
> > > > >> [  600.685449]  #0:  (&ls->ls_mutex){......}, at: [<ffffffffa025e77c>] nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd]
> > > > >> [  600.688778] Sending NMI to all CPUs:
> > > > >> [  600.690854] NMI backtrace for cpu 0
> > > > >> [  600.691909] CPU: 0 PID: 11 Comm: khungtaskd Not tainted 4.4.0-rc2+ #332
> > > > >> [  600.692523] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> > > > >> [  600.693821] task: ffff88007b900000 ti: ffff88007b8fc000 task.ti: ffff88007b8fc000
> > > > >> [  600.694496] RIP: 0010:[<ffffffff81053aca>]  [<ffffffff81053aca>] native_write_msr_safe+0xa/0x10
> > > > >> [  600.695185] RSP: 0018:ffff88007b8ffd70  EFLAGS: 00000046
> > > > >> [  600.695861] RAX: 0000000000000400 RBX: 0000000000000286 RCX: 0000000000000830
> > > > >> [  600.696539] RDX: 0000000000000000 RSI: 0000000000000400 RDI: 0000000000000830
> > > > >> [  600.697204] RBP: ffff88007b8ffd70 R08: 0000000000000400 R09: 0000000000000000
> > > > >> [  600.697862] R10: 00000000000000e4 R11: 0000000000000001 R12: ffff880063d7e100
> > > > >> [  600.698513] R13: 00000000003fff3c R14: ffff880063d7e308 R15: 0000000000000004
> > > > >> [  600.699156] FS:  0000000000000000(0000) GS:ffffffff81c27000(0000) knlGS:0000000000000000
> > > > >> [  600.699823] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > > >> [  600.700459] CR2: 00007f366afd4000 CR3: 000000005dd56000 CR4: 00000000001406f0
> > > > >> [  600.701106] Stack:
> > > > >> [  600.701745]  ffff88007b8ffd88 ffffffff8104a860 ffffffff81047340 ffff88007b8ffd98
> > > > >> [  600.702404]  ffffffff8104a885 ffff88007b8ffda8 ffffffff8104735b ffff88007b8ffdd8
> > > > >> [  600.703058]  ffffffff813723ad 0000000000000078 ffff880063d7e100 00000000003fff3c
> > > > >> [  600.703712] Call Trace:
> > > > >> [  600.704355]  [<ffffffff8104a860>] __x2apic_send_IPI_mask.isra.2+0x60/0x70
> > > > >> [  600.705017]  [<ffffffff81047340>] ? setup_vector_irq+0x130/0x130
> > > > >> [  600.705676]  [<ffffffff8104a885>] x2apic_send_IPI_mask+0x15/0x20
> > > > >> [  600.706335]  [<ffffffff8104735b>] nmi_raise_cpu_backtrace+0x1b/0x20
> > > > >> [  600.706989]  [<ffffffff813723ad>] nmi_trigger_all_cpu_backtrace+0x14d/0x1c0
> > > > >> [  600.707693]  [<ffffffff810473b9>] arch_trigger_all_cpu_backtrace+0x19/0x20
> > > > >> [  600.708362]  [<ffffffff8112c4cf>] watchdog+0x32f/0x370
> > > > >> [  600.709031]  [<ffffffff8112c221>] ? watchdog+0x81/0x370
> > > > >> [  600.709725]  [<ffffffff8112c1a0>] ? reset_hung_task_detector+0x20/0x20
> > > > >> [  600.710398]  [<ffffffff810a9175>] kthread+0xf5/0x110
> > > > >> [  600.711067]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > > > >> [  600.711739]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
> > > > >> [  600.712405]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > > > >> [  600.713073] Code: 00 55 89 f9 48 89 e5 0f 32 45 31 c0 48 c1 e2 20 44 89 06 48 09 d0 5d c3 66 0f 1f 84 00 00 00 00 00 55 89 f0 89 f9 48 89 e5 0f 30 <31> c0 5d c3 66 90 55 89 f9 48 89 e5 0f 33 48 c1 e2 20 48 09 d0
> > > > >> [  600.715196] Kernel panic - not syncing: hung_task: blocked tasks
> > > > >> [  600.715889] CPU: 0 PID: 11 Comm: khungtaskd Not tainted 4.4.0-rc2+ #332
> > > > >> [  600.716540] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
> > > > >> [  600.717910]  ffffffff81a4d19c 00000000480750be ffff88007b8ffd60 ffffffff8136dbf4
> > > > >> [  600.718610]  ffff88007b8ffde8 ffffffff81173559 ffff880000000008 ffff88007b8ffdf8
> > > > >> [  600.719302]  ffff88007b8ffd90 00000000480750be 0000000000000001 0000000000000001
> > > > >> [  600.719984] Call Trace:
> > > > >> [  600.720646]  [<ffffffff8136dbf4>] dump_stack+0x19/0x25
> > > > >> [  600.721330]  [<ffffffff81173559>] panic+0xd3/0x212
> > > > >> [  600.722009]  [<ffffffff8112c4db>] watchdog+0x33b/0x370
> > > > >> [  600.722686]  [<ffffffff8112c221>] ? watchdog+0x81/0x370
> > > > >> [  600.723213]  [<ffffffff8112c1a0>] ? reset_hung_task_detector+0x20/0x20
> > > > >> [  600.723674]  [<ffffffff810a9175>] kthread+0xf5/0x110
> > > > >> [  600.724107]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > > > >> [  600.724509]  [<ffffffff81738ccf>] ret_from_fork+0x3f/0x70
> > > > >> [  600.724903]  [<ffffffff810a9080>] ? kthread_create_on_node+0x240/0x240
> > > > >>
> > > > >> thanks,
> > > > >> Kinglong Mee
> > > > >>
> > > > > 
> > > > > This is the bigger problem, I think. The question of course is why the
> > > > > client didn't respond to the cb request? Still, holding a mutex across
> > > > > the callback RPC is pretty ugly and now that I think about it, I don't
> > > > > think it's really necessary anyway. Once we've copied the stateid, we
> > > > > aren't really changing anything else so we can let other layout ops
> > > > > proceed.
> > > > > 
> > > > > Would a patch like this fix it?
> > > > 
> > > > Yes, it's great.
> > > > With this patch, every thing is okay.
> > > > 
> > > > > 
> > > > > -----------------------8<--------------------------
> > > > > 
> > > > > [PATCH] nfsd: don't hold ls_mutex across a layout recall
> > > > > 
> > > > > We do need to serialize layout stateid morphing operations, but we
> > > > > currently hold the ls_mutex across a layout recall which is pretty
> > > > > ugly. It's also unnecessary -- once we've bumped the seqid and
> > > > > copied it, we don't need to serialize the rest of the CB_LAYOUTRECALL
> > > > > vs. anything else. Just drop the mutex once the copy is done.
> > > > > 
> > > > > Fixes: cc8a55320b5f "nfsd: serialize layout stateid morphing operations"
> > > > > Cc: stable@vger.kernel.org
> > > > > Reported-by: Kinglong Mee <kinglongmee@gmail.com>
> > > > > Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
> > > > 
> > > > Tested-by: Kinglong Mee <kinglongmee@gmail.com>
> > > 
> > > Thanks for the report and the testing.
> > > 
> > > It may be worth applying just as a stopgap, but I'm not convinced this
> > > is right yet....
> > > 
> > > I guess I'd expect the stateid bump to be atomic with the actual change
> > > to the layout state.  We're bumping the layout stateid on sending the
> > > callback, and modifying the actual layout state on receive.  So after
> > > the below patch we've left a window during which nfsd threads see the
> > > new layout stateid but the old layout state.  I wonder what the
> > > practical consequences of that are?
> > > 
> > > My temptation is to argue that the layout processing currently done in
> > > the release method should be done on sending (doesn't look like it's
> > > conditional on the result of the callback, so what are we waiting for?).
> > > But I'm not sure that's right.  I'll go read the spec....
> > > 
> > > --b.
> > > 
> > 
> > Agreed. The basic rule (AIUI) is that if you change anything in the
> > layout then you should bump the seqid.
> > 
> > But... is the layout processing in that release method supposed to be
> > done at all in CB_LAYOUTRECALL? The spec says:
> > 
> > -------------------8<-----------------------
> >    The client's processing for CB_LAYOUTRECALL is similar to CB_RECALL
> >    (recall of file delegations) in that the client responds to the
> >    request before actually returning layouts via the LAYOUTRETURN
> >    operation.  While the client responds to the CB_LAYOUTRECALL
> >    immediately, the operation is not considered complete (i.e.,
> >    considered pending) until all affected layouts are returned to the
> >    server via the LAYOUTRETURN operation
> > -------------------8<-----------------------
> > 
> > It doesn't seem like we ought to be tearing down layouts at that
> > point, but rather just notifying the client that they should be
> > returned. Revoking the layout while the client still might have it in
> > use seems like it could be problematic. Is there something I'm missing
> > there?
> 
> Yes.  My impression based on the Linux client and the delgation
> code was that we'd expect the client to either return
> NFS4ERR_NOMATCHING_LAYOUT, in which case the server can reclaim the
> layouts, or NFS4ERR_DELAY if it needs more time to process the layouts.
> 
> But for non-forgetful clients I wonder if returning 0 should be
> interpreted the same as NFS4ERR_DELAY?  Note that we still need to
> time out the client if it doesn't respond in time, so NFS4ERR_DELAY
> seems better than 0, but the standard doesn't really talk about
> return values other than NFS4ERR_NOMATCHING_LAYOUT.

My interpretation is somewhat different. To me, this is how we'd
interpret the response from the client (pseudocode):

NFS_OK:
	/* Message received. I'll start returning these layouts soon. */
NFS4ERR_DELAY:
	/* I'm too resource constrained to even process this simple
           request right now. Please ask me again in a bit. */
NFS4ERR_NOMATCHING_LAYOUT:
	/* Huh? What layout? */

...IMO, the spec is pretty clear that a successful response from the
client just means that it got the message that it should start
returning layouts. If it happens to return anything before the cb
response, then that's just luck/coincidence. The server shouldn't count
on that.

-- 
Jeff Layton <jlayton@poochiereds.net>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-12-01 22:48             ` Jeff Layton
@ 2015-12-02  7:25               ` Christoph Hellwig
  2015-12-03 22:08                 ` J. Bruce Fields
  0 siblings, 1 reply; 26+ messages in thread
From: Christoph Hellwig @ 2015-12-02  7:25 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Christoph Hellwig, J. Bruce Fields, Kinglong Mee, linux-nfs

On Tue, Dec 01, 2015 at 05:48:00PM -0500, Jeff Layton wrote:
> > But for non-forgetful clients I wonder if returning 0 should be
> > interpreted the same as NFS4ERR_DELAY?  Note that we still need to
> > time out the client if it doesn't respond in time, so NFS4ERR_DELAY
> > seems better than 0, but the standard doesn't really talk about
> > return values other than NFS4ERR_NOMATCHING_LAYOUT.
> 
> My interpretation is somewhat different. To me, this is how we'd
> interpret the response from the client (pseudocode):
> 
> NFS_OK:
> 	/* Message received. I'll start returning these layouts soon. */
> NFS4ERR_DELAY:
> 	/* I'm too resource constrained to even process this simple
>            request right now. Please ask me again in a bit. */
> NFS4ERR_NOMATCHING_LAYOUT:
> 	/* Huh? What layout? */
> 
> ...IMO, the spec is pretty clear that a successful response from the
> client just means that it got the message that it should start
> returning layouts. If it happens to return anything before the cb
> response, then that's just luck/coincidence. The server shouldn't count
> on that.

Ok, so for 0 we should recheck if the layouts are still outstanding
before sending the next recall.  But given that we have no client
returning that or test cases I'd be tempted to treat OK like DELAY
for now - if the client is properly implemented it will eventually
return NFS4ERR_NOMATCHING_LAYOUT.  We can add a big comment on why
we're doing that so that it's obvious.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-12-02  7:25               ` Christoph Hellwig
@ 2015-12-03 22:08                 ` J. Bruce Fields
  2015-12-04  8:38                   ` Christoph Hellwig
  0 siblings, 1 reply; 26+ messages in thread
From: J. Bruce Fields @ 2015-12-03 22:08 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Jeff Layton, Kinglong Mee, linux-nfs

On Wed, Dec 02, 2015 at 08:25:04AM +0100, Christoph Hellwig wrote:
> On Tue, Dec 01, 2015 at 05:48:00PM -0500, Jeff Layton wrote:
> > > But for non-forgetful clients I wonder if returning 0 should be
> > > interpreted the same as NFS4ERR_DELAY?  Note that we still need to
> > > time out the client if it doesn't respond in time, so NFS4ERR_DELAY
> > > seems better than 0, but the standard doesn't really talk about
> > > return values other than NFS4ERR_NOMATCHING_LAYOUT.
> > 
> > My interpretation is somewhat different. To me, this is how we'd
> > interpret the response from the client (pseudocode):
> > 
> > NFS_OK:
> > 	/* Message received. I'll start returning these layouts soon. */
> > NFS4ERR_DELAY:
> > 	/* I'm too resource constrained to even process this simple
> >            request right now. Please ask me again in a bit. */
> > NFS4ERR_NOMATCHING_LAYOUT:
> > 	/* Huh? What layout? */
> > 
> > ...IMO, the spec is pretty clear that a successful response from the
> > client just means that it got the message that it should start
> > returning layouts. If it happens to return anything before the cb
> > response, then that's just luck/coincidence. The server shouldn't count
> > on that.
> 
> Ok, so for 0 we should re check if the layouts are still outstanding
> before sending the next recall.  But given that we have no client
> returning that or test cases I'd be tempted to treat OK like DELAY
> for now - if the client is properly implemented it will eventually
> return NFS4ERR_NOMATCHING_LAYOUT.  We can add a big comment on why
> we're doing that so that it's obvious.

OK, so if I understand right, the current code is letting the rpc state
machine drive the whole thing, and your proposal is that the rpc task
lasts until the client either responds NFS4ERR_NOMATCHING_LAYOUT or we
just run out of time.  (NOMATCHING_LAYOUT being the one response that
isn't either "try again" or "OK I'll get to it soon").

I understand why that would work, and that handling anything other than
the NOMATCHING_LAYOUT case is a lower priority for now, but this
approach worries me.

Is there a reason we can't do as in the delegation case, and track the
revocation timeout separately from the callback rpc?

--b.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-12-03 22:08                 ` J. Bruce Fields
@ 2015-12-04  8:38                   ` Christoph Hellwig
  2015-12-04 20:51                     ` Jeff Layton
  0 siblings, 1 reply; 26+ messages in thread
From: Christoph Hellwig @ 2015-12-04  8:38 UTC (permalink / raw)
  To: J. Bruce Fields; +Cc: Christoph Hellwig, Jeff Layton, Kinglong Mee, linux-nfs

On Thu, Dec 03, 2015 at 05:08:50PM -0500, J. Bruce Fields wrote:
> OK, so if I understand right, the current code is letting the rpc state
> machine drive the whole thing, and your proposal is that the rpc task
> lasts until the client either responds NFS4ERR_NOMATCHING_LAYOUT or we
> just run out of time.  (NOMATCHING_LAYOUT being the one response that
> isn't either "try again" or "OK I'll get to it soon").

Yes (except fatal errors would end the rpc state machine).

> I understand why that would work, and that handling anything other than
> the NOMATCHING_LAYOUT case is a lower priority for now, but this
> approach worries me.
> 
> Is there a reason we can't do as in the delegation case, and track the
> revocation timeout separately from the callback rpc?

There is no reason not to do it, except for the significant effort
to implement it as well as a synthetic test case to actually reproduce
the behavior we want to handle.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-12-04  8:38                   ` Christoph Hellwig
@ 2015-12-04 20:51                     ` Jeff Layton
  2015-12-05 12:02                       ` Christoph Hellwig
  0 siblings, 1 reply; 26+ messages in thread
From: Jeff Layton @ 2015-12-04 20:51 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: J. Bruce Fields, Kinglong Mee, linux-nfs

On Fri, 4 Dec 2015 09:38:03 +0100
Christoph Hellwig <hch@lst.de> wrote:

> On Thu, Dec 03, 2015 at 05:08:50PM -0500, J. Bruce Fields wrote:
> > OK, so if I understand right, the current code is letting the rpc state
> > machine drive the whole thing, and your proposal is that the rpc task
> > lasts until the client either responds NFS4ERR_NOMATCHING_LAYOUT or we
> > just run out of time.  (NOMATCHING_LAYOUT being the one response that
> > isn't either "try again" or "OK I'll get to it soon").
> 
> Yes (except fatal errors would end the rpc state machine).
> 
> > I understand why that would work, and that handling anything other than
> > the NOMATCHING_LAYOUT case is a lower priority for now, but this
> > approach worries me.
> > 
> > Is there a reason we can't do as in the delegation case, and track the
> > revocation timeout separately from the callback rpc?
> 
> There is no reason not to do it, except for the significant effort
> to implement it a well as a synthetic test case to actually reproduce
> the behavior we want to handle.

Could you end up livelocking here? Suppose you issue the callback and
the client returns success. He then returns the layout and gets a new
one just before the delay timer pops. We then end up recalling _that_
layout...rinse, repeat...

I'm not opposed to using the state machine to drive this, but I think
I'd like to see some mechanism to cancel any callbacks that are still
in limbo, before the server starts handing out layouts again for the
file.

-- 
Jeff Layton <jlayton@poochiereds.net>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-12-04 20:51                     ` Jeff Layton
@ 2015-12-05 12:02                       ` Christoph Hellwig
  2015-12-05 12:24                         ` Jeff Layton
  0 siblings, 1 reply; 26+ messages in thread
From: Christoph Hellwig @ 2015-12-05 12:02 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Christoph Hellwig, J. Bruce Fields, Kinglong Mee, linux-nfs

On Fri, Dec 04, 2015 at 03:51:10PM -0500, Jeff Layton wrote:
> > There is no reason not to do it, except for the significant effort
> > to implement it a well as a synthetic test case to actually reproduce
> > the behavior we want to handle.
> 
> Could you end up livelocking here? Suppose you issue the callback and
> the client returns success. He then returns the layout and gets a new
> one just before the delay timer pops. We then end up recalling _that_
> layout...rinse, repeat...

If we start allowing layoutgets before the whole range has been
returned there is a great chance for livelocks, yes.  But I don't think
we should allow layoutgets to proceed before that.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-12-05 12:02                       ` Christoph Hellwig
@ 2015-12-05 12:24                         ` Jeff Layton
  2015-12-06 13:09                           ` Jeff Layton
  2015-12-07 13:07                           ` Christoph Hellwig
  0 siblings, 2 replies; 26+ messages in thread
From: Jeff Layton @ 2015-12-05 12:24 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: J. Bruce Fields, Kinglong Mee, linux-nfs

On Sat, 5 Dec 2015 13:02:22 +0100
Christoph Hellwig <hch@lst.de> wrote:

> On Fri, Dec 04, 2015 at 03:51:10PM -0500, Jeff Layton wrote:
> > > There is no reason not to do it, except for the significant effort
> > > to implement it a well as a synthetic test case to actually reproduce
> > > the behavior we want to handle.
> > 
> > Could you end up livelocking here? Suppose you issue the callback and
> > the client returns success. He then returns the layout and gets a new
> > one just before the delay timer pops. We then end up recalling _that_
> > layout...rinse, repeat...
> 
> If we start allowing layoutgets before the whole range has been
> returned there is a great chance for livelocks, yes.  But I don't think
> we should allow layoutgets to proceed before that.

Maybe I didn't describe it well enough. I think you can still end up
looping even if you don't allow LAYOUTGETs before the entire range is
returned.

If we treat NFS4_OK and NFS4ERR_DELAY equivalently, then we're
expecting the client to eventually return NFS4ERR_NOMATCHING_LAYOUT (or
a different error) to break the cycle of retransmissions. But, HZ/100
is enough time for the client to return a layout and request a new one.
We may never see that error -- only a continual cycle of
CB_LAYOUTRECALL/LAYOUTRETURN/LAYOUTGET.

I think we need a more reliable way to break that cycle so we don't end
up looping like that. We should either cancel any active callbacks
before reallowing LAYOUTGETs, or move the timeout handling outside of
the RPC state machine (like Bruce was suggesting).

-- 
Jeff Layton <jlayton@poochiereds.net>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-12-05 12:24                         ` Jeff Layton
@ 2015-12-06 13:09                           ` Jeff Layton
  2015-12-07 13:09                             ` Christoph Hellwig
  2015-12-16 16:55                             ` J. Bruce Fields
  2015-12-07 13:07                           ` Christoph Hellwig
  1 sibling, 2 replies; 26+ messages in thread
From: Jeff Layton @ 2015-12-06 13:09 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: J. Bruce Fields, Kinglong Mee, linux-nfs

On Sat, 5 Dec 2015 07:24:09 -0500
Jeff Layton <jlayton@poochiereds.net> wrote:

> On Sat, 5 Dec 2015 13:02:22 +0100
> Christoph Hellwig <hch@lst.de> wrote:
> 
> > On Fri, Dec 04, 2015 at 03:51:10PM -0500, Jeff Layton wrote:
> > > > There is no reason not to do it, except for the significant effort
> > > > to implement it a well as a synthetic test case to actually reproduce
> > > > the behavior we want to handle.
> > > 
> > > Could you end up livelocking here? Suppose you issue the callback and
> > > the client returns success. He then returns the layout and gets a new
> > > one just before the delay timer pops. We then end up recalling _that_
> > > layout...rinse, repeat...
> > 
> > If we start allowing layoutgets before the whole range has been
> > returned there is a great chance for livelocks, yes.  But I don't think
> > we should allow layoutgets to proceed before that.
> 
> Maybe I didn't describe it well enough. I think you can still end up
> looping even if you don't allow LAYOUTGETs before the entire range is
> returned.
> 
> If we treat NFS4_OK and NFS4ERR_DELAY equivalently, then we're
> expecting the client to eventually return NFS4ERR_NOMATCHING_LAYOUT (or
> a different error) to break the cycle of retransmissions. But, HZ/100
> is enough time for the client to return a layout and request a new one.
> We may never see that error -- only a continual cycle of
> CB_LAYOUTRECALL/LAYOUTRETURN/LAYOUTGET.
> 
> I think we need a more reliable way to break that cycle so we don't end
> up looping like that. We should either cancel any active callbacks
> before reallowing LAYOUTGETs, or move the timeout handling outside of
> the RPC state machine (like Bruce was suggesting).
> 

Either way...in the near term we should probably take the patch that I
originally proposed, just to ensure that no one hits the bugs that
Kinglong hit. That does still leave some gaps in the seqid handling,
but those are preferable to the warning and deadlock.

Bruce, does that sound reasonable? I can send that patch in a separate
email if you'd prefer.

-- 
Jeff Layton <jlayton@poochiereds.net>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-12-05 12:24                         ` Jeff Layton
  2015-12-06 13:09                           ` Jeff Layton
@ 2015-12-07 13:07                           ` Christoph Hellwig
  1 sibling, 0 replies; 26+ messages in thread
From: Christoph Hellwig @ 2015-12-07 13:07 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Christoph Hellwig, J. Bruce Fields, Kinglong Mee, linux-nfs

On Sat, Dec 05, 2015 at 07:24:09AM -0500, Jeff Layton wrote:
> If we treat NFS4_OK and NFS4ERR_DELAY equivalently, then we're
> expecting the client to eventually return NFS4ERR_NOMATCHING_LAYOUT (or
> a different error) to break the cycle of retransmissions. But, HZ/100
> is enough time for the client to return a layout and request a new one.
> We may never see that error -- only a continual cycle of
> CB_LAYOUTRECALL/LAYOUTRETURN/LAYOUTGET.
> 
> I think we need a more reliable way to break that cycle so we don't end
> up looping like that. We should either cancel any active callbacks
> before reallowing LAYOUTGETs, or move the timeout handling outside of
> the RPC state machine (like Bruce was suggesting).

We block all new LAYOUTGETS as long as fi_lo_recalls is non-zero,
and we only decrement it from nfsd4_cb_layout_release.  The
way I understand the RPC state machine that means we block new LAYOUTGETS
until we have successfully finished the recall.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-12-06 13:09                           ` Jeff Layton
@ 2015-12-07 13:09                             ` Christoph Hellwig
  2015-12-07 13:28                               ` Jeff Layton
  2015-12-16 16:55                             ` J. Bruce Fields
  1 sibling, 1 reply; 26+ messages in thread
From: Christoph Hellwig @ 2015-12-07 13:09 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Christoph Hellwig, J. Bruce Fields, Kinglong Mee, linux-nfs

On Sun, Dec 06, 2015 at 08:09:54AM -0500, Jeff Layton wrote:
> Either way...in the near term we should probably take the patch that I
> originally proposed, just to ensure that no one hits the bugs that
> Kinglong hit. That does still leave some gaps in the seqid handling,
> but those are preferable to the warning and deadlock.
> 
> Bruce, does that sound reasonable? I can send that patch in a separate
> email if you'd prefer.

What is the patch you proposed?  As far as I can tell the short term
action would require two patches:

 - treat 0 like NFS4ERR_DELAY (not directly related to your patch)
 - send the old layout stateid with a recall, and only increment it
   in nfsd4_cb_layout_release when we actually change the layout state

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-12-07 13:09                             ` Christoph Hellwig
@ 2015-12-07 13:28                               ` Jeff Layton
  2015-12-07 14:17                                 ` Christoph Hellwig
  0 siblings, 1 reply; 26+ messages in thread
From: Jeff Layton @ 2015-12-07 13:28 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: J. Bruce Fields, Kinglong Mee, linux-nfs

On Mon, 7 Dec 2015 14:09:32 +0100
Christoph Hellwig <hch@lst.de> wrote:

> On Sun, Dec 06, 2015 at 08:09:54AM -0500, Jeff Layton wrote:
> > Either way...in the near term we should probably take the patch
> > that I originally proposed, just to ensure that no one hits the
> > bugs that Kinglong hit. That does still leave some gaps in the
> > seqid handling, but those are preferable to the warning and
> > deadlock.
> > 
> > Bruce, does that sound reasonable? I can send that patch in a
> > separate email if you'd prefer.
> 
> What is the patch you proposed?  As far as I can tell the short term
> action would require two patches:
> 


The one I proposed is the one earlier in the thread that just drops the
mutex once the stateid has been copied (and so no longer holds it over
the CB RPC).


>  - treat 0 like NFS4ERR_DELAY (not directly related to your patch)
>  - send the old layout stateid with a recall, and only increment it
>    in nfsd4_cb_layout_release when we actually change the layout state

My understanding is that you need to increment the seqid prior to
sending the callback. The basic idea there is that you want to ensure
that any LAYOUTGETs that were sent before the CB_LAYOUTRECALL get back
an OLD_STATEID error. RFC5661, Section 12.5.3:

    After the layout stateid is established, the server increments by
    one the value of the "seqid" in each subsequent LAYOUTGET and
    LAYOUTRETURN response, and in each CB_LAYOUTRECALL request.

> 
> We block all new LAYOUTGETS as long as fi_lo_recalls is non-zero,
> and we only only decrement it from nfsd4_cb_layout_release.  The
> way I understand the RPC state machine that means we block new
> LAYOUTGETS until we have successfully finished the recall.
>

Ok, so if you do treat 0 like NFS4ERR_DELAY then we shouldn't loop like
I was thinking. We do block LAYOUTGETs for a little longer than is
necessary, since you need to wait for the client to return
NFS4ERR_NOMATCHING_LAYOUT before the ->release op gets called but I
guess we can live with that.

-- 
Jeff Layton <jlayton@poochiereds.net>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-12-07 13:28                               ` Jeff Layton
@ 2015-12-07 14:17                                 ` Christoph Hellwig
  2015-12-07 16:12                                   ` Jeff Layton
  0 siblings, 1 reply; 26+ messages in thread
From: Christoph Hellwig @ 2015-12-07 14:17 UTC (permalink / raw)
  To: Jeff Layton; +Cc: J. Bruce Fields, Kinglong Mee, linux-nfs

On Mon, Dec 07, 2015 at 08:28:03AM -0500, Jeff Layton wrote:
> My understanding is that you need to increment the seqid when prior to
> sending the callback. The basic idea there is that you want to ensure
> that any LAYOUTGETs that were sent before the CB_LAYOUTRECALL get back
> an OLD_STATEID error. RFC5661, Section 12.5.3:
> 
>     After the layout stateid is established, the server increments by
>     one the value of the "seqid" in each subsequent LAYOUTGET and
>     LAYOUTRETURN response, and in each CB_LAYOUTRECALL request.

True.  Although I really don't see any way to make layoutrecall processing
race free that way.  I guess your patch to just drop the mutex is the
best we could do.

Note that it doesn't really matter for the current server, as we will
always recall the whole file, and thus leave no layout state for it on
the client.  But for finer-grained recalls this could become a problem.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-12-07 14:17                                 ` Christoph Hellwig
@ 2015-12-07 16:12                                   ` Jeff Layton
  2015-12-07 16:43                                     ` J. Bruce Fields
  0 siblings, 1 reply; 26+ messages in thread
From: Jeff Layton @ 2015-12-07 16:12 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: J. Bruce Fields, Kinglong Mee, linux-nfs

On Mon, 7 Dec 2015 15:17:01 +0100
Christoph Hellwig <hch@lst.de> wrote:

> On Mon, Dec 07, 2015 at 08:28:03AM -0500, Jeff Layton wrote:
> > My understanding is that you need to increment the seqid when prior to
> > sending the callback. The basic idea there is that you want to ensure
> > that any LAYOUTGETs that were sent before the CB_LAYOUTRECALL get back
> > an OLD_STATEID error. RFC5661, Section 12.5.3:
> > 
> >     After the layout stateid is established, the server increments by
> >     one the value of the "seqid" in each subsequent LAYOUTGET and
> >     LAYOUTRETURN response, and in each CB_LAYOUTRECALL request.
> 
> True.  Although I really don't see any way to make layoutrecall processing
> race free that way.  I guess your patch to just drop the mutex is the
> best we could do.
> 
> Note that it doesn't really matter for the current server, as we will
> always recall the whole file, and thus leave no layout state for it on
> the client.  But for finger grained recalls this could become a
> problem.


Just thinking out loud...

So we inc the seqid when sending the recall, and then the
layoutreturn(s) will inc it again, so that should cover those cases.

Let's suppose that the client returns 0 for the CB though, but never
actually returns the layouts. At that point I guess we need to revoke
the whole stateid, similarly to how we revoke delegations that aren't
returned.

The catch here is that the revocation timer is the same as the RPC
timeout period. It doesn't seem like there ought to be any relationship
there. It ought to be 1-2 lease periods, IIRC?
-- 
Jeff Layton <jlayton@poochiereds.net>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-12-07 16:12                                   ` Jeff Layton
@ 2015-12-07 16:43                                     ` J. Bruce Fields
  0 siblings, 0 replies; 26+ messages in thread
From: J. Bruce Fields @ 2015-12-07 16:43 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Christoph Hellwig, Kinglong Mee, linux-nfs

On Mon, Dec 07, 2015 at 11:12:26AM -0500, Jeff Layton wrote:
> On Mon, 7 Dec 2015 15:17:01 +0100
> Christoph Hellwig <hch@lst.de> wrote:
> 
> > On Mon, Dec 07, 2015 at 08:28:03AM -0500, Jeff Layton wrote:
> > > My understanding is that you need to increment the seqid when prior to
> > > sending the callback. The basic idea there is that you want to ensure
> > > that any LAYOUTGETs that were sent before the CB_LAYOUTRECALL get back
> > > an OLD_STATEID error. RFC5661, Section 12.5.3:
> > > 
> > >     After the layout stateid is established, the server increments by
> > >     one the value of the "seqid" in each subsequent LAYOUTGET and
> > >     LAYOUTRETURN response, and in each CB_LAYOUTRECALL request.
> > 
> > True.  Although I really don't see any way to make layoutrecall processing
> > race free that way.  I guess your patch to just drop the mutex is the
> > best we could do.
> > 
> > Note that it doesn't really matter for the current server, as we will
> > always recall the whole file, and thus leave no layout state for it on
> > the client.  But for finger grained recalls this could become a
> > problem.
> 
> 
> Just thinking out loud...
> 
> So we inc the seqid when sending the recall, and then the
> layoutreturn(s) will inc it again, so that should cover those cases.
> 
> Let's suppose that the client returns 0 for the CB though, but never
> actually returns the layouts. At that point I guess we need to revoke
> the whole stateid, similarly to how we revoke delegations that aren't
> returned.
> 
> The catch here is that the revocation timer is the same as the RPC
> timeout period. It doesn't seem like there ought to be any relationship
> there. It ought to be 1-2 lease periods, IIRC?

Currently it's retrying indefinitely, with a comment that it should cap
the number of retries somehow.  So we'd just do the arithmetic to pick a
number of retries that works.

I think current client behavior should prevent any of this being a
problem in practice?

But people who read the spec should have at least a fighting chance of
writing a working client.  I don't like the idea that somebody's going
to make a perfectly legal client change and suddenly the client will
fail.

As Christoph says we can end up in that situation just as easily by
writing more complicated code and not testing those cases.  And there's
probably some of that in the current sessions backchannel code.

Argh.  I guess I'll take a shot at some pynfs tests....

--b.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations
  2015-12-06 13:09                           ` Jeff Layton
  2015-12-07 13:09                             ` Christoph Hellwig
@ 2015-12-16 16:55                             ` J. Bruce Fields
  1 sibling, 0 replies; 26+ messages in thread
From: J. Bruce Fields @ 2015-12-16 16:55 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Christoph Hellwig, Kinglong Mee, linux-nfs

On Sun, Dec 06, 2015 at 08:09:54AM -0500, Jeff Layton wrote:
> On Sat, 5 Dec 2015 07:24:09 -0500
> Jeff Layton <jlayton@poochiereds.net> wrote:
> 
> > On Sat, 5 Dec 2015 13:02:22 +0100
> > Christoph Hellwig <hch@lst.de> wrote:
> > 
> > > On Fri, Dec 04, 2015 at 03:51:10PM -0500, Jeff Layton wrote:
> > > > > There is no reason not to do it, except for the significant effort
> > > > > to implement it a well as a synthetic test case to actually reproduce
> > > > > the behavior we want to handle.
> > > > 
> > > > Could you end up livelocking here? Suppose you issue the callback and
> > > > the client returns success. He then returns the layout and gets a new
> > > > one just before the delay timer pops. We then end up recalling _that_
> > > > layout...rinse, repeat...
> > > 
> > > If we start allowing layoutgets before the whole range has been
> > > returned there is a great chance for livelocks, yes.  But I don't think
> > > we should allow layoutgets to proceed before that.
> > 
> > Maybe I didn't describe it well enough. I think you can still end up
> > looping even if you don't allow LAYOUTGETs before the entire range is
> > returned.
> > 
> > If we treat NFS4_OK and NFS4ERR_DELAY equivalently, then we're
> > expecting the client to eventually return NFS4ERR_NOMATCHING_LAYOUT (or
> > a different error) to break the cycle of retransmissions. But, HZ/100
> > is enough time for the client to return a layout and request a new one.
> > We may never see that error -- only a continual cycle of
> > CB_LAYOUTRECALL/LAYOUTRETURN/LAYOUTGET.
> > 
> > I think we need a more reliable way to break that cycle so we don't end
> > up looping like that. We should either cancel any active callbacks
> > before reallowing LAYOUTGETs, or move the timeout handling outside of
> > the RPC state machine (like Bruce was suggesting).
> > 
> 
> Either way...in the near term we should probably take the patch that I
> originally proposed, just to ensure that no one hits the bugs that
> Kinglong hit. That does still leave some gaps in the seqid handling,
> but those are preferable to the warning and deadlock.
> 
> Bruce, does that sound reasonable?

Yes, I think I'll just apply the below (your patch with a couple extra
sentences in the changelog), and pass that along for 4.4 soon.

--b.

commit be20aa00c671
Author: Jeff Layton <jlayton@poochiereds.net>
Date:   Sun Nov 29 08:46:14 2015 -0500

    nfsd: don't hold ls_mutex across a layout recall
    
    We do need to serialize layout stateid morphing operations, but we
    currently hold the ls_mutex across a layout recall which is pretty
    ugly. It's also unnecessary -- once we've bumped the seqid and
    copied it, we don't need to serialize the rest of the CB_LAYOUTRECALL
    vs. anything else. Just drop the mutex once the copy is done.
    
    This was causing a "workqueue leaked lock or atomic" warning and an
    occasional deadlock.
    
    There's more work to be done here but this fixes the immediate
    regression.
    
    Fixes: cc8a55320b5f "nfsd: serialize layout stateid morphing operations"
    Cc: stable@vger.kernel.org
    Reported-by: Kinglong Mee <kinglongmee@gmail.com>
    Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
    Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 9ffef06b30d5..c9d6c715c0fb 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -616,6 +616,7 @@ nfsd4_cb_layout_prepare(struct nfsd4_callback *cb)
 
 	mutex_lock(&ls->ls_mutex);
 	nfs4_inc_and_copy_stateid(&ls->ls_recall_sid, &ls->ls_stid);
+	mutex_unlock(&ls->ls_mutex);
 }
 
 static int
@@ -659,7 +660,6 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb)
 
 	trace_layout_recall_release(&ls->ls_stid.sc_stateid);
 
-	mutex_unlock(&ls->ls_mutex);
 	nfsd4_return_all_layouts(ls, &reaplist);
 	nfsd4_free_layouts(&reaplist);
 	nfs4_put_stid(&ls->ls_stid);

^ permalink raw reply related	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2015-12-16 16:55 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-09-17 11:58 [PATCH RFC] nfsd: serialize layout stateid morphing operations Jeff Layton
2015-10-11 13:15 ` Christoph Hellwig
2015-10-11 20:51   ` Jeff Layton
2015-10-23 19:35     ` J. Bruce Fields
2015-11-29  4:07 ` Kinglong Mee
2015-11-29 13:46   ` Jeff Layton
2015-11-30  2:57     ` Kinglong Mee
2015-11-30 21:34       ` J. Bruce Fields
2015-12-01  0:33         ` Jeff Layton
2015-12-01  0:55           ` Trond Myklebust
2015-12-01 11:56           ` Christoph Hellwig
2015-12-01 22:48             ` Jeff Layton
2015-12-02  7:25               ` Christoph Hellwig
2015-12-03 22:08                 ` J. Bruce Fields
2015-12-04  8:38                   ` Christoph Hellwig
2015-12-04 20:51                     ` Jeff Layton
2015-12-05 12:02                       ` Christoph Hellwig
2015-12-05 12:24                         ` Jeff Layton
2015-12-06 13:09                           ` Jeff Layton
2015-12-07 13:09                             ` Christoph Hellwig
2015-12-07 13:28                               ` Jeff Layton
2015-12-07 14:17                                 ` Christoph Hellwig
2015-12-07 16:12                                   ` Jeff Layton
2015-12-07 16:43                                     ` J. Bruce Fields
2015-12-16 16:55                             ` J. Bruce Fields
2015-12-07 13:07                           ` Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.