* [PATCH] nvme: fix flush dependency in delete controller flow
@ 2018-04-02 12:37 Nitzan Carmi
  2018-04-05  8:54 ` Christoph Hellwig
  0 siblings, 1 reply; 19+ messages in thread
From: Nitzan Carmi @ 2018-04-02 12:37 UTC


nvme_delete_ctrl queues a work on a MEM_RECLAIM queue
(nvme_delete_wq), which eventually calls cleanup_srcu_struct,
which in turn flushes a delayed work from an !MEM_RECLAIM
queue. This is unsafe as we might trigger deadlocks under
severe memory pressure.

Fix this by moving the cleanups to a separate work over
the safe !MEM_RECLAIM system_wq.

Fixes: ed754e5dee ("nvme: track shared namespaces")
Signed-off-by: Nitzan Carmi <nitzanc@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
---
 drivers/nvme/host/core.c | 12 ++++++++++--
 drivers/nvme/host/nvme.h |  1 +
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 75f3e4c..7fc5d9d 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -362,6 +362,14 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 }
 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
 
+static void nvme_free_ns_head_work(struct work_struct *work) {
+	struct nvme_ns_head *head =
+		container_of(work, struct nvme_ns_head, free_work);
+
+	cleanup_srcu_struct(&head->srcu);
+	kfree(head);
+}
+
 static void nvme_free_ns_head(struct kref *ref)
 {
 	struct nvme_ns_head *head =
@@ -370,8 +378,7 @@ static void nvme_free_ns_head(struct kref *ref)
 	nvme_mpath_remove_disk(head);
 	ida_simple_remove(&head->subsys->ns_ida, head->instance);
 	list_del_init(&head->entry);
-	cleanup_srcu_struct(&head->srcu);
-	kfree(head);
+	queue_work(system_wq, &head->free_work);
 }
 
 static void nvme_put_ns_head(struct nvme_ns_head *head)
@@ -3099,6 +3106,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
 		goto out_free_head;
 	head->instance = ret;
 	INIT_LIST_HEAD(&head->list);
+	INIT_WORK(&head->free_work, nvme_free_ns_head_work);
 	init_srcu_struct(&head->srcu);
 	head->subsys = ctrl->subsys;
 	head->ns_id = nsid;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 178aced..f443608 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -270,6 +270,7 @@ struct nvme_ns_head {
 	spinlock_t		requeue_lock;
 	struct work_struct	requeue_work;
 #endif
+	struct work_struct	free_work;
 	struct list_head	list;
 	struct srcu_struct      srcu;
 	struct nvme_subsystem	*subsys;
-- 
1.8.2.3
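
For background, the dependency described in the commit message reduces
to the minimal sketch below (a hypothetical module with invented names,
not the nvme code).  A work item on a WQ_MEM_RECLAIM queue is
guaranteed forward progress by a rescuer thread even under memory
pressure; work on system_wq has no such guarantee, so waiting on the
latter from the former can deadlock, which is the pattern
check_flush_dependency() in the workqueue code warns about:

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_reclaim_wq;	/* WQ_MEM_RECLAIM */
static struct work_struct demo_plain_work;		/* lives on system_wq */

static void demo_plain_fn(struct work_struct *work)
{
	/* Under memory pressure this may never get a worker thread. */
}

static void demo_reclaim_fn(struct work_struct *work)
{
	/*
	 * Reclaim-guaranteed work blocking on work with no such
	 * guarantee.  In the nvme case the flush is buried inside
	 * cleanup_srcu_struct(), reached from nvme_delete_wq.
	 */
	flush_work(&demo_plain_work);
}

static DECLARE_WORK(demo_reclaim_work, demo_reclaim_fn);

static int __init demo_init(void)
{
	demo_reclaim_wq = alloc_workqueue("demo_reclaim", WQ_MEM_RECLAIM, 0);
	if (!demo_reclaim_wq)
		return -ENOMEM;
	INIT_WORK(&demo_plain_work, demo_plain_fn);
	queue_work(system_wq, &demo_plain_work);
	queue_work(demo_reclaim_wq, &demo_reclaim_work);
	return 0;
}

static void __exit demo_exit(void)
{
	flush_work(&demo_plain_work);
	destroy_workqueue(demo_reclaim_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");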


* [PATCH] nvme: fix flush dependency in delete controller flow
  2018-04-02 12:37 [PATCH] nvme: fix flush dependency in delete controller flow Nitzan Carmi
@ 2018-04-05  8:54 ` Christoph Hellwig
  2018-04-05 13:14   ` Paul E. McKenney
  0 siblings, 1 reply; 19+ messages in thread
From: Christoph Hellwig @ 2018-04-05  8:54 UTC


On Mon, Apr 02, 2018 at 12:37:27PM +0000, Nitzan Carmi wrote:
> nvme_delete_ctrl queues a work on a MEM_RECLAIM queue
> (nvme_delete_wq), which eventually calls cleanup_srcu_struct,
> which in turn flushes a delayed work from an !MEM_RECLAIM
> queue. This is unsafe as we might trigger deadlocks under
> severe memory pressure.
> 
> Fix this by moving the cleanups to a separate work over
> the safe !MEM_RECLAIM system_wq.

This seems like an odd workaround.  Not allowing
cleanup_srcu_struct() to be called from mem-reclaim workqueues looks
like a really odd API issue.

Paul, anything we can do there?

> Fixes: ed754e5dee ("nvme: track shared namespaces")
> Signed-off-by: Nitzan Carmi <nitzanc@mellanox.com>
> Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
> ---
>  drivers/nvme/host/core.c | 12 ++++++++++--
>  drivers/nvme/host/nvme.h |  1 +
>  2 files changed, 11 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 75f3e4c..7fc5d9d 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -362,6 +362,14 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
>  }
>  EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
>  
> +static void nvme_free_ns_head_work(struct work_struct *work) {
> +	struct nvme_ns_head *head =
> +		container_of(work, struct nvme_ns_head, free_work);
> +
> +	cleanup_srcu_struct(&head->srcu);
> +	kfree(head);
> +}
> +
>  static void nvme_free_ns_head(struct kref *ref)
>  {
>  	struct nvme_ns_head *head =
> @@ -370,8 +378,7 @@ static void nvme_free_ns_head(struct kref *ref)
>  	nvme_mpath_remove_disk(head);
>  	ida_simple_remove(&head->subsys->ns_ida, head->instance);
>  	list_del_init(&head->entry);
> -	cleanup_srcu_struct(&head->srcu);
> -	kfree(head);
> +	queue_work(system_wq, &head->free_work);
>  }
>  
>  static void nvme_put_ns_head(struct nvme_ns_head *head)
> @@ -3099,6 +3106,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
>  		goto out_free_head;
>  	head->instance = ret;
>  	INIT_LIST_HEAD(&head->list);
> +	INIT_WORK(&head->free_work, nvme_free_ns_head_work);
>  	init_srcu_struct(&head->srcu);
>  	head->subsys = ctrl->subsys;
>  	head->ns_id = nsid;
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 178aced..f443608 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -270,6 +270,7 @@ struct nvme_ns_head {
>  	spinlock_t		requeue_lock;
>  	struct work_struct	requeue_work;
>  #endif
> +	struct work_struct	free_work;
>  	struct list_head	list;
>  	struct srcu_struct      srcu;
>  	struct nvme_subsystem	*subsys;
> -- 
> 1.8.2.3
---end quoted text---


* [PATCH] nvme: fix flush dependency in delete controller flow
  2018-04-05  8:54 ` Christoph Hellwig
@ 2018-04-05 13:14   ` Paul E. McKenney
  2018-04-05 16:08     ` Paul E. McKenney
  0 siblings, 1 reply; 19+ messages in thread
From: Paul E. McKenney @ 2018-04-05 13:14 UTC


On Thu, Apr 05, 2018 at 10:54:38AM +0200, Christoph Hellwig wrote:
> On Mon, Apr 02, 2018 at 12:37:27PM +0000, Nitzan Carmi wrote:
> > nvme_delete_ctrl queues a work on a MEM_RECLAIM queue
> > (nvme_delete_wq), which eventually calls cleanup_srcu_struct,
> > which in turn flushes a delayed work from an !MEM_RECLAIM
> > queue. This is unsafe as we might trigger deadlocks under
> > severe memory pressure.
> > 
> > Fix this by moving the cleanups to a separate work over
> > the safe !MEM_RECLAIM system_wq.
> 
> This seems like an odd workaround.  Not allowing
> cleanup_srcu_struct() to be called from mem-reclaim workqueues looks
> like a really odd API issue.
> 
> Paul, anything we can do there?

Looks like the culprit is the need to do flush_delayed_work() from
within cleanup_srcu_struct().  If I don't do that, then the tail
end of a grace period (for example, one due to a recent call_srcu())
would find itself operating on already-freed memory.
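
Concretely, the hazard is something like this (invented structure and
names, illustrative only):

#include <linux/slab.h>
#include <linux/srcu.h>

struct foo {
	struct srcu_struct srcu;
	struct rcu_head rh;
};

static void foo_cb(struct rcu_head *rh)
{
	/* Runs from SRCU's delayed grace-period work. */
}

static void foo_teardown(struct foo *f)
{
	call_srcu(&f->srcu, &f->rh, foo_cb);
	srcu_barrier(&f->srcu);		/* waits for foo_cb itself... */
	/*
	 * ...but the delayed work that drove the grace period may still
	 * be finishing up.  The flush inside cleanup_srcu_struct() is
	 * what keeps that work from touching f->srcu after the kfree()
	 * below.
	 */
	cleanup_srcu_struct(&f->srcu);
	kfree(f);
}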

But if this SRCU use case doesn't ever invoke call_srcu(), I
-think- that we shouldn't need to do the flush_delayed_work() calls in
cleanup_srcu_struct().  I think.  It is a bit early out here, so I don't
trust myself on this one just now, and need to take another look when
fully awake.

But in the meantime, is it really the case that this SRCU use case never
ever invokes call_srcu()?

						Thanx, Paul

> > Fixes: ed754e5dee ("nvme: track shared namespaces")
> > Signed-off-by: Nitzan Carmi <nitzanc@mellanox.com>
> > Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
> > ---
> >  drivers/nvme/host/core.c | 12 ++++++++++--
> >  drivers/nvme/host/nvme.h |  1 +
> >  2 files changed, 11 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> > index 75f3e4c..7fc5d9d 100644
> > --- a/drivers/nvme/host/core.c
> > +++ b/drivers/nvme/host/core.c
> > @@ -362,6 +362,14 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
> >  }
> >  EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
> >  
> > +static void nvme_free_ns_head_work(struct work_struct *work) {
> > +	struct nvme_ns_head *head =
> > +		container_of(work, struct nvme_ns_head, free_work);
> > +
> > +	cleanup_srcu_struct(&head->srcu);
> > +	kfree(head);
> > +}
> > +
> >  static void nvme_free_ns_head(struct kref *ref)
> >  {
> >  	struct nvme_ns_head *head =
> > @@ -370,8 +378,7 @@ static void nvme_free_ns_head(struct kref *ref)
> >  	nvme_mpath_remove_disk(head);
> >  	ida_simple_remove(&head->subsys->ns_ida, head->instance);
> >  	list_del_init(&head->entry);
> > -	cleanup_srcu_struct(&head->srcu);
> > -	kfree(head);
> > +	queue_work(system_wq, &head->free_work);
> >  }
> >  
> >  static void nvme_put_ns_head(struct nvme_ns_head *head)
> > @@ -3099,6 +3106,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
> >  		goto out_free_head;
> >  	head->instance = ret;
> >  	INIT_LIST_HEAD(&head->list);
> > +	INIT_WORK(&head->free_work, nvme_free_ns_head_work);
> >  	init_srcu_struct(&head->srcu);
> >  	head->subsys = ctrl->subsys;
> >  	head->ns_id = nsid;
> > diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> > index 178aced..f443608 100644
> > --- a/drivers/nvme/host/nvme.h
> > +++ b/drivers/nvme/host/nvme.h
> > @@ -270,6 +270,7 @@ struct nvme_ns_head {
> >  	spinlock_t		requeue_lock;
> >  	struct work_struct	requeue_work;
> >  #endif
> > +	struct work_struct	free_work;
> >  	struct list_head	list;
> >  	struct srcu_struct      srcu;
> >  	struct nvme_subsystem	*subsys;
> > -- 
> > 1.8.2.3
> ---end quoted text---
> 


* [PATCH] nvme: fix flush dependency in delete controller flow
  2018-04-05 13:14   ` Paul E. McKenney
@ 2018-04-05 16:08     ` Paul E. McKenney
  2018-04-05 19:55       ` Christoph Hellwig
  0 siblings, 1 reply; 19+ messages in thread
From: Paul E. McKenney @ 2018-04-05 16:08 UTC


On Thu, Apr 05, 2018 at 06:14:26AM -0700, Paul E. McKenney wrote:
> On Thu, Apr 05, 2018 at 10:54:38AM +0200, Christoph Hellwig wrote:
> > On Mon, Apr 02, 2018 at 12:37:27PM +0000, Nitzan Carmi wrote:
> > > nvme_delete_ctrl queues a work on a MEM_RECLAIM queue
> > > (nvme_delete_wq), which eventually calls cleanup_srcu_struct,
> > > which in turn flushes a delayed work from an !MEM_RECLAIM
> > > queue. This is unsafe as we might trigger deadlocks under
> > > severe memory pressure.
> > > 
> > > Fix this by moving the cleanups to a separate work over
> > > the safe !MEM_RECLAIM system_wq.
> > 
> > This seems like an odd workaround.  Not allowing
> > cleanup_srcu_struct() to be called from mem-reclaim workqueues looks
> > like a really odd API issue.
> > 
> > Paul, anything we can do there?
> 
> Looks like the culprit is the need to do flush_delayed_work() from
> within cleanup_srcu_struct().  If I don't do that, then the tail
> end of a grace period (for example, one due to a recent call_srcu())
> would find itself operating on already-freed memory.
> 
> But if this SRCU use case doesn't ever invoke call_srcu(), I
> -think- that we shouldn't need to do the flush_delayed_work() calls in
> cleanup_srcu_struct().  I think.  It is a bit early out here, so I don't
> trust myself on this one just now, and need to take another look when
> fully awake.
> 
> But in the meantime, is it really the case that this SRCU use case never
> ever invokes call_srcu()?

And I can do a bit better than this.  I could provide something like a
cleanup_srcu_struct_quiesced() that would avoid blocking if the caller had
quiesced it.  Here, to quiesce is to make sure that any synchronize_srcu()
or synchronize_srcu_expedited() calls have returned before the call to
cleanup_srcu_struct_quiesced(), and that any callbacks from call_srcu()
have completed.  Of course, srcu_barrier() could be used to wait for
the callbacks from earlier call_srcu().
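
The intended calling convention would then be roughly as follows
(hedged sketch; "struct x" is a hypothetical object embedding an
srcu_struct):

#include <linux/slab.h>
#include <linux/srcu.h>

struct x {
	struct srcu_struct srcu;
};

static void x_free(struct x *x)
{
	/*
	 * Quiesced: the last synchronize_srcu(&x->srcu) has returned,
	 * and, had call_srcu() been used, srcu_barrier(&x->srcu) has
	 * returned as well.  The cleanup below is then guaranteed not
	 * to flush or block, making it safe even on a MEM_RECLAIM
	 * workqueue.
	 */
	cleanup_srcu_struct_quiesced(&x->srcu);
	kfree(x);
}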

Would something like this help?

						Thanx, Paul

> > > Fixes: ed754e5dee ("nvme: track shared namespaces")
> > > Signed-off-by: Nitzan Carmi <nitzanc@mellanox.com>
> > > Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
> > > ---
> > >  drivers/nvme/host/core.c | 12 ++++++++++--
> > >  drivers/nvme/host/nvme.h |  1 +
> > >  2 files changed, 11 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> > > index 75f3e4c..7fc5d9d 100644
> > > --- a/drivers/nvme/host/core.c
> > > +++ b/drivers/nvme/host/core.c
> > > @@ -362,6 +362,14 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
> > >  }
> > >  EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
> > >  
> > > +static void nvme_free_ns_head_work(struct work_struct *work) {
> > > +	struct nvme_ns_head *head =
> > > +		container_of(work, struct nvme_ns_head, free_work);
> > > +
> > > +	cleanup_srcu_struct(&head->srcu);
> > > +	kfree(head);
> > > +}
> > > +
> > >  static void nvme_free_ns_head(struct kref *ref)
> > >  {
> > >  	struct nvme_ns_head *head =
> > > @@ -370,8 +378,7 @@ static void nvme_free_ns_head(struct kref *ref)
> > >  	nvme_mpath_remove_disk(head);
> > >  	ida_simple_remove(&head->subsys->ns_ida, head->instance);
> > >  	list_del_init(&head->entry);
> > > -	cleanup_srcu_struct(&head->srcu);
> > > -	kfree(head);
> > > +	queue_work(system_wq, &head->free_work);
> > >  }
> > >  
> > >  static void nvme_put_ns_head(struct nvme_ns_head *head)
> > > @@ -3099,6 +3106,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
> > >  		goto out_free_head;
> > >  	head->instance = ret;
> > >  	INIT_LIST_HEAD(&head->list);
> > > +	INIT_WORK(&head->free_work, nvme_free_ns_head_work);
> > >  	init_srcu_struct(&head->srcu);
> > >  	head->subsys = ctrl->subsys;
> > >  	head->ns_id = nsid;
> > > diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> > > index 178aced..f443608 100644
> > > --- a/drivers/nvme/host/nvme.h
> > > +++ b/drivers/nvme/host/nvme.h
> > > @@ -270,6 +270,7 @@ struct nvme_ns_head {
> > >  	spinlock_t		requeue_lock;
> > >  	struct work_struct	requeue_work;
> > >  #endif
> > > +	struct work_struct	free_work;
> > >  	struct list_head	list;
> > >  	struct srcu_struct      srcu;
> > >  	struct nvme_subsystem	*subsys;
> > > -- 
> > > 1.8.2.3
> > ---end quoted text---
> > 


* [PATCH] nvme: fix flush dependency in delete controller flow
  2018-04-05 16:08     ` Paul E. McKenney
@ 2018-04-05 19:55       ` Christoph Hellwig
  2018-04-06  0:18         ` Paul E. McKenney
  0 siblings, 1 reply; 19+ messages in thread
From: Christoph Hellwig @ 2018-04-05 19:55 UTC


On Thu, Apr 05, 2018 at 09:08:52AM -0700, Paul E. McKenney wrote:
> > But if this SRCU use case doesn't ever invoke call_srcu(), I
> > -think- that we shouldn't need to do the flush_delayed_work() calls in
> > cleanup_srcu_struct().  I think.  It is a bit early out here, so I don't
> > trust myself on this one just now, and need to take another look when
> > fully awake.
> > 
> > But in the meantime, is it really the case that this SRCU use case never
> > ever invokes call_srcu()?

We never use call_srcu().

> And I can do a bit better than this.  I could provide something like a
> cleanup_srcu_struct_quiesced() that would avoid blocking if the caller had
> quiesced it.  Here, to quiesce is to make sure that any synchronize_srcu()
> or synchronize_srcu_expedited() calls have returned before the call to
> cleanup_srcu_struct_quiesced(), and that any callbacks from call_srcu()
> have completed.  Of course, srcu_barrier() could be used to wait for
> the callbacks from earlier call_srcu().
> 
> Would something like this help?

Yes.  Note that blocking isn't even the problem, just the flush_work.
But not blocking at all would be fine of course.

The pattern in nvme is that we have a struct nvme_ns_head, which has a
list of struct nvme_ns structures hanging off it.  We use SRCU to protect
lookups in said list.  We call synchronize_srcu after removing a nvme_ns
from the list in struct nvme_ns_head and nowhere else.  A struct
nvme_ns_head is freed (and we thus call cleanup_srcu_struct) once the
reference count on it hits zero, which can only happen after the last
nvme_ns has been removed from the list.
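
A hedged sketch of that pattern (simplified from the description above,
not the driver source; demo_path_usable() is a hypothetical predicate):

static struct nvme_ns *demo_find_path(struct nvme_ns_head *head,
		int *srcu_idx)
{
	struct nvme_ns *ns;

	/* Reader: walk head->list under SRCU; caller unlocks after I/O. */
	*srcu_idx = srcu_read_lock(&head->srcu);
	list_for_each_entry_rcu(ns, &head->list, siblings)
		if (demo_path_usable(ns))
			return ns;
	return NULL;
}

static void demo_remove_ns(struct nvme_ns *ns)
{
	/* Writer: unlink, then wait out every SRCU reader. */
	list_del_rcu(&ns->siblings);
	synchronize_srcu(&ns->head->srcu);
}

Once the last nvme_ns is gone and the head's refcount drops to zero,
no reader can still be inside head->srcu, so the final
cleanup_srcu_struct() never actually has anything left to wait for.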

I suspect this might be a common pattern in other parts of the kernel as
well.


* [PATCH] nvme: fix flush dependency in delete controller flow
  2018-04-05 19:55       ` Christoph Hellwig
@ 2018-04-06  0:18         ` Paul E. McKenney
  2018-04-06  6:19           ` Christoph Hellwig
  0 siblings, 1 reply; 19+ messages in thread
From: Paul E. McKenney @ 2018-04-06  0:18 UTC


On Thu, Apr 05, 2018 at 09:55:12PM +0200, Christoph Hellwig wrote:
> On Thu, Apr 05, 2018 at 09:08:52AM -0700, Paul E. McKenney wrote:
> > > But if this SRCU use case doesn't ever invoke call_srcu(), I
> > > -think- that we shouldn't need to do the flush_delayed_work() calls in
> > > cleanup_srcu_struct().  I think.  It is a bit early out here, so I don't
> > > trust myself on this one just now, and need to take another look when
> > > fully awake.
> > > 
> > > But in the meantime, is it really the case that this SRCU use case never
> > > ever invokes call_srcu()?
> 
> We never use call_srcu().
> 
> > And I can do a bit better than this.  I could provide something like a
> > cleanup_srcu_struct_quiesced() that would avoid blocking if the caller had
> > quiesced it.  Here, to quiesce is to make sure that any synchronize_srcu()
> > or synchronize_srcu_expedited() calls have returned before the call to
> > cleanup_srcu_struct_quiesced(), and that any callbacks from call_srcu()
> > have completed.  Of course, srcu_barrier() could be used to wait for
> > the callbacks from earlier call_srcu().
> > 
> > Would something like this help?
> 
> Yes.  Note that blocking isn't even the problem, just the flush_work.
> But not blocking at all would be fine of course.
> 
> The pattern in nvme is that we have a struct nvme_ns_head, which has a
> list of struct nvme_ns structures hanging off it.  We use SRCU to protect
> lookups in said list.  We call synchronize_srcu after removing a nvme_ns
> from the list in struct nvme_ns_head and nowhere else.  A struct
> nvme_ns_head is freed (and we thus call cleanup_srcu_struct) once the
> reference count on it hits zero, which can only happen after the last
> nvme_ns has been removed from the list.
> 
> I suspect this might be a common pattern in other parts of the kernel as
> well.

OK, how does the following (untested) patch look?

							Thanx, Paul

------------------------------------------------------------------------

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 33c1c698df09..91494d7e8e41 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -69,11 +69,45 @@ struct srcu_struct { };
 
 void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
 		void (*func)(struct rcu_head *head));
-void cleanup_srcu_struct(struct srcu_struct *sp);
+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced);
 int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
 void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
 void synchronize_srcu(struct srcu_struct *sp);
 
+/**
+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
+ * @sp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.
+ */
+static inline void cleanup_srcu_struct(struct srcu_struct *sp)
+{
+	_cleanup_srcu_struct(sp, false);
+}
+
+/**
+ * cleanup_srcu_struct_quiesced - deconstruct a quiesced sleep-RCU structure
+ * @sp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.  Also,
+ * all grace-period processing must have completed.
+ *
+ * "Completed" means that the last synchronize_srcu() and
+ * synchronize_srcu_expedited() calls must have returned before the call
+ * to cleanup_srcu_struct_quiesced().  It also means that the callback
+ * from the last call_srcu() must have been invoked before the call to
+ * cleanup_srcu_struct_quiesced(), but you can use srcu_barrier() to help
+ * with this last.  Violating these rules will get you a WARN_ON() splat
+ * (with high probability, anyway), and will also cause the srcu_struct
+ * to be leaked.
+ */
+static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *sp)
+{
+	_cleanup_srcu_struct(sp, true);
+}
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
 /**
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 680c96d8c00f..f0e1d44459f8 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -593,7 +593,12 @@ static void srcu_torture_init(void)
 
 static void srcu_torture_cleanup(void)
 {
-	cleanup_srcu_struct(&srcu_ctld);
+	static DEFINE_TORTURE_RANDOM(rand);
+
+	if (torture_random(&rand) & 0x800)
+		cleanup_srcu_struct(&srcu_ctld);
+	else
+		cleanup_srcu_struct_quiesced(&srcu_ctld);
 	srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
 }
 
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 76ac5f50b2c7..622792abe41a 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -86,16 +86,19 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
  * Must invoke this after you are finished using a given srcu_struct that
  * was initialized via init_srcu_struct(), else you leak memory.
  */
-void cleanup_srcu_struct(struct srcu_struct *sp)
+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
 {
 	WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
-	flush_work(&sp->srcu_work);
+	if (quiesced)
+		WARN_ON(work_pending(&sp->srcu_work));
+	else
+		flush_work(&sp->srcu_work);
 	WARN_ON(sp->srcu_gp_running);
 	WARN_ON(sp->srcu_gp_waiting);
 	WARN_ON(sp->srcu_cb_head);
 	WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail);
 }
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
 
 /*
  * Removes the count for the old reader from the appropriate element of
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index fb560fca9ef4..b4123d7a2cec 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -366,24 +366,28 @@ static unsigned long srcu_get_delay(struct srcu_struct *sp)
 	return SRCU_INTERVAL;
 }
 
-/**
- * cleanup_srcu_struct - deconstruct a sleep-RCU structure
- * @sp: structure to clean up.
- *
- * Must invoke this after you are finished using a given srcu_struct that
- * was initialized via init_srcu_struct(), else you leak memory.
- */
-void cleanup_srcu_struct(struct srcu_struct *sp)
+/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */
+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
 {
 	int cpu;
 
 	if (WARN_ON(!srcu_get_delay(sp)))
-		return; /* Leakage unless caller handles error. */
+		return; /* Just leak it! */
 	if (WARN_ON(srcu_readers_active(sp)))
-		return; /* Leakage unless caller handles error. */
-	flush_delayed_work(&sp->work);
+		return; /* Just leak it! */
+	if (quiesced) {
+		if (WARN_ON(delayed_work_pending(&sp->work)))
+			return; /* Just leak it! */
+	} else {
+		flush_delayed_work(&sp->work);
+	}
 	for_each_possible_cpu(cpu)
-		flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
+		if (quiesced) {
+			if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work)))
+				return; /* Just leak it! */
+		} else {
+			flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
+		}
 	if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
 	    WARN_ON(srcu_readers_active(sp))) {
 		pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
@@ -392,7 +396,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
 	free_percpu(sp->sda);
 	sp->sda = NULL;
 }
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
 
 /*
  * Counts the new reader in the appropriate per-CPU element of the


* [PATCH] nvme: fix flush dependency in delete controller flow
  2018-04-06  0:18         ` Paul E. McKenney
@ 2018-04-06  6:19           ` Christoph Hellwig
  2018-04-06 16:30             ` Paul E. McKenney
  0 siblings, 1 reply; 19+ messages in thread
From: Christoph Hellwig @ 2018-04-06  6:19 UTC


On Thu, Apr 05, 2018 at 05:18:11PM -0700, Paul E. McKenney wrote:
> OK, how does the following (untested) patch look?

Looks sensible to me.  Nitzan, can you test it with the obvious nvme
conversion to cleanup_srcu_struct_quiesced?
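
Presumably that conversion is just a one-line switch in the free path
from the original patch earlier in the thread (hedged sketch,
untested):

static void nvme_free_ns_head(struct kref *ref)
{
	struct nvme_ns_head *head =
		container_of(ref, struct nvme_ns_head, ref);

	nvme_mpath_remove_disk(head);
	ida_simple_remove(&head->subsys->ns_ida, head->instance);
	list_del_init(&head->entry);
	/*
	 * Quiesced: every synchronize_srcu(&head->srcu) has returned by
	 * the time the refcount hits zero, and nvme never uses
	 * call_srcu(), so this cannot block and is safe from the
	 * MEM_RECLAIM nvme_delete_wq.
	 */
	cleanup_srcu_struct_quiesced(&head->srcu);
	kfree(head);
}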

> 
> 							Thanx, Paul
> 
> ------------------------------------------------------------------------
> 
> diff --git a/include/linux/srcu.h b/include/linux/srcu.h
> index 33c1c698df09..91494d7e8e41 100644
> --- a/include/linux/srcu.h
> +++ b/include/linux/srcu.h
> @@ -69,11 +69,45 @@ struct srcu_struct { };
>  
>  void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
>  		void (*func)(struct rcu_head *head));
> -void cleanup_srcu_struct(struct srcu_struct *sp);
> +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced);
>  int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
>  void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
>  void synchronize_srcu(struct srcu_struct *sp);
>  
> +/**
> + * cleanup_srcu_struct - deconstruct a sleep-RCU structure
> + * @sp: structure to clean up.
> + *
> + * Must invoke this after you are finished using a given srcu_struct that
> + * was initialized via init_srcu_struct(), else you leak memory.
> + */
> +static inline void cleanup_srcu_struct(struct srcu_struct *sp)
> +{
> +	_cleanup_srcu_struct(sp, false);
> +}
> +
> +/**
> + * cleanup_srcu_struct_quiesced - deconstruct a quiesced sleep-RCU structure
> + * @sp: structure to clean up.
> + *
> + * Must invoke this after you are finished using a given srcu_struct that
> + * was initialized via init_srcu_struct(), else you leak memory.  Also,
> + * all grace-period processing must have completed.
> + *
> + * "Completed" means that the last synchronize_srcu() and
> + * synchronize_srcu_expedited() calls must have returned before the call
> + * to cleanup_srcu_struct_quiesced().  It also means that the callback
> + * from the last call_srcu() must have been invoked before the call to
> + * cleanup_srcu_struct_quiesced(), but you can use srcu_barrier() to help
> + * with this last.  Violating these rules will get you a WARN_ON() splat
> + * (with high probability, anyway), and will also cause the srcu_struct
> + * to be leaked.
> + */
> +static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *sp)
> +{
> +	_cleanup_srcu_struct(sp, true);
> +}
> +
>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
>  
>  /**
> diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
> index 680c96d8c00f..f0e1d44459f8 100644
> --- a/kernel/rcu/rcutorture.c
> +++ b/kernel/rcu/rcutorture.c
> @@ -593,7 +593,12 @@ static void srcu_torture_init(void)
>  
>  static void srcu_torture_cleanup(void)
>  {
> -	cleanup_srcu_struct(&srcu_ctld);
> +	static DEFINE_TORTURE_RANDOM(rand);
> +
> +	if (torture_random(&rand) & 0x800)
> +		cleanup_srcu_struct(&srcu_ctld);
> +	else
> +		cleanup_srcu_struct_quiesced(&srcu_ctld);
>  	srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
>  }
>  
> diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
> index 76ac5f50b2c7..622792abe41a 100644
> --- a/kernel/rcu/srcutiny.c
> +++ b/kernel/rcu/srcutiny.c
> @@ -86,16 +86,19 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
>   * Must invoke this after you are finished using a given srcu_struct that
>   * was initialized via init_srcu_struct(), else you leak memory.
>   */
> -void cleanup_srcu_struct(struct srcu_struct *sp)
> +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
>  {
>  	WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
> -	flush_work(&sp->srcu_work);
> +	if (quiesced)
> +		WARN_ON(work_pending(&sp->srcu_work));
> +	else
> +		flush_work(&sp->srcu_work);
>  	WARN_ON(sp->srcu_gp_running);
>  	WARN_ON(sp->srcu_gp_waiting);
>  	WARN_ON(sp->srcu_cb_head);
>  	WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail);
>  }
> -EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
> +EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
>  
>  /*
>   * Removes the count for the old reader from the appropriate element of
> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> index fb560fca9ef4..b4123d7a2cec 100644
> --- a/kernel/rcu/srcutree.c
> +++ b/kernel/rcu/srcutree.c
> @@ -366,24 +366,28 @@ static unsigned long srcu_get_delay(struct srcu_struct *sp)
>  	return SRCU_INTERVAL;
>  }
>  
> -/**
> - * cleanup_srcu_struct - deconstruct a sleep-RCU structure
> - * @sp: structure to clean up.
> - *
> - * Must invoke this after you are finished using a given srcu_struct that
> - * was initialized via init_srcu_struct(), else you leak memory.
> - */
> -void cleanup_srcu_struct(struct srcu_struct *sp)
> +/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */
> +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
>  {
>  	int cpu;
>  
>  	if (WARN_ON(!srcu_get_delay(sp)))
> -		return; /* Leakage unless caller handles error. */
> +		return; /* Just leak it! */
>  	if (WARN_ON(srcu_readers_active(sp)))
> -		return; /* Leakage unless caller handles error. */
> -	flush_delayed_work(&sp->work);
> +		return; /* Just leak it! */
> +	if (quiesced) {
> +		if (WARN_ON(delayed_work_pending(&sp->work)))
> +			return; /* Just leak it! */
> +	} else {
> +		flush_delayed_work(&sp->work);
> +	}
>  	for_each_possible_cpu(cpu)
> -		flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
> +		if (quiesced) {
> +			if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work)))
> +				return; /* Just leak it! */
> +		} else {
> +			flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
> +		}
>  	if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
>  	    WARN_ON(srcu_readers_active(sp))) {
>  		pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
> @@ -392,7 +396,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
>  	free_percpu(sp->sda);
>  	sp->sda = NULL;
>  }
> -EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
> +EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
>  
>  /*
>   * Counts the new reader in the appropriate per-CPU element of the
---end quoted text---


* [PATCH] nvme: fix flush dependency in delete controller flow
  2018-04-06  6:19           ` Christoph Hellwig
@ 2018-04-06 16:30             ` Paul E. McKenney
  2018-04-08  7:20               ` Nitzan Carmi
  0 siblings, 1 reply; 19+ messages in thread
From: Paul E. McKenney @ 2018-04-06 16:30 UTC


On Fri, Apr 06, 2018 at 08:19:20AM +0200, Christoph Hellwig wrote:
> On Thu, Apr 05, 2018 at 05:18:11PM -0700, Paul E. McKenney wrote:
> > OK, how does the following (untested) patch look?
> 
> Looks sensible to me.  Nitzan, can you test it with the obvious nvme
> conversion to cleanup_srcu_struct_quiesced?

And it did pass light rcutorture testing overnight, so here is hoping!  ;-)

							Thanx, Paul

> > ------------------------------------------------------------------------
> > 
> > diff --git a/include/linux/srcu.h b/include/linux/srcu.h
> > index 33c1c698df09..91494d7e8e41 100644
> > --- a/include/linux/srcu.h
> > +++ b/include/linux/srcu.h
> > @@ -69,11 +69,45 @@ struct srcu_struct { };
> >  
> >  void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
> >  		void (*func)(struct rcu_head *head));
> > -void cleanup_srcu_struct(struct srcu_struct *sp);
> > +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced);
> >  int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
> >  void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
> >  void synchronize_srcu(struct srcu_struct *sp);
> >  
> > +/**
> > + * cleanup_srcu_struct - deconstruct a sleep-RCU structure
> > + * @sp: structure to clean up.
> > + *
> > + * Must invoke this after you are finished using a given srcu_struct that
> > + * was initialized via init_srcu_struct(), else you leak memory.
> > + */
> > +static inline void cleanup_srcu_struct(struct srcu_struct *sp)
> > +{
> > +	_cleanup_srcu_struct(sp, false);
> > +}
> > +
> > +/**
> > + * cleanup_srcu_struct_quiesced - deconstruct a quiesced sleep-RCU structure
> > + * @sp: structure to clean up.
> > + *
> > + * Must invoke this after you are finished using a given srcu_struct that
> > + * was initialized via init_srcu_struct(), else you leak memory.  Also,
> > + * all grace-period processing must have completed.
> > + *
> > + * "Completed" means that the last synchronize_srcu() and
> > + * synchronize_srcu_expedited() calls must have returned before the call
> > + * to cleanup_srcu_struct_quiesced().  It also means that the callback
> > + * from the last call_srcu() must have been invoked before the call to
> > + * cleanup_srcu_struct_quiesced(), but you can use srcu_barrier() to help
> > + * with this last.  Violating these rules will get you a WARN_ON() splat
> > + * (with high probability, anyway), and will also cause the srcu_struct
> > + * to be leaked.
> > + */
> > +static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *sp)
> > +{
> > +	_cleanup_srcu_struct(sp, true);
> > +}
> > +
> >  #ifdef CONFIG_DEBUG_LOCK_ALLOC
> >  
> >  /**
> > diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
> > index 680c96d8c00f..f0e1d44459f8 100644
> > --- a/kernel/rcu/rcutorture.c
> > +++ b/kernel/rcu/rcutorture.c
> > @@ -593,7 +593,12 @@ static void srcu_torture_init(void)
> >  
> >  static void srcu_torture_cleanup(void)
> >  {
> > -	cleanup_srcu_struct(&srcu_ctld);
> > +	static DEFINE_TORTURE_RANDOM(rand);
> > +
> > +	if (torture_random(&rand) & 0x800)
> > +		cleanup_srcu_struct(&srcu_ctld);
> > +	else
> > +		cleanup_srcu_struct_quiesced(&srcu_ctld);
> >  	srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
> >  }
> >  
> > diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
> > index 76ac5f50b2c7..622792abe41a 100644
> > --- a/kernel/rcu/srcutiny.c
> > +++ b/kernel/rcu/srcutiny.c
> > @@ -86,16 +86,19 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
> >   * Must invoke this after you are finished using a given srcu_struct that
> >   * was initialized via init_srcu_struct(), else you leak memory.
> >   */
> > -void cleanup_srcu_struct(struct srcu_struct *sp)
> > +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
> >  {
> >  	WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
> > -	flush_work(&sp->srcu_work);
> > +	if (quiesced)
> > +		WARN_ON(work_pending(&sp->srcu_work));
> > +	else
> > +		flush_work(&sp->srcu_work);
> >  	WARN_ON(sp->srcu_gp_running);
> >  	WARN_ON(sp->srcu_gp_waiting);
> >  	WARN_ON(sp->srcu_cb_head);
> >  	WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail);
> >  }
> > -EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
> > +EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
> >  
> >  /*
> >   * Removes the count for the old reader from the appropriate element of
> > diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> > index fb560fca9ef4..b4123d7a2cec 100644
> > --- a/kernel/rcu/srcutree.c
> > +++ b/kernel/rcu/srcutree.c
> > @@ -366,24 +366,28 @@ static unsigned long srcu_get_delay(struct srcu_struct *sp)
> >  	return SRCU_INTERVAL;
> >  }
> >  
> > -/**
> > - * cleanup_srcu_struct - deconstruct a sleep-RCU structure
> > - * @sp: structure to clean up.
> > - *
> > - * Must invoke this after you are finished using a given srcu_struct that
> > - * was initialized via init_srcu_struct(), else you leak memory.
> > - */
> > -void cleanup_srcu_struct(struct srcu_struct *sp)
> > +/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */
> > +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
> >  {
> >  	int cpu;
> >  
> >  	if (WARN_ON(!srcu_get_delay(sp)))
> > -		return; /* Leakage unless caller handles error. */
> > +		return; /* Just leak it! */
> >  	if (WARN_ON(srcu_readers_active(sp)))
> > -		return; /* Leakage unless caller handles error. */
> > -	flush_delayed_work(&sp->work);
> > +		return; /* Just leak it! */
> > +	if (quiesced) {
> > +		if (WARN_ON(delayed_work_pending(&sp->work)))
> > +			return; /* Just leak it! */
> > +	} else {
> > +		flush_delayed_work(&sp->work);
> > +	}
> >  	for_each_possible_cpu(cpu)
> > -		flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
> > +		if (quiesced) {
> > +			if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work)))
> > +				return; /* Just leak it! */
> > +		} else {
> > +			flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
> > +		}
> >  	if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
> >  	    WARN_ON(srcu_readers_active(sp))) {
> >  		pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
> > @@ -392,7 +396,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
> >  	free_percpu(sp->sda);
> >  	sp->sda = NULL;
> >  }
> > -EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
> > +EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
> >  
> >  /*
> >   * Counts the new reader in the appropriate per-CPU element of the
> ---end quoted text---
> 


* [PATCH] nvme: fix flush dependency in delete controller flow
  2018-04-06 16:30             ` Paul E. McKenney
@ 2018-04-08  7:20               ` Nitzan Carmi
  2018-04-08 16:48                 ` Paul E. McKenney
  0 siblings, 1 reply; 19+ messages in thread
From: Nitzan Carmi @ 2018-04-08  7:20 UTC


Yes, the patch works very well.
It seems to solve the problem (with the conversion to
cleanup_srcu_struct_quiesced in nvme), and is definitely better
than my original workaround.

Thanks!

On 06/04/2018 19:30, Paul E. McKenney wrote:
> On Fri, Apr 06, 2018 at 08:19:20AM +0200, Christoph Hellwig wrote:
>> On Thu, Apr 05, 2018 at 05:18:11PM -0700, Paul E. McKenney wrote:
>>> OK, how does the following (untested) patch look?
>>
>> Looks sensible to me.  Nitzan, can you test it with the obvious nvme
>> conversion to cleanup_srcu_struct_quiesced?
> 
> And it did pass light rcutorture testing overnight, so here is hoping!  ;-)
> 
> 							Thanx, Paul
> 
>>> ------------------------------------------------------------------------
>>>
>>> diff --git a/include/linux/srcu.h b/include/linux/srcu.h
>>> index 33c1c698df09..91494d7e8e41 100644
>>> --- a/include/linux/srcu.h
>>> +++ b/include/linux/srcu.h
>>> @@ -69,11 +69,45 @@ struct srcu_struct { };
>>>   
>>>   void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
>>>   		void (*func)(struct rcu_head *head));
>>> -void cleanup_srcu_struct(struct srcu_struct *sp);
>>> +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced);
>>>   int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
>>>   void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
>>>   void synchronize_srcu(struct srcu_struct *sp);
>>>   
>>> +/**
>>> + * cleanup_srcu_struct - deconstruct a sleep-RCU structure
>>> + * @sp: structure to clean up.
>>> + *
>>> + * Must invoke this after you are finished using a given srcu_struct that
>>> + * was initialized via init_srcu_struct(), else you leak memory.
>>> + */
>>> +static inline void cleanup_srcu_struct(struct srcu_struct *sp)
>>> +{
>>> +	_cleanup_srcu_struct(sp, false);
>>> +}
>>> +
>>> +/**
>>> + * cleanup_srcu_struct_quiesced - deconstruct a quiesced sleep-RCU structure
>>> + * @sp: structure to clean up.
>>> + *
>>> + * Must invoke this after you are finished using a given srcu_struct that
>>> + * was initialized via init_srcu_struct(), else you leak memory.  Also,
>>> + * all grace-period processing must have completed.
>>> + *
>>> + * "Completed" means that the last synchronize_srcu() and
>>> + * synchronize_srcu_expedited() calls must have returned before the call
>>> + * to cleanup_srcu_struct_quiesced().  It also means that the callback
>>> + * from the last call_srcu() must have been invoked before the call to
>>> + * cleanup_srcu_struct_quiesced(), but you can use srcu_barrier() to help
>>> + * with this last.  Violating these rules will get you a WARN_ON() splat
>>> + * (with high probability, anyway), and will also cause the srcu_struct
>>> + * to be leaked.
>>> + */
>>> +static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *sp)
>>> +{
>>> +	_cleanup_srcu_struct(sp, true);
>>> +}
>>> +
>>>   #ifdef CONFIG_DEBUG_LOCK_ALLOC
>>>   
>>>   /**
>>> diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
>>> index 680c96d8c00f..f0e1d44459f8 100644
>>> --- a/kernel/rcu/rcutorture.c
>>> +++ b/kernel/rcu/rcutorture.c
>>> @@ -593,7 +593,12 @@ static void srcu_torture_init(void)
>>>   
>>>   static void srcu_torture_cleanup(void)
>>>   {
>>> -	cleanup_srcu_struct(&srcu_ctld);
>>> +	static DEFINE_TORTURE_RANDOM(rand);
>>> +
>>> +	if (torture_random(&rand) & 0x800)
>>> +		cleanup_srcu_struct(&srcu_ctld);
>>> +	else
>>> +		cleanup_srcu_struct_quiesced(&srcu_ctld);
>>>   	srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
>>>   }
>>>   
>>> diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
>>> index 76ac5f50b2c7..622792abe41a 100644
>>> --- a/kernel/rcu/srcutiny.c
>>> +++ b/kernel/rcu/srcutiny.c
>>> @@ -86,16 +86,19 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
>>>    * Must invoke this after you are finished using a given srcu_struct that
>>>    * was initialized via init_srcu_struct(), else you leak memory.
>>>    */
>>> -void cleanup_srcu_struct(struct srcu_struct *sp)
>>> +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
>>>   {
>>>   	WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
>>> -	flush_work(&sp->srcu_work);
>>> +	if (quiesced)
>>> +		WARN_ON(work_pending(&sp->srcu_work));
>>> +	else
>>> +		flush_work(&sp->srcu_work);
>>>   	WARN_ON(sp->srcu_gp_running);
>>>   	WARN_ON(sp->srcu_gp_waiting);
>>>   	WARN_ON(sp->srcu_cb_head);
>>>   	WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail);
>>>   }
>>> -EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
>>> +EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
>>>   
>>>   /*
>>>    * Removes the count for the old reader from the appropriate element of
>>> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
>>> index fb560fca9ef4..b4123d7a2cec 100644
>>> --- a/kernel/rcu/srcutree.c
>>> +++ b/kernel/rcu/srcutree.c
>>> @@ -366,24 +366,28 @@ static unsigned long srcu_get_delay(struct srcu_struct *sp)
>>>   	return SRCU_INTERVAL;
>>>   }
>>>   
>>> -/**
>>> - * cleanup_srcu_struct - deconstruct a sleep-RCU structure
>>> - * @sp: structure to clean up.
>>> - *
>>> - * Must invoke this after you are finished using a given srcu_struct that
>>> - * was initialized via init_srcu_struct(), else you leak memory.
>>> - */
>>> -void cleanup_srcu_struct(struct srcu_struct *sp)
>>> +/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */
>>> +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
>>>   {
>>>   	int cpu;
>>>   
>>>   	if (WARN_ON(!srcu_get_delay(sp)))
>>> -		return; /* Leakage unless caller handles error. */
>>> +		return; /* Just leak it! */
>>>   	if (WARN_ON(srcu_readers_active(sp)))
>>> -		return; /* Leakage unless caller handles error. */
>>> -	flush_delayed_work(&sp->work);
>>> +		return; /* Just leak it! */
>>> +	if (quiesced) {
>>> +		if (WARN_ON(delayed_work_pending(&sp->work)))
>>> +			return; /* Just leak it! */
>>> +	} else {
>>> +		flush_delayed_work(&sp->work);
>>> +	}
>>>   	for_each_possible_cpu(cpu)
>>> -		flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
>>> +		if (quiesced) {
>>> +			if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work)))
>>> +				return; /* Just leak it! */
>>> +		} else {
>>> +			flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
>>> +		}
>>>   	if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
>>>   	    WARN_ON(srcu_readers_active(sp))) {
>>>   		pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
>>> @@ -392,7 +396,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
>>>   	free_percpu(sp->sda);
>>>   	sp->sda = NULL;
>>>   }
>>> -EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
>>> +EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
>>>   
>>>   /*
>>>    * Counts the new reader in the appropriate per-CPU element of the
>> ---end quoted text---
>>
> 


* [PATCH] nvme: fix flush dependency in delete controller flow
  2018-04-08  7:20               ` Nitzan Carmi
@ 2018-04-08 16:48                 ` Paul E. McKenney
  2018-04-09  6:39                   ` Nitzan Carmi
  0 siblings, 1 reply; 19+ messages in thread
From: Paul E. McKenney @ 2018-04-08 16:48 UTC


On Sun, Apr 08, 2018 at 10:20:10AM +0300, Nitzan Carmi wrote:
> Yes, the patch works very well.
> It seems to solve the problem (with the conversion to
> cleanup_srcu_struct_quiesced in nvme), and is definitely better
> than my original workaround.

May I have your Tested-by?

							Thanx, Paul

> Thanks!
> 
> On 06/04/2018 19:30, Paul E. McKenney wrote:
> >On Fri, Apr 06, 2018 at 08:19:20AM +0200, Christoph Hellwig wrote:
> >>On Thu, Apr 05, 2018 at 05:18:11PM -0700, Paul E. McKenney wrote:
> >>>OK, how does the following (untested) patch look?
> >>
> >>Looks sensible to me.  Nitzan, can you test it with the obvious nvme
> >>conversion to cleanup_srcu_struct_quiesced?
> >
> >And it did pass light rcutorture testing overnight, so here is hoping!  ;-)
> >
> >							Thanx, Paul
> >
> >>>------------------------------------------------------------------------
> >>>
> >>>diff --git a/include/linux/srcu.h b/include/linux/srcu.h
> >>>index 33c1c698df09..91494d7e8e41 100644
> >>>--- a/include/linux/srcu.h
> >>>+++ b/include/linux/srcu.h
> >>>@@ -69,11 +69,45 @@ struct srcu_struct { };
> >>>  void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
> >>>  		void (*func)(struct rcu_head *head));
> >>>-void cleanup_srcu_struct(struct srcu_struct *sp);
> >>>+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced);
> >>>  int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
> >>>  void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
> >>>  void synchronize_srcu(struct srcu_struct *sp);
> >>>+/**
> >>>+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
> >>>+ * @sp: structure to clean up.
> >>>+ *
> >>>+ * Must invoke this after you are finished using a given srcu_struct that
> >>>+ * was initialized via init_srcu_struct(), else you leak memory.
> >>>+ */
> >>>+static inline void cleanup_srcu_struct(struct srcu_struct *sp)
> >>>+{
> >>>+	_cleanup_srcu_struct(sp, false);
> >>>+}
> >>>+
> >>>+/**
> >>>+ * cleanup_srcu_struct_quiesced - deconstruct a quiesced sleep-RCU structure
> >>>+ * @sp: structure to clean up.
> >>>+ *
> >>>+ * Must invoke this after you are finished using a given srcu_struct that
> >>>+ * was initialized via init_srcu_struct(), else you leak memory.  Also,
> >>>+ * all grace-period processing must have completed.
> >>>+ *
> >>>+ * "Completed" means that the last synchronize_srcu() and
> >>>+ * synchronize_srcu_expedited() calls must have returned before the call
> >>>+ * to cleanup_srcu_struct_quiesced().  It also means that the callback
> >>>+ * from the last call_srcu() must have been invoked before the call to
> >>>+ * cleanup_srcu_struct_quiesced(), but you can use srcu_barrier() to help
> >>>+ * with this last.  Violating these rules will get you a WARN_ON() splat
> >>>+ * (with high probability, anyway), and will also cause the srcu_struct
> >>>+ * to be leaked.
> >>>+ */
> >>>+static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *sp)
> >>>+{
> >>>+	_cleanup_srcu_struct(sp, true);
> >>>+}
> >>>+
> >>>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
> >>>  /**
> >>>diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
> >>>index 680c96d8c00f..f0e1d44459f8 100644
> >>>--- a/kernel/rcu/rcutorture.c
> >>>+++ b/kernel/rcu/rcutorture.c
> >>>@@ -593,7 +593,12 @@ static void srcu_torture_init(void)
> >>>  static void srcu_torture_cleanup(void)
> >>>  {
> >>>-	cleanup_srcu_struct(&srcu_ctld);
> >>>+	static DEFINE_TORTURE_RANDOM(rand);
> >>>+
> >>>+	if (torture_random(&rand) & 0x800)
> >>>+		cleanup_srcu_struct(&srcu_ctld);
> >>>+	else
> >>>+		cleanup_srcu_struct_quiesced(&srcu_ctld);
> >>>  	srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
> >>>  }
> >>>diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
> >>>index 76ac5f50b2c7..622792abe41a 100644
> >>>--- a/kernel/rcu/srcutiny.c
> >>>+++ b/kernel/rcu/srcutiny.c
> >>>@@ -86,16 +86,19 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
> >>>   * Must invoke this after you are finished using a given srcu_struct that
> >>>   * was initialized via init_srcu_struct(), else you leak memory.
> >>>   */
> >>>-void cleanup_srcu_struct(struct srcu_struct *sp)
> >>>+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
> >>>  {
> >>>  	WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
> >>>-	flush_work(&sp->srcu_work);
> >>>+	if (quiesced)
> >>>+		WARN_ON(work_pending(&sp->srcu_work));
> >>>+	else
> >>>+		flush_work(&sp->srcu_work);
> >>>  	WARN_ON(sp->srcu_gp_running);
> >>>  	WARN_ON(sp->srcu_gp_waiting);
> >>>  	WARN_ON(sp->srcu_cb_head);
> >>>  	WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail);
> >>>  }
> >>>-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
> >>>+EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
> >>>  /*
> >>>   * Removes the count for the old reader from the appropriate element of
> >>>diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> >>>index fb560fca9ef4..b4123d7a2cec 100644
> >>>--- a/kernel/rcu/srcutree.c
> >>>+++ b/kernel/rcu/srcutree.c
> >>>@@ -366,24 +366,28 @@ static unsigned long srcu_get_delay(struct srcu_struct *sp)
> >>>  	return SRCU_INTERVAL;
> >>>  }
> >>>-/**
> >>>- * cleanup_srcu_struct - deconstruct a sleep-RCU structure
> >>>- * @sp: structure to clean up.
> >>>- *
> >>>- * Must invoke this after you are finished using a given srcu_struct that
> >>>- * was initialized via init_srcu_struct(), else you leak memory.
> >>>- */
> >>>-void cleanup_srcu_struct(struct srcu_struct *sp)
> >>>+/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */
> >>>+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
> >>>  {
> >>>  	int cpu;
> >>>  	if (WARN_ON(!srcu_get_delay(sp)))
> >>>-		return; /* Leakage unless caller handles error. */
> >>>+		return; /* Just leak it! */
> >>>  	if (WARN_ON(srcu_readers_active(sp)))
> >>>-		return; /* Leakage unless caller handles error. */
> >>>-	flush_delayed_work(&sp->work);
> >>>+		return; /* Just leak it! */
> >>>+	if (quiesced) {
> >>>+		if (WARN_ON(delayed_work_pending(&sp->work)))
> >>>+			return; /* Just leak it! */
> >>>+	} else {
> >>>+		flush_delayed_work(&sp->work);
> >>>+	}
> >>>  	for_each_possible_cpu(cpu)
> >>>-		flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
> >>>+		if (quiesced) {
> >>>+			if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work)))
> >>>+				return; /* Just leak it! */
> >>>+		} else {
> >>>+			flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
> >>>+		}
> >>>  	if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
> >>>  	    WARN_ON(srcu_readers_active(sp))) {
> >>>  		pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
> >>>@@ -392,7 +396,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
> >>>  	free_percpu(sp->sda);
> >>>  	sp->sda = NULL;
> >>>  }
> >>>-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
> >>>+EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
> >>>  /*
> >>>   * Counts the new reader in the appropriate per-CPU element of the
> >>---end quoted text---
> >>
> >
> 


* [PATCH] nvme: fix flush dependency in delete controller flow
  2018-04-08 16:48                 ` Paul E. McKenney
@ 2018-04-09  6:39                   ` Nitzan Carmi
  2018-04-09  6:50                     ` Christoph Hellwig
  2018-04-09 16:30                     ` [PATCH] nvme: fix " Paul E. McKenney
  0 siblings, 2 replies; 19+ messages in thread
From: Nitzan Carmi @ 2018-04-09  6:39 UTC


I'd be honored :-).

Tested-by: Nitzan Carmi <nitzanc@mellanox.com>

On 08/04/2018 19:48, Paul E. McKenney wrote:
> On Sun, Apr 08, 2018 at 10:20:10AM +0300, Nitzan Carmi wrote:
>> Yes, the patch works very well.
>> It seems to solve the problem (with the conversion to
>> cleanup_srcu_struct_quiesced in nvme), and is definitely better
>> than my original workaround.
> 
> May I have your Tested-by?
> 
> 							Thanx, Paul
> 
>> Thanks!
>>
>> On 06/04/2018 19:30, Paul E. McKenney wrote:
>>> On Fri, Apr 06, 2018 at 08:19:20AM +0200, Christoph Hellwig wrote:
>>>> On Thu, Apr 05, 2018 at 05:18:11PM -0700, Paul E. McKenney wrote:
>>>>> OK, how does the following (untested) patch look?
>>>>
>>>> Looks sensible to me.  Nitzan, can you test it with the obvious nvme
>>>> conversion to cleanup_srcu_struct_quiesced?
>>>
>>> And it did pass light rcutorture testing overnight, so here is hoping!  ;-)
>>>
>>> 							Thanx, Paul
>>>
>>>>> ------------------------------------------------------------------------
>>>>>
>>>>> diff --git a/include/linux/srcu.h b/include/linux/srcu.h
>>>>> index 33c1c698df09..91494d7e8e41 100644
>>>>> --- a/include/linux/srcu.h
>>>>> +++ b/include/linux/srcu.h
>>>>> @@ -69,11 +69,45 @@ struct srcu_struct { };
>>>>>   void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
>>>>>   		void (*func)(struct rcu_head *head));
>>>>> -void cleanup_srcu_struct(struct srcu_struct *sp);
>>>>> +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced);
>>>>>   int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
>>>>>   void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
>>>>>   void synchronize_srcu(struct srcu_struct *sp);
>>>>> +/**
>>>>> + * cleanup_srcu_struct - deconstruct a sleep-RCU structure
>>>>> + * @sp: structure to clean up.
>>>>> + *
>>>>> + * Must invoke this after you are finished using a given srcu_struct that
>>>>> + * was initialized via init_srcu_struct(), else you leak memory.
>>>>> + */
>>>>> +static inline void cleanup_srcu_struct(struct srcu_struct *sp)
>>>>> +{
>>>>> +	_cleanup_srcu_struct(sp, false);
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * cleanup_srcu_struct_quiesced - deconstruct a quiesced sleep-RCU structure
>>>>> + * @sp: structure to clean up.
>>>>> + *
>>>>> + * Must invoke this after you are finished using a given srcu_struct that
>>>>> + * was initialized via init_srcu_struct(), else you leak memory.  Also,
>>>>> + * all grace-period processing must have completed.
>>>>> + *
>>>>> + * "Completed" means that the last synchronize_srcu() and
>>>>> + * synchronize_srcu_expedited() calls must have returned before the call
>>>>> + * to cleanup_srcu_struct_quiesced().  It also means that the callback
>>>>> + * from the last call_srcu() must have been invoked before the call to
>>>>> + * cleanup_srcu_struct_quiesced(), but you can use srcu_barrier() to help
>>>>> + * with this last.  Violating these rules will get you a WARN_ON() splat
>>>>> + * (with high probability, anyway), and will also cause the srcu_struct
>>>>> + * to be leaked.
>>>>> + */
>>>>> +static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *sp)
>>>>> +{
>>>>> +	_cleanup_srcu_struct(sp, true);
>>>>> +}
>>>>> +
>>>>>   #ifdef CONFIG_DEBUG_LOCK_ALLOC
>>>>>   /**
>>>>> diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
>>>>> index 680c96d8c00f..f0e1d44459f8 100644
>>>>> --- a/kernel/rcu/rcutorture.c
>>>>> +++ b/kernel/rcu/rcutorture.c
>>>>> @@ -593,7 +593,12 @@ static void srcu_torture_init(void)
>>>>>   static void srcu_torture_cleanup(void)
>>>>>   {
>>>>> -	cleanup_srcu_struct(&srcu_ctld);
>>>>> +	static DEFINE_TORTURE_RANDOM(rand);
>>>>> +
>>>>> +	if (torture_random(&rand) & 0x800)
>>>>> +		cleanup_srcu_struct(&srcu_ctld);
>>>>> +	else
>>>>> +		cleanup_srcu_struct_quiesced(&srcu_ctld);
>>>>>   	srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
>>>>>   }
>>>>> diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
>>>>> index 76ac5f50b2c7..622792abe41a 100644
>>>>> --- a/kernel/rcu/srcutiny.c
>>>>> +++ b/kernel/rcu/srcutiny.c
>>>>> @@ -86,16 +86,19 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
>>>>>    * Must invoke this after you are finished using a given srcu_struct that
>>>>>    * was initialized via init_srcu_struct(), else you leak memory.
>>>>>    */
>>>>> -void cleanup_srcu_struct(struct srcu_struct *sp)
>>>>> +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
>>>>>   {
>>>>>   	WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
>>>>> -	flush_work(&sp->srcu_work);
>>>>> +	if (quiesced)
>>>>> +		WARN_ON(work_pending(&sp->srcu_work));
>>>>> +	else
>>>>> +		flush_work(&sp->srcu_work);
>>>>>   	WARN_ON(sp->srcu_gp_running);
>>>>>   	WARN_ON(sp->srcu_gp_waiting);
>>>>>   	WARN_ON(sp->srcu_cb_head);
>>>>>   	WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail);
>>>>>   }
>>>>> -EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
>>>>> +EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
>>>>>   /*
>>>>>    * Removes the count for the old reader from the appropriate element of
>>>>> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
>>>>> index fb560fca9ef4..b4123d7a2cec 100644
>>>>> --- a/kernel/rcu/srcutree.c
>>>>> +++ b/kernel/rcu/srcutree.c
>>>>> @@ -366,24 +366,28 @@ static unsigned long srcu_get_delay(struct srcu_struct *sp)
>>>>>   	return SRCU_INTERVAL;
>>>>>   }
>>>>> -/**
>>>>> - * cleanup_srcu_struct - deconstruct a sleep-RCU structure
>>>>> - * @sp: structure to clean up.
>>>>> - *
>>>>> - * Must invoke this after you are finished using a given srcu_struct that
>>>>> - * was initialized via init_srcu_struct(), else you leak memory.
>>>>> - */
>>>>> -void cleanup_srcu_struct(struct srcu_struct *sp)
>>>>> +/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */
>>>>> +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
>>>>>   {
>>>>>   	int cpu;
>>>>>   	if (WARN_ON(!srcu_get_delay(sp)))
>>>>> -		return; /* Leakage unless caller handles error. */
>>>>> +		return; /* Just leak it! */
>>>>>   	if (WARN_ON(srcu_readers_active(sp)))
>>>>> -		return; /* Leakage unless caller handles error. */
>>>>> -	flush_delayed_work(&sp->work);
>>>>> +		return; /* Just leak it! */
>>>>> +	if (quiesced) {
>>>>> +		if (WARN_ON(delayed_work_pending(&sp->work)))
>>>>> +			return; /* Just leak it! */
>>>>> +	} else {
>>>>> +		flush_delayed_work(&sp->work);
>>>>> +	}
>>>>>   	for_each_possible_cpu(cpu)
>>>>> -		flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
>>>>> +		if (quiesced) {
>>>>> +			if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work)))
>>>>> +				return; /* Just leak it! */
>>>>> +		} else {
>>>>> +			flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
>>>>> +		}
>>>>>   	if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
>>>>>   	    WARN_ON(srcu_readers_active(sp))) {
>>>>>   		pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
>>>>> @@ -392,7 +396,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
>>>>>   	free_percpu(sp->sda);
>>>>>   	sp->sda = NULL;
>>>>>   }
>>>>> -EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
>>>>> +EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
>>>>>   /*
>>>>>    * Counts the new reader in the appropriate per-CPU element of the
>>>> ---end quoted text---

^ permalink raw reply	[flat|nested] 19+ messages in thread
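
To make the new API's contract concrete: the _quiesced() variant is
legal only when no grace-period machinery can still be running for the
structure -- the last synchronize_srcu() has returned and the last
call_srcu() callback has been invoked.  Below is a minimal sketch of a
caller honoring that contract; struct foo and foo_destroy() are
hypothetical names, not code from this thread.

#include <linux/slab.h>
#include <linux/srcu.h>

struct foo {
	struct srcu_struct srcu;	/* initialized via init_srcu_struct() */
};

static void foo_destroy(struct foo *f)
{
	/*
	 * All readers are done and this code never calls call_srcu(),
	 * so no SRCU work can be pending and skipping the flush is
	 * safe.  A user of call_srcu() would need to run
	 * srcu_barrier(&f->srcu) first.
	 */
	cleanup_srcu_struct_quiesced(&f->srcu);
	kfree(f);
}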

* [PATCH] nvme: fix flush dependency in delete controller flow
  2018-04-09  6:39                   ` Nitzan Carmi
@ 2018-04-09  6:50                     ` Christoph Hellwig
  2018-04-09 14:50                       ` [PATCH] nvme: avoid " Nitzan Carmi
  2018-04-09 16:30                     ` [PATCH] nvme: fix " Paul E. McKenney
  1 sibling, 1 reply; 19+ messages in thread
From: Christoph Hellwig @ 2018-04-09  6:50 UTC (permalink / raw)


On Mon, Apr 09, 2018@09:39:01AM +0300, Nitzan Carmi wrote:
> I'll be honored :-).
>
> Tested-by: Nitzan Carmi <nitzanc at mellanox.com>

Paul, Nitzan - how do we want to handle the new series?  It might make
sense to add the nvme patch to the rcu series.  Nitzan, can you send
the patch to linux-nvme and Paul?

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] nvme: avoid flush dependency in delete controller flow
  2018-04-09  6:50                     ` Christoph Hellwig
@ 2018-04-09 14:50                       ` Nitzan Carmi
  2018-04-09 16:48                         ` Paul E. McKenney
  0 siblings, 1 reply; 19+ messages in thread
From: Nitzan Carmi @ 2018-04-09 14:50 UTC (permalink / raw)


nvme_delete_ctrl queues a work item on a MEM_RECLAIM queue
(nvme_delete_wq), which eventually calls cleanup_srcu_struct,
which in turn flushes a delayed work from an !MEM_RECLAIM
queue. This is unsafe, as we might trigger deadlocks under
severe memory pressure.

Since we don't ever invoke call_srcu(), it is safe
to use the _quiesced() version of SRCU cleanup and
avoid that flush dependency.

Signed-off-by: Nitzan Carmi <nitzanc at mellanox.com>
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 197a6ba9700f..0ef5c495ca07 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -349,7 +349,7 @@ static void nvme_free_ns_head(struct kref *ref)
 	nvme_mpath_remove_disk(head);
 	ida_simple_remove(&head->subsys->ns_ida, head->instance);
 	list_del_init(&head->entry);
-	cleanup_srcu_struct(&head->srcu);
+	cleanup_srcu_struct_quiesced(&head->srcu);
 	kfree(head);
 }
 
-- 
2.14.1

^ permalink raw reply related	[flat|nested] 19+ messages in thread
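
Some background on why the flush described in the log is dangerous: a
work item on a WQ_MEM_RECLAIM queue is guaranteed forward progress by
a dedicated rescuer thread even when the system cannot allocate
memory, so it must never wait on work owned by a queue without that
guarantee.  A minimal sketch of the unsafe shape -- hypothetical
names, not the driver's code:

#include <linux/workqueue.h>

static struct work_struct plain_work;	/* queued on system_wq, !MEM_RECLAIM */

/* Runs on a WQ_MEM_RECLAIM queue, in the way nvme_delete_wq work does. */
static void reclaim_side_fn(struct work_struct *work)
{
	/*
	 * UNSAFE: under memory pressure system_wq may stall, and this
	 * flush then blocks the reclaim path itself.  In the nvme case
	 * the equivalent flush was hidden inside cleanup_srcu_struct(),
	 * which flushes SRCU's internal delayed work queued on a
	 * !MEM_RECLAIM system queue.
	 */
	flush_work(&plain_work);
}

If memory serves, the workqueue core also warns about exactly this
pattern at flush time (check_flush_dependency() in kernel/workqueue.c),
which is typically how such bugs surface before they ever deadlock.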

* [PATCH] nvme: fix flush dependency in delete controller flow
  2018-04-09  6:39                   ` Nitzan Carmi
  2018-04-09  6:50                     ` Christoph Hellwig
@ 2018-04-09 16:30                     ` Paul E. McKenney
  1 sibling, 0 replies; 19+ messages in thread
From: Paul E. McKenney @ 2018-04-09 16:30 UTC (permalink / raw)


On Mon, Apr 09, 2018@09:39:01AM +0300, Nitzan Carmi wrote:
> I'll be honored :-).
> 
> Tested-by: Nitzan Carmi <nitzanc at mellanox.com>

Applied, and thank you very much for the testing!

							Thanx, Paul

> On 08/04/2018 19:48, Paul E. McKenney wrote:
> >On Sun, Apr 08, 2018@10:20:10AM +0300, Nitzan Carmi wrote:
> >>Yes, The patch works very well.
> >>It seems to solve the problem (with the conversion to
> >>cleanup_srcu_struct_quiesced in nvme), and definitely better
> >>than my original WA.
> >
> >May I have your Tested-by?
> >
> >							Thanx, Paul
> >
> >>Thanks!
> >>
> >>On 06/04/2018 19:30, Paul E. McKenney wrote:
> >>>On Fri, Apr 06, 2018@08:19:20AM +0200, Christoph Hellwig wrote:
> >>>>On Thu, Apr 05, 2018@05:18:11PM -0700, Paul E. McKenney wrote:
> >>>>>OK, how does the following (untested) patch look?
> >>>>
> >>>>Looks sensible to me.  Nitzan, can you test it with the obvious nvme
> >>>>conversion to cleanup_srcu_struct_quiesced?
> >>>
> >>>And it did pass light rcutorture testing overnight, so here is hoping!  ;-)
> >>>
> >>>							Thanx, Paul
> >>>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] nvme: avoid flush dependency in delete controller flow
  2018-04-09 14:50                       ` [PATCH] nvme: avoid " Nitzan Carmi
@ 2018-04-09 16:48                         ` Paul E. McKenney
  2018-04-09 16:58                           ` Max Gurtovoy
  0 siblings, 1 reply; 19+ messages in thread
From: Paul E. McKenney @ 2018-04-09 16:48 UTC (permalink / raw)


On Mon, Apr 09, 2018@05:50:26PM +0300, Nitzan Carmi wrote:
> nvme_delete_ctrl queues a work on a MEM_RECLAIM queue
> (nvme_delete_wq), which eventually calls cleanup_srcu_struct,
> which in turn flushes a delayed work from an !MEM_RECLAIM
> queue. This is unsafe as we might trigger deadlocks under
> severe memory pressure.
> 
> Since we don't ever invoke call_srcu(), it is safe
> to use the _quiesced() version of srcu cleanup, and
> avoid that flush dependency.
> 
> Signed-off-by: Nitzan Carmi <nitzanc at mellanox.com>

Very good, thank you!

I queued this with a few edits to the commit log (please see below),
so please let me know if I messed anything up.

Given that it only happens under high memory pressure, I am assuming
that sending this up during the v4.18 merge window (not this one, but
the next one) will work.  Please let me know if this is more urgent
than that.
(If it is not more urgent, I would like to allow the extra time for
people to get used to yet another addition to the SRCU API.)

------------------------------------------------------------------------

commit 6fde861519fc6e4be50c130272be556ef5656324
Author: Nitzan Carmi <nitzanc at mellanox.com>
Date:   Mon Apr 9 17:50:26 2018 +0300

    nvme: Avoid flush dependency in delete controller flow
    
    The nvme_delete_ctrl() function queues a work item on a MEM_RECLAIM
    queue (nvme_delete_wq), which eventually calls cleanup_srcu_struct(),
    which in turn flushes a delayed work from an !MEM_RECLAIM queue. This
    is unsafe as we might trigger deadlocks under severe memory pressure.
    
    Since we don't ever invoke call_srcu(), it is safe to use the shiny new
    _quiesced() version of srcu cleanup, thus avoiding that flush dependency.
    This commit makes that change.
    
    Signed-off-by: Nitzan Carmi <nitzanc at mellanox.com>
    Signed-off-by: Paul E. McKenney <paulmck at linux.vnet.ibm.com>

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7aeca5db7916..40008d06c365 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -351,7 +351,7 @@ static void nvme_free_ns_head(struct kref *ref)
 	nvme_mpath_remove_disk(head);
 	ida_simple_remove(&head->subsys->ns_ida, head->instance);
 	list_del_init(&head->entry);
-	cleanup_srcu_struct(&head->srcu);
+	cleanup_srcu_struct_quiesced(&head->srcu);
 	kfree(head);
 }
 

^ permalink raw reply related	[flat|nested] 19+ messages in thread
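
The premise that makes the quiesced variant legal here is the one
stated in the log: call_srcu() is never invoked on head->srcu, so
readers are only ever waited for synchronously and no callback can be
outstanding by the time the head is freed.  A generic sketch of that
update-side pattern, with simplified, assumed structures rather than
the driver's exact code:

#include <linux/rculist.h>
#include <linux/srcu.h>

struct demo_head {
	struct srcu_struct srcu;
	struct list_head paths;		/* SRCU-protected list */
};

static void demo_drop_path(struct demo_head *h, struct list_head *path)
{
	list_del_rcu(path);
	/*
	 * Synchronous wait instead of call_srcu(): once this returns,
	 * readers of the removed entry are gone and no grace-period
	 * callback is left behind, which is what allows the final
	 * teardown to use cleanup_srcu_struct_quiesced().
	 */
	synchronize_srcu(&h->srcu);
	/* the entry may now be freed by the caller */
}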

* [PATCH] nvme: avoid flush dependency in delete controller flow
  2018-04-09 16:48                         ` Paul E. McKenney
@ 2018-04-09 16:58                           ` Max Gurtovoy
  2018-04-09 18:22                             ` Paul E. McKenney
  0 siblings, 1 reply; 19+ messages in thread
From: Max Gurtovoy @ 2018-04-09 16:58 UTC (permalink / raw)




On 4/9/2018 7:48 PM, Paul E. McKenney wrote:
> On Mon, Apr 09, 2018@05:50:26PM +0300, Nitzan Carmi wrote:
>> nvme_delete_ctrl queues a work on a MEM_RECLAIM queue
>> (nvme_delete_wq), which eventually calls cleanup_srcu_struct,
>> which in turn flushes a delayed work from an !MEM_RECLAIM
>> queue. This is unsafe as we might trigger deadlocks under
>> severe memory pressure.
>>
>> Since we don't ever invoke call_srcu(), it is safe
>> to use the _quiesced() version of srcu cleanup, and
>> avoid that flush dependency.
>>
>> Signed-off-by: Nitzan Carmi <nitzanc at mellanox.com>
> 
> Very good, thank you!
> 
> I queued this with a few edits to the commit log (please see below),
> so please let me know if I messed anything up.
> 
> Given that it only happens under high memory pressure, I am assuming
> that sending this up the v4.18 merge window (not this one, but the next
> one) will work.  Please let me know if this is more urgent than that.
> (If it is not more urgent, I would like to allow the extra time for
> people to get used to yet another addition to the SRCU API.)

I guess we can wait for v4.18, but I'm not sure we actually need to
wait that long.  Christoph?

> 
> ------------------------------------------------------------------------
> 
> commit 6fde861519fc6e4be50c130272be556ef5656324
> Author: Nitzan Carmi <nitzanc at mellanox.com>
> Date:   Mon Apr 9 17:50:26 2018 +0300
> 
>      nvme: Avoid flush dependency in delete controller flow
>      
>      The nvme_delete_ctrl() function queues a work item on a MEM_RECLAIM
>      queue (nvme_delete_wq), which eventually calls cleanup_srcu_struct(),
>      which in turn flushes a delayed work from an !MEM_RECLAIM queue. This
>      is unsafe as we might trigger deadlocks under severe memory pressure.
>      
>      Since we don't ever invoke call_srcu(), it is safe to use the shiny new
>      _quiesced() version of srcu cleanup, thus avoiding that flush dependency.
>      This commit makes that change.
>      
>      Signed-off-by: Nitzan Carmi <nitzanc at mellanox.com>
>      Signed-off-by: Paul E. McKenney <paulmck at linux.vnet.ibm.com>
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 7aeca5db7916..40008d06c365 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -351,7 +351,7 @@ static void nvme_free_ns_head(struct kref *ref)
>   	nvme_mpath_remove_disk(head);
>   	ida_simple_remove(&head->subsys->ns_ida, head->instance);
>   	list_del_init(&head->entry);
> -	cleanup_srcu_struct(&head->srcu);
> +	cleanup_srcu_struct_quiesced(&head->srcu);
>   	kfree(head);
>   }
>   
> 

Looks good, thanks Nitzan and Paul.
Reviewed-by: Max Gurtovoy <maxg at mellanox.com>


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] nvme: avoid flush dependency in delete controller flow
  2018-04-09 16:58                           ` Max Gurtovoy
@ 2018-04-09 18:22                             ` Paul E. McKenney
  2018-04-10 10:47                               ` Max Gurtovoy
  0 siblings, 1 reply; 19+ messages in thread
From: Paul E. McKenney @ 2018-04-09 18:22 UTC (permalink / raw)


On Mon, Apr 09, 2018@07:58:48PM +0300, Max Gurtovoy wrote:
> 
> 
> On 4/9/2018 7:48 PM, Paul E. McKenney wrote:
> >On Mon, Apr 09, 2018@05:50:26PM +0300, Nitzan Carmi wrote:
> >>nvme_delete_ctrl queues a work on a MEM_RECLAIM queue
> >>(nvme_delete_wq), which eventually calls cleanup_srcu_struct,
> >>which in turn flushes a delayed work from an !MEM_RECLAIM
> >>queue. This is unsafe as we might trigger deadlocks under
> >>severe memory pressure.
> >>
> >>Since we don't ever invoke call_srcu(), it is safe
> >>to use the _quiesced() version of srcu cleanup, and
> >>avoid that flush dependency.
> >>
> >>Signed-off-by: Nitzan Carmi <nitzanc at mellanox.com>
> >
> >Very good, thank you!
> >
> >I queued this with a few edits to the commit log (please see below),
> >so please let me know if I messed anything up.
> >
> >Given that it only happens under high memory pressure, I am assuming
> >that sending this up the v4.18 merge window (not this one, but the next
> >one) will work.  Please let me know if this is more urgent than that.
> >(If it is not more urgent, I would like to allow the extra time for
> >people to get used to yet another addition to the SRCU API.)
> 
> I guess we can wait to v4.18 but not sure we actually need to wait
> that long, Christoph ?

Or to look at it another way, do these two patches need to go to -stable,
and if so, how far back?

> >------------------------------------------------------------------------
> >
> >commit 6fde861519fc6e4be50c130272be556ef5656324
> >Author: Nitzan Carmi <nitzanc at mellanox.com>
> >Date:   Mon Apr 9 17:50:26 2018 +0300
> >
> >     nvme: Avoid flush dependency in delete controller flow
> >     The nvme_delete_ctrl() function queues a work item on a MEM_RECLAIM
> >     queue (nvme_delete_wq), which eventually calls cleanup_srcu_struct(),
> >     which in turn flushes a delayed work from an !MEM_RECLAIM queue. This
> >     is unsafe as we might trigger deadlocks under severe memory pressure.
> >     Since we don't ever invoke call_srcu(), it is safe to use the shiny new
> >     _quiesced() version of srcu cleanup, thus avoiding that flush dependency.
> >     This commit makes that change.
> >     Signed-off-by: Nitzan Carmi <nitzanc at mellanox.com>
> >     Signed-off-by: Paul E. McKenney <paulmck at linux.vnet.ibm.com>
> >
> >diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> >index 7aeca5db7916..40008d06c365 100644
> >--- a/drivers/nvme/host/core.c
> >+++ b/drivers/nvme/host/core.c
> >@@ -351,7 +351,7 @@ static void nvme_free_ns_head(struct kref *ref)
> >  	nvme_mpath_remove_disk(head);
> >  	ida_simple_remove(&head->subsys->ns_ida, head->instance);
> >  	list_del_init(&head->entry);
> >-	cleanup_srcu_struct(&head->srcu);
> >+	cleanup_srcu_struct_quiesced(&head->srcu);
> >  	kfree(head);
> >  }
> 
> Looks good, thanks Nitzan and Paul.
> Reviewed-by: Max Gurtovoy <maxg at mellanox.com>

Added your Reviewed-by, thank you!

							Thanx, Paul

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] nvme: avoid flush dependency in delete controller flow
  2018-04-09 18:22                             ` Paul E. McKenney
@ 2018-04-10 10:47                               ` Max Gurtovoy
  2018-04-10 17:01                                 ` Paul E. McKenney
  0 siblings, 1 reply; 19+ messages in thread
From: Max Gurtovoy @ 2018-04-10 10:47 UTC (permalink / raw)




On 4/9/2018 9:22 PM, Paul E. McKenney wrote:
> On Mon, Apr 09, 2018@07:58:48PM +0300, Max Gurtovoy wrote:
>>
>>
>> On 4/9/2018 7:48 PM, Paul E. McKenney wrote:
>>> On Mon, Apr 09, 2018@05:50:26PM +0300, Nitzan Carmi wrote:
>>>> nvme_delete_ctrl queues a work on a MEM_RECLAIM queue
>>>> (nvme_delete_wq), which eventually calls cleanup_srcu_struct,
>>>> which in turn flushes a delayed work from an !MEM_RECLAIM
>>>> queue. This is unsafe as we might trigger deadlocks under
>>>> severe memory pressure.
>>>>
>>>> Since we don't ever invoke call_srcu(), it is safe
>>>> to use the _quiesced() version of srcu cleanup, and
>>>> avoid that flush dependency.
>>>>
>>>> Signed-off-by: Nitzan Carmi <nitzanc at mellanox.com>
>>>
>>> Very good, thank you!
>>>
>>> I queued this with a few edits to the commit log (please see below),
>>> so please let me know if I messed anything up.
>>>
>>> Given that it only happens under high memory pressure, I am assuming
>>> that sending this up the v4.18 merge window (not this one, but the next
>>> one) will work.  Please let me know if this is more urgent than that.
>>> (If it is not more urgent, I would like to allow the extra time for
>>> people to get used to yet another addition to the SRCU API.)
>>
>> I guess we can wait to v4.18 but not sure we actually need to wait
>> that long, Christoph ?
> 
> Or to look at it another way, do these two patches need to go to -stable,
> and if so, how far back?

I don't know if they need to go to -stable, but if so, then for
nvme_core it should go back to v4.15 (the commit below):

commit ed754e5deeb17f4e675c84e4b6c640cc7344e498
Author: Christoph Hellwig <hch at lst.de>
Date:   Thu Nov 9 13:50:43 2017 +0100

     nvme: track shared namespaces

     Introduce a new struct nvme_ns_head that holds information about an
     actual namespace, unlike struct nvme_ns, which only holds the
     per-controller namespace information.  For private namespaces there
     is a 1:1 relation of the two, but for shared namespaces this lets us
     discover all the paths to it.  For now only the identifiers are
     moved to the new structure, but most of the information in struct
     nvme_ns should eventually move over.

     To allow lockless path lookup the list of nvme_ns structures per
     nvme_ns_head is protected by SRCU, which requires freeing the nvme_ns
     structure through call_srcu.

     Signed-off-by: Christoph Hellwig <hch at lst.de>
     Reviewed-by: Keith Busch <keith.busch at intel.com>
     Reviewed-by: Javier González <javier at cnexlabs.com>
     Reviewed-by: Sagi Grimberg <sagi at grimberg.me>
     Reviewed-by: Johannes Thumshirn <jthumshirn at suse.de>
     Reviewed-by: Martin K. Petersen <martin.petersen at oracle.com>
     Reviewed-by: Hannes Reinecke <hare at suse.com>
     Signed-off-by: Jens Axboe <axboe at kernel.dk>




As for SRCU, I guess you can find it better than me :)


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] nvme: avoid flush dependency in delete controller flow
  2018-04-10 10:47                               ` Max Gurtovoy
@ 2018-04-10 17:01                                 ` Paul E. McKenney
  0 siblings, 0 replies; 19+ messages in thread
From: Paul E. McKenney @ 2018-04-10 17:01 UTC (permalink / raw)


On Tue, Apr 10, 2018@01:47:38PM +0300, Max Gurtovoy wrote:
> 
> 
> On 4/9/2018 9:22 PM, Paul E. McKenney wrote:
> >On Mon, Apr 09, 2018@07:58:48PM +0300, Max Gurtovoy wrote:
> >>
> >>
> >>On 4/9/2018 7:48 PM, Paul E. McKenney wrote:
> >>>On Mon, Apr 09, 2018@05:50:26PM +0300, Nitzan Carmi wrote:
> >>>>nvme_delete_ctrl queues a work on a MEM_RECLAIM queue
> >>>>(nvme_delete_wq), which eventually calls cleanup_srcu_struct,
> >>>>which in turn flushes a delayed work from an !MEM_RECLAIM
> >>>>queue. This is unsafe as we might trigger deadlocks under
> >>>>severe memory pressure.
> >>>>
> >>>>Since we don't ever invoke call_srcu(), it is safe
> >>>>to use the _quiesced() version of srcu cleanup, and
> >>>>avoid that flush dependency.
> >>>>
> >>>>Signed-off-by: Nitzan Carmi <nitzanc at mellanox.com>
> >>>
> >>>Very good, thank you!
> >>>
> >>>I queued this with a few edits to the commit log (please see below),
> >>>so please let me know if I messed anything up.
> >>>
> >>>Given that it only happens under high memory pressure, I am assuming
> >>>that sending this up the v4.18 merge window (not this one, but the next
> >>>one) will work.  Please let me know if this is more urgent than that.
> >>>(If it is not more urgent, I would like to allow the extra time for
> >>>people to get used to yet another addition to the SRCU API.)
> >>
> >>I guess we can wait to v4.18 but not sure we actually need to wait
> >>that long, Christoph ?
> >
> >Or to look at it another way, do these two patches need to go to -stable,
> >and if so, how far back?
> 
> I don't know if they need to go to -stable but if so then for
> nvme_core it should go to v4.15 (below commit):
> 
> commit ed754e5deeb17f4e675c84e4b6c640cc7344e498
> Author: Christoph Hellwig <hch at lst.de>
> Date:   Thu Nov 9 13:50:43 2017 +0100
> 
>     nvme: track shared namespaces
> 
>     Introduce a new struct nvme_ns_head that holds information about
>     an actual namespace, unlike struct nvme_ns, which only holds the
>     per-controller namespace information.  For private namespaces
>     there is a 1:1 relation of the two, but for shared namespaces
>     this lets us discover all the paths to it.  For now only the
>     identifiers are moved to the new structure, but most of the
>     information in struct nvme_ns should eventually move over.
> 
>     To allow lockless path lookup the list of nvme_ns structures per
>     nvme_ns_head is protected by SRCU, which requires freeing the nvme_ns
>     structure through call_srcu.
> 
>     Signed-off-by: Christoph Hellwig <hch at lst.de>
>     Reviewed-by: Keith Busch <keith.busch at intel.com>
>     Reviewed-by: Javier González <javier at cnexlabs.com>
>     Reviewed-by: Sagi Grimberg <sagi at grimberg.me>
>     Reviewed-by: Johannes Thumshirn <jthumshirn at suse.de>
>     Reviewed-by: Martin K. Petersen <martin.petersen at oracle.com>
>     Reviewed-by: Hannes Reinecke <hare at suse.com>
>     Signed-off-by: Jens Axboe <axboe at kernel.dk>
> 
> As for SRCU, I guess you can find it better than me :)

Same answer, since the only reason for the SRCU change is to support
your fix.  ;-)

							Thanx, Paul


^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2018-04-10 17:01 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
2018-04-02 12:37 [PATCH] nvme: fix flush dependency in delete controller flow Nitzan Carmi
2018-04-05  8:54 ` Christoph Hellwig
2018-04-05 13:14   ` Paul E. McKenney
2018-04-05 16:08     ` Paul E. McKenney
2018-04-05 19:55       ` Christoph Hellwig
2018-04-06  0:18         ` Paul E. McKenney
2018-04-06  6:19           ` Christoph Hellwig
2018-04-06 16:30             ` Paul E. McKenney
2018-04-08  7:20               ` Nitzan Carmi
2018-04-08 16:48                 ` Paul E. McKenney
2018-04-09  6:39                   ` Nitzan Carmi
2018-04-09  6:50                     ` Christoph Hellwig
2018-04-09 14:50                       ` [PATCH] nvme: avoid " Nitzan Carmi
2018-04-09 16:48                         ` Paul E. McKenney
2018-04-09 16:58                           ` Max Gurtovoy
2018-04-09 18:22                             ` Paul E. McKenney
2018-04-10 10:47                               ` Max Gurtovoy
2018-04-10 17:01                                 ` Paul E. McKenney
2018-04-09 16:30                     ` [PATCH] nvme: fix " Paul E. McKenney
