linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] rcu/segcblist: Add debug checks for segment lengths
@ 2020-11-18 16:15 Joel Fernandes (Google)
  2020-11-18 20:13 ` Paul E. McKenney
  0 siblings, 1 reply; 13+ messages in thread
From: Joel Fernandes (Google) @ 2020-11-18 16:15 UTC (permalink / raw)
  To: linux-kernel
  Cc: Joel Fernandes (Google),
	Josh Triplett, Lai Jiangshan, Mathieu Desnoyers,
	Paul E. McKenney, rcu, Steven Rostedt

After rcu_do_batch(), add a check for whether the seglen counts went to
zero if the list was indeed empty.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
v1->v2: Added more debug checks.

 kernel/rcu/rcu_segcblist.c | 12 ++++++++++++
 kernel/rcu/rcu_segcblist.h |  3 +++
 kernel/rcu/tree.c          |  2 ++
 3 files changed, 17 insertions(+)

diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 5059b6102afe..6e98bb3804f0 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -94,6 +94,18 @@ static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
 	return READ_ONCE(rsclp->seglen[seg]);
 }
 
+/* Return number of callbacks in segmented callback list by totalling seglen. */
+long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp)
+{
+	long len = 0;
+	int i;
+
+	for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
+		len += rcu_segcblist_get_seglen(rsclp, i);
+
+	return len;
+}
+
 /* Set the length of a segment of the rcu_segcblist structure. */
 static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v)
 {
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index cd35c9faaf51..46a42d77f7e1 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -15,6 +15,9 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
 	return READ_ONCE(rclp->len);
 }
 
+/* Return number of callbacks in segmented callback list by totalling seglen. */
+long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp);
+
 void rcu_cblist_init(struct rcu_cblist *rclp);
 void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
 void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f5b61e10f1de..91e35b521e51 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2553,6 +2553,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
 	WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
 		     count != 0 && rcu_segcblist_empty(&rdp->cblist));
+	WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
+	WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
 
 	rcu_nocb_unlock_irqrestore(rdp, flags);
 
-- 
2.29.2.299.gdc1121823c-goog

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] rcu/segcblist: Add debug checks for segment lengths
  2020-11-18 16:15 [PATCH v2] rcu/segcblist: Add debug checks for segment lengths Joel Fernandes (Google)
@ 2020-11-18 20:13 ` Paul E. McKenney
  2020-11-19  3:52   ` Paul E. McKenney
  0 siblings, 1 reply; 13+ messages in thread
From: Paul E. McKenney @ 2020-11-18 20:13 UTC (permalink / raw)
  To: Joel Fernandes (Google)
  Cc: linux-kernel, Josh Triplett, Lai Jiangshan, Mathieu Desnoyers,
	rcu, Steven Rostedt

On Wed, Nov 18, 2020 at 11:15:41AM -0500, Joel Fernandes (Google) wrote:
> After rcu_do_batch(), add a check for whether the seglen counts went to
> zero if the list was indeed empty.
> 
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>

Queued for testing and further review, thank you!

							Thanx, Paul

> ---
> v1->v2: Added more debug checks.
> 
>  kernel/rcu/rcu_segcblist.c | 12 ++++++++++++
>  kernel/rcu/rcu_segcblist.h |  3 +++
>  kernel/rcu/tree.c          |  2 ++
>  3 files changed, 17 insertions(+)
> 
> diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
> index 5059b6102afe..6e98bb3804f0 100644
> --- a/kernel/rcu/rcu_segcblist.c
> +++ b/kernel/rcu/rcu_segcblist.c
> @@ -94,6 +94,18 @@ static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
>  	return READ_ONCE(rsclp->seglen[seg]);
>  }
>  
> +/* Return number of callbacks in segmented callback list by totalling seglen. */
> +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp)
> +{
> +	long len = 0;
> +	int i;
> +
> +	for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
> +		len += rcu_segcblist_get_seglen(rsclp, i);
> +
> +	return len;
> +}
> +
>  /* Set the length of a segment of the rcu_segcblist structure. */
>  static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v)
>  {
> diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
> index cd35c9faaf51..46a42d77f7e1 100644
> --- a/kernel/rcu/rcu_segcblist.h
> +++ b/kernel/rcu/rcu_segcblist.h
> @@ -15,6 +15,9 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
>  	return READ_ONCE(rclp->len);
>  }
>  
> +/* Return number of callbacks in segmented callback list by totalling seglen. */
> +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp);
> +
>  void rcu_cblist_init(struct rcu_cblist *rclp);
>  void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
>  void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index f5b61e10f1de..91e35b521e51 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2553,6 +2553,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
>  	WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
>  	WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
>  		     count != 0 && rcu_segcblist_empty(&rdp->cblist));
> +	WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
> +	WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
>  
>  	rcu_nocb_unlock_irqrestore(rdp, flags);
>  
> -- 
> 2.29.2.299.gdc1121823c-goog

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] rcu/segcblist: Add debug checks for segment lengths
  2020-11-18 20:13 ` Paul E. McKenney
@ 2020-11-19  3:52   ` Paul E. McKenney
  2020-11-19  3:56     ` Paul E. McKenney
  0 siblings, 1 reply; 13+ messages in thread
From: Paul E. McKenney @ 2020-11-19  3:52 UTC (permalink / raw)
  To: Joel Fernandes (Google)
  Cc: linux-kernel, Josh Triplett, Lai Jiangshan, Mathieu Desnoyers,
	rcu, Steven Rostedt

On Wed, Nov 18, 2020 at 12:13:35PM -0800, Paul E. McKenney wrote:
> On Wed, Nov 18, 2020 at 11:15:41AM -0500, Joel Fernandes (Google) wrote:
> > After rcu_do_batch(), add a check for whether the seglen counts went to
> > zero if the list was indeed empty.
> > 
> > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> 
> Queued for testing and further review, thank you!

FYI, the second of the two checks triggered in all four one-hour runs of
TREE01, all four one-hour runs of TREE04, and one of the four one-hour
runs of TREE07.  This one:

WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);

That is, there are callbacks in the list, but the sum of the segment
counts is nevertheless zero.  The ->nocb_lock is held.

Thoughts?

							Thanx, Paul

> > ---
> > v1->v2: Added more debug checks.
> > 
> >  kernel/rcu/rcu_segcblist.c | 12 ++++++++++++
> >  kernel/rcu/rcu_segcblist.h |  3 +++
> >  kernel/rcu/tree.c          |  2 ++
> >  3 files changed, 17 insertions(+)
> > 
> > diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
> > index 5059b6102afe..6e98bb3804f0 100644
> > --- a/kernel/rcu/rcu_segcblist.c
> > +++ b/kernel/rcu/rcu_segcblist.c
> > @@ -94,6 +94,18 @@ static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
> >  	return READ_ONCE(rsclp->seglen[seg]);
> >  }
> >  
> > +/* Return number of callbacks in segmented callback list by totalling seglen. */
> > +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp)
> > +{
> > +	long len = 0;
> > +	int i;
> > +
> > +	for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
> > +		len += rcu_segcblist_get_seglen(rsclp, i);
> > +
> > +	return len;
> > +}
> > +
> >  /* Set the length of a segment of the rcu_segcblist structure. */
> >  static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v)
> >  {
> > diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
> > index cd35c9faaf51..46a42d77f7e1 100644
> > --- a/kernel/rcu/rcu_segcblist.h
> > +++ b/kernel/rcu/rcu_segcblist.h
> > @@ -15,6 +15,9 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
> >  	return READ_ONCE(rclp->len);
> >  }
> >  
> > +/* Return number of callbacks in segmented callback list by totalling seglen. */
> > +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp);
> > +
> >  void rcu_cblist_init(struct rcu_cblist *rclp);
> >  void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
> >  void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index f5b61e10f1de..91e35b521e51 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -2553,6 +2553,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
> >  	WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
> >  	WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
> >  		     count != 0 && rcu_segcblist_empty(&rdp->cblist));
> > +	WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
> > +	WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> >  
> >  	rcu_nocb_unlock_irqrestore(rdp, flags);
> >  
> > -- 
> > 2.29.2.299.gdc1121823c-goog

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] rcu/segcblist: Add debug checks for segment lengths
  2020-11-19  3:52   ` Paul E. McKenney
@ 2020-11-19  3:56     ` Paul E. McKenney
  2020-11-19 18:32       ` Joel Fernandes
  0 siblings, 1 reply; 13+ messages in thread
From: Paul E. McKenney @ 2020-11-19  3:56 UTC (permalink / raw)
  To: Joel Fernandes (Google)
  Cc: linux-kernel, Josh Triplett, Lai Jiangshan, Mathieu Desnoyers,
	rcu, Steven Rostedt

On Wed, Nov 18, 2020 at 07:52:23PM -0800, Paul E. McKenney wrote:
> On Wed, Nov 18, 2020 at 12:13:35PM -0800, Paul E. McKenney wrote:
> > On Wed, Nov 18, 2020 at 11:15:41AM -0500, Joel Fernandes (Google) wrote:
> > > After rcu_do_batch(), add a check for whether the seglen counts went to
> > > zero if the list was indeed empty.
> > > 
> > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > 
> > Queued for testing and further review, thank you!
> 
> FYI, the second of the two checks triggered in all four one-hour runs of
> TREE01, all four one-hour runs of TREE04, and one of the four one-hour
> runs of TREE07.  This one:
> 
> WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> 
> That is, there are callbacks in the list, but the sum of the segment
> counts is nevertheless zero.  The ->nocb_lock is held.
> 
> Thoughts?

FWIW, TREE01 reproduces it very quickly compared to the other two
scenarios, on all four run, within five minutes.

							Thanx, Paul

> > > ---
> > > v1->v2: Added more debug checks.
> > > 
> > >  kernel/rcu/rcu_segcblist.c | 12 ++++++++++++
> > >  kernel/rcu/rcu_segcblist.h |  3 +++
> > >  kernel/rcu/tree.c          |  2 ++
> > >  3 files changed, 17 insertions(+)
> > > 
> > > diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
> > > index 5059b6102afe..6e98bb3804f0 100644
> > > --- a/kernel/rcu/rcu_segcblist.c
> > > +++ b/kernel/rcu/rcu_segcblist.c
> > > @@ -94,6 +94,18 @@ static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
> > >  	return READ_ONCE(rsclp->seglen[seg]);
> > >  }
> > >  
> > > +/* Return number of callbacks in segmented callback list by totalling seglen. */
> > > +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp)
> > > +{
> > > +	long len = 0;
> > > +	int i;
> > > +
> > > +	for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
> > > +		len += rcu_segcblist_get_seglen(rsclp, i);
> > > +
> > > +	return len;
> > > +}
> > > +
> > >  /* Set the length of a segment of the rcu_segcblist structure. */
> > >  static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v)
> > >  {
> > > diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
> > > index cd35c9faaf51..46a42d77f7e1 100644
> > > --- a/kernel/rcu/rcu_segcblist.h
> > > +++ b/kernel/rcu/rcu_segcblist.h
> > > @@ -15,6 +15,9 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
> > >  	return READ_ONCE(rclp->len);
> > >  }
> > >  
> > > +/* Return number of callbacks in segmented callback list by totalling seglen. */
> > > +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp);
> > > +
> > >  void rcu_cblist_init(struct rcu_cblist *rclp);
> > >  void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
> > >  void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
> > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > index f5b61e10f1de..91e35b521e51 100644
> > > --- a/kernel/rcu/tree.c
> > > +++ b/kernel/rcu/tree.c
> > > @@ -2553,6 +2553,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
> > >  	WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
> > >  	WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
> > >  		     count != 0 && rcu_segcblist_empty(&rdp->cblist));
> > > +	WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
> > > +	WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> > >  
> > >  	rcu_nocb_unlock_irqrestore(rdp, flags);
> > >  
> > > -- 
> > > 2.29.2.299.gdc1121823c-goog

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] rcu/segcblist: Add debug checks for segment lengths
  2020-11-19  3:56     ` Paul E. McKenney
@ 2020-11-19 18:32       ` Joel Fernandes
  2020-11-19 19:22         ` Paul E. McKenney
  0 siblings, 1 reply; 13+ messages in thread
From: Joel Fernandes @ 2020-11-19 18:32 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, Josh Triplett, Lai Jiangshan, Mathieu Desnoyers,
	rcu, Steven Rostedt

On Wed, Nov 18, 2020 at 07:56:13PM -0800, Paul E. McKenney wrote:
> On Wed, Nov 18, 2020 at 07:52:23PM -0800, Paul E. McKenney wrote:
> > On Wed, Nov 18, 2020 at 12:13:35PM -0800, Paul E. McKenney wrote:
> > > On Wed, Nov 18, 2020 at 11:15:41AM -0500, Joel Fernandes (Google) wrote:
> > > > After rcu_do_batch(), add a check for whether the seglen counts went to
> > > > zero if the list was indeed empty.
> > > > 
> > > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > 
> > > Queued for testing and further review, thank you!
> > 
> > FYI, the second of the two checks triggered in all four one-hour runs of
> > TREE01, all four one-hour runs of TREE04, and one of the four one-hour
> > runs of TREE07.  This one:
> > 
> > WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> > 
> > That is, there are callbacks in the list, but the sum of the segment
> > counts is nevertheless zero.  The ->nocb_lock is held.
> > 
> > Thoughts?
> 
> FWIW, TREE01 reproduces it very quickly compared to the other two
> scenarios, on all four run, within five minutes.

So far for TREE01, I traced it down to an rcu_barrier happening so it could
be related to some interaction with rcu_barrier() (Just a guess).

'p2' print below is the panic on the second warning. It executes 43 callbacks
from the segcb list, but the list length still does not appear to be 0.

I'll debug it more:

[  191.085702]  rcuop/0-12        0.... 75782125us : rcu_invoke_callback: rcu_preempt rhp=000000006a881152 func=__d_free
[  191.844028]  rcuop/0-12        0d..1 75796122us : rcu_segcb_stats: SegCbDequeued seglen: (DONE=0, WAIT=43, NEXT_READY=0, NEXT=0) gp_seq: (DONE=0, WAIT=11656, NEXT_READY=11656, NEXT=0)
[  191.846493]  rcuop/0-12        0.... 75796122us : rcu_invoke_callback: rcu_preempt rhp=00000000017536a2 func=i_callback
[  191.848160]  rcuop/0-12        0.... 75796123us : rcu_invoke_callback: rcu_preempt rhp=0000000014235c71 func=__d_free
[  191.849695]  rcuop/0-12        0.... 75796123us : rcu_invoke_callback: rcu_preempt rhp=00000000368c5928 func=i_callback
[  191.851262]  rcuop/0-12        0.... 75796124us : rcu_invoke_callback: rcu_preempt rhp=00000000bdbea790 func=__d_free
[  191.852800]  rcuop/0-12        0.... 75796124us : rcu_invoke_callback: rcu_preempt rhp=0000000034b99f3d func=rcu_barrier_callback
[  192.526784]  rcuop/0-12        0d..1 75809162us : rcu_segcb_stats: SegCbDequeued seglen: (DONE=0, WAIT=0, NEXT_READY=0, NEXT=0) gp_seq: (DONE=0, WAIT=11656, NEXT_READY=11656, NEXT=0)
[  192.529132]  rcuop/0-12        0.... 75809163us : rcu_invoke_callback: rcu_preempt rhp=000000002d6a3fce func=rcu_sync_func
[  192.530807]  rcuop/0-12        0.... 75809165us : rcu_invoke_callback: rcu_preempt rhp=000000009aa91c97 func=destroy_sched_domains_rcu
[  192.532556]  rcuop/0-12        0.... 75809170us : rcu_invoke_callback: rcu_preempt rhp=000000002bb5a998 func=destroy_sched_domains_rcu
[  192.534303]  rcuop/0-12        0.... 75809172us : rcu_invoke_callback: rcu_preempt rhp=00000000bcc2369a func=destroy_sched_domains_rcu
[  192.536053]  rcuop/0-12        0.... 75809174us : rcu_invoke_callback: rcu_preempt rhp=000000004dcec39b func=destroy_sched_domains_rcu
[  192.537802]  rcuop/0-12        0.... 75809176us : rcu_invoke_callback: rcu_preempt rhp=00000000dedb509d func=destroy_sched_domains_rcu
[  192.539553]  rcuop/0-12        0.... 75809178us : rcu_invoke_callback: rcu_preempt rhp=000000006fe7dd9e func=destroy_sched_domains_rcu
[  192.541299]  rcuop/0-12        0.... 75809180us : rcu_invoke_callback: rcu_preempt rhp=000000005a212061 func=destroy_sched_domains_rcu
[  192.543043]  rcuop/0-12        0.... 75809182us : rcu_invoke_callback: rcu_preempt rhp=00000000c914935f func=destroy_sched_domains_rcu
[  192.544792]  rcuop/0-12        0.... 75809184us : rcu_invoke_callback: rcu_preempt rhp=0000000019fa3368 func=destroy_sched_domains_rcu
[  192.546539]  rcuop/0-12        0.... 75809186us : rcu_invoke_callback: rcu_preempt rhp=00000000ab06c069 func=destroy_sched_domains_rcu
[  192.548289]  rcuop/0-12        0.... 75809188us : rcu_invoke_callback: rcu_preempt rhp=000000003c134d6b func=destroy_sched_domains_rcu
[  192.550037]  rcuop/0-12        0.... 75809190us : rcu_invoke_callback: rcu_preempt rhp=00000000cd1fda6c func=destroy_sched_domains_rcu
[  192.551790]  rcuop/0-12        0.... 75809192us : rcu_invoke_callback: rcu_preempt rhp=000000005e2c676e func=destroy_sched_domains_rcu
[  192.553576]  rcuop/0-12        0.... 75809194us : rcu_invoke_callback: rcu_preempt rhp=00000000ef38f46f func=destroy_sched_domains_rcu
[  192.555314]  rcuop/0-12        0.... 75809196us : rcu_invoke_callback: rcu_preempt rhp=0000000080458170 func=destroy_sched_domains_rcu
[  192.557054]  rcuop/0-12        0.... 75809198us : rcu_invoke_callback: rcu_preempt rhp=0000000009316351 func=destroy_sched_domains_rcu
[  192.558793]  rcuop/0-12        0.... 75809200us : rcu_invoke_callback: rcu_preempt rhp=0000000022a2585e func=destroy_sched_domains_rcu
[  192.560539]  rcuop/0-12        0.... 75809202us : rcu_invoke_callback: rcu_preempt rhp=00000000b3aee55f func=destroy_sched_domains_rcu
[  192.562282]  rcuop/0-12        0.... 75809204us : rcu_invoke_callback: rcu_preempt rhp=0000000044bb7261 func=destroy_sched_domains_rcu
[  192.564028]  rcuop/0-12        0.... 75809206us : rcu_invoke_callback: rcu_preempt rhp=00000000d5c7ff62 func=destroy_sched_domains_rcu
[  192.565777]  rcuop/0-12        0.... 75809208us : rcu_invoke_callback: rcu_preempt rhp=00000000bc570a55 func=destroy_sched_domains_rcu
[  192.567568]  rcuop/0-12        0.... 75809210us : rcu_invoke_callback: rcu_preempt rhp=000000004d639757 func=destroy_sched_domains_rcu
[  192.569315]  rcuop/0-12        0.... 75809212us : rcu_invoke_callback: rcu_preempt rhp=00000000de702458 func=destroy_sched_domains_rcu
[  192.571111]  rcuop/0-12        0.... 75809214us : rcu_invoke_callback: rcu_preempt rhp=000000006f7cb15a func=destroy_sched_domains_rcu
[  192.572855]  rcuop/0-12        0.... 75809216us : rcu_invoke_callback: rcu_preempt rhp=0000000000893e5b func=destroy_sched_domains_rcu
[  192.574595]  rcuop/0-12        0.... 75809218us : rcu_invoke_callback: rcu_preempt rhp=0000000066d48c64 func=destroy_sched_domains_rcu
[  192.576435]  rcuop/0-12        0.... 75809220us : rcu_invoke_callback: rcu_preempt rhp=000000009a3df053 func=destroy_sched_domains_rcu
[  192.578182]  rcuop/0-12        0.... 75809221us : rcu_invoke_callback: rcu_preempt rhp=000000002b4a7d54 func=destroy_sched_domains_rcu
[  192.579981]  rcuop/0-12        0.... 75809223us : rcu_invoke_callback: rcu_preempt rhp=000000009cf78b8b func=destroy_sched_domains_rcu
[  192.581773]  rcuop/0-12        0.... 75809225us : rcu_invoke_callback: rcu_preempt rhp=000000000beafe8a func=destroy_sched_domains_rcu
[  192.583547]  rcuop/0-12        0.... 75809227us : rcu_invoke_callback: rcu_preempt rhp=000000007ade7189 func=destroy_sched_domains_rcu
[  192.585373]  rcuop/0-12        0.... 75809229us : rcu_invoke_callback: rcu_preempt rhp=00000000e9d1e487 func=destroy_sched_domains_rcu
[  192.587139]  rcuop/0-12        0.... 75809231us : rcu_invoke_callback: rcu_preempt rhp=000000002e04188d func=destroy_sched_domains_rcu
[  192.588984]  rcuop/0-12        0.... 75809233us : rcu_invoke_callback: rcu_preempt rhp=00000000bf10a58e func=destroy_sched_domains_rcu
[  192.590732]  rcuop/0-12        0.... 75809235us : rcu_invoke_callback: rcu_preempt rhp=00000000501d3290 func=destroy_sched_domains_rcu
[  192.592562]  rcuop/0-12        0.... 75809237us : rcu_invoke_callback: rcu_preempt rhp=00000000e129bf91 func=destroy_sched_domains_rcu
[  192.594309]  rcuop/0-12        0.... 75809239us : rcu_invoke_callback: rcu_preempt rhp=0000000072364c93 func=destroy_sched_domains_rcu
[  192.596063]  rcuop/0-12        0.... 75809241us : rcu_invoke_callback: rcu_preempt rhp=000000000342d994 func=destroy_sched_domains_rcu
[  192.597807]  rcuop/0-12        0.... 75809243us : rcu_invoke_callback: rcu_preempt rhp=00000000944f6695 func=destroy_sched_domains_rcu
[  192.599608]  rcuop/0-12        0.... 75809245us : rcu_invoke_callback: rcu_preempt rhp=00000000255bf397 func=destroy_sched_domains_rcu
[  192.601341]  rcuop/0-12        0.... 75809247us : rcu_invoke_callback: rcu_preempt rhp=00000000d4a4767b func=free_rootdomain
[  192.602958]  rcuop/0-12        0.... 75809251us : rcu_invoke_callback: rcu_preempt rhp=00000000b6688098 func=destroy_sched_domains_rcu
[  192.604779]  rcuop/0-12        0.... 75809269us : rcu_invoke_callback: rcu_preempt rhp=0000000034b99f3d func=rcu_barrier_callback
[  193.894910]  rcuop/0-12        0d..1 75873331us : rcu_do_batch: p2

thanks,

 - Joel


> 
> 							Thanx, Paul
> 
> > > > ---
> > > > v1->v2: Added more debug checks.
> > > > 
> > > >  kernel/rcu/rcu_segcblist.c | 12 ++++++++++++
> > > >  kernel/rcu/rcu_segcblist.h |  3 +++
> > > >  kernel/rcu/tree.c          |  2 ++
> > > >  3 files changed, 17 insertions(+)
> > > > 
> > > > diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
> > > > index 5059b6102afe..6e98bb3804f0 100644
> > > > --- a/kernel/rcu/rcu_segcblist.c
> > > > +++ b/kernel/rcu/rcu_segcblist.c
> > > > @@ -94,6 +94,18 @@ static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
> > > >  	return READ_ONCE(rsclp->seglen[seg]);
> > > >  }
> > > >  
> > > > +/* Return number of callbacks in segmented callback list by totalling seglen. */
> > > > +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp)
> > > > +{
> > > > +	long len = 0;
> > > > +	int i;
> > > > +
> > > > +	for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
> > > > +		len += rcu_segcblist_get_seglen(rsclp, i);
> > > > +
> > > > +	return len;
> > > > +}
> > > > +
> > > >  /* Set the length of a segment of the rcu_segcblist structure. */
> > > >  static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v)
> > > >  {
> > > > diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
> > > > index cd35c9faaf51..46a42d77f7e1 100644
> > > > --- a/kernel/rcu/rcu_segcblist.h
> > > > +++ b/kernel/rcu/rcu_segcblist.h
> > > > @@ -15,6 +15,9 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
> > > >  	return READ_ONCE(rclp->len);
> > > >  }
> > > >  
> > > > +/* Return number of callbacks in segmented callback list by totalling seglen. */
> > > > +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp);
> > > > +
> > > >  void rcu_cblist_init(struct rcu_cblist *rclp);
> > > >  void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
> > > >  void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
> > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > > index f5b61e10f1de..91e35b521e51 100644
> > > > --- a/kernel/rcu/tree.c
> > > > +++ b/kernel/rcu/tree.c
> > > > @@ -2553,6 +2553,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
> > > >  	WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
> > > >  	WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
> > > >  		     count != 0 && rcu_segcblist_empty(&rdp->cblist));
> > > > +	WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
> > > > +	WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> > > >  
> > > >  	rcu_nocb_unlock_irqrestore(rdp, flags);
> > > >  
> > > > -- 
> > > > 2.29.2.299.gdc1121823c-goog

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] rcu/segcblist: Add debug checks for segment lengths
  2020-11-19 18:32       ` Joel Fernandes
@ 2020-11-19 19:22         ` Paul E. McKenney
  2020-11-19 19:44           ` Joel Fernandes
  0 siblings, 1 reply; 13+ messages in thread
From: Paul E. McKenney @ 2020-11-19 19:22 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: linux-kernel, Josh Triplett, Lai Jiangshan, Mathieu Desnoyers,
	rcu, Steven Rostedt

On Thu, Nov 19, 2020 at 01:32:52PM -0500, Joel Fernandes wrote:
> On Wed, Nov 18, 2020 at 07:56:13PM -0800, Paul E. McKenney wrote:
> > On Wed, Nov 18, 2020 at 07:52:23PM -0800, Paul E. McKenney wrote:
> > > On Wed, Nov 18, 2020 at 12:13:35PM -0800, Paul E. McKenney wrote:
> > > > On Wed, Nov 18, 2020 at 11:15:41AM -0500, Joel Fernandes (Google) wrote:
> > > > > After rcu_do_batch(), add a check for whether the seglen counts went to
> > > > > zero if the list was indeed empty.
> > > > > 
> > > > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > > 
> > > > Queued for testing and further review, thank you!
> > > 
> > > FYI, the second of the two checks triggered in all four one-hour runs of
> > > TREE01, all four one-hour runs of TREE04, and one of the four one-hour
> > > runs of TREE07.  This one:
> > > 
> > > WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> > > 
> > > That is, there are callbacks in the list, but the sum of the segment
> > > counts is nevertheless zero.  The ->nocb_lock is held.
> > > 
> > > Thoughts?
> > 
> > FWIW, TREE01 reproduces it very quickly compared to the other two
> > scenarios, on all four run, within five minutes.
> 
> So far for TREE01, I traced it down to an rcu_barrier happening so it could
> be related to some interaction with rcu_barrier() (Just a guess).

Well, rcu_barrier() and srcu_barrier() are the only users of
rcu_segcblist_entrain(), if that helps.  Your modification to that
function looks plausible to me, but the system's opinion always overrules
mine.  ;-)

> 'p2' print below is the panic on the second warning. It executes 43 callbacks
> from the segcb list, but the list length still does not appear to be 0.
> 
> I'll debug it more:
> 
> [  191.085702]  rcuop/0-12        0.... 75782125us : rcu_invoke_callback: rcu_preempt rhp=000000006a881152 func=__d_free
> [  191.844028]  rcuop/0-12        0d..1 75796122us : rcu_segcb_stats: SegCbDequeued seglen: (DONE=0, WAIT=43, NEXT_READY=0, NEXT=0) gp_seq: (DONE=0, WAIT=11656, NEXT_READY=11656, NEXT=0)
> [  191.846493]  rcuop/0-12        0.... 75796122us : rcu_invoke_callback: rcu_preempt rhp=00000000017536a2 func=i_callback
> [  191.848160]  rcuop/0-12        0.... 75796123us : rcu_invoke_callback: rcu_preempt rhp=0000000014235c71 func=__d_free
> [  191.849695]  rcuop/0-12        0.... 75796123us : rcu_invoke_callback: rcu_preempt rhp=00000000368c5928 func=i_callback
> [  191.851262]  rcuop/0-12        0.... 75796124us : rcu_invoke_callback: rcu_preempt rhp=00000000bdbea790 func=__d_free
> [  191.852800]  rcuop/0-12        0.... 75796124us : rcu_invoke_callback: rcu_preempt rhp=0000000034b99f3d func=rcu_barrier_callback
> [  192.526784]  rcuop/0-12        0d..1 75809162us : rcu_segcb_stats: SegCbDequeued seglen: (DONE=0, WAIT=0, NEXT_READY=0, NEXT=0) gp_seq: (DONE=0, WAIT=11656, NEXT_READY=11656, NEXT=0)

Quite the coincidence that WAIT and NEXT_READY have exactly the same
number of callbacks, isn't it?  Or am I being too suspicious today?

								Thanx, Paul

> [  192.529132]  rcuop/0-12        0.... 75809163us : rcu_invoke_callback: rcu_preempt rhp=000000002d6a3fce func=rcu_sync_func
> [  192.530807]  rcuop/0-12        0.... 75809165us : rcu_invoke_callback: rcu_preempt rhp=000000009aa91c97 func=destroy_sched_domains_rcu
> [  192.532556]  rcuop/0-12        0.... 75809170us : rcu_invoke_callback: rcu_preempt rhp=000000002bb5a998 func=destroy_sched_domains_rcu
> [  192.534303]  rcuop/0-12        0.... 75809172us : rcu_invoke_callback: rcu_preempt rhp=00000000bcc2369a func=destroy_sched_domains_rcu
> [  192.536053]  rcuop/0-12        0.... 75809174us : rcu_invoke_callback: rcu_preempt rhp=000000004dcec39b func=destroy_sched_domains_rcu
> [  192.537802]  rcuop/0-12        0.... 75809176us : rcu_invoke_callback: rcu_preempt rhp=00000000dedb509d func=destroy_sched_domains_rcu
> [  192.539553]  rcuop/0-12        0.... 75809178us : rcu_invoke_callback: rcu_preempt rhp=000000006fe7dd9e func=destroy_sched_domains_rcu
> [  192.541299]  rcuop/0-12        0.... 75809180us : rcu_invoke_callback: rcu_preempt rhp=000000005a212061 func=destroy_sched_domains_rcu
> [  192.543043]  rcuop/0-12        0.... 75809182us : rcu_invoke_callback: rcu_preempt rhp=00000000c914935f func=destroy_sched_domains_rcu
> [  192.544792]  rcuop/0-12        0.... 75809184us : rcu_invoke_callback: rcu_preempt rhp=0000000019fa3368 func=destroy_sched_domains_rcu
> [  192.546539]  rcuop/0-12        0.... 75809186us : rcu_invoke_callback: rcu_preempt rhp=00000000ab06c069 func=destroy_sched_domains_rcu
> [  192.548289]  rcuop/0-12        0.... 75809188us : rcu_invoke_callback: rcu_preempt rhp=000000003c134d6b func=destroy_sched_domains_rcu
> [  192.550037]  rcuop/0-12        0.... 75809190us : rcu_invoke_callback: rcu_preempt rhp=00000000cd1fda6c func=destroy_sched_domains_rcu
> [  192.551790]  rcuop/0-12        0.... 75809192us : rcu_invoke_callback: rcu_preempt rhp=000000005e2c676e func=destroy_sched_domains_rcu
> [  192.553576]  rcuop/0-12        0.... 75809194us : rcu_invoke_callback: rcu_preempt rhp=00000000ef38f46f func=destroy_sched_domains_rcu
> [  192.555314]  rcuop/0-12        0.... 75809196us : rcu_invoke_callback: rcu_preempt rhp=0000000080458170 func=destroy_sched_domains_rcu
> [  192.557054]  rcuop/0-12        0.... 75809198us : rcu_invoke_callback: rcu_preempt rhp=0000000009316351 func=destroy_sched_domains_rcu
> [  192.558793]  rcuop/0-12        0.... 75809200us : rcu_invoke_callback: rcu_preempt rhp=0000000022a2585e func=destroy_sched_domains_rcu
> [  192.560539]  rcuop/0-12        0.... 75809202us : rcu_invoke_callback: rcu_preempt rhp=00000000b3aee55f func=destroy_sched_domains_rcu
> [  192.562282]  rcuop/0-12        0.... 75809204us : rcu_invoke_callback: rcu_preempt rhp=0000000044bb7261 func=destroy_sched_domains_rcu
> [  192.564028]  rcuop/0-12        0.... 75809206us : rcu_invoke_callback: rcu_preempt rhp=00000000d5c7ff62 func=destroy_sched_domains_rcu
> [  192.565777]  rcuop/0-12        0.... 75809208us : rcu_invoke_callback: rcu_preempt rhp=00000000bc570a55 func=destroy_sched_domains_rcu
> [  192.567568]  rcuop/0-12        0.... 75809210us : rcu_invoke_callback: rcu_preempt rhp=000000004d639757 func=destroy_sched_domains_rcu
> [  192.569315]  rcuop/0-12        0.... 75809212us : rcu_invoke_callback: rcu_preempt rhp=00000000de702458 func=destroy_sched_domains_rcu
> [  192.571111]  rcuop/0-12        0.... 75809214us : rcu_invoke_callback: rcu_preempt rhp=000000006f7cb15a func=destroy_sched_domains_rcu
> [  192.572855]  rcuop/0-12        0.... 75809216us : rcu_invoke_callback: rcu_preempt rhp=0000000000893e5b func=destroy_sched_domains_rcu
> [  192.574595]  rcuop/0-12        0.... 75809218us : rcu_invoke_callback: rcu_preempt rhp=0000000066d48c64 func=destroy_sched_domains_rcu
> [  192.576435]  rcuop/0-12        0.... 75809220us : rcu_invoke_callback: rcu_preempt rhp=000000009a3df053 func=destroy_sched_domains_rcu
> [  192.578182]  rcuop/0-12        0.... 75809221us : rcu_invoke_callback: rcu_preempt rhp=000000002b4a7d54 func=destroy_sched_domains_rcu
> [  192.579981]  rcuop/0-12        0.... 75809223us : rcu_invoke_callback: rcu_preempt rhp=000000009cf78b8b func=destroy_sched_domains_rcu
> [  192.581773]  rcuop/0-12        0.... 75809225us : rcu_invoke_callback: rcu_preempt rhp=000000000beafe8a func=destroy_sched_domains_rcu
> [  192.583547]  rcuop/0-12        0.... 75809227us : rcu_invoke_callback: rcu_preempt rhp=000000007ade7189 func=destroy_sched_domains_rcu
> [  192.585373]  rcuop/0-12        0.... 75809229us : rcu_invoke_callback: rcu_preempt rhp=00000000e9d1e487 func=destroy_sched_domains_rcu
> [  192.587139]  rcuop/0-12        0.... 75809231us : rcu_invoke_callback: rcu_preempt rhp=000000002e04188d func=destroy_sched_domains_rcu
> [  192.588984]  rcuop/0-12        0.... 75809233us : rcu_invoke_callback: rcu_preempt rhp=00000000bf10a58e func=destroy_sched_domains_rcu
> [  192.590732]  rcuop/0-12        0.... 75809235us : rcu_invoke_callback: rcu_preempt rhp=00000000501d3290 func=destroy_sched_domains_rcu
> [  192.592562]  rcuop/0-12        0.... 75809237us : rcu_invoke_callback: rcu_preempt rhp=00000000e129bf91 func=destroy_sched_domains_rcu
> [  192.594309]  rcuop/0-12        0.... 75809239us : rcu_invoke_callback: rcu_preempt rhp=0000000072364c93 func=destroy_sched_domains_rcu
> [  192.596063]  rcuop/0-12        0.... 75809241us : rcu_invoke_callback: rcu_preempt rhp=000000000342d994 func=destroy_sched_domains_rcu
> [  192.597807]  rcuop/0-12        0.... 75809243us : rcu_invoke_callback: rcu_preempt rhp=00000000944f6695 func=destroy_sched_domains_rcu
> [  192.599608]  rcuop/0-12        0.... 75809245us : rcu_invoke_callback: rcu_preempt rhp=00000000255bf397 func=destroy_sched_domains_rcu
> [  192.601341]  rcuop/0-12        0.... 75809247us : rcu_invoke_callback: rcu_preempt rhp=00000000d4a4767b func=free_rootdomain
> [  192.602958]  rcuop/0-12        0.... 75809251us : rcu_invoke_callback: rcu_preempt rhp=00000000b6688098 func=destroy_sched_domains_rcu
> [  192.604779]  rcuop/0-12        0.... 75809269us : rcu_invoke_callback: rcu_preempt rhp=0000000034b99f3d func=rcu_barrier_callback
> [  193.894910]  rcuop/0-12        0d..1 75873331us : rcu_do_batch: p2
> 
> thanks,
> 
>  - Joel
> 
> 
> > 
> > 							Thanx, Paul
> > 
> > > > > ---
> > > > > v1->v2: Added more debug checks.
> > > > > 
> > > > >  kernel/rcu/rcu_segcblist.c | 12 ++++++++++++
> > > > >  kernel/rcu/rcu_segcblist.h |  3 +++
> > > > >  kernel/rcu/tree.c          |  2 ++
> > > > >  3 files changed, 17 insertions(+)
> > > > > 
> > > > > diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
> > > > > index 5059b6102afe..6e98bb3804f0 100644
> > > > > --- a/kernel/rcu/rcu_segcblist.c
> > > > > +++ b/kernel/rcu/rcu_segcblist.c
> > > > > @@ -94,6 +94,18 @@ static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
> > > > >  	return READ_ONCE(rsclp->seglen[seg]);
> > > > >  }
> > > > >  
> > > > > +/* Return number of callbacks in segmented callback list by totalling seglen. */
> > > > > +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp)
> > > > > +{
> > > > > +	long len = 0;
> > > > > +	int i;
> > > > > +
> > > > > +	for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
> > > > > +		len += rcu_segcblist_get_seglen(rsclp, i);
> > > > > +
> > > > > +	return len;
> > > > > +}
> > > > > +
> > > > >  /* Set the length of a segment of the rcu_segcblist structure. */
> > > > >  static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v)
> > > > >  {
> > > > > diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
> > > > > index cd35c9faaf51..46a42d77f7e1 100644
> > > > > --- a/kernel/rcu/rcu_segcblist.h
> > > > > +++ b/kernel/rcu/rcu_segcblist.h
> > > > > @@ -15,6 +15,9 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
> > > > >  	return READ_ONCE(rclp->len);
> > > > >  }
> > > > >  
> > > > > +/* Return number of callbacks in segmented callback list by totalling seglen. */
> > > > > +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp);
> > > > > +
> > > > >  void rcu_cblist_init(struct rcu_cblist *rclp);
> > > > >  void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
> > > > >  void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
> > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > > > index f5b61e10f1de..91e35b521e51 100644
> > > > > --- a/kernel/rcu/tree.c
> > > > > +++ b/kernel/rcu/tree.c
> > > > > @@ -2553,6 +2553,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
> > > > >  	WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
> > > > >  	WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
> > > > >  		     count != 0 && rcu_segcblist_empty(&rdp->cblist));
> > > > > +	WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
> > > > > +	WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> > > > >  
> > > > >  	rcu_nocb_unlock_irqrestore(rdp, flags);
> > > > >  
> > > > > -- 
> > > > > 2.29.2.299.gdc1121823c-goog

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] rcu/segcblist: Add debug checks for segment lengths
  2020-11-19 19:22         ` Paul E. McKenney
@ 2020-11-19 19:44           ` Joel Fernandes
  2020-11-19 20:16             ` Paul E. McKenney
  0 siblings, 1 reply; 13+ messages in thread
From: Joel Fernandes @ 2020-11-19 19:44 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: LKML, Josh Triplett, Lai Jiangshan, Mathieu Desnoyers, rcu,
	Steven Rostedt

On Thu, Nov 19, 2020 at 2:22 PM Paul E. McKenney <paulmck@kernel.org> wrote:
> > > > > On Wed, Nov 18, 2020 at 11:15:41AM -0500, Joel Fernandes (Google) wrote:
> > > > > > After rcu_do_batch(), add a check for whether the seglen counts went to
> > > > > > zero if the list was indeed empty.
> > > > > >
> > > > > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > > >
> > > > > Queued for testing and further review, thank you!
> > > >
> > > > FYI, the second of the two checks triggered in all four one-hour runs of
> > > > TREE01, all four one-hour runs of TREE04, and one of the four one-hour
> > > > runs of TREE07.  This one:
> > > >
> > > > WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> > > >
> > > > That is, there are callbacks in the list, but the sum of the segment
> > > > counts is nevertheless zero.  The ->nocb_lock is held.
> > > >
> > > > Thoughts?
> > >
> > > FWIW, TREE01 reproduces it very quickly compared to the other two
> > > scenarios, on all four run, within five minutes.
> >
> > So far for TREE01, I traced it down to an rcu_barrier happening so it could
> > be related to some interaction with rcu_barrier() (Just a guess).
>
> Well, rcu_barrier() and srcu_barrier() are the only users of
> rcu_segcblist_entrain(), if that helps.  Your modification to that
> function looks plausible to me, but the system's opinion always overrules
> mine.  ;-)

Right. Does anything the bypass code standout? That happens during
rcu_barrier() as well, and it messes with the lengths.

> > 'p2' print below is the panic on the second warning. It executes 43 callbacks
> > from the segcb list, but the list length still does not appear to be 0.
> >
> > I'll debug it more:
> >
> > [  191.085702]  rcuop/0-12        0.... 75782125us : rcu_invoke_callback: rcu_preempt rhp=000000006a881152 func=__d_free
> > [  191.844028]  rcuop/0-12        0d..1 75796122us : rcu_segcb_stats: SegCbDequeued seglen: (DONE=0, WAIT=43, NEXT_READY=0, NEXT=0) gp_seq: (DONE=0, WAIT=11656, NEXT_READY=11656, NEXT=0)
> > [  191.846493]  rcuop/0-12        0.... 75796122us : rcu_invoke_callback: rcu_preempt rhp=00000000017536a2 func=i_callback
> > [  191.848160]  rcuop/0-12        0.... 75796123us : rcu_invoke_callback: rcu_preempt rhp=0000000014235c71 func=__d_free
> > [  191.849695]  rcuop/0-12        0.... 75796123us : rcu_invoke_callback: rcu_preempt rhp=00000000368c5928 func=i_callback
> > [  191.851262]  rcuop/0-12        0.... 75796124us : rcu_invoke_callback: rcu_preempt rhp=00000000bdbea790 func=__d_free
> > [  191.852800]  rcuop/0-12        0.... 75796124us : rcu_invoke_callback: rcu_preempt rhp=0000000034b99f3d func=rcu_barrier_callback
> > [  192.526784]  rcuop/0-12        0d..1 75809162us : rcu_segcb_stats: SegCbDequeued seglen: (DONE=0, WAIT=0, NEXT_READY=0, NEXT=0) gp_seq: (DONE=0, WAIT=11656, NEXT_READY=11656, NEXT=0)
>
> Quite the coincidence that WAIT and NEXT_READY have exactly the same
> number of callbacks, isn't it?  Or am I being too suspicious today?

Those numbers are gp_seq :-D.

thanks,

 - Joel



>
>                                                                 Thanx, Paul
>
> > [  192.529132]  rcuop/0-12        0.... 75809163us : rcu_invoke_callback: rcu_preempt rhp=000000002d6a3fce func=rcu_sync_func
> > [  192.530807]  rcuop/0-12        0.... 75809165us : rcu_invoke_callback: rcu_preempt rhp=000000009aa91c97 func=destroy_sched_domains_rcu
> > [  192.532556]  rcuop/0-12        0.... 75809170us : rcu_invoke_callback: rcu_preempt rhp=000000002bb5a998 func=destroy_sched_domains_rcu
> > [  192.534303]  rcuop/0-12        0.... 75809172us : rcu_invoke_callback: rcu_preempt rhp=00000000bcc2369a func=destroy_sched_domains_rcu
> > [  192.536053]  rcuop/0-12        0.... 75809174us : rcu_invoke_callback: rcu_preempt rhp=000000004dcec39b func=destroy_sched_domains_rcu
> > [  192.537802]  rcuop/0-12        0.... 75809176us : rcu_invoke_callback: rcu_preempt rhp=00000000dedb509d func=destroy_sched_domains_rcu
> > [  192.539553]  rcuop/0-12        0.... 75809178us : rcu_invoke_callback: rcu_preempt rhp=000000006fe7dd9e func=destroy_sched_domains_rcu
> > [  192.541299]  rcuop/0-12        0.... 75809180us : rcu_invoke_callback: rcu_preempt rhp=000000005a212061 func=destroy_sched_domains_rcu
> > [  192.543043]  rcuop/0-12        0.... 75809182us : rcu_invoke_callback: rcu_preempt rhp=00000000c914935f func=destroy_sched_domains_rcu
> > [  192.544792]  rcuop/0-12        0.... 75809184us : rcu_invoke_callback: rcu_preempt rhp=0000000019fa3368 func=destroy_sched_domains_rcu
> > [  192.546539]  rcuop/0-12        0.... 75809186us : rcu_invoke_callback: rcu_preempt rhp=00000000ab06c069 func=destroy_sched_domains_rcu
> > [  192.548289]  rcuop/0-12        0.... 75809188us : rcu_invoke_callback: rcu_preempt rhp=000000003c134d6b func=destroy_sched_domains_rcu
> > [  192.550037]  rcuop/0-12        0.... 75809190us : rcu_invoke_callback: rcu_preempt rhp=00000000cd1fda6c func=destroy_sched_domains_rcu
> > [  192.551790]  rcuop/0-12        0.... 75809192us : rcu_invoke_callback: rcu_preempt rhp=000000005e2c676e func=destroy_sched_domains_rcu
> > [  192.553576]  rcuop/0-12        0.... 75809194us : rcu_invoke_callback: rcu_preempt rhp=00000000ef38f46f func=destroy_sched_domains_rcu
> > [  192.555314]  rcuop/0-12        0.... 75809196us : rcu_invoke_callback: rcu_preempt rhp=0000000080458170 func=destroy_sched_domains_rcu
> > [  192.557054]  rcuop/0-12        0.... 75809198us : rcu_invoke_callback: rcu_preempt rhp=0000000009316351 func=destroy_sched_domains_rcu
> > [  192.558793]  rcuop/0-12        0.... 75809200us : rcu_invoke_callback: rcu_preempt rhp=0000000022a2585e func=destroy_sched_domains_rcu
> > [  192.560539]  rcuop/0-12        0.... 75809202us : rcu_invoke_callback: rcu_preempt rhp=00000000b3aee55f func=destroy_sched_domains_rcu
> > [  192.562282]  rcuop/0-12        0.... 75809204us : rcu_invoke_callback: rcu_preempt rhp=0000000044bb7261 func=destroy_sched_domains_rcu
> > [  192.564028]  rcuop/0-12        0.... 75809206us : rcu_invoke_callback: rcu_preempt rhp=00000000d5c7ff62 func=destroy_sched_domains_rcu
> > [  192.565777]  rcuop/0-12        0.... 75809208us : rcu_invoke_callback: rcu_preempt rhp=00000000bc570a55 func=destroy_sched_domains_rcu
> > [  192.567568]  rcuop/0-12        0.... 75809210us : rcu_invoke_callback: rcu_preempt rhp=000000004d639757 func=destroy_sched_domains_rcu
> > [  192.569315]  rcuop/0-12        0.... 75809212us : rcu_invoke_callback: rcu_preempt rhp=00000000de702458 func=destroy_sched_domains_rcu
> > [  192.571111]  rcuop/0-12        0.... 75809214us : rcu_invoke_callback: rcu_preempt rhp=000000006f7cb15a func=destroy_sched_domains_rcu
> > [  192.572855]  rcuop/0-12        0.... 75809216us : rcu_invoke_callback: rcu_preempt rhp=0000000000893e5b func=destroy_sched_domains_rcu
> > [  192.574595]  rcuop/0-12        0.... 75809218us : rcu_invoke_callback: rcu_preempt rhp=0000000066d48c64 func=destroy_sched_domains_rcu
> > [  192.576435]  rcuop/0-12        0.... 75809220us : rcu_invoke_callback: rcu_preempt rhp=000000009a3df053 func=destroy_sched_domains_rcu
> > [  192.578182]  rcuop/0-12        0.... 75809221us : rcu_invoke_callback: rcu_preempt rhp=000000002b4a7d54 func=destroy_sched_domains_rcu
> > [  192.579981]  rcuop/0-12        0.... 75809223us : rcu_invoke_callback: rcu_preempt rhp=000000009cf78b8b func=destroy_sched_domains_rcu
> > [  192.581773]  rcuop/0-12        0.... 75809225us : rcu_invoke_callback: rcu_preempt rhp=000000000beafe8a func=destroy_sched_domains_rcu
> > [  192.583547]  rcuop/0-12        0.... 75809227us : rcu_invoke_callback: rcu_preempt rhp=000000007ade7189 func=destroy_sched_domains_rcu
> > [  192.585373]  rcuop/0-12        0.... 75809229us : rcu_invoke_callback: rcu_preempt rhp=00000000e9d1e487 func=destroy_sched_domains_rcu
> > [  192.587139]  rcuop/0-12        0.... 75809231us : rcu_invoke_callback: rcu_preempt rhp=000000002e04188d func=destroy_sched_domains_rcu
> > [  192.588984]  rcuop/0-12        0.... 75809233us : rcu_invoke_callback: rcu_preempt rhp=00000000bf10a58e func=destroy_sched_domains_rcu
> > [  192.590732]  rcuop/0-12        0.... 75809235us : rcu_invoke_callback: rcu_preempt rhp=00000000501d3290 func=destroy_sched_domains_rcu
> > [  192.592562]  rcuop/0-12        0.... 75809237us : rcu_invoke_callback: rcu_preempt rhp=00000000e129bf91 func=destroy_sched_domains_rcu
> > [  192.594309]  rcuop/0-12        0.... 75809239us : rcu_invoke_callback: rcu_preempt rhp=0000000072364c93 func=destroy_sched_domains_rcu
> > [  192.596063]  rcuop/0-12        0.... 75809241us : rcu_invoke_callback: rcu_preempt rhp=000000000342d994 func=destroy_sched_domains_rcu
> > [  192.597807]  rcuop/0-12        0.... 75809243us : rcu_invoke_callback: rcu_preempt rhp=00000000944f6695 func=destroy_sched_domains_rcu
> > [  192.599608]  rcuop/0-12        0.... 75809245us : rcu_invoke_callback: rcu_preempt rhp=00000000255bf397 func=destroy_sched_domains_rcu
> > [  192.601341]  rcuop/0-12        0.... 75809247us : rcu_invoke_callback: rcu_preempt rhp=00000000d4a4767b func=free_rootdomain
> > [  192.602958]  rcuop/0-12        0.... 75809251us : rcu_invoke_callback: rcu_preempt rhp=00000000b6688098 func=destroy_sched_domains_rcu
> > [  192.604779]  rcuop/0-12        0.... 75809269us : rcu_invoke_callback: rcu_preempt rhp=0000000034b99f3d func=rcu_barrier_callback
> > [  193.894910]  rcuop/0-12        0d..1 75873331us : rcu_do_batch: p2
> >
> > thanks,
> >
> >  - Joel
> >
> >
> > >
> > >                                                     Thanx, Paul
> > >
> > > > > > ---
> > > > > > v1->v2: Added more debug checks.
> > > > > >
> > > > > >  kernel/rcu/rcu_segcblist.c | 12 ++++++++++++
> > > > > >  kernel/rcu/rcu_segcblist.h |  3 +++
> > > > > >  kernel/rcu/tree.c          |  2 ++
> > > > > >  3 files changed, 17 insertions(+)
> > > > > >
> > > > > > diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
> > > > > > index 5059b6102afe..6e98bb3804f0 100644
> > > > > > --- a/kernel/rcu/rcu_segcblist.c
> > > > > > +++ b/kernel/rcu/rcu_segcblist.c
> > > > > > @@ -94,6 +94,18 @@ static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
> > > > > >       return READ_ONCE(rsclp->seglen[seg]);
> > > > > >  }
> > > > > >
> > > > > > +/* Return number of callbacks in segmented callback list by totalling seglen. */
> > > > > > +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp)
> > > > > > +{
> > > > > > +     long len = 0;
> > > > > > +     int i;
> > > > > > +
> > > > > > +     for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
> > > > > > +             len += rcu_segcblist_get_seglen(rsclp, i);
> > > > > > +
> > > > > > +     return len;
> > > > > > +}
> > > > > > +
> > > > > >  /* Set the length of a segment of the rcu_segcblist structure. */
> > > > > >  static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v)
> > > > > >  {
> > > > > > diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
> > > > > > index cd35c9faaf51..46a42d77f7e1 100644
> > > > > > --- a/kernel/rcu/rcu_segcblist.h
> > > > > > +++ b/kernel/rcu/rcu_segcblist.h
> > > > > > @@ -15,6 +15,9 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
> > > > > >       return READ_ONCE(rclp->len);
> > > > > >  }
> > > > > >
> > > > > > +/* Return number of callbacks in segmented callback list by totalling seglen. */
> > > > > > +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp);
> > > > > > +
> > > > > >  void rcu_cblist_init(struct rcu_cblist *rclp);
> > > > > >  void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
> > > > > >  void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
> > > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > > > > index f5b61e10f1de..91e35b521e51 100644
> > > > > > --- a/kernel/rcu/tree.c
> > > > > > +++ b/kernel/rcu/tree.c
> > > > > > @@ -2553,6 +2553,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
> > > > > >       WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
> > > > > >       WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
> > > > > >                    count != 0 && rcu_segcblist_empty(&rdp->cblist));
> > > > > > +     WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
> > > > > > +     WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> > > > > >
> > > > > >       rcu_nocb_unlock_irqrestore(rdp, flags);
> > > > > >
> > > > > > --
> > > > > > 2.29.2.299.gdc1121823c-goog

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] rcu/segcblist: Add debug checks for segment lengths
  2020-11-19 19:44           ` Joel Fernandes
@ 2020-11-19 20:16             ` Paul E. McKenney
  2020-11-19 20:42               ` Joel Fernandes
  0 siblings, 1 reply; 13+ messages in thread
From: Paul E. McKenney @ 2020-11-19 20:16 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: LKML, Josh Triplett, Lai Jiangshan, Mathieu Desnoyers, rcu,
	Steven Rostedt

On Thu, Nov 19, 2020 at 02:44:35PM -0500, Joel Fernandes wrote:
> On Thu, Nov 19, 2020 at 2:22 PM Paul E. McKenney <paulmck@kernel.org> wrote:
> > > > > > On Wed, Nov 18, 2020 at 11:15:41AM -0500, Joel Fernandes (Google) wrote:
> > > > > > > After rcu_do_batch(), add a check for whether the seglen counts went to
> > > > > > > zero if the list was indeed empty.
> > > > > > >
> > > > > > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > > > >
> > > > > > Queued for testing and further review, thank you!
> > > > >
> > > > > FYI, the second of the two checks triggered in all four one-hour runs of
> > > > > TREE01, all four one-hour runs of TREE04, and one of the four one-hour
> > > > > runs of TREE07.  This one:
> > > > >
> > > > > WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> > > > >
> > > > > That is, there are callbacks in the list, but the sum of the segment
> > > > > counts is nevertheless zero.  The ->nocb_lock is held.
> > > > >
> > > > > Thoughts?
> > > >
> > > > FWIW, TREE01 reproduces it very quickly compared to the other two
> > > > scenarios, on all four run, within five minutes.
> > >
> > > So far for TREE01, I traced it down to an rcu_barrier happening so it could
> > > be related to some interaction with rcu_barrier() (Just a guess).
> >
> > Well, rcu_barrier() and srcu_barrier() are the only users of
> > rcu_segcblist_entrain(), if that helps.  Your modification to that
> > function looks plausible to me, but the system's opinion always overrules
> > mine.  ;-)
> 
> Right. Does anything the bypass code standout? That happens during
> rcu_barrier() as well, and it messes with the lengths.

In theory, rcu_barrier_func() flushes the bypass before doing the
entrain, and does the rcu_segcblist_entrain() afterwards.

Ah, and that is the issue.  If ->cblist is empty and ->nocb_bypass
is not, then ->cblist length will be nonzero, and none of the
segments will be nonzero.

So you need something like this for that second WARN, correct?

	WARN_ON_ONCE(!rcu_segcblist_empty(&rdp->cblist) &&
		     rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);

This is off the cuff, so should be taken with a grain of salt.  And
there might well be other similar issues.

> > > 'p2' print below is the panic on the second warning. It executes 43 callbacks
> > > from the segcb list, but the list length still does not appear to be 0.
> > >
> > > I'll debug it more:
> > >
> > > [  191.085702]  rcuop/0-12        0.... 75782125us : rcu_invoke_callback: rcu_preempt rhp=000000006a881152 func=__d_free
> > > [  191.844028]  rcuop/0-12        0d..1 75796122us : rcu_segcb_stats: SegCbDequeued seglen: (DONE=0, WAIT=43, NEXT_READY=0, NEXT=0) gp_seq: (DONE=0, WAIT=11656, NEXT_READY=11656, NEXT=0)
> > > [  191.846493]  rcuop/0-12        0.... 75796122us : rcu_invoke_callback: rcu_preempt rhp=00000000017536a2 func=i_callback
> > > [  191.848160]  rcuop/0-12        0.... 75796123us : rcu_invoke_callback: rcu_preempt rhp=0000000014235c71 func=__d_free
> > > [  191.849695]  rcuop/0-12        0.... 75796123us : rcu_invoke_callback: rcu_preempt rhp=00000000368c5928 func=i_callback
> > > [  191.851262]  rcuop/0-12        0.... 75796124us : rcu_invoke_callback: rcu_preempt rhp=00000000bdbea790 func=__d_free
> > > [  191.852800]  rcuop/0-12        0.... 75796124us : rcu_invoke_callback: rcu_preempt rhp=0000000034b99f3d func=rcu_barrier_callback
> > > [  192.526784]  rcuop/0-12        0d..1 75809162us : rcu_segcb_stats: SegCbDequeued seglen: (DONE=0, WAIT=0, NEXT_READY=0, NEXT=0) gp_seq: (DONE=0, WAIT=11656, NEXT_READY=11656, NEXT=0)
> >
> > Quite the coincidence that WAIT and NEXT_READY have exactly the same
> > number of callbacks, isn't it?  Or am I being too suspicious today?
> 
> Those numbers are gp_seq :-D.

That would explain it!  ;-)

Still odd from a useful-information point of view.  If both gp_seq fields
are identical, they should be merged.  But yes, if the second one is
empty, this perhaps could happen.  And it probably predates your series.

							Thanx, Paul

> thanks,
> 
>  - Joel
> 
> 
> 
> >
> >                                                                 Thanx, Paul
> >
> > > [  192.529132]  rcuop/0-12        0.... 75809163us : rcu_invoke_callback: rcu_preempt rhp=000000002d6a3fce func=rcu_sync_func
> > > [  192.530807]  rcuop/0-12        0.... 75809165us : rcu_invoke_callback: rcu_preempt rhp=000000009aa91c97 func=destroy_sched_domains_rcu
> > > [  192.532556]  rcuop/0-12        0.... 75809170us : rcu_invoke_callback: rcu_preempt rhp=000000002bb5a998 func=destroy_sched_domains_rcu
> > > [  192.534303]  rcuop/0-12        0.... 75809172us : rcu_invoke_callback: rcu_preempt rhp=00000000bcc2369a func=destroy_sched_domains_rcu
> > > [  192.536053]  rcuop/0-12        0.... 75809174us : rcu_invoke_callback: rcu_preempt rhp=000000004dcec39b func=destroy_sched_domains_rcu
> > > [  192.537802]  rcuop/0-12        0.... 75809176us : rcu_invoke_callback: rcu_preempt rhp=00000000dedb509d func=destroy_sched_domains_rcu
> > > [  192.539553]  rcuop/0-12        0.... 75809178us : rcu_invoke_callback: rcu_preempt rhp=000000006fe7dd9e func=destroy_sched_domains_rcu
> > > [  192.541299]  rcuop/0-12        0.... 75809180us : rcu_invoke_callback: rcu_preempt rhp=000000005a212061 func=destroy_sched_domains_rcu
> > > [  192.543043]  rcuop/0-12        0.... 75809182us : rcu_invoke_callback: rcu_preempt rhp=00000000c914935f func=destroy_sched_domains_rcu
> > > [  192.544792]  rcuop/0-12        0.... 75809184us : rcu_invoke_callback: rcu_preempt rhp=0000000019fa3368 func=destroy_sched_domains_rcu
> > > [  192.546539]  rcuop/0-12        0.... 75809186us : rcu_invoke_callback: rcu_preempt rhp=00000000ab06c069 func=destroy_sched_domains_rcu
> > > [  192.548289]  rcuop/0-12        0.... 75809188us : rcu_invoke_callback: rcu_preempt rhp=000000003c134d6b func=destroy_sched_domains_rcu
> > > [  192.550037]  rcuop/0-12        0.... 75809190us : rcu_invoke_callback: rcu_preempt rhp=00000000cd1fda6c func=destroy_sched_domains_rcu
> > > [  192.551790]  rcuop/0-12        0.... 75809192us : rcu_invoke_callback: rcu_preempt rhp=000000005e2c676e func=destroy_sched_domains_rcu
> > > [  192.553576]  rcuop/0-12        0.... 75809194us : rcu_invoke_callback: rcu_preempt rhp=00000000ef38f46f func=destroy_sched_domains_rcu
> > > [  192.555314]  rcuop/0-12        0.... 75809196us : rcu_invoke_callback: rcu_preempt rhp=0000000080458170 func=destroy_sched_domains_rcu
> > > [  192.557054]  rcuop/0-12        0.... 75809198us : rcu_invoke_callback: rcu_preempt rhp=0000000009316351 func=destroy_sched_domains_rcu
> > > [  192.558793]  rcuop/0-12        0.... 75809200us : rcu_invoke_callback: rcu_preempt rhp=0000000022a2585e func=destroy_sched_domains_rcu
> > > [  192.560539]  rcuop/0-12        0.... 75809202us : rcu_invoke_callback: rcu_preempt rhp=00000000b3aee55f func=destroy_sched_domains_rcu
> > > [  192.562282]  rcuop/0-12        0.... 75809204us : rcu_invoke_callback: rcu_preempt rhp=0000000044bb7261 func=destroy_sched_domains_rcu
> > > [  192.564028]  rcuop/0-12        0.... 75809206us : rcu_invoke_callback: rcu_preempt rhp=00000000d5c7ff62 func=destroy_sched_domains_rcu
> > > [  192.565777]  rcuop/0-12        0.... 75809208us : rcu_invoke_callback: rcu_preempt rhp=00000000bc570a55 func=destroy_sched_domains_rcu
> > > [  192.567568]  rcuop/0-12        0.... 75809210us : rcu_invoke_callback: rcu_preempt rhp=000000004d639757 func=destroy_sched_domains_rcu
> > > [  192.569315]  rcuop/0-12        0.... 75809212us : rcu_invoke_callback: rcu_preempt rhp=00000000de702458 func=destroy_sched_domains_rcu
> > > [  192.571111]  rcuop/0-12        0.... 75809214us : rcu_invoke_callback: rcu_preempt rhp=000000006f7cb15a func=destroy_sched_domains_rcu
> > > [  192.572855]  rcuop/0-12        0.... 75809216us : rcu_invoke_callback: rcu_preempt rhp=0000000000893e5b func=destroy_sched_domains_rcu
> > > [  192.574595]  rcuop/0-12        0.... 75809218us : rcu_invoke_callback: rcu_preempt rhp=0000000066d48c64 func=destroy_sched_domains_rcu
> > > [  192.576435]  rcuop/0-12        0.... 75809220us : rcu_invoke_callback: rcu_preempt rhp=000000009a3df053 func=destroy_sched_domains_rcu
> > > [  192.578182]  rcuop/0-12        0.... 75809221us : rcu_invoke_callback: rcu_preempt rhp=000000002b4a7d54 func=destroy_sched_domains_rcu
> > > [  192.579981]  rcuop/0-12        0.... 75809223us : rcu_invoke_callback: rcu_preempt rhp=000000009cf78b8b func=destroy_sched_domains_rcu
> > > [  192.581773]  rcuop/0-12        0.... 75809225us : rcu_invoke_callback: rcu_preempt rhp=000000000beafe8a func=destroy_sched_domains_rcu
> > > [  192.583547]  rcuop/0-12        0.... 75809227us : rcu_invoke_callback: rcu_preempt rhp=000000007ade7189 func=destroy_sched_domains_rcu
> > > [  192.585373]  rcuop/0-12        0.... 75809229us : rcu_invoke_callback: rcu_preempt rhp=00000000e9d1e487 func=destroy_sched_domains_rcu
> > > [  192.587139]  rcuop/0-12        0.... 75809231us : rcu_invoke_callback: rcu_preempt rhp=000000002e04188d func=destroy_sched_domains_rcu
> > > [  192.588984]  rcuop/0-12        0.... 75809233us : rcu_invoke_callback: rcu_preempt rhp=00000000bf10a58e func=destroy_sched_domains_rcu
> > > [  192.590732]  rcuop/0-12        0.... 75809235us : rcu_invoke_callback: rcu_preempt rhp=00000000501d3290 func=destroy_sched_domains_rcu
> > > [  192.592562]  rcuop/0-12        0.... 75809237us : rcu_invoke_callback: rcu_preempt rhp=00000000e129bf91 func=destroy_sched_domains_rcu
> > > [  192.594309]  rcuop/0-12        0.... 75809239us : rcu_invoke_callback: rcu_preempt rhp=0000000072364c93 func=destroy_sched_domains_rcu
> > > [  192.596063]  rcuop/0-12        0.... 75809241us : rcu_invoke_callback: rcu_preempt rhp=000000000342d994 func=destroy_sched_domains_rcu
> > > [  192.597807]  rcuop/0-12        0.... 75809243us : rcu_invoke_callback: rcu_preempt rhp=00000000944f6695 func=destroy_sched_domains_rcu
> > > [  192.599608]  rcuop/0-12        0.... 75809245us : rcu_invoke_callback: rcu_preempt rhp=00000000255bf397 func=destroy_sched_domains_rcu
> > > [  192.601341]  rcuop/0-12        0.... 75809247us : rcu_invoke_callback: rcu_preempt rhp=00000000d4a4767b func=free_rootdomain
> > > [  192.602958]  rcuop/0-12        0.... 75809251us : rcu_invoke_callback: rcu_preempt rhp=00000000b6688098 func=destroy_sched_domains_rcu
> > > [  192.604779]  rcuop/0-12        0.... 75809269us : rcu_invoke_callback: rcu_preempt rhp=0000000034b99f3d func=rcu_barrier_callback
> > > [  193.894910]  rcuop/0-12        0d..1 75873331us : rcu_do_batch: p2
> > >
> > > thanks,
> > >
> > >  - Joel
> > >
> > >
> > > >
> > > >                                                     Thanx, Paul
> > > >
> > > > > > > ---
> > > > > > > v1->v2: Added more debug checks.
> > > > > > >
> > > > > > >  kernel/rcu/rcu_segcblist.c | 12 ++++++++++++
> > > > > > >  kernel/rcu/rcu_segcblist.h |  3 +++
> > > > > > >  kernel/rcu/tree.c          |  2 ++
> > > > > > >  3 files changed, 17 insertions(+)
> > > > > > >
> > > > > > > diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
> > > > > > > index 5059b6102afe..6e98bb3804f0 100644
> > > > > > > --- a/kernel/rcu/rcu_segcblist.c
> > > > > > > +++ b/kernel/rcu/rcu_segcblist.c
> > > > > > > @@ -94,6 +94,18 @@ static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
> > > > > > >       return READ_ONCE(rsclp->seglen[seg]);
> > > > > > >  }
> > > > > > >
> > > > > > > +/* Return number of callbacks in segmented callback list by totalling seglen. */
> > > > > > > +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp)
> > > > > > > +{
> > > > > > > +     long len = 0;
> > > > > > > +     int i;
> > > > > > > +
> > > > > > > +     for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
> > > > > > > +             len += rcu_segcblist_get_seglen(rsclp, i);
> > > > > > > +
> > > > > > > +     return len;
> > > > > > > +}
> > > > > > > +
> > > > > > >  /* Set the length of a segment of the rcu_segcblist structure. */
> > > > > > >  static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v)
> > > > > > >  {
> > > > > > > diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
> > > > > > > index cd35c9faaf51..46a42d77f7e1 100644
> > > > > > > --- a/kernel/rcu/rcu_segcblist.h
> > > > > > > +++ b/kernel/rcu/rcu_segcblist.h
> > > > > > > @@ -15,6 +15,9 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
> > > > > > >       return READ_ONCE(rclp->len);
> > > > > > >  }
> > > > > > >
> > > > > > > +/* Return number of callbacks in segmented callback list by totalling seglen. */
> > > > > > > +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp);
> > > > > > > +
> > > > > > >  void rcu_cblist_init(struct rcu_cblist *rclp);
> > > > > > >  void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
> > > > > > >  void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
> > > > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > > > > > index f5b61e10f1de..91e35b521e51 100644
> > > > > > > --- a/kernel/rcu/tree.c
> > > > > > > +++ b/kernel/rcu/tree.c
> > > > > > > @@ -2553,6 +2553,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
> > > > > > >       WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
> > > > > > >       WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
> > > > > > >                    count != 0 && rcu_segcblist_empty(&rdp->cblist));
> > > > > > > +     WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
> > > > > > > +     WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> > > > > > >
> > > > > > >       rcu_nocb_unlock_irqrestore(rdp, flags);
> > > > > > >
> > > > > > > --
> > > > > > > 2.29.2.299.gdc1121823c-goog

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] rcu/segcblist: Add debug checks for segment lengths
  2020-11-19 20:16             ` Paul E. McKenney
@ 2020-11-19 20:42               ` Joel Fernandes
  2020-12-01 22:26                 ` Joel Fernandes
  0 siblings, 1 reply; 13+ messages in thread
From: Joel Fernandes @ 2020-11-19 20:42 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: LKML, Josh Triplett, Lai Jiangshan, Mathieu Desnoyers, rcu,
	Steven Rostedt

On Thu, Nov 19, 2020 at 12:16:15PM -0800, Paul E. McKenney wrote:
> On Thu, Nov 19, 2020 at 02:44:35PM -0500, Joel Fernandes wrote:
> > On Thu, Nov 19, 2020 at 2:22 PM Paul E. McKenney <paulmck@kernel.org> wrote:
> > > > > > > On Wed, Nov 18, 2020 at 11:15:41AM -0500, Joel Fernandes (Google) wrote:
> > > > > > > > After rcu_do_batch(), add a check for whether the seglen counts went to
> > > > > > > > zero if the list was indeed empty.
> > > > > > > >
> > > > > > > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > > > > >
> > > > > > > Queued for testing and further review, thank you!
> > > > > >
> > > > > > FYI, the second of the two checks triggered in all four one-hour runs of
> > > > > > TREE01, all four one-hour runs of TREE04, and one of the four one-hour
> > > > > > runs of TREE07.  This one:
> > > > > >
> > > > > > WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> > > > > >
> > > > > > That is, there are callbacks in the list, but the sum of the segment
> > > > > > counts is nevertheless zero.  The ->nocb_lock is held.
> > > > > >
> > > > > > Thoughts?
> > > > >
> > > > > FWIW, TREE01 reproduces it very quickly compared to the other two
> > > > > scenarios, on all four run, within five minutes.
> > > >
> > > > So far for TREE01, I traced it down to an rcu_barrier happening so it could
> > > > be related to some interaction with rcu_barrier() (Just a guess).
> > >
> > > Well, rcu_barrier() and srcu_barrier() are the only users of
> > > rcu_segcblist_entrain(), if that helps.  Your modification to that
> > > function looks plausible to me, but the system's opinion always overrules
> > > mine.  ;-)
> > 
> > Right. Does anything the bypass code standout? That happens during
> > rcu_barrier() as well, and it messes with the lengths.
> 
> In theory, rcu_barrier_func() flushes the bypass before doing the
> entrain, and does the rcu_segcblist_entrain() afterwards.
> 
> Ah, and that is the issue.  If ->cblist is empty and ->nocb_bypass
> is not, then ->cblist length will be nonzero, and none of the
> segments will be nonzero.
> 
> So you need something like this for that second WARN, correct?
> 
> 	WARN_ON_ONCE(!rcu_segcblist_empty(&rdp->cblist) &&
> 		     rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> 
> This is off the cuff, so should be taken with a grain of salt.  And
> there might well be other similar issues.

Ah, makes sense. Or maybe should be made like the other warning?
        WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) && count != 0
		&& rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);

Though your warning is better.

I will try these out and see if it goes away. I am afraid though that there
is an issue with !NOCB code since you had other configs that were failing
similarly.. :-\.

thanks, :-)

 - Joel


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] rcu/segcblist: Add debug checks for segment lengths
  2020-11-19 20:42               ` Joel Fernandes
@ 2020-12-01 22:26                 ` Joel Fernandes
  2020-12-02  4:21                   ` Paul E. McKenney
  0 siblings, 1 reply; 13+ messages in thread
From: Joel Fernandes @ 2020-12-01 22:26 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: LKML, Josh Triplett, Lai Jiangshan, Mathieu Desnoyers, rcu,
	Steven Rostedt

On Thu, Nov 19, 2020 at 3:42 PM Joel Fernandes <joel@joelfernandes.org> wrote:
>
> On Thu, Nov 19, 2020 at 12:16:15PM -0800, Paul E. McKenney wrote:
> > On Thu, Nov 19, 2020 at 02:44:35PM -0500, Joel Fernandes wrote:
> > > On Thu, Nov 19, 2020 at 2:22 PM Paul E. McKenney <paulmck@kernel.org> wrote:
> > > > > > > > On Wed, Nov 18, 2020 at 11:15:41AM -0500, Joel Fernandes (Google) wrote:
> > > > > > > > > After rcu_do_batch(), add a check for whether the seglen counts went to
> > > > > > > > > zero if the list was indeed empty.
> > > > > > > > >
> > > > > > > > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > > > > > >
> > > > > > > > Queued for testing and further review, thank you!
> > > > > > >
> > > > > > > FYI, the second of the two checks triggered in all four one-hour runs of
> > > > > > > TREE01, all four one-hour runs of TREE04, and one of the four one-hour
> > > > > > > runs of TREE07.  This one:
> > > > > > >
> > > > > > > WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> > > > > > >
> > > > > > > That is, there are callbacks in the list, but the sum of the segment
> > > > > > > counts is nevertheless zero.  The ->nocb_lock is held.
> > > > > > >
> > > > > > > Thoughts?
> > > > > >
> > > > > > FWIW, TREE01 reproduces it very quickly compared to the other two
> > > > > > scenarios, on all four run, within five minutes.
> > > > >
> > > > > So far for TREE01, I traced it down to an rcu_barrier happening so it could
> > > > > be related to some interaction with rcu_barrier() (Just a guess).
> > > >
> > > > Well, rcu_barrier() and srcu_barrier() are the only users of
> > > > rcu_segcblist_entrain(), if that helps.  Your modification to that
> > > > function looks plausible to me, but the system's opinion always overrules
> > > > mine.  ;-)
> > >
> > > Right. Does anything the bypass code standout? That happens during
> > > rcu_barrier() as well, and it messes with the lengths.
> >
> > In theory, rcu_barrier_func() flushes the bypass before doing the
> > entrain, and does the rcu_segcblist_entrain() afterwards.
> >
> > Ah, and that is the issue.  If ->cblist is empty and ->nocb_bypass
> > is not, then ->cblist length will be nonzero, and none of the
> > segments will be nonzero.
> >
> > So you need something like this for that second WARN, correct?
> >
> >       WARN_ON_ONCE(!rcu_segcblist_empty(&rdp->cblist) &&
> >                    rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);

Just started to look into it again. If the &rdp->cblist is empty, that
means the bypass list could not have been used (Since per comments on
rcu_nocb_try_bypass() , the bypass list is in use only when the cblist
is non-empty). So the cblist was non empty, then the segment counts
should not sum to 0.  So I don't think that explains it. Anyway, I
will try the new version of your warning in case there is something
about bypass lists that I'm missing.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] rcu/segcblist: Add debug checks for segment lengths
  2020-12-01 22:26                 ` Joel Fernandes
@ 2020-12-02  4:21                   ` Paul E. McKenney
  2020-12-02 14:58                     ` Joel Fernandes
  0 siblings, 1 reply; 13+ messages in thread
From: Paul E. McKenney @ 2020-12-02  4:21 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: LKML, Josh Triplett, Lai Jiangshan, Mathieu Desnoyers, rcu,
	Steven Rostedt

On Tue, Dec 01, 2020 at 05:26:32PM -0500, Joel Fernandes wrote:
> On Thu, Nov 19, 2020 at 3:42 PM Joel Fernandes <joel@joelfernandes.org> wrote:
> >
> > On Thu, Nov 19, 2020 at 12:16:15PM -0800, Paul E. McKenney wrote:
> > > On Thu, Nov 19, 2020 at 02:44:35PM -0500, Joel Fernandes wrote:
> > > > On Thu, Nov 19, 2020 at 2:22 PM Paul E. McKenney <paulmck@kernel.org> wrote:
> > > > > > > > > On Wed, Nov 18, 2020 at 11:15:41AM -0500, Joel Fernandes (Google) wrote:
> > > > > > > > > > After rcu_do_batch(), add a check for whether the seglen counts went to
> > > > > > > > > > zero if the list was indeed empty.
> > > > > > > > > >
> > > > > > > > > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > > > > > > >
> > > > > > > > > Queued for testing and further review, thank you!
> > > > > > > >
> > > > > > > > FYI, the second of the two checks triggered in all four one-hour runs of
> > > > > > > > TREE01, all four one-hour runs of TREE04, and one of the four one-hour
> > > > > > > > runs of TREE07.  This one:
> > > > > > > >
> > > > > > > > WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> > > > > > > >
> > > > > > > > That is, there are callbacks in the list, but the sum of the segment
> > > > > > > > counts is nevertheless zero.  The ->nocb_lock is held.
> > > > > > > >
> > > > > > > > Thoughts?
> > > > > > >
> > > > > > > FWIW, TREE01 reproduces it very quickly compared to the other two
> > > > > > > scenarios, on all four run, within five minutes.
> > > > > >
> > > > > > So far for TREE01, I traced it down to an rcu_barrier happening so it could
> > > > > > be related to some interaction with rcu_barrier() (Just a guess).
> > > > >
> > > > > Well, rcu_barrier() and srcu_barrier() are the only users of
> > > > > rcu_segcblist_entrain(), if that helps.  Your modification to that
> > > > > function looks plausible to me, but the system's opinion always overrules
> > > > > mine.  ;-)
> > > >
> > > > Right. Does anything the bypass code standout? That happens during
> > > > rcu_barrier() as well, and it messes with the lengths.
> > >
> > > In theory, rcu_barrier_func() flushes the bypass before doing the
> > > entrain, and does the rcu_segcblist_entrain() afterwards.
> > >
> > > Ah, and that is the issue.  If ->cblist is empty and ->nocb_bypass
> > > is not, then ->cblist length will be nonzero, and none of the
> > > segments will be nonzero.
> > >
> > > So you need something like this for that second WARN, correct?
> > >
> > >       WARN_ON_ONCE(!rcu_segcblist_empty(&rdp->cblist) &&
> > >                    rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> 
> Just started to look into it again. If the &rdp->cblist is empty, that
> means the bypass list could not have been used (Since per comments on
> rcu_nocb_try_bypass() , the bypass list is in use only when the cblist
> is non-empty). So the cblist was non empty, then the segment counts
> should not sum to 0.  So I don't think that explains it. Anyway, I
> will try the new version of your warning in case there is something
> about bypass lists that I'm missing.

Good point.  I really did see failures, though.  Do they show up for
you?

							Thanx, Paul

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] rcu/segcblist: Add debug checks for segment lengths
  2020-12-02  4:21                   ` Paul E. McKenney
@ 2020-12-02 14:58                     ` Joel Fernandes
  2020-12-02 15:25                       ` Paul E. McKenney
  0 siblings, 1 reply; 13+ messages in thread
From: Joel Fernandes @ 2020-12-02 14:58 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: LKML, Josh Triplett, Lai Jiangshan, Mathieu Desnoyers, rcu,
	Steven Rostedt

On Tue, Dec 01, 2020 at 08:21:43PM -0800, Paul E. McKenney wrote:
> On Tue, Dec 01, 2020 at 05:26:32PM -0500, Joel Fernandes wrote:
> > On Thu, Nov 19, 2020 at 3:42 PM Joel Fernandes <joel@joelfernandes.org> wrote:
> > >
> > > On Thu, Nov 19, 2020 at 12:16:15PM -0800, Paul E. McKenney wrote:
> > > > On Thu, Nov 19, 2020 at 02:44:35PM -0500, Joel Fernandes wrote:
> > > > > On Thu, Nov 19, 2020 at 2:22 PM Paul E. McKenney <paulmck@kernel.org> wrote:
> > > > > > > > > > On Wed, Nov 18, 2020 at 11:15:41AM -0500, Joel Fernandes (Google) wrote:
> > > > > > > > > > > After rcu_do_batch(), add a check for whether the seglen counts went to
> > > > > > > > > > > zero if the list was indeed empty.
> > > > > > > > > > >
> > > > > > > > > > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > > > > > > > >
> > > > > > > > > > Queued for testing and further review, thank you!
> > > > > > > > >
> > > > > > > > > FYI, the second of the two checks triggered in all four one-hour runs of
> > > > > > > > > TREE01, all four one-hour runs of TREE04, and one of the four one-hour
> > > > > > > > > runs of TREE07.  This one:
> > > > > > > > >
> > > > > > > > > WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> > > > > > > > >
> > > > > > > > > That is, there are callbacks in the list, but the sum of the segment
> > > > > > > > > counts is nevertheless zero.  The ->nocb_lock is held.
> > > > > > > > >
> > > > > > > > > Thoughts?
> > > > > > > >
> > > > > > > > FWIW, TREE01 reproduces it very quickly compared to the other two
> > > > > > > > scenarios, on all four run, within five minutes.
> > > > > > >
> > > > > > > So far for TREE01, I traced it down to an rcu_barrier happening so it could
> > > > > > > be related to some interaction with rcu_barrier() (Just a guess).
> > > > > >
> > > > > > Well, rcu_barrier() and srcu_barrier() are the only users of
> > > > > > rcu_segcblist_entrain(), if that helps.  Your modification to that
> > > > > > function looks plausible to me, but the system's opinion always overrules
> > > > > > mine.  ;-)
> > > > >
> > > > > Right. Does anything the bypass code standout? That happens during
> > > > > rcu_barrier() as well, and it messes with the lengths.
> > > >
> > > > In theory, rcu_barrier_func() flushes the bypass before doing the
> > > > entrain, and does the rcu_segcblist_entrain() afterwards.
> > > >
> > > > Ah, and that is the issue.  If ->cblist is empty and ->nocb_bypass
> > > > is not, then ->cblist length will be nonzero, and none of the
> > > > segments will be nonzero.
> > > >
> > > > So you need something like this for that second WARN, correct?
> > > >
> > > >       WARN_ON_ONCE(!rcu_segcblist_empty(&rdp->cblist) &&
> > > >                    rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> > 
> > Just started to look into it again. If the &rdp->cblist is empty, that
> > means the bypass list could not have been used (Since per comments on
> > rcu_nocb_try_bypass() , the bypass list is in use only when the cblist
> > is non-empty). So the cblist was non empty, then the segment counts
> > should not sum to 0.  So I don't think that explains it. Anyway, I
> > will try the new version of your warning in case there is something
> > about bypass lists that I'm missing.
> 
> Good point.  I really did see failures, though.  Do they show up for
> you?

Yeah I do see failures. Once I change the warning as below, the failures go
away though. So looks like indeed a segcblist can be empty when the bypass
list has something in it?  If you agree, could you change the warning to as
below? The tests failing before all pass 1 hour rcutorture testing now
(TREE01, TREE04 and TREE07).

---8<-----------------------

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 91e35b521e51..3cd92b7df8ac 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2554,7 +2554,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
 		     count != 0 && rcu_segcblist_empty(&rdp->cblist));
 	WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
-	WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
+	WARN_ON_ONCE(!rcu_segcblist_empty(&rdp->cblist) &&
+		     rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
 
 	rcu_nocb_unlock_irqrestore(rdp, flags);
 
-- 
2.29.2.454.gaff20da3a2-goog


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] rcu/segcblist: Add debug checks for segment lengths
  2020-12-02 14:58                     ` Joel Fernandes
@ 2020-12-02 15:25                       ` Paul E. McKenney
  0 siblings, 0 replies; 13+ messages in thread
From: Paul E. McKenney @ 2020-12-02 15:25 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: LKML, Josh Triplett, Lai Jiangshan, Mathieu Desnoyers, rcu,
	Steven Rostedt

On Wed, Dec 02, 2020 at 09:58:38AM -0500, Joel Fernandes wrote:
> On Tue, Dec 01, 2020 at 08:21:43PM -0800, Paul E. McKenney wrote:
> > On Tue, Dec 01, 2020 at 05:26:32PM -0500, Joel Fernandes wrote:
> > > On Thu, Nov 19, 2020 at 3:42 PM Joel Fernandes <joel@joelfernandes.org> wrote:
> > > >
> > > > On Thu, Nov 19, 2020 at 12:16:15PM -0800, Paul E. McKenney wrote:
> > > > > On Thu, Nov 19, 2020 at 02:44:35PM -0500, Joel Fernandes wrote:
> > > > > > On Thu, Nov 19, 2020 at 2:22 PM Paul E. McKenney <paulmck@kernel.org> wrote:
> > > > > > > > > > > On Wed, Nov 18, 2020 at 11:15:41AM -0500, Joel Fernandes (Google) wrote:
> > > > > > > > > > > > After rcu_do_batch(), add a check for whether the seglen counts went to
> > > > > > > > > > > > zero if the list was indeed empty.
> > > > > > > > > > > >
> > > > > > > > > > > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > > > > > > > > >
> > > > > > > > > > > Queued for testing and further review, thank you!
> > > > > > > > > >
> > > > > > > > > > FYI, the second of the two checks triggered in all four one-hour runs of
> > > > > > > > > > TREE01, all four one-hour runs of TREE04, and one of the four one-hour
> > > > > > > > > > runs of TREE07.  This one:
> > > > > > > > > >
> > > > > > > > > > WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> > > > > > > > > >
> > > > > > > > > > That is, there are callbacks in the list, but the sum of the segment
> > > > > > > > > > counts is nevertheless zero.  The ->nocb_lock is held.
> > > > > > > > > >
> > > > > > > > > > Thoughts?
> > > > > > > > >
> > > > > > > > > FWIW, TREE01 reproduces it very quickly compared to the other two
> > > > > > > > > scenarios, on all four run, within five minutes.
> > > > > > > >
> > > > > > > > So far for TREE01, I traced it down to an rcu_barrier happening so it could
> > > > > > > > be related to some interaction with rcu_barrier() (Just a guess).
> > > > > > >
> > > > > > > Well, rcu_barrier() and srcu_barrier() are the only users of
> > > > > > > rcu_segcblist_entrain(), if that helps.  Your modification to that
> > > > > > > function looks plausible to me, but the system's opinion always overrules
> > > > > > > mine.  ;-)
> > > > > >
> > > > > > Right. Does anything the bypass code standout? That happens during
> > > > > > rcu_barrier() as well, and it messes with the lengths.
> > > > >
> > > > > In theory, rcu_barrier_func() flushes the bypass before doing the
> > > > > entrain, and does the rcu_segcblist_entrain() afterwards.
> > > > >
> > > > > Ah, and that is the issue.  If ->cblist is empty and ->nocb_bypass
> > > > > is not, then ->cblist length will be nonzero, and none of the
> > > > > segments will be nonzero.
> > > > >
> > > > > So you need something like this for that second WARN, correct?
> > > > >
> > > > >       WARN_ON_ONCE(!rcu_segcblist_empty(&rdp->cblist) &&
> > > > >                    rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> > > 
> > > Just started to look into it again. If the &rdp->cblist is empty, that
> > > means the bypass list could not have been used (Since per comments on
> > > rcu_nocb_try_bypass() , the bypass list is in use only when the cblist
> > > is non-empty). So the cblist was non empty, then the segment counts
> > > should not sum to 0.  So I don't think that explains it. Anyway, I
> > > will try the new version of your warning in case there is something
> > > about bypass lists that I'm missing.
> > 
> > Good point.  I really did see failures, though.  Do they show up for
> > you?
> 
> Yeah I do see failures. Once I change the warning as below, the failures go
> away though. So looks like indeed a segcblist can be empty when the bypass
> list has something in it?  If you agree, could you change the warning to as
> below? The tests failing before all pass 1 hour rcutorture testing now
> (TREE01, TREE04 and TREE07).

I agree with the fix, which should not be too surprising.  ;-)

But can you tell me how you can have a non-zero count while all the
segment counts sum to zero?  Yes, you told me why nothing should be
placed in the bypass list while cblist is empty.  Is that really the
only way this state can be entered?

							Thanx, Paul

> ---8<-----------------------
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 91e35b521e51..3cd92b7df8ac 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2554,7 +2554,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
>  	WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
>  		     count != 0 && rcu_segcblist_empty(&rdp->cblist));
>  	WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
> -	WARN_ON_ONCE(count != 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
> +	WARN_ON_ONCE(!rcu_segcblist_empty(&rdp->cblist) &&
> +		     rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
>  
>  	rcu_nocb_unlock_irqrestore(rdp, flags);
>  
> -- 
> 2.29.2.454.gaff20da3a2-goog
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2020-12-02 15:26 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-11-18 16:15 [PATCH v2] rcu/segcblist: Add debug checks for segment lengths Joel Fernandes (Google)
2020-11-18 20:13 ` Paul E. McKenney
2020-11-19  3:52   ` Paul E. McKenney
2020-11-19  3:56     ` Paul E. McKenney
2020-11-19 18:32       ` Joel Fernandes
2020-11-19 19:22         ` Paul E. McKenney
2020-11-19 19:44           ` Joel Fernandes
2020-11-19 20:16             ` Paul E. McKenney
2020-11-19 20:42               ` Joel Fernandes
2020-12-01 22:26                 ` Joel Fernandes
2020-12-02  4:21                   ` Paul E. McKenney
2020-12-02 14:58                     ` Joel Fernandes
2020-12-02 15:25                       ` Paul E. McKenney

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).