* [PATCH tip/core/rcu 0/10] Expedited grace-period updates for v5.6
From: Paul E. McKenney @ 2019-12-10  4:01 UTC (permalink / raw)
  To: rcu
  Cc: linux-kernel, kernel-team, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, peterz, rostedt, dhowells,
	edumazet, fweisbec, oleg, joel

Hello!

This series provides updates to RCU's expedited grace periods:

1.	Use *_ONCE() to protect lockless ->expmask accesses.

2.	Avoid modifying mask_ofl_ipi in sync_rcu_exp_select_node_cpus(),
	courtesy of Boqun Feng.

3.	Fix data-race due to atomic_t copy-by-value, courtesy of
	Marco Elver.

4.	Lookup instead of bit-twiddling in sync_rcu_exp_select_node_cpus().

5.	Fix missed wakeup of exp_wq waiters, courtesy of Neeraj Upadhyay.

6.	Allow only one expedited GP to run concurrently with wakeups,
	courtesy of Neeraj Upadhyay.

7.	Rename sync_rcu_preempt_exp_done() to sync_rcu_exp_done().

8.	Update tree_exp.h function-header comments.

9.	Replace synchronize_sched_expedited_wait() "_sched" with "_rcu".

10.	Enable tick for nohz_full CPUs slow to provide expedited QS.

							Thanx, Paul

------------------------------------------------------------------------

 include/linux/tick.h       |    5 +
 include/trace/events/rcu.h |    4 -
 kernel/rcu/tree.c          |   11 +--
 kernel/rcu/tree.h          |    1 
 kernel/rcu/tree_exp.h      |  149 +++++++++++++++++++++++++++------------------
 kernel/rcu/tree_plugin.h   |    4 -
 6 files changed, 107 insertions(+), 67 deletions(-)


* [PATCH tip/core/rcu 01/10] rcu: Use *_ONCE() to protect lockless ->expmask accesses
From: paulmck @ 2019-12-10  4:01 UTC (permalink / raw)
  To: rcu
  Cc: linux-kernel, kernel-team, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, peterz, rostedt, dhowells,
	edumazet, fweisbec, oleg, joel, Paul E. McKenney

From: "Paul E. McKenney" <paulmck@kernel.org>

The rcu_node structure's ->expmask field is accessed locklessly when
starting a new expedited grace period and when reporting an expedited
RCU CPU stall warning.  This commit therefore handles the former by
taking a snapshot of ->expmask while the lock is held and the latter
by applying READ_ONCE() to lockless reads and WRITE_ONCE() to the
corresponding updates.
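
As a condensed sketch of the resulting pattern (distilled from the diff
below, not an additional change), updates to ->expmask remain serialized
by the rcu_node structure's ->lock but are now marked so that the
lockless readers in the stall-warning code see untorn values:

	/* Updater: still holds rnp->lock, but marks the store. */
	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);

	/* Stall-warning reader: no lock held, so the load is marked too. */
	if (!(READ_ONCE(rnp->expmask) & leaf_node_cpu_bit(rnp, cpu)))
		continue;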

Link: https://lore.kernel.org/lkml/CANpmjNNmSOagbTpffHr4=Yedckx9Rm2NuGqC9UqE+AOz5f1-ZQ@mail.gmail.com
Reported-by: syzbot+134336b86f728d6e55a0@syzkaller.appspotmail.com
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Acked-by: Marco Elver <elver@google.com>
---
 kernel/rcu/tree_exp.h | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index d632cd0..69c5aa6 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -134,7 +134,7 @@ static void __maybe_unused sync_exp_reset_tree(void)
 	rcu_for_each_node_breadth_first(rnp) {
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		WARN_ON_ONCE(rnp->expmask);
-		rnp->expmask = rnp->expmaskinit;
+		WRITE_ONCE(rnp->expmask, rnp->expmaskinit);
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	}
 }
@@ -211,7 +211,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
 		rnp = rnp->parent;
 		raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
 		WARN_ON_ONCE(!(rnp->expmask & mask));
-		rnp->expmask &= ~mask;
+		WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
 	}
 }
 
@@ -241,7 +241,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		return;
 	}
-	rnp->expmask &= ~mask;
+	WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
 	__rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */
 }
 
@@ -372,12 +372,10 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 
 	/* IPI the remaining CPUs for expedited quiescent state. */
-	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+	for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) {
 		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
 		struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 
-		if (!(mask_ofl_ipi & mask))
-			continue;
 retry_ipi:
 		if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) {
 			mask_ofl_test |= mask;
@@ -491,7 +489,7 @@ static void synchronize_sched_expedited_wait(void)
 				struct rcu_data *rdp;
 
 				mask = leaf_node_cpu_bit(rnp, cpu);
-				if (!(rnp->expmask & mask))
+				if (!(READ_ONCE(rnp->expmask) & mask))
 					continue;
 				ndetected++;
 				rdp = per_cpu_ptr(&rcu_data, cpu);
@@ -503,7 +501,8 @@ static void synchronize_sched_expedited_wait(void)
 		}
 		pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
 			jiffies - jiffies_start, rcu_state.expedited_sequence,
-			rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
+			READ_ONCE(rnp_root->expmask),
+			".T"[!!rnp_root->exp_tasks]);
 		if (ndetected) {
 			pr_err("blocking rcu_node structures:");
 			rcu_for_each_node_breadth_first(rnp) {
@@ -513,7 +512,7 @@ static void synchronize_sched_expedited_wait(void)
 					continue;
 				pr_cont(" l=%u:%d-%d:%#lx/%c",
 					rnp->level, rnp->grplo, rnp->grphi,
-					rnp->expmask,
+					READ_ONCE(rnp->expmask),
 					".T"[!!rnp->exp_tasks]);
 			}
 			pr_cont("\n");
@@ -521,7 +520,7 @@ static void synchronize_sched_expedited_wait(void)
 		rcu_for_each_leaf_node(rnp) {
 			for_each_leaf_node_possible_cpu(rnp, cpu) {
 				mask = leaf_node_cpu_bit(rnp, cpu);
-				if (!(rnp->expmask & mask))
+				if (!(READ_ONCE(rnp->expmask) & mask))
 					continue;
 				dump_cpu_task(cpu);
 			}
-- 
2.9.5



* [PATCH tip/core/rcu 02/10] rcu: Avoid modifying mask_ofl_ipi in sync_rcu_exp_select_node_cpus()
From: paulmck @ 2019-12-10  4:01 UTC (permalink / raw)
  To: rcu
  Cc: linux-kernel, kernel-team, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, peterz, rostedt, dhowells,
	edumazet, fweisbec, oleg, joel, Boqun Feng, Paul E . McKenney

From: Boqun Feng <boqun.feng@gmail.com>

The "mask_ofl_ipi" variable is used to track which CPUs get IPIed.
However, in the IPI-sending loop, "mask_ofl_ipi" (along with another
variable, "mask_ofl_test") might also be modified to record which CPUs'
quiescent states must be reported by sync_rcu_exp_select_node_cpus()
at the end of that same function.  This overlap of roles can be
confusing, so this patch cleans things up a little by using
"mask_ofl_ipi" solely to determine which CPUs must be IPIed and
"mask_ofl_test" solely to determine on behalf of which CPUs
sync_rcu_exp_select_node_cpus() must report a quiescent state.
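
Roughly, the post-patch division of labor looks like this (a sketch with
the locking and IPI-retry details elided; see the diff below for the
real thing):

	for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) {
		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);

		if (!smp_call_function_single(cpu, rcu_exp_handler, NULL, 0))
			continue;  /* IPI sent; that CPU reports its own QS. */
		/* IPI failed due to a hotplug race; if the CPU really is
		 * offline, report a quiescent state on its behalf below. */
		if (rnp->expmask & mask)
			mask_ofl_test |= mask;
	}
	if (mask_ofl_test)
		rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false);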

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Acked-by: Marco Elver <elver@google.com>
---
 kernel/rcu/tree_exp.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 69c5aa6..6a6f328 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -387,10 +387,10 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
 		}
 		ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
 		put_cpu();
-		if (!ret) {
-			mask_ofl_ipi &= ~mask;
+		/* The CPU will report the QS in response to the IPI. */
+		if (!ret)
 			continue;
-		}
+
 		/* Failed, raced with CPU hotplug operation. */
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		if ((rnp->qsmaskinitnext & mask) &&
@@ -401,13 +401,12 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
 			schedule_timeout_uninterruptible(1);
 			goto retry_ipi;
 		}
-		/* CPU really is offline, so we can ignore it. */
-		if (!(rnp->expmask & mask))
-			mask_ofl_ipi &= ~mask;
+		/* CPU really is offline, so we must report its QS. */
+		if (rnp->expmask & mask)
+			mask_ofl_test |= mask;
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	}
 	/* Report quiescent states for those that went offline. */
-	mask_ofl_test |= mask_ofl_ipi;
 	if (mask_ofl_test)
 		rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false);
 }
-- 
2.9.5



* [PATCH tip/core/rcu 03/10] rcu: Fix data-race due to atomic_t copy-by-value
From: paulmck @ 2019-12-10  4:01 UTC (permalink / raw)
  To: rcu
  Cc: linux-kernel, kernel-team, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, peterz, rostedt, dhowells,
	edumazet, fweisbec, oleg, joel, Marco Elver, Paul E . McKenney,
	Ingo Molnar, Dmitry Vyukov

From: Marco Elver <elver@google.com>

This fixes a data-race where `atomic_t dynticks` is copied by value. The
copy is performed non-atomically, resulting in a data-race if `dynticks`
is updated concurrently.
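
In essence (a before/after sketch distilled from the diff below), the
tracepoint now takes a plain int, so each caller does a marked
atomic_read() instead of copying the atomic_t with an unmarked load:

	/* Before: copies rdp->dynticks by value, i.e., a plain load of
	 * its ->counter that can race with concurrent atomic updates. */
	trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks);

	/* After: the read is done with atomic_read() and an int is passed. */
	trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0,
			  atomic_read(&rdp->dynticks));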

This data-race was found with KCSAN:
==================================================================
BUG: KCSAN: data-race in dyntick_save_progress_counter / rcu_irq_enter

write to 0xffff989dbdbe98e0 of 4 bytes by task 10 on cpu 3:
 atomic_add_return include/asm-generic/atomic-instrumented.h:78 [inline]
 rcu_dynticks_snap kernel/rcu/tree.c:310 [inline]
 dyntick_save_progress_counter+0x43/0x1b0 kernel/rcu/tree.c:984
 force_qs_rnp+0x183/0x200 kernel/rcu/tree.c:2286
 rcu_gp_fqs kernel/rcu/tree.c:1601 [inline]
 rcu_gp_fqs_loop+0x71/0x880 kernel/rcu/tree.c:1653
 rcu_gp_kthread+0x22c/0x3b0 kernel/rcu/tree.c:1799
 kthread+0x1b5/0x200 kernel/kthread.c:255
 <snip>

read to 0xffff989dbdbe98e0 of 4 bytes by task 154 on cpu 7:
 rcu_nmi_enter_common kernel/rcu/tree.c:828 [inline]
 rcu_irq_enter+0xda/0x240 kernel/rcu/tree.c:870
 irq_enter+0x5/0x50 kernel/softirq.c:347
 <snip>

Reported by Kernel Concurrency Sanitizer on:
CPU: 7 PID: 154 Comm: kworker/7:1H Not tainted 5.3.0+ #5
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
Workqueue: kblockd blk_mq_run_work_fn
==================================================================

Signed-off-by: Marco Elver <elver@google.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: rcu@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/trace/events/rcu.h |  4 ++--
 kernel/rcu/tree.c          | 11 ++++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 6612260..697e2c0 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -449,7 +449,7 @@ TRACE_EVENT_RCU(rcu_fqs,
  */
 TRACE_EVENT_RCU(rcu_dyntick,
 
-	TP_PROTO(const char *polarity, long oldnesting, long newnesting, atomic_t dynticks),
+	TP_PROTO(const char *polarity, long oldnesting, long newnesting, int dynticks),
 
 	TP_ARGS(polarity, oldnesting, newnesting, dynticks),
 
@@ -464,7 +464,7 @@ TRACE_EVENT_RCU(rcu_dyntick,
 		__entry->polarity = polarity;
 		__entry->oldnesting = oldnesting;
 		__entry->newnesting = newnesting;
-		__entry->dynticks = atomic_read(&dynticks);
+		__entry->dynticks = dynticks;
 	),
 
 	TP_printk("%s %lx %lx %#3x", __entry->polarity,
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1694a6b..6145e08 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -577,7 +577,7 @@ static void rcu_eqs_enter(bool user)
 	}
 
 	lockdep_assert_irqs_disabled();
-	trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks);
+	trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
 	rdp = this_cpu_ptr(&rcu_data);
 	do_nocb_deferred_wakeup(rdp);
@@ -650,14 +650,15 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
 	 * leave it in non-RCU-idle state.
 	 */
 	if (rdp->dynticks_nmi_nesting != 1) {
-		trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks);
+		trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
+				  atomic_read(&rdp->dynticks));
 		WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
 			   rdp->dynticks_nmi_nesting - 2);
 		return;
 	}
 
 	/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
-	trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, rdp->dynticks);
+	trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
 	WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
 
 	if (irq)
@@ -744,7 +745,7 @@ static void rcu_eqs_exit(bool user)
 	rcu_dynticks_task_exit();
 	rcu_dynticks_eqs_exit();
 	rcu_cleanup_after_idle();
-	trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks);
+	trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
 	WRITE_ONCE(rdp->dynticks_nesting, 1);
 	WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
@@ -833,7 +834,7 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
 	}
 	trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
 			  rdp->dynticks_nmi_nesting,
-			  rdp->dynticks_nmi_nesting + incby, rdp->dynticks);
+			  rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
 	WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
 		   rdp->dynticks_nmi_nesting + incby);
 	barrier();
-- 
2.9.5



* [PATCH tip/core/rcu 04/10] rcu: Substitute lookup for bit-twiddling in sync_rcu_exp_select_node_cpus()
From: paulmck @ 2019-12-10  4:01 UTC (permalink / raw)
  To: rcu
  Cc: linux-kernel, kernel-team, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, peterz, rostedt, dhowells,
	edumazet, fweisbec, oleg, joel, Paul E. McKenney

From: "Paul E. McKenney" <paulmck@kernel.org>

The code in sync_rcu_exp_select_node_cpus() calculates each CPU's
mask within its leaf rcu_node structure's bitmasks, but this has
already been computed in the ->grpmask field of that CPU's rcu_data
structure.  This commit therefore just uses this ->grpmask field.
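
In other words (a sanity-check sketch, not part of the patch), for any
CPU belonging to a given leaf rcu_node structure the two computations
yield the same bit, so the per-iteration shift can simply be dropped:

	/* rdp->grpmask is initialized to this CPU's bit within its leaf
	 * rcu_node structure, so it already equals the computed bit. */
	WARN_ON_ONCE(per_cpu_ptr(&rcu_data, cpu)->grpmask !=
		     leaf_node_cpu_bit(rnp, cpu));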

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 6a6f328..3b59c3e 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -345,8 +345,8 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
 	/* Each pass checks a CPU for identity, offline, and idle. */
 	mask_ofl_test = 0;
 	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
-		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
 		struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+		unsigned long mask = rdp->grpmask;
 		int snap;
 
 		if (raw_smp_processor_id() == cpu ||
@@ -373,8 +373,8 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
 
 	/* IPI the remaining CPUs for expedited quiescent state. */
 	for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) {
-		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
 		struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+		unsigned long mask = rdp->grpmask;
 
 retry_ipi:
 		if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) {
-- 
2.9.5



* [PATCH tip/core/rcu 05/10] rcu: Fix missed wakeup of exp_wq waiters
From: paulmck @ 2019-12-10  4:01 UTC (permalink / raw)
  To: rcu
  Cc: linux-kernel, kernel-team, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, peterz, rostedt, dhowells,
	edumazet, fweisbec, oleg, joel, Neeraj Upadhyay,
	Paul E . McKenney

From: Neeraj Upadhyay <neeraju@codeaurora.org>

Tasks waiting within exp_funnel_lock() for an expedited grace period to
elapse can be starved due to the following sequence of events:

1.	Tasks A and B both attempt to start an expedited grace
	period at about the same time.	This grace period will have
	completed when the lower four bits of the rcu_state structure's
	->expedited_sequence field are 0b'0100', for example, when the
	initial value of this counter is zero.	Task A wins, and thus
	does the actual work of starting the grace period, including
	acquiring the rcu_state structure's ->exp_mutex and setting the
	counter to 0b'0001'.

2.	Because task B lost the race to start the grace period, it
	waits on ->expedited_sequence to reach 0b'0100' inside of
	exp_funnel_lock(). This task therefore blocks on the rcu_node
	structure's ->exp_wq[1] field, keeping in mind that the
	end-of-grace-period value of ->expedited_sequence (0b'0100')
	is shifted down two bits before indexing the ->exp_wq[] field.

3.	Task C attempts to start another expedited grace period,
	but blocks on ->exp_mutex, which is still held by Task A.

4.	The aforementioned expedited grace period completes, so that
	->expedited_sequence now has the value 0b'0100'.  A kworker task
	therefore acquires the rcu_state structure's ->exp_wake_mutex
	and starts awakening any tasks waiting for this grace period.

5.	One of the first tasks awakened happens to be Task A.  Task A
	therefore releases the rcu_state structure's ->exp_mutex,
	which allows Task C to start the next expedited grace period,
	which causes the lower four bits of the rcu_state structure's
	->expedited_sequence field to become 0b'0101'.

6.	Task C's expedited grace period completes, so that the lower four
	bits of the rcu_state structure's ->expedited_sequence field now
	become 0b'1000'.

7.	The kworker task from step 4 above continues its wakeups.
	Unfortunately, the wake_up_all() refetches the rcu_state
	structure's ->expedited_sequence field:

	wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]);

	This results in the wakeup being applied to the rcu_node
	structure's ->exp_wq[2] field, which is unfortunate given that
	Task B is instead waiting on ->exp_wq[1].

On a busy system, no harm is done (or at least no permanent harm is done).
Some later expedited grace period will redo the wakeup.  But on a quiet
system, such as many embedded systems, it might be a good long time before
there is another expedited grace period.  On such embedded systems,
this situation could therefore result in a system hang.

This issue manifested as a DPM device timeout during suspend (which
usually qualifies as a quiet time) due to a SCSI device being stuck in
_synchronize_rcu_expedited(), with the following stack trace:

	schedule()
	synchronize_rcu_expedited()
	synchronize_rcu()
	scsi_device_quiesce()
	scsi_bus_suspend()
	dpm_run_callback()
	__device_suspend()

This commit therefore prevents such delays, timeouts, and hangs by
making rcu_exp_wait_wake() use its "s" argument consistently instead of
refetching from rcu_state.expedited_sequence.
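
For example, a worked sketch of the indexing, assuming the usual
sequence-counter layout in which the low two bits hold state and the
remaining bits count grace periods:

	/* Task B's grace period ends at s == 0b0100, so it sleeps on
	 * exp_wq[rcu_seq_ctr(0x4) & 0x3] == exp_wq[(0x4 >> 2) & 0x3] == exp_wq[1].
	 * By wakeup time the refetched counter is 0b1000, which selects
	 * exp_wq[rcu_seq_ctr(0x8) & 0x3] == exp_wq[2], so exp_wq[1] is missed.
	 * Using the "s" captured at request time selects the right queue:
	 */
	wake_up_all(&rnp->exp_wq[rcu_seq_ctr(s) & 0x3]);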

Fixes: 3b5f668e715b ("rcu: Overlap wakeups with next expedited grace period")
Signed-off-by: Neeraj Upadhyay <neeraju@codeaurora.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 3b59c3e..fa143e4 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -557,7 +557,7 @@ static void rcu_exp_wait_wake(unsigned long s)
 			spin_unlock(&rnp->exp_lock);
 		}
 		smp_mb(); /* All above changes before wakeup. */
-		wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]);
+		wake_up_all(&rnp->exp_wq[rcu_seq_ctr(s) & 0x3]);
 	}
 	trace_rcu_exp_grace_period(rcu_state.name, s, TPS("endwake"));
 	mutex_unlock(&rcu_state.exp_wake_mutex);
-- 
2.9.5



* [PATCH tip/core/rcu 06/10] rcu: Allow only one expedited GP to run concurrently with wakeups
From: paulmck @ 2019-12-10  4:01 UTC (permalink / raw)
  To: rcu
  Cc: linux-kernel, kernel-team, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, peterz, rostedt, dhowells,
	edumazet, fweisbec, oleg, joel, Neeraj Upadhyay,
	Paul E . McKenney

From: Neeraj Upadhyay <neeraju@codeaurora.org>

The current expedited RCU grace-period code expects that a task
requesting an expedited grace period cannot awaken until that grace
period has reached the wakeup phase.  However, it is possible for a long
preemption to result in the waiting task never sleeping.  For example,
consider the following sequence of events:

1.	Task A starts an expedited grace period by invoking
	synchronize_rcu_expedited().  It proceeds normally up to the
	wait_event() near the end of that function, and is then preempted
	(or interrupted or whatever).

2.	The expedited grace period completes, and a kworker task starts
	the awaken phase, having incremented the counter and acquired
	the rcu_state structure's .exp_wake_mutex.  This kworker task
	is then preempted or interrupted or whatever.

3.	Task A resumes and enters wait_event(), which notes that the
	expedited grace period has completed, and thus doesn't sleep.

4.	Task B starts an expedited grace period exactly as did Task A,
	complete with the preemption (or whatever delay) just before
	the call to wait_event().

5.	The expedited grace period completes, and another kworker
	task starts the awaken phase, having incremented the counter.
	However, it blocks when attempting to acquire the rcu_state
	structure's .exp_wake_mutex because step 2's kworker task has
	not yet released it.

6.	Steps 4 and 5 repeat, resulting in overflow of the rcu_node
	structure's ->exp_wq[] array.

In theory, this is harmless.  Tasks waiting on the various ->exp_wq[]
array entries will merely be awakened spuriously, and they will sleep again
on noting that the rcu_state structure's ->expedited_sequence value has
not advanced far enough.

In practice, this wastes CPU time and is an accident waiting to happen.
This commit therefore moves the rcu_exp_gp_seq_end() call that officially
ends the expedited grace period (along with associate tracing) until
after the ->exp_wake_mutex has been acquired.  This prevents Task A from
awakening prematurely, thus preventing more than one expedited grace
period from being in flight during a previous expedited grace period's
wakeup phase.
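
The before/after ordering, reduced to its essentials (a sketch, with
tracing and the wait/wake details omitted):

	/* Old: the GP officially ends before ->exp_wake_mutex is held, so
	 * additional expedited GPs can start and finish while this wakeup
	 * phase is still waiting for the mutex. */
	rcu_exp_gp_seq_end();
	mutex_lock(&rcu_state.exp_wake_mutex);

	/* New: waiters cannot see the GP as ended until the wakeup phase
	 * owns the mutex, so at most one expedited GP overlaps wakeups. */
	mutex_lock(&rcu_state.exp_wake_mutex);
	rcu_exp_gp_seq_end();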

Fixes: 3b5f668e715b ("rcu: Overlap wakeups with next expedited grace period")
Signed-off-by: Neeraj Upadhyay <neeraju@codeaurora.org>
[ paulmck: Added updated comment. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index fa143e4..7a1f093 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -539,14 +539,13 @@ static void rcu_exp_wait_wake(unsigned long s)
 	struct rcu_node *rnp;
 
 	synchronize_sched_expedited_wait();
-	rcu_exp_gp_seq_end();
-	trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end"));
 
-	/*
-	 * Switch over to wakeup mode, allowing the next GP, but -only- the
-	 * next GP, to proceed.
-	 */
+	// Switch over to wakeup mode, allowing the next GP to proceed.
+	// End the previous grace period only after acquiring the mutex
+	// to ensure that only one GP runs concurrently with wakeups.
 	mutex_lock(&rcu_state.exp_wake_mutex);
+	rcu_exp_gp_seq_end();
+	trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end"));
 
 	rcu_for_each_node_breadth_first(rnp) {
 		if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
-- 
2.9.5



* [PATCH tip/core/rcu 07/10] rcu: Rename sync_rcu_preempt_exp_done() to sync_rcu_exp_done()
From: paulmck @ 2019-12-10  4:01 UTC (permalink / raw)
  To: rcu
  Cc: linux-kernel, kernel-team, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, peterz, rostedt, dhowells,
	edumazet, fweisbec, oleg, joel, Paul E. McKenney

From: "Paul E. McKenney" <paulmck@kernel.org>

Now that the RCU flavors have been consolidated, there is one common
function for checking to see if an expedited RCU grace period has
completed, namely sync_rcu_preempt_exp_done().  Because this function is
no longer specific to RCU-preempt, this commit removes the "_preempt" from
its name.  This commit also changes sync_rcu_preempt_exp_done_unlocked()
to sync_rcu_exp_done_unlocked() for the same reason.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h    | 19 +++++++++----------
 kernel/rcu/tree_plugin.h |  4 ++--
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 7a1f093..3923c07 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -148,7 +148,7 @@ static void __maybe_unused sync_exp_reset_tree(void)
  *
  * Caller must hold the specificed rcu_node structure's ->lock
  */
-static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
+static bool sync_rcu_exp_done(struct rcu_node *rnp)
 {
 	raw_lockdep_assert_held_rcu_node(rnp);
 
@@ -157,17 +157,16 @@ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
 }
 
 /*
- * Like sync_rcu_preempt_exp_done(), but this function assumes the caller
- * doesn't hold the rcu_node's ->lock, and will acquire and release the lock
- * itself
+ * Like sync_rcu_exp_done(), but this function assumes the caller doesn't
+ * hold the rcu_node's ->lock, and will acquire and release the lock itself
  */
-static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp)
+static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp)
 {
 	unsigned long flags;
 	bool ret;
 
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
-	ret = sync_rcu_preempt_exp_done(rnp);
+	ret = sync_rcu_exp_done(rnp);
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 
 	return ret;
@@ -191,7 +190,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
 	unsigned long mask;
 
 	for (;;) {
-		if (!sync_rcu_preempt_exp_done(rnp)) {
+		if (!sync_rcu_exp_done(rnp)) {
 			if (!rnp->expmask)
 				rcu_initiate_boost(rnp, flags);
 			else
@@ -471,9 +470,9 @@ static void synchronize_sched_expedited_wait(void)
 	for (;;) {
 		ret = swait_event_timeout_exclusive(
 				rcu_state.expedited_wq,
-				sync_rcu_preempt_exp_done_unlocked(rnp_root),
+				sync_rcu_exp_done_unlocked(rnp_root),
 				jiffies_stall);
-		if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root))
+		if (ret > 0 || sync_rcu_exp_done_unlocked(rnp_root))
 			return;
 		WARN_ON(ret < 0);  /* workqueues should not be signaled. */
 		if (rcu_cpu_stall_suppress)
@@ -507,7 +506,7 @@ static void synchronize_sched_expedited_wait(void)
 			rcu_for_each_node_breadth_first(rnp) {
 				if (rnp == rnp_root)
 					continue; /* printed unconditionally */
-				if (sync_rcu_preempt_exp_done_unlocked(rnp))
+				if (sync_rcu_exp_done_unlocked(rnp))
 					continue;
 				pr_cont(" l=%u:%d-%d:%#lx/%c",
 					rnp->level, rnp->grplo, rnp->grphi,
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fa08d55..6dbea4b 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -485,7 +485,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
 		WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
 			     (!empty_norm || rnp->qsmask));
-		empty_exp = sync_rcu_preempt_exp_done(rnp);
+		empty_exp = sync_rcu_exp_done(rnp);
 		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
 		np = rcu_next_node_entry(t, rnp);
 		list_del_init(&t->rcu_node_entry);
@@ -509,7 +509,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 		 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
 		 * so we must take a snapshot of the expedited state.
 		 */
-		empty_exp_now = sync_rcu_preempt_exp_done(rnp);
+		empty_exp_now = sync_rcu_exp_done(rnp);
 		if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
 			trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
 							 rnp->gp_seq,
-- 
2.9.5



* [PATCH tip/core/rcu 08/10] rcu: Update tree_exp.h function-header comments
From: paulmck @ 2019-12-10  4:01 UTC (permalink / raw)
  To: rcu
  Cc: linux-kernel, kernel-team, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, peterz, rostedt, dhowells,
	edumazet, fweisbec, oleg, joel, Paul E. McKenney

From: "Paul E. McKenney" <paulmck@kernel.org>

The function-header comments in kernel/rcu/tree_exp.h have gotten a bit
out of date, so this commit updates a number of them.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 3923c07..1eafbcd 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -21,7 +21,7 @@ static void rcu_exp_gp_seq_start(void)
 }
 
 /*
- * Return then value that expedited-grace-period counter will have
+ * Return the value that the expedited-grace-period counter will have
  * at the end of the current grace period.
  */
 static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void)
@@ -39,7 +39,9 @@ static void rcu_exp_gp_seq_end(void)
 }
 
 /*
- * Take a snapshot of the expedited-grace-period counter.
+ * Take a snapshot of the expedited-grace-period counter, which is the
+ * earliest value that will indicate that a full grace period has
+ * elapsed since the current time.
  */
 static unsigned long rcu_exp_gp_seq_snap(void)
 {
@@ -143,22 +145,18 @@ static void __maybe_unused sync_exp_reset_tree(void)
  * Return non-zero if there is no RCU expedited grace period in progress
  * for the specified rcu_node structure, in other words, if all CPUs and
  * tasks covered by the specified rcu_node structure have done their bit
- * for the current expedited grace period.  Works only for preemptible
- * RCU -- other RCU implementation use other means.
- *
- * Caller must hold the specificed rcu_node structure's ->lock
+ * for the current expedited grace period.
  */
 static bool sync_rcu_exp_done(struct rcu_node *rnp)
 {
 	raw_lockdep_assert_held_rcu_node(rnp);
-
 	return rnp->exp_tasks == NULL &&
 	       READ_ONCE(rnp->expmask) == 0;
 }
 
 /*
- * Like sync_rcu_exp_done(), but this function assumes the caller doesn't
- * hold the rcu_node's ->lock, and will acquire and release the lock itself
+ * Like sync_rcu_exp_done(), but where the caller does not hold the
+ * rcu_node's ->lock.
  */
 static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp)
 {
@@ -180,8 +178,6 @@ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp)
  * which the task was queued or to one of that rcu_node structure's ancestors,
  * recursively up the tree.  (Calm down, calm down, we do the recursion
  * iteratively!)
- *
- * Caller must hold the specified rcu_node structure's ->lock.
  */
 static void __rcu_report_exp_rnp(struct rcu_node *rnp,
 				 bool wake, unsigned long flags)
@@ -189,6 +185,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
 {
 	unsigned long mask;
 
+	raw_lockdep_assert_held_rcu_node(rnp);
 	for (;;) {
 		if (!sync_rcu_exp_done(rnp)) {
 			if (!rnp->expmask)
@@ -452,6 +449,10 @@ static void sync_rcu_exp_select_cpus(void)
 			flush_work(&rnp->rew.rew_work);
 }
 
+/*
+ * Wait for the expedited grace period to elapse, issuing any needed
+ * RCU CPU stall warnings along the way.
+ */
 static void synchronize_sched_expedited_wait(void)
 {
 	int cpu;
@@ -781,7 +782,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
  * implementations, it is still unfriendly to real-time workloads, so is
  * thus not recommended for any sort of common-case code.  In fact, if
  * you are using synchronize_rcu_expedited() in a loop, please restructure
- * your code to batch your updates, and then Use a single synchronize_rcu()
+ * your code to batch your updates, and then use a single synchronize_rcu()
  * instead.
  *
  * This has the same semantics as (but is more brutal than) synchronize_rcu().
-- 
2.9.5



* [PATCH tip/core/rcu 09/10] rcu: Replace synchronize_sched_expedited_wait() "_sched" with "_rcu"
From: paulmck @ 2019-12-10  4:01 UTC (permalink / raw)
  To: rcu
  Cc: linux-kernel, kernel-team, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, peterz, rostedt, dhowells,
	edumazet, fweisbec, oleg, joel, Paul E. McKenney

From: "Paul E. McKenney" <paulmck@kernel.org>

After RCU flavor consolidation, synchronize_sched_expedited_wait() does
both RCU-preempt and RCU-sched, whichever happens to have been built into
the running kernel.  This commit therefore changes this function's name
to synchronize_rcu_expedited_wait() to reflect its new generic nature.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 1eafbcd..081a179 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -453,7 +453,7 @@ static void sync_rcu_exp_select_cpus(void)
  * Wait for the expedited grace period to elapse, issuing any needed
  * RCU CPU stall warnings along the way.
  */
-static void synchronize_sched_expedited_wait(void)
+static void synchronize_rcu_expedited_wait(void)
 {
 	int cpu;
 	unsigned long jiffies_stall;
@@ -538,7 +538,7 @@ static void rcu_exp_wait_wake(unsigned long s)
 {
 	struct rcu_node *rnp;
 
-	synchronize_sched_expedited_wait();
+	synchronize_rcu_expedited_wait();
 
 	// Switch over to wakeup mode, allowing the next GP to proceed.
 	// End the previous grace period only after acquiring the mutex
-- 
2.9.5



* [PATCH tip/core/rcu 10/10] rcu: Enable tick for nohz_full CPUs slow to provide expedited QS
From: paulmck @ 2019-12-10  4:01 UTC (permalink / raw)
  To: rcu
  Cc: linux-kernel, kernel-team, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, peterz, rostedt, dhowells,
	edumazet, fweisbec, oleg, joel, Paul E. McKenney

From: "Paul E. McKenney" <paulmck@kernel.org>

An expedited grace period can be stalled by a nohz_full CPU looping
in kernel context.  This possibility is currently handled by some
carefully crafted checks in rcu_read_unlock_special() that enlist help
from ksoftirqd when permitted by the scheduler.  However, it is exactly
these checks that require that the scheduler avoid holding any of its rq or
pi locks across rcu_read_unlock() without also having held them across
the entire RCU read-side critical section.

It would therefore be very nice if expedited grace periods could
handle nohz_full CPUs looping in kernel context without such checks.
This commit therefore adds code to the expedited grace period's wait
and cleanup paths that forces the scheduler-clock interrupt on for CPUs
that fail to quickly supply a quiescent state.  "Quickly" is currently
a hard-coded single-jiffy delay.
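
The mechanism reduces to a set/clear pair on the new tick dependency
bit (a sketch of the two sides added below, with the surrounding loops
and checks elided):

	/* Wait side: if the expedited GP has not ended after one jiffy,
	 * force the scheduler-clock tick on each still-blocking CPU. */
	rdp->rcu_forced_tick_exp = true;
	tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);

	/* Report side: once the CPU's quiescent state is reported, drop
	 * the dependency so the CPU can return to nohz_full operation. */
	rdp->rcu_forced_tick_exp = false;
	tick_dep_clear_cpu(cpu, TICK_DEP_BIT_RCU_EXP);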

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/tick.h  |  5 ++++-
 kernel/rcu/tree.h     |  1 +
 kernel/rcu/tree_exp.h | 52 ++++++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 50 insertions(+), 8 deletions(-)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 7896f792d3..7340613 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -109,8 +109,10 @@ enum tick_dep_bits {
 	TICK_DEP_BIT_PERF_EVENTS	= 1,
 	TICK_DEP_BIT_SCHED		= 2,
 	TICK_DEP_BIT_CLOCK_UNSTABLE	= 3,
-	TICK_DEP_BIT_RCU		= 4
+	TICK_DEP_BIT_RCU		= 4,
+	TICK_DEP_BIT_RCU_EXP		= 5
 };
+#define TICK_DEP_BIT_MAX TICK_DEP_BIT_RCU_EXP
 
 #define TICK_DEP_MASK_NONE		0
 #define TICK_DEP_MASK_POSIX_TIMER	(1 << TICK_DEP_BIT_POSIX_TIMER)
@@ -118,6 +120,7 @@ enum tick_dep_bits {
 #define TICK_DEP_MASK_SCHED		(1 << TICK_DEP_BIT_SCHED)
 #define TICK_DEP_MASK_CLOCK_UNSTABLE	(1 << TICK_DEP_BIT_CLOCK_UNSTABLE)
 #define TICK_DEP_MASK_RCU		(1 << TICK_DEP_BIT_RCU)
+#define TICK_DEP_MASK_RCU_EXP		(1 << TICK_DEP_BIT_RCU_EXP)
 
 #ifdef CONFIG_NO_HZ_COMMON
 extern bool tick_nohz_enabled;
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 055c317..f9253ed 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -182,6 +182,7 @@ struct rcu_data {
 	bool rcu_need_heavy_qs;		/* GP old, so heavy quiescent state! */
 	bool rcu_urgent_qs;		/* GP old need light quiescent state. */
 	bool rcu_forced_tick;		/* Forced tick to provide QS. */
+	bool rcu_forced_tick_exp;	/*   ... provide QS to expedited GP. */
 #ifdef CONFIG_RCU_FAST_NO_HZ
 	bool all_lazy;			/* All CPU's CBs lazy at idle start? */
 	unsigned long last_accelerate;	/* Last jiffy CBs were accelerated. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 081a179..30b2a02 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -230,7 +230,9 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake)
 static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
 				    unsigned long mask, bool wake)
 {
+	int cpu;
 	unsigned long flags;
+	struct rcu_data *rdp;
 
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
 	if (!(rnp->expmask & mask)) {
@@ -238,6 +240,13 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
 		return;
 	}
 	WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
+	for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
+		rdp = per_cpu_ptr(&rcu_data, cpu);
+		if (!IS_ENABLED(CONFIG_NO_HZ_FULL) || !rdp->rcu_forced_tick_exp)
+			continue;
+		rdp->rcu_forced_tick_exp = false;
+		tick_dep_clear_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+	}
 	__rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */
 }
 
@@ -450,6 +459,26 @@ static void sync_rcu_exp_select_cpus(void)
 }
 
 /*
+ * Wait for the expedited grace period to elapse, within time limit.
+ * If the time limit is exceeded without the grace period elapsing,
+ * return false, otherwise return true.
+ */
+static bool synchronize_rcu_expedited_wait_once(long tlimit)
+{
+	int t;
+	struct rcu_node *rnp_root = rcu_get_root();
+
+	t = swait_event_timeout_exclusive(rcu_state.expedited_wq,
+					  sync_rcu_exp_done_unlocked(rnp_root),
+					  tlimit);
+	// Workqueues should not be signaled.
+	if (t > 0 || sync_rcu_exp_done_unlocked(rnp_root))
+		return true;
+	WARN_ON(t < 0);  /* workqueues should not be signaled. */
+	return false;
+}
+
+/*
  * Wait for the expedited grace period to elapse, issuing any needed
  * RCU CPU stall warnings along the way.
  */
@@ -460,22 +489,31 @@ static void synchronize_rcu_expedited_wait(void)
 	unsigned long jiffies_start;
 	unsigned long mask;
 	int ndetected;
+	struct rcu_data *rdp;
 	struct rcu_node *rnp;
 	struct rcu_node *rnp_root = rcu_get_root();
-	int ret;
 
 	trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
 	jiffies_stall = rcu_jiffies_till_stall_check();
 	jiffies_start = jiffies;
+	if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+		if (synchronize_rcu_expedited_wait_once(1))
+			return;
+		rcu_for_each_leaf_node(rnp) {
+			for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+				rdp = per_cpu_ptr(&rcu_data, cpu);
+				if (rdp->rcu_forced_tick_exp)
+					continue;
+				rdp->rcu_forced_tick_exp = true;
+				tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+			}
+		}
+		WARN_ON_ONCE(1);
+	}
 
 	for (;;) {
-		ret = swait_event_timeout_exclusive(
-				rcu_state.expedited_wq,
-				sync_rcu_exp_done_unlocked(rnp_root),
-				jiffies_stall);
-		if (ret > 0 || sync_rcu_exp_done_unlocked(rnp_root))
+		if (synchronize_rcu_expedited_wait_once(jiffies_stall))
 			return;
-		WARN_ON(ret < 0);  /* workqueues should not be signaled. */
 		if (rcu_cpu_stall_suppress)
 			continue;
 		panic_on_rcu_stall();
-- 
2.9.5


