All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
To: linux-kernel@vger.kernel.org
Cc: mingo@elte.hu, laijs@cn.fujitsu.com, dipankar@in.ibm.com,
	akpm@linux-foundation.org, mathieu.desnoyers@polymtl.ca,
	josh@joshtriplett.org, niv@us.ibm.com, tglx@linutronix.de,
	peterz@infradead.org, rostedt@goodmis.org, dhowells@redhat.com,
	edumazet@google.com, darren@dvhart.com, fweisbec@gmail.com,
	sbw@mit.edu, "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Subject: [PATCH RFC nohz_full v2 6/7] nohz_full: Add full-system-idle state machine
Date: Fri, 28 Jun 2013 13:10:21 -0700	[thread overview]
Message-ID: <1372450222-19420-6-git-send-email-paulmck@linux.vnet.ibm.com> (raw)
In-Reply-To: <1372450222-19420-1-git-send-email-paulmck@linux.vnet.ibm.com>

From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>

This commit adds the state machine that takes the per-CPU idle data
as input and produces a full-system-idle indication as output.  This
state machine is driven out of RCU's quiescent-state-forcing
mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU
idle state and then rcu_sysidle_report() to drive the state machine.

The full-system-idle state is sampled using rcu_sys_is_idle(), which
also drives the state machine if RCU is idle (and does so by forcing
RCU to become non-idle).  This function returns true if all but the
timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long
enough to avoid memory contention on the full_sysidle_state state
variable.  The rcu_sysidle_force_exit() may be called externally
to reset the state machine back into non-idle state.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>

Conflicts:

	kernel/rcutree.h
	kernel/rcutree_plugin.h
---
 include/linux/rcupdate.h |  18 ++++
 kernel/rcutree.c         |  16 ++-
 kernel/rcutree.h         |   5 +
 kernel/rcutree_plugin.h  | 263 ++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 295 insertions(+), 7 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 48f1ef9..1aa8d8c 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
 
 
+/* Only for use by adaptive-ticks code. */
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+extern bool rcu_sys_is_idle(void);
+extern void rcu_sysidle_force_exit(void);
+#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+
+static inline bool rcu_sys_is_idle(void)
+{
+	return false;
+}
+
+static inline void rcu_sysidle_force_exit(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+
+
 #endif /* __LINUX_RCUPDATE_H */
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 9971f86..06cfd75 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -718,6 +718,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
 					 bool *isidle, unsigned long *maxj)
 {
 	rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
+	rcu_sysidle_check_cpu(rdp, isidle, maxj);
 	return (rdp->dynticks_snap & 0x1) == 0;
 }
 
@@ -1356,11 +1357,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
 	rsp->n_force_qs++;
 	if (fqs_state == RCU_SAVE_DYNTICK) {
 		/* Collect dyntick-idle snapshots. */
+		if (is_sysidle_rcu_state(rsp)) {
+			isidle = 1;
+			maxj = jiffies - ULONG_MAX / 4;
+		}
 		force_qs_rnp(rsp, dyntick_save_progress_counter,
 			     &isidle, &maxj);
+		rcu_sysidle_report(rsp, isidle, maxj);
 		fqs_state = RCU_FORCE_QS;
 	} else {
 		/* Handle dyntick-idle and offline CPUs. */
+		isidle = 0;
 		force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
 	}
 	/* Clear flag to prevent immediate re-entry. */
@@ -2087,9 +2094,12 @@ static void force_qs_rnp(struct rcu_state *rsp,
 		cpu = rnp->grplo;
 		bit = 1;
 		for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
-			if ((rnp->qsmask & bit) != 0 &&
-			    f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
-				mask |= bit;
+			if ((rnp->qsmask & bit) != 0) {
+				if ((rnp->qsmaskinit & bit) != 0)
+					*isidle = 0;
+				if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
+					mask |= bit;
+			}
 		}
 		if (mask != 0) {
 
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1895043..7326a3c 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu);
 static bool init_nocb_callback_list(struct rcu_data *rdp);
 static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
 static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
+static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
+				  unsigned long *maxj);
+static bool is_sysidle_rcu_state(struct rcu_state *rsp);
+static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
+			       unsigned long maxj);
 static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
 
 #endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index a4b2c09..9b60df5 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -28,7 +28,7 @@
 #include <linux/gfp.h>
 #include <linux/oom.h>
 #include <linux/smpboot.h>
-#include <linux/tick.h>
+#include "time/tick-internal.h"
 
 #define RCU_KTHREAD_PRIO 1
 
@@ -2395,12 +2395,12 @@ static void rcu_kick_nohz_cpu(int cpu)
  * most active flavor of RCU.
  */
 #ifdef CONFIG_PREEMPT_RCU
-static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_preempt_state;
+static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
 #else /* #ifdef CONFIG_PREEMPT_RCU */
-static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_sched_state;
+static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
-static int __maybe_unused full_sysidle_state; /* Current system-idle state. */
+static int full_sysidle_state;		/* Current system-idle state. */
 #define RCU_SYSIDLE_NOT		0	/* Some CPU is not idle. */
 #define RCU_SYSIDLE_SHORT	1	/* All CPUs idle for brief period. */
 #define RCU_SYSIDLE_FULL	2	/* All CPUs idle, ready for sysidle. */
@@ -2442,6 +2442,38 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
 }
 
 /*
+ * Unconditionally force exit from full system-idle state.  This is
+ * invoked when a normal CPU exits idle, but must be called separately
+ * for the timekeeping CPU (tick_do_timer_cpu).  The reason for this
+ * is that the timekeeping CPU is permitted to take scheduling-clock
+ * interrupts while the system is in system-idle state, and of course
+ * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
+ * interrupt from any other type of interrupt.
+ */
+void rcu_sysidle_force_exit(void)
+{
+	int oldstate = ACCESS_ONCE(full_sysidle_state);
+	int newoldstate;
+
+	/*
+	 * Each pass through the following loop attempts to exit full
+	 * system-idle state.  If contention proves to be a problem,
+	 * a trylock-based contention tree could be used here.
+	 */
+	while (oldstate > RCU_SYSIDLE_SHORT) {
+		newoldstate = cmpxchg(&full_sysidle_state,
+				      oldstate, RCU_SYSIDLE_NOT);
+		if (oldstate == newoldstate &&
+		    oldstate == RCU_SYSIDLE_FULL_NOTED) {
+			rcu_kick_nohz_cpu(tick_do_timer_cpu);
+			return; /* We cleared it, done! */
+		}
+		oldstate = newoldstate;
+	}
+	smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
+}
+
+/*
  * Invoked to note entry to irq or task transition from idle.  Note that
  * usermode execution does -not- count as idle here!  The caller must
  * have disabled interrupts.
@@ -2474,6 +2506,214 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
 	atomic_inc(&rdtp->dynticks_idle);
 	smp_mb__after_atomic_inc();
 	WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
+
+	/*
+	 * If we are the timekeeping CPU, we are permitted to be non-idle
+	 * during a system-idle state.  This must be the case, because
+	 * the timekeeping CPU has to take scheduling-clock interrupts
+	 * during the time that the system is transitioning to full
+	 * system-idle state.  This means that the timekeeping CPU must
+	 * invoke rcu_sysidle_force_exit() directly if it does anything
+	 * more than take a scheduling-clock interrupt.
+	 */
+	if (smp_processor_id() == tick_do_timer_cpu)
+		return;
+
+	/* Update system-idle state: We are clearly no longer fully idle! */
+	rcu_sysidle_force_exit();
+}
+
+/*
+ * Check to see if the current CPU is idle.  Note that usermode execution
+ * does not count as idle.  The caller must have disabled interrupts.
+ */
+static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
+				  unsigned long *maxj)
+{
+	int cur;
+	int curnmi;
+	unsigned long j;
+	struct rcu_dynticks *rdtp = rdp->dynticks;
+
+	/*
+	 * If some other CPU has already reported non-idle, if this is
+	 * not the flavor of RCU that tracks sysidle state, or if this
+	 * is an offline or the timekeeping CPU, nothing to do.
+	 */
+	if (!*isidle || rdp->rsp != rcu_sysidle_state ||
+	    cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
+		return;
+	/* WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); */
+
+	/*
+	 * Pick up current idle and NMI-nesting counters, check.  We check
+	 * for NMIs using RCU's main ->dynticks counter.  This works because
+	 * any time ->dynticks has its low bit set, ->dynticks_idle will
+	 * too -- unless the only reason that ->dynticks's low bit is set
+	 * is due to an NMI from idle.  Which is exactly the case we need
+	 * to account for.
+	 */
+	cur = atomic_read(&rdtp->dynticks_idle);
+	curnmi = atomic_read(&rdtp->dynticks);
+	if ((cur & 0x1) || (curnmi & 0x1)) {
+		*isidle = 0; /* We are not idle! */
+		return;
+	}
+	smp_mb(); /* Read counters before timestamps. */
+
+	/* Pick up timestamps. */
+	j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
+	/* If this CPU entered idle more recently, update maxj timestamp. */
+	if (ULONG_CMP_LT(*maxj, j))
+		*maxj = j;
+}
+
+/*
+ * Is this the flavor of RCU that is handling full-system idle?
+ */
+static bool is_sysidle_rcu_state(struct rcu_state *rsp)
+{
+	return rsp == rcu_sysidle_state;
+}
+
+/*
+ * Return a delay in jiffies based on the number of CPUs, rcu_node
+ * leaf fanout, and jiffies tick rate.  The idea is to allow larger
+ * systems more time to transition to full-idle state in order to
+ * avoid the cache thrashing that otherwise occur on the state variable.
+ * Really small systems (less than a couple of tens of CPUs) should
+ * instead use a single global atomically incremented counter, and later
+ * versions of this will automatically reconfigure themselves accordingly.
+ */
+static unsigned long rcu_sysidle_delay(void)
+{
+	if (nr_cpu_ids <= RCU_SYSIDLE_SMALL)
+		return 0;
+	return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
+}
+
+/*
+ * Advance the full-system-idle state.  This is invoked when all of
+ * the non-timekeeping CPUs are idle.
+ */
+static void rcu_sysidle(unsigned long j)
+{
+	/* Check the current state. */
+	switch (ACCESS_ONCE(full_sysidle_state)) {
+	case RCU_SYSIDLE_NOT:
+
+		/* First time all are idle, so note a short idle period. */
+		ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
+		break;
+
+	case RCU_SYSIDLE_SHORT:
+
+		/*
+		 * Idle for a bit, time to advance to next state?
+		 * cmpxchg failure means race with non-idle, let them win.
+		 */
+		if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
+			(void)cmpxchg(&full_sysidle_state,
+				      RCU_SYSIDLE_SHORT, RCU_SYSIDLE_FULL);
+		break;
+
+	default:
+		break;
+	}
+}
+
+/*
+ * Found a non-idle non-timekeeping CPU, so kick the system-idle state
+ * back to the beginning.
+ */
+static void rcu_sysidle_cancel(void)
+{
+	smp_mb();
+	ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
+}
+
+/*
+ * Update the sysidle state based on the results of a force-quiescent-state
+ * scan of the CPUs' dyntick-idle state.
+ */
+static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
+			       unsigned long maxj)
+{
+	if (rsp != rcu_sysidle_state)
+		return;  /* Wrong flavor, ignore. */
+	if (isidle)
+		rcu_sysidle(maxj);    /* More idle! */
+	else
+		rcu_sysidle_cancel(); /* Idle is over. */
+}
+
+/* Callback and function for forcing an RCU grace period. */
+struct rcu_sysidle_head {
+	struct rcu_head rh;
+	int inuse;
+};
+
+static void rcu_sysidle_cb(struct rcu_head *rhp)
+{
+	struct rcu_sysidle_head *rshp;
+
+	smp_mb();  /* grace period precedes setting inuse. */
+	rshp = container_of(rhp, struct rcu_sysidle_head, rh);
+	ACCESS_ONCE(rshp->inuse) = 0;
+}
+
+/*
+ * Check to see if the system is fully idle, other than the timekeeping CPU.
+ * The caller must have disabled interrupts.
+ */
+bool rcu_sys_is_idle(void)
+{
+	static struct rcu_sysidle_head rsh;
+	int rss = ACCESS_ONCE(full_sysidle_state);
+
+	WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
+
+	/* Handle small-system case by doing a full scan of CPUs. */
+	if (nr_cpu_ids <= RCU_SYSIDLE_SMALL && rss < RCU_SYSIDLE_FULL) {
+		int cpu;
+		bool isidle = true;
+		unsigned long maxj = jiffies - ULONG_MAX / 4;
+		struct rcu_data *rdp;
+
+		/* Scan all the CPUs looking for nonidle CPUs. */
+		for_each_possible_cpu(cpu) {
+			rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu);
+			rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
+			if (!isidle)
+				break;
+		}
+		rcu_sysidle_report(rcu_sysidle_state, isidle, maxj);
+		rss = ACCESS_ONCE(full_sysidle_state);
+	}
+
+	/* If this is the first observation of an idle period, record it. */
+	if (rss == RCU_SYSIDLE_FULL) {
+		rss = cmpxchg(&full_sysidle_state,
+			      RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
+		return rss == RCU_SYSIDLE_FULL;
+	}
+
+	smp_mb(); /* ensure rss load happens before later caller actions. */
+
+	/* If already fully idle, tell the caller (in case of races). */
+	if (rss == RCU_SYSIDLE_FULL_NOTED)
+		return true;
+
+	/*
+	 * If we aren't there yet, and a grace period is not in flight,
+	 * initiate a grace period.  Either way, tell the caller that
+	 * we are not there yet.
+	 */
+	if (nr_cpu_ids > RCU_SYSIDLE_SMALL &&
+	    !rcu_gp_in_progress(rcu_sysidle_state) &&
+	    !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
+		call_rcu(&rsh.rh, rcu_sysidle_cb);
+	return false;
 }
 
 /*
@@ -2494,6 +2734,21 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
 {
 }
 
+static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
+				  unsigned long *maxj)
+{
+}
+
+static bool is_sysidle_rcu_state(struct rcu_state *rsp)
+{
+	return false;
+}
+
+static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
+			       unsigned long maxj)
+{
+}
+
 static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
 {
 }
-- 
1.8.1.5


  parent reply	other threads:[~2013-06-28 20:10 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-06-28 20:09 [PATCH RFC nohz_full 0/7] v2 Provide infrastructure for full-system idle Paul E. McKenney
2013-06-28 20:10 ` [PATCH RFC nohz_full v2 1/7] nohz_full: Add Kconfig parameter for scalable detection of all-idle state Paul E. McKenney
2013-06-28 20:10   ` [PATCH RFC nohz_full v2 2/7] nohz_full: Add rcu_dyntick data " Paul E. McKenney
2013-07-01 15:31     ` Josh Triplett
2013-07-01 15:52       ` Paul E. McKenney
2013-07-01 18:16         ` Josh Triplett
2013-07-01 18:23           ` Paul E. McKenney
2013-07-01 18:34             ` Josh Triplett
2013-07-01 19:16               ` Paul E. McKenney
2013-07-02  5:10                 ` Mike Galbraith
2013-07-02  5:58                   ` Paul E. McKenney
2013-06-28 20:10   ` [PATCH RFC nohz_full v2 3/7] nohz_full: Add per-CPU idle-state tracking Paul E. McKenney
2013-07-01 15:33     ` Josh Triplett
2013-06-28 20:10   ` [PATCH RFC nohz_full v2 4/7] nohz_full: Add full-system idle states and variables Paul E. McKenney
2013-06-28 20:10   ` [PATCH RFC nohz_full v2 5/7] nohz_full: Add full-system-idle arguments to API Paul E. McKenney
2013-06-28 20:10   ` Paul E. McKenney [this message]
2013-07-01 16:35     ` [PATCH RFC nohz_full v2 6/7] nohz_full: Add full-system-idle state machine Frederic Weisbecker
2013-07-01 18:10       ` Paul E. McKenney
2013-07-01 20:55         ` Frederic Weisbecker
2013-07-01 16:53     ` Frederic Weisbecker
2013-07-01 18:17       ` Paul E. McKenney
2013-07-01 21:38     ` Frederic Weisbecker
2013-07-01 22:51       ` Paul E. McKenney
2013-06-28 20:10   ` [PATCH RFC nohz_full v2 7/7] nohz_full: Force RCU's grace-period kthreads onto timekeeping CPU Paul E. McKenney
2013-07-01 15:19 ` [PATCH RFC nohz_full 0/7] v2 Provide infrastructure for full-system idle Andi Kleen
2013-07-01 16:03   ` Paul E. McKenney
2013-07-01 16:19     ` Andi Kleen
2013-07-01 19:19       ` Paul E. McKenney
2013-07-01 19:43 ` Christoph Lameter
2013-07-01 19:56   ` Paul E. McKenney
2013-07-01 20:24     ` Christoph Lameter
2013-07-01 20:43       ` Thomas Gleixner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1372450222-19420-6-git-send-email-paulmck@linux.vnet.ibm.com \
    --to=paulmck@linux.vnet.ibm.com \
    --cc=akpm@linux-foundation.org \
    --cc=darren@dvhart.com \
    --cc=dhowells@redhat.com \
    --cc=dipankar@in.ibm.com \
    --cc=edumazet@google.com \
    --cc=fweisbec@gmail.com \
    --cc=josh@joshtriplett.org \
    --cc=laijs@cn.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mathieu.desnoyers@polymtl.ca \
    --cc=mingo@elte.hu \
    --cc=niv@us.ibm.com \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=sbw@mit.edu \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.