* [PATCH RFC tip/core/rcu 01/20] rcu: add priority-inversion testing to rcutorture
From: Paul E. McKenney @ 2010-12-17 20:54 UTC
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Paul E. McKenney

Add an optional test to force long-term preemption of RCU read-side
critical sections, controlled by the new test_boost, test_boost_interval,
and test_boost_duration module parameters.  This is used to test RCU
priority boosting: each boost kthread repeatedly posts an RCU callback
and spins until it is invoked, and a callback that takes longer than
test_boost_duration seconds (less half a second of slack) to be invoked
is counted as a boost failure.
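
As a reading aid for the loops below: the test compares times using
unsigned jiffies arithmetic rather than "<", so that counter wraparound
is handled correctly.  A minimal user-space sketch of the idiom
(illustrative only, not part of the patch):

	#include <limits.h>

	/*
	 * Wraparound-safe "is now still before deadline?" check, matching
	 * the "jiffies - deadline > ULONG_MAX / 2" tests in the loops
	 * below.  Equivalent in spirit to the kernel's time_before().
	 */
	static int still_before(unsigned long now, unsigned long deadline)
	{
		return now - deadline > ULONG_MAX / 2;
	}

If now precedes deadline, the unsigned subtraction wraps to a value
greater than ULONG_MAX / 2, so the test remains correct even when the
jiffies counter itself wraps.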

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutorture.c |  270 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 259 insertions(+), 11 deletions(-)

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9d8e8fb..89613f9 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,6 +47,7 @@
 #include <linux/srcu.h>
 #include <linux/slab.h>
 #include <asm/byteorder.h>
+#include <linux/sched.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
@@ -64,6 +65,9 @@ static int irqreader = 1;	/* RCU readers from irq (timers). */
 static int fqs_duration = 0;	/* Duration of bursts (us), 0 to disable. */
 static int fqs_holdoff = 0;	/* Hold time within burst (us). */
 static int fqs_stutter = 3;	/* Wait time between bursts (s). */
+static int test_boost = 1;	/* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
+static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
+static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
 static char *torture_type = "rcu"; /* What RCU implementation to torture. */
 
 module_param(nreaders, int, 0444);
@@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444);
 MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
 module_param(fqs_stutter, int, 0444);
 MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
+module_param(test_boost, int, 0444);
+MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
+module_param(test_boost_interval, int, 0444);
+MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
+module_param(test_boost_duration, int, 0444);
+MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
 module_param(torture_type, charp, 0444);
 MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
 
@@ -109,6 +119,7 @@ static struct task_struct *stats_task;
 static struct task_struct *shuffler_task;
 static struct task_struct *stutter_task;
 static struct task_struct *fqs_task;
+static struct task_struct *boost_tasks[NR_CPUS];
 
 #define RCU_TORTURE_PIPE_LEN 10
 
@@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail;
 static atomic_t n_rcu_torture_free;
 static atomic_t n_rcu_torture_mberror;
 static atomic_t n_rcu_torture_error;
+static long n_rcu_torture_boost_ktrerror;
+static long n_rcu_torture_boost_rterror;
+static long n_rcu_torture_boost_allocerror;
+static long n_rcu_torture_boost_afferror;
+static long n_rcu_torture_boost_failure;
+static long n_rcu_torture_boosts;
 static long n_rcu_torture_timers;
 static struct list_head rcu_torture_removed;
 static cpumask_var_t shuffle_tmp_mask;
@@ -147,6 +164,16 @@ static int stutter_pause_test;
 #endif
 int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
 
+#ifdef CONFIG_RCU_BOOST
+#define rcu_can_boost() 1
+#else /* #ifdef CONFIG_RCU_BOOST */
+#define rcu_can_boost() 0
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
+static unsigned long boost_starttime;	/* jiffies of next boost test start. */
+DEFINE_MUTEX(boost_mutex);		/* protect setting boost_starttime */
+					/*  and boost task create/destroy. */
+
 /* Mediate rmmod and system shutdown.  Concurrent rmmod & shutdown illegal! */
 
 #define FULLSTOP_DONTSTOP 0	/* Normal operation. */
@@ -277,6 +304,7 @@ struct rcu_torture_ops {
 	void (*fqs)(void);
 	int (*stats)(char *page);
 	int irq_capable;
+	int can_boost;
 	char *name;
 };
 
@@ -366,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = {
 	.fqs		= rcu_force_quiescent_state,
 	.stats		= NULL,
 	.irq_capable	= 1,
+	.can_boost	= rcu_can_boost(),
 	.name		= "rcu"
 };
 
@@ -408,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
 	.fqs		= rcu_force_quiescent_state,
 	.stats		= NULL,
 	.irq_capable	= 1,
+	.can_boost	= rcu_can_boost(),
 	.name		= "rcu_sync"
 };
 
@@ -424,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
 	.fqs		= rcu_force_quiescent_state,
 	.stats		= NULL,
 	.irq_capable	= 1,
+	.can_boost	= rcu_can_boost(),
 	.name		= "rcu_expedited"
 };
 
@@ -684,6 +715,110 @@ static struct rcu_torture_ops sched_expedited_ops = {
 };
 
 /*
+ * RCU torture priority-boost testing.  Runs one real-time thread per
+ * CPU for moderate bursts, repeatedly registering RCU callbacks and
+ * spinning waiting for them to be invoked.  If a given callback takes
+ * too long to be invoked, we assume that priority inversion has occurred.
+ */
+
+struct rcu_boost_inflight {
+	struct rcu_head rcu;
+	int inflight;
+};
+
+static void rcu_torture_boost_cb(struct rcu_head *head)
+{
+	struct rcu_boost_inflight *rbip =
+		container_of(head, struct rcu_boost_inflight, rcu);
+
+	smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
+	rbip->inflight = 0;
+}
+
+static int rcu_torture_boost(void *arg)
+{
+	unsigned long call_rcu_time;
+	unsigned long endtime;
+	unsigned long oldstarttime;
+	struct rcu_boost_inflight rbi = { .inflight = 0 };
+	struct sched_param sp;
+
+	VERBOSE_PRINTK_STRING("rcu_torture_boost started");
+
+	/* Set real-time priority. */
+	sp.sched_priority = 1;
+	if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
+		VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
+		n_rcu_torture_boost_rterror++;
+	}
+
+	/* Each pass through the following loop does one boost-test cycle. */
+	do {
+		/* Wait for the next test interval. */
+		oldstarttime = boost_starttime;
+		while (jiffies - oldstarttime > ULONG_MAX / 2) {
+			schedule_timeout_uninterruptible(1);
+			rcu_stutter_wait("rcu_torture_boost");
+			if (kthread_should_stop() ||
+			    fullstop != FULLSTOP_DONTSTOP)
+				goto checkwait;
+		}
+
+		/* Do one boost-test interval. */
+		endtime = oldstarttime + test_boost_duration * HZ;
+		call_rcu_time = jiffies;
+		while (jiffies - endtime > ULONG_MAX / 2) {
+			/* If we don't have a callback in flight, post one. */
+			if (!rbi.inflight) {
+				smp_mb(); /* RCU core before ->inflight = 1. */
+				rbi.inflight = 1;
+				call_rcu(&rbi.rcu, rcu_torture_boost_cb);
+				if (jiffies - call_rcu_time >
+					 test_boost_duration * HZ - HZ / 2) {
+					VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
+					n_rcu_torture_boost_failure++;
+				}
+				call_rcu_time = jiffies;
+			}
+			cond_resched();
+			rcu_stutter_wait("rcu_torture_boost");
+			if (kthread_should_stop() ||
+			    fullstop != FULLSTOP_DONTSTOP)
+				goto checkwait;
+		}
+
+		/*
+		 * Set the start time of the next test interval.
+		 * Yes, this is vulnerable to long delays, but such
+		 * delays simply cause a false negative for the next
+		 * interval.  Besides, we are running at RT priority,
+		 * so delays should be relatively rare.
+		 */
+		while (oldstarttime == boost_starttime) {
+			if (mutex_trylock(&boost_mutex)) {
+				boost_starttime = jiffies +
+						  test_boost_interval * HZ;
+				n_rcu_torture_boosts++;
+				mutex_unlock(&boost_mutex);
+				break;
+			}
+			schedule_timeout_uninterruptible(1);
+		}
+
+		/* Go do the stutter. */
+checkwait:	rcu_stutter_wait("rcu_torture_boost");
+	} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+
+	/* Clean up and exit. */
+	VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
+	rcutorture_shutdown_absorb("rcu_torture_boost");
+	while (!kthread_should_stop() || rbi.inflight)
+		schedule_timeout_uninterruptible(1);
+	smp_mb(); /* order accesses to ->inflight before stack-frame death. */
+	return 0;
+}
+
+/*
  * RCU torture force-quiescent-state kthread.  Repeatedly induces
  * bursts of calls to force_quiescent_state(), increasing the probability
  * of occurrence of some important types of race conditions.
@@ -933,7 +1068,8 @@ rcu_torture_printk(char *page)
 	cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
 	cnt += sprintf(&page[cnt],
 		       "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
-		       "rtmbe: %d nt: %ld",
+		       "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
+		       "rtbf: %ld rtb: %ld nt: %ld",
 		       rcu_torture_current,
 		       rcu_torture_current_version,
 		       list_empty(&rcu_torture_freelist),
@@ -941,8 +1077,19 @@ rcu_torture_printk(char *page)
 		       atomic_read(&n_rcu_torture_alloc_fail),
 		       atomic_read(&n_rcu_torture_free),
 		       atomic_read(&n_rcu_torture_mberror),
+		       n_rcu_torture_boost_ktrerror,
+		       n_rcu_torture_boost_rterror,
+		       n_rcu_torture_boost_allocerror,
+		       n_rcu_torture_boost_afferror,
+		       n_rcu_torture_boost_failure,
+		       n_rcu_torture_boosts,
 		       n_rcu_torture_timers);
-	if (atomic_read(&n_rcu_torture_mberror) != 0)
+	if (atomic_read(&n_rcu_torture_mberror) != 0 ||
+	    n_rcu_torture_boost_ktrerror != 0 ||
+	    n_rcu_torture_boost_rterror != 0 ||
+	    n_rcu_torture_boost_allocerror != 0 ||
+	    n_rcu_torture_boost_afferror != 0 ||
+	    n_rcu_torture_boost_failure != 0)
 		cnt += sprintf(&page[cnt], " !!!");
 	cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
 	if (i > 1) {
@@ -1094,22 +1241,91 @@ rcu_torture_stutter(void *arg)
 }
 
 static inline void
-rcu_torture_print_module_parms(char *tag)
+rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
 {
 	printk(KERN_ALERT "%s" TORTURE_FLAG
 		"--- %s: nreaders=%d nfakewriters=%d "
 		"stat_interval=%d verbose=%d test_no_idle_hz=%d "
 		"shuffle_interval=%d stutter=%d irqreader=%d "
-		"fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
+		"fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
+		"test_boost=%d/%d test_boost_interval=%d "
+		"test_boost_duration=%d\n",
 		torture_type, tag, nrealreaders, nfakewriters,
 		stat_interval, verbose, test_no_idle_hz, shuffle_interval,
-		stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
+		stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
+		test_boost, cur_ops->can_boost,
+		test_boost_interval, test_boost_duration);
 }
 
-static struct notifier_block rcutorture_nb = {
+static struct notifier_block rcutorture_shutdown_nb = {
 	.notifier_call = rcutorture_shutdown_notify,
 };
 
+static void rcutorture_booster_cleanup(int cpu)
+{
+	struct task_struct *t;
+
+	if (boost_tasks[cpu] == NULL)
+		return;
+	mutex_lock(&boost_mutex);
+	VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
+	t = boost_tasks[cpu];
+	boost_tasks[cpu] = NULL;
+	mutex_unlock(&boost_mutex);
+
+	/* This must be outside of the mutex, otherwise deadlock! */
+	kthread_stop(t);
+}
+
+static int rcutorture_booster_init(int cpu)
+{
+	int retval;
+
+	if (boost_tasks[cpu] != NULL)
+		return 0;  /* Already created, nothing more to do. */
+
+	/* Don't allow time recalculation while creating a new task. */
+	mutex_lock(&boost_mutex);
+	VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
+	boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
+					  "rcu_torture_boost");
+	if (IS_ERR(boost_tasks[cpu])) {
+		retval = PTR_ERR(boost_tasks[cpu]);
+		VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
+		n_rcu_torture_boost_ktrerror++;
+		boost_tasks[cpu] = NULL;
+		mutex_unlock(&boost_mutex);
+		return retval;
+	}
+	kthread_bind(boost_tasks[cpu], cpu);
+	wake_up_process(boost_tasks[cpu]);
+	mutex_unlock(&boost_mutex);
+	return 0;
+}
+
+static int rcutorture_cpu_notify(struct notifier_block *self,
+				 unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;
+
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_DOWN_FAILED:
+		(void)rcutorture_booster_init(cpu);
+		break;
+	case CPU_DOWN_PREPARE:
+		rcutorture_booster_cleanup(cpu);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block rcutorture_cpu_nb = {
+	.notifier_call = rcutorture_cpu_notify,
+};
+
 static void
 rcu_torture_cleanup(void)
 {
@@ -1127,7 +1343,7 @@ rcu_torture_cleanup(void)
 	}
 	fullstop = FULLSTOP_RMMOD;
 	mutex_unlock(&fullstop_mutex);
-	unregister_reboot_notifier(&rcutorture_nb);
+	unregister_reboot_notifier(&rcutorture_shutdown_nb);
 	if (stutter_task) {
 		VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
 		kthread_stop(stutter_task);
@@ -1184,6 +1400,12 @@ rcu_torture_cleanup(void)
 		kthread_stop(fqs_task);
 	}
 	fqs_task = NULL;
+	if ((test_boost == 1 && cur_ops->can_boost) ||
+	    test_boost == 2) {
+		unregister_cpu_notifier(&rcutorture_cpu_nb);
+		for_each_possible_cpu(i)
+			rcutorture_booster_cleanup(i);
+	}
 
 	/* Wait for all RCU callbacks to fire.  */
 
@@ -1195,9 +1417,9 @@ rcu_torture_cleanup(void)
 	if (cur_ops->cleanup)
 		cur_ops->cleanup();
 	if (atomic_read(&n_rcu_torture_error))
-		rcu_torture_print_module_parms("End of test: FAILURE");
+		rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
 	else
-		rcu_torture_print_module_parms("End of test: SUCCESS");
+		rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
 }
 
 static int __init
@@ -1242,7 +1464,7 @@ rcu_torture_init(void)
 		nrealreaders = nreaders;
 	else
 		nrealreaders = 2 * num_online_cpus();
-	rcu_torture_print_module_parms("Start of test");
+	rcu_torture_print_module_parms(cur_ops, "Start of test");
 	fullstop = FULLSTOP_DONTSTOP;
 
 	/* Set up the freelist. */
@@ -1263,6 +1485,12 @@ rcu_torture_init(void)
 	atomic_set(&n_rcu_torture_free, 0);
 	atomic_set(&n_rcu_torture_mberror, 0);
 	atomic_set(&n_rcu_torture_error, 0);
+	n_rcu_torture_boost_ktrerror = 0;
+	n_rcu_torture_boost_rterror = 0;
+	n_rcu_torture_boost_allocerror = 0;
+	n_rcu_torture_boost_afferror = 0;
+	n_rcu_torture_boost_failure = 0;
+	n_rcu_torture_boosts = 0;
 	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
 		atomic_set(&rcu_torture_wcount[i], 0);
 	for_each_possible_cpu(cpu) {
@@ -1376,7 +1604,27 @@ rcu_torture_init(void)
 			goto unwind;
 		}
 	}
-	register_reboot_notifier(&rcutorture_nb);
+	if (test_boost_interval < 1)
+		test_boost_interval = 1;
+	if (test_boost_duration < 2)
+		test_boost_duration = 2;
+	if ((test_boost == 1 && cur_ops->can_boost) ||
+	    test_boost == 2) {
+		int retval;
+
+		boost_starttime = jiffies + test_boost_interval * HZ;
+		register_cpu_notifier(&rcutorture_cpu_nb);
+		for_each_possible_cpu(i) {
+			if (cpu_is_offline(i))
+				continue;  /* Heuristic: CPU can go offline. */
+			retval = rcutorture_booster_init(i);
+			if (retval < 0) {
+				firsterr = retval;
+				goto unwind;
+			}
+		}
+	}
+	register_reboot_notifier(&rcutorture_shutdown_nb);
 	mutex_unlock(&fullstop_mutex);
 	return 0;
 
-- 
1.7.3.2



* [PATCH tip/core/rcu 0/20] second preview of RCU patches for 2.6.38
From: Paul E. McKenney @ 2010-12-17 20:54 UTC
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren

Hello!

This patchset contains RCU priority boosting for the tiny RCU
implementations and a number of fixes and cleanups.  The patches
are as follows, with the first 12 being updates from the earlier
posting at https://lkml.org/lkml/2010/11/6/187:

1.	Add rcutorture tests to check for correct priority boosting.
2-6.	RCU priority boosting for the tiny RCU implementations.
	RCU priority boosting for the tree RCU implementations is
	in progress, but not yet ready for prime time.
7.	Fix a few naming holdouts from old Classic RCU.
8.	Move synchronize_sched_expedited() out of sched.c
	(from Lai Jiangshan).
9.	Simplify the RCU callback orphan/adopt code for CPU hotplug
	(from Lai Jiangshan).
10.	Update RCU tracing documentation to reflect patch #9.
11.	Fix a race condition in synchronize_sched_expedited(),
	though Tejun Heo has suggested another approach that might
	be better over the longer term.
12.	Make synchronize_srcu_expedited() spin for a bit to avoid
	blocking -- again, there may be a better long-term fix.
	That said, this adaptive approach seems to work very well
	in practice.
13.	Improve synchronize_sched_expedited()'s ability to batch
	concurrent requests, as foreshadowed above.
14-15.	Make RCU avoid looking for quiescent states in cases when
	they are not required, thereby improving RCU's energy
	efficiency (from Frederic Weisbecker).
16.	Enlist the rnp->qsmask data to simplify #14 and #15 and
	also handle another case where RCU was unnecessarily asking
	CPUs to go through quiescent states.
17.	Limit tree RCU's leaf-level fanout to reduce lock contention.
	Lock contention is higher at the leaf level than elsewhere
	in the tree of rcu_node structures because CPUs synchronize
	at the leaf level to detect grace-period events.
18.	Reduce tree RCU's leaf-level lock contention still further
	by making __call_rcu() less aggressive about starting new
	grace periods.  The aggression is a holdover from ancient
	times predating synchronize_rcu_expedited().
19.	Fix a mismatched-parentheses error in __list_for_each_rcu()
	(from Mariusz Kozlowski).
20.	Given that there are no in-tree users, remove the aforementioned
	__list_for_each_rcu().  This is maintained as a separate
	commit in case there is a use out there somewhere making its
	way into mainline.  (Though I would rather keep this API
	out -- list_for_each_entry_rcu() is much better in most cases.)

For a testing-only version of this patchset from git, please see:

git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-2.6-rcu.git rcu/testing

							Thanx, Paul

 Documentation/RCU/trace.txt   |   12 -
 b/Documentation/RCU/trace.txt |  132 +++++++++++-
 b/include/linux/init_task.h   |    9 
 b/include/linux/rculist.h     |    2 
 b/include/linux/rcupdate.h    |    1 
 b/include/linux/rcutiny.h     |    8 
 b/include/linux/rcutree.h     |    1 
 b/include/linux/sched.h       |   11 -
 b/init/Kconfig                |   39 +++
 b/kernel/rcutiny.c            |   71 +++++-
 b/kernel/rcutiny_plugin.h     |   15 -
 b/kernel/rcutorture.c         |  270 ++++++++++++++++++++++++-
 b/kernel/rcutree.c            |   81 +------
 b/kernel/rcutree.h            |   16 -
 b/kernel/rcutree_plugin.h     |   71 ++++++
 b/kernel/rcutree_trace.c      |    8 
 b/kernel/sched.c              |   69 ------
 b/kernel/srcu.c               |    8 
 include/linux/rculist.h       |    5 
 include/linux/rcupdate.h      |    3 
 include/linux/rcutiny.h       |    5 
 include/linux/rcutree.h       |    1 
 init/Kconfig                  |   16 +
 kernel/rcutiny.c              |   70 ++----
 kernel/rcutiny_plugin.h       |  444 ++++++++++++++++++++++++++++++++++++++++--
 kernel/rcutree.c              |   95 ++++++--
 kernel/rcutree.h              |   45 ++--
 kernel/rcutree_plugin.h       |  110 +++++++---
 kernel/rcutree_trace.c        |    4 
 29 files changed, 1272 insertions(+), 350 deletions(-)


* [PATCH RFC tip/core/rcu 02/20] rcu: move TINY_RCU from softirq to kthread
From: Paul E. McKenney @ 2010-12-17 20:54 UTC
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Paul E. McKenney

From: Paul E. McKenney <paul.mckenney@linaro.org>

If RCU priority boosting is to be meaningful, callback invocation must
be boosted in addition to preempted RCU readers.  Otherwise, in the
presence of CPU-bound real-time threads, the grace period ends, but the
callbacks don't get invoked.  If the callbacks don't get invoked, the
associated memory doesn't get freed, so the system is still subject
to OOM.

But it is not reasonable to priority-boost RCU_SOFTIRQ, so this commit
moves the callback invocations to a kthread, which can be boosted easily.
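
For concreteness, once the callbacks run in a kthread, boosting reduces
to giving that kthread real-time priority, as a later patch in this
series does.  A sketch using the names that patch introduces:

	struct sched_param sp;

	sp.sched_priority = 1;	/* RCU_BOOST_PRIO in patch 3 of this series. */
	sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);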

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h |    1 -
 include/linux/rcutiny.h  |    8 ++---
 include/linux/rcutree.h  |    1 +
 kernel/rcutiny.c         |   71 +++++++++++++++++++++++++++++++++++++--------
 kernel/rcutiny_plugin.h  |   15 +++++----
 5 files changed, 70 insertions(+), 26 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 03cda7b..7142ee3 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -118,7 +118,6 @@ static inline int rcu_preempt_depth(void)
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
 /* Internal to kernel */
-extern void rcu_init(void);
 extern void rcu_sched_qs(int cpu);
 extern void rcu_bh_qs(int cpu);
 extern void rcu_check_callbacks(int cpu, int user);
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 13877cb..ea025a6 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -27,7 +27,9 @@
 
 #include <linux/cache.h>
 
-#define rcu_init_sched()	do { } while (0)
+static inline void rcu_init(void)
+{
+}
 
 #ifdef CONFIG_TINY_RCU
 
@@ -125,16 +127,12 @@ static inline void rcu_cpu_stall_reset(void)
 }
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-
 extern int rcu_scheduler_active __read_mostly;
 extern void rcu_scheduler_starting(void);
-
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
 static inline void rcu_scheduler_starting(void)
 {
 }
-
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 #endif /* __LINUX_RCUTINY_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 95518e6..c0e9683 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,6 +30,7 @@
 #ifndef __LINUX_RCUTREE_H
 #define __LINUX_RCUTREE_H
 
+extern void rcu_init(void);
 extern void rcu_note_context_switch(int cpu);
 extern int rcu_needs_cpu(int cpu);
 extern void rcu_cpu_stall_reset(void);
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index d806735..86eef29 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -59,8 +59,15 @@ int rcu_scheduler_active __read_mostly;
 EXPORT_SYMBOL_GPL(rcu_scheduler_active);
 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
+/* Controls for rcu_cbs() kthread, replacing RCU_SOFTIRQ used previously. */
+static struct task_struct *rcu_cbs_task;
+static DECLARE_WAIT_QUEUE_HEAD(rcu_cbs_wq);
+static unsigned long have_rcu_cbs;
+static void invoke_rcu_cbs(void);
+
 /* Forward declarations for rcutiny_plugin.h. */
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+static int rcu_cbs(void *arg);
 static void __call_rcu(struct rcu_head *head,
 		       void (*func)(struct rcu_head *rcu),
 		       struct rcu_ctrlblk *rcp);
@@ -123,7 +130,7 @@ void rcu_sched_qs(int cpu)
 {
 	if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
 	    rcu_qsctr_help(&rcu_bh_ctrlblk))
-		raise_softirq(RCU_SOFTIRQ);
+		invoke_rcu_cbs();
 }
 
 /*
@@ -132,7 +139,7 @@ void rcu_sched_qs(int cpu)
 void rcu_bh_qs(int cpu)
 {
 	if (rcu_qsctr_help(&rcu_bh_ctrlblk))
-		raise_softirq(RCU_SOFTIRQ);
+		invoke_rcu_cbs();
 }
 
 /*
@@ -152,10 +159,10 @@ void rcu_check_callbacks(int cpu, int user)
 }
 
 /*
- * Helper function for rcu_process_callbacks() that operates on the
- * specified rcu_ctrlkblk structure.
+ * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
+ * whose grace period has elapsed.
  */
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
+static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 {
 	struct rcu_head *next, *list;
 	unsigned long flags;
@@ -180,19 +187,52 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 		next = list->next;
 		prefetch(next);
 		debug_rcu_head_unqueue(list);
+		local_bh_disable();
 		list->func(list);
+		local_bh_enable();
 		list = next;
 	}
 }
 
 /*
- * Invoke any callbacks whose grace period has completed.
+ * This kthread invokes RCU callbacks whose grace periods have
+ * elapsed.  It is awakened as needed, and takes the place of the
+ * RCU_SOFTIRQ that was used previously for this purpose.
+ * This is a kthread, but it is never stopped, at least not until
+ * the system goes down.
+ */
+static int rcu_cbs(void *arg)
+{
+	unsigned long work;
+	unsigned long flags;
+
+	for (;;) {
+		wait_event(rcu_cbs_wq, have_rcu_cbs != 0);
+		local_irq_save(flags);
+		work = have_rcu_cbs;
+		have_rcu_cbs = 0;
+		local_irq_restore(flags);
+		if (work) {
+			rcu_process_callbacks(&rcu_sched_ctrlblk);
+			rcu_process_callbacks(&rcu_bh_ctrlblk);
+			rcu_preempt_process_callbacks();
+		}
+	}
+
+	return 0;  /* Not reached, but needed to shut gcc up. */
+}
+
+/*
+ * Wake up rcu_cbs() to process callbacks now eligible for invocation.
  */
-static void rcu_process_callbacks(struct softirq_action *unused)
+static void invoke_rcu_cbs(void)
 {
-	__rcu_process_callbacks(&rcu_sched_ctrlblk);
-	__rcu_process_callbacks(&rcu_bh_ctrlblk);
-	rcu_preempt_process_callbacks();
+	unsigned long flags;
+
+	local_irq_save(flags);
+	have_rcu_cbs = 1;
+	wake_up(&rcu_cbs_wq);
+	local_irq_restore(flags);
 }
 
 /*
@@ -282,7 +322,12 @@ void rcu_barrier_sched(void)
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
 
-void __init rcu_init(void)
+/*
+ * Spawn the kthread that invokes RCU callbacks.
+ */
+static int __init rcu_spawn_kthreads(void)
 {
-	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+	rcu_cbs_task = kthread_run(rcu_cbs, NULL, "rcu_cbs");
+	return 0;
 }
+early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 6ceca4f..95f9239 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -22,6 +22,8 @@
  * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
  */
 
+#include <linux/kthread.h>
+
 #ifdef CONFIG_TINY_PREEMPT_RCU
 
 #include <linux/delay.h>
@@ -164,9 +166,9 @@ static void rcu_preempt_cpu_qs(void)
 	if (!rcu_preempt_blocked_readers_any())
 		rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
 
-	/* If there are done callbacks, make RCU_SOFTIRQ process them. */
+	/* If there are done callbacks, cause them to be invoked. */
 	if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
-		raise_softirq(RCU_SOFTIRQ);
+		invoke_rcu_cbs();
 }
 
 /*
@@ -374,7 +376,7 @@ static void rcu_preempt_check_callbacks(void)
 		rcu_preempt_cpu_qs();
 	if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
 	    rcu_preempt_ctrlblk.rcb.donetail)
-		raise_softirq(RCU_SOFTIRQ);
+		invoke_rcu_cbs();
 	if (rcu_preempt_gp_in_progress() &&
 	    rcu_cpu_blocking_cur_gp() &&
 	    rcu_preempt_running_reader())
@@ -383,7 +385,7 @@ static void rcu_preempt_check_callbacks(void)
 
 /*
  * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
- * update, so this is invoked from __rcu_process_callbacks() to
+ * update, so this is invoked from rcu_process_callbacks() to
  * handle that case.  Of course, it is invoked for all flavors of
  * RCU, but RCU callbacks can appear only on one of the lists, and
  * neither ->nexttail nor ->donetail can possibly be NULL, so there
@@ -400,7 +402,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
  */
 static void rcu_preempt_process_callbacks(void)
 {
-	__rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
+	rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
 }
 
 /*
@@ -599,14 +601,13 @@ static void rcu_preempt_process_callbacks(void)
 #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-
 #include <linux/kernel_stat.h>
 
 /*
  * During boot, we forgive RCU lockdep issues.  After this function is
  * invoked, we start taking RCU lockdep issues seriously.
  */
-void rcu_scheduler_starting(void)
+void __init rcu_scheduler_starting(void)
 {
 	WARN_ON(nr_context_switches() > 0);
 	rcu_scheduler_active = 1;
-- 
1.7.3.2



* [PATCH RFC tip/core/rcu 03/20] rcu: priority boosting for TINY_PREEMPT_RCU
From: Paul E. McKenney @ 2010-12-17 20:54 UTC
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Paul E. McKenney

From: Paul E. McKenney <paul.mckenney@linaro.org>

Add priority boosting, but only for TINY_PREEMPT_RCU.  This is enabled
by the default-off RCU_BOOST kernel configuration parameter.  The
priority to which to boost preempted RCU readers is controlled by the
RCU_BOOST_PRIO parameter (defaulting to real-time priority 1), and the
time to wait before boosting the readers blocking a given grace period
is controlled by the RCU_BOOST_DELAY parameter (defaulting to 500
milliseconds).
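
For reference, the boost delay is converted to jiffies by the macro
added below, rounding up so the configured delay is never shortened:

	RCU_BOOST_DELAY_JIFFIES = DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)

	e.g., 500 ms with HZ=1000 -> 500 jiffies
	      500 ms with HZ=250  -> 125 jiffies
	      500 ms with HZ=100  ->  50 jiffies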

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/init_task.h |    9 ++-
 include/linux/sched.h     |   11 ++-
 init/Kconfig              |   39 +++++++++
 kernel/rcutiny.c          |   66 ++++++---------
 kernel/rcutiny_plugin.h   |  208 ++++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 280 insertions(+), 53 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 2fea6c8..69f91aa 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -81,6 +81,12 @@ extern struct group_info init_groups;
  */
 # define CAP_INIT_BSET  CAP_FULL_SET
 
+#ifdef CONFIG_RCU_BOOST
+#define INIT_TASK_RCU_BOOST()						\
+	.rcu_boost_mutex = NULL,
+#else
+#define INIT_TASK_RCU_BOOST()
+#endif
 #ifdef CONFIG_TREE_PREEMPT_RCU
 #define INIT_TASK_RCU_TREE_PREEMPT()					\
 	.rcu_blocked_node = NULL,
@@ -92,7 +98,8 @@ extern struct group_info init_groups;
 	.rcu_read_lock_nesting = 0,					\
 	.rcu_read_unlock_special = 0,					\
 	.rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),		\
-	INIT_TASK_RCU_TREE_PREEMPT()
+	INIT_TASK_RCU_TREE_PREEMPT()					\
+	INIT_TASK_RCU_BOOST()
 #else
 #define INIT_TASK_RCU_PREEMPT(tsk)
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e18473f..ed1a9bc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1210,6 +1210,9 @@ struct task_struct {
 #ifdef CONFIG_TREE_PREEMPT_RCU
 	struct rcu_node *rcu_blocked_node;
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+	struct rt_mutex *rcu_boost_mutex;
+#endif /* #ifdef CONFIG_RCU_BOOST */
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	struct sched_info sched_info;
@@ -1745,7 +1748,8 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #ifdef CONFIG_PREEMPT_RCU
 
 #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
-#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
+#define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */
+#define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */
 
 static inline void rcu_copy_process(struct task_struct *p)
 {
@@ -1753,7 +1757,10 @@ static inline void rcu_copy_process(struct task_struct *p)
 	p->rcu_read_unlock_special = 0;
 #ifdef CONFIG_TREE_PREEMPT_RCU
 	p->rcu_blocked_node = NULL;
-#endif
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+	p->rcu_boost_mutex = NULL;
+#endif /* #ifdef CONFIG_RCU_BOOST */
 	INIT_LIST_HEAD(&p->rcu_node_entry);
 }
 
diff --git a/init/Kconfig b/init/Kconfig
index a619a1a..48efefc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -450,6 +450,45 @@ config TREE_RCU_TRACE
 	  TREE_PREEMPT_RCU implementations, permitting Makefile to
 	  trivially select kernel/rcutree_trace.c.
 
+config RCU_BOOST
+	bool "Enable RCU priority boosting"
+	depends on RT_MUTEXES && TINY_PREEMPT_RCU
+	default n
+	help
+	  This option boosts the priority of preempted RCU readers that
+	  block the current preemptible RCU grace period for too long.
+	  This option also prevents heavy loads from blocking RCU
+	  callback invocation for all flavors of RCU.
+
+	  Say Y here if you are working with real-time apps or heavy loads.
+	  Say N here if you are unsure.
+
+config RCU_BOOST_PRIO
+	int "Real-time priority to boost RCU readers to"
+	range 1 99
+	depends on RCU_BOOST
+	default 1
+	help
+	  This option specifies the real-time priority to which preempted
+	  RCU readers are to be boosted.  If you are working with CPU-bound
+	  real-time applications, you should specify a priority higher than
+	  the highest-priority CPU-bound application.
+
+	  Specify the real-time priority, or take the default if unsure.
+
+config RCU_BOOST_DELAY
+	int "Milliseconds to delay boosting after RCU grace-period start"
+	range 0 3000
+	depends on RCU_BOOST
+	default 500
+	help
+	  This option specifies the time to wait after the beginning of
+	  a given grace period before priority-boosting preempted RCU
+	  readers blocking that grace period.  Note that any RCU reader
+	  blocking an expedited RCU grace period is boosted immediately.
+
+	  Accept the default if unsure.
+
 endmenu # "RCU Subsystem"
 
 config IKCONFIG
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 86eef29..93d1665 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -36,38 +36,16 @@
 #include <linux/time.h>
 #include <linux/cpu.h>
 
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
-	struct rcu_head *rcucblist;	/* List of pending callbacks (CBs). */
-	struct rcu_head **donetail;	/* ->next pointer of last "done" CB. */
-	struct rcu_head **curtail;	/* ->next pointer of last CB. */
-};
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_sched_ctrlblk = {
-	.donetail	= &rcu_sched_ctrlblk.rcucblist,
-	.curtail	= &rcu_sched_ctrlblk.rcucblist,
-};
-
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
-	.donetail	= &rcu_bh_ctrlblk.rcucblist,
-	.curtail	= &rcu_bh_ctrlblk.rcucblist,
-};
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-/* Controls for rcu_cbs() kthread, replacing RCU_SOFTIRQ used previously. */
-static struct task_struct *rcu_cbs_task;
-static DECLARE_WAIT_QUEUE_HEAD(rcu_cbs_wq);
-static unsigned long have_rcu_cbs;
-static void invoke_rcu_cbs(void);
+/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
+static struct task_struct *rcu_kthread_task;
+static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
+static unsigned long have_rcu_kthread_work;
+static void invoke_rcu_kthread(void);
 
 /* Forward declarations for rcutiny_plugin.h. */
+struct rcu_ctrlblk;
 static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
-static int rcu_cbs(void *arg);
+static int rcu_kthread(void *arg);
 static void __call_rcu(struct rcu_head *head,
 		       void (*func)(struct rcu_head *rcu),
 		       struct rcu_ctrlblk *rcp);
@@ -130,7 +108,7 @@ void rcu_sched_qs(int cpu)
 {
 	if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
 	    rcu_qsctr_help(&rcu_bh_ctrlblk))
-		invoke_rcu_cbs();
+		invoke_rcu_kthread();
 }
 
 /*
@@ -139,7 +117,7 @@ void rcu_sched_qs(int cpu)
 void rcu_bh_qs(int cpu)
 {
 	if (rcu_qsctr_help(&rcu_bh_ctrlblk))
-		invoke_rcu_cbs();
+		invoke_rcu_kthread();
 }
 
 /*
@@ -201,37 +179,41 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
  * This is a kthread, but it is never stopped, at least not until
  * the system goes down.
  */
-static int rcu_cbs(void *arg)
+static int rcu_kthread(void *arg)
 {
 	unsigned long work;
+	unsigned long morework;
 	unsigned long flags;
 
 	for (;;) {
-		wait_event(rcu_cbs_wq, have_rcu_cbs != 0);
+		wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
+		morework = rcu_boost();
 		local_irq_save(flags);
-		work = have_rcu_cbs;
-		have_rcu_cbs = 0;
+		work = have_rcu_kthread_work;
+		have_rcu_kthread_work = morework;
 		local_irq_restore(flags);
 		if (work) {
 			rcu_process_callbacks(&rcu_sched_ctrlblk);
 			rcu_process_callbacks(&rcu_bh_ctrlblk);
 			rcu_preempt_process_callbacks();
 		}
+		schedule_timeout_interruptible(1); /* Leave CPU for others. */
 	}
 
 	return 0;  /* Not reached, but needed to shut gcc up. */
 }
 
 /*
- * Wake up rcu_cbs() to process callbacks now eligible for invocation.
+ * Wake up rcu_kthread() to process callbacks now eligible for invocation
+ * or to boost readers.
  */
-static void invoke_rcu_cbs(void)
+static void invoke_rcu_kthread(void)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
-	have_rcu_cbs = 1;
-	wake_up(&rcu_cbs_wq);
+	have_rcu_kthread_work = 1;
+	wake_up(&rcu_kthread_wq);
 	local_irq_restore(flags);
 }
 
@@ -327,7 +309,11 @@ EXPORT_SYMBOL_GPL(rcu_barrier_sched);
  */
 static int __init rcu_spawn_kthreads(void)
 {
-	rcu_cbs_task = kthread_run(rcu_cbs, NULL, "rcu_cbs");
+	struct sched_param sp;
+
+	rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
+	sp.sched_priority = RCU_BOOST_PRIO;
+	sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
 	return 0;
 }
 early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 95f9239..24f4316 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -24,6 +24,29 @@
 
 #include <linux/kthread.h>
 
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+	struct rcu_head *rcucblist;	/* List of pending callbacks (CBs). */
+	struct rcu_head **donetail;	/* ->next pointer of last "done" CB. */
+	struct rcu_head **curtail;	/* ->next pointer of last CB. */
+};
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_sched_ctrlblk = {
+	.donetail	= &rcu_sched_ctrlblk.rcucblist,
+	.curtail	= &rcu_sched_ctrlblk.rcucblist,
+};
+
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+	.donetail	= &rcu_bh_ctrlblk.rcucblist,
+	.curtail	= &rcu_bh_ctrlblk.rcucblist,
+};
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+int rcu_scheduler_active __read_mostly;
+EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
 #ifdef CONFIG_TINY_PREEMPT_RCU
 
 #include <linux/delay.h>
@@ -48,17 +71,27 @@ struct rcu_preempt_ctrlblk {
 	struct list_head *gp_tasks;
 				/* Pointer to the first task blocking the */
 				/*  current grace period, or NULL if there */
-				/*  is not such task. */
+				/*  is no such task. */
 	struct list_head *exp_tasks;
 				/* Pointer to first task blocking the */
 				/*  current expedited grace period, or NULL */
 				/*  if there is no such task.  If there */
 				/*  is no current expedited grace period, */
 				/*  then there cannot be any such task. */
+#ifdef CONFIG_RCU_BOOST
+	struct list_head *boost_tasks;
+				/* Pointer to first task that needs to be */
+				/*  priority-boosted, or NULL if no priority */
+				/*  boosting is needed.  If there is no */
+				/*  current or expedited grace period, there */
+				/*  can be no such task. */
+#endif /* #ifdef CONFIG_RCU_BOOST */
 	u8 gpnum;		/* Current grace period. */
 	u8 gpcpu;		/* Last grace period blocked by the CPU. */
 	u8 completed;		/* Last grace period completed. */
 				/*  If all three are equal, RCU is idle. */
+	s8 boosted_this_gp;	/* Has boosting already happened? */
+	unsigned long boost_time; /* When to start boosting (jiffies) */
 };
 
 static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
@@ -124,6 +157,130 @@ static int rcu_preempt_gp_in_progress(void)
 }
 
 /*
+ * Advance a ->blkd_tasks-list pointer to the next entry, returning
+ * NULL instead if at the end of the list.
+ */
+static struct list_head *rcu_next_node_entry(struct task_struct *t)
+{
+	struct list_head *np;
+
+	np = t->rcu_node_entry.next;
+	if (np == &rcu_preempt_ctrlblk.blkd_tasks)
+		np = NULL;
+	return np;
+}
+
+#ifdef CONFIG_RCU_BOOST
+
+#include "rtmutex_common.h"
+
+/*
+ * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
+ * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
+ */
+static int rcu_boost(void)
+{
+	unsigned long flags;
+	struct rt_mutex mtx;
+	struct list_head *np;
+	struct task_struct *t;
+
+	if (rcu_preempt_ctrlblk.boost_tasks == NULL)
+		return 0;  /* Nothing to boost. */
+	raw_local_irq_save(flags);
+	rcu_preempt_ctrlblk.boosted_this_gp++;
+	t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
+			 rcu_node_entry);
+	np = rcu_next_node_entry(t);
+	rt_mutex_init_proxy_locked(&mtx, t);
+	t->rcu_boost_mutex = &mtx;
+	t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
+	raw_local_irq_restore(flags);
+	rt_mutex_lock(&mtx);
+	rt_mutex_unlock(&mtx);
+	return rcu_preempt_ctrlblk.boost_tasks != NULL;
+}
+
+/*
+ * Check to see if it is now time to start boosting RCU readers blocking
+ * the current grace period, and, if so, tell the rcu_kthread_task to
+ * start boosting them.  If there is an expedited boost in progress,
+ * we wait for it to complete.
+ */
+static void rcu_initiate_boost(void)
+{
+	if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
+	    rcu_preempt_ctrlblk.boost_tasks == NULL &&
+	    rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
+	    ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
+		rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
+		invoke_rcu_kthread();
+	}
+}
+
+/*
+ * Initiate boosting for an expedited grace period.
+ */
+static void rcu_initiate_expedited_boost(void)
+{
+	unsigned long flags;
+
+	raw_local_irq_save(flags);
+	if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
+		rcu_preempt_ctrlblk.boost_tasks =
+			rcu_preempt_ctrlblk.blkd_tasks.next;
+		rcu_preempt_ctrlblk.boosted_this_gp = -1;
+		invoke_rcu_kthread();
+	}
+	raw_local_irq_restore(flags);
+}
+
+#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
+
+/*
+ * Do priority-boost accounting for the start of a new grace period.
+ */
+static void rcu_preempt_boost_start_gp(void)
+{
+	rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
+	if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
+		rcu_preempt_ctrlblk.boosted_this_gp = 0;
+}
+
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * If there is no RCU priority boosting, we don't boost.
+ */
+static int rcu_boost(void)
+{
+	return 0;
+}
+
+/*
+ * If there is no RCU priority boosting, we don't initiate boosting.
+ */
+static void rcu_initiate_boost(void)
+{
+}
+
+/*
+ * If there is no RCU priority boosting, we don't initiate expedited boosting.
+ */
+static void rcu_initiate_expedited_boost(void)
+{
+}
+
+/*
+ * If there is no RCU priority boosting, nothing to do at grace-period start.
+ */
+static void rcu_preempt_boost_start_gp(void)
+{
+}
+
+#endif /* else #ifdef CONFIG_RCU_BOOST */
+
+/*
  * Record a preemptible-RCU quiescent state for the specified CPU.  Note
  * that this just means that the task currently running on the CPU is
  * in a quiescent state.  There might be any number of tasks blocked
@@ -150,12 +307,14 @@ static void rcu_preempt_cpu_qs(void)
 	rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
 	current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
 
-	/*
-	 * If there is no GP, or if blocked readers are still blocking GP,
-	 * then there is nothing more to do.
-	 */
+	/* If there is no GP then there is nothing more to do.  */
 	if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
 		return;
+	/* If there are blocked readers, go check up on boosting. */
+	if (rcu_preempt_blocked_readers_cgp()) {
+		rcu_initiate_boost();
+		return;
+	}
 
 	/* Advance callbacks. */
 	rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
@@ -168,7 +327,7 @@ static void rcu_preempt_cpu_qs(void)
 
 	/* If there are done callbacks, cause them to be invoked. */
 	if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
-		invoke_rcu_cbs();
+		invoke_rcu_kthread();
 }
 
 /*
@@ -186,6 +345,9 @@ static void rcu_preempt_start_gp(void)
 			rcu_preempt_ctrlblk.gp_tasks =
 				rcu_preempt_ctrlblk.blkd_tasks.next;
 
+		/* Set up for RCU priority boosting. */
+		rcu_preempt_boost_start_gp();
+
 		/* If there is no running reader, CPU is done with GP. */
 		if (!rcu_preempt_running_reader())
 			rcu_preempt_cpu_qs();
@@ -306,14 +468,16 @@ static void rcu_read_unlock_special(struct task_struct *t)
 		 */
 		empty = !rcu_preempt_blocked_readers_cgp();
 		empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
-		np = t->rcu_node_entry.next;
-		if (np == &rcu_preempt_ctrlblk.blkd_tasks)
-			np = NULL;
+		np = rcu_next_node_entry(t);
 		list_del(&t->rcu_node_entry);
 		if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
 			rcu_preempt_ctrlblk.gp_tasks = np;
 		if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
 			rcu_preempt_ctrlblk.exp_tasks = np;
+#ifdef CONFIG_RCU_BOOST
+		if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
+			rcu_preempt_ctrlblk.boost_tasks = np;
+#endif /* #ifdef CONFIG_RCU_BOOST */
 		INIT_LIST_HEAD(&t->rcu_node_entry);
 
 		/*
@@ -333,6 +497,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
 		if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
 			rcu_report_exp_done();
 	}
+#ifdef CONFIG_RCU_BOOST
+	/* Unboost self if was boosted. */
+	if (special & RCU_READ_UNLOCK_BOOSTED) {
+		t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
+		rt_mutex_unlock(t->rcu_boost_mutex);
+		t->rcu_boost_mutex = NULL;
+	}
+#endif /* #ifdef CONFIG_RCU_BOOST */
 	local_irq_restore(flags);
 }
 
@@ -376,7 +548,7 @@ static void rcu_preempt_check_callbacks(void)
 		rcu_preempt_cpu_qs();
 	if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
 	    rcu_preempt_ctrlblk.rcb.donetail)
-		invoke_rcu_cbs();
+		invoke_rcu_kthread();
 	if (rcu_preempt_gp_in_progress() &&
 	    rcu_cpu_blocking_cur_gp() &&
 	    rcu_preempt_running_reader())
@@ -534,6 +706,8 @@ void synchronize_rcu_expedited(void)
 
 	/* Wait for tail of ->blkd_tasks list to drain. */
-	if (rcu_preempted_readers_exp())
+	if (rcu_preempted_readers_exp()) {
+		rcu_initiate_expedited_boost();
 		wait_event(sync_rcu_preempt_exp_wq,
 			   !rcu_preempted_readers_exp());
+	}
 
@@ -575,6 +748,15 @@ void exit_rcu(void)
 #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
 
 /*
+ * Because preemptible RCU does not exist, it is never necessary to
+ * boost preempted RCU readers.
+ */
+static int rcu_boost(void)
+{
+	return 0;
+}
+
+/*
  * Because preemptible RCU does not exist, it never has any callbacks
  * to check.
  */
@@ -614,3 +796,9 @@ void __init rcu_scheduler_starting(void)
 }
 
 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+#ifdef CONFIG_RCU_BOOST
+#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
+#else /* #ifdef CONFIG_RCU_BOOST */
+#define RCU_BOOST_PRIO 1
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
-- 
1.7.3.2



* [PATCH RFC tip/core/rcu 04/20] rcu: add tracing for TINY_RCU and TINY_PREEMPT_RCU
From: Paul E. McKenney @ 2010-12-17 20:54 UTC
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Paul E. McKenney

From: Paul E. McKenney <paul.mckenney@linaro.org>

Add tracing for the tiny RCU implementations, including statistics on
boosting when TINY_PREEMPT_RCU and RCU_BOOST are both enabled.
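
The per-event hooks compile away when CONFIG_RCU_TRACE=n, via the
statement-swallowing RCU_TRACE() macro added below, so a counter update
such as

	RCU_TRACE(rcp->qlen++);

costs nothing in non-traced builds.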

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 init/Kconfig            |    1 -
 kernel/rcutiny.c        |    4 +
 kernel/rcutiny_plugin.h |  232 +++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 226 insertions(+), 11 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 48efefc..929adf6 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -384,7 +384,6 @@ config PREEMPT_RCU
 
 config RCU_TRACE
 	bool "Enable tracing for RCU"
-	depends on TREE_RCU || TREE_PREEMPT_RCU
 	help
 	  This option provides tracing in RCU which presents stats
 	  in debugfs for debugging RCU implementation.
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 93d1665..0344937 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -144,6 +144,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 {
 	struct rcu_head *next, *list;
 	unsigned long flags;
+	RCU_TRACE(int cb_count = 0);
 
 	/* If no RCU callbacks ready to invoke, just return. */
 	if (&rcp->rcucblist == rcp->donetail)
@@ -169,7 +170,9 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 		list->func(list);
 		local_bh_enable();
 		list = next;
+		RCU_TRACE(cb_count++);
 	}
+	RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
 }
 
 /*
@@ -252,6 +255,7 @@ static void __call_rcu(struct rcu_head *head,
 	local_irq_save(flags);
 	*rcp->curtail = head;
 	rcp->curtail = &head->next;
+	RCU_TRACE(rcp->qlen++);
 	local_irq_restore(flags);
 }
 
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 24f4316..f4e0df0 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -23,12 +23,21 @@
  */
 
 #include <linux/kthread.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#ifdef CONFIG_RCU_TRACE
+#define RCU_TRACE(stmt)	stmt
+#else /* #ifdef CONFIG_RCU_TRACE */
+#define RCU_TRACE(stmt)
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
 
 /* Global control variables for rcupdate callback mechanism. */
 struct rcu_ctrlblk {
 	struct rcu_head *rcucblist;	/* List of pending callbacks (CBs). */
 	struct rcu_head **donetail;	/* ->next pointer of last "done" CB. */
 	struct rcu_head **curtail;	/* ->next pointer of last CB. */
+	RCU_TRACE(long qlen);		/* Number of pending CBs. */
 };
 
 /* Definition for rcupdate control block. */
@@ -90,8 +99,26 @@ struct rcu_preempt_ctrlblk {
 	u8 gpcpu;		/* Last grace period blocked by the CPU. */
 	u8 completed;		/* Last grace period completed. */
 				/*  If all three are equal, RCU is idle. */
+#ifdef CONFIG_RCU_BOOST
 	s8 boosted_this_gp;	/* Has boosting already happened? */
 	unsigned long boost_time; /* When to start boosting (jiffies) */
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#ifdef CONFIG_RCU_TRACE
+	unsigned long n_grace_periods;
+#ifdef CONFIG_RCU_BOOST
+	unsigned long n_tasks_boosted;
+	unsigned long n_exp_boosts;
+	unsigned long n_normal_boosts;
+	unsigned long n_normal_balk_blkd_tasks;
+	unsigned long n_normal_balk_gp_tasks;
+	unsigned long n_normal_balk_boost_tasks;
+	unsigned long n_normal_balk_boosted;
+	unsigned long n_normal_balk_notyet;
+	unsigned long n_normal_balk_nos;
+	unsigned long n_exp_balk_blkd_tasks;
+	unsigned long n_exp_balk_nos;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#endif /* #ifdef CONFIG_RCU_TRACE */
 };
 
 static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
@@ -170,6 +197,65 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t)
 	return np;
 }
 
+#ifdef CONFIG_RCU_TRACE
+
+#ifdef CONFIG_RCU_BOOST
+static void rcu_initiate_boost_trace(void);
+static void rcu_initiate_exp_boost_trace(void);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Dump additional statistics for TINY_PREEMPT_RCU.
+ */
+static void show_tiny_preempt_stats(struct seq_file *m)
+{
+	seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
+		   rcu_preempt_ctrlblk.rcb.qlen,
+		   rcu_preempt_ctrlblk.n_grace_periods,
+		   rcu_preempt_ctrlblk.gpnum,
+		   rcu_preempt_ctrlblk.gpcpu,
+		   rcu_preempt_ctrlblk.completed,
+		   "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
+		   "N."[!rcu_preempt_ctrlblk.gp_tasks],
+		   "E."[!rcu_preempt_ctrlblk.exp_tasks]);
+#ifdef CONFIG_RCU_BOOST
+	seq_printf(m, "             ttb=%c btg=",
+		   "B."[!rcu_preempt_ctrlblk.boost_tasks]);
+	switch (rcu_preempt_ctrlblk.boosted_this_gp) {
+	case -1:
+		seq_puts(m, "exp");
+		break;
+	case 0:
+		seq_puts(m, "no");
+		break;
+	case 1:
+		seq_puts(m, "done");
+		break;
+	default:
+		seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
+	}
+	seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
+		   rcu_preempt_ctrlblk.n_tasks_boosted,
+		   rcu_preempt_ctrlblk.n_exp_boosts,
+		   rcu_preempt_ctrlblk.n_normal_boosts,
+		   (int)(jiffies & 0xffff),
+		   (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
+	seq_printf(m, "             %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
+		   "normal balk",
+		   rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
+		   rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
+		   rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
+		   rcu_preempt_ctrlblk.n_normal_balk_boosted,
+		   rcu_preempt_ctrlblk.n_normal_balk_notyet,
+		   rcu_preempt_ctrlblk.n_normal_balk_nos);
+	seq_printf(m, "             exp balk: bt=%lu nos=%lu\n",
+		   rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
+		   rcu_preempt_ctrlblk.n_exp_balk_nos);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
 #ifdef CONFIG_RCU_BOOST
 
 #include "rtmutex_common.h"
@@ -197,6 +283,7 @@ static int rcu_boost(void)
 	t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
 	raw_local_irq_restore(flags);
 	rt_mutex_lock(&mtx);
+	RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
 	rt_mutex_unlock(&mtx);
 	return rcu_preempt_ctrlblk.boost_tasks != NULL;
 }
@@ -206,16 +293,27 @@ static int rcu_boost(void)
  * the current grace period, and, if so, tell the rcu_kthread_task to
  * start boosting them.  If there is an expedited boost in progress,
  * we wait for it to complete.
+ *
+ * If there are no blocked readers blocking the current grace period,
+ * return 0 to let the caller know, otherwise return 1.  Note that this
+ * return value is independent of whether or not boosting was done.
  */
-static void rcu_initiate_boost(void)
+static int rcu_initiate_boost(void)
 {
+	if (!rcu_preempt_blocked_readers_cgp()) {
+		RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
+		return 0;
+	}
 	if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
 	    rcu_preempt_ctrlblk.boost_tasks == NULL &&
 	    rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
 	    ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
 		rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
 		invoke_rcu_kthread();
-	}
+		RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
+	} else
+		RCU_TRACE(rcu_initiate_boost_trace());
+	return 1;
 }
 
 /*
@@ -231,7 +329,9 @@ static void rcu_initiate_expedited_boost(void)
 			rcu_preempt_ctrlblk.blkd_tasks.next;
 		rcu_preempt_ctrlblk.boosted_this_gp = -1;
 		invoke_rcu_kthread();
-	}
+		RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
+	} else
+		RCU_TRACE(rcu_initiate_exp_boost_trace());
 	raw_local_irq_restore(flags);
 }
 
@@ -258,10 +358,13 @@ static int rcu_boost(void)
 }
 
 /*
- * If there is no RCU priority boosting, we don't initiate boosting.
+ * If there is no RCU priority boosting, we don't initiate boosting,
+ * but we do indicate whether there are blocked readers blocking the
+ * current grace period.
  */
-static void rcu_initiate_boost(void)
+static int rcu_initiate_boost(void)
 {
+	return rcu_preempt_blocked_readers_cgp();
 }
 
 /*
@@ -308,13 +411,14 @@ static void rcu_preempt_cpu_qs(void)
 	current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
 
 	/* If there is no GP then there is nothing more to do.  */
-	if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
+	if (!rcu_preempt_gp_in_progress())
 		return;
-	/* If there are blocked readers, go check up on boosting. */
-	if (rcu_preempt_blocked_readers_cgp()) {
-		rcu_initiate_boost();
+	/*
+	 * Check up on boosting.  If there are no readers blocking the
+	 * current grace period, leave.
+	 */
+	if (rcu_initiate_boost())
 		return;
-	}
 
 	/* Advance callbacks. */
 	rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
@@ -339,6 +443,7 @@ static void rcu_preempt_start_gp(void)
 
 		/* Official start of GP. */
 		rcu_preempt_ctrlblk.gpnum++;
+		RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
 
 		/* Any blocked RCU readers block new GP. */
 		if (rcu_preempt_blocked_readers_any())
@@ -591,6 +696,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 	local_irq_save(flags);
 	*rcu_preempt_ctrlblk.nexttail = head;
 	rcu_preempt_ctrlblk.nexttail = &head->next;
+	RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
 	rcu_preempt_start_gp();  /* checks to see if GP needed. */
 	local_irq_restore(flags);
 }
@@ -747,6 +853,18 @@ void exit_rcu(void)
 
 #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
 
+#ifdef CONFIG_RCU_TRACE
+
+/*
+ * Because preemptible RCU does not exist, it is not necessary to
+ * dump out its statistics.
+ */
+static void show_tiny_preempt_stats(struct seq_file *m)
+{
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
 /*
  * Because preemptible RCU does not exist, it is never necessary to
  * boost preempted RCU readers.
@@ -802,3 +920,97 @@ void __init rcu_scheduler_starting(void)
 #else /* #ifdef CONFIG_RCU_BOOST */
 #define RCU_BOOST_PRIO 1
 #endif /* #else #ifdef CONFIG_RCU_BOOST */
+
+#ifdef CONFIG_RCU_TRACE
+
+#ifdef CONFIG_RCU_BOOST
+
+static void rcu_initiate_boost_trace(void)
+{
+	if (rcu_preempt_ctrlblk.gp_tasks == NULL)
+		rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
+	else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
+		rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
+	else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
+		rcu_preempt_ctrlblk.n_normal_balk_boosted++;
+	else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
+		rcu_preempt_ctrlblk.n_normal_balk_notyet++;
+	else
+		rcu_preempt_ctrlblk.n_normal_balk_nos++;
+}
+
+static void rcu_initiate_exp_boost_trace(void)
+{
+	if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
+		rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
+	else
+		rcu_preempt_ctrlblk.n_exp_balk_nos++;
+}
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
+{
+	unsigned long flags;
+
+	raw_local_irq_save(flags);
+	rcp->qlen -= n;
+	raw_local_irq_restore(flags);
+}
+
+/*
+ * Dump statistics for TINY_RCU, such as they are.
+ */
+static int show_tiny_stats(struct seq_file *m, void *unused)
+{
+	show_tiny_preempt_stats(m);
+	seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
+	seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
+	return 0;
+}
+
+static int show_tiny_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_tiny_stats, NULL);
+}
+
+static const struct file_operations show_tiny_stats_fops = {
+	.owner = THIS_MODULE,
+	.open = show_tiny_stats_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static struct dentry *rcudir;
+
+static int __init rcutiny_trace_init(void)
+{
+	struct dentry *retval;
+
+	rcudir = debugfs_create_dir("rcu", NULL);
+	if (!rcudir)
+		goto free_out;
+	retval = debugfs_create_file("rcudata", 0444, rcudir,
+				     NULL, &show_tiny_stats_fops);
+	if (!retval)
+		goto free_out;
+	return 0;
+free_out:
+	debugfs_remove_recursive(rcudir);
+	return 1;
+}
+
+static void __exit rcutiny_trace_cleanup(void)
+{
+	debugfs_remove_recursive(rcudir);
+}
+
+module_init(rcutiny_trace_init);
+module_exit(rcutiny_trace_cleanup);
+
+MODULE_AUTHOR("Paul E. McKenney");
+MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
+MODULE_LICENSE("GPL");
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
-- 
1.7.3.2



* [PATCH RFC tip/core/rcu 05/20] rcu: document TINY_RCU and TINY_PREEMPT_RCU tracing.
  2010-12-17 20:54 [PATCH tip/core/rcu 0/20] second preview of RCU patches for 2.6.38 Paul E. McKenney
                   ` (3 preceding siblings ...)
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 04/20] rcu: add tracing for TINY_RCU and TINY_PREEMPT_RCU Paul E. McKenney
@ 2010-12-17 20:54 ` Paul E. McKenney
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 06/20] rcu: Distinguish between boosting and boosted Paul E. McKenney
                   ` (14 subsequent siblings)
  19 siblings, 0 replies; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-17 20:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Paul E. McKenney, Paul E. McKenney

From: Paul E. McKenney <paul.mckenney@linaro.org>

Add the required verbiage to Documentation/RCU/trace.txt.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/RCU/trace.txt |  132 ++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 124 insertions(+), 8 deletions(-)

diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index a851118..ff6d3f1 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -1,18 +1,22 @@
 CONFIG_RCU_TRACE debugfs Files and Formats
 
 
-The rcutree implementation of RCU provides debugfs trace output that
-summarizes counters and state.  This information is useful for debugging
-RCU itself, and can sometimes also help to debug abuses of RCU.
-The following sections describe the debugfs files and formats.
+The rcutree and rcutiny implementations of RCU provide debugfs trace
+output that summarizes counters and state.  This information is useful for
+debugging RCU itself, and can sometimes also help to debug abuses of RCU.
+The following sections describe the debugfs files and formats, first
+for rcutree and next for rcutiny.
 
 
-Hierarchical RCU debugfs Files and Formats
+CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats
 
-This implementation of RCU provides three debugfs files under the
+These implementations of RCU provide five debugfs files under the
 top-level directory RCU: rcu/rcudata (which displays fields in struct
-rcu_data), rcu/rcugp (which displays grace-period counters), and
-rcu/rcuhier (which displays the struct rcu_node hierarchy).
+rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of
+rcu/rcudata), rcu/rcugp (which displays grace-period counters),
+rcu/rcuhier (which displays the struct rcu_node hierarchy), and
+rcu/rcu_pending (which displays counts of the reasons that the
+rcu_pending() function decided that there was core RCU work to do).
 
 The output of "cat rcu/rcudata" looks as follows:
 
@@ -326,3 +330,115 @@ o	"nn" is the number of times that this CPU needed nothing.  Alert
 	readers will note that the rcu "nn" number for a given CPU very
 	closely matches the rcu_bh "np" number for that same CPU.  This
 	is due to short-circuit evaluation in rcu_pending().
+
+
+CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats
+
+These implementations of RCU provide a single debugfs file under the
+top-level directory RCU, namely rcu/rcudata, which displays fields in
+rcu_bh_ctrlblk, rcu_sched_ctrlblk and, for CONFIG_TINY_PREEMPT_RCU,
+rcu_preempt_ctrlblk.
+
+The output of "cat rcu/rcudata" is as follows:
+
+rcu_preempt: qlen=24 gp=1097669 g197/p197/c197 tasks=...
+             ttb=. btg=no ntb=184 neb=0 nnb=183 j=01f7 bt=0274
+             normal balk: nt=1097669 gt=0 bt=371 b=0 ny=25073378 nos=0
+             exp balk: bt=0 nos=0
+rcu_sched: qlen: 0
+rcu_bh: qlen: 0
+
+This is split into rcu_preempt, rcu_sched, and rcu_bh sections, with the
+rcu_preempt section appearing only in CONFIG_TINY_PREEMPT_RCU builds.
+The last three lines of the rcu_preempt section appear only in
+CONFIG_RCU_BOOST kernel builds.  The fields are as follows:
+
+o	"qlen" is the number of RCU callbacks currently waiting either
+	for an RCU grace period or waiting to be invoked.  This is the
+	only field present for rcu_sched and rcu_bh, due to the
+	short-circuiting of grace periods in those two cases.
+
+o	"gp" is the number of grace periods that have completed.
+
+o	"g197/p197/c197" displays the grace-period state, with the
+	"g" number being the number of grace periods that have started
+	(mod 256), the "p" number being the number of grace periods
+	that the CPU has responded to (also mod 256), and the "c"
+	number being the number of grace periods that have completed
+	(once again mod 256).
+
+	Why have both "gp" and "g"?  Because the data flowing into
+	"gp" is only present in a CONFIG_RCU_TRACE kernel.
+
+o	"tasks" is a set of bits.  The first bit is "T" if there are
+	currently tasks that have recently blocked within an RCU
+	read-side critical section, the second bit is "N" if any of the
+	aforementioned tasks are blocking the current RCU grace period,
+	and the third bit is "E" if any of the aforementioned tasks are
+	blocking the current expedited grace period.  Each bit is "."
+	if the corresponding condition does not hold.
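+
+	(Aside: each of these one-character flags is produced by a
+	compact string-indexing idiom, visible in the tracing patch
+	earlier in this series:
+
+		"T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)]
+
+	Because a string literal may be indexed like an array, this
+	evaluates to 'T' when the list is non-empty and to '.' when
+	it is empty.)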
+
+o	"ttb" is a single bit.  It is "B" if any of the blocked tasks
+	need to be priority boosted and "." otherwise.
+
+o	"btg" indicates whether boosting has been carried out during
+	the current grace period, with "exp" indicating that boosting
+	is in progress for an expedited grace period, "no" indicating
+	that boosting has not yet started for a normal grace period,
+	"begun" indicating that boosting has bebug for a normal grace
+	period, and "done" indicating that boosting has completed for
+	a normal grace period.
+
+o	"ntb" is the total number of tasks subjected to RCU priority boosting
+	periods since boot.
+
+o	"neb" is the number of expedited grace periods that have had
+	to resort to RCU priority boosting since boot.
+
+o	"nnb" is the number of normal grace periods that have had
+	to resort to RCU priority boosting since boot.
+
+o	"j" is the low-order 12 bits of the jiffies counter in hexadecimal.
+
+o	"bt" is the low-order 12 bits of the value that the jiffies counter
+	will have at the next time that boosting is scheduled to begin.
+
+o	In the line beginning with "normal balk", the fields are as follows:
+
+	o	"nt" is the number of times that the system balked from
+		boosting because there were no blocked tasks to boost.
+		Note that the system will balk from boosting even if the
+		grace period is overdue when the currently running task
+		is looping within an RCU read-side critical section.
+		There is no point in boosting in this case, because
+		boosting a running task won't make it run any faster.
+
+	o	"gt" is the number of times that the system balked
+		from boosting because, although there were blocked tasks,
+		none of them were preventing the current grace period
+		from completing.
+
+	o	"bt" is the number of times that the system balked
+		from boosting because boosting was already in progress.
+
+	o	"b" is the number of times that the system balked from
+		boosting because boosting had already completed for
+		the grace period in question.
+
+	o	"ny" is the number of times that the system balked from
+		boosting because it was not yet time to start boosting
+		the grace period in question.
+
+	o	"nos" is the number of times that the system balked from
+		boosting for inexplicable ("not otherwise specified")
+		reasons.  This can actually happen due to races involving
+		increments of the jiffies counter.
+
+o	In the line beginning with "exp balk", the fields are as follows:
+
+	o	"bt" is the number of times that the system balked from
+		boosting because there were no blocked tasks to boost.
+
+	o	"nos" is the number of times that the system balked from
+		boosting for inexplicable ("not otherwise specified")
+		reasons.
-- 
1.7.3.2



* [PATCH RFC tip/core/rcu 06/20] rcu: Distinguish between boosting and boosted
  2010-12-17 20:54 [PATCH tip/core/rcu 0/20] second preview of RCU patches for 2.6.38 Paul E. McKenney
                   ` (4 preceding siblings ...)
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 05/20] rcu: document TINY_RCU and TINY_PREEMPT_RCU tracing Paul E. McKenney
@ 2010-12-17 20:54 ` Paul E. McKenney
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 07/20] rcu: get rid of obsolete "classic" names in TREE_RCU tracing Paul E. McKenney
                   ` (13 subsequent siblings)
  19 siblings, 0 replies; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-17 20:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Paul E. McKenney, Paul E. McKenney

From: Paul E. McKenney <paul.mckenney@linaro.org>

RCU priority boosting's tracing did not distinguish between ongoing
boosting and completion of boosting.  This commit therefore adds this
capability.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutiny_plugin.h |    4 ++++
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index f4e0df0..015abae 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -229,6 +229,9 @@ static void show_tiny_preempt_stats(struct seq_file *m)
 		seq_puts(m, "no");
 		break;
 	case 1:
+		seq_puts(m, "begun");
+		break;
+	case 2:
 		seq_puts(m, "done");
 		break;
 	default:
@@ -284,6 +287,7 @@ static int rcu_boost(void)
 	raw_local_irq_restore(flags);
 	rt_mutex_lock(&mtx);
 	RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
+	rcu_preempt_ctrlblk.boosted_this_gp++;
 	rt_mutex_unlock(&mtx);
 	return rcu_preempt_ctrlblk.boost_tasks != NULL;
 }
-- 
1.7.3.2



* [PATCH RFC tip/core/rcu 07/20] rcu: get rid of obsolete "classic" names in TREE_RCU tracing
  2010-12-17 20:54 [PATCH tip/core/rcu 0/20] second preview of RCU patches for 2.6.38 Paul E. McKenney
                   ` (5 preceding siblings ...)
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 06/20] rcu: Distinguish between boosting and boosted Paul E. McKenney
@ 2010-12-17 20:54 ` Paul E. McKenney
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 08/20] rcu,cleanup: move synchronize_sched_expedited() out of sched.c Paul E. McKenney
                   ` (12 subsequent siblings)
  19 siblings, 0 replies; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-17 20:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Paul E. McKenney

The TREE_RCU tracing had obsolete rcuclassic_trace_init() and
rcuclassic_trace_cleanup() function names.  This commit brings them
up to date: rcutree_trace_init() and rcutree_trace_cleanup(),
respectively.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree_trace.c |    8 ++++----
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d15430b..78ad3c3 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = {
 
 static struct dentry *rcudir;
 
-static int __init rcuclassic_trace_init(void)
+static int __init rcutree_trace_init(void)
 {
 	struct dentry *retval;
 
@@ -337,14 +337,14 @@ free_out:
 	return 1;
 }
 
-static void __exit rcuclassic_trace_cleanup(void)
+static void __exit rcutree_trace_cleanup(void)
 {
 	debugfs_remove_recursive(rcudir);
 }
 
 
-module_init(rcuclassic_trace_init);
-module_exit(rcuclassic_trace_cleanup);
+module_init(rcutree_trace_init);
+module_exit(rcutree_trace_cleanup);
 
 MODULE_AUTHOR("Paul E. McKenney");
 MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
-- 
1.7.3.2



* [PATCH RFC tip/core/rcu 08/20] rcu,cleanup: move synchronize_sched_expedited() out of sched.c
  2010-12-17 20:54 [PATCH tip/core/rcu 0/20] second preview of RCU patches for 2.6.38 Paul E. McKenney
                   ` (6 preceding siblings ...)
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 07/20] rcu: get rid of obsolete "classic" names in TREE_RCU tracing Paul E. McKenney
@ 2010-12-17 20:54 ` Paul E. McKenney
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 09/20] rcu,cleanup: simplify the code when cpu is dying Paul E. McKenney
                   ` (11 subsequent siblings)
  19 siblings, 0 replies; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-17 20:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Paul E. McKenney

From: Lai Jiangshan <laijs@cn.fujitsu.com>

The first version of synchronize_sched_expedited() used the migration
code in the scheduler, and was therefore implemented in kernel/sched.c.
However, the more recent version of this code no longer uses the
migration code, so this commit moves it to the main RCU source files.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h |    1 -
 include/linux/rcutiny.h  |    5 +++
 include/linux/rcutree.h  |    1 +
 kernel/rcutree_plugin.h  |   71 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched.c           |   69 --------------------------------------------
 5 files changed, 77 insertions(+), 70 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 7142ee3..49e8e16 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -66,7 +66,6 @@ extern void call_rcu_sched(struct rcu_head *head,
 extern void synchronize_sched(void);
 extern void rcu_barrier_bh(void);
 extern void rcu_barrier_sched(void);
-extern void synchronize_sched_expedited(void);
 extern int sched_expedited_torture_stats(char *page);
 
 static inline void __rcu_read_lock_bh(void)
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index ea025a6..30ebd7c 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -60,6 +60,11 @@ static inline void synchronize_rcu_bh_expedited(void)
 	synchronize_sched();
 }
 
+static inline void synchronize_sched_expedited(void)
+{
+	synchronize_sched();
+}
+
 #ifdef CONFIG_TINY_RCU
 
 static inline void rcu_preempt_note_context_switch(void)
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index c0e9683..3a93348 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -48,6 +48,7 @@ static inline void exit_rcu(void)
 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
 
 extern void synchronize_rcu_bh(void);
+extern void synchronize_sched_expedited(void);
 extern void synchronize_rcu_expedited(void);
 
 static inline void synchronize_rcu_bh_expedited(void)
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 71a4147..21df7f3 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
  */
 
 #include <linux/delay.h>
+#include <linux/stop_machine.h>
 
 /*
  * Check the RCU kernel configuration parameters and print informative
@@ -1014,6 +1015,76 @@ static void __init __rcu_init_preempt(void)
 
 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
 
+#ifndef CONFIG_SMP
+
+void synchronize_sched_expedited(void)
+{
+	cond_resched();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#else /* #ifndef CONFIG_SMP */
+
+static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
+
+static int synchronize_sched_expedited_cpu_stop(void *data)
+{
+	/*
+	 * There must be a full memory barrier on each affected CPU
+	 * between the time that try_stop_cpus() is called and the
+	 * time that it returns.
+	 *
+	 * In the current initial implementation of cpu_stop, the
+	 * above condition is already met when the control reaches
+	 * this point and the following smp_mb() is not strictly
+	 * necessary.  Do smp_mb() anyway for documentation and
+	 * robustness against future implementation changes.
+	 */
+	smp_mb(); /* See above comment block. */
+	return 0;
+}
+
+/*
+ * Wait for an rcu-sched grace period to elapse, but use "big hammer"
+ * approach to force grace period to end quickly.  This consumes
+ * significant time on all CPUs, and is thus not recommended for
+ * any sort of common-case code.
+ *
+ * Note that it is illegal to call this function while holding any
+ * lock that is acquired by a CPU-hotplug notifier.  Failing to
+ * observe this restriction will result in deadlock.
+ */
+void synchronize_sched_expedited(void)
+{
+	int snap, trycount = 0;
+
+	smp_mb();  /* ensure prior mod happens before capturing snap. */
+	snap = atomic_read(&synchronize_sched_expedited_count) + 1;
+	get_online_cpus();
+	while (try_stop_cpus(cpu_online_mask,
+			     synchronize_sched_expedited_cpu_stop,
+			     NULL) == -EAGAIN) {
+		put_online_cpus();
+		if (trycount++ < 10)
+			udelay(trycount * num_online_cpus());
+		else {
+			synchronize_sched();
+			return;
+		}
+		if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
+			smp_mb(); /* ensure test happens before caller kfree */
+			return;
+		}
+		get_online_cpus();
+	}
+	atomic_inc(&synchronize_sched_expedited_count);
+	smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
+	put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#endif /* #else #ifndef CONFIG_SMP */
+
 #if !defined(CONFIG_RCU_FAST_NO_HZ)
 
 /*
diff --git a/kernel/sched.c b/kernel/sched.c
index ae8f75a..d1e8889 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9131,72 +9131,3 @@ struct cgroup_subsys cpuacct_subsys = {
 };
 #endif	/* CONFIG_CGROUP_CPUACCT */
 
-#ifndef CONFIG_SMP
-
-void synchronize_sched_expedited(void)
-{
-	barrier();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#else /* #ifndef CONFIG_SMP */
-
-static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
-
-static int synchronize_sched_expedited_cpu_stop(void *data)
-{
-	/*
-	 * There must be a full memory barrier on each affected CPU
-	 * between the time that try_stop_cpus() is called and the
-	 * time that it returns.
-	 *
-	 * In the current initial implementation of cpu_stop, the
-	 * above condition is already met when the control reaches
-	 * this point and the following smp_mb() is not strictly
-	 * necessary.  Do smp_mb() anyway for documentation and
-	 * robustness against future implementation changes.
-	 */
-	smp_mb(); /* See above comment block. */
-	return 0;
-}
-
-/*
- * Wait for an rcu-sched grace period to elapse, but use "big hammer"
- * approach to force grace period to end quickly.  This consumes
- * significant time on all CPUs, and is thus not recommended for
- * any sort of common-case code.
- *
- * Note that it is illegal to call this function while holding any
- * lock that is acquired by a CPU-hotplug notifier.  Failing to
- * observe this restriction will result in deadlock.
- */
-void synchronize_sched_expedited(void)
-{
-	int snap, trycount = 0;
-
-	smp_mb();  /* ensure prior mod happens before capturing snap. */
-	snap = atomic_read(&synchronize_sched_expedited_count) + 1;
-	get_online_cpus();
-	while (try_stop_cpus(cpu_online_mask,
-			     synchronize_sched_expedited_cpu_stop,
-			     NULL) == -EAGAIN) {
-		put_online_cpus();
-		if (trycount++ < 10)
-			udelay(trycount * num_online_cpus());
-		else {
-			synchronize_sched();
-			return;
-		}
-		if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
-			smp_mb(); /* ensure test happens before caller kfree */
-			return;
-		}
-		get_online_cpus();
-	}
-	atomic_inc(&synchronize_sched_expedited_count);
-	smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
-	put_online_cpus();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#endif /* #else #ifndef CONFIG_SMP */
-- 
1.7.3.2



* [PATCH RFC tip/core/rcu 09/20] rcu,cleanup: simplify the code when cpu is dying
  2010-12-17 20:54 [PATCH tip/core/rcu 0/20] second preview of RCU patches for 2.6.38 Paul E. McKenney
                   ` (7 preceding siblings ...)
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 08/20] rcu,cleanup: move synchronize_sched_expedited() out of sched.c Paul E. McKenney
@ 2010-12-17 20:54 ` Paul E. McKenney
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 10/20] rcu: update documentation/comments for Lai's adoption patch Paul E. McKenney
                   ` (10 subsequent siblings)
  19 siblings, 0 replies; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-17 20:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Paul E. McKenney

From: Lai Jiangshan <laijs@cn.fujitsu.com>

When we handle the CPU_DYING notifier, the whole system is stopped except
for the current CPU.  We therefore need no synchronization with the other
CPUs.  This allows us to move any orphaned RCU callbacks directly to the
list of any online CPU without needing to run them through the global
orphan lists.  These global orphan lists can therefore be dispensed with.
This commit makes these changes, though it currently victimizes CPU 0 @@@.
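
For reference, these callback lists are singly linked with tail
pointers, so the move is an O(1) splice.  Below is a minimal
self-contained sketch of the idiom (hypothetical names, locking
omitted; the real code is rcu_send_cbs_to_online() in the diff):

	struct rcu_head;		/* opaque callback, as in the kernel */

	struct cb_list {
		struct rcu_head *head;
		struct rcu_head **tail;	/* last element's ->next, or &head */
	};

	/* Append src onto dst and empty src; assumes irqs are disabled. */
	static void splice_cbs(struct cb_list *dst, struct cb_list *src)
	{
		if (src->head == NULL)
			return;
		*dst->tail = src->head;	/* link src onto dst's tail */
		dst->tail = src->tail;	/* dst's tail is now src's tail */
		src->head = NULL;	/* empty src... */
		src->tail = &src->head;	/* ...and reset its tail pointer */
	}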

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c        |   81 ++++++++++++----------------------------------
 kernel/rcutree.h        |   16 ++-------
 kernel/rcutree_plugin.h |    8 ++--
 kernel/rcutree_trace.c  |    4 +-
 4 files changed, 31 insertions(+), 78 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ccdc04c..669d7fe 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
 	.gpnum = -300, \
 	.completed = -300, \
 	.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
-	.orphan_cbs_list = NULL, \
-	.orphan_cbs_tail = &structname.orphan_cbs_list, \
-	.orphan_qlen = 0, \
 	.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
 	.n_force_qs = 0, \
 	.n_force_qs_ngp = 0, \
@@ -984,53 +981,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
 #ifdef CONFIG_HOTPLUG_CPU
 
 /*
- * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
- * specified flavor of RCU.  The callbacks will be adopted by the next
- * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
- * comes first.  Because this is invoked from the CPU_DYING notifier,
- * irqs are already disabled.
+ * Move a dying CPU's RCU callbacks to an online CPU's callback list.
+ * Synchronization is not required because this function executes
+ * in stop_machine() context.
  */
-static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
+static void rcu_send_cbs_to_online(struct rcu_state *rsp)
 {
 	int i;
+	/* current DYING CPU is cleared in the cpu_online_mask */
+	int receive_cpu = cpumask_any(cpu_online_mask);
 	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+	struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
 
 	if (rdp->nxtlist == NULL)
 		return;  /* irqs disabled, so comparison is stable. */
-	raw_spin_lock(&rsp->onofflock);  /* irqs already disabled. */
-	*rsp->orphan_cbs_tail = rdp->nxtlist;
-	rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
+
+	*receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+	receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+	receive_rdp->qlen += rdp->qlen;
+	receive_rdp->n_cbs_adopted += rdp->qlen;
+	rdp->n_cbs_orphaned += rdp->qlen;
+
 	rdp->nxtlist = NULL;
 	for (i = 0; i < RCU_NEXT_SIZE; i++)
 		rdp->nxttail[i] = &rdp->nxtlist;
-	rsp->orphan_qlen += rdp->qlen;
-	rdp->n_cbs_orphaned += rdp->qlen;
 	rdp->qlen = 0;
-	raw_spin_unlock(&rsp->onofflock);  /* irqs remain disabled. */
-}
-
-/*
- * Adopt previously orphaned RCU callbacks.
- */
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
-{
-	unsigned long flags;
-	struct rcu_data *rdp;
-
-	raw_spin_lock_irqsave(&rsp->onofflock, flags);
-	rdp = this_cpu_ptr(rsp->rda);
-	if (rsp->orphan_cbs_list == NULL) {
-		raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
-		return;
-	}
-	*rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
-	rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
-	rdp->qlen += rsp->orphan_qlen;
-	rdp->n_cbs_adopted += rsp->orphan_qlen;
-	rsp->orphan_cbs_list = NULL;
-	rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
-	rsp->orphan_qlen = 0;
-	raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 
 /*
@@ -1081,8 +1056,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	if (need_report & RCU_OFL_TASKS_EXP_GP)
 		rcu_report_exp_rnp(rsp, rnp);
-
-	rcu_adopt_orphan_cbs(rsp);
 }
 
 /*
@@ -1100,11 +1073,7 @@ static void rcu_offline_cpu(int cpu)
 
 #else /* #ifdef CONFIG_HOTPLUG_CPU */
 
-static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
-{
-}
-
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+static void rcu_send_cbs_to_online(struct rcu_state *rsp)
 {
 }
 
@@ -1702,10 +1671,7 @@ static void _rcu_barrier(struct rcu_state *rsp,
 	 * early.
 	 */
 	atomic_set(&rcu_barrier_cpu_count, 1);
-	preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
-	rcu_adopt_orphan_cbs(rsp);
 	on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
-	preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
 	if (atomic_dec_and_test(&rcu_barrier_cpu_count))
 		complete(&rcu_barrier_completion);
 	wait_for_completion(&rcu_barrier_completion);
@@ -1831,18 +1797,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 	case CPU_DYING:
 	case CPU_DYING_FROZEN:
 		/*
-		 * preempt_disable() in _rcu_barrier() prevents stop_machine(),
-		 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
-		 * returns, all online cpus have queued rcu_barrier_func().
-		 * The dying CPU clears its cpu_online_mask bit and
-		 * moves all of its RCU callbacks to ->orphan_cbs_list
-		 * in the context of stop_machine(), so subsequent calls
-		 * to _rcu_barrier() will adopt these callbacks and only
-		 * then queue rcu_barrier_func() on all remaining CPUs.
+		 * The whole machine is "stopped" except this cpu, so we can
+		 * touch any data without introducing corruption. And we send
+		 * the callbacks to an attribute chosen online cpu.
 		 */
-		rcu_send_cbs_to_orphanage(&rcu_bh_state);
-		rcu_send_cbs_to_orphanage(&rcu_sched_state);
-		rcu_preempt_send_cbs_to_orphanage();
+		rcu_send_cbs_to_online(&rcu_bh_state);
+		rcu_send_cbs_to_online(&rcu_sched_state);
+		rcu_preempt_send_cbs_to_online();
 		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 91d4170..1a54be2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -203,8 +203,8 @@ struct rcu_data {
 	long		qlen_last_fqs_check;
 					/* qlen at last check for QS forcing */
 	unsigned long	n_cbs_invoked;	/* count of RCU cbs invoked. */
-	unsigned long	n_cbs_orphaned;	/* RCU cbs sent to orphanage. */
-	unsigned long	n_cbs_adopted;	/* RCU cbs adopted from orphanage. */
+	unsigned long   n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
+	unsigned long   n_cbs_adopted;  /* RCU cbs adopted from dying CPU */
 	unsigned long	n_force_qs_snap;
 					/* did other CPU force QS recently? */
 	long		blimit;		/* Upper limit on a processed batch */
@@ -309,15 +309,7 @@ struct rcu_state {
 	/* End of fields guarded by root rcu_node's lock. */
 
 	raw_spinlock_t onofflock;		/* exclude on/offline and */
-						/*  starting new GP.  Also */
-						/*  protects the following */
-						/*  orphan_cbs fields. */
-	struct rcu_head *orphan_cbs_list;	/* list of rcu_head structs */
-						/*  orphaned by all CPUs in */
-						/*  a given leaf rcu_node */
-						/*  going offline. */
-	struct rcu_head **orphan_cbs_tail;	/* And tail pointer. */
-	long orphan_qlen;			/* Number of orphaned cbs. */
+						/*  starting new GP. */
 	raw_spinlock_t fqslock;			/* Only one task forcing */
 						/*  quiescent states. */
 	unsigned long jiffies_force_qs;		/* Time at which to invoke */
@@ -390,7 +382,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
 static int rcu_preempt_pending(int cpu);
 static int rcu_preempt_needs_cpu(int cpu);
 static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
-static void rcu_preempt_send_cbs_to_orphanage(void);
+static void rcu_preempt_send_cbs_to_online(void);
 static void __init __rcu_init_preempt(void);
 static void rcu_needs_cpu_flush(void);
 
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 21df7f3..0de359b 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -774,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
 }
 
 /*
- * Move preemptable RCU's callbacks to ->orphan_cbs_list.
+ * Move preemptable DYING RCU's callbacks to other online CPU.
  */
-static void rcu_preempt_send_cbs_to_orphanage(void)
+static void rcu_preempt_send_cbs_to_online(void)
 {
-	rcu_send_cbs_to_orphanage(&rcu_preempt_state);
+	rcu_send_cbs_to_online(&rcu_preempt_state);
 }
 
 /*
@@ -1002,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
 /*
  * Because there is no preemptable RCU, there are no callbacks to move.
  */
-static void rcu_preempt_send_cbs_to_orphanage(void)
+static void rcu_preempt_send_cbs_to_online(void)
 {
 }
 
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 78ad3c3..c8e9785 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
 
 	gpnum = rsp->gpnum;
 	seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
-		      "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
+		      "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
 		   rsp->completed, gpnum, rsp->signaled,
 		   (long)(rsp->jiffies_force_qs - jiffies),
 		   (int)(jiffies & 0xffff),
 		   rsp->n_force_qs, rsp->n_force_qs_ngp,
 		   rsp->n_force_qs - rsp->n_force_qs_ngp,
-		   rsp->n_force_qs_lh, rsp->orphan_qlen);
+		   rsp->n_force_qs_lh);
 	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
 		if (rnp->level != level) {
 			seq_puts(m, "\n");
-- 
1.7.3.2



* [PATCH RFC tip/core/rcu 10/20] rcu: update documentation/comments for Lai's adoption patch
  2010-12-17 20:54 [PATCH tip/core/rcu 0/20] second preview of RCU patches for 2.6.38 Paul E. McKenney
                   ` (8 preceding siblings ...)
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 09/20] rcu,cleanup: simplify the code when cpu is dying Paul E. McKenney
@ 2010-12-17 20:54 ` Paul E. McKenney
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 11/20] rcu: fix race condition in synchronize_sched_expedited() Paul E. McKenney
                   ` (9 subsequent siblings)
  19 siblings, 0 replies; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-17 20:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Paul E. McKenney

Lai's RCU-callback immediate-adoption patch changes the RCU tracing
output, so update trace.txt.  Also update a few comments to clarify
the synchronization design.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/RCU/trace.txt |   12 ++++--------
 kernel/rcutree.c            |   10 ++++++----
 kernel/rcutree_plugin.h     |    2 +-
 3 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index ff6d3f1..6a8c73f 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -134,7 +134,8 @@ o	"ci" is the number of RCU callbacks that have been invoked for
 	been registered in absence of CPU-hotplug activity.
 
 o	"co" is the number of RCU callbacks that have been orphaned due to
-	this CPU going offline.
+	this CPU going offline.  These orphaned callbacks have been moved
+	to an arbitrarily chosen online CPU.
 
 o	"ca" is the number of RCU callbacks that have been adopted due to
 	other CPUs going offline.  Note that ci+co-ca+ql is the number of
@@ -172,12 +173,12 @@ o	"gpnum" is the number of grace periods that have started.  It is
 
 The output of "cat rcu/rcuhier" looks as follows, with very long lines:
 
-c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0
+c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
 1/1 .>. 0:127 ^0    
 3/3 .>. 0:35 ^0    0/0 .>. 36:71 ^1    0/0 .>. 72:107 ^2    0/0 .>. 108:127 ^3    
 3/3f .>. 0:5 ^0    2/3 .>. 6:11 ^1    0/0 .>. 12:17 ^2    0/0 .>. 18:23 ^3    0/0 .>. 24:29 ^4    0/0 .>. 30:35 ^5    0/0 .>. 36:41 ^0    0/0 .>. 42:47 ^1    0/0 .>. 48:53 ^2    0/0 .>. 54:59 ^3    0/0 .>. 60:65 ^4    0/0 .>. 66:71 ^5    0/0 .>. 72:77 ^0    0/0 .>. 78:83 ^1    0/0 .>. 84:89 ^2    0/0 .>. 90:95 ^3    0/0 .>. 96:101 ^4    0/0 .>. 102:107 ^5    0/0 .>. 108:113 ^0    0/0 .>. 114:119 ^1    0/0 .>. 120:125 ^2    0/0 .>. 126:127 ^3    
 rcu_bh:
-c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0
+c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
 0/1 .>. 0:127 ^0    
 0/3 .>. 0:35 ^0    0/0 .>. 36:71 ^1    0/0 .>. 72:107 ^2    0/0 .>. 108:127 ^3    
 0/3f .>. 0:5 ^0    0/3 .>. 6:11 ^1    0/0 .>. 12:17 ^2    0/0 .>. 18:23 ^3    0/0 .>. 24:29 ^4    0/0 .>. 30:35 ^5    0/0 .>. 36:41 ^0    0/0 .>. 42:47 ^1    0/0 .>. 48:53 ^2    0/0 .>. 54:59 ^3    0/0 .>. 60:65 ^4    0/0 .>. 66:71 ^5    0/0 .>. 72:77 ^0    0/0 .>. 78:83 ^1    0/0 .>. 84:89 ^2    0/0 .>. 90:95 ^3    0/0 .>. 96:101 ^4    0/0 .>. 102:107 ^5    0/0 .>. 108:113 ^0    0/0 .>. 114:119 ^1    0/0 .>. 120:125 ^2    0/0 .>. 126:127 ^3
@@ -216,11 +217,6 @@ o	"fqlh" is the number of calls to force_quiescent_state() that
 	exited immediately (without even being counted in nfqs above)
 	due to contention on ->fqslock.
 
-o	"oqlen" is the number of callbacks on the "orphan" callback
-	list.  RCU callbacks are placed on this list by CPUs going
-	offline, and are "adopted" either by the CPU helping the outgoing
-	CPU or by the next rcu_barrier*() call, whichever comes first.
-
 o	Each element of the form "1/1 0:127 ^0" represents one struct
 	rcu_node.  Each line represents one level of the hierarchy, from
 	root to leaves.  It is best to think of the rcu_data structures
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 669d7fe..120820f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1668,7 +1668,9 @@ static void _rcu_barrier(struct rcu_state *rsp,
 	 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
 	 * might complete its grace period before all of the other CPUs
 	 * did their increment, causing this function to return too
-	 * early.
+	 * early.  Note that on_each_cpu() disables irqs, which prevents
+	 * any CPUs from coming online or going offline until each online
+	 * CPU has queued its RCU-barrier callback.
 	 */
 	atomic_set(&rcu_barrier_cpu_count, 1);
 	on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
@@ -1797,9 +1799,9 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 	case CPU_DYING:
 	case CPU_DYING_FROZEN:
 		/*
-		 * The whole machine is "stopped" except this cpu, so we can
-		 * touch any data without introducing corruption. And we send
-		 * the callbacks to an attribute chosen online cpu.
+		 * The whole machine is "stopped" except this CPU, so we can
+		 * touch any data without introducing corruption. We send the
+		 * dying CPU's callbacks to an arbitrarily chosen online CPU.
 		 */
 		rcu_send_cbs_to_online(&rcu_bh_state);
 		rcu_send_cbs_to_online(&rcu_sched_state);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 0de359b..643c8f6 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -774,7 +774,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
 }
 
 /*
- * Move preemptable DYING RCU's callbacks to other online CPU.
+ * Move preemptable RCU's callbacks from dying CPU to other online CPU.
  */
 static void rcu_preempt_send_cbs_to_online(void)
 {
-- 
1.7.3.2



* [PATCH RFC tip/core/rcu 11/20] rcu: fix race condition in synchronize_sched_expedited()
  2010-12-17 20:54 [PATCH tip/core/rcu 0/20] second preview of RCU patches for 2.6.38 Paul E. McKenney
                   ` (9 preceding siblings ...)
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 10/20] rcu: update documentation/comments for Lai's adoption patch Paul E. McKenney
@ 2010-12-17 20:54 ` Paul E. McKenney
  2010-12-18 15:52   ` Tejun Heo
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 12/20] rcu: Make synchronize_srcu_expedited() fast if running readers Paul E. McKenney
                   ` (8 subsequent siblings)
  19 siblings, 1 reply; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-17 20:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Paul E. McKenney, Tejun Heo

The new (early 2010) implementation of synchronize_sched_expedited() uses
try_stop_cpus() to force a context switch on every CPU.  It also permits
concurrent calls to synchronize_sched_expedited() to share a single call
to try_stop_cpus() through use of an atomically incremented
synchronize_sched_expedited_count variable.  Unfortunately, this is
subject to failure as follows:

o	Task A invokes synchronize_sched_expedited(), try_stop_cpus()
	succeeds, but Task A is preempted before getting to the atomic
	increment of synchronize_sched_expedited_count.

o	Task B also invokes synchronize_sched_expedited(), with exactly
	the same outcome as Task A.

o	Task C also invokes synchronize_sched_expedited(), again with
	exactly the same outcome as Tasks A and B.

o	Task D also invokes synchronize_sched_expedited(), but only
	gets as far as acquiring the mutex within try_stop_cpus()
	before being preempted, interrupted, or otherwise delayed.

o	Task E also invokes synchronize_sched_expedited(), but only
	gets to the snapshotting of synchronize_sched_expedited_count.

o	Tasks A, B, and C all increment synchronize_sched_expedited_count.

o	Task E fails to get the mutex, so checks the new value
	of synchronize_sched_expedited_count.  It finds that the
	value has increased, so (wrongly) assumes that its work
	has been done, returning despite there having been no
	expedited grace period since it began.

The solution is to have the lowest-numbered CPU atomically increment
the synchronize_sched_expedited_count variable within the
synchronize_sched_expedited_cpu_stop() function, which is under
the protection of the mutex acquired by try_stop_cpus().  However, this
also requires that piggybacking tasks wait for three rather than two
instances of try_stop_cpus(), because we cannot control the order in
which the per-CPU callback functions occur.
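
To make the race concrete, here is a compressed sketch of the pre-fix
piggybacking logic (condensed from the code being modified below;
housekeeping such as get/put_online_cpus() is elided):

	snap = atomic_read(&synchronize_sched_expedited_count) + 1;
	while (try_stop_cpus(cpu_online_mask,
			     synchronize_sched_expedited_cpu_stop,
			     NULL) == -EAGAIN) {
		if (atomic_read(&synchronize_sched_expedited_count) - snap > 0)
			return;	/* WRONG: these increments can correspond
				 * to grace periods that completed before
				 * this task took its snapshot (Tasks A-C
				 * in the scenario above). */
	}
	atomic_inc(&synchronize_sched_expedited_count);	/* can be delayed. */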

Cc: Tejun Heo <tj@kernel.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree_plugin.h |   18 ++++++++++++++++--
 1 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 643c8f6..c22c4ef 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1041,6 +1041,8 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
 	 * robustness against future implementation changes.
 	 */
 	smp_mb(); /* See above comment block. */
+	if (cpumask_first(cpu_online_mask) == smp_processor_id())
+		atomic_inc(&synchronize_sched_expedited_count);
 	return 0;
 }
 
@@ -1053,13 +1055,26 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
  * Note that it is illegal to call this function while holding any
  * lock that is acquired by a CPU-hotplug notifier.  Failing to
  * observe this restriction will result in deadlock.
+ *
+ * The synchronize_sched_expedited_cpu_stop() function is called
+ * in stop-CPU context, but in order to keep overhead down to a dull
+ * roar, we don't force this function to wait for its counterparts
+ * on other CPUs.  One instance of this function will increment the
+ * synchronize_sched_expedited_count variable per call to
+ * try_stop_cpus(), but there is no guarantee what order this instance
+ * will occur in.  The worst case is that it is last on one call
+ * to try_stop_cpus(), and the first on the next call.  This means
+ * that piggybacking requires that synchronize_sched_expedited_count
+ * be incremented by 3: this guarantees that the piggybacking
+ * task has waited through an entire cycle of context switches,
+ * even in the worst case.
  */
 void synchronize_sched_expedited(void)
 {
 	int snap, trycount = 0;
 
 	smp_mb();  /* ensure prior mod happens before capturing snap. */
-	snap = atomic_read(&synchronize_sched_expedited_count) + 1;
+	snap = atomic_read(&synchronize_sched_expedited_count) + 2;
 	get_online_cpus();
 	while (try_stop_cpus(cpu_online_mask,
 			     synchronize_sched_expedited_cpu_stop,
@@ -1077,7 +1092,6 @@ void synchronize_sched_expedited(void)
 		}
 		get_online_cpus();
 	}
-	atomic_inc(&synchronize_sched_expedited_count);
 	smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
 	put_online_cpus();
 }
-- 
1.7.3.2



* [PATCH RFC tip/core/rcu 12/20] rcu: Make synchronize_srcu_expedited() fast if running readers
  2010-12-17 20:54 [PATCH tip/core/rcu 0/20] second preview of RCU patches for 2.6.38 Paul E. McKenney
                   ` (10 preceding siblings ...)
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 11/20] rcu: fix race condition in synchronize_sched_expedited() Paul E. McKenney
@ 2010-12-17 20:54 ` Paul E. McKenney
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 13/20] rcu: increase synchronize_sched_expedited() batching Paul E. McKenney
                   ` (7 subsequent siblings)
  19 siblings, 0 replies; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-17 20:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Paul E. McKenney

The synchronize_srcu_expedited() function is currently quick if there
are no active readers, but will delay a full jiffy if there are any.
If these readers leave their SRCU read-side critical sections quickly,
this is way too long to wait.  So this commit first waits ten microseconds,
and only then falls back to jiffy-at-a-time waiting.

Reported-by: Avi Kivity <avi@redhat.com>
Reported-by: Marcelo Tosatti <mtosatti@redhat.com>
Tested-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 init/Kconfig  |   15 +++++++++++++++
 kernel/srcu.c |    8 +++++++-
 2 files changed, 22 insertions(+), 1 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 929adf6..3551824 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -488,6 +488,21 @@ config RCU_BOOST_DELAY
 
 	  Accept the default if unsure.
 
+config SRCU_SYNCHRONIZE_DELAY
+	int "Microseconds to delay before waiting for readers"
+	range 0 20
+	default 10
+	help
+	  This option controls how long SRCU delays before entering its
+	  loop waiting on SRCU readers.  The purpose of this loop is
+	  to avoid the unconditional context-switch penalty that would
+	  otherwise be incurred if there was an active SRCU reader,
+	  in a manner similar to adaptive locking schemes.  This should
+	  be set to be a bit longer than the common-case SRCU read-side
+	  critical-section overhead.
+
+	  Accept the default if unsure.
+
 endmenu # "RCU Subsystem"
 
 config IKCONFIG
diff --git a/kernel/srcu.c b/kernel/srcu.c
index c71e075..98d8c1e 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -31,6 +31,7 @@
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
+#include <linux/delay.h>
 #include <linux/srcu.h>
 
 static int init_srcu_struct_fields(struct srcu_struct *sp)
@@ -203,9 +204,14 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
 	 * all srcu_read_lock() calls using the old counters have completed.
 	 * Their corresponding critical sections might well be still
 	 * executing, but the srcu_read_lock() primitives themselves
-	 * will have finished executing.
+	 * will have finished executing.  We initially give readers
+	 * an arbitrarily chosen 10 microseconds to get out of their
+	 * SRCU read-side critical sections, then loop waiting 1/HZ
+	 * seconds per iteration.
 	 */
 
+	if (srcu_readers_active_idx(sp, idx))
+		udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);
 	while (srcu_readers_active_idx(sp, idx))
 		schedule_timeout_interruptible(1);
 
-- 
1.7.3.2



* [PATCH RFC tip/core/rcu 13/20] rcu: increase synchronize_sched_expedited() batching
  2010-12-17 20:54 [PATCH tip/core/rcu 0/20] second preview of RCU patches for 2.6.38 Paul E. McKenney
                   ` (11 preceding siblings ...)
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 12/20] rcu: Make synchronize_srcu_expedited() fast if running readers Paul E. McKenney
@ 2010-12-17 20:54 ` Paul E. McKenney
  2010-12-18 16:13   ` Tejun Heo
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 14/20] rcu: Stop chasing QS if another CPU did it for us Paul E. McKenney
                   ` (6 subsequent siblings)
  19 siblings, 1 reply; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-17 20:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Tejun Heo, Paul E. McKenney

From: Tejun Heo <tj@kernel.org>

The fix in commit #6a0cc49 requires more than three concurrent instances
of synchronize_sched_expedited() before batching is possible.  This
patch uses a ticket-counter-like approach that is also not unrelated to
Lai Jiangshan's Ring RCU to allow sharing of expedited grace periods even
when there are only two concurrent instances of synchronize_sched_expedited().

This commit builds on Tejun's original posting, which may be found at
http://lkml.org/lkml/2010/11/9/204, adding memory barriers, avoiding
overflow of signed integers (other than via atomic_t), and fixing the
detection of batching.
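
In outline, the scheme below behaves like a ticket lock: each caller
takes a ticket, and any successful forcing of quiescent states retires
every ticket taken before it began.  A compressed sketch (the full
code and memory-ordering commentary appear in the diff):

	firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
	while (try_stop_cpus(...) == -EAGAIN) {
		s = atomic_read(&sync_sched_expedited_done);
		if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap))
			return;	/* Someone else's GP covered us. */
		snap = atomic_read(&sync_sched_expedited_started) - 1;
	}
	do {	/* Advance _done to our ticket unless already past it. */
		s = atomic_read(&sync_sched_expedited_done);
		if (UINT_CMP_GE((unsigned)s, (unsigned)snap))
			break;
	} while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);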

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h |    2 +
 kernel/rcutree_plugin.h  |   82 ++++++++++++++++++++++++++++++++++-----------
 2 files changed, 64 insertions(+), 20 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 49e8e16..af56148 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -47,6 +47,8 @@
 extern int rcutorture_runnable; /* for sysctl */
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
 
+#define UINT_CMP_GE(a, b)	(UINT_MAX / 2 >= (a) - (b))
+#define UINT_CMP_LT(a, b)	(UINT_MAX / 2 < (a) - (b))
 #define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
 #define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))
 
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c22c4ef..a363871 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1025,7 +1025,8 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 
 #else /* #ifndef CONFIG_SMP */
 
-static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
+static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
+static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
 
 static int synchronize_sched_expedited_cpu_stop(void *data)
 {
@@ -1041,8 +1042,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
 	 * robustness against future implementation changes.
 	 */
 	smp_mb(); /* See above comment block. */
-	if (cpumask_first(cpu_online_mask) == smp_processor_id())
-		atomic_inc(&synchronize_sched_expedited_count);
 	return 0;
 }
 
@@ -1056,43 +1055,86 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
  * lock that is acquired by a CPU-hotplug notifier.  Failing to
  * observe this restriction will result in deadlock.
  *
- * The synchronize_sched_expedited_cpu_stop() function is called
- * in stop-CPU context, but in order to keep overhead down to a dull
- * roar, we don't force this function to wait for its counterparts
- * on other CPUs.  One instance of this function will increment the
- * synchronize_sched_expedited_count variable per call to
- * try_stop_cpus(), but there is no guarantee what order this instance
- * will occur in.  The worst case is that it is last on one call
- * to try_stop_cpus(), and the first on the next call.  This means
- * that piggybacking requires that synchronize_sched_expedited_count
- * be incremented by 3: this guarantees that the piggybacking
- * task has waited through an entire cycle of context switches,
- * even in the worst case.
+ * This implementation can be thought of as an application of ticket
+ * locking to RCU, with sync_sched_expedited_started and
+ * sync_sched_expedited_done taking on the roles of the halves
+ * of the ticket-lock word.  Each task atomically increments
+ * sync_sched_expedited_started upon entry, snapshotting the old value,
+ * then attempts to stop all the CPUs.  If this succeeds, then each
+ * CPU will have executed a context switch, resulting in an RCU-sched
+ * grace period.  We are then done, so we use atomic_cmpxchg() to
+ * update sync_sched_expedited_done to match our snapshot -- but
+ * only if someone else has not already advanced past our snapshot.
+ *
+ * On the other hand, if try_stop_cpus() fails, we check the value
+ * of sync_sched_expedited_done.  If it has advanced past our
+ * initial snapshot, then someone else must have forced a grace period
+ * some time after we took our snapshot.  In this case, our work is
+ * done for us, and we can simply return.  Otherwise, we try again,
+ * but keep our initial snapshot for purposes of checking for someone
+ * doing our work for us.
+ *
+ * If we fail too many times in a row, we fall back to synchronize_sched().
  */
 void synchronize_sched_expedited(void)
 {
-	int snap, trycount = 0;
+	int firstsnap, s, snap, trycount = 0;
 
-	smp_mb();  /* ensure prior mod happens before capturing snap. */
-	snap = atomic_read(&synchronize_sched_expedited_count) + 2;
+	/* Note that atomic_inc_return() implies full memory barrier. */
+	firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
 	get_online_cpus();
+
+	/*
+	 * Each pass through the following loop attempts to force a
+	 * context switch on each CPU.
+	 */
 	while (try_stop_cpus(cpu_online_mask,
 			     synchronize_sched_expedited_cpu_stop,
 			     NULL) == -EAGAIN) {
 		put_online_cpus();
+
+		/* No joy, try again later.  Or just synchronize_sched(). */
 		if (trycount++ < 10)
 			udelay(trycount * num_online_cpus());
 		else {
 			synchronize_sched();
 			return;
 		}
-		if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
+
+		/* Check to see if someone else did our work for us. */
+		s = atomic_read(&sync_sched_expedited_done);
+		if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
 			smp_mb(); /* ensure test happens before caller kfree */
 			return;
 		}
+
+		/*
+		 * Refetching sync_sched_expedited_started allows later
+		 * callers to piggyback on our grace period.  We subtract
+		 * 1 to get the same token that the last incrementer got.
+		 * We retry after they started, so our grace period works
+		 * for them, and they started after our first try, so their
+		 * grace period works for us.
+		 */
 		get_online_cpus();
+		snap = atomic_read(&sync_sched_expedited_started) - 1;
+		smp_mb(); /* ensure read is before try_stop_cpus(). */
 	}
-	smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
+
+	/*
+	 * Everyone up to our most recent fetch is covered by our grace
+	 * period.  Update the counter, but only if our work is still
+	 * relevant -- which it won't be if someone who started later
+	 * than we did beat us to the punch.
+	 */
+	do {
+		s = atomic_read(&sync_sched_expedited_done);
+		if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
+			smp_mb(); /* ensure test happens before caller kfree */
+			break;
+		}
+	} while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
+
 	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-- 
1.7.3.2
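
The ticket-counter scheme described in the header comment above can be
modeled in user space with C11 atomics.  The sketch below is illustrative
only: force_context_switches() is an invented stand-in for try_stop_cpus(),
and the memory barriers, CPU-hotplug locking, and synchronize_sched()
fallback of the real code are all omitted.

#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define UINT_CMP_GE(a, b)	(UINT_MAX / 2 >= (a) - (b))

static atomic_uint started;	/* plays sync_sched_expedited_started */
static atomic_uint done;	/* plays sync_sched_expedited_done */

/* Invented stand-in for try_stop_cpus(); always "succeeds" here. */
static bool force_context_switches(void)
{
	return true;
}

static void model_sync_expedited(void)
{
	unsigned int firstsnap, snap, s;

	/* Take a ticket; atomic_fetch_add() returns the old value. */
	firstsnap = snap = atomic_fetch_add(&started, 1) + 1;

	while (!force_context_switches()) {
		/* Did a concurrent caller's grace period cover us? */
		s = atomic_load(&done);
		if (UINT_CMP_GE(s, firstsnap))
			return;
		/* Refetch so a later success also covers later arrivals. */
		snap = atomic_load(&started) - 1;
	}

	/* Advance ->done, unless a later caller already passed snap. */
	do {
		s = atomic_load(&done);
		if (UINT_CMP_GE(s, snap))
			return;
	} while (!atomic_compare_exchange_weak(&done, &s, snap));
}

int main(void)
{
	model_sync_expedited();
	printf("started=%u done=%u\n",
	       atomic_load(&started), atomic_load(&done));
	return 0;
}

Build with any C11 compiler (e.g. gcc -std=c11) to experiment with the
counters by hand.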



* [PATCH RFC tip/core/rcu 14/20] rcu: Stop chasing QS if another CPU did it for us
  2010-12-17 20:54 [PATCH tip/core/rcu 0/20] second preview of RCU patches for 2.6.38 Paul E. McKenney
                   ` (12 preceding siblings ...)
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 13/20] rcu: increase synchronize_sched_expedited() batching Paul E. McKenney
@ 2010-12-17 20:54 ` Paul E. McKenney
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 15/20] rcu: Keep gpnum and completed fields synchronized Paul E. McKenney
                   ` (5 subsequent siblings)
  19 siblings, 0 replies; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-17 20:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Frederic Weisbecker, Paul E. McKenney, Peter Zijlstra

From: Frederic Weisbecker <fweisbec@gmail.com>

When a CPU is idle and other CPUs handle its extended
quiescent state to complete grace periods on its behalf,
it catches up with the completed grace-period numbers
when it wakes up.

But at this point there might be no more grace periods
to complete.  The woken CPU nevertheless keeps its stale
qs_pending value and continues to chase quiescent states
even though none are needed.

This results in clusters of spurious softirqs until a
new real grace period is started, because if we keep
chasing quiescent states after every grace period has
completed, rcu_report_qs_rdp() is puzzled and that
broken state runs into an infinite loop.

As suggested by Lai Jiangshan, just reset qs_pending if
someone completed every grace period on our behalf.

Suggested-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c |    8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 120820f..916f42b 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -678,6 +678,14 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
 
 		/* Remember that we saw this grace-period completion. */
 		rdp->completed = rnp->completed;
+
+		/*
+		 * If another CPU handled our extended quiescent states and
+		 * we have no more grace period to complete yet, then stop
+		 * chasing quiescent states.
+		 */
+		if (rdp->completed == rnp->gpnum)
+			rdp->qs_pending = 0;
 	}
 }
 
-- 
1.7.3.2



* [PATCH RFC tip/core/rcu 15/20] rcu: Keep gpnum and completed fields synchronized
  2010-12-17 20:54 [PATCH tip/core/rcu 0/20] second preview of RCU patches for 2.6.38 Paul E. McKenney
                   ` (13 preceding siblings ...)
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 14/20] rcu: Stop chasing QS if another CPU did it for us Paul E. McKenney
@ 2010-12-17 20:54 ` Paul E. McKenney
  2010-12-20  2:13   ` Lai Jiangshan
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 16/20] rcu: fine-tune grace-period begin/end checks Paul E. McKenney
                   ` (4 subsequent siblings)
  19 siblings, 1 reply; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-17 20:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Frederic Weisbecker, Paul E. McKenney, Peter Zijlstra

From: Frederic Weisbecker <fweisbec@gmail.com>

When a CPU that was in an extended quiescent state wakes
up and catches up with grace periods that remote CPUs
completed on its behalf, we update the completed field
but not gpnum, which keeps the stale value of an earlier
grace-period ID.

Later, note_new_gpnum() will interpret the gap between
the local CPU's and the node's grace-period IDs as a new
grace period to handle, and will then start hunting for
a quiescent state.

But if every grace period has already been completed,
this interpretation is broken, and we'll be stuck in
clusters of spurious softirqs because rcu_report_qs_rdp()
will make this broken state run into an infinite loop.

The solution, as suggested by Lai Jiangshan, is to ensure
that the gpnum and completed fields are kept synchronized
when we catch up with grace periods that other CPUs
completed on our behalf.  This way we won't start noting
spurious new grace periods.

Suggested-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c |    9 +++++++++
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 916f42b..8105271 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -680,6 +680,15 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
 		rdp->completed = rnp->completed;
 
 		/*
+		 * If we were in an extended quiescent state, we may have
+		 * missed some grace periods that others CPUs took care on
+		 * our behalf. Catch up with this state to avoid noting
+		 * spurious new grace periods.
+		 */
+		if (rdp->completed > rdp->gpnum)
+			rdp->gpnum = rdp->completed;
+
+		/*
 		 * If another CPU handled our extended quiescent states and
 		 * we have no more grace period to complete yet, then stop
 		 * chasing quiescent states.
-- 
1.7.3.2



* [PATCH RFC tip/core/rcu 16/20] rcu: fine-tune grace-period begin/end checks
  2010-12-17 20:54 [PATCH tip/core/rcu 0/20] second preview of RCU patches for 2.6.38 Paul E. McKenney
                   ` (14 preceding siblings ...)
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 15/20] rcu: Keep gpnum and completed fields synchronized Paul E. McKenney
@ 2010-12-17 20:54 ` Paul E. McKenney
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 17/20] rcu: limit rcu_node leaf-level fanout Paul E. McKenney
                   ` (3 subsequent siblings)
  19 siblings, 0 replies; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-17 20:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Paul E. McKenney

Use the CPU's bit in rnp->qsmask to determine whether or not the CPU
should try to report a quiescent state.  Handle overflow in the check
for rdp->gpnum having fallen behind.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c |   27 ++++++++++++++++++---------
 1 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 8105271..c39ec5b 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -617,9 +617,17 @@ static void __init check_cpu_stall_init(void)
 static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
 {
 	if (rdp->gpnum != rnp->gpnum) {
-		rdp->qs_pending = 1;
-		rdp->passed_quiesc = 0;
+		/*
+		 * If the current grace period is waiting for this CPU,
+		 * set up to detect a quiescent state, otherwise don't
+		 * go looking for one.
+		 */
 		rdp->gpnum = rnp->gpnum;
+		if (rnp->qsmask & rdp->grpmask) {
+			rdp->qs_pending = 1;
+			rdp->passed_quiesc = 0;
+		} else
+			rdp->qs_pending = 0;
 	}
 }
 
@@ -681,19 +689,20 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
 
 		/*
 		 * If we were in an extended quiescent state, we may have
-		 * missed some grace periods that others CPUs took care on
+		 * missed some grace periods that others CPUs handled on
 		 * our behalf. Catch up with this state to avoid noting
-		 * spurious new grace periods.
+		 * spurious new grace periods.  If another grace period
+		 * has started, then rnp->gpnum will have advanced, so
+		 * we will detect this later on.
 		 */
-		if (rdp->completed > rdp->gpnum)
+		if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
 			rdp->gpnum = rdp->completed;
 
 		/*
-		 * If another CPU handled our extended quiescent states and
-		 * we have no more grace period to complete yet, then stop
-		 * chasing quiescent states.
+		 * If RCU does not need a quiescent state from this CPU,
+		 * then make sure that this CPU doesn't go looking for one.
 		 */
-		if (rdp->completed == rnp->gpnum)
+		if ((rnp->qsmask & rdp->grpmask) == 0)
 			rdp->qs_pending = 0;
 	}
 }
-- 
1.7.3.2
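
Incidentally, the wraparound case that motivates the ULONG_CMP_LT()
conversion above is easy to demonstrate in user space.  The values below
are hypothetical, not kernel code:

#include <limits.h>
#include <stdio.h>

#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long gpnum = ULONG_MAX;	/* about to wrap */
	unsigned long completed = gpnum + 2;	/* wrapped around to 1 */

	/* The old test misses the wrapped case entirely... */
	printf("completed > gpnum             : %d\n", completed > gpnum);
	/* ...while the unsigned-difference test still sees gpnum behind. */
	printf("ULONG_CMP_LT(gpnum, completed): %d\n",
	       ULONG_CMP_LT(gpnum, completed));
	return 0;
}

This prints 0 and then 1: the plain > comparison silently misfires once
the grace-period counter wraps, while ULONG_CMP_LT() keeps the ordering.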



* [PATCH RFC tip/core/rcu 17/20] rcu: limit rcu_node leaf-level fanout
  2010-12-17 20:54 [PATCH tip/core/rcu 0/20] second preview of RCU patches for 2.6.38 Paul E. McKenney
                   ` (15 preceding siblings ...)
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 16/20] rcu: fine-tune grace-period begin/end checks Paul E. McKenney
@ 2010-12-17 20:54 ` Paul E. McKenney
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 18/20] rcu: reduce __call_rcu()-induced contention on rcu_node structures Paul E. McKenney
                   ` (2 subsequent siblings)
  19 siblings, 0 replies; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-17 20:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Paul E. McKenney

Some recent benchmarks have indicated possible lock contention on the
leaf-level rcu_node locks.  This commit therefore limits the number of
CPUs per leaf-level rcu_node structure to 16, in other words, there
can be at most 16 rcu_data structures fanning into a given rcu_node
structure.  Prior to this, the limit was 32 on 32-bit systems and 64 on
64-bit systems.

Note that the fanout of non-leaf rcu_node structures is unchanged.  The
organization of accesses to the rcu_node tree is such that references
to non-leaf rcu_node structures are much less frequent than to the
leaf structures.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c |    3 ++-
 kernel/rcutree.h |   43 ++++++++++++++++++++++++-------------------
 2 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index c39ec5b..01c8ad3 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1869,8 +1869,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
 {
 	int i;
 
-	for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
+	for (i = NUM_RCU_LVLS - 1; i > 0; i--)
 		rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+	rsp->levelspread[0] = RCU_FANOUT_LEAF;
 }
 #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
 static void __init rcu_init_levelspread(struct rcu_state *rsp)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1a54be2..e8f057e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -31,46 +31,51 @@
 /*
  * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
  * In theory, it should be possible to add more levels straightforwardly.
- * In practice, this has not been tested, so there is probably some
- * bug somewhere.
+ * In practice, this did work well going from three levels to four.
+ * Of course, your mileage may vary.
  */
 #define MAX_RCU_LVLS 4
-#define RCU_FANOUT	      (CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_SQ	      (RCU_FANOUT * RCU_FANOUT)
-#define RCU_FANOUT_CUBE	      (RCU_FANOUT_SQ * RCU_FANOUT)
-#define RCU_FANOUT_FOURTH     (RCU_FANOUT_CUBE * RCU_FANOUT)
-
-#if NR_CPUS <= RCU_FANOUT
+#if CONFIG_RCU_FANOUT > 16
+#define RCU_FANOUT_LEAF       16
+#else /* #if CONFIG_RCU_FANOUT > 16 */
+#define RCU_FANOUT_LEAF       (CONFIG_RCU_FANOUT)
+#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
+#define RCU_FANOUT_1	      (RCU_FANOUT_LEAF)
+#define RCU_FANOUT_2	      (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_3	      (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_4	      (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
+
+#if NR_CPUS <= RCU_FANOUT_1
 #  define NUM_RCU_LVLS	      1
 #  define NUM_RCU_LVL_0	      1
 #  define NUM_RCU_LVL_1	      (NR_CPUS)
 #  define NUM_RCU_LVL_2	      0
 #  define NUM_RCU_LVL_3	      0
 #  define NUM_RCU_LVL_4	      0
-#elif NR_CPUS <= RCU_FANOUT_SQ
+#elif NR_CPUS <= RCU_FANOUT_2
 #  define NUM_RCU_LVLS	      2
 #  define NUM_RCU_LVL_0	      1
-#  define NUM_RCU_LVL_1	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
+#  define NUM_RCU_LVL_1	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
 #  define NUM_RCU_LVL_2	      (NR_CPUS)
 #  define NUM_RCU_LVL_3	      0
 #  define NUM_RCU_LVL_4	      0
-#elif NR_CPUS <= RCU_FANOUT_CUBE
+#elif NR_CPUS <= RCU_FANOUT_3
 #  define NUM_RCU_LVLS	      3
 #  define NUM_RCU_LVL_0	      1
-#  define NUM_RCU_LVL_1	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
-#  define NUM_RCU_LVL_2	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
-#  define NUM_RCU_LVL_3	      NR_CPUS
+#  define NUM_RCU_LVL_1	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
+#  define NUM_RCU_LVL_2	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+#  define NUM_RCU_LVL_3	      (NR_CPUS)
 #  define NUM_RCU_LVL_4	      0
-#elif NR_CPUS <= RCU_FANOUT_FOURTH
+#elif NR_CPUS <= RCU_FANOUT_4
 #  define NUM_RCU_LVLS	      4
 #  define NUM_RCU_LVL_0	      1
-#  define NUM_RCU_LVL_1	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE)
-#  define NUM_RCU_LVL_2	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
-#  define NUM_RCU_LVL_3	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
-#  define NUM_RCU_LVL_4	      NR_CPUS
+#  define NUM_RCU_LVL_1	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
+#  define NUM_RCU_LVL_2	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
+#  define NUM_RCU_LVL_3	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+#  define NUM_RCU_LVL_4	      (NR_CPUS)
 #else
 # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
-#endif /* #if (NR_CPUS) <= RCU_FANOUT */
+#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
 
 #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
 #define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
-- 
1.7.3.2
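
The resulting tree shapes can be checked by replaying the header
arithmetic in user space.  The 4096-CPU / fanout-64 configuration below
is just an example:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	/* Example configuration: NR_CPUS=4096, CONFIG_RCU_FANOUT=64. */
	int nr_cpus = 4096, fanout = 64;
	int leaf = fanout > 16 ? 16 : fanout;	/* RCU_FANOUT_LEAF */
	int f1 = leaf;				/* RCU_FANOUT_1 */
	int f2 = f1 * fanout;			/* RCU_FANOUT_2 */
	int f3 = f2 * fanout;			/* RCU_FANOUT_3 */

	if (nr_cpus <= f1)
		printf("1 level:  a single rcu_node\n");
	else if (nr_cpus <= f2)
		printf("2 levels: root + %d leaves\n",
		       DIV_ROUND_UP(nr_cpus, f1));
	else if (nr_cpus <= f3)
		printf("3 levels: root + %d inner + %d leaves\n",
		       DIV_ROUND_UP(nr_cpus, f2), DIV_ROUND_UP(nr_cpus, f1));
	else
		printf("4 levels needed\n");
	return 0;
}

This prints "3 levels: root + 4 inner + 256 leaves".  Under the old
RCU_FANOUT_SQ scheme the same configuration fit in two levels with 64
CPUs per leaf; the new limit spreads those CPUs over 256 leaves of 16
each, which is the contention reduction the commit is after.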



* [PATCH RFC tip/core/rcu 18/20] rcu: reduce __call_rcu()-induced contention on rcu_node structures
  2010-12-17 20:54 [PATCH tip/core/rcu 0/20] second preview of RCU patches for 2.6.38 Paul E. McKenney
                   ` (16 preceding siblings ...)
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 17/20] rcu: limit rcu_node leaf-level fanout Paul E. McKenney
@ 2010-12-17 20:54 ` Paul E. McKenney
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 19/20] rculist: fix borked __list_for_each_rcu() macro Paul E. McKenney
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 20/20] rcu: remove unused " Paul E. McKenney
  19 siblings, 0 replies; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-17 20:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Paul E. McKenney

When the current __call_rcu() function was written, the expedited
APIs did not exist.  The __call_rcu() implementation therefore went
to great lengths to detect the end of old grace periods and to start
new ones, all in the name of reducing grace-period latency.  Now the
expedited APIs do exist, and the usage of __call_rcu() has increased
considerably.  This commit therefore causes __call_rcu() to avoid
worrying about grace periods unless there are a large number of
RCU callbacks stacked up on the current CPU.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c |   38 +++++++++++++++++++++-----------------
 1 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 01c8ad3..d0ddfea 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1435,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 	 */
 	local_irq_save(flags);
 	rdp = this_cpu_ptr(rsp->rda);
-	rcu_process_gp_end(rsp, rdp);
-	check_for_new_grace_period(rsp, rdp);
 
 	/* Add the callback to our list. */
 	*rdp->nxttail[RCU_NEXT_TAIL] = head;
 	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
 
-	/* Start a new grace period if one not already started. */
-	if (!rcu_gp_in_progress(rsp)) {
-		unsigned long nestflag;
-		struct rcu_node *rnp_root = rcu_get_root(rsp);
-
-		raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
-		rcu_start_gp(rsp, nestflag);  /* releases rnp_root->lock. */
-	}
-
 	/*
 	 * Force the grace period if too many callbacks or too long waiting.
 	 * Enforce hysteresis, and don't invoke force_quiescent_state()
@@ -1459,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 	 * is the only one waiting for a grace period to complete.
 	 */
 	if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
-		rdp->blimit = LONG_MAX;
-		if (rsp->n_force_qs == rdp->n_force_qs_snap &&
-		    *rdp->nxttail[RCU_DONE_TAIL] != head)
-			force_quiescent_state(rsp, 0);
-		rdp->n_force_qs_snap = rsp->n_force_qs;
-		rdp->qlen_last_fqs_check = rdp->qlen;
+
+		/* Are we ignoring a completed grace period? */
+		rcu_process_gp_end(rsp, rdp);
+		check_for_new_grace_period(rsp, rdp);
+
+		/* Start a new grace period if one not already started. */
+		if (!rcu_gp_in_progress(rsp)) {
+			unsigned long nestflag;
+			struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+			raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
+			rcu_start_gp(rsp, nestflag);  /* rlses rnp_root->lock */
+		} else {
+			/* Give the grace period a kick. */
+			rdp->blimit = LONG_MAX;
+			if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+			    *rdp->nxttail[RCU_DONE_TAIL] != head)
+				force_quiescent_state(rsp, 0);
+			rdp->n_force_qs_snap = rsp->n_force_qs;
+			rdp->qlen_last_fqs_check = rdp->qlen;
+		}
 	} else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
 		force_quiescent_state(rsp, 1);
 	local_irq_restore(flags);
-- 
1.7.3.2
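
The hysteresis itself is simple enough to model in user space.  The
sketch below ignores callback draining and everything else __call_rcu()
does; the 10000 constant mirrors the kernel's default qhimark:

#include <stdio.h>

#define QHIMARK 10000	/* kernel default for qhimark */

static long qlen, qlen_last_fqs_check;

static void model_call_rcu(void)
{
	if (++qlen > qlen_last_fqs_check + QHIMARK) {
		/* Slow path: check/start/kick a grace period. */
		printf("slow path entered at qlen=%ld\n", qlen);
		qlen_last_fqs_check = qlen;
	}
	/* Fast path: just queue the callback and return. */
}

int main(void)
{
	for (int i = 0; i < 25000; i++)
		model_call_rcu();
	return 0;
}

Here the slow path triggers only at qlen 10001 and 20002; every other
call skips the grace-period bookkeeping entirely, which is where the
rcu_node accesses came from.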



* [PATCH RFC tip/core/rcu 19/20] rculist: fix borked __list_for_each_rcu() macro
  2010-12-17 20:54 [PATCH tip/core/rcu 0/20] second preview of RCU patches for 2.6.38 Paul E. McKenney
                   ` (17 preceding siblings ...)
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 18/20] rcu: reduce __call_rcu()-induced contention on rcu_node structures Paul E. McKenney
@ 2010-12-17 20:54 ` Paul E. McKenney
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 20/20] rcu: remove unused " Paul E. McKenney
  19 siblings, 0 replies; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-17 20:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Mariusz Kozlowski, Paul E. McKenney

From: Mariusz Kozlowski <mk@lab.zgora.pl>

This restores parenthesis balance.

Signed-off-by: Mariusz Kozlowski <mk@lab.zgora.pl>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rculist.h |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index f31ef61..70d3ba5 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -244,7 +244,7 @@ static inline void list_splice_init_rcu(struct list_head *list,
 #define __list_for_each_rcu(pos, head) \
 	for (pos = rcu_dereference_raw(list_next_rcu(head)); \
 		pos != (head); \
-		pos = rcu_dereference_raw(list_next_rcu((pos)))
+		pos = rcu_dereference_raw(list_next_rcu(pos)))
 
 /**
  * list_for_each_entry_rcu	-	iterate over rcu list of given type
-- 
1.7.3.2



* [PATCH RFC tip/core/rcu 20/20] rcu: remove unused __list_for_each_rcu() macro
  2010-12-17 20:54 [PATCH tip/core/rcu 0/20] second preview of RCU patches for 2.6.38 Paul E. McKenney
                   ` (18 preceding siblings ...)
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 19/20] rculist: fix borked __list_for_each_rcu() macro Paul E. McKenney
@ 2010-12-17 20:54 ` Paul E. McKenney
  19 siblings, 0 replies; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-17 20:54 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, laijs, dipankar, akpm, mathieu.desnoyers, josh, niv, tglx,
	peterz, rostedt, Valdis.Kletnieks, dhowells, eric.dumazet,
	darren, Paul E. McKenney

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rculist.h |    5 -----
 1 files changed, 0 insertions(+), 5 deletions(-)

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 70d3ba5..2dea94f 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -241,11 +241,6 @@ static inline void list_splice_init_rcu(struct list_head *list,
 #define list_first_entry_rcu(ptr, type, member) \
 	list_entry_rcu((ptr)->next, type, member)
 
-#define __list_for_each_rcu(pos, head) \
-	for (pos = rcu_dereference_raw(list_next_rcu(head)); \
-		pos != (head); \
-		pos = rcu_dereference_raw(list_next_rcu(pos)))
-
 /**
  * list_for_each_entry_rcu	-	iterate over rcu list of given type
  * @pos:	the type * to use as a loop cursor.
-- 
1.7.3.2



* Re: [PATCH RFC tip/core/rcu 11/20] rcu: fix race condition in synchronize_sched_expedited()
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 11/20] rcu: fix race condition in synchronize_sched_expedited() Paul E. McKenney
@ 2010-12-18 15:52   ` Tejun Heo
  2010-12-18 19:58     ` Paul E. McKenney
  0 siblings, 1 reply; 34+ messages in thread
From: Tejun Heo @ 2010-12-18 15:52 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, mingo, laijs, dipankar, akpm, mathieu.desnoyers,
	josh, niv, tglx, peterz, rostedt, Valdis.Kletnieks, dhowells,
	eric.dumazet, darren

Hello,

On 12/17/2010 09:54 PM, Paul E. McKenney wrote:
> The new (early 2010) implementation of synchronize_sched_expedited() uses
> try_stop_cpu() to force a context switch on every CPU.  It also permits
> concurrent calls to synchronize_sched_expedited() to share a single call
> to try_stop_cpu() through use of an atomically incremented
> synchronize_sched_expedited_count variable.  Unfortunately, this is
> subject to failure as follows:
> 
> o	Task A invokes synchronize_sched_expedited(), try_stop_cpus()
> 	succeeds, but Task A is preempted before getting to the atomic
> 	increment of synchronize_sched_expedited_count.
> 
> o	Task B also invokes synchronize_sched_expedited(), with exactly
> 	the same outcome as Task A.
> 
> o	Task C also invokes synchronize_sched_expedited(), again with
> 	exactly the same outcome as Tasks A and B.
> 
> o	Task D also invokes synchronize_sched_expedited(), but only
> 	gets as far as acquiring the mutex within try_stop_cpus()
> 	before being preempted, interrupted, or otherwise delayed.
> 
> o	Task E also invokes synchronize_sched_expedited(), but only
> 	gets to the snapshotting of synchronize_sched_expedited_count.
> 
> o	Tasks A, B, and C all increment synchronize_sched_expedited_count.
> 
> o	Task E fails to get the mutex, so checks the new value
> 	of synchronize_sched_expedited_count.  It finds that the
> 	value has increased, so (wrongly) assumes that its work
> 	has been done, returning despite there having been no
> 	expedited grace period since it began.
> 
> The solution is to have the lowest-numbered CPU atomically increment
> the synchronize_sched_expedited_count variable within the
> synchronize_sched_expedited_cpu_stop() function, which is under
> the protection of the mutex acquired by try_stop_cpus().  However, this
> also requires that piggybacking tasks wait for three rather than two
> instances of try_stop_cpu(), because we cannot control the order in
> which the per-CPU callback function occur.
> 
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

Acked-by: Tejun Heo <tj@kernel.org>

I suppose this should go -stable?

-- 
tejun


* Re: [PATCH RFC tip/core/rcu 13/20] rcu: increase synchronize_sched_expedited() batching
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 13/20] rcu: increase synchronize_sched_expedited() batching Paul E. McKenney
@ 2010-12-18 16:13   ` Tejun Heo
  2010-12-18 20:14     ` Paul E. McKenney
  0 siblings, 1 reply; 34+ messages in thread
From: Tejun Heo @ 2010-12-18 16:13 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, mingo, laijs, dipankar, akpm, mathieu.desnoyers,
	josh, niv, tglx, peterz, rostedt, Valdis.Kletnieks, dhowells,
	eric.dumazet, darren

Hello,

On 12/17/2010 09:54 PM, Paul E. McKenney wrote:
> From: Tejun Heo <tj@kernel.org>
> 
> The fix in commit #6a0cc49 requires more than three concurrent instances
> of synchronize_sched_expedited() before batching is possible.  This
> patch uses a ticket-counter-like approach that is also not unrelated to
> Lai Jiangshan's Ring RCU to allow sharing of expedited grace periods even
> when there are only two concurrent instances of synchronize_sched_expedited().
> 
> This commit builds on Tejun's original posting, which may be found at
> http://lkml.org/lkml/2010/11/9/204, adding memory barriers, avoiding
> overflow of signed integers (other than via atomic_t), and fixing the
> detection of batching.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

Acked-by: Tejun Heo <tj@kernel.org>

Some comments on the sequence testing tho.

> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 49e8e16..af56148 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -47,6 +47,8 @@
>  extern int rcutorture_runnable; /* for sysctl */
>  #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
>  
> +#define UINT_CMP_GE(a, b)	(UINT_MAX / 2 >= (a) - (b))
> +#define UINT_CMP_LT(a, b)	(UINT_MAX / 2 < (a) - (b))
>  #define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
>  #define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

I don't think the original comparison had an overflow problem.
(a) < (b) gives the wrong result on overflow but (int)((a) - (b)) < 0
is correct.

I find the latter approach cleaner, and that way the constant in the
instruction can be avoided too, although the compiler might generate
the same code regardless.

Also, I think the names are misleading.  They aren't testing whether
one is greater or less than the other.  They're testing whether one is
before or after the other, where the counters are used as a
monotonically incrementing (with wrapping) sequence, so wouldn't
something like the following be better?

#define SEQ_TEST(a, b, test_op)	({					\
	typeof(a) __seq_a = (a);					\
	typeof(b) __seq_b = (b);					\
	bool __ret;							\
	(void)(&__seq_a == &__seq_b);					\
	switch (sizeof(__seq_a)) {					\
		case sizeof(char):					\
			__ret = (char)(__seq_a - __seq_b) test_op 0;	\
			break;						\
		case sizeof(int):					\
			__ret = (int)(__seq_a - __seq_b) test_op 0;	\
			break;						\
		case sizeof(long):					\
			__ret = (long)(__seq_a - __seq_b) test_op 0;	\
			break;						\
		case sizeof(long long):					\
			__ret = (long long)(__seq_a - __seq_b) test_op 0; \
			break;						\
		default:						\
			__make_build_fail;				\
	}								\
	__ret;								\
})

#define SEQ_BEFORE(a, b)	SEQ_TEST((a), (b), <)
 and so on...

Please note that the above macro is completely untested.

Thanks.

-- 
tejun


* Re: [PATCH RFC tip/core/rcu 11/20] rcu: fix race condition in synchronize_sched_expedited()
  2010-12-18 15:52   ` Tejun Heo
@ 2010-12-18 19:58     ` Paul E. McKenney
  0 siblings, 0 replies; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-18 19:58 UTC (permalink / raw)
  To: Tejun Heo
  Cc: linux-kernel, mingo, laijs, dipankar, akpm, mathieu.desnoyers,
	josh, niv, tglx, peterz, rostedt, Valdis.Kletnieks, dhowells,
	eric.dumazet, darren

On Sat, Dec 18, 2010 at 04:52:39PM +0100, Tejun Heo wrote:
> Hello,
> 
> On 12/17/2010 09:54 PM, Paul E. McKenney wrote:
> > The new (early 2010) implementation of synchronize_sched_expedited() uses
> > try_stop_cpu() to force a context switch on every CPU.  It also permits
> > concurrent calls to synchronize_sched_expedited() to share a single call
> > to try_stop_cpu() through use of an atomically incremented
> > synchronize_sched_expedited_count variable.  Unfortunately, this is
> > subject to failure as follows:
> > 
> > o	Task A invokes synchronize_sched_expedited(), try_stop_cpus()
> > 	succeeds, but Task A is preempted before getting to the atomic
> > 	increment of synchronize_sched_expedited_count.
> > 
> > o	Task B also invokes synchronize_sched_expedited(), with exactly
> > 	the same outcome as Task A.
> > 
> > o	Task C also invokes synchronize_sched_expedited(), again with
> > 	exactly the same outcome as Tasks A and B.
> > 
> > o	Task D also invokes synchronize_sched_expedited(), but only
> > 	gets as far as acquiring the mutex within try_stop_cpus()
> > 	before being preempted, interrupted, or otherwise delayed.
> > 
> > o	Task E also invokes synchronize_sched_expedited(), but only
> > 	gets to the snapshotting of synchronize_sched_expedited_count.
> > 
> > o	Tasks A, B, and C all increment synchronize_sched_expedited_count.
> > 
> > o	Task E fails to get the mutex, so checks the new value
> > 	of synchronize_sched_expedited_count.  It finds that the
> > 	value has increased, so (wrongly) assumes that its work
> > 	has been done, returning despite there having been no
> > 	expedited grace period since it began.
> > 
> > The solution is to have the lowest-numbered CPU atomically increment
> > the synchronize_sched_expedited_count variable within the
> > synchronize_sched_expedited_cpu_stop() function, which is under
> > the protection of the mutex acquired by try_stop_cpus().  However, this
> > also requires that piggybacking tasks wait for three rather than two
> > instances of try_stop_cpu(), because we cannot control the order in
> > which the per-CPU callback function occur.
> > 
> > Cc: Tejun Heo <tj@kernel.org>
> > Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
> > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> 
> Acked-by: Tejun Heo <tj@kernel.org>

Thank you!

> I suppose this should go -stable?

Given that it is only a theoretical bug, I am targeting 2.6.38 rather
than 2.6.37.  But yes, looks to me like a -stable candidate.

							Thanx, Paul


* Re: [PATCH RFC tip/core/rcu 13/20] rcu: increase synchronize_sched_expedited() batching
  2010-12-18 16:13   ` Tejun Heo
@ 2010-12-18 20:14     ` Paul E. McKenney
  2010-12-19  9:43       ` Tejun Heo
  0 siblings, 1 reply; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-18 20:14 UTC (permalink / raw)
  To: Tejun Heo
  Cc: linux-kernel, mingo, laijs, dipankar, akpm, mathieu.desnoyers,
	josh, niv, tglx, peterz, rostedt, Valdis.Kletnieks, dhowells,
	eric.dumazet, darren

On Sat, Dec 18, 2010 at 05:13:07PM +0100, Tejun Heo wrote:
> Hello,
> 
> On 12/17/2010 09:54 PM, Paul E. McKenney wrote:
> > From: Tejun Heo <tj@kernel.org>
> > 
> > The fix in commit #6a0cc49 requires more than three concurrent instances
> > of synchronize_sched_expedited() before batching is possible.  This
> > patch uses a ticket-counter-like approach that is also not unrelated to
> > Lai Jiangshan's Ring RCU to allow sharing of expedited grace periods even
> > when there are only two concurrent instances of synchronize_sched_expedited().
> > 
> > This commit builds on Tejun's original posting, which may be found at
> > http://lkml.org/lkml/2010/11/9/204, adding memory barriers, avoiding
> > overflow of signed integers (other than via atomic_t), and fixing the
> > detection of batching.
> > 
> > Signed-off-by: Tejun Heo <tj@kernel.org>
> > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> 
> Acked-by: Tejun Heo <tj@kernel.org>

Thank you again!

> Some comments on the sequence testing tho.
> 
> > diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> > index 49e8e16..af56148 100644
> > --- a/include/linux/rcupdate.h
> > +++ b/include/linux/rcupdate.h
> > @@ -47,6 +47,8 @@
> >  extern int rcutorture_runnable; /* for sysctl */
> >  #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
> >  
> > +#define UINT_CMP_GE(a, b)	(UINT_MAX / 2 >= (a) - (b))
> > +#define UINT_CMP_LT(a, b)	(UINT_MAX / 2 < (a) - (b))
> >  #define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
> >  #define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))
> 
> I don't think the original comparison had overflow problem.  (a) < (b)
> gives the wrong result on overflow but (int)((a) - (b)) < 0 is
> correct.

You are right that it does give the correct result now, but the C
standard never has defined overflow for signed integers, as noted in
Section 6.3.1.3p3 of the N1494 Working Draft of the C standard:

	Otherwise, the new type is signed and the value cannot be
	represented in it; either the result is implementation-defined
	or an implementation-defined signal is raised.

I have heard too many compiler guys gleefully discussing optimizations
that they could use if they took full advantage of this clause, so I
am not comfortable relying on the intuitive semantics for signed
arithmetic.  (Now atomic_t is another story -- both C and C++ will
be requiring twos-complement semantics, thankfully.)

> I find the latter approach cleaner and that way the constant in the
> instruction can be avoided too although if the compiler might generate
> the same code regardless.

I would like your way better if it was defined in the C standard.
But it unfortunately is not.  :-(

> Also, I think the names are misleading.  They aren't testing whether
> one is greater or less than the other.  They're testing whether one is
> before or after the other where the counters are used as monotonically
> incrementing (with wrapping) sequence, so wouldn't something like the
> following be better?

They are comparing the twos-complement difference between the two
numbers against zero.

> #define SEQ_TEST(a, b, test_op)	({					\
> 	typeof(a) __seq_a = (a);					\
> 	typeof(b) __seq_b = (b);					\
> 	bool __ret;							\
> 	(void)(&__seq_a == &__seq_b);					\
> 	switch (sizeof(__seq_a)) {					\
> 		case sizeof(char):					\
> 			__ret = (char)(__seq_a - __seq_b) test_op 0;	\
> 			break;						\
> 		case sizeof(int):					\
> 			__ret = (int)(__seq_a - __seq_b) test_op 0;	\
> 			break;						\
> 		case sizeof(long):					\
> 			__ret = (long)(__seq_a - __seq_b) test_op 0;	\
> 			break;						\
> 		case sizeof(long long):					\
> 			__ret = (long long)(__seq_a - __seq_b) test_op 0; \
> 			break;						\
> 		default:						\
> 			__make_build_fail;				\
> 	}								\
> 	__ret;								\
> })
> 
> #define SEQ_BEFORE(a, b)	SEQ_TEST((a), (b), <)
>  and so on...
> 
> Please note that the above macro is completely untested.

If you make something similar to these macros, but in a way that avoids
overflowing signed integers, I would be happy to use it.  Might be a
good addition to compiler.h, for example.

							Thanx, Paul

> Thanks.
> 
> -- 
> tejun
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
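
The unsigned-only comparison Paul is insisting on is easiest to
visualize at 8 bits, where the wrap comes around quickly.  U8_CMP_GE()
below is a hypothetical analogue of UINT_CMP_GE(), not kernel code;
every operation stays in unsigned arithmetic:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical 8-bit analogue of UINT_CMP_GE(): unsigned-only math. */
#define U8_CMP_GE(a, b)	(UINT8_MAX / 2 >= (uint8_t)((a) - (b)))

int main(void)
{
	uint8_t a = 250, b;

	/* b sweeps from just behind a to just ahead, across the wrap. */
	for (b = 245; b != 5; b++)
		printf("a=%u b=%u  GE(b, a)=%d\n", a, b, U8_CMP_GE(b, a));
	return 0;
}

GE(b, a) flips to 1 exactly when b catches up with a and stays 1 through
the 255-to-0 wrap, which is the ordering property the grace-period
counters rely on.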


* Re: [PATCH RFC tip/core/rcu 13/20] rcu: increase synchronize_sched_expedited() batching
  2010-12-18 20:14     ` Paul E. McKenney
@ 2010-12-19  9:43       ` Tejun Heo
  2010-12-19 16:35         ` Paul E. McKenney
  2010-12-20 10:31         ` Peter Zijlstra
  0 siblings, 2 replies; 34+ messages in thread
From: Tejun Heo @ 2010-12-19  9:43 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, mingo, laijs, dipankar, akpm, mathieu.desnoyers,
	josh, niv, tglx, peterz, rostedt, Valdis.Kletnieks, dhowells,
	eric.dumazet, darren

Hello,

On 12/18/2010 09:14 PM, Paul E. McKenney wrote:
>>> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
>>> index 49e8e16..af56148 100644
>>> --- a/include/linux/rcupdate.h
>>> +++ b/include/linux/rcupdate.h
>>> @@ -47,6 +47,8 @@
>>>  extern int rcutorture_runnable; /* for sysctl */
>>>  #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
>>>  
>>> +#define UINT_CMP_GE(a, b)	(UINT_MAX / 2 >= (a) - (b))
>>> +#define UINT_CMP_LT(a, b)	(UINT_MAX / 2 < (a) - (b))
>>>  #define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
>>>  #define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))
>>
>> I don't think the original comparison had overflow problem.  (a) < (b)
>> gives the wrong result on overflow but (int)((a) - (b)) < 0 is
>> correct.
> 
> You are right that it does give the correct result now, but the C
> standard never has defined overflow for signed integers, as noted in
> Section 6.3.1.3p3 of the N1494 Working Draft of the C standard:
> 
> 	Otherwise, the new type is signed and the value cannot be
> 	represented in it; either the result is implementation-defined
> 	or an implementation-defined signal is raised.
> 
> I have heard too many compiler guys gleefully discussing optimizations
> that they could use if they took full advantage of this clause, so I
> am not comfortable relying on the intuitive semantics for signed
> arithmetic.  (Now atomic_t is another story -- both C and C++ will
> be requiring twos-complement semantics, thankfully.)
>
>> I find the latter approach cleaner and that way the constant in the
>> instruction can be avoided too although if the compiler might generate
>> the same code regardless.
> 
> I would like your way better if it was defined in the C standard.
> But it unfortunately is not.  :-(

I see, then would something like the following work?

 (int)((unsigned)(a) - (unsigned)(b)) < 0

>> Also, I think the names are misleading.  They aren't testing whether
>> one is greater or less than the other.  They're testing whether one is
>> before or after the other where the counters are used as monotonically
>> incrementing (with wrapping) sequence, so wouldn't something like the
>> following be better?
> 
> They are comparing the twos-complement difference between the two
> numbers against zero.

But still GE/LT are way too misleading.  Anyways, so with the above
change the macro now would look like the following.

#define SEQ_TEST(a, b, op)	({					\
	typeof(a) __seq_a = (a);					\
	typeof(b) __seq_b = (b);					\
	bool __ret;							\
	(void)(&__seq_a == &__seq_b);					\
	switch (sizeof(__seq_a)) {					\
		case sizeof(s8):					\
			__ret = (s8)((u8)__seq_a - (u8)__seq_b) op 0;	\
			break;						\
		case sizeof(s16):					\
			__ret = (s16)((u16)__seq_a - (u16)__seq_b) op 0;\
			break;						\
		case sizeof(s32):					\
			__ret = (s32)((u32)__seq_a - (u32)__seq_b) op 0;\
			break;						\
		case sizeof(s64):					\
			__ret = (s64)((u64)__seq_a - (u64)__seq_b) op 0;\
			break;						\
		default:						\
			__make_build_fail;				\
	}								\
	__ret;								\
})

Would the above work?

Thanks.

-- 
tejun


* Re: [PATCH RFC tip/core/rcu 13/20] rcu: increase synchronize_sched_expedited() batching
  2010-12-19  9:43       ` Tejun Heo
@ 2010-12-19 16:35         ` Paul E. McKenney
  2010-12-20 10:33           ` Peter Zijlstra
  2010-12-20 10:31         ` Peter Zijlstra
  1 sibling, 1 reply; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-19 16:35 UTC (permalink / raw)
  To: Tejun Heo
  Cc: linux-kernel, mingo, laijs, dipankar, akpm, mathieu.desnoyers,
	josh, niv, tglx, peterz, rostedt, Valdis.Kletnieks, dhowells,
	eric.dumazet, darren

On Sun, Dec 19, 2010 at 10:43:46AM +0100, Tejun Heo wrote:
> Hello,
> 
> On 12/18/2010 09:14 PM, Paul E. McKenney wrote:
> >>> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> >>> index 49e8e16..af56148 100644
> >>> --- a/include/linux/rcupdate.h
> >>> +++ b/include/linux/rcupdate.h
> >>> @@ -47,6 +47,8 @@
> >>>  extern int rcutorture_runnable; /* for sysctl */
> >>>  #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
> >>>  
> >>> +#define UINT_CMP_GE(a, b)	(UINT_MAX / 2 >= (a) - (b))
> >>> +#define UINT_CMP_LT(a, b)	(UINT_MAX / 2 < (a) - (b))
> >>>  #define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
> >>>  #define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))
> >>
> >> I don't think the original comparison had overflow problem.  (a) < (b)
> >> gives the wrong result on overflow but (int)((a) - (b)) < 0 is
> >> correct.
> > 
> > You are right that it does give the correct result now, but the C
> > standard never has defined overflow for signed integers, as noted in
> > Section 6.3.1.3p3 of the N1494 Working Draft of the C standard:
> > 
> > 	Otherwise, the new type is signed and the value cannot be
> > 	represented in it; either the result is implementation-defined
> > 	or an implementation-defined signal is raised.
> > 
> > I have heard too many compiler guys gleefully discussing optimizations
> > that they could use if they took full advantage of this clause, so I
> > am not comfortable relying on the intuitive semantics for signed
> > arithmetic.  (Now atomic_t is another story -- both C and C++ will
> > be requiring twos-complement semantics, thankfully.)
> >
> >> I find the latter approach cleaner and that way the constant in the
> >> instruction can be avoided too although if the compiler might generate
> >> the same code regardless.
> > 
> > I would like your way better if it was defined in the C standard.
> > But it unfortunately is not.  :-(
> 
> I see, then would something like the following work?
> 
>  (int)((unsigned)(a) - (unsigned)(b)) < 0

Unfortunately, no.  :-(

The (int) converts from unsigned to signed, and if the upper bit of
the unsigned difference is non-zero, then the paragraph I quoted above
applies, and the standard allows the compiler to do whatever it wants.

> >> Also, I think the names are misleading.  They aren't testing whether
> >> one is greater or less than the other.  They're testing whether one is
> >> before or after the other where the counters are used as monotonically
> >> incrementing (with wrapping) sequence, so wouldn't something like the
> >> following be better?
> > 
> > They are comparing the twos-complement difference between the two
> > numbers against zero.
> 
> But still GE/LT are way too misleading.  Anyways, so with the above
> change the macro now would look like the following.

I am not all that worried about the names.  I can certainly live with
SEQ_TEST() just as easily as I can live with the current names.

> #define SEQ_TEST(a, b, op)	({					\
> 	typeof(a) __seq_a = (a);					\
> 	typeof(b) __seq_b = (b);					\
> 	bool __ret;							\
> 	(void)(&__seq_a == &__seq_b);					\
> 	switch (sizeof(__seq_a)) {					\
> 		case sizeof(s8):					\
> 			__ret = (s8)((u8)__seq_a - (u8)__seq_b) op 0;	\
> 			break;						\
> 		case sizeof(s16):					\
> 			__ret = (s16)((u16)__seq_a - (u16)__seq_b) op 0;\
> 			break;						\
> 		case sizeof(s32):					\
> 			__ret = (s32)((u32)__seq_a - (u32)__seq_b) op 0;\
> 			break;						\
> 		case sizeof(s64):					\
> 			__ret = (s64)((u64)__seq_a - (u64)__seq_b) op 0;\
> 			break;						\
> 		default:						\
> 			__make_build_fail;				\
> 	}								\
> 	__ret;								\
> })
> 
> Would the above work?

Again, unfortunately, no.  The (s8), (s16), (s32), and (s64) casts result
in implementation-defined behavior when the top bit of the difference is
set, just as with (int) above.

To remain within the confines of the standard, the entire computation
needs to be done using unsigned arithmetic.  One way to do this might
be something like the following:

#define U8_MAX		((u8)(~0U))
#define U16_MAX		((u16)(~0U))
#define U32_MAX		((u32)(~0UL))
#define U64_MAX		((u64)(~0ULL))

#define SEQ_TEST(a, b, op)	({					\
	typeof(a) __seq_a = (a);					\
	typeof(b) __seq_b = (b);					\
	bool __ret;							\
	(void)(&__seq_a == &__seq_b);					\
	switch (sizeof(__seq_a)) {					\
		case sizeof(s8):					\
			__ret = (U8_MAX / 2 op (u8)__seq_a - (u8)__seq_b);\
			break;						\
		case sizeof(s16):					\
			__ret = (U16_MAX / 2 op (u16)__seq_a - (u16)__seq_b);\
			break;						\
		case sizeof(s32):					\
			__ret = (U32_MAX / 2 op (u32)__seq_a - (u32)__seq_b);\
			break;						\
		case sizeof(s64):					\
			__ret = (U64_MAX / 2 op (u64)__seq_a - (u64)__seq_b);\
			break;						\
		default:						\
			__make_build_fail;				\
	}								\
	__ret;								\
})

Make sense?

							Thanx, Paul


* Re: [PATCH RFC tip/core/rcu 15/20] rcu: Keep gpnum and completed fields synchronized
  2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 15/20] rcu: Keep gpnum and completed fields synchronized Paul E. McKenney
@ 2010-12-20  2:13   ` Lai Jiangshan
  2010-12-20  2:14     ` Frederic Weisbecker
  2010-12-20 16:51     ` Paul E. McKenney
  0 siblings, 2 replies; 34+ messages in thread
From: Lai Jiangshan @ 2010-12-20  2:13 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, mingo, dipankar, akpm, mathieu.desnoyers, josh,
	niv, tglx, peterz, rostedt, Valdis.Kletnieks, dhowells,
	eric.dumazet, darren, Frederic Weisbecker, Peter Zijlstra

On 12/18/2010 04:54 AM, Paul E. McKenney wrote:
> From: Frederic Weisbecker <fweisbec@gmail.com>
> 
> When a CPU that was in an extended quiescent state wakes
> up and catches up with grace periods that remote CPUs
> completed on its behalf, we update the completed field
> but not the gpnum that keeps a stale value of a backward
> grace period ID.
> 
> Later, note_new_gpnum() will interpret the shift between
> the local CPU and the node grace period ID as some new grace
> period to handle and will then start to hunt quiescent state.
> 
> But if every grace periods have already been completed, this
> interpretation becomes broken. And we'll be stuck in clusters
> of spurious softirqs because rcu_report_qs_rdp() will make
> this broken state run into infinite loop.
> 
> The solution, as suggested by Lai Jiangshan, is to ensure that
> the gpnum and completed fields are well synchronized when we catch
> up with completed grace periods on their behalf by other cpus.
> This way we won't start noting spurious new grace periods.
> 
> Suggested-by: Lai Jiangshan <laijs@cn.fujitsu.com>
> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Cc: Ingo Molnar <mingo@elte.hu>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Cc: Steven Rostedt <rostedt@goodmis.org
> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> ---
>  kernel/rcutree.c |    9 +++++++++
>  1 files changed, 9 insertions(+), 0 deletions(-)
> 
> diff --git a/kernel/rcutree.c b/kernel/rcutree.c
> index 916f42b..8105271 100644
> --- a/kernel/rcutree.c
> +++ b/kernel/rcutree.c
> @@ -680,6 +680,15 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
>  		rdp->completed = rnp->completed;
>  
>  		/*
> +		 * If we were in an extended quiescent state, we may have
> +		 * missed some grace periods that others CPUs took care on
> +		 * our behalf. Catch up with this state to avoid noting
> +		 * spurious new grace periods.
> +		 */
> +		if (rdp->completed > rdp->gpnum)
> +			rdp->gpnum = rdp->completed;

Need to use ULONG_CMP_LT(rdp->gpnum, rdp->completed) instead.

> +
> +		/*
>  		 * If another CPU handled our extended quiescent states and
>  		 * we have no more grace period to complete yet, then stop
>  		 * chasing quiescent states.



* Re: [PATCH RFC tip/core/rcu 15/20] rcu: Keep gpnum and completed fields synchronized
  2010-12-20  2:13   ` Lai Jiangshan
@ 2010-12-20  2:14     ` Frederic Weisbecker
  2010-12-20 16:51     ` Paul E. McKenney
  1 sibling, 0 replies; 34+ messages in thread
From: Frederic Weisbecker @ 2010-12-20  2:14 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: Paul E. McKenney, linux-kernel, mingo, dipankar, akpm,
	mathieu.desnoyers, josh, niv, tglx, peterz, rostedt,
	Valdis.Kletnieks, dhowells, eric.dumazet, darren, Peter Zijlstra

On Mon, Dec 20, 2010 at 10:13:35AM +0800, Lai Jiangshan wrote:
> On 12/18/2010 04:54 AM, Paul E. McKenney wrote:
> > From: Frederic Weisbecker <fweisbec@gmail.com>
> > 
> > When a CPU that was in an extended quiescent state wakes
> > up and catches up with grace periods that remote CPUs
> > completed on its behalf, we update the completed field
> > but not the gpnum that keeps a stale value of a backward
> > grace period ID.
> > 
> > Later, note_new_gpnum() will interpret the shift between
> > the local CPU and the node grace period ID as some new grace
> > period to handle and will then start to hunt quiescent state.
> > 
> > But if every grace periods have already been completed, this
> > interpretation becomes broken. And we'll be stuck in clusters
> > of spurious softirqs because rcu_report_qs_rdp() will make
> > this broken state run into infinite loop.
> > 
> > The solution, as suggested by Lai Jiangshan, is to ensure that
> > the gpnum and completed fields are well synchronized when we catch
> > up with completed grace periods on their behalf by other cpus.
> > This way we won't start noting spurious new grace periods.
> > 
> > Suggested-by: Lai Jiangshan <laijs@cn.fujitsu.com>
> > Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
> > Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > Cc: Ingo Molnar <mingo@elte.hu>
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > Cc: Steven Rostedt <rostedt@goodmis.org
> > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > ---
> >  kernel/rcutree.c |    9 +++++++++
> >  1 files changed, 9 insertions(+), 0 deletions(-)
> > 
> > diff --git a/kernel/rcutree.c b/kernel/rcutree.c
> > index 916f42b..8105271 100644
> > --- a/kernel/rcutree.c
> > +++ b/kernel/rcutree.c
> > @@ -680,6 +680,15 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
> >  		rdp->completed = rnp->completed;
> >  
> >  		/*
> > +		 * If we were in an extended quiescent state, we may have
> > +		 * missed some grace periods that others CPUs took care on
> > +		 * our behalf. Catch up with this state to avoid noting
> > +		 * spurious new grace periods.
> > +		 */
> > +		if (rdp->completed > rdp->gpnum)
> > +			rdp->gpnum = rdp->completed;
> 
> Need to use ULONG_CMP_LT(rdp->gpnum, rdp->completed) instead.

Paul fixed that in:

[PATCH RFC tip/core/rcu 16/20] rcu: fine-tune grace-period begin/end checks


* Re: [PATCH RFC tip/core/rcu 13/20] rcu: increase synchronize_sched_expedited() batching
  2010-12-19  9:43       ` Tejun Heo
  2010-12-19 16:35         ` Paul E. McKenney
@ 2010-12-20 10:31         ` Peter Zijlstra
  2010-12-21  7:58           ` Paul E. McKenney
  1 sibling, 1 reply; 34+ messages in thread
From: Peter Zijlstra @ 2010-12-20 10:31 UTC (permalink / raw)
  To: Tejun Heo
  Cc: paulmck, linux-kernel, mingo, laijs, dipankar, akpm,
	mathieu.desnoyers, josh, niv, tglx, rostedt, Valdis.Kletnieks,
	dhowells, eric.dumazet, darren

On Sun, 2010-12-19 at 10:43 +0100, Tejun Heo wrote:
> 
> I see, then would something like the following work?
> 
>  (int)((unsigned)(a) - (unsigned)(b)) < 0 

Note that there is lots of kernel code that does the above and
variations thereof and expects it to work.




* Re: [PATCH RFC tip/core/rcu 13/20] rcu: increase synchronize_sched_expedited() batching
  2010-12-19 16:35         ` Paul E. McKenney
@ 2010-12-20 10:33           ` Peter Zijlstra
  2010-12-20 13:40             ` Mathieu Desnoyers
  0 siblings, 1 reply; 34+ messages in thread
From: Peter Zijlstra @ 2010-12-20 10:33 UTC (permalink / raw)
  To: paulmck
  Cc: Tejun Heo, linux-kernel, mingo, laijs, dipankar, akpm,
	mathieu.desnoyers, josh, niv, tglx, rostedt, Valdis.Kletnieks,
	dhowells, eric.dumazet, darren

On Sun, 2010-12-19 at 08:35 -0800, Paul E. McKenney wrote:
> >  (int)((unsigned)(a) - (unsigned)(b)) < 0
> 
> Unfortunately, no.  :-(
> 
> The (int) converts from unsigned to signed, and if the upper bit of
> the unsigned difference is non-zero, then the paragraph I quoted above
> applies, and the standard allows the compiler to do whatever it wants.
> 
As noted in the previous reply, that would render quite a lot of our
time-keeping code broken.  I think it's safe to assume this works.

Look at time_after() for example:

#define time_after(a,b)		\
	(typecheck(unsigned long, a) && \
	 typecheck(unsigned long, b) && \
	 ((long)(b) - (long)(a) < 0))
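
For reference, time_after() can be exercised across a wrap in user
space (typecheck() dropped).  Note that the cast is exactly the
conversion Paul's reading of the standard flags as
implementation-defined, so this sketch assumes the usual
two's-complement behavior that the kernel depends on:

#include <stdio.h>

/* User-space rendition of the kernel macro, typecheck() dropped. */
#define time_after(a, b)	((long)(b) - (long)(a) < 0)

int main(void)
{
	unsigned long jiffies = (unsigned long)-10;	/* about to wrap */
	unsigned long timeout = jiffies + 20;		/* wraps to 10 */

	for (int i = 0; i < 40; i++, jiffies++)
		if (time_after(jiffies, timeout)) {
			printf("timed out at jiffies=%lu\n", jiffies);
			break;
		}
	return 0;
}

This prints "timed out at jiffies=11": the timeout fires at the right
moment even though both counters wrapped in between.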



* Re: [PATCH RFC tip/core/rcu 13/20] rcu: increase synchronize_sched_expedited() batching
  2010-12-20 10:33           ` Peter Zijlstra
@ 2010-12-20 13:40             ` Mathieu Desnoyers
  0 siblings, 0 replies; 34+ messages in thread
From: Mathieu Desnoyers @ 2010-12-20 13:40 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, Tejun Heo, linux-kernel, mingo, laijs, dipankar, akpm,
	josh, niv, tglx, rostedt, Valdis.Kletnieks, dhowells,
	eric.dumazet, darren

* Peter Zijlstra (peterz@infradead.org) wrote:
> On Sun, 2010-12-19 at 08:35 -0800, Paul E. McKenney wrote:
> > >  (int)((unsigned)(a) - (unsigned)(b)) < 0
> > 
> > Unfortunately, no.  :-(
> > 
> > The (int) converts from unsigned to signed, and if the upper bit of
> > the unsigned difference is non-zero, then the paragraph I quoted above
> > applies, and the standard allows the compiler to do whatever it wants.
> > 
> As noted in the previous reply, that would render quite a lot of our
> time-keeping code broken. I think its safe to assume this works.
> 
> Look at time_after() for example:
> 
> #define time_after(a,b)		\
> 	(typecheck(unsigned long, a) && \
> 	 typecheck(unsigned long, b) && \
> 	 ((long)(b) - (long)(a) < 0))

I agree with Peter: as long as the difference value is expected not to
overflow a signed long, the time_after() approach should be safe.

Now it depends on whether the usage Paul spotted is expected to have a
difference that overflows a signed int.  It's not clear to me that it's
realistically possible from reading the patch, but I might be missing
something.

And by the way, if the difference is expected to overflow a signed int,
then we're only a factor of two away from overflowing an unsigned int,
so the whole approach would be fragile.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com


* Re: [PATCH RFC tip/core/rcu 15/20] rcu: Keep gpnum and completed fields synchronized
  2010-12-20  2:13   ` Lai Jiangshan
  2010-12-20  2:14     ` Frederic Weisbecker
@ 2010-12-20 16:51     ` Paul E. McKenney
  1 sibling, 0 replies; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-20 16:51 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: linux-kernel, mingo, dipankar, akpm, mathieu.desnoyers, josh,
	niv, tglx, peterz, rostedt, Valdis.Kletnieks, dhowells,
	eric.dumazet, darren, Frederic Weisbecker, Peter Zijlstra

On Mon, Dec 20, 2010 at 10:13:35AM +0800, Lai Jiangshan wrote:
> On 12/18/2010 04:54 AM, Paul E. McKenney wrote:
> > From: Frederic Weisbecker <fweisbec@gmail.com>
> > 
> > When a CPU that was in an extended quiescent state wakes
> > up and catches up with grace periods that remote CPUs
> > completed on its behalf, we update the completed field
> > but not gpnum, which keeps a stale value from a past
> > grace period.
> > 
> > Later, note_new_gpnum() will interpret the shift between
> > the local CPU and the node grace period ID as some new grace
> > period to handle and will then start hunting for quiescent states.
> > 
> > But if every grace period has already been completed, this
> > interpretation becomes broken, and we'll be stuck in clusters
> > of spurious softirqs because rcu_report_qs_rdp() will make
> > this broken state run into an infinite loop.
> > 
> > The solution, as suggested by Lai Jiangshan, is to ensure that
> > the gpnum and completed fields are well synchronized when we
> > catch up with grace periods completed on our behalf by other
> > CPUs. This way we won't start noting spurious new grace periods.
> > 
> > Suggested-by: Lai Jiangshan <laijs@cn.fujitsu.com>
> > Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
> > Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > Cc: Ingo Molnar <mingo@elte.hu>
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > Cc: Steven Rostedt <rostedt@goodmis.org>
> > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > ---
> >  kernel/rcutree.c |    9 +++++++++
> >  1 files changed, 9 insertions(+), 0 deletions(-)
> > 
> > diff --git a/kernel/rcutree.c b/kernel/rcutree.c
> > index 916f42b..8105271 100644
> > --- a/kernel/rcutree.c
> > +++ b/kernel/rcutree.c
> > @@ -680,6 +680,15 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
> >  		rdp->completed = rnp->completed;
> >  
> >  		/*
> > +		 * If we were in an extended quiescent state, we may have
> > +		 * missed some grace periods that other CPUs took care of
> > +		 * on our behalf. Catch up with this state to avoid noting
> > +		 * spurious new grace periods.
> > +		 */
> > +		if (rdp->completed > rdp->gpnum)
> > +			rdp->gpnum = rdp->completed;
> 
> Need to use ULONG_CMP_LT(rdp->gpnum, rdp->completed) instead.

You are quite correct!  And the next patch in this series made exactly
that change.

							Thanx, Paul

> > +
> > +		/*
> >  		 * If another CPU handled our extended quiescent states and
> >  		 * we have no more grace period to complete yet, then stop
> >  		 * chasing quiescent states.
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH RFC tip/core/rcu 13/20] rcu: increase synchronize_sched_expedited() batching
  2010-12-20 10:31         ` Peter Zijlstra
@ 2010-12-21  7:58           ` Paul E. McKenney
  0 siblings, 0 replies; 34+ messages in thread
From: Paul E. McKenney @ 2010-12-21  7:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Tejun Heo, linux-kernel, mingo, laijs, dipankar, akpm,
	mathieu.desnoyers, josh, niv, tglx, rostedt, Valdis.Kletnieks,
	dhowells, eric.dumazet, darren

On Mon, Dec 20, 2010 at 11:31:20AM +0100, Peter Zijlstra wrote:
> On Sun, 2010-12-19 at 10:43 +0100, Tejun Heo wrote:
> > 
> > I see, then would something like the following work?
> > 
> >  (int)((unsigned)(a) - (unsigned)(b)) < 0 
> 
> Note that there is lots of kernel code that does the above and
> variations thereof and expects it to work.

Given that the approach I have been using is the same number of lines
and avoids relying on this expectation, I will stick with it.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2010-12-21  7:58 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-12-17 20:54 [PATCH tip/core/rcu 0/20] second preview of RCU patches for 2.6.38 Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 01/20] rcu: add priority-inversion testing to rcutorture Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 02/20] rcu: move TINY_RCU from softirq to kthread Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 03/20] rcu: priority boosting for TINY_PREEMPT_RCU Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 04/20] rcu: add tracing for TINY_RCU and TINY_PREEMPT_RCU Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 05/20] rcu: document TINY_RCU and TINY_PREEMPT_RCU tracing Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 06/20] rcu: Distinguish between boosting and boosted Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 07/20] rcu: get rid of obsolete "classic" names in TREE_RCU tracing Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 08/20] rcu,cleanup: move synchronize_sched_expedited() out of sched.c Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 09/20] rcu,cleanup: simplify the code when cpu is dying Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 10/20] rcu: update documentation/comments for Lai's adoption patch Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 11/20] rcu: fix race condition in synchronize_sched_expedited() Paul E. McKenney
2010-12-18 15:52   ` Tejun Heo
2010-12-18 19:58     ` Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 12/20] rcu: Make synchronize_srcu_expedited() fast if running readers Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 13/20] rcu: increase synchronize_sched_expedited() batching Paul E. McKenney
2010-12-18 16:13   ` Tejun Heo
2010-12-18 20:14     ` Paul E. McKenney
2010-12-19  9:43       ` Tejun Heo
2010-12-19 16:35         ` Paul E. McKenney
2010-12-20 10:33           ` Peter Zijlstra
2010-12-20 13:40             ` Mathieu Desnoyers
2010-12-20 10:31         ` Peter Zijlstra
2010-12-21  7:58           ` Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 14/20] rcu: Stop chasing QS if another CPU did it for us Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 15/20] rcu: Keep gpnum and completed fields synchronized Paul E. McKenney
2010-12-20  2:13   ` Lai Jiangshan
2010-12-20  2:14     ` Frederic Weisbecker
2010-12-20 16:51     ` Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 16/20] rcu: fine-tune grace-period begin/end checks Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 17/20] rcu: limit rcu_node leaf-level fanout Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 18/20] rcu: reduce __call_rcu()-induced contention on rcu_node structures Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 19/20] rculist: fix borked __list_for_each_rcu() macro Paul E. McKenney
2010-12-17 20:54 ` [PATCH RFC tip/core/rcu 20/20] rcu: remove unused " Paul E. McKenney

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).