* [PATCH RFC] v2 expedited "big hammer" RCU grace periods
@ 2009-04-26  5:23 Paul E. McKenney
  2009-04-26 11:27 ` Ingo Molnar
From: Paul E. McKenney @ 2009-04-26  5:23 UTC (permalink / raw)
  To: linux-kernel, netdev, netfilter-devel
  Cc: mingo, akpm, torvalds, davem, dada1, zbr, jeff.chua.linux,
	paulus, laijs, jengelh, r000n, benh, mathieu.desnoyers

Second cut of "big hammer" expedited RCU grace periods, but only for
rcu_bh.  This creates another softirq vector, so that entering this
softirq vector will have forced an rcu_bh quiescent state (as noted by
Dave Miller).  Use smp_call_function() to invoke raise_softirq() on all
CPUs in order to cause this to happen.  Track the CPUs that have passed
through a quiescent state (or gone offline) with a cpumask.

Does nothing to expedite callbacks already registered with call_rcu_bh(),
but there is no need to.

Passes light rcutorture testing.  Grace periods take about 28 microseconds
on an 8-CPU 1.5GHz Power box.
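
For illustration, a minimal sketch of how an updater might use the new
primitive; the structure and function names (my_table, cur_table,
replace_table) are invented for this example and are not part of the
patch:

	/* Hypothetical updater using the expedited rcu_bh grace period. */
	struct my_table {
		int nrules;
		/* ... rule data ... */
	};

	static struct my_table *cur_table;
	static DEFINE_MUTEX(table_mutex);

	/*
	 * Readers access cur_table under rcu_read_lock_bh() (or otherwise
	 * with bottom halves disabled), so an rcu_bh grace period suffices.
	 */
	static struct my_table *replace_table(struct my_table *new)
	{
		struct my_table *old;

		mutex_lock(&table_mutex);
		old = cur_table;
		rcu_assign_pointer(cur_table, new);
		mutex_unlock(&table_mutex);

		/*
		 * Wait out all pre-existing rcu_bh readers -- microseconds
		 * with this patch rather than full grace-period latency.
		 */
		synchronize_rcu_bh_expedited();

		return old;	/* now safe to free or reuse the old table */
	}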

Shortcomings:

o	Needs more testing, still not for inclusion.

o	Does not handle rcu, only rcu_bh.

Changes since v1:

o	Added rcutorture support, and added exports required by
	rcutorture.

o	Added comment stating that smp_call_function() implies a
	memory barrier, suggested by Mathieu.

o	Added #include for delay.h.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---

 include/linux/interrupt.h |    1 
 include/linux/rcupdate.h  |    2 
 kernel/rcupdate.c         |  115 +++++++++++++++++++++++++
 kernel/rcutorture.c       |  205 +++++++++++++++++++++++++---------------------
 4 files changed, 231 insertions(+), 92 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 91bb76f..b7b58cc 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -338,6 +338,7 @@ enum
 	TASKLET_SOFTIRQ,
 	SCHED_SOFTIRQ,
 	HRTIMER_SOFTIRQ,
+	RCU_EXPEDITED_SOFTIRQ,
 	RCU_SOFTIRQ,	/* Preferable RCU should always be the last softirq */
 
 	NR_SOFTIRQS
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 15fbb3c..a606b42 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -264,6 +264,8 @@ extern void synchronize_rcu(void);
 extern void rcu_barrier(void);
 extern void rcu_barrier_bh(void);
 extern void rcu_barrier_sched(void);
+extern void synchronize_rcu_bh_expedited(void);
+extern long rcu_batches_completed_bh_expedited(void);
 
 /* Internal to kernel */
 extern void rcu_init(void);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a967c9f..e995e73 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,6 +45,7 @@
 #include <linux/mutex.h>
 #include <linux/module.h>
 #include <linux/kernel_stat.h>
+#include <linux/delay.h>
 
 enum rcu_barrier {
 	RCU_BARRIER_STD,
@@ -217,10 +218,124 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
+static DEFINE_MUTEX(synchronize_rcu_bh_mutex);
+static long synchronize_rcu_bh_completed; /* Expedited-grace-period count. */
+
+long rcu_batches_completed_bh_expedited(void)
+{
+	return synchronize_rcu_bh_completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh_expedited);
+
+#ifndef CONFIG_SMP
+
+static void __init synchronize_rcu_expedited_init(void)
+{
+}
+
+void synchronize_rcu_bh_expedited(void)
+{
+	mutex_lock(&synchronize_rcu_bh_mutex);
+	synchronize_rcu_bh_completed++;
+	mutex_unlock(&synchronize_rcu_bh_mutex);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_bh_expedited);
+
+#else /* #ifndef CONFIG_SMP */
+
+static DEFINE_PER_CPU(int, rcu_bh_need_qs);
+static cpumask_var_t rcu_bh_waiting_map;
+
+static void synchronize_rcu_bh_expedited_help(struct softirq_action *unused)
+{
+	if (__get_cpu_var(rcu_bh_need_qs)) {
+		smp_mb();
+		__get_cpu_var(rcu_bh_need_qs) = 0;
+		smp_mb();
+	}
+}
+
+static void rcu_bh_fast_qs(void *unused)
+{
+	raise_softirq(RCU_EXPEDITED_SOFTIRQ);
+}
+
+static void __init synchronize_rcu_expedited_init(void)
+{
+	open_softirq(RCU_EXPEDITED_SOFTIRQ, synchronize_rcu_bh_expedited_help);
+	alloc_bootmem_cpumask_var(&rcu_bh_waiting_map);
+}
+
+void synchronize_rcu_bh_expedited(void)
+{
+	int cpu;
+	int done;
+	int times = 0;
+
+	mutex_lock(&synchronize_rcu_bh_mutex);
+
+	/* Take snapshot of online CPUs, blocking CPU hotplug. */
+	preempt_disable();
+	cpumask_copy(rcu_bh_waiting_map, &cpu_online_map);
+	preempt_enable();
+
+	/* Mark each online CPU as needing a quiescent state. */
+	for_each_cpu(cpu, rcu_bh_waiting_map)
+		per_cpu(rcu_bh_need_qs, cpu) = 1;
+
+	/* Call for a quiescent state on each online CPU. */
+	preempt_disable();
+	cpumask_clear_cpu(smp_processor_id(), rcu_bh_waiting_map);
+	smp_call_function(rcu_bh_fast_qs, NULL, 1); /* implies smp_mb(). */
+	preempt_enable();
+
+	/*
+	 * Loop waiting for each CPU to either pass through a quiescent
+	 * state or to go offline.  We don't care which.
+	 */
+	for (;;) {
+		
+		/* Ignore CPUs that have gone offline, blocking CPU hotplug. */
+		preempt_disable();
+		cpumask_and(rcu_bh_waiting_map, rcu_bh_waiting_map,
+			    &cpu_online_map);
+		cpumask_clear_cpu(smp_processor_id(), rcu_bh_waiting_map);
+		preempt_enable();
+
+		/* Check if any CPUs still need a quiescent state. */
+		done = 1;
+		for_each_cpu(cpu, rcu_bh_waiting_map) {
+			if (per_cpu(rcu_bh_need_qs, cpu)) {
+				done = 0;
+				break;
+			}
+			cpumask_clear_cpu(cpu, rcu_bh_waiting_map);
+		}
+		if (done)
+			break;
+
+		/*
+		 * Wait a bit.  If we have already waited a fair
+		 * amount of time, sleep.
+		 */
+		if (++times < 10)
+			udelay(10 * times);
+		else
+			schedule_timeout_uninterruptible(1);
+	}
+
+	synchronize_rcu_bh_completed++;
+	mutex_unlock(&synchronize_rcu_bh_mutex);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_bh_expedited);
+
+#endif /* #else #ifndef CONFIG_SMP */
+
 void __init rcu_init(void)
 {
 	__rcu_init();
 	hotcpu_notifier(rcu_barrier_cpu_hotplug, 0);
+	synchronize_rcu_expedited_init();
 }
 
 void rcu_scheduler_starting(void)
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9b4a975..8845936 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -257,14 +257,14 @@ struct rcu_torture_ops {
 	void (*init)(void);
 	void (*cleanup)(void);
 	int (*readlock)(void);
-	void (*readdelay)(struct rcu_random_state *rrsp);
+	void (*read_delay)(struct rcu_random_state *rrsp);
 	void (*readunlock)(int idx);
 	int (*completed)(void);
-	void (*deferredfree)(struct rcu_torture *p);
+	void (*deferred_free)(struct rcu_torture *p);
 	void (*sync)(void);
 	void (*cb_barrier)(void);
 	int (*stats)(char *page);
-	int irqcapable;
+	int irq_capable;
 	char *name;
 };
 static struct rcu_torture_ops *cur_ops = NULL;
@@ -320,7 +320,7 @@ rcu_torture_cb(struct rcu_head *p)
 		rp->rtort_mbtest = 0;
 		rcu_torture_free(rp);
 	} else
-		cur_ops->deferredfree(rp);
+		cur_ops->deferred_free(rp);
 }
 
 static void rcu_torture_deferred_free(struct rcu_torture *p)
@@ -329,18 +329,18 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
 }
 
 static struct rcu_torture_ops rcu_ops = {
-	.init = NULL,
-	.cleanup = NULL,
-	.readlock = rcu_torture_read_lock,
-	.readdelay = rcu_read_delay,
-	.readunlock = rcu_torture_read_unlock,
-	.completed = rcu_torture_completed,
-	.deferredfree = rcu_torture_deferred_free,
-	.sync = synchronize_rcu,
-	.cb_barrier = rcu_barrier,
-	.stats = NULL,
-	.irqcapable = 1,
-	.name = "rcu"
+	.init		= NULL,
+	.cleanup	= NULL,
+	.readlock	= rcu_torture_read_lock,
+	.read_delay	= rcu_read_delay,
+	.readunlock	= rcu_torture_read_unlock,
+	.completed	= rcu_torture_completed,
+	.deferred_free	= rcu_torture_deferred_free,
+	.sync		= synchronize_rcu,
+	.cb_barrier	= rcu_barrier,
+	.stats		= NULL,
+	.irq_capable 	= 1,
+	.name 		= "rcu"
 };
 
 static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
@@ -370,18 +370,18 @@ static void rcu_sync_torture_init(void)
 }
 
 static struct rcu_torture_ops rcu_sync_ops = {
-	.init = rcu_sync_torture_init,
-	.cleanup = NULL,
-	.readlock = rcu_torture_read_lock,
-	.readdelay = rcu_read_delay,
-	.readunlock = rcu_torture_read_unlock,
-	.completed = rcu_torture_completed,
-	.deferredfree = rcu_sync_torture_deferred_free,
-	.sync = synchronize_rcu,
-	.cb_barrier = NULL,
-	.stats = NULL,
-	.irqcapable = 1,
-	.name = "rcu_sync"
+	.init		= rcu_sync_torture_init,
+	.cleanup	= NULL,
+	.readlock	= rcu_torture_read_lock,
+	.read_delay	= rcu_read_delay,
+	.readunlock	= rcu_torture_read_unlock,
+	.completed	= rcu_torture_completed,
+	.deferred_free	= rcu_sync_torture_deferred_free,
+	.sync		= synchronize_rcu,
+	.cb_barrier	= NULL,
+	.stats		= NULL,
+	.irq_capable	= 1,
+	.name		= "rcu_sync"
 };
 
 /*
@@ -432,33 +432,53 @@ static void rcu_bh_torture_synchronize(void)
 }
 
 static struct rcu_torture_ops rcu_bh_ops = {
-	.init = NULL,
-	.cleanup = NULL,
-	.readlock = rcu_bh_torture_read_lock,
-	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
-	.readunlock = rcu_bh_torture_read_unlock,
-	.completed = rcu_bh_torture_completed,
-	.deferredfree = rcu_bh_torture_deferred_free,
-	.sync = rcu_bh_torture_synchronize,
-	.cb_barrier = rcu_barrier_bh,
-	.stats = NULL,
-	.irqcapable = 1,
-	.name = "rcu_bh"
+	.init		= NULL,
+	.cleanup	= NULL,
+	.readlock	= rcu_bh_torture_read_lock,
+	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
+	.readunlock	= rcu_bh_torture_read_unlock,
+	.completed	= rcu_bh_torture_completed,
+	.deferred_free	= rcu_bh_torture_deferred_free,
+	.sync		= rcu_bh_torture_synchronize,
+	.cb_barrier	= rcu_barrier_bh,
+	.stats		= NULL,
+	.irq_capable	= 1,
+	.name		= "rcu_bh"
 };
 
 static struct rcu_torture_ops rcu_bh_sync_ops = {
-	.init = rcu_sync_torture_init,
-	.cleanup = NULL,
-	.readlock = rcu_bh_torture_read_lock,
-	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
-	.readunlock = rcu_bh_torture_read_unlock,
-	.completed = rcu_bh_torture_completed,
-	.deferredfree = rcu_sync_torture_deferred_free,
-	.sync = rcu_bh_torture_synchronize,
-	.cb_barrier = NULL,
-	.stats = NULL,
-	.irqcapable = 1,
-	.name = "rcu_bh_sync"
+	.init		= rcu_sync_torture_init,
+	.cleanup	= NULL,
+	.readlock	= rcu_bh_torture_read_lock,
+	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
+	.readunlock	= rcu_bh_torture_read_unlock,
+	.completed	= rcu_bh_torture_completed,
+	.deferred_free	= rcu_sync_torture_deferred_free,
+	.sync		= rcu_bh_torture_synchronize,
+	.cb_barrier	= NULL,
+	.stats		= NULL,
+	.irq_capable	= 1,
+	.name		= "rcu_bh_sync"
+};
+
+static int rcu_bh_expedited_torture_completed(void)
+{
+	return rcu_batches_completed_bh_expedited();
+}
+
+static struct rcu_torture_ops rcu_bh_expedited_ops = {
+	.init		= rcu_sync_torture_init,
+	.cleanup	= NULL,
+	.readlock	= rcu_bh_torture_read_lock,
+	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
+	.readunlock	= rcu_bh_torture_read_unlock,
+	.completed	= rcu_bh_expedited_torture_completed,
+	.deferred_free	= rcu_sync_torture_deferred_free,
+	.sync		= synchronize_rcu_bh_expedited,
+	.cb_barrier	= NULL,
+	.stats		= NULL,
+	.irq_capable	= 1,
+	.name		= "rcu_bh_expedited"
 };
 
 /*
@@ -530,17 +550,17 @@ static int srcu_torture_stats(char *page)
 }
 
 static struct rcu_torture_ops srcu_ops = {
-	.init = srcu_torture_init,
-	.cleanup = srcu_torture_cleanup,
-	.readlock = srcu_torture_read_lock,
-	.readdelay = srcu_read_delay,
-	.readunlock = srcu_torture_read_unlock,
-	.completed = srcu_torture_completed,
-	.deferredfree = rcu_sync_torture_deferred_free,
-	.sync = srcu_torture_synchronize,
-	.cb_barrier = NULL,
-	.stats = srcu_torture_stats,
-	.name = "srcu"
+	.init		= srcu_torture_init,
+	.cleanup	= srcu_torture_cleanup,
+	.readlock	= srcu_torture_read_lock,
+	.read_delay	= srcu_read_delay,
+	.readunlock	= srcu_torture_read_unlock,
+	.completed	= srcu_torture_completed,
+	.deferred_free	= rcu_sync_torture_deferred_free,
+	.sync		= srcu_torture_synchronize,
+	.cb_barrier	= NULL,
+	.stats		= srcu_torture_stats,
+	.name		= "srcu"
 };
 
 /*
@@ -574,32 +594,32 @@ static void sched_torture_synchronize(void)
 }
 
 static struct rcu_torture_ops sched_ops = {
-	.init = rcu_sync_torture_init,
-	.cleanup = NULL,
-	.readlock = sched_torture_read_lock,
-	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
-	.readunlock = sched_torture_read_unlock,
-	.completed = sched_torture_completed,
-	.deferredfree = rcu_sched_torture_deferred_free,
-	.sync = sched_torture_synchronize,
-	.cb_barrier = rcu_barrier_sched,
-	.stats = NULL,
-	.irqcapable = 1,
-	.name = "sched"
+	.init		= rcu_sync_torture_init,
+	.cleanup	= NULL,
+	.readlock	= sched_torture_read_lock,
+	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
+	.readunlock	= sched_torture_read_unlock,
+	.completed	= sched_torture_completed,
+	.deferred_free	= rcu_sched_torture_deferred_free,
+	.sync		= sched_torture_synchronize,
+	.cb_barrier	= rcu_barrier_sched,
+	.stats		= NULL,
+	.irq_capable	= 1,
+	.name		= "sched"
 };
 
 static struct rcu_torture_ops sched_ops_sync = {
-	.init = rcu_sync_torture_init,
-	.cleanup = NULL,
-	.readlock = sched_torture_read_lock,
-	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
-	.readunlock = sched_torture_read_unlock,
-	.completed = sched_torture_completed,
-	.deferredfree = rcu_sync_torture_deferred_free,
-	.sync = sched_torture_synchronize,
-	.cb_barrier = NULL,
-	.stats = NULL,
-	.name = "sched_sync"
+	.init		= rcu_sync_torture_init,
+	.cleanup	= NULL,
+	.readlock	= sched_torture_read_lock,
+	.read_delay	= rcu_read_delay,  /* just reuse rcu's version. */
+	.readunlock	= sched_torture_read_unlock,
+	.completed	= sched_torture_completed,
+	.deferred_free	= rcu_sync_torture_deferred_free,
+	.sync		= sched_torture_synchronize,
+	.cb_barrier	= NULL,
+	.stats		= NULL,
+	.name		= "sched_sync"
 };
 
 /*
@@ -635,7 +655,7 @@ rcu_torture_writer(void *arg)
 				i = RCU_TORTURE_PIPE_LEN;
 			atomic_inc(&rcu_torture_wcount[i]);
 			old_rp->rtort_pipe_count++;
-			cur_ops->deferredfree(old_rp);
+			cur_ops->deferred_free(old_rp);
 		}
 		rcu_torture_current_version++;
 		oldbatch = cur_ops->completed();
@@ -700,7 +720,7 @@ static void rcu_torture_timer(unsigned long unused)
 	if (p->rtort_mbtest == 0)
 		atomic_inc(&n_rcu_torture_mberror);
 	spin_lock(&rand_lock);
-	cur_ops->readdelay(&rand);
+	cur_ops->read_delay(&rand);
 	n_rcu_torture_timers++;
 	spin_unlock(&rand_lock);
 	preempt_disable();
@@ -738,11 +758,11 @@ rcu_torture_reader(void *arg)
 
 	VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
 	set_user_nice(current, 19);
-	if (irqreader && cur_ops->irqcapable)
+	if (irqreader && cur_ops->irq_capable)
 		setup_timer_on_stack(&t, rcu_torture_timer, 0);
 
 	do {
-		if (irqreader && cur_ops->irqcapable) {
+		if (irqreader && cur_ops->irq_capable) {
 			if (!timer_pending(&t))
 				mod_timer(&t, 1);
 		}
@@ -757,7 +777,7 @@ rcu_torture_reader(void *arg)
 		}
 		if (p->rtort_mbtest == 0)
 			atomic_inc(&n_rcu_torture_mberror);
-		cur_ops->readdelay(&rand);
+		cur_ops->read_delay(&rand);
 		preempt_disable();
 		pipe_count = p->rtort_pipe_count;
 		if (pipe_count > RCU_TORTURE_PIPE_LEN) {
@@ -778,7 +798,7 @@ rcu_torture_reader(void *arg)
 	} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
 	VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
 	rcutorture_shutdown_absorb("rcu_torture_reader");
-	if (irqreader && cur_ops->irqcapable)
+	if (irqreader && cur_ops->irq_capable)
 		del_timer_sync(&t);
 	while (!kthread_should_stop())
 		schedule_timeout_uninterruptible(1);
@@ -1078,6 +1098,7 @@ rcu_torture_init(void)
 	int firsterr = 0;
 	static struct rcu_torture_ops *torture_ops[] =
 		{ &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
+		  &rcu_bh_expedited_ops,
 		  &srcu_ops, &sched_ops, &sched_ops_sync, };
 
 	mutex_lock(&fullstop_mutex);


* Re: [PATCH RFC] v2 expedited "big hammer" RCU grace periods
  2009-04-26  5:23 [PATCH RFC] v2 expedited "big hammer" RCU grace periods Paul E. McKenney
@ 2009-04-26 11:27 ` Ingo Molnar
  2009-04-26 19:13   ` Mathieu Desnoyers
  2009-04-26 20:54   ` Paul E. McKenney
From: Ingo Molnar @ 2009-04-26 11:27 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, netdev, netfilter-devel, akpm, torvalds, davem,
	dada1, zbr, jeff.chua.linux, paulus, laijs, jengelh, r000n, benh,
	mathieu.desnoyers


* Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:

> Second cut of "big hammer" expedited RCU grace periods, but only 
> for rcu_bh.  This creates another softirq vector, so that entering 
> this softirq vector will have forced an rcu_bh quiescent state (as 
> noted by Dave Miller).  Use smp_call_function() to invoke 
> raise_softirq() on all CPUs in order to cause this to happen.  
> Track the CPUs that have passed through a quiescent state (or gone 
> offline) with a cpumask.

hm, i'm still asking whether doing this would be simpler via a 
reschedule vector - which not only is an existing facility but also 
forces all RCU domains through a quiescent state - not just bh-RCU 
participants.

Triggering a new softirq is in no way simpler than doing an SMP 
cross-call - in fact softirqs are a finite resource so using some 
other facility would be preferred.

Am i missing something?

	Ingo


* Re: [PATCH RFC] v2 expedited "big hammer" RCU grace periods
  2009-04-26 11:27 ` Ingo Molnar
@ 2009-04-26 19:13   ` Mathieu Desnoyers
  2009-04-26 20:22     ` Ingo Molnar
  2009-04-26 20:54   ` Paul E. McKenney
From: Mathieu Desnoyers @ 2009-04-26 19:13 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Paul E. McKenney, linux-kernel, netdev, netfilter-devel, akpm,
	torvalds, davem, dada1, zbr, jeff.chua.linux, paulus, laijs,
	jengelh, r000n, benh

* Ingo Molnar (mingo@elte.hu) wrote:
> 
> * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> 
> > Second cut of "big hammer" expedited RCU grace periods, but only 
> > for rcu_bh.  This creates another softirq vector, so that entering 
> > this softirq vector will have forced an rcu_bh quiescent state (as 
> > noted by Dave Miller).  Use smp_call_function() to invoke 
> > raise_softirq() on all CPUs in order to cause this to happen.  
> > Track the CPUs that have passed through a quiescent state (or gone 
> > offline) with a cpumask.
> 
> hm, i'm still asking whether doing this would be simpler via a 
> reschedule vector - which not only is an existing facility but also 
> forces all RCU domains through a quiescent state - not just bh-RCU 
> participants.
> 
> Triggering a new softirq is in no way simpler that doing an SMP 
> cross-call - in fact softirqs are a finite resource so using some 
> other facility would be preferred.
> 
> Am i missing something?
> 

I think the reason for this whole thread is that waiting for an rcu
quiescent state, when done many times, e.g. across multiple iptables
invocations, takes too long (5 seconds to load the netfilter rules at
boot).  The three solutions proposed so far are:

- bh disabling + per-cpu read-write lock
- RCU FGP (fast grace periods), where the writer directly checks each
  per-cpu variable associated with netfilter to make sure the quiescent
  state for a particular resource has been reached (derived from my
  userspace RCU implementation)
- expedited "big hammer" rcu GP, where the writer only waits for a bh
  quiescent state.  This is useful if we can guarantee that all readers
  either run in bh context or disable bottom halves.

Therefore, it is really on purpose that Paul does not wait for global RCU
quiescent states, but rather just for bh: it's faster.

IMHO, the bh rcu GP shares the same problem as the global RCU GP
approach: it monitors global kernel state to ensure quiescence.  It's
better in practice because bh quiescent states are much more frequent
than global QS, but the maximum writer delay still depends on the
duration of every other bh handler and bh-disabled section.  One might
argue that if we keep those small, this should not matter in practice.

The RCU FGP approach is interesting because it relies solely on
netfilter-specific per-cpu variables to detect QS.  Therefore, even if
an unrelated piece of kernel software eventually decides to be a bad
citizen and disables bh for long periods on a 4096-cpu box, it won't
slow down netfilter table updates.

This last positive aspect of RCU FGP is shared with the bh disabling +
per-cpu rwlock approach, where the rwlock is also local to netfilter.
However, taking an rwlock and disabling bh makes the read side much
slower than RCU FGP (which simply disables preemption and touches a
per-cpu GP/nesting count variable).  But given that RCU FGP is
relatively new, it makes sense to use a known-good solution in the
short term.
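
To make the read-side cost comparison concrete, here is a rough sketch
of the two fast paths being contrasted above; this is not the actual
RCU FGP or netfilter code, just an approximation (per-cpu lock
initialization, memory barriers, and the writer-side polling are
omitted):

	/* (1) bh disabling + per-cpu read-write lock. */
	static DEFINE_PER_CPU(rwlock_t, nf_rwlock);	/* rwlock_init() at boot */

	static inline void nf_read_lock(void)
	{
		local_bh_disable();
		read_lock(&__get_cpu_var(nf_rwlock));
	}

	static inline void nf_read_unlock(void)
	{
		read_unlock(&__get_cpu_var(nf_rwlock));
		local_bh_enable();
	}

	/*
	 * (2) FGP-style read side (approximation): only preempt_disable()
	 * plus a per-cpu counter that the writer polls.  The real scheme
	 * uses a GP/nesting encoding and memory barriers; simplified here.
	 */
	static DEFINE_PER_CPU(unsigned long, nf_reader_gp);

	static inline void nf_fgp_read_lock(void)
	{
		preempt_disable();
		__get_cpu_var(nf_reader_gp)++;		/* odd: reader active */
		barrier();
	}

	static inline void nf_fgp_read_unlock(void)
	{
		barrier();
		__get_cpu_var(nf_reader_gp)++;		/* even: reader idle */
		preempt_enable();
	}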

Mathieu

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68


* Re: [PATCH RFC] v2 expedited "big hammer" RCU grace periods
  2009-04-26 19:13   ` Mathieu Desnoyers
@ 2009-04-26 20:22     ` Ingo Molnar
  2009-04-26 21:44       ` Paul E. McKenney
From: Ingo Molnar @ 2009-04-26 20:22 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Paul E. McKenney, linux-kernel, netdev, netfilter-devel, akpm,
	torvalds, davem, dada1, zbr, jeff.chua.linux, paulus, laijs,
	jengelh, r000n, benh


* Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:

> * Ingo Molnar (mingo@elte.hu) wrote:
> > 
> > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> > 
> > > Second cut of "big hammer" expedited RCU grace periods, but only 
> > > for rcu_bh.  This creates another softirq vector, so that entering 
> > > this softirq vector will have forced an rcu_bh quiescent state (as 
> > > noted by Dave Miller).  Use smp_call_function() to invoke 
> > > raise_softirq() on all CPUs in order to cause this to happen.  
> > > Track the CPUs that have passed through a quiescent state (or gone 
> > > offline) with a cpumask.
> > 
> > hm, i'm still asking whether doing this would be simpler via a 
> > reschedule vector - which not only is an existing facility but also 
> > forces all RCU domains through a quiescent state - not just bh-RCU 
> > participants.
> > 
> > Triggering a new softirq is in no way simpler that doing an SMP 
> > cross-call - in fact softirqs are a finite resource so using some 
> > other facility would be preferred.
> > 
> > Am i missing something?
> > 
> 
> I think the reason for this whole thread is that waiting for rcu 
> quiescent state, when called many times e.g. in multiple iptables 
> invokations, takes too longs (5 seconds to load the netfilter 
> rules at boot). [...]

I'm aware of the problem space.

I was suggesting that to trigger the quiescent state and to wait for 
it to propagate it would be enough to reuse the reschedule 
mechanism.

It would be relatively straightforward: first a send-reschedule then 
do a wait_task_context_switch() on rq->curr - both are existing 
primitives. (a task reference has to be taken but that's pretty much 
all)

By the time wait_task_context_switch() returns from the last CPU we 
know that the quiescent state has passed.
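
For concreteness, a rough sketch of that sequence -- written as if it
lived in kernel/sched.c where cpu_curr() and the runqueues are visible,
with locking, task-exit, and CPU-hotplug details glossed over:

	/*
	 * Sketch only: force a grace period by making every CPU context
	 * switch.  Assumes access to cpu_curr(), i.e. kernel/sched.c;
	 * hotplug and task-exit races are ignored.
	 */
	void synchronize_sched_expedited_resched(void)
	{
		struct task_struct *p;
		int cpu;

		for_each_online_cpu(cpu) {
			if (cpu == raw_smp_processor_id())
				continue;	/* the caller supplies its own QS */

			/* Pin down whatever is currently running there. */
			rcu_read_lock();
			p = cpu_curr(cpu);
			get_task_struct(p);
			rcu_read_unlock();

			/*
			 * Kick the CPU, then wait until that task has been
			 * switched out at least once -- a quiescent state.
			 */
			smp_send_reschedule(cpu);
			wait_task_context_switch(p);

			put_task_struct(p);
		}
	}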

	Ingo


* Re: [PATCH RFC] v2 expedited "big hammer" RCU grace periods
  2009-04-26 11:27 ` Ingo Molnar
  2009-04-26 19:13   ` Mathieu Desnoyers
@ 2009-04-26 20:54   ` Paul E. McKenney
From: Paul E. McKenney @ 2009-04-26 20:54 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, netdev, netfilter-devel, akpm, torvalds, davem,
	dada1, zbr, jeff.chua.linux, paulus, laijs, jengelh, r000n, benh,
	mathieu.desnoyers, tglx, rostedt

On Sun, Apr 26, 2009 at 01:27:17PM +0200, Ingo Molnar wrote:
> 
> * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> 
> > Second cut of "big hammer" expedited RCU grace periods, but only 
> > for rcu_bh.  This creates another softirq vector, so that entering 
> > this softirq vector will have forced an rcu_bh quiescent state (as 
> > noted by Dave Miller).  Use smp_call_function() to invoke 
> > raise_softirq() on all CPUs in order to cause this to happen.  
> > Track the CPUs that have passed through a quiescent state (or gone 
> > offline) with a cpumask.
> 
> hm, i'm still asking whether doing this would be simpler via a 
> reschedule vector - which not only is an existing facility but also 
> forces all RCU domains through a quiescent state - not just bh-RCU 
> participants.
> 
> Triggering a new softirq is in no way simpler that doing an SMP 
> cross-call - in fact softirqs are a finite resource so using some 
> other facility would be preferred.
> 
> Am i missing something?

Well, it is entirely possible that I am the one missing something.

So, here is the line of reasoning that lead me to the bh-RCU approach:

o	The two flavors of RCU that can support an off-to-the-side
	expedited implementation are RCU-bh and RCU-sched.  Preemptable
	RCU requires a more intrusive approach for normal RCU, due to
	the fact that RCU readers can be preempted and can block on locks.
	Therefore, forcing a reschedule on each CPU does not force a
	grace period for preemptable RCU.

	Of course, there is an easy workaround -- for preemptable
	RCU, make the expedited primitive just directly invoke
	synchronize_rcu().  Although this would not provide any speedup,
	it would at least guarantee correct operation.	But I believe
	that we need to have a way to expedite grace periods on -rt
	kernels with preemptable RCU as well as on non-real-time kernels.

o	As you say, an RCU-sched grace period implies an RCU-bh grace
	period on non-realtime kernels.  Unfortunately, for -rt kernels,
	softirq handlers can be preempted and can block while waiting
	for locks, so forcing a reschedule on each CPU does not force
	a grace period for RCU-bh in a -rt kernel.

	Again, there is an easy workaround: in CONFIG_PREEMPT_RT
	kernels, make the RCU-bh variant of the expedited primitive
	invoke a new synchronize_rcu_bh() primitive.

	Of course, allowing an RCU-sched grace period to imply an RCU-bh
	grace period loses the DoS-resistance advantages of RCU-bh.
	However, very few of the RCU updates in the kernel take
	advantage of DoS resistance.  Furthermore, Steve's patch did
	not use RCU-bh, so one could argue that we should forget about
	DoS-resistance for the time being.  Thoughts?

o	The approach in the previous patch works across all kernel
	builds, because of the fact that it forces a new softirq handler
	to run, thus guaranteeing that all prior softirq handlers and
	RCU-bh read-side critical sections for the CPU in question
	have completed.

o	I used a new softirq vector out of laziness.  I could instead
	raise RCU_SOFTIRQ, and then add code to each of the
	rcu_process_callbacks() functions to ack the expedited
	raise_softirq().

	Easy for me to change, though.  I guess I don't have to be
	-that- lazy.  ;-)

o	So, why RCU-bh rather than RCU-sched?

	Again, laziness.  The RCU-sched approach requires greater
	intrusiveness into the existing RCU implementations.  Nothing
	wrong with that, given that this is in fact another RCU API
	member, but given the choice, I would rather do the intruding
	after dropping Classic RCU.

	The easiest way I could see to minimize intrusion for RCU-sched
	is to create a new per-CPU counter that is incremented by each
	implementation of rcu_qsctr_inc().  But it is even easier to
	avoid the rcu_qsctr_inc() code path entirely.  (A rough sketch
	of the counter-based variant appears below.)

Once we have dropped Classic RCU and I have merged Preemptable RCU into
Hierarchical RCU, it becomes much more attractive to merge the expediting
into the main RCU state machine.
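
For reference, a rough sketch of the per-CPU-counter variant mentioned
above; names are invented, and CPU-hotplug handling and the question of
how to force a reschedule on an uncooperative CPU are glossed over:

	/*
	 * Sketch: expedited RCU-sched built on a per-CPU quiescent-state
	 * counter.  Each flavor's rcu_qsctr_inc() would additionally do
	 * per_cpu(rcu_expedited_qsctr, cpu)++.
	 */
	static DEFINE_PER_CPU(unsigned long, rcu_expedited_qsctr);
	static DEFINE_MUTEX(rcu_sched_expedited_mutex);

	void synchronize_sched_expedited_qsctr(void)
	{
		static unsigned long snap[NR_CPUS];	/* protected by the mutex */
		int cpu;

		mutex_lock(&rcu_sched_expedited_mutex);

		/* Pass 1: snapshot each CPU's quiescent-state count. */
		for_each_online_cpu(cpu)
			snap[cpu] = per_cpu(rcu_expedited_qsctr, cpu);

		/* Ask every other CPU to pass through the scheduler. */
		for_each_online_cpu(cpu)
			if (cpu != raw_smp_processor_id())
				smp_send_reschedule(cpu);

		/*
		 * Pass 2: wait for each counter to advance or for the CPU
		 * to go offline -- either way that CPU is accounted for.
		 */
		for_each_online_cpu(cpu)
			while (cpu_online(cpu) &&
			       per_cpu(rcu_expedited_qsctr, cpu) == snap[cpu])
				schedule_timeout_uninterruptible(1);

		smp_mb();  /* Callers' later accesses follow the grace period. */
		mutex_unlock(&rcu_sched_expedited_mutex);
	}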

Thoughts?

							Thanx, Paul


* Re: [PATCH RFC] v2 expedited "big hammer" RCU grace periods
  2009-04-26 20:22     ` Ingo Molnar
@ 2009-04-26 21:44       ` Paul E. McKenney
  2009-04-27  3:26         ` Ingo Molnar
From: Paul E. McKenney @ 2009-04-26 21:44 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mathieu Desnoyers, linux-kernel, netdev, netfilter-devel, akpm,
	torvalds, davem, dada1, zbr, jeff.chua.linux, paulus, laijs,
	jengelh, r000n, benh

On Sun, Apr 26, 2009 at 10:22:55PM +0200, Ingo Molnar wrote:
> 
> * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:
> 
> > * Ingo Molnar (mingo@elte.hu) wrote:
> > > 
> > > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> > > 
> > > > Second cut of "big hammer" expedited RCU grace periods, but only 
> > > > for rcu_bh.  This creates another softirq vector, so that entering 
> > > > this softirq vector will have forced an rcu_bh quiescent state (as 
> > > > noted by Dave Miller).  Use smp_call_function() to invoke 
> > > > raise_softirq() on all CPUs in order to cause this to happen.  
> > > > Track the CPUs that have passed through a quiescent state (or gone 
> > > > offline) with a cpumask.
> > > 
> > > hm, i'm still asking whether doing this would be simpler via a 
> > > reschedule vector - which not only is an existing facility but also 
> > > forces all RCU domains through a quiescent state - not just bh-RCU 
> > > participants.
> > > 
> > > Triggering a new softirq is in no way simpler that doing an SMP 
> > > cross-call - in fact softirqs are a finite resource so using some 
> > > other facility would be preferred.
> > > 
> > > Am i missing something?
> > > 
> > 
> > I think the reason for this whole thread is that waiting for rcu 
> > quiescent state, when called many times e.g. in multiple iptables 
> > invokations, takes too longs (5 seconds to load the netfilter 
> > rules at boot). [...]
> 
> I'm aware of the problem space.
> 
> I was suggesting that to trigger the quiescent state and to wait for 
> it to propagate it would be enough to reuse the reschedule 
> mechanism.
> 
> It would be relatively straightforward: first a send-reschedule then 
> do a wait_task_context_switch() on rq->curr - both are existing 
> primitives. (a task reference has to be taken but that's pretty much 
> all)

Well, one reason I didn't take this approach was that I didn't happen
to think of it.  ;-)

Also that I hadn't heard of wait_task_context_switch().

Hmmm...  Looking for wait_task_context_switch().  OK, found it.

It looks to me that this primitive won't return until the scheduler
actually decides to run something else.  We instead need to have
something that stops waiting once the CPU enters the scheduler, hence
the previous thought of making rcu_qsctr_inc() do a bit of extra work.

This would be a way of making an expedited RCU-sched across all
RCU implementations.  As noted in the earlier email, it would not
handle RCU or RCU-bh in a -rt kernel.

> By the time wait_task_context_switch() returns from the last CPU we 
> know that the quiescent state has passed.

We would want to wait for all of the CPUs in parallel, though, wouldn't
we?  Seems that we would not want to wait for the last CPU to do another
trip through the scheduler if it had already passed through the scheduler
while we were waiting on the earlier CPUs.

So it seems like we would still want a two-pass approach -- one pass to
capture the current state, the second pass to wait for the state to
change.

							Thanx, Paul


* Re: [PATCH RFC] v2 expedited "big hammer" RCU grace periods
  2009-04-26 21:44       ` Paul E. McKenney
@ 2009-04-27  3:26         ` Ingo Molnar
  2009-04-27 13:21           ` Paul E. McKenney
From: Ingo Molnar @ 2009-04-27  3:26 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Mathieu Desnoyers, linux-kernel, netdev, netfilter-devel, akpm,
	torvalds, davem, dada1, zbr, jeff.chua.linux, paulus, laijs,
	jengelh, r000n, benh


* Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:

> On Sun, Apr 26, 2009 at 10:22:55PM +0200, Ingo Molnar wrote:
> > 
> > * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:
> > 
> > > * Ingo Molnar (mingo@elte.hu) wrote:
> > > > 
> > > > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> > > > 
> > > > > Second cut of "big hammer" expedited RCU grace periods, but only 
> > > > > for rcu_bh.  This creates another softirq vector, so that entering 
> > > > > this softirq vector will have forced an rcu_bh quiescent state (as 
> > > > > noted by Dave Miller).  Use smp_call_function() to invoke 
> > > > > raise_softirq() on all CPUs in order to cause this to happen.  
> > > > > Track the CPUs that have passed through a quiescent state (or gone 
> > > > > offline) with a cpumask.
> > > > 
> > > > hm, i'm still asking whether doing this would be simpler via a 
> > > > reschedule vector - which not only is an existing facility but also 
> > > > forces all RCU domains through a quiescent state - not just bh-RCU 
> > > > participants.
> > > > 
> > > > Triggering a new softirq is in no way simpler that doing an SMP 
> > > > cross-call - in fact softirqs are a finite resource so using some 
> > > > other facility would be preferred.
> > > > 
> > > > Am i missing something?
> > > > 
> > > 
> > > I think the reason for this whole thread is that waiting for rcu 
> > > quiescent state, when called many times e.g. in multiple iptables 
> > > invokations, takes too longs (5 seconds to load the netfilter 
> > > rules at boot). [...]
> > 
> > I'm aware of the problem space.
> > 
> > I was suggesting that to trigger the quiescent state and to wait for 
> > it to propagate it would be enough to reuse the reschedule 
> > mechanism.
> > 
> > It would be relatively straightforward: first a send-reschedule then 
> > do a wait_task_context_switch() on rq->curr - both are existing 
> > primitives. (a task reference has to be taken but that's pretty much 
> > all)
> 
> Well, one reason I didn't take this approach was that I didn't 
> happen to think of it.  ;-)
> 
> Also that I hadn't heard of wait_task_context_switch().
> 
> Hmmm...  Looking for wait_task_context_switch().  OK, found it.
> 
> It looks to me that this primitive won't return until the 
> scheduler actually decides to run something else.  We instead need 
> to have something that stops waiting once the CPU enters the 
> scheduler, hence the previous thought of making rcu_qsctr_inc() do 
> a bit of extra work.
> 
> This would be a way of making an expedited RCU-sched across all 
> RCU implementations.  As noted in the earlier email, it would not 
> handle RCU or RCU-bh in a -rt kernel.
> 
> > By the time wait_task_context_switch() returns from the last CPU 
> > we know that the quiescent state has passed.
> 
> We would want to wait for all of the CPUs in parallel, though, 
> wouldn't we?  Seems that we would not want to wait for the last 
> CPU to do another trip through the scheduler if it had already 
> passed through the scheduler while we were waiting on the earlier 
> CPUs.
> 
> So it seems like we would still want a two-pass approach -- one 
> pass to capture the current state, the second pass to wait for the 
> state to change.

I think waiting in parallel is still possible (first kick all tasks, 
then make sure all tasks have left the CPU at least once).

The busy-waiting in wait_task_context_switch() is indeed a problem - 
but perhaps that could be refactored to be a migration-thread driven 
wait_for_completion() + complete() cycle? It could be driven by 
preempt notifiers perhaps - and become zero-cost.
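
A rough illustration of the complete()/wait_for_completion() structure,
using ordinary per-CPU work items rather than the migration threads or
preempt notifiers suggested above; names are invented and concurrent
callers are not serialized:

	struct expedited_req {
		struct work_struct	work;
		struct completion	done;
	};

	static DEFINE_PER_CPU(struct expedited_req, expedited_req);

	static void rcu_expedited_qs_work(struct work_struct *work)
	{
		struct expedited_req *req =
			container_of(work, struct expedited_req, work);

		/*
		 * Merely running here means this CPU switched to a
		 * workqueue thread, i.e. passed a quiescent state.
		 */
		complete(&req->done);
	}

	void synchronize_sched_expedited_wq(void)
	{
		int cpu;

		get_online_cpus();	/* keep the online set stable */

		/* Kick a no-op work item onto every CPU in parallel... */
		for_each_online_cpu(cpu) {
			struct expedited_req *req = &per_cpu(expedited_req, cpu);

			INIT_WORK(&req->work, rcu_expedited_qs_work);
			init_completion(&req->done);
			schedule_work_on(cpu, &req->work);
		}

		/* ...then sleep, rather than spin, waiting for each one. */
		for_each_online_cpu(cpu)
			wait_for_completion(&per_cpu(expedited_req, cpu).done);

		put_online_cpus();
	}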

	Ingo


* Re: [PATCH RFC] v2 expedited "big hammer" RCU grace periods
  2009-04-27  3:26         ` Ingo Molnar
@ 2009-04-27 13:21           ` Paul E. McKenney
  2009-04-27 13:43             ` Ingo Molnar
  2009-04-27 15:54             ` Mathieu Desnoyers
From: Paul E. McKenney @ 2009-04-27 13:21 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mathieu Desnoyers, linux-kernel, netdev, netfilter-devel, akpm,
	torvalds, davem, dada1, zbr, jeff.chua.linux, paulus, laijs,
	jengelh, r000n, benh

On Mon, Apr 27, 2009 at 05:26:39AM +0200, Ingo Molnar wrote:
> 
> * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> 
> > On Sun, Apr 26, 2009 at 10:22:55PM +0200, Ingo Molnar wrote:
> > > 
> > > * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:
> > > 
> > > > * Ingo Molnar (mingo@elte.hu) wrote:
> > > > > 
> > > > > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> > > > > 
> > > > > > Second cut of "big hammer" expedited RCU grace periods, but only 
> > > > > > for rcu_bh.  This creates another softirq vector, so that entering 
> > > > > > this softirq vector will have forced an rcu_bh quiescent state (as 
> > > > > > noted by Dave Miller).  Use smp_call_function() to invoke 
> > > > > > raise_softirq() on all CPUs in order to cause this to happen.  
> > > > > > Track the CPUs that have passed through a quiescent state (or gone 
> > > > > > offline) with a cpumask.
> > > > > 
> > > > > hm, i'm still asking whether doing this would be simpler via a 
> > > > > reschedule vector - which not only is an existing facility but also 
> > > > > forces all RCU domains through a quiescent state - not just bh-RCU 
> > > > > participants.
> > > > > 
> > > > > Triggering a new softirq is in no way simpler that doing an SMP 
> > > > > cross-call - in fact softirqs are a finite resource so using some 
> > > > > other facility would be preferred.
> > > > > 
> > > > > Am i missing something?
> > > > > 
> > > > 
> > > > I think the reason for this whole thread is that waiting for rcu 
> > > > quiescent state, when called many times e.g. in multiple iptables 
> > > > invokations, takes too longs (5 seconds to load the netfilter 
> > > > rules at boot). [...]
> > > 
> > > I'm aware of the problem space.
> > > 
> > > I was suggesting that to trigger the quiescent state and to wait for 
> > > it to propagate it would be enough to reuse the reschedule 
> > > mechanism.
> > > 
> > > It would be relatively straightforward: first a send-reschedule then 
> > > do a wait_task_context_switch() on rq->curr - both are existing 
> > > primitives. (a task reference has to be taken but that's pretty much 
> > > all)
> > 
> > Well, one reason I didn't take this approach was that I didn't 
> > happen to think of it.  ;-)
> > 
> > Also that I hadn't heard of wait_task_context_switch().
> > 
> > Hmmm...  Looking for wait_task_context_switch().  OK, found it.
> > 
> > It looks to me that this primitive won't return until the 
> > scheduler actually decides to run something else.  We instead need 
> > to have something that stops waiting once the CPU enters the 
> > scheduler, hence the previous thought of making rcu_qsctr_inc() do 
> > a bit of extra work.
> > 
> > This would be a way of making an expedited RCU-sched across all 
> > RCU implementations.  As noted in the earlier email, it would not 
> > handle RCU or RCU-bh in a -rt kernel.
> > 
> > > By the time wait_task_context_switch() returns from the last CPU 
> > > we know that the quiescent state has passed.
> > 
> > We would want to wait for all of the CPUs in parallel, though, 
> > wouldn't we?  Seems that we would not want to wait for the last 
> > CPU to do another trip through the scheduler if it had already 
> > passed through the scheduler while we were waiting on the earlier 
> > CPUs.
> > 
> > So it seems like we would still want a two-pass approach -- one 
> > pass to capture the current state, the second pass to wait for the 
> > state to change.
> 
> I think waiting in parallel is still possible (first kick all tasks, 
> then make sure all tasks have left the CPU at least once).
> 
> The busy-waiting in wait_task_context_switch() is indeed a problem - 
> but perhaps that could be refactored to be a migration-thread driven 
> wait_for_completion() + complete() cycle? It could be driven by 
> preempt notifiers perhaps - and become zero-cost.

Hmmm...  It would need to be informed of the quiescent state even if
that quiescent state did not result in a preemption.

But you are right -- I do need to expedite RCU, not just RCU-bh,
especially given that the boot-speed guys are starting to see grace
periods as a measurable fraction of the boot time.  I will take another
pass at this.

							Thanx, Paul


* Re: [PATCH RFC] v2 expedited "big hammer" RCU grace periods
  2009-04-27 13:21           ` Paul E. McKenney
@ 2009-04-27 13:43             ` Ingo Molnar
  2009-04-27 16:17               ` Paul E. McKenney
  2009-04-27 15:54             ` Mathieu Desnoyers
From: Ingo Molnar @ 2009-04-27 13:43 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Mathieu Desnoyers, linux-kernel, netdev, netfilter-devel, akpm,
	torvalds, davem, dada1, zbr, jeff.chua.linux, paulus, laijs,
	jengelh, r000n, benh


* Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:

> On Mon, Apr 27, 2009 at 05:26:39AM +0200, Ingo Molnar wrote:
> > 
> > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> > 
> > > On Sun, Apr 26, 2009 at 10:22:55PM +0200, Ingo Molnar wrote:
> > > > 
> > > > * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:
> > > > 
> > > > > * Ingo Molnar (mingo@elte.hu) wrote:
> > > > > > 
> > > > > > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> > > > > > 
> > > > > > > Second cut of "big hammer" expedited RCU grace periods, but only 
> > > > > > > for rcu_bh.  This creates another softirq vector, so that entering 
> > > > > > > this softirq vector will have forced an rcu_bh quiescent state (as 
> > > > > > > noted by Dave Miller).  Use smp_call_function() to invoke 
> > > > > > > raise_softirq() on all CPUs in order to cause this to happen.  
> > > > > > > Track the CPUs that have passed through a quiescent state (or gone 
> > > > > > > offline) with a cpumask.
> > > > > > 
> > > > > > hm, i'm still asking whether doing this would be simpler via a 
> > > > > > reschedule vector - which not only is an existing facility but also 
> > > > > > forces all RCU domains through a quiescent state - not just bh-RCU 
> > > > > > participants.
> > > > > > 
> > > > > > Triggering a new softirq is in no way simpler that doing an SMP 
> > > > > > cross-call - in fact softirqs are a finite resource so using some 
> > > > > > other facility would be preferred.
> > > > > > 
> > > > > > Am i missing something?
> > > > > > 
> > > > > 
> > > > > I think the reason for this whole thread is that waiting for rcu 
> > > > > quiescent state, when called many times e.g. in multiple iptables 
> > > > > invokations, takes too longs (5 seconds to load the netfilter 
> > > > > rules at boot). [...]
> > > > 
> > > > I'm aware of the problem space.
> > > > 
> > > > I was suggesting that to trigger the quiescent state and to wait for 
> > > > it to propagate it would be enough to reuse the reschedule 
> > > > mechanism.
> > > > 
> > > > It would be relatively straightforward: first a send-reschedule then 
> > > > do a wait_task_context_switch() on rq->curr - both are existing 
> > > > primitives. (a task reference has to be taken but that's pretty much 
> > > > all)
> > > 
> > > Well, one reason I didn't take this approach was that I didn't 
> > > happen to think of it.  ;-)
> > > 
> > > Also that I hadn't heard of wait_task_context_switch().
> > > 
> > > Hmmm...  Looking for wait_task_context_switch().  OK, found it.
> > > 
> > > It looks to me that this primitive won't return until the 
> > > scheduler actually decides to run something else.  We instead need 
> > > to have something that stops waiting once the CPU enters the 
> > > scheduler, hence the previous thought of making rcu_qsctr_inc() do 
> > > a bit of extra work.
> > > 
> > > This would be a way of making an expedited RCU-sched across all 
> > > RCU implementations.  As noted in the earlier email, it would not 
> > > handle RCU or RCU-bh in a -rt kernel.
> > > 
> > > > By the time wait_task_context_switch() returns from the last CPU 
> > > > we know that the quiescent state has passed.
> > > 
> > > We would want to wait for all of the CPUs in parallel, though, 
> > > wouldn't we?  Seems that we would not want to wait for the last 
> > > CPU to do another trip through the scheduler if it had already 
> > > passed through the scheduler while we were waiting on the earlier 
> > > CPUs.
> > > 
> > > So it seems like we would still want a two-pass approach -- one 
> > > pass to capture the current state, the second pass to wait for the 
> > > state to change.
> > 
> > I think waiting in parallel is still possible (first kick all tasks, 
> > then make sure all tasks have left the CPU at least once).
> > 
> > The busy-waiting in wait_task_context_switch() is indeed a 
> > problem - but perhaps that could be refactored to be a 
> > migration-thread driven wait_for_completion() + complete() 
> > cycle? It could be driven by preempt notifiers perhaps - and 
> > become zero-cost.
> 
> Hmmm...  It would need to be informed of the quiescent state even 
> if that quiescent state did not result in a preemption.
> 
> But you are right -- I do need to expedite RCU, not just RCU-bh, 
> especially given that the boot-speed guys are starting to see 
> grace periods as a measureable fraction of the boot time.  I will 
> take another pass at this.

The precise method of signalling is a detail, i suspect - so by all 
means use a new softirq if that is the cleanest.  I'd also agree that 
covering more than just bh-rcu would definitely be a good idea.

	Ingo


* Re: [PATCH RFC] v2 expedited "big hammer" RCU grace periods
  2009-04-27 13:21           ` Paul E. McKenney
  2009-04-27 13:43             ` Ingo Molnar
@ 2009-04-27 15:54             ` Mathieu Desnoyers
  2009-04-27 16:16               ` Paul E. McKenney
  2009-04-27 20:56               ` Evgeniy Polyakov
From: Mathieu Desnoyers @ 2009-04-27 15:54 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Ingo Molnar, linux-kernel, netdev, netfilter-devel, akpm,
	torvalds, davem, dada1, zbr, jeff.chua.linux, paulus, laijs,
	jengelh, r000n, benh

* Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote:
> On Mon, Apr 27, 2009 at 05:26:39AM +0200, Ingo Molnar wrote:
> > 
> > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> > 
> > > On Sun, Apr 26, 2009 at 10:22:55PM +0200, Ingo Molnar wrote:
> > > > 
> > > > * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:
> > > > 
> > > > > * Ingo Molnar (mingo@elte.hu) wrote:
> > > > > > 
> > > > > > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> > > > > > 
> > > > > > > Second cut of "big hammer" expedited RCU grace periods, but only 
> > > > > > > for rcu_bh.  This creates another softirq vector, so that entering 
> > > > > > > this softirq vector will have forced an rcu_bh quiescent state (as 
> > > > > > > noted by Dave Miller).  Use smp_call_function() to invoke 
> > > > > > > raise_softirq() on all CPUs in order to cause this to happen.  
> > > > > > > Track the CPUs that have passed through a quiescent state (or gone 
> > > > > > > offline) with a cpumask.
> > > > > > 
> > > > > > hm, i'm still asking whether doing this would be simpler via a 
> > > > > > reschedule vector - which not only is an existing facility but also 
> > > > > > forces all RCU domains through a quiescent state - not just bh-RCU 
> > > > > > participants.
> > > > > > 
> > > > > > Triggering a new softirq is in no way simpler that doing an SMP 
> > > > > > cross-call - in fact softirqs are a finite resource so using some 
> > > > > > other facility would be preferred.
> > > > > > 
> > > > > > Am i missing something?
> > > > > > 
> > > > > 
> > > > > I think the reason for this whole thread is that waiting for rcu 
> > > > > quiescent state, when called many times e.g. in multiple iptables 
> > > > > invokations, takes too longs (5 seconds to load the netfilter 
> > > > > rules at boot). [...]
> > > > 
> > > > I'm aware of the problem space.
> > > > 
> > > > I was suggesting that to trigger the quiescent state and to wait for 
> > > > it to propagate it would be enough to reuse the reschedule 
> > > > mechanism.
> > > > 
> > > > It would be relatively straightforward: first a send-reschedule then 
> > > > do a wait_task_context_switch() on rq->curr - both are existing 
> > > > primitives. (a task reference has to be taken but that's pretty much 
> > > > all)
> > > 
> > > Well, one reason I didn't take this approach was that I didn't 
> > > happen to think of it.  ;-)
> > > 
> > > Also that I hadn't heard of wait_task_context_switch().
> > > 
> > > Hmmm...  Looking for wait_task_context_switch().  OK, found it.
> > > 
> > > It looks to me that this primitive won't return until the 
> > > scheduler actually decides to run something else.  We instead need 
> > > to have something that stops waiting once the CPU enters the 
> > > scheduler, hence the previous thought of making rcu_qsctr_inc() do 
> > > a bit of extra work.
> > > 
> > > This would be a way of making an expedited RCU-sched across all 
> > > RCU implementations.  As noted in the earlier email, it would not 
> > > handle RCU or RCU-bh in a -rt kernel.
> > > 
> > > > By the time wait_task_context_switch() returns from the last CPU 
> > > > we know that the quiescent state has passed.
> > > 
> > > We would want to wait for all of the CPUs in parallel, though, 
> > > wouldn't we?  Seems that we would not want to wait for the last 
> > > CPU to do another trip through the scheduler if it had already 
> > > passed through the scheduler while we were waiting on the earlier 
> > > CPUs.
> > > 
> > > So it seems like we would still want a two-pass approach -- one 
> > > pass to capture the current state, the second pass to wait for the 
> > > state to change.
> > 
> > I think waiting in parallel is still possible (first kick all tasks, 
> > then make sure all tasks have left the CPU at least once).
> > 
> > The busy-waiting in wait_task_context_switch() is indeed a problem - 
> > but perhaps that could be refactored to be a migration-thread driven 
> > wait_for_completion() + complete() cycle? It could be driven by 
> > preempt notifiers perhaps - and become zero-cost.
> 
> Hmmm...  It would need to be informed of the quiescent state even if
> that quiescent state did not result in a preemption.
> 
> But you are right -- I do need to expedite RCU, not just RCU-bh,
> especially given that the boot-speed guys are starting to see grace
> periods as a measureable fraction of the boot time.  I will take another
> pass at this.
> 
> 							Thanx, Paul

It might sound a bit simplistic, but... scheduling a high-priority
workqueue on every CPU would give you the guarantees you seem to need
here.  Or is the delay of letting the scheduler run a high-priority
task exactly the delay you are trying to avoid?

Some kind of priority boosting done by synchronize_rcu() could probably
work, and you could support rcu callback priority boosting by assigning
a priority to each registered callback (the same priority as the thread
that invoked call_rcu).  The rcu callbacks could then be sorted by
priority in an RB tree, and only the callbacks with a priority at least
as high as that of the next-priority task would be executed.
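
A rough sketch of that callback-sorting idea, with invented names and
without the locking or the actual grace-period machinery:

	/*
	 * Sketch only: callbacks inherit the registering thread's
	 * priority and sit in an rbtree (<linux/rbtree.h>), so callback
	 * processing can run only those at least as urgent as the next
	 * runnable task.  Lower prio value == more urgent.
	 */
	struct prio_rcu_cb {
		struct rb_node	node;
		struct rcu_head	head;
		int		prio;	/* priority of the call_rcu() caller */
		void		(*func)(struct rcu_head *head);
	};

	static struct rb_root prio_cb_root = RB_ROOT;

	static void prio_cb_insert(struct prio_rcu_cb *cb)
	{
		struct rb_node **link = &prio_cb_root.rb_node;
		struct rb_node *parent = NULL;

		cb->prio = current->prio;
		while (*link) {
			struct prio_rcu_cb *entry;

			parent = *link;
			entry = rb_entry(parent, struct prio_rcu_cb, node);
			if (cb->prio < entry->prio)
				link = &parent->rb_left;
			else
				link = &parent->rb_right;
		}
		rb_link_node(&cb->node, parent, link);
		rb_insert_color(&cb->node, &prio_cb_root);
	}

	/*
	 * After a grace period, invoke only the callbacks at least as
	 * urgent as the highest-priority runnable task (next_prio).
	 */
	static void prio_cb_invoke(int next_prio)
	{
		struct rb_node *n;

		while ((n = rb_first(&prio_cb_root)) != NULL) {
			struct prio_rcu_cb *cb =
				rb_entry(n, struct prio_rcu_cb, node);

			if (cb->prio > next_prio)
				break;		/* less urgent than next task */
			rb_erase(n, &prio_cb_root);
			cb->func(&cb->head);
		}
	}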

Mathieu


-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68


* Re: [PATCH RFC] v2 expedited "big hammer" RCU grace periods
  2009-04-27 15:54             ` Mathieu Desnoyers
@ 2009-04-27 16:16               ` Paul E. McKenney
  2009-04-27 20:56               ` Evgeniy Polyakov
From: Paul E. McKenney @ 2009-04-27 16:16 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Ingo Molnar, linux-kernel, netdev, netfilter-devel, akpm,
	torvalds, davem, dada1, zbr, jeff.chua.linux, paulus, laijs,
	jengelh, r000n, benh

On Mon, Apr 27, 2009 at 11:54:24AM -0400, Mathieu Desnoyers wrote:
> * Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote:
> > On Mon, Apr 27, 2009 at 05:26:39AM +0200, Ingo Molnar wrote:
> > > 
> > > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> > > 
> > > > On Sun, Apr 26, 2009 at 10:22:55PM +0200, Ingo Molnar wrote:
> > > > > 
> > > > > * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:
> > > > > 
> > > > > > * Ingo Molnar (mingo@elte.hu) wrote:
> > > > > > > 
> > > > > > > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> > > > > > > 
> > > > > > > > Second cut of "big hammer" expedited RCU grace periods, but only 
> > > > > > > > for rcu_bh.  This creates another softirq vector, so that entering 
> > > > > > > > this softirq vector will have forced an rcu_bh quiescent state (as 
> > > > > > > > noted by Dave Miller).  Use smp_call_function() to invoke 
> > > > > > > > raise_softirq() on all CPUs in order to cause this to happen.  
> > > > > > > > Track the CPUs that have passed through a quiescent state (or gone 
> > > > > > > > offline) with a cpumask.
> > > > > > > 
> > > > > > > hm, i'm still asking whether doing this would be simpler via a 
> > > > > > > reschedule vector - which not only is an existing facility but also 
> > > > > > > forces all RCU domains through a quiescent state - not just bh-RCU 
> > > > > > > participants.
> > > > > > > 
> > > > > > > Triggering a new softirq is in no way simpler that doing an SMP 
> > > > > > > cross-call - in fact softirqs are a finite resource so using some 
> > > > > > > other facility would be preferred.
> > > > > > > 
> > > > > > > Am i missing something?
> > > > > > > 
> > > > > > 
> > > > > > I think the reason for this whole thread is that waiting for rcu 
> > > > > > quiescent state, when called many times e.g. in multiple iptables 
> > > > > > invokations, takes too longs (5 seconds to load the netfilter 
> > > > > > rules at boot). [...]
> > > > > 
> > > > > I'm aware of the problem space.
> > > > > 
> > > > > I was suggesting that to trigger the quiescent state and to wait for 
> > > > > it to propagate it would be enough to reuse the reschedule 
> > > > > mechanism.
> > > > > 
> > > > > It would be relatively straightforward: first a send-reschedule then 
> > > > > do a wait_task_context_switch() on rq->curr - both are existing 
> > > > > primitives. (a task reference has to be taken but that's pretty much 
> > > > > all)
> > > > 
> > > > Well, one reason I didn't take this approach was that I didn't 
> > > > happen to think of it.  ;-)
> > > > 
> > > > Also that I hadn't heard of wait_task_context_switch().
> > > > 
> > > > Hmmm...  Looking for wait_task_context_switch().  OK, found it.
> > > > 
> > > > It looks to me that this primitive won't return until the 
> > > > scheduler actually decides to run something else.  We instead need 
> > > > to have something that stops waiting once the CPU enters the 
> > > > scheduler, hence the previous thought of making rcu_qsctr_inc() do 
> > > > a bit of extra work.
> > > > 
> > > > This would be a way of making an expedited RCU-sched across all 
> > > > RCU implementations.  As noted in the earlier email, it would not 
> > > > handle RCU or RCU-bh in a -rt kernel.
> > > > 
> > > > > By the time wait_task_context_switch() returns from the last CPU 
> > > > > we know that the quiescent state has passed.
> > > > 
> > > > We would want to wait for all of the CPUs in parallel, though, 
> > > > wouldn't we?  Seems that we would not want to wait for the last 
> > > > CPU to do another trip through the scheduler if it had already 
> > > > passed through the scheduler while we were waiting on the earlier 
> > > > CPUs.
> > > > 
> > > > So it seems like we would still want a two-pass approach -- one 
> > > > pass to capture the current state, the second pass to wait for the 
> > > > state to change.
> > > 
> > > I think waiting in parallel is still possible (first kick all tasks, 
> > > then make sure all tasks have left the CPU at least once).
> > > 
> > > The busy-waiting in wait_task_context_switch() is indeed a problem - 
> > > but perhaps that could be refactored to be a migration-thread driven 
> > > wait_for_completion() + complete() cycle? It could be driven by 
> > > preempt notifiers perhaps - and become zero-cost.
> > 
> > Hmmm...  It would need to be informed of the quiescent state even if
> > that quiescent state did not result in a preemption.
> > 
> > But you are right -- I do need to expedite RCU, not just RCU-bh,
> > especially given that the boot-speed guys are starting to see grace
> > periods as a measureable fraction of the boot time.  I will take another
> > pass at this.
> 
> It might sound a bit simplistic, but... scheduling a high-priority
> workqueue on every CPU would give you the guarantees you seem to need
> here. Or is the delay of letting the scheduler schedule a high-priority
> task a delay you are trying to avoid?
> 
> Some kind of priority boosting done by synchronize_rcu() could probably
> work, and you could support rcu callbacks priority boosting by assigning
> a priority to each callback registered (same priority as the thread
> which invoked call_rcu). The rcu callbacks could then be sorted by
> priority in an RB tree, and only the callbacks with priority
> >= that of the next-priority task would be executed.

I did something similar for the implementation of synchronize_sched()
in preemptable RCU.  The interactions with CPU hotplug are a bit ugly.
It will be easier to hook into rcu_qsctr_inc().  ;-)

But this discussion has been quite useful -- my thoughts for the design
of the long-term solution were a bit lacking, as they would have allowed
a heavy callback load to delay an expedited grace period.  So this will
be a bit of a hack until I get all the RCU implementations converged,
but there is a nice long-term solution to be had by integrating the
expediting into the hierarchical-RCU data structures.
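
In the meantime, the two-pass shape discussed above would look roughly
like the following -- completely untested, and the per-CPU counter and
the function name are made up for illustration, standing in for
whatever rcu_qsctr_inc() ends up bumping:

#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/cpumask.h>

/* Untested sketch only: rcu_exp_qs_count stands in for whatever
 * counter rcu_qsctr_inc() would increment on each quiescent state. */
static DEFINE_PER_CPU(unsigned long, rcu_exp_qs_count);

static void synchronize_sched_expedited_sketch(void)
{
	static unsigned long snap[NR_CPUS];	/* callers assumed serialized */
	int cpu;

	/* Pass 1: snapshot each CPU's count and kick it through the
	 * scheduler using the existing reschedule IPI. */
	for_each_online_cpu(cpu) {
		snap[cpu] = per_cpu(rcu_exp_qs_count, cpu);
		smp_send_reschedule(cpu);
	}

	/* Pass 2: wait, for all CPUs in parallel, until each one has
	 * either bumped its count or gone offline.  (Memory ordering
	 * and CPU-hotplug races are ignored in this sketch.) */
	for_each_online_cpu(cpu)
		while (cpu_online(cpu) &&
		       per_cpu(rcu_exp_qs_count, cpu) == snap[cpu])
			cpu_relax();
}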

							Thanx, Paul

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH RFC] v2 expedited "big hammer" RCU grace periods
  2009-04-27 13:43             ` Ingo Molnar
@ 2009-04-27 16:17               ` Paul E. McKenney
  0 siblings, 0 replies; 13+ messages in thread
From: Paul E. McKenney @ 2009-04-27 16:17 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mathieu Desnoyers, linux-kernel, netdev, netfilter-devel, akpm,
	torvalds, davem, dada1, zbr, jeff.chua.linux, paulus, laijs,
	jengelh, r000n, benh

On Mon, Apr 27, 2009 at 03:43:02PM +0200, Ingo Molnar wrote:
> 
> * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> 
> > On Mon, Apr 27, 2009 at 05:26:39AM +0200, Ingo Molnar wrote:
> > > 
> > > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> > > 
> > > > On Sun, Apr 26, 2009 at 10:22:55PM +0200, Ingo Molnar wrote:
> > > > > 
> > > > > * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:
> > > > > 
> > > > > > * Ingo Molnar (mingo@elte.hu) wrote:
> > > > > > > 
> > > > > > > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> > > > > > > 
> > > > > > > > Second cut of "big hammer" expedited RCU grace periods, but only 
> > > > > > > > for rcu_bh.  This creates another softirq vector, so that entering 
> > > > > > > > this softirq vector will have forced an rcu_bh quiescent state (as 
> > > > > > > > noted by Dave Miller).  Use smp_call_function() to invoke 
> > > > > > > > raise_softirq() on all CPUs in order to cause this to happen.  
> > > > > > > > Track the CPUs that have passed through a quiescent state (or gone 
> > > > > > > > offline) with a cpumask.
> > > > > > > 
> > > > > > > hm, i'm still asking whether doing this would be simpler via a 
> > > > > > > reschedule vector - which not only is an existing facility but also 
> > > > > > > forces all RCU domains through a quiescent state - not just bh-RCU 
> > > > > > > participants.
> > > > > > > 
> > > > > > > Triggering a new softirq is in no way simpler than doing an SMP 
> > > > > > > cross-call - in fact softirqs are a finite resource so using some 
> > > > > > > other facility would be preferred.
> > > > > > > 
> > > > > > > Am i missing something?
> > > > > > > 
> > > > > > 
> > > > > > I think the reason for this whole thread is that waiting for rcu 
> > > > > > quiescent state, when called many times e.g. in multiple iptables 
> > > > > > invocations, takes too long (5 seconds to load the netfilter 
> > > > > > rules at boot). [...]
> > > > > 
> > > > > I'm aware of the problem space.
> > > > > 
> > > > > I was suggesting that to trigger the quiescent state and to wait for 
> > > > > it to propagate it would be enough to reuse the reschedule 
> > > > > mechanism.
> > > > > 
> > > > > It would be relatively straightforward: first a send-reschedule then 
> > > > > do a wait_task_context_switch() on rq->curr - both are existing 
> > > > > primitives. (a task reference has to be taken but that's pretty much 
> > > > > all)
> > > > 
> > > > Well, one reason I didn't take this approach was that I didn't 
> > > > happen to think of it.  ;-)
> > > > 
> > > > Also that I hadn't heard of wait_task_context_switch().
> > > > 
> > > > Hmmm...  Looking for wait_task_context_switch().  OK, found it.
> > > > 
> > > > It looks to me that this primitive won't return until the 
> > > > scheduler actually decides to run something else.  We instead need 
> > > > to have something that stops waiting once the CPU enters the 
> > > > scheduler, hence the previous thought of making rcu_qsctr_inc() do 
> > > > a bit of extra work.
> > > > 
> > > > This would be a way of making an expedited RCU-sched across all 
> > > > RCU implementations.  As noted in the earlier email, it would not 
> > > > handle RCU or RCU-bh in a -rt kernel.
> > > > 
> > > > > By the time wait_task_context_switch() returns from the last CPU 
> > > > > we know that the quiescent state has passed.
> > > > 
> > > > We would want to wait for all of the CPUs in parallel, though, 
> > > > wouldn't we?  Seems that we would not want to wait for the last 
> > > > CPU to do another trip through the scheduler if it had already 
> > > > passed through the scheduler while we were waiting on the earlier 
> > > > CPUs.
> > > > 
> > > > So it seems like we would still want a two-pass approach -- one 
> > > > pass to capture the current state, the second pass to wait for the 
> > > > state to change.
> > > 
> > > I think waiting in parallel is still possible (first kick all tasks, 
> > > then make sure all tasks have left the CPU at least once).
> > > 
> > > The busy-waiting in wait_task_context_switch() is indeed a 
> > > problem - but perhaps that could be refactored to be a 
> > > migration-thread driven wait_for_completion() + complete() 
> > > cycle? It could be driven by preempt notifiers perhaps - and 
> > > become zero-cost.
> > 
> > Hmmm...  It would need to be informed of the quiescent state even 
> > if that quiescent state did not result in a preemption.
> > 
> > But you are right -- I do need to expedite RCU, not just RCU-bh, 
> > especially given that the boot-speed guys are starting to see 
> > grace periods as a measurable fraction of the boot time.  I will 
> > take another pass at this.
> 
> The precise method of signalling is a detail i suspect - so by all 
> means use a new softirq if that is the cleanest. I'd also agree that 
> covering not just bh-rcu would definitely be a good idea.

Fair enough!  ;-)
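
For the record, the cross-call-plus-softirq shape being discussed boils
down to roughly the following -- greatly simplified and untested, not
the actual patch (which also serializes callers and deals with CPU
hotplug), and assuming the RCU_EXPEDITED_SOFTIRQ vector has been
registered with open_softirq() at init time:

#include <linux/cpumask.h>
#include <linux/interrupt.h>
#include <linux/smp.h>

static cpumask_t rcu_bh_exp_waiting;

/* Softirq handler: running here is itself an rcu_bh quiescent state. */
static void rcu_bh_exp_softirq_action(struct softirq_action *unused)
{
	cpu_clear(smp_processor_id(), rcu_bh_exp_waiting);
}

/* smp_call_function() handler: raise the new softirq on this CPU. */
static void rcu_bh_exp_kick(void *unused)
{
	raise_softirq(RCU_EXPEDITED_SOFTIRQ);
}

static void synchronize_rcu_bh_expedited_sketch(void)
{
	rcu_bh_exp_waiting = cpu_online_map;
	smp_call_function(rcu_bh_exp_kick, NULL, 1);	/* other CPUs */
	raise_softirq(RCU_EXPEDITED_SOFTIRQ);		/* this CPU */
	while (!cpus_empty(rcu_bh_exp_waiting))
		cpu_relax();
}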

							Thanx, Paul

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH RFC] v2 expedited "big hammer" RCU grace periods
  2009-04-27 15:54             ` Mathieu Desnoyers
  2009-04-27 16:16               ` Paul E. McKenney
@ 2009-04-27 20:56               ` Evgeniy Polyakov
  1 sibling, 0 replies; 13+ messages in thread
From: Evgeniy Polyakov @ 2009-04-27 20:56 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Paul E. McKenney, Ingo Molnar, linux-kernel, netdev,
	netfilter-devel, akpm, torvalds, davem, dada1, jeff.chua.linux,
	paulus, laijs, jengelh, r000n, benh

Hi.

On Mon, Apr 27, 2009 at 11:54:24AM -0400, Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) wrote:
> It might sound a bit simplistic, but... scheduling a high-priority
> workqueue on every CPU would give you the guarantees you seem to need
> here. Or is the delay of letting the scheduler schedule a high-priority
> task a delay you are trying to avoid?

I believe it should not be an abstract (empty) task that is invoked, but
a real task which does the work. Presumably, if we want to give the
networking part a way to copy its counters, the system should wake up
the appropriate userspace thread that is blocked waiting for the grace
period, so effectively either the RCU callback processing code or some
registered thread should be awakened.

Just a detail though.
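
To make that concrete, the workqueue variant quoted above is basically
the following -- untested, and note that schedule_on_each_cpu() uses the
ordinary keventd threads rather than a high-priority workqueue, so the
priority aspect would need extra plumbing; the work function here is a
placeholder for whatever real per-CPU work (e.g. copying the counters)
is wanted:

#include <linux/workqueue.h>

/* Placeholder: merely running this on a CPU forces a context switch
 * there, which is what the grace-period side cares about. */
static void grace_period_work(struct work_struct *unused)
{
}

static void force_quiescent_states_via_workqueues(void)
{
	/* Queues the work on every online CPU and waits for all of it
	 * to complete. */
	schedule_on_each_cpu(grace_period_work);
}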

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread

Thread overview: 13+ messages
2009-04-26  5:23 [PATCH RFC] v2 expedited "big hammer" RCU grace periods Paul E. McKenney
2009-04-26 11:27 ` Ingo Molnar
2009-04-26 19:13   ` Mathieu Desnoyers
2009-04-26 20:22     ` Ingo Molnar
2009-04-26 21:44       ` Paul E. McKenney
2009-04-27  3:26         ` Ingo Molnar
2009-04-27 13:21           ` Paul E. McKenney
2009-04-27 13:43             ` Ingo Molnar
2009-04-27 16:17               ` Paul E. McKenney
2009-04-27 15:54             ` Mathieu Desnoyers
2009-04-27 16:16               ` Paul E. McKenney
2009-04-27 20:56               ` Evgeniy Polyakov
2009-04-26 20:54   ` Paul E. McKenney
