* [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
@ 2012-03-14  0:24 Paul E. McKenney
  2012-03-14  9:24 ` Mike Galbraith
  0 siblings, 1 reply; 32+ messages in thread
From: Paul E. McKenney @ 2012-03-14  0:24 UTC (permalink / raw)
  To: sivanich; +Cc: linux-kernel

The following builds, but is only very lightly tested.  Probably full
of bugs, especially when exercising CPU hotplug.

							Thanx, Paul

------------------------------------------------------------------------

The current grace-period initialization initializes all leaf rcu_node
structures, even those corresponding to CPUs that have never been online.
This is harmless in many configurations, but results in 200-microsecond
latency spikes for kernels built with NR_CPUS=4096.

This commit therefore keeps track of the largest-numbered CPU that has
ever been online, and limits grace-period initialization to rcu_node
structures corresponding to that CPU and to smaller-numbered CPUs.

Reported-by: Dimitri Sivanich <sivanich@sgi.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
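
For illustration, a minimal standalone sketch of the intended behavior
follows, assuming a flat toy geometry (NUM_RCU_NODES and CPUS_PER_LEAF
below are stand-in values, not the kernel's real breadth-first rcu_node
tree): the scan simply stops at the leaf covering the largest-numbered
CPU that has ever been online instead of visiting every node.

#include <stdio.h>

#define NUM_RCU_NODES	16	/* stand-in for a large-NR_CPUS layout */
#define CPUS_PER_LEAF	4	/* stand-in leaf fanout */

struct rcu_node {
	unsigned long gpnum;
	unsigned long completed;
};

static struct rcu_node node[NUM_RCU_NODES];
static int rcu_max_cpu;		/* largest CPU number ever online */

/* Leaf rcu_node covering a given CPU in this toy layout. */
static struct rcu_node *mynode(int cpu)
{
	return &node[cpu / CPUS_PER_LEAF];
}

int main(void)
{
	struct rcu_node *rnp;
	int full = 0, bounded = 0;

	rcu_max_cpu = 5;	/* only CPUs 0-5 have ever come online */

	/* Old behavior: every rcu_node is visited. */
	for (rnp = &node[0]; rnp < &node[NUM_RCU_NODES]; rnp++)
		full++;

	/* New behavior: stop at the leaf covering rcu_max_cpu. */
	for (rnp = &node[0]; rnp <= mynode(rcu_max_cpu); rnp++)
		bounded++;

	printf("full scan: %d nodes, bounded scan: %d nodes\n",
	       full, bounded);
	return 0;
}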

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index c3b05ef..5688443 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -91,6 +91,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 static struct rcu_state *rcu_state;
 
+int rcu_max_cpu __read_mostly;	/* Largest # CPU that has ever been online. */
+
 /*
  * The rcu_scheduler_active variable transitions from zero to one just
  * before the first task is spawned.  So when this variable is zero, RCU
@@ -1129,8 +1131,9 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 	__releases(rcu_get_root(rsp)->lock)
 {
 	unsigned long gp_duration;
-	struct rcu_node *rnp = rcu_get_root(rsp);
 	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+	struct rcu_node *rnp;
+	struct rcu_node *rnp_root = rcu_get_root(rsp);
 
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
 
@@ -1159,26 +1162,28 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 	 * completed.
 	 */
 	if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
-		raw_spin_unlock(&rnp->lock);	 /* irqs remain disabled. */
 
 		/*
 		 * Propagate new ->completed value to rcu_node structures
 		 * so that other CPUs don't have to wait until the start
 		 * of the next grace period to process their callbacks.
+		 * We must hold the root rcu_node structure's ->lock
+		 * across rcu_for_each_node_breadth_first() in order to
+		 * synchronize with CPUs coming online for the first time.
 		 */
 		rcu_for_each_node_breadth_first(rsp, rnp) {
+			raw_spin_unlock(&rnp_root->lock); /* remain disabled. */
 			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
 			rnp->completed = rsp->gpnum;
 			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+			raw_spin_lock(&rnp_root->lock); /* already disabled. */
 		}
-		rnp = rcu_get_root(rsp);
-		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
 	}
 
 	rsp->completed = rsp->gpnum;  /* Declare the grace period complete. */
 	trace_rcu_grace_period(rsp->name, rsp->completed, "end");
 	rsp->fqs_state = RCU_GP_IDLE;
-	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
+	rcu_start_gp(rsp, flags);  /* releases root node's ->lock. */
 }
 
 /*
@@ -2440,6 +2445,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 	unsigned long mask;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_init;
 
 	/* Set up local state, ensuring consistent view of global state. */
 	raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -2462,6 +2468,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 	/* Exclude any attempts to start a new GP on large systems. */
 	raw_spin_lock(&rsp->onofflock);		/* irqs already disabled. */
 
+	/* Initialize any rcu_node structures that will see their first use. */
+	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
+	for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
+	     rnp_init <= rdp->mynode;
+	     rnp_init++) {
+		rnp_init->gpnum = rsp->gpnum;
+		rnp_init->completed = rsp->completed;
+	}
+	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */
+
 	/* Add CPU to rcu_node bitmasks. */
 	rnp = rdp->mynode;
 	mask = rdp->grpmask;
@@ -2495,6 +2511,8 @@ static void __cpuinit rcu_prepare_cpu(int cpu)
 	rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
 	rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
 	rcu_preempt_init_percpu_data(cpu);
+	if (cpu > rcu_max_cpu)
+		rcu_max_cpu = cpu;
 }
 
 /*
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1e49c56..1dc74e0 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -192,11 +192,13 @@ struct rcu_node {
 
 /*
  * Do a full breadth-first scan of the rcu_node structures for the
- * specified rcu_state structure.
+ * specified rcu_state structure.  The caller must hold either the
+ * ->onofflock or the root rcu_node structure's ->lock.
  */
+extern int rcu_max_cpu;
 #define rcu_for_each_node_breadth_first(rsp, rnp) \
 	for ((rnp) = &(rsp)->node[0]; \
-	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+	     (rnp) < per_cpu_ptr((rsp)->rda, rcu_max_cpu)->mynode; (rnp)++)
 
 /*
  * Do a breadth-first scan of the non-leaf rcu_node structures for the



* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-14  0:24 [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online Paul E. McKenney
@ 2012-03-14  9:24 ` Mike Galbraith
  2012-03-14 12:40   ` Mike Galbraith
  0 siblings, 1 reply; 32+ messages in thread
From: Mike Galbraith @ 2012-03-14  9:24 UTC (permalink / raw)
  To: paulmck; +Cc: sivanich, linux-kernel

On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote: 
> The following builds, but is only very lightly tested.  Probably full
> of bugs, especially when exercising CPU hotplug.

You didn't say RFT, but...

To beat on this in a rotund 3.0 kernel, the equivalent patch would be
the below?  My box may well answer that before you can.. hope not ;-) 

---
 kernel/rcutree.c |   31 ++++++++++++++++++++++++++-----
 kernel/rcutree.h |    6 ++++--
 2 files changed, 30 insertions(+), 7 deletions(-)

--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -84,6 +84,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d
 
 static struct rcu_state *rcu_state;
 
+int rcu_max_cpu __read_mostly;	/* Largest # CPU that has ever been online. */
+
 /*
  * The rcu_scheduler_active variable transitions from zero to one just
  * before the first task is spawned.  So when this variable is zero, RCU
@@ -827,25 +829,31 @@ rcu_start_gp(struct rcu_state *rsp, unsi
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
+		struct rcu_node *rnp_root = rnp;
+
 		if (cpu_needs_another_gp(rsp, rdp))
 			rsp->fqs_need_gp = 1;
-		if (rnp->completed == rsp->completed) {
-			raw_spin_unlock_irqrestore(&rnp->lock, flags);
+		if (rnp_root->completed == rsp->completed) {
+			raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
 			return;
 		}
-		raw_spin_unlock(&rnp->lock);	 /* irqs remain disabled. */
 
 		/*
 		 * Propagate new ->completed value to rcu_node structures
 		 * so that other CPUs don't have to wait until the start
 		 * of the next grace period to process their callbacks.
+		 * We must hold the root rcu_node structure's ->lock
+		 * across rcu_for_each_node_breadth_first() in order to
+		 * synchronize with CPUs coming online for the first time.
 		 */
 		rcu_for_each_node_breadth_first(rsp, rnp) {
+			raw_spin_unlock(&rnp_root->lock); /* remain disabled. */
 			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
 			rnp->completed = rsp->completed;
 			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+			raw_spin_lock(&rnp_root->lock); /* already disabled. */
 		}
-		local_irq_restore(flags);
+		raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
 		return;
 	}
 
@@ -935,7 +943,7 @@ static void rcu_report_qs_rsp(struct rcu
 		rsp->gp_max = gp_duration;
 	rsp->completed = rsp->gpnum;
 	rsp->signaled = RCU_GP_IDLE;
-	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
+	rcu_start_gp(rsp, flags);  /* releases root node's ->lock. */
 }
 
 /*
@@ -1862,6 +1870,7 @@ rcu_init_percpu_data(int cpu, struct rcu
 	unsigned long mask;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_init;
 
 	/* Set up local state, ensuring consistent view of global state. */
 	raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1882,6 +1891,16 @@ rcu_init_percpu_data(int cpu, struct rcu
 	/* Exclude any attempts to start a new GP on large systems. */
 	raw_spin_lock(&rsp->onofflock);		/* irqs already disabled. */
 
+	/* Initialize any rcu_node structures that will see their first use. */
+	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
+	for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
+	     rnp_init <= rdp->mynode;
+	     rnp_init++) {
+		rnp_init->gpnum = rsp->gpnum;
+		rnp_init->completed = rsp->completed;
+	}
+	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */
+
 	/* Add CPU to rcu_node bitmasks. */
 	rnp = rdp->mynode;
 	mask = rdp->grpmask;
@@ -1907,6 +1926,8 @@ static void __cpuinit rcu_prepare_cpu(in
 	rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
 	rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
 	rcu_preempt_init_percpu_data(cpu);
+	if (cpu > rcu_max_cpu)
+		rcu_max_cpu = cpu;
 }
 
 /*
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -191,11 +191,13 @@ struct rcu_node {
 
 /*
  * Do a full breadth-first scan of the rcu_node structures for the
- * specified rcu_state structure.
+ * specified rcu_state structure.  The caller must hold either the
+ * ->onofflock or the root rcu_node structure's ->lock.
  */
+extern int rcu_max_cpu;
 #define rcu_for_each_node_breadth_first(rsp, rnp) \
 	for ((rnp) = &(rsp)->node[0]; \
-	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+	     (rnp) < per_cpu_ptr((rsp)->rda, rcu_max_cpu)->mynode; (rnp)++)
 
 /*
  * Do a breadth-first scan of the non-leaf rcu_node structures for the




* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-14  9:24 ` Mike Galbraith
@ 2012-03-14 12:40   ` Mike Galbraith
  2012-03-14 13:08     ` Dimitri Sivanich
  0 siblings, 1 reply; 32+ messages in thread
From: Mike Galbraith @ 2012-03-14 12:40 UTC (permalink / raw)
  To: paulmck; +Cc: sivanich, linux-kernel

On Wed, 2012-03-14 at 10:24 +0100, Mike Galbraith wrote: 
> On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote: 
> > The following builds, but is only very lightly tested.  Probably full
> > of bugs, especially when exercising CPU hotplug.
> 
> You didn't say RFT, but...
> 
> To beat on this in a rotund 3.0 kernel, the equivalent patch would be
> the below?  My box may well answer that before you can.. hope not ;-)

(Darn, it did.  Box says boot stall with virgin patch in tip too though.
Wedging it straight into 3.0 was perhaps a tad premature;)



* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-14 12:40   ` Mike Galbraith
@ 2012-03-14 13:08     ` Dimitri Sivanich
  2012-03-14 15:17       ` Paul E. McKenney
  0 siblings, 1 reply; 32+ messages in thread
From: Dimitri Sivanich @ 2012-03-14 13:08 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: paulmck, linux-kernel

On Wed, Mar 14, 2012 at 01:40:41PM +0100, Mike Galbraith wrote:
> On Wed, 2012-03-14 at 10:24 +0100, Mike Galbraith wrote: 
> > On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote: 
> > > The following builds, but is only very lightly tested.  Probably full
> > > of bugs, especially when exercising CPU hotplug.
> > 
> > You didn't say RFT, but...
> > 
> > To beat on this in a rotund 3.0 kernel, the equivalent patch would be
> > the below?  My box may well answer that before you can.. hope not ;-)
> 
> (Darn, it did.  Box says boot stall with virgin patch in tip too though.
> Wedging it straight into 3.0 was perhaps a tad premature;)

I saw the same thing with 3.3.0-rc7+ and virgin patch on UV.  Boots fine without the patch.


* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-14 13:08     ` Dimitri Sivanich
@ 2012-03-14 15:17       ` Paul E. McKenney
  2012-03-14 16:56         ` Paul E. McKenney
  2012-03-14 17:07         ` Mike Galbraith
  0 siblings, 2 replies; 32+ messages in thread
From: Paul E. McKenney @ 2012-03-14 15:17 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: Mike Galbraith, linux-kernel

On Wed, Mar 14, 2012 at 08:08:01AM -0500, Dimitri Sivanich wrote:
> On Wed, Mar 14, 2012 at 01:40:41PM +0100, Mike Galbraith wrote:
> > On Wed, 2012-03-14 at 10:24 +0100, Mike Galbraith wrote: 
> > > On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote: 
> > > > The following builds, but is only very lightly tested.  Probably full
> > > > of bugs, especially when exercising CPU hotplug.
> > > 
> > > You didn't say RFT, but...
> > > 
> > > To beat on this in a rotund 3.0 kernel, the equivalent patch would be
> > > the below?  My box may well answer that before you can.. hope not ;-)
> > 
> > (Darn, it did.  Box says boot stall with virgin patch in tip too though.
> > Wedging it straight into 3.0 was perhaps a tad premature;)
> 
> I saw the same thing with 3.3.0-rc7+ and virgin patch on UV.  Boots fine without the patch.

Right...  Bozo here forgot to set the kernel parameters for large-system
emulation during testing.  Apologies for the busted patch, will fix.

And thank you both for the testing!!!

Hey, at least I labeled it "RFC".  ;-)

							Thanx, Paul



* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-14 15:17       ` Paul E. McKenney
@ 2012-03-14 16:56         ` Paul E. McKenney
  2012-03-15  2:42           ` Mike Galbraith
  2012-03-15 17:58           ` Dimitri Sivanich
  2012-03-14 17:07         ` Mike Galbraith
  1 sibling, 2 replies; 32+ messages in thread
From: Paul E. McKenney @ 2012-03-14 16:56 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: Mike Galbraith, linux-kernel

On Wed, Mar 14, 2012 at 08:17:17AM -0700, Paul E. McKenney wrote:
> On Wed, Mar 14, 2012 at 08:08:01AM -0500, Dimitri Sivanich wrote:
> > On Wed, Mar 14, 2012 at 01:40:41PM +0100, Mike Galbraith wrote:
> > > On Wed, 2012-03-14 at 10:24 +0100, Mike Galbraith wrote: 
> > > > On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote: 
> > > > > The following builds, but is only very lightly tested.  Probably full
> > > > > of bugs, especially when exercising CPU hotplug.
> > > > 
> > > > You didn't say RFT, but...
> > > > 
> > > > To beat on this in a rotund 3.0 kernel, the equivalent patch would be
> > > > the below?  My box may well answer that before you can.. hope not ;-)
> > > 
> > > (Darn, it did.  Box says boot stall with virgin patch in tip too though.
> > > Wedging it straight into 3.0 was perhaps a tad premature;)
> > 
> > I saw the same thing with 3.3.0-rc7+ and virgin patch on UV.  Boots fine without the patch.
> 
> Right...  Bozo here forgot to set the kernel parameters for large-system
> emulation during testing.  Apologies for the busted patch, will fix.
> 
> And thank you both for the testing!!!
> 
> Hey, at least I labeled it "RFC".  ;-)

Does the following work better?  It does pass my fake-big-system tests
(more testing in the works).

							Thanx, Paul

------------------------------------------------------------------------

rcu: Limit GP initialization to CPUs that have been online

The current grace-period initialization initializes all leaf rcu_node
structures, even those corresponding to CPUs that have never been online.
This is harmless in many configurations, but results in 200-microsecond
latency spikes for kernels built with NR_CPUS=4096.

This commit therefore keeps track of the largest-numbered CPU that has
ever been online, and limits grace-period initialization to rcu_node
structures corresponding to that CPU and to smaller-numbered CPUs.

Reported-by: Dimitri Sivanich <sivanich@sgi.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index c3b05ef..5688443 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -91,6 +91,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 static struct rcu_state *rcu_state;
 
+int rcu_max_cpu __read_mostly;	/* Largest # CPU that has ever been online. */
+
 /*
  * The rcu_scheduler_active variable transitions from zero to one just
  * before the first task is spawned.  So when this variable is zero, RCU
@@ -1129,8 +1131,9 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 	__releases(rcu_get_root(rsp)->lock)
 {
 	unsigned long gp_duration;
-	struct rcu_node *rnp = rcu_get_root(rsp);
 	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+	struct rcu_node *rnp;
+	struct rcu_node *rnp_root = rcu_get_root(rsp);
 
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
 
@@ -1159,26 +1162,28 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 	 * completed.
 	 */
 	if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
-		raw_spin_unlock(&rnp->lock);	 /* irqs remain disabled. */
 
 		/*
 		 * Propagate new ->completed value to rcu_node structures
 		 * so that other CPUs don't have to wait until the start
 		 * of the next grace period to process their callbacks.
+		 * We must hold the root rcu_node structure's ->lock
+		 * across rcu_for_each_node_breadth_first() in order to
+		 * synchronize with CPUs coming online for the first time.
 		 */
 		rcu_for_each_node_breadth_first(rsp, rnp) {
+			raw_spin_unlock(&rnp_root->lock); /* remain disabled. */
 			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
 			rnp->completed = rsp->gpnum;
 			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+			raw_spin_lock(&rnp_root->lock); /* already disabled. */
 		}
-		rnp = rcu_get_root(rsp);
-		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
 	}
 
 	rsp->completed = rsp->gpnum;  /* Declare the grace period complete. */
 	trace_rcu_grace_period(rsp->name, rsp->completed, "end");
 	rsp->fqs_state = RCU_GP_IDLE;
-	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
+	rcu_start_gp(rsp, flags);  /* releases root node's ->lock. */
 }
 
 /*
@@ -2440,6 +2445,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 	unsigned long mask;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_init;
 
 	/* Set up local state, ensuring consistent view of global state. */
 	raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -2462,6 +2468,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 	/* Exclude any attempts to start a new GP on large systems. */
 	raw_spin_lock(&rsp->onofflock);		/* irqs already disabled. */
 
+	/* Initialize any rcu_node structures that will see their first use. */
+	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
+	for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
+	     rnp_init <= rdp->mynode;
+	     rnp_init++) {
+		rnp_init->gpnum = rsp->gpnum;
+		rnp_init->completed = rsp->completed;
+	}
+	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */
+
 	/* Add CPU to rcu_node bitmasks. */
 	rnp = rdp->mynode;
 	mask = rdp->grpmask;
@@ -2495,6 +2511,8 @@ static void __cpuinit rcu_prepare_cpu(int cpu)
 	rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
 	rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
 	rcu_preempt_init_percpu_data(cpu);
+	if (cpu > rcu_max_cpu)
+		rcu_max_cpu = cpu;
 }
 
 /*
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1e49c56..afdf410 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -192,11 +192,13 @@ struct rcu_node {
 
 /*
  * Do a full breadth-first scan of the rcu_node structures for the
- * specified rcu_state structure.
+ * specified rcu_state structure.  The caller must hold either the
+ * ->onofflock or the root rcu_node structure's ->lock.
  */
+extern int rcu_max_cpu;
 #define rcu_for_each_node_breadth_first(rsp, rnp) \
 	for ((rnp) = &(rsp)->node[0]; \
-	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+	     (rnp) <= per_cpu_ptr((rsp)->rda, rcu_max_cpu)->mynode; (rnp)++)
 
 /*
  * Do a breadth-first scan of the non-leaf rcu_node structures for the



* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-14 15:17       ` Paul E. McKenney
  2012-03-14 16:56         ` Paul E. McKenney
@ 2012-03-14 17:07         ` Mike Galbraith
  1 sibling, 0 replies; 32+ messages in thread
From: Mike Galbraith @ 2012-03-14 17:07 UTC (permalink / raw)
  To: paulmck; +Cc: Dimitri Sivanich, linux-kernel

On Wed, 2012-03-14 at 08:17 -0700, Paul E. McKenney wrote:

> And thank you both for the testing!!!

Try to beat RCU into submission with rocks and sticks, or test patch
from RCU expert... not a hard choice.  I will most happily test ;-)

-Mike 



* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-14 16:56         ` Paul E. McKenney
@ 2012-03-15  2:42           ` Mike Galbraith
  2012-03-15  3:07             ` Mike Galbraith
  2012-03-15 17:58           ` Dimitri Sivanich
  1 sibling, 1 reply; 32+ messages in thread
From: Mike Galbraith @ 2012-03-15  2:42 UTC (permalink / raw)
  To: paulmck; +Cc: Dimitri Sivanich, linux-kernel

On Wed, 2012-03-14 at 09:56 -0700, Paul E. McKenney wrote:

> Does the following work better?  It does pass my fake-big-system tests
> (more testing in the works).

Yup, tip booted fine.  Thanks!  I'll test, see if it gets upset.

-Mike



* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-15  2:42           ` Mike Galbraith
@ 2012-03-15  3:07             ` Mike Galbraith
  2012-03-15 17:02               ` Paul E. McKenney
  2012-03-15 17:59               ` Dimitri Sivanich
  0 siblings, 2 replies; 32+ messages in thread
From: Mike Galbraith @ 2012-03-15  3:07 UTC (permalink / raw)
  To: paulmck; +Cc: Dimitri Sivanich, linux-kernel

On Thu, 2012-03-15 at 03:42 +0100, Mike Galbraith wrote: 
> On Wed, 2012-03-14 at 09:56 -0700, Paul E. McKenney wrote:
> 
> > Does the following work better?  It does pass my fake-big-system tests
> > (more testing in the works).
> 
> Yup, tip booted fine.  Thanks!  I'll test, see if it gets upset.

Wedged into 3.0 enterprise booted fine too, is now running rcutorture.
I'll add hotplug after it runs explosion free for a while.  Any
suggestions for giving both virgin and 3.0 a thorough trouncing?

-Mike



* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-15  3:07             ` Mike Galbraith
@ 2012-03-15 17:02               ` Paul E. McKenney
  2012-03-15 17:21                 ` Dimitri Sivanich
  2012-03-16  4:45                 ` Mike Galbraith
  2012-03-15 17:59               ` Dimitri Sivanich
  1 sibling, 2 replies; 32+ messages in thread
From: Paul E. McKenney @ 2012-03-15 17:02 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Dimitri Sivanich, linux-kernel

On Thu, Mar 15, 2012 at 04:07:11AM +0100, Mike Galbraith wrote:
> On Thu, 2012-03-15 at 03:42 +0100, Mike Galbraith wrote: 
> > On Wed, 2012-03-14 at 09:56 -0700, Paul E. McKenney wrote:
> > 
> > > Does the following work better?  It does pass my fake-big-system tests
> > > (more testing in the works).
> > 
> > Yup, tip booted fine.  Thanks!  I'll test, see if it gets upset.
> 
> Wedged into 3.0 enterprise booted fine too, is now running rcutorture.
> I'll add hotplug after it runs explosion free for a while.  Any
> suggestions for giving both virgin and 3.0 a thorough trouncing?

It would be good to check the latency of RCU's grace-period
initialization, and comparing the results with and without this patch.
Dimitri might have scripting for that.

							Thanx, Paul



* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-15 17:02               ` Paul E. McKenney
@ 2012-03-15 17:21                 ` Dimitri Sivanich
  2012-03-16  4:45                 ` Mike Galbraith
  1 sibling, 0 replies; 32+ messages in thread
From: Dimitri Sivanich @ 2012-03-15 17:21 UTC (permalink / raw)
  To: Paul E. McKenney; +Cc: Mike Galbraith, linux-kernel

On Thu, Mar 15, 2012 at 10:02:36AM -0700, Paul E. McKenney wrote:
> On Thu, Mar 15, 2012 at 04:07:11AM +0100, Mike Galbraith wrote:
> > On Thu, 2012-03-15 at 03:42 +0100, Mike Galbraith wrote: 
> > > On Wed, 2012-03-14 at 09:56 -0700, Paul E. McKenney wrote:
> > > 
> > > > Does the following work better?  It does pass my fake-big-system tests
> > > > (more testing in the works).
> > > 
> > > Yup, tip booted fine.  Thanks!  I'll test, see if it gets upset.
> > 
> > Wedged into 3.0 enterprise booted fine too, is now running rcutorture.
> > I'll add hotplug after it runs explosion free for a while.  Any
> > suggestions for giving both virgin and 3.0 a thorough trouncing?
> 
> It would be good to check the latency of RCU's grace-period
> initialization, and comparing the results with and without this patch.
> Dimitri might have scripting for that.
> 

I'll attempt to get that tested yet this week.


* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-14 16:56         ` Paul E. McKenney
  2012-03-15  2:42           ` Mike Galbraith
@ 2012-03-15 17:58           ` Dimitri Sivanich
  2012-03-15 18:23             ` Paul E. McKenney
  1 sibling, 1 reply; 32+ messages in thread
From: Dimitri Sivanich @ 2012-03-15 17:58 UTC (permalink / raw)
  To: Paul E. McKenney; +Cc: Mike Galbraith, linux-kernel

On Wed, Mar 14, 2012 at 09:56:57AM -0700, Paul E. McKenney wrote:
> On Wed, Mar 14, 2012 at 08:17:17AM -0700, Paul E. McKenney wrote:
> > On Wed, Mar 14, 2012 at 08:08:01AM -0500, Dimitri Sivanich wrote:
> > > On Wed, Mar 14, 2012 at 01:40:41PM +0100, Mike Galbraith wrote:
> > > > On Wed, 2012-03-14 at 10:24 +0100, Mike Galbraith wrote: 
> > > > > On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote: 
> > > > > > The following builds, but is only very lightly tested.  Probably full
> > > > > > of bugs, especially when exercising CPU hotplug.
> > > > > 
> > > > > You didn't say RFT, but...
> > > > > 
> > > > > To beat on this in a rotund 3.0 kernel, the equivalent patch would be
> > > > > the below?  My box may well answer that before you can.. hope not ;-)
> > > > 
> > > > (Darn, it did.  Box says boot stall with virgin patch in tip too though.
> > > > Wedging it straight into 3.0 was perhaps a tad premature;)
> > > 
> > > I saw the same thing with 3.3.0-rc7+ and virgin patch on UV.  Boots fine without the patch.
> > 
> > Right...  Bozo here forgot to set the kernel parameters for large-system
> > emulation during testing.  Apologies for the busted patch, will fix.
> > 
> > And thank you both for the testing!!!
> > 
> > Hey, at least I labeled it "RFC".  ;-)
> 
> Does the following work better?  It does pass my fake-big-system tests
> (more testing in the works).

This one stalls for me at the same place the other one did.  Once again,
if I remove the patch and rebuild, it boots just fine.

Is there some debug/trace information that you would like me to provide?

> 
> 							Thanx, Paul
> 
> ------------------------------------------------------------------------
> 
> rcu: Limit GP initialization to CPUs that have been online
> 
> The current grace-period initialization initializes all leaf rcu_node
> structures, even those corresponding to CPUs that have never been online.
> This is harmless in many configurations, but results in 200-microsecond
> latency spikes for kernels built with NR_CPUS=4096.
> 
> This commit therefore keeps track of the largest-numbered CPU that has
> ever been online, and limits grace-period initialization to rcu_node
> structures corresponding to that CPU and to smaller-numbered CPUs.
> 
> Reported-by: Dimitri Sivanich <sivanich@sgi.com>
> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> 
> diff --git a/kernel/rcutree.c b/kernel/rcutree.c
> index c3b05ef..5688443 100644
> --- a/kernel/rcutree.c
> +++ b/kernel/rcutree.c
> @@ -91,6 +91,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
>  
>  static struct rcu_state *rcu_state;
>  
> +int rcu_max_cpu __read_mostly;	/* Largest # CPU that has ever been online. */
> +
>  /*
>   * The rcu_scheduler_active variable transitions from zero to one just
>   * before the first task is spawned.  So when this variable is zero, RCU
> @@ -1129,8 +1131,9 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
>  	__releases(rcu_get_root(rsp)->lock)
>  {
>  	unsigned long gp_duration;
> -	struct rcu_node *rnp = rcu_get_root(rsp);
>  	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
> +	struct rcu_node *rnp;
> +	struct rcu_node *rnp_root = rcu_get_root(rsp);
>  
>  	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
>  
> @@ -1159,26 +1162,28 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
>  	 * completed.
>  	 */
>  	if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
> -		raw_spin_unlock(&rnp->lock);	 /* irqs remain disabled. */
>  
>  		/*
>  		 * Propagate new ->completed value to rcu_node structures
>  		 * so that other CPUs don't have to wait until the start
>  		 * of the next grace period to process their callbacks.
> +		 * We must hold the root rcu_node structure's ->lock
> +		 * across rcu_for_each_node_breadth_first() in order to
> +		 * synchronize with CPUs coming online for the first time.
>  		 */
>  		rcu_for_each_node_breadth_first(rsp, rnp) {
> +			raw_spin_unlock(&rnp_root->lock); /* remain disabled. */
>  			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
>  			rnp->completed = rsp->gpnum;
>  			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
> +			raw_spin_lock(&rnp_root->lock); /* already disabled. */
>  		}
> -		rnp = rcu_get_root(rsp);
> -		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
>  	}
>  
>  	rsp->completed = rsp->gpnum;  /* Declare the grace period complete. */
>  	trace_rcu_grace_period(rsp->name, rsp->completed, "end");
>  	rsp->fqs_state = RCU_GP_IDLE;
> -	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
> +	rcu_start_gp(rsp, flags);  /* releases root node's ->lock. */
>  }
>  
>  /*
> @@ -2440,6 +2445,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
>  	unsigned long mask;
>  	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
>  	struct rcu_node *rnp = rcu_get_root(rsp);
> +	struct rcu_node *rnp_init;
>  
>  	/* Set up local state, ensuring consistent view of global state. */
>  	raw_spin_lock_irqsave(&rnp->lock, flags);
> @@ -2462,6 +2468,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
>  	/* Exclude any attempts to start a new GP on large systems. */
>  	raw_spin_lock(&rsp->onofflock);		/* irqs already disabled. */
>  
> +	/* Initialize any rcu_node structures that will see their first use. */
> +	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
> +	for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
> +	     rnp_init <= rdp->mynode;
> +	     rnp_init++) {
> +		rnp_init->gpnum = rsp->gpnum;
> +		rnp_init->completed = rsp->completed;
> +	}
> +	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */
> +
>  	/* Add CPU to rcu_node bitmasks. */
>  	rnp = rdp->mynode;
>  	mask = rdp->grpmask;
> @@ -2495,6 +2511,8 @@ static void __cpuinit rcu_prepare_cpu(int cpu)
>  	rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
>  	rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
>  	rcu_preempt_init_percpu_data(cpu);
> +	if (cpu > rcu_max_cpu)
> +		rcu_max_cpu = cpu;
>  }
>  
>  /*
> diff --git a/kernel/rcutree.h b/kernel/rcutree.h
> index 1e49c56..afdf410 100644
> --- a/kernel/rcutree.h
> +++ b/kernel/rcutree.h
> @@ -192,11 +192,13 @@ struct rcu_node {
>  
>  /*
>   * Do a full breadth-first scan of the rcu_node structures for the
> - * specified rcu_state structure.
> + * specified rcu_state structure.  The caller must hold either the
> + * ->onofflock or the root rcu_node structure's ->lock.
>   */
> +extern int rcu_max_cpu;
>  #define rcu_for_each_node_breadth_first(rsp, rnp) \
>  	for ((rnp) = &(rsp)->node[0]; \
> -	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
> +	     (rnp) <= per_cpu_ptr((rsp)->rda, rcu_max_cpu)->mynode; (rnp)++)
>  
>  /*
>   * Do a breadth-first scan of the non-leaf rcu_node structures for the


* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-15  3:07             ` Mike Galbraith
  2012-03-15 17:02               ` Paul E. McKenney
@ 2012-03-15 17:59               ` Dimitri Sivanich
  2012-03-16  7:27                 ` Mike Galbraith
  1 sibling, 1 reply; 32+ messages in thread
From: Dimitri Sivanich @ 2012-03-15 17:59 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: paulmck, linux-kernel

On Thu, Mar 15, 2012 at 04:07:11AM +0100, Mike Galbraith wrote:
> On Thu, 2012-03-15 at 03:42 +0100, Mike Galbraith wrote: 
> > On Wed, 2012-03-14 at 09:56 -0700, Paul E. McKenney wrote:
> > 
> > > Does the following work better?  It does pass my fake-big-system tests
> > > (more testing in the works).
> > 
> > Yup, tip booted fine.  Thanks!  I'll test, see if it gets upset.
> 
> Wedged into 3.0 enterprise booted fine too, is now running rcutorture.
> I'll add hotplug after it runs explosion free for a while.  Any
> suggestions for giving both virgin and 3.0 a thorough trouncing?

Mike,

Could I try your 3.0 enterprise patch?


* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-15 17:58           ` Dimitri Sivanich
@ 2012-03-15 18:23             ` Paul E. McKenney
  2012-03-15 21:07               ` Paul E. McKenney
  0 siblings, 1 reply; 32+ messages in thread
From: Paul E. McKenney @ 2012-03-15 18:23 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: Mike Galbraith, linux-kernel

On Thu, Mar 15, 2012 at 12:58:57PM -0500, Dimitri Sivanich wrote:
> On Wed, Mar 14, 2012 at 09:56:57AM -0700, Paul E. McKenney wrote:
> > On Wed, Mar 14, 2012 at 08:17:17AM -0700, Paul E. McKenney wrote:
> > > On Wed, Mar 14, 2012 at 08:08:01AM -0500, Dimitri Sivanich wrote:
> > > > On Wed, Mar 14, 2012 at 01:40:41PM +0100, Mike Galbraith wrote:
> > > > > On Wed, 2012-03-14 at 10:24 +0100, Mike Galbraith wrote: 
> > > > > > On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote: 
> > > > > > > The following builds, but is only very lightly tested.  Probably full
> > > > > > > of bugs, especially when exercising CPU hotplug.
> > > > > > 
> > > > > > You didn't say RFT, but...
> > > > > > 
> > > > > > To beat on this in a rotund 3.0 kernel, the equivalent patch would be
> > > > > > the below?  My box may well answer that before you can.. hope not ;-)
> > > > > 
> > > > > (Darn, it did.  Box says boot stall with virgin patch in tip too though.
> > > > > Wedging it straight into 3.0 was perhaps a tad premature;)
> > > > 
> > > > I saw the same thing with 3.3.0-rc7+ and virgin patch on UV.  Boots fine without the patch.
> > > 
> > > Right...  Bozo here forgot to set the kernel parameters for large-system
> > > emulation during testing.  Apologies for the busted patch, will fix.
> > > 
> > > And thank you both for the testing!!!
> > > 
> > > Hey, at least I labeled it "RFC".  ;-)
> > 
> > Does the following work better?  It does pass my fake-big-system tests
> > (more testing in the works).
> 
> This one stalls for me at the same place the other one did.  Once again,
> if I remove the patch and rebuild, it boots just fine.
> 
> Is there some debug/trace information that you would like me to provide?

Very strange.

Could you please send your dmesg and .config?

							Thanx, Paul



* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-15 18:23             ` Paul E. McKenney
@ 2012-03-15 21:07               ` Paul E. McKenney
  2012-03-16 15:46                 ` Dimitri Sivanich
  0 siblings, 1 reply; 32+ messages in thread
From: Paul E. McKenney @ 2012-03-15 21:07 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: Mike Galbraith, linux-kernel

On Thu, Mar 15, 2012 at 11:23:14AM -0700, Paul E. McKenney wrote:
> On Thu, Mar 15, 2012 at 12:58:57PM -0500, Dimitri Sivanich wrote:
> > On Wed, Mar 14, 2012 at 09:56:57AM -0700, Paul E. McKenney wrote:
> > > On Wed, Mar 14, 2012 at 08:17:17AM -0700, Paul E. McKenney wrote:
> > > > On Wed, Mar 14, 2012 at 08:08:01AM -0500, Dimitri Sivanich wrote:
> > > > > On Wed, Mar 14, 2012 at 01:40:41PM +0100, Mike Galbraith wrote:
> > > > > > On Wed, 2012-03-14 at 10:24 +0100, Mike Galbraith wrote: 
> > > > > > > On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote: 
> > > > > > > > The following builds, but is only very lightly tested.  Probably full
> > > > > > > > of bugs, especially when exercising CPU hotplug.
> > > > > > > 
> > > > > > > You didn't say RFT, but...
> > > > > > > 
> > > > > > > To beat on this in a rotund 3.0 kernel, the equivalent patch would be
> > > > > > > the below?  My box may well answer that before you can.. hope not ;-)
> > > > > > 
> > > > > > (Darn, it did.  Box says boot stall with virgin patch in tip too though.
> > > > > > Wedging it straight into 3.0 was perhaps a tad premature;)
> > > > > 
> > > > > I saw the same thing with 3.3.0-rc7+ and virgin patch on UV.  Boots fine without the patch.
> > > > 
> > > > Right...  Bozo here forgot to set the kernel parameters for large-system
> > > > emulation during testing.  Apologies for the busted patch, will fix.
> > > > 
> > > > And thank you both for the testing!!!
> > > > 
> > > > Hey, at least I labeled it "RFC".  ;-)
> > > 
> > > Does the following work better?  It does pass my fake-big-system tests
> > > (more testing in the works).
> > 
> > This one stalls for me at the same place the other one did.  Once again,
> > if I remove the patch and rebuild, it boots just fine.
> > 
> > Is there some debug/trace information that you would like me to provide?
> 
> Very strange.
> 
> Could you please send your dmesg and .config?

Hmmm...  Memory ordering could be a problem, though in that case I would
have expected the hang during the onlining process.  However, the memory
ordering does need to be cleaned up in any case, please see below.
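
As a rough standalone model of the barrier pairing used below (a sketch
only: smp_mb() is stubbed with a GCC full barrier, and the toy types are
not the kernel's), the CPU-online path fully initializes the new leaf
before publishing the new maximum, and the scan path brackets its load
of the maximum with matching barriers:

#include <stdio.h>

#define smp_mb() __sync_synchronize()	/* stand-in for the kernel's smp_mb() */

struct toy_node {
	unsigned long gpnum;
	unsigned long completed;
};

static struct toy_node toy_nodes[4];
static int toy_max_cpu;

/* Online path: initialize the new leaf, then publish the new maximum. */
static void toy_publish(int cpu, unsigned long gpnum, unsigned long completed)
{
	toy_nodes[cpu].gpnum = gpnum;
	toy_nodes[cpu].completed = completed;
	smp_mb();	/* Initialization before max-CPU assignment. */
	toy_max_cpu = cpu;
	smp_mb();	/* Assignment before later uses. */
}

/* Scan path: barrier-load-barrier, mirroring rcu_get_max_cpu() below. */
static int toy_get_max(void)
{
	int ret;

	smp_mb();	/* Pairs with the barriers in toy_publish(). */
	ret = toy_max_cpu;
	smp_mb();
	return ret;
}

int main(void)
{
	toy_publish(2, 1, 1);
	printf("scan bound: CPU %d\n", toy_get_max());
	return 0;
}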

							Thanx, Paul

------------------------------------------------------------------------

rcu: Limit GP initialization to CPUs that have been online

The current grace-period initialization initializes all leaf rcu_node
structures, even those corresponding to CPUs that have never been online.
This is harmless in many configurations, but results in 200-microsecond
latency spikes for kernels built with NR_CPUS=4096.

This commit therefore keeps track of the largest-numbered CPU that has
ever been online, and limits grace-period initialization to rcu_node
structures corresponding to that CPU and to smaller-numbered CPUs.

Reported-by: Dimitri Sivanich <sivanich@sgi.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Mike Galbraith <efault@gmx.de>

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 8269656..7247fa8 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -91,6 +91,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 static struct rcu_state *rcu_state;
 
+int rcu_max_cpu __read_mostly;	/* Largest # CPU that has ever been online. */
+
 /*
  * The rcu_scheduler_active variable transitions from zero to one just
  * before the first task is spawned.  So when this variable is zero, RCU
@@ -1129,8 +1131,9 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 	__releases(rcu_get_root(rsp)->lock)
 {
 	unsigned long gp_duration;
-	struct rcu_node *rnp = rcu_get_root(rsp);
 	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+	struct rcu_node *rnp;
+	struct rcu_node *rnp_root = rcu_get_root(rsp);
 
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
 
@@ -1159,26 +1162,28 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 	 * completed.
 	 */
 	if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
-		raw_spin_unlock(&rnp->lock);	 /* irqs remain disabled. */
 
 		/*
 		 * Propagate new ->completed value to rcu_node structures
 		 * so that other CPUs don't have to wait until the start
 		 * of the next grace period to process their callbacks.
+		 * We must hold the root rcu_node structure's ->lock
+		 * across rcu_for_each_node_breadth_first() in order to
+		 * synchronize with CPUs coming online for the first time.
 		 */
 		rcu_for_each_node_breadth_first(rsp, rnp) {
+			raw_spin_unlock(&rnp_root->lock); /* remain disabled. */
 			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
 			rnp->completed = rsp->gpnum;
 			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+			raw_spin_lock(&rnp_root->lock); /* already disabled. */
 		}
-		rnp = rcu_get_root(rsp);
-		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
 	}
 
 	rsp->completed = rsp->gpnum;  /* Declare the grace period complete. */
 	trace_rcu_grace_period(rsp->name, rsp->completed, "end");
 	rsp->fqs_state = RCU_GP_IDLE;
-	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
+	rcu_start_gp(rsp, flags);  /* releases root node's ->lock. */
 }
 
 /*
@@ -2447,6 +2452,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 	unsigned long mask;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_init;
 
 	/* Set up local state, ensuring consistent view of global state. */
 	raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -2469,6 +2475,20 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 	/* Exclude any attempts to start a new GP on large systems. */
 	raw_spin_lock(&rsp->onofflock);		/* irqs already disabled. */
 
+	/*
+	 * Initialize any rcu_node structures that will see their first use.
+	 * Note that rcu_max_cpu cannot change out from under us because the
+	 * hotplug locks are held.
+	 */
+	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
+	for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
+	     rnp_init <= rdp->mynode;
+	     rnp_init++) {
+		rnp_init->gpnum = rsp->gpnum;
+		rnp_init->completed = rsp->completed;
+	}
+	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */
+
 	/* Add CPU to rcu_node bitmasks. */
 	rnp = rdp->mynode;
 	mask = rdp->grpmask;
@@ -2502,6 +2522,11 @@ static void __cpuinit rcu_prepare_cpu(int cpu)
 	rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
 	rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
 	rcu_preempt_init_percpu_data(cpu);
+	if (cpu > rcu_max_cpu) {
+		smp_mb(); /* Initialization before rcu_max_cpu assignment. */
+		rcu_max_cpu = cpu;
+		smp_mb(); /* rcu_max_cpu assignment before later uses. */
+	}
 }
 
 /*
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1e49c56..772df1c 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -192,11 +192,23 @@ struct rcu_node {
 
 /*
  * Do a full breadth-first scan of the rcu_node structures for the
- * specified rcu_state structure.
+ * specified rcu_state structure.  The caller must hold either the
+ * ->onofflock or the root rcu_node structure's ->lock.
  */
+extern int rcu_max_cpu;
+static inline int rcu_get_max_cpu(void)
+{
+	int ret;
+
+	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
+	ret = rcu_max_cpu;
+	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
+	return ret;
+}
 #define rcu_for_each_node_breadth_first(rsp, rnp) \
 	for ((rnp) = &(rsp)->node[0]; \
-	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+	     (rnp) <= per_cpu_ptr((rsp)->rda, rcu_get_max_cpu())->mynode; \
+	     (rnp)++)
 
 /*
  * Do a breadth-first scan of the non-leaf rcu_node structures for the



* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-15 17:02               ` Paul E. McKenney
  2012-03-15 17:21                 ` Dimitri Sivanich
@ 2012-03-16  4:45                 ` Mike Galbraith
  1 sibling, 0 replies; 32+ messages in thread
From: Mike Galbraith @ 2012-03-16  4:45 UTC (permalink / raw)
  To: paulmck; +Cc: Dimitri Sivanich, linux-kernel

On Thu, 2012-03-15 at 10:02 -0700, Paul E. McKenney wrote: 
> On Thu, Mar 15, 2012 at 04:07:11AM +0100, Mike Galbraith wrote:
> > On Thu, 2012-03-15 at 03:42 +0100, Mike Galbraith wrote: 
> > > On Wed, 2012-03-14 at 09:56 -0700, Paul E. McKenney wrote:
> > > 
> > > > Does the following work better?  It does pass my fake-big-system tests
> > > > (more testing in the works).
> > > 
> > > Yup, tip booted fine.  Thanks!  I'll test, see if it gets upset.
> > 
> > Wedged into 3.0 enterprise booted fine too, is now running rcutorture.
> > I'll add hotplug after it runs explosion free for a while.  Any
> > suggestions for giving both virgin and 3.0 a thorough trouncing?
> 
> It would be good to check the latency of RCU's grace-period
> initialization, and comparing the results with and without this patch.
> Dimitri might have scripting for that.

Yeah, I'll zoom in.  Generic irqsoff tracing with rcutorture running
was... not the cleverest of ideas.  (between fbcon and serial console, a
300us RCU blip would be a pimple on a giant's left butt cheek)

-Mike



* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-15 17:59               ` Dimitri Sivanich
@ 2012-03-16  7:27                 ` Mike Galbraith
  2012-03-16  8:09                   ` Mike Galbraith
  0 siblings, 1 reply; 32+ messages in thread
From: Mike Galbraith @ 2012-03-16  7:27 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: paulmck, linux-kernel

On Thu, 2012-03-15 at 12:59 -0500, Dimitri Sivanich wrote: 
> On Thu, Mar 15, 2012 at 04:07:11AM +0100, Mike Galbraith wrote:
> > On Thu, 2012-03-15 at 03:42 +0100, Mike Galbraith wrote: 
> > > On Wed, 2012-03-14 at 09:56 -0700, Paul E. McKenney wrote:
> > > 
> > > > Does the following work better?  It does pass my fake-big-system tests
> > > > (more testing in the works).
> > > 
> > > Yup, tip booted fine.  Thanks!  I'll test, see if it gets upset.
> > 
> > Wedged into 3.0 enterprise booted fine too, is now running rcutorture.
> > I'll add hotplug after it runs explosion free for a while.  Any
> > suggestions for giving both virgin and 3.0 a thorough trouncing?
> 
> Mike,
> 
> Could I try your 3.0 enterprise patch?

Sure, v3 below.  Boots on my little boxen.

Caveat: looks to me like it should be equivalent, but what I know about
RCU's gizzard will cover the bottom of a thimble... maybe.

rcu: Limit GP initialization to CPUs that have been online

The current grace-period initialization initializes all leaf rcu_node
structures, even those corresponding to CPUs that have never been online.
This is harmless in many configurations, but results in 200-microsecond
latency spikes for kernels built with NR_CPUS=4096.

This commit therefore keeps track of the largest-numbered CPU that has
ever been online, and limits grace-period initialization to rcu_node
structures corresponding to that CPU and to smaller-numbered CPUs.

Reported-by: Dimitri Sivanich <sivanich@sgi.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Mike Galbraith <mgalbraith@suse.de>

---
 kernel/rcutree.c |   24 +++++++++++++++++++++++-
 kernel/rcutree.h |   16 ++++++++++++++--
 2 files changed, 37 insertions(+), 3 deletions(-)

--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -84,6 +84,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d
 
 static struct rcu_state *rcu_state;
 
+int rcu_max_cpu __read_mostly;	/* Largest # CPU that has ever been online. */
+
 /*
  * The rcu_scheduler_active variable transitions from zero to one just
  * before the first task is spawned.  So when this variable is zero, RCU
@@ -935,7 +937,7 @@ static void rcu_report_qs_rsp(struct rcu
 		rsp->gp_max = gp_duration;
 	rsp->completed = rsp->gpnum;
 	rsp->signaled = RCU_GP_IDLE;
-	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
+	rcu_start_gp(rsp, flags);  /* releases root node's ->lock. */
 }
 
 /*
@@ -1862,6 +1864,7 @@ rcu_init_percpu_data(int cpu, struct rcu
 	unsigned long mask;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_init;
 
 	/* Set up local state, ensuring consistent view of global state. */
 	raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1882,6 +1885,20 @@ rcu_init_percpu_data(int cpu, struct rcu
 	/* Exclude any attempts to start a new GP on large systems. */
 	raw_spin_lock(&rsp->onofflock);		/* irqs already disabled. */
 
+	/*
+	 * Initialize any rcu_node structures that will see their first use.
+	 * Note that rcu_max_cpu cannot change out from under us because the
+	 * hotplug locks are held.
+	 */
+	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
+	for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
+	     rnp_init <= rdp->mynode;
+	     rnp_init++) {
+		rnp_init->gpnum = rsp->gpnum;
+		rnp_init->completed = rsp->completed;
+	}
+	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */
+
 	/* Add CPU to rcu_node bitmasks. */
 	rnp = rdp->mynode;
 	mask = rdp->grpmask;
@@ -1907,6 +1924,11 @@ static void __cpuinit rcu_prepare_cpu(in
 	rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
 	rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
 	rcu_preempt_init_percpu_data(cpu);
+	if (cpu > rcu_max_cpu) {
+		smp_mb(); /* Initialization before rcu_max_cpu assignment. */
+		rcu_max_cpu = cpu;
+		smp_mb(); /* rcu_max_cpu assignment before later uses. */
+	}
 }
 
 /*
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -191,11 +191,23 @@ struct rcu_node {
 
 /*
  * Do a full breadth-first scan of the rcu_node structures for the
- * specified rcu_state structure.
+ * specified rcu_state structure.  The caller must hold either the
+ * ->onofflock or the root rcu_node structure's ->lock.
  */
+extern int rcu_max_cpu;
+static inline int rcu_get_max_cpu(void)
+{
+	int ret;
+
+	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
+	ret = rcu_max_cpu;
+	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
+	return ret;
+}
 #define rcu_for_each_node_breadth_first(rsp, rnp) \
 	for ((rnp) = &(rsp)->node[0]; \
-	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+	     (rnp) <= per_cpu_ptr((rsp)->rda, rcu_get_max_cpu())->mynode; \
+	     (rnp)++)
 
 /*
  * Do a breadth-first scan of the non-leaf rcu_node structures for the





* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-16  7:27                 ` Mike Galbraith
@ 2012-03-16  8:09                   ` Mike Galbraith
  2012-03-16  8:45                     ` Mike Galbraith
  0 siblings, 1 reply; 32+ messages in thread
From: Mike Galbraith @ 2012-03-16  8:09 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: paulmck, linux-kernel

On Fri, 2012-03-16 at 08:27 +0100, Mike Galbraith wrote: 
> On Thu, 2012-03-15 at 12:59 -0500, Dimitri Sivanich wrote: 
> > On Thu, Mar 15, 2012 at 04:07:11AM +0100, Mike Galbraith wrote:
> > > On Thu, 2012-03-15 at 03:42 +0100, Mike Galbraith wrote: 
> > > > On Wed, 2012-03-14 at 09:56 -0700, Paul E. McKenney wrote:
> > > > 
> > > > > Does the following work better?  It does pass my fake-big-system tests
> > > > > (more testing in the works).
> > > > 
> > > > Yup, tip booted fine.  Thanks!  I'll test, see if it gets upset.
> > > 
> > > Wedged into 3.0 enterprise booted fine too, is now running rcutorture.
> > > I'll add hotplug after it runs explosion free for a while.  Any
> > > suggestions for giving both virgin and 3.0 a thorough trouncing?
> > 
> > Mike,
> > 
> > Could I try your 3.0 enterprise patch?
> 
> Sure, v3 below.

Erm, a bit of that went missing.  I'll put it back.

-Mike



* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-16  8:09                   ` Mike Galbraith
@ 2012-03-16  8:45                     ` Mike Galbraith
  2012-03-16 17:28                       ` Dimitri Sivanich
  0 siblings, 1 reply; 32+ messages in thread
From: Mike Galbraith @ 2012-03-16  8:45 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: paulmck, linux-kernel

On Fri, 2012-03-16 at 09:09 +0100, Mike Galbraith wrote: 
> On Fri, 2012-03-16 at 08:27 +0100, Mike Galbraith wrote: 
> > On Thu, 2012-03-15 at 12:59 -0500, Dimitri Sivanich wrote: 

> > > Could I try your 3.0 enterprise patch?
> > 
> > Sure, v3 below.
> 
> Erm, a bit of that went missing.  I'll put it back.

OK, box finally finished rebuild/boot.


rcu: Limit GP initialization to CPUs that have been online

The current grace-period initialization initializes all leaf rcu_node
structures, even those corresponding to CPUs that have never been online.
This is harmless in many configurations, but results in 200-microsecond
latency spikes for kernels built with NR_CPUS=4096.

This commit therefore keeps track of the largest-numbered CPU that has
ever been online, and limits grace-period initialization to rcu_node
structures corresponding to that CPU and to smaller-numbered CPUs.

Reported-by: Dimitri Sivanich <sivanich@sgi.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Mike Galbraith <mgalbraith@suse.de>

---
 kernel/rcutree.c |   36 ++++++++++++++++++++++++++++++++----
 kernel/rcutree.h |   16 ++++++++++++++--
 2 files changed, 46 insertions(+), 6 deletions(-)

--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -84,6 +84,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d
 
 static struct rcu_state *rcu_state;
 
+int rcu_max_cpu __read_mostly;	/* Largest # CPU that has ever been online. */
+
 /*
  * The rcu_scheduler_active variable transitions from zero to one just
  * before the first task is spawned.  So when this variable is zero, RCU
@@ -827,25 +829,31 @@ rcu_start_gp(struct rcu_state *rsp, unsi
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
+		struct rcu_node *rnp_root = rnp;
+
 		if (cpu_needs_another_gp(rsp, rdp))
 			rsp->fqs_need_gp = 1;
 		if (rnp->completed == rsp->completed) {
-			raw_spin_unlock_irqrestore(&rnp->lock, flags);
+			raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
 			return;
 		}
-		raw_spin_unlock(&rnp->lock);	 /* irqs remain disabled. */
 
 		/*
 		 * Propagate new ->completed value to rcu_node structures
 		 * so that other CPUs don't have to wait until the start
 		 * of the next grace period to process their callbacks.
+		 * We must hold the root rcu_node structure's ->lock
+		 * across rcu_for_each_node_breadth_first() in order to
+		 * synchronize with CPUs coming online for the first time.
 		 */
 		rcu_for_each_node_breadth_first(rsp, rnp) {
+			raw_spin_unlock(&rnp_root->lock); /* remain disabled. */
 			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
 			rnp->completed = rsp->completed;
 			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+			raw_spin_lock(&rnp_root->lock); /* already disabled. */
 		}
-		local_irq_restore(flags);
+		raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
 		return;
 	}
 
@@ -935,7 +943,7 @@ static void rcu_report_qs_rsp(struct rcu
 		rsp->gp_max = gp_duration;
 	rsp->completed = rsp->gpnum;
 	rsp->signaled = RCU_GP_IDLE;
-	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
+	rcu_start_gp(rsp, flags);  /* releases root node's ->lock. */
 }
 
 /*
@@ -1862,6 +1870,7 @@ rcu_init_percpu_data(int cpu, struct rcu
 	unsigned long mask;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_init;
 
 	/* Set up local state, ensuring consistent view of global state. */
 	raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1882,6 +1891,20 @@ rcu_init_percpu_data(int cpu, struct rcu
 	/* Exclude any attempts to start a new GP on large systems. */
 	raw_spin_lock(&rsp->onofflock);		/* irqs already disabled. */
 
+	/*
+	 * Initialize any rcu_node structures that will see their first use.
+	 * Note that rcu_max_cpu cannot change out from under us because the
+	 * hotplug locks are held.
+	 */
+	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
+	for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
+	     rnp_init <= rdp->mynode;
+	     rnp_init++) {
+		rnp_init->gpnum = rsp->gpnum;
+		rnp_init->completed = rsp->completed;
+	}
+	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */
+
 	/* Add CPU to rcu_node bitmasks. */
 	rnp = rdp->mynode;
 	mask = rdp->grpmask;
@@ -1907,6 +1930,11 @@ static void __cpuinit rcu_prepare_cpu(in
 	rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
 	rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
 	rcu_preempt_init_percpu_data(cpu);
+	if (cpu > rcu_max_cpu) {
+		smp_mb(); /* Initialization before rcu_max_cpu assignment. */
+		rcu_max_cpu = cpu;
+		smp_mb(); /* rcu_max_cpu assignment before later uses. */
+	}
 }
 
 /*
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -191,11 +191,23 @@ struct rcu_node {
 
 /*
  * Do a full breadth-first scan of the rcu_node structures for the
- * specified rcu_state structure.
+ * specified rcu_state structure.  The caller must hold either the
+ * ->onofflock or the root rcu_node structure's ->lock.
  */
+extern int rcu_max_cpu;
+static inline int rcu_get_max_cpu(void)
+{
+	int ret;
+
+	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
+	ret = rcu_max_cpu;
+	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
+	return ret;
+}
 #define rcu_for_each_node_breadth_first(rsp, rnp) \
 	for ((rnp) = &(rsp)->node[0]; \
-	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+	     (rnp) <= per_cpu_ptr((rsp)->rda, rcu_get_max_cpu())->mynode; \
+	     (rnp)++)
 
 /*
  * Do a breadth-first scan of the non-leaf rcu_node structures for the
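
For clarity, the ordering contract that the smp_mb() pairs above are meant
to provide can be reduced to the standalone sketch below.  This is an
illustration only, not part of the patch, and init_rcu_nodes_up_to() is a
hypothetical stand-in for the ->gpnum/->completed initialization loop: the
new CPU's rcu_node fields must be visibly initialized before rcu_max_cpu is
advanced, so that any scan bounded by the new maximum also sees initialized
nodes.

/* Sketch of the rcu_max_cpu publish/consume ordering (illustration only). */
extern int rcu_max_cpu;

/* Writer side, as in rcu_prepare_cpu(): initialize first, then publish. */
static void publish_new_max_cpu(int cpu)
{
	init_rcu_nodes_up_to(cpu);	/* hypothetical init helper */
	smp_mb();	/* initialization before rcu_max_cpu assignment */
	rcu_max_cpu = cpu;
	smp_mb();	/* rcu_max_cpu assignment before later uses */
}

/* Reader side, as in rcu_get_max_cpu(): a scan bounded by the value read
 * here sees fully initialized rcu_node structures for all covered CPUs. */
static int read_max_cpu(void)
{
	int max;

	smp_mb();	/* pairs with the writer's barriers */
	max = rcu_max_cpu;
	smp_mb();
	return max;
}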




* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-15 21:07               ` Paul E. McKenney
@ 2012-03-16 15:46                 ` Dimitri Sivanich
  2012-03-16 17:21                   ` Paul E. McKenney
  0 siblings, 1 reply; 32+ messages in thread
From: Dimitri Sivanich @ 2012-03-16 15:46 UTC (permalink / raw)
  To: Paul E. McKenney; +Cc: Mike Galbraith, linux-kernel

On Thu, Mar 15, 2012 at 02:07:53PM -0700, Paul E. McKenney wrote:
> On Thu, Mar 15, 2012 at 11:23:14AM -0700, Paul E. McKenney wrote:
> > On Thu, Mar 15, 2012 at 12:58:57PM -0500, Dimitri Sivanich wrote:
> > > On Wed, Mar 14, 2012 at 09:56:57AM -0700, Paul E. McKenney wrote:
> > > > On Wed, Mar 14, 2012 at 08:17:17AM -0700, Paul E. McKenney wrote:
> > > > > On Wed, Mar 14, 2012 at 08:08:01AM -0500, Dimitri Sivanich wrote:
> > > > > > On Wed, Mar 14, 2012 at 01:40:41PM +0100, Mike Galbraith wrote:
> > > > > > > On Wed, 2012-03-14 at 10:24 +0100, Mike Galbraith wrote: 
> > > > > > > > On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote: 
> > > > > > > > > The following builds, but is only very lightly tested.  Probably full
> > > > > > > > > of bug, especially when exercising CPU hotplug.
> > > > > > > > 
> > > > > > > > You didn't say RFT, but...
> > > > > > > > 
> > > > > > > > To beat on this in a rotund 3.0 kernel, the equivalent patch would be
> > > > > > > > the below?  My box may well answer that before you can.. hope not ;-)
> > > > > > > 
> > > > > > > (Darn, it did.  Box says boot stall with virgin patch in tip too though.
> > > > > > > Wedging it straight into 3.0 was perhaps a tad premature;)
> > > > > > 
> > > > > > I saw the same thing with 3.3.0-rc7+ and virgin patch on UV.  Boots fine without the patch.
> > > > > 
> > > > > Right...  Bozo here forgot to set the kernel parameters for large-system
> > > > > emulation during testing.  Apologies for the busted patch, will fix.
> > > > > 
> > > > > And thank you both for the testing!!!
> > > > > 
> > > > > Hey, at least I labeled it "RFC".  ;-)
> > > > 
> > > > Does the following work better?  It does pass my fake-big-system tests
> > > > (more testing in the works).
> > > 
> > > This one stalls for me at the same place the other one did.  Once again,
> > > if I remove the patch and rebuild, it boots just fine.
> > > 
> > > Is there some debug/trace information that you would like me to provide?
> > 
> > Very strange.
> > 
> > Could you please send your dmesg and .config?
> 
> Hmmm...  Memory ordering could be a problem, though in that case I would
> have expected the hang during the onlining process.  However, the memory
> ordering does need to be cleaned up in any case, please see below.
>
After testing this on 3.3.0-rc7+ I can say that this very much improves the
latency in the two rcu_for_each_node_breadth_first() loops.

Without the patch, under moderate load and while running an interrupt latency
test, I see the majority of loops taking 100-200 usec.

With the patch there are a few that take between 20 and 30 usec; the rest
are below that.

Not that everything is OK latency-wise in RCU land.  There is still an
interrupt holdoff in force_quiescent_state() that is taking > 100usec,
with or without the patch.  I'm having difficulty finding exactly where
the other holdoff is happening because the kernel isn't accepting my nmi
handler.

That said, this fix is a nice improvement in those two loops.


* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-16 15:46                 ` Dimitri Sivanich
@ 2012-03-16 17:21                   ` Paul E. McKenney
  0 siblings, 0 replies; 32+ messages in thread
From: Paul E. McKenney @ 2012-03-16 17:21 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: Mike Galbraith, linux-kernel

On Fri, Mar 16, 2012 at 10:46:23AM -0500, Dimitri Sivanich wrote:
> On Thu, Mar 15, 2012 at 02:07:53PM -0700, Paul E. McKenney wrote:
> > On Thu, Mar 15, 2012 at 11:23:14AM -0700, Paul E. McKenney wrote:
> > > On Thu, Mar 15, 2012 at 12:58:57PM -0500, Dimitri Sivanich wrote:
> > > > On Wed, Mar 14, 2012 at 09:56:57AM -0700, Paul E. McKenney wrote:
> > > > > On Wed, Mar 14, 2012 at 08:17:17AM -0700, Paul E. McKenney wrote:
> > > > > > On Wed, Mar 14, 2012 at 08:08:01AM -0500, Dimitri Sivanich wrote:
> > > > > > > On Wed, Mar 14, 2012 at 01:40:41PM +0100, Mike Galbraith wrote:
> > > > > > > > On Wed, 2012-03-14 at 10:24 +0100, Mike Galbraith wrote: 
> > > > > > > > > On Tue, 2012-03-13 at 17:24 -0700, Paul E. McKenney wrote: 
> > > > > > > > > > The following builds, but is only very lightly tested.  Probably full
> > > > > > > > > > of bug, especially when exercising CPU hotplug.
> > > > > > > > > 
> > > > > > > > > You didn't say RFT, but...
> > > > > > > > > 
> > > > > > > > > To beat on this in a rotund 3.0 kernel, the equivalent patch would be
> > > > > > > > > the below?  My box may well answer that before you can.. hope not ;-)
> > > > > > > > 
> > > > > > > > (Darn, it did.  Box says boot stall with virgin patch in tip too though.
> > > > > > > > Wedging it straight into 3.0 was perhaps a tad premature;)
> > > > > > > 
> > > > > > > I saw the same thing with 3.3.0-rc7+ and virgin patch on UV.  Boots fine without the patch.
> > > > > > 
> > > > > > Right...  Bozo here forgot to set the kernel parameters for large-system
> > > > > > emulation during testing.  Apologies for the busted patch, will fix.
> > > > > > 
> > > > > > And thank you both for the testing!!!
> > > > > > 
> > > > > > Hey, at least I labeled it "RFC".  ;-)
> > > > > 
> > > > > Does the following work better?  It does pass my fake-big-system tests
> > > > > (more testing in the works).
> > > > 
> > > > This one stalls for me at the same place the other one did.  Once again,
> > > > if I remove the patch and rebuild, it boots just fine.
> > > > 
> > > > Is there some debug/trace information that you would like me to provide?
> > > 
> > > Very strange.
> > > 
> > > Could you please send your dmesg and .config?
> > 
> > Hmmm...  Memory ordering could be a problem, though in that case I would
> > have expected the hang during the onlining process.  However, the memory
> > ordering does need to be cleaned up in any case, please see below.
> >
> After testing this on 3.3.0-rc7+ I can say that this very much improves the
> latency in the two rcu_for_each_node_breadth_first() loops.
> 
> Without the patch, under moderate load and while running an interrupt latency
> test, I see the majority of loops taking 100-200 usec.
> 
> With the patch there are a few that take between 20 and 30 usec; the rest
> are below that.
> 
> Not that everything is OK latency-wise in RCU land.  There is still an
> interrupt holdoff in force_quiescent_state() that is taking > 100usec,
> with or without the patch.  I'm having difficulty finding exactly where
> the other holdoff is happening because the kernel isn't accepting my nmi
> handler.

Please see my subsequent patch for force_quiescent_state().  ;-)

It is at https://lkml.org/lkml/2012/3/16/286 in case you missed it.

> That said, this fix is a nice improvement in those two loops.

Glad it helps; I have documented the improvement in the commit message.

Thank you both again for your testing efforts!

							Thanx, Paul



* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-16  8:45                     ` Mike Galbraith
@ 2012-03-16 17:28                       ` Dimitri Sivanich
  2012-03-16 17:51                         ` Paul E. McKenney
                                           ` (2 more replies)
  0 siblings, 3 replies; 32+ messages in thread
From: Dimitri Sivanich @ 2012-03-16 17:28 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: paulmck, linux-kernel

On Fri, Mar 16, 2012 at 09:45:35AM +0100, Mike Galbraith wrote:
> On Fri, 2012-03-16 at 09:09 +0100, Mike Galbraith wrote: 
> > On Fri, 2012-03-16 at 08:27 +0100, Mike Galbraith wrote: 
> > > On Thu, 2012-03-15 at 12:59 -0500, Dimitri Sivanich wrote: 
> 
> > > > Could I try your 3.0 enterprise patch?
> > > 
> > > Sure, v3 below.
> > 
> > Erm, a bit of that went missing.  I'll put it back.
> 
> OK, box finally finished rebuild/boot.
> 

This patch also shows great improvement in the two
rcu_for_each_node_breadth_first() loops (nothing over 20 usec and most less
than 10 in initial testing).

However, there are spinlock holdoffs at the following tracebacks (my nmi
handler does work on the 3.0 kernel):

[  584.157019]  [<ffffffff8144c5a0>] nmi+0x20/0x30
[  584.157023]  [<ffffffff8144bc8a>] _raw_spin_lock_irqsave+0x1a/0x30
[  584.157026]  [<ffffffff810c5f18>] force_qs_rnp+0x58/0x170
[  584.157030]  [<ffffffff810c6192>] force_quiescent_state+0x162/0x1d0
[  584.157033]  [<ffffffff810c6c95>] __rcu_process_callbacks+0x165/0x200
[  584.157037]  [<ffffffff810c6d4d>] rcu_process_callbacks+0x1d/0x80
[  584.157041]  [<ffffffff81061eaf>] __do_softirq+0xef/0x220
[  584.157044]  [<ffffffff81454cbc>] call_softirq+0x1c/0x30
[  584.157048]  [<ffffffff810043a5>] do_softirq+0x65/0xa0
[  584.157051]  [<ffffffff81061c85>] irq_exit+0xb5/0xe0
[  584.157054]  [<ffffffff810212c8>] smp_apic_timer_interrupt+0x68/0xa0
[  584.157057]  [<ffffffff81454473>] apic_timer_interrupt+0x13/0x20
[  584.157061]  [<ffffffff8102b352>] native_safe_halt+0x2/0x10
[  584.157064]  [<ffffffff8100adf5>] default_idle+0x145/0x150
[  584.157067]  [<ffffffff810020c6>] cpu_idle+0x66/0xc0

> 
> rcu: Limit GP initialization to CPUs that have been online
> 
> The current grace-period initialization initializes all leaf rcu_node
> structures, even those corresponding to CPUs that have never been online.
> This is harmless in many configurations, but results in 200-microsecond
> latency spikes for kernels built with NR_CPUS=4096.
> 
> This commit therefore keeps track of the largest-numbered CPU that has
> ever been online, and limits grace-period initialization to rcu_node
> structures corresponding to that CPU and to smaller-numbered CPUs.
> 
> Reported-by: Dimitri Sivanich <sivanich@sgi.com>
> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Acked-by: Mike Galbraith <mgalbraith@suse.de>
> 
> ---
>  kernel/rcutree.c |   36 ++++++++++++++++++++++++++++++++----
>  kernel/rcutree.h |   16 ++++++++++++++--
>  2 files changed, 46 insertions(+), 6 deletions(-)
> 
> --- a/kernel/rcutree.c
> +++ b/kernel/rcutree.c
> @@ -84,6 +84,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d
>  
>  static struct rcu_state *rcu_state;
>  
> +int rcu_max_cpu __read_mostly;	/* Largest # CPU that has ever been online. */
> +
>  /*
>   * The rcu_scheduler_active variable transitions from zero to one just
>   * before the first task is spawned.  So when this variable is zero, RCU
> @@ -827,25 +829,31 @@ rcu_start_gp(struct rcu_state *rsp, unsi
>  	struct rcu_node *rnp = rcu_get_root(rsp);
>  
>  	if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
> +		struct rcu_node *rnp_root = rnp;
> +
>  		if (cpu_needs_another_gp(rsp, rdp))
>  			rsp->fqs_need_gp = 1;
>  		if (rnp->completed == rsp->completed) {
> -			raw_spin_unlock_irqrestore(&rnp->lock, flags);
> +			raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
>  			return;
>  		}
> -		raw_spin_unlock(&rnp->lock);	 /* irqs remain disabled. */
>  
>  		/*
>  		 * Propagate new ->completed value to rcu_node structures
>  		 * so that other CPUs don't have to wait until the start
>  		 * of the next grace period to process their callbacks.
> +		 * We must hold the root rcu_node structure's ->lock
> +		 * across rcu_for_each_node_breadth_first() in order to
> +		 * synchronize with CPUs coming online for the first time.
>  		 */
>  		rcu_for_each_node_breadth_first(rsp, rnp) {
> +			raw_spin_unlock(&rnp_root->lock); /* remain disabled. */
>  			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
>  			rnp->completed = rsp->completed;
>  			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
> +			raw_spin_lock(&rnp_root->lock); /* already disabled. */
>  		}
> -		local_irq_restore(flags);
> +		raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
>  		return;
>  	}
>  
> @@ -935,7 +943,7 @@ static void rcu_report_qs_rsp(struct rcu
>  		rsp->gp_max = gp_duration;
>  	rsp->completed = rsp->gpnum;
>  	rsp->signaled = RCU_GP_IDLE;
> -	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
> +	rcu_start_gp(rsp, flags);  /* releases root node's ->lock. */
>  }
>  
>  /*
> @@ -1862,6 +1870,7 @@ rcu_init_percpu_data(int cpu, struct rcu
>  	unsigned long mask;
>  	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
>  	struct rcu_node *rnp = rcu_get_root(rsp);
> +	struct rcu_node *rnp_init;
>  
>  	/* Set up local state, ensuring consistent view of global state. */
>  	raw_spin_lock_irqsave(&rnp->lock, flags);
> @@ -1882,6 +1891,20 @@ rcu_init_percpu_data(int cpu, struct rcu
>  	/* Exclude any attempts to start a new GP on large systems. */
>  	raw_spin_lock(&rsp->onofflock);		/* irqs already disabled. */
>  
> +	/*
> +	 * Initialize any rcu_node structures that will see their first use.
> +	 * Note that rcu_max_cpu cannot change out from under us because the
> +	 * hotplug locks are held.
> +	 */
> +	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
> +	for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
> +	     rnp_init <= rdp->mynode;
> +	     rnp_init++) {
> +		rnp_init->gpnum = rsp->gpnum;
> +		rnp_init->completed = rsp->completed;
> +	}
> +	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */
> +
>  	/* Add CPU to rcu_node bitmasks. */
>  	rnp = rdp->mynode;
>  	mask = rdp->grpmask;
> @@ -1907,6 +1930,11 @@ static void __cpuinit rcu_prepare_cpu(in
>  	rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
>  	rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
>  	rcu_preempt_init_percpu_data(cpu);
> +	if (cpu > rcu_max_cpu) {
> +		smp_mb(); /* Initialization before rcu_max_cpu assignment. */
> +		rcu_max_cpu = cpu;
> +		smp_mb(); /* rcu_max_cpu assignment before later uses. */
> +	}
>  }
>  
>  /*
> --- a/kernel/rcutree.h
> +++ b/kernel/rcutree.h
> @@ -191,11 +191,23 @@ struct rcu_node {
>  
>  /*
>   * Do a full breadth-first scan of the rcu_node structures for the
> - * specified rcu_state structure.
> + * specified rcu_state structure.  The caller must hold either the
> + * ->onofflock or the root rcu_node structure's ->lock.
>   */
> +extern int rcu_max_cpu;
> +static inline int rcu_get_max_cpu(void)
> +{
> +	int ret;
> +
> +	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
> +	ret = rcu_max_cpu;
> +	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
> +	return ret;
> +}
>  #define rcu_for_each_node_breadth_first(rsp, rnp) \
>  	for ((rnp) = &(rsp)->node[0]; \
> -	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
> +	     (rnp) <= per_cpu_ptr((rsp)->rda, rcu_get_max_cpu())->mynode; \
> +	     (rnp)++)
>  
>  /*
>   * Do a breadth-first scan of the non-leaf rcu_node structures for the
> 


* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-16 17:28                       ` Dimitri Sivanich
@ 2012-03-16 17:51                         ` Paul E. McKenney
  2012-03-16 17:56                           ` Dimitri Sivanich
  2012-03-16 19:11                         ` Mike Galbraith
  2012-03-22 15:35                         ` Mike Galbraith
  2 siblings, 1 reply; 32+ messages in thread
From: Paul E. McKenney @ 2012-03-16 17:51 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: Mike Galbraith, linux-kernel

On Fri, Mar 16, 2012 at 12:28:50PM -0500, Dimitri Sivanich wrote:
> On Fri, Mar 16, 2012 at 09:45:35AM +0100, Mike Galbraith wrote:
> > On Fri, 2012-03-16 at 09:09 +0100, Mike Galbraith wrote: 
> > > On Fri, 2012-03-16 at 08:27 +0100, Mike Galbraith wrote: 
> > > > On Thu, 2012-03-15 at 12:59 -0500, Dimitri Sivanich wrote: 
> > 
> > > > > Could I try your 3.0 enterprise patch?
> > > > 
> > > > Sure, v3 below.
> > > 
> > > Erm, a bit of that went missing.  I'll put it back.
> > 
> > OK, box finally finished rebuild/boot.
> > 
> 
> This patch also shows great improvement in the two
> rcu_for_each_node_breadth_first() (nothing over 20 usec and most less than
> 10 in initial testing).
> 
> However, there are spinlock holdoffs at the following tracebacks (my nmi
> handler does work on the 3.0 kernel):
> 
> [  584.157019]  [<ffffffff8144c5a0>] nmi+0x20/0x30
> [  584.157023]  [<ffffffff8144bc8a>] _raw_spin_lock_irqsave+0x1a/0x30
> [  584.157026]  [<ffffffff810c5f18>] force_qs_rnp+0x58/0x170

This is a holdoff, not a deadlock hang, correct?

If so...

Presumably this is the last raw_spin_lock_irqsave() in force_qs_rnp(),
the one right before the call to rcu_initiate_boost().  Could you please
verify for me?

If so, someone is holding the root rcu_node structure's lock for longer
than is good, or there is significant contention on that lock, which
there might well be on large configurations.  Any info on who is holding
or contending for this lock would be very helpful!
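
If instrumentation would help, one rough option (a sketch only, assuming a
3.0-era kernel/rcutree.c; the helper name, threshold, and trace_printk()
usage are illustrative, not a proposed change) is to time the wait for the
root rcu_node structure's ->lock and log long acquisitions.  CONFIG_LOCK_STAT
should also show wait/hold times for the rcu_node locks in /proc/lock_stat.

/* Rough debugging sketch, not a proposed patch: log long waits for the
 * root rcu_node ->lock at the end of force_qs_rnp().  Threshold arbitrary. */
static void root_lock_timed(struct rcu_state *rsp, unsigned long *flags)
{
	struct rcu_node *rnp = rcu_get_root(rsp);
	u64 t0 = local_clock(), delta;

	raw_spin_lock_irqsave(&rnp->lock, *flags);
	delta = local_clock() - t0;
	if (delta > 50 * NSEC_PER_USEC)
		trace_printk("root rnp->lock wait: %llu ns\n",
			     (unsigned long long)delta);
}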

Are you running TREE_RCU or TREE_PREEMPT_RCU?

							Thanx, Paul

> [  584.157030]  [<ffffffff810c6192>] force_quiescent_state+0x162/0x1d0
> [  584.157033]  [<ffffffff810c6c95>] __rcu_process_callbacks+0x165/0x200
> [  584.157037]  [<ffffffff810c6d4d>] rcu_process_callbacks+0x1d/0x80
> [  584.157041]  [<ffffffff81061eaf>] __do_softirq+0xef/0x220
> [  584.157044]  [<ffffffff81454cbc>] call_softirq+0x1c/0x30
> [  584.157048]  [<ffffffff810043a5>] do_softirq+0x65/0xa0
> [  584.157051]  [<ffffffff81061c85>] irq_exit+0xb5/0xe0
> [  584.157054]  [<ffffffff810212c8>] smp_apic_timer_interrupt+0x68/0xa0
> [  584.157057]  [<ffffffff81454473>] apic_timer_interrupt+0x13/0x20
> [  584.157061]  [<ffffffff8102b352>] native_safe_halt+0x2/0x10
> [  584.157064]  [<ffffffff8100adf5>] default_idle+0x145/0x150
> [  584.157067]  [<ffffffff810020c6>] cpu_idle+0x66/0xc0
> 
> > 
> > rcu: Limit GP initialization to CPUs that have been online
> > 
> > The current grace-period initialization initializes all leaf rcu_node
> > structures, even those corresponding to CPUs that have never been online.
> > This is harmless in many configurations, but results in 200-microsecond
> > latency spikes for kernels built with NR_CPUS=4096.
> > 
> > This commit therefore keeps track of the largest-numbered CPU that has
> > ever been online, and limits grace-period initialization to rcu_node
> > structures corresponding to that CPU and to smaller-numbered CPUs.
> > 
> > Reported-by: Dimitri Sivanich <sivanich@sgi.com>
> > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > Acked-by: Mike Galbraith <mgalbraith@suse.de>
> > 
> > ---
> >  kernel/rcutree.c |   36 ++++++++++++++++++++++++++++++++----
> >  kernel/rcutree.h |   16 ++++++++++++++--
> >  2 files changed, 46 insertions(+), 6 deletions(-)
> > 
> > --- a/kernel/rcutree.c
> > +++ b/kernel/rcutree.c
> > @@ -84,6 +84,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d
> >  
> >  static struct rcu_state *rcu_state;
> >  
> > +int rcu_max_cpu __read_mostly;	/* Largest # CPU that has ever been online. */
> > +
> >  /*
> >   * The rcu_scheduler_active variable transitions from zero to one just
> >   * before the first task is spawned.  So when this variable is zero, RCU
> > @@ -827,25 +829,31 @@ rcu_start_gp(struct rcu_state *rsp, unsi
> >  	struct rcu_node *rnp = rcu_get_root(rsp);
> >  
> >  	if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
> > +		struct rcu_node *rnp_root = rnp;
> > +
> >  		if (cpu_needs_another_gp(rsp, rdp))
> >  			rsp->fqs_need_gp = 1;
> >  		if (rnp->completed == rsp->completed) {
> > -			raw_spin_unlock_irqrestore(&rnp->lock, flags);
> > +			raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
> >  			return;
> >  		}
> > -		raw_spin_unlock(&rnp->lock);	 /* irqs remain disabled. */
> >  
> >  		/*
> >  		 * Propagate new ->completed value to rcu_node structures
> >  		 * so that other CPUs don't have to wait until the start
> >  		 * of the next grace period to process their callbacks.
> > +		 * We must hold the root rcu_node structure's ->lock
> > +		 * across rcu_for_each_node_breadth_first() in order to
> > +		 * synchronize with CPUs coming online for the first time.
> >  		 */
> >  		rcu_for_each_node_breadth_first(rsp, rnp) {
> > +			raw_spin_unlock(&rnp_root->lock); /* remain disabled. */
> >  			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
> >  			rnp->completed = rsp->completed;
> >  			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
> > +			raw_spin_lock(&rnp_root->lock); /* already disabled. */
> >  		}
> > -		local_irq_restore(flags);
> > +		raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
> >  		return;
> >  	}
> >  
> > @@ -935,7 +943,7 @@ static void rcu_report_qs_rsp(struct rcu
> >  		rsp->gp_max = gp_duration;
> >  	rsp->completed = rsp->gpnum;
> >  	rsp->signaled = RCU_GP_IDLE;
> > -	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
> > +	rcu_start_gp(rsp, flags);  /* releases root node's ->lock. */
> >  }
> >  
> >  /*
> > @@ -1862,6 +1870,7 @@ rcu_init_percpu_data(int cpu, struct rcu
> >  	unsigned long mask;
> >  	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
> >  	struct rcu_node *rnp = rcu_get_root(rsp);
> > +	struct rcu_node *rnp_init;
> >  
> >  	/* Set up local state, ensuring consistent view of global state. */
> >  	raw_spin_lock_irqsave(&rnp->lock, flags);
> > @@ -1882,6 +1891,20 @@ rcu_init_percpu_data(int cpu, struct rcu
> >  	/* Exclude any attempts to start a new GP on large systems. */
> >  	raw_spin_lock(&rsp->onofflock);		/* irqs already disabled. */
> >  
> > +	/*
> > +	 * Initialize any rcu_node structures that will see their first use.
> > +	 * Note that rcu_max_cpu cannot change out from under us because the
> > +	 * hotplug locks are held.
> > +	 */
> > +	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
> > +	for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
> > +	     rnp_init <= rdp->mynode;
> > +	     rnp_init++) {
> > +		rnp_init->gpnum = rsp->gpnum;
> > +		rnp_init->completed = rsp->completed;
> > +	}
> > +	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */
> > +
> >  	/* Add CPU to rcu_node bitmasks. */
> >  	rnp = rdp->mynode;
> >  	mask = rdp->grpmask;
> > @@ -1907,6 +1930,11 @@ static void __cpuinit rcu_prepare_cpu(in
> >  	rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
> >  	rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
> >  	rcu_preempt_init_percpu_data(cpu);
> > +	if (cpu > rcu_max_cpu) {
> > +		smp_mb(); /* Initialization before rcu_max_cpu assignment. */
> > +		rcu_max_cpu = cpu;
> > +		smp_mb(); /* rcu_max_cpu assignment before later uses. */
> > +	}
> >  }
> >  
> >  /*
> > --- a/kernel/rcutree.h
> > +++ b/kernel/rcutree.h
> > @@ -191,11 +191,23 @@ struct rcu_node {
> >  
> >  /*
> >   * Do a full breadth-first scan of the rcu_node structures for the
> > - * specified rcu_state structure.
> > + * specified rcu_state structure.  The caller must hold either the
> > + * ->onofflock or the root rcu_node structure's ->lock.
> >   */
> > +extern int rcu_max_cpu;
> > +static inline int rcu_get_max_cpu(void)
> > +{
> > +	int ret;
> > +
> > +	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
> > +	ret = rcu_max_cpu;
> > +	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
> > +	return ret;
> > +}
> >  #define rcu_for_each_node_breadth_first(rsp, rnp) \
> >  	for ((rnp) = &(rsp)->node[0]; \
> > -	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
> > +	     (rnp) <= per_cpu_ptr((rsp)->rda, rcu_get_max_cpu())->mynode; \
> > +	     (rnp)++)
> >  
> >  /*
> >   * Do a breadth-first scan of the non-leaf rcu_node structures for the
> > 
> 



* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-16 17:51                         ` Paul E. McKenney
@ 2012-03-16 17:56                           ` Dimitri Sivanich
  0 siblings, 0 replies; 32+ messages in thread
From: Dimitri Sivanich @ 2012-03-16 17:56 UTC (permalink / raw)
  To: Paul E. McKenney; +Cc: Mike Galbraith, linux-kernel

On Fri, Mar 16, 2012 at 10:51:42AM -0700, Paul E. McKenney wrote:
> On Fri, Mar 16, 2012 at 12:28:50PM -0500, Dimitri Sivanich wrote:
> > On Fri, Mar 16, 2012 at 09:45:35AM +0100, Mike Galbraith wrote:
> > > On Fri, 2012-03-16 at 09:09 +0100, Mike Galbraith wrote: 
> > > > On Fri, 2012-03-16 at 08:27 +0100, Mike Galbraith wrote: 
> > > > > On Thu, 2012-03-15 at 12:59 -0500, Dimitri Sivanich wrote: 
> > > 
> > > > > > Could I try your 3.0 enterprise patch?
> > > > > 
> > > > > Sure, v3 below.
> > > > 
> > > > Erm, a bit of that went missing.  I'll put it back.
> > > 
> > > OK, box finally finished rebuild/boot.
> > > 
> > 
> > This patch also shows great improvement in the two
> > rcu_for_each_node_breadth_first() (nothing over 20 usec and most less than
> > 10 in initial testing).
> > 
> > However, there are spinlock holdoffs at the following tracebacks (my nmi
> > handler does work on the 3.0 kernel):
> > 
> > [  584.157019]  [<ffffffff8144c5a0>] nmi+0x20/0x30
> > [  584.157023]  [<ffffffff8144bc8a>] _raw_spin_lock_irqsave+0x1a/0x30
> > [  584.157026]  [<ffffffff810c5f18>] force_qs_rnp+0x58/0x170
> 
> This is a holdoff, not a deadlock hang, correct?

Correct.  Likely < 200 usec.

> 
> If so...
> 
> Presumably this is the last raw_spin_lock_irqsave() in force_qs_rnp(),
> the one right before the call to rcu_initiate_boost().  Could you please
> verify for me?

Stay tuned.  I need to reproduce this without the extra trace code I had
added around the loops this morning (it -shouldn't- have had an effect...).
> 
> If so, someone is holding the root rcu_node structure's lock for longer
> than is good.  Or that there is significant contention on that lock,
> which there might well be on large configurations.  Any info on who
> is holding or contending for this lock would be very helpful!
> 
> Are you running TREE_RCU or TREE_PREEMPT_RCU?

TREE_RCU


> 
> 							Thanx, Paul
> 
> > [  584.157030]  [<ffffffff810c6192>] force_quiescent_state+0x162/0x1d0
> > [  584.157033]  [<ffffffff810c6c95>] __rcu_process_callbacks+0x165/0x200
> > [  584.157037]  [<ffffffff810c6d4d>] rcu_process_callbacks+0x1d/0x80
> > [  584.157041]  [<ffffffff81061eaf>] __do_softirq+0xef/0x220
> > [  584.157044]  [<ffffffff81454cbc>] call_softirq+0x1c/0x30
> > [  584.157048]  [<ffffffff810043a5>] do_softirq+0x65/0xa0
> > [  584.157051]  [<ffffffff81061c85>] irq_exit+0xb5/0xe0
> > [  584.157054]  [<ffffffff810212c8>] smp_apic_timer_interrupt+0x68/0xa0
> > [  584.157057]  [<ffffffff81454473>] apic_timer_interrupt+0x13/0x20
> > [  584.157061]  [<ffffffff8102b352>] native_safe_halt+0x2/0x10
> > [  584.157064]  [<ffffffff8100adf5>] default_idle+0x145/0x150
> > [  584.157067]  [<ffffffff810020c6>] cpu_idle+0x66/0xc0
> > 
> > > 
> > > rcu: Limit GP initialization to CPUs that have been online
> > > 
> > > The current grace-period initialization initializes all leaf rcu_node
> > > structures, even those corresponding to CPUs that have never been online.
> > > This is harmless in many configurations, but results in 200-microsecond
> > > latency spikes for kernels built with NR_CPUS=4096.
> > > 
> > > This commit therefore keeps track of the largest-numbered CPU that has
> > > ever been online, and limits grace-period initialization to rcu_node
> > > structures corresponding to that CPU and to smaller-numbered CPUs.
> > > 
> > > Reported-by: Dimitri Sivanich <sivanich@sgi.com>
> > > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > > Acked-by: Mike Galbraith <mgalbraith@suse.de>
> > > 
> > > ---
> > >  kernel/rcutree.c |   36 ++++++++++++++++++++++++++++++++----
> > >  kernel/rcutree.h |   16 ++++++++++++++--
> > >  2 files changed, 46 insertions(+), 6 deletions(-)
> > > 
> > > --- a/kernel/rcutree.c
> > > +++ b/kernel/rcutree.c
> > > @@ -84,6 +84,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d
> > >  
> > >  static struct rcu_state *rcu_state;
> > >  
> > > +int rcu_max_cpu __read_mostly;	/* Largest # CPU that has ever been online. */
> > > +
> > >  /*
> > >   * The rcu_scheduler_active variable transitions from zero to one just
> > >   * before the first task is spawned.  So when this variable is zero, RCU
> > > @@ -827,25 +829,31 @@ rcu_start_gp(struct rcu_state *rsp, unsi
> > >  	struct rcu_node *rnp = rcu_get_root(rsp);
> > >  
> > >  	if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
> > > +		struct rcu_node *rnp_root = rnp;
> > > +
> > >  		if (cpu_needs_another_gp(rsp, rdp))
> > >  			rsp->fqs_need_gp = 1;
> > >  		if (rnp->completed == rsp->completed) {
> > > -			raw_spin_unlock_irqrestore(&rnp->lock, flags);
> > > +			raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
> > >  			return;
> > >  		}
> > > -		raw_spin_unlock(&rnp->lock);	 /* irqs remain disabled. */
> > >  
> > >  		/*
> > >  		 * Propagate new ->completed value to rcu_node structures
> > >  		 * so that other CPUs don't have to wait until the start
> > >  		 * of the next grace period to process their callbacks.
> > > +		 * We must hold the root rcu_node structure's ->lock
> > > +		 * across rcu_for_each_node_breadth_first() in order to
> > > +		 * synchronize with CPUs coming online for the first time.
> > >  		 */
> > >  		rcu_for_each_node_breadth_first(rsp, rnp) {
> > > +			raw_spin_unlock(&rnp_root->lock); /* remain disabled. */
> > >  			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
> > >  			rnp->completed = rsp->completed;
> > >  			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
> > > +			raw_spin_lock(&rnp_root->lock); /* already disabled. */
> > >  		}
> > > -		local_irq_restore(flags);
> > > +		raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
> > >  		return;
> > >  	}
> > >  
> > > @@ -935,7 +943,7 @@ static void rcu_report_qs_rsp(struct rcu
> > >  		rsp->gp_max = gp_duration;
> > >  	rsp->completed = rsp->gpnum;
> > >  	rsp->signaled = RCU_GP_IDLE;
> > > -	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
> > > +	rcu_start_gp(rsp, flags);  /* releases root node's ->lock. */
> > >  }
> > >  
> > >  /*
> > > @@ -1862,6 +1870,7 @@ rcu_init_percpu_data(int cpu, struct rcu
> > >  	unsigned long mask;
> > >  	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
> > >  	struct rcu_node *rnp = rcu_get_root(rsp);
> > > +	struct rcu_node *rnp_init;
> > >  
> > >  	/* Set up local state, ensuring consistent view of global state. */
> > >  	raw_spin_lock_irqsave(&rnp->lock, flags);
> > > @@ -1882,6 +1891,20 @@ rcu_init_percpu_data(int cpu, struct rcu
> > >  	/* Exclude any attempts to start a new GP on large systems. */
> > >  	raw_spin_lock(&rsp->onofflock);		/* irqs already disabled. */
> > >  
> > > +	/*
> > > +	 * Initialize any rcu_node structures that will see their first use.
> > > +	 * Note that rcu_max_cpu cannot change out from under us because the
> > > +	 * hotplug locks are held.
> > > +	 */
> > > +	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
> > > +	for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
> > > +	     rnp_init <= rdp->mynode;
> > > +	     rnp_init++) {
> > > +		rnp_init->gpnum = rsp->gpnum;
> > > +		rnp_init->completed = rsp->completed;
> > > +	}
> > > +	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */
> > > +
> > >  	/* Add CPU to rcu_node bitmasks. */
> > >  	rnp = rdp->mynode;
> > >  	mask = rdp->grpmask;
> > > @@ -1907,6 +1930,11 @@ static void __cpuinit rcu_prepare_cpu(in
> > >  	rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
> > >  	rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
> > >  	rcu_preempt_init_percpu_data(cpu);
> > > +	if (cpu > rcu_max_cpu) {
> > > +		smp_mb(); /* Initialization before rcu_max_cpu assignment. */
> > > +		rcu_max_cpu = cpu;
> > > +		smp_mb(); /* rcu_max_cpu assignment before later uses. */
> > > +	}
> > >  }
> > >  
> > >  /*
> > > --- a/kernel/rcutree.h
> > > +++ b/kernel/rcutree.h
> > > @@ -191,11 +191,23 @@ struct rcu_node {
> > >  
> > >  /*
> > >   * Do a full breadth-first scan of the rcu_node structures for the
> > > - * specified rcu_state structure.
> > > + * specified rcu_state structure.  The caller must hold either the
> > > + * ->onofflock or the root rcu_node structure's ->lock.
> > >   */
> > > +extern int rcu_max_cpu;
> > > +static inline int rcu_get_max_cpu(void)
> > > +{
> > > +	int ret;
> > > +
> > > +	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
> > > +	ret = rcu_max_cpu;
> > > +	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
> > > +	return ret;
> > > +}
> > >  #define rcu_for_each_node_breadth_first(rsp, rnp) \
> > >  	for ((rnp) = &(rsp)->node[0]; \
> > > -	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
> > > +	     (rnp) <= per_cpu_ptr((rsp)->rda, rcu_get_max_cpu())->mynode; \
> > > +	     (rnp)++)
> > >  
> > >  /*
> > >   * Do a breadth-first scan of the non-leaf rcu_node structures for the
> > > 
> > 


* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-16 17:28                       ` Dimitri Sivanich
  2012-03-16 17:51                         ` Paul E. McKenney
@ 2012-03-16 19:11                         ` Mike Galbraith
  2012-03-22 15:35                         ` Mike Galbraith
  2 siblings, 0 replies; 32+ messages in thread
From: Mike Galbraith @ 2012-03-16 19:11 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: paulmck, linux-kernel

On Fri, 2012-03-16 at 12:28 -0500, Dimitri Sivanich wrote:

> However, there are spinlock holdoffs at the following tracebacks (my nmi
> handler does work on the 3.0 kernel):

I'll try wedging/testing the subsequent patch Paul mentioned in both RT
and NR_CPUS=max NOPREEMPT kernels.

-Mike



* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-16 17:28                       ` Dimitri Sivanich
  2012-03-16 17:51                         ` Paul E. McKenney
  2012-03-16 19:11                         ` Mike Galbraith
@ 2012-03-22 15:35                         ` Mike Galbraith
  2012-03-22 20:24                           ` Dimitri Sivanich
  2 siblings, 1 reply; 32+ messages in thread
From: Mike Galbraith @ 2012-03-22 15:35 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: paulmck, linux-kernel

On Fri, 2012-03-16 at 12:28 -0500, Dimitri Sivanich wrote: 
> On Fri, Mar 16, 2012 at 09:45:35AM +0100, Mike Galbraith wrote:
> > On Fri, 2012-03-16 at 09:09 +0100, Mike Galbraith wrote: 
> > > On Fri, 2012-03-16 at 08:27 +0100, Mike Galbraith wrote: 
> > > > On Thu, 2012-03-15 at 12:59 -0500, Dimitri Sivanich wrote: 
> > 
> > > > > Could I try your 3.0 enterprise patch?
> > > > 
> > > > Sure, v3 below.
> > > 
> > > Erm, a bit of that went missing.  I'll put it back.
> > 
> > OK, box finally finished rebuild/boot.
> > 
> 
> This patch also shows great improvement in the two
> rcu_for_each_node_breadth_first() (nothing over 20 usec and most less than
> 10 in initial testing).
> 
> However, there are spinlock holdoffs at the following tracebacks (my nmi
> handler does work on the 3.0 kernel):
> 
> [  584.157019]  [<ffffffff8144c5a0>] nmi+0x20/0x30
> [  584.157023]  [<ffffffff8144bc8a>] _raw_spin_lock_irqsave+0x1a/0x30
> [  584.157026]  [<ffffffff810c5f18>] force_qs_rnp+0x58/0x170
> [  584.157030]  [<ffffffff810c6192>] force_quiescent_state+0x162/0x1d0
> [  584.157033]  [<ffffffff810c6c95>] __rcu_process_callbacks+0x165/0x200
> [  584.157037]  [<ffffffff810c6d4d>] rcu_process_callbacks+0x1d/0x80
> [  584.157041]  [<ffffffff81061eaf>] __do_softirq+0xef/0x220
> [  584.157044]  [<ffffffff81454cbc>] call_softirq+0x1c/0x30
> [  584.157048]  [<ffffffff810043a5>] do_softirq+0x65/0xa0
> [  584.157051]  [<ffffffff81061c85>] irq_exit+0xb5/0xe0
> [  584.157054]  [<ffffffff810212c8>] smp_apic_timer_interrupt+0x68/0xa0
> [  584.157057]  [<ffffffff81454473>] apic_timer_interrupt+0x13/0x20
> [  584.157061]  [<ffffffff8102b352>] native_safe_halt+0x2/0x10
> [  584.157064]  [<ffffffff8100adf5>] default_idle+0x145/0x150
> [  584.157067]  [<ffffffff810020c6>] cpu_idle+0x66/0xc0

Care to try this?  There's likely a better way to defeat the ->qsmask == 0
take/release-all-locks thingy.  However, if Paul can safely bail in
force_qs_rnp() in his tweakable-latency-for-big-boxen patch, I should be
able to safely (and shamelessly) steal that, and should someone hotplug
a CPU and we race, do the same bail for small boxen.

(RCU Sheriff Paul may foil that dastardly deed...)

I decided I should sit on the rnp_root lock in the first loop as well.

---
 kernel/rcutree.c |   78 +++++++++++++++++++++++++++++++++++++++++++------------
 kernel/rcutree.h |   16 +++++++++--
 2 files changed, 76 insertions(+), 18 deletions(-)

--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -84,6 +84,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d
 
 static struct rcu_state *rcu_state;
 
+int rcu_max_cpu __read_mostly;	/* Largest # CPU that has ever been online. */
+
 /*
  * The rcu_scheduler_active variable transitions from zero to one just
  * before the first task is spawned.  So when this variable is zero, RCU
@@ -827,25 +829,33 @@ rcu_start_gp(struct rcu_state *rsp, unsi
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
+		struct rcu_node *rnp_root = rnp;
+
 		if (cpu_needs_another_gp(rsp, rdp))
 			rsp->fqs_need_gp = 1;
 		if (rnp->completed == rsp->completed) {
-			raw_spin_unlock_irqrestore(&rnp->lock, flags);
+			raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
 			return;
 		}
-		raw_spin_unlock(&rnp->lock);	 /* irqs remain disabled. */
 
 		/*
 		 * Propagate new ->completed value to rcu_node structures
 		 * so that other CPUs don't have to wait until the start
 		 * of the next grace period to process their callbacks.
+		 * We must hold the root rcu_node structure's ->lock
+		 * across rcu_for_each_node_breadth_first() in order to
+		 * synchronize with CPUs coming online for the first time.
 		 */
 		rcu_for_each_node_breadth_first(rsp, rnp) {
+			if (rnp == rnp_root) {
+				rnp->completed = rsp->completed;
+				continue;
+			}
 			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
 			rnp->completed = rsp->completed;
 			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 		}
-		local_irq_restore(flags);
+		raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
 		return;
 	}
 
@@ -935,7 +945,7 @@ static void rcu_report_qs_rsp(struct rcu
 		rsp->gp_max = gp_duration;
 	rsp->completed = rsp->gpnum;
 	rsp->signaled = RCU_GP_IDLE;
-	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
+	rcu_start_gp(rsp, flags);  /* releases root node's ->lock. */
 }
 
 /*
@@ -1310,45 +1320,57 @@ void rcu_check_callbacks(int cpu, int us
  *
  * The caller must have suppressed start of new grace periods.
  */
-static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
+static int force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
 {
 	unsigned long bit;
-	int cpu;
+	int cpu = 0, max_cpu = rcu_max_cpu, done;
 	unsigned long flags;
 	unsigned long mask;
 	struct rcu_node *rnp;
 
 	rcu_for_each_leaf_node(rsp, rnp) {
-		mask = 0;
+		if (cpu > max_cpu)
+			break;
 		raw_spin_lock_irqsave(&rnp->lock, flags);
 		if (!rcu_gp_in_progress(rsp)) {
 			raw_spin_unlock_irqrestore(&rnp->lock, flags);
-			return;
+			return 1;
 		}
 		if (rnp->qsmask == 0) {
 			rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
 			continue;
 		}
+
 		cpu = rnp->grplo;
 		bit = 1;
+		mask = 0;
 		for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
+			if (cpu > max_cpu)
+				break;
 			if ((rnp->qsmask & bit) != 0 &&
 			    f(per_cpu_ptr(rsp->rda, cpu)))
 				mask |= bit;
 		}
 		if (mask != 0) {
-
 			/* rcu_report_qs_rnp() releases rnp->lock. */
 			rcu_report_qs_rnp(mask, rsp, rnp, flags);
 			continue;
 		}
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	}
+
 	rnp = rcu_get_root(rsp);
-	if (rnp->qsmask == 0) {
-		raw_spin_lock_irqsave(&rnp->lock, flags);
+	raw_spin_lock_irqsave(&rnp->lock, flags);
+
+	/* If we raced with hotplug, ask for a repeat. */
+	done = max_cpu == rcu_get_max_cpu();
+
+	if (rnp->qsmask == 0)
 		rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
-	}
+	else
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
+
+	return done;
 }
 
 /*
@@ -1359,6 +1381,7 @@ static void force_quiescent_state(struct
 {
 	unsigned long flags;
 	struct rcu_node *rnp = rcu_get_root(rsp);
+	int retval;
 
 	if (!rcu_gp_in_progress(rsp))
 		return;  /* No grace period in progress, nothing to force. */
@@ -1390,21 +1413,24 @@ static void force_quiescent_state(struct
 		raw_spin_unlock(&rnp->lock);  /* irqs remain disabled */
 
 		/* Record dyntick-idle state. */
-		force_qs_rnp(rsp, dyntick_save_progress_counter);
+		retval = force_qs_rnp(rsp, dyntick_save_progress_counter);
 		raw_spin_lock(&rnp->lock);  /* irqs already disabled */
-		if (rcu_gp_in_progress(rsp))
+		if (rcu_gp_in_progress(rsp) && retval)
 			rsp->signaled = RCU_FORCE_QS;
+		else if (rcu_gp_in_progress(rsp))
+			rsp->jiffies_force_qs = jiffies - 1;
 		break;
 
 	case RCU_FORCE_QS:
 
 		/* Check dyntick-idle state, send IPI to laggarts. */
 		raw_spin_unlock(&rnp->lock);  /* irqs remain disabled */
-		force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
+		retval = force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
 
 		/* Leave state in case more forcing is required. */
-
 		raw_spin_lock(&rnp->lock);  /* irqs already disabled */
+		if (rcu_gp_in_progress(rsp) && !retval)
+			rsp->jiffies_force_qs = jiffies - 1;
 		break;
 	}
 	rsp->fqs_active = 0;
@@ -1862,6 +1888,7 @@ rcu_init_percpu_data(int cpu, struct rcu
 	unsigned long mask;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_init;
 
 	/* Set up local state, ensuring consistent view of global state. */
 	raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1882,6 +1909,20 @@ rcu_init_percpu_data(int cpu, struct rcu
 	/* Exclude any attempts to start a new GP on large systems. */
 	raw_spin_lock(&rsp->onofflock);		/* irqs already disabled. */
 
+	/*
+	 * Initialize any rcu_node structures that will see their first use.
+	 * Note that rcu_max_cpu cannot change out from under us because the
+	 * hotplug locks are held.
+	 */
+	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
+	for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
+	     rnp_init <= rdp->mynode;
+	     rnp_init++) {
+		rnp_init->gpnum = rsp->gpnum;
+		rnp_init->completed = rsp->completed;
+	}
+	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */
+
 	/* Add CPU to rcu_node bitmasks. */
 	rnp = rdp->mynode;
 	mask = rdp->grpmask;
@@ -1907,6 +1948,11 @@ static void __cpuinit rcu_prepare_cpu(in
 	rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
 	rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
 	rcu_preempt_init_percpu_data(cpu);
+	if (cpu > rcu_max_cpu) {
+		smp_mb(); /* Initialization before rcu_max_cpu assignment. */
+		rcu_max_cpu = cpu;
+		smp_mb(); /* rcu_max_cpu assignment before later uses. */
+	}
 }
 
 /*
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -191,11 +191,23 @@ struct rcu_node {
 
 /*
  * Do a full breadth-first scan of the rcu_node structures for the
- * specified rcu_state structure.
+ * specified rcu_state structure.  The caller must hold either the
+ * ->onofflock or the root rcu_node structure's ->lock.
  */
+extern int rcu_max_cpu;
+static inline int rcu_get_max_cpu(void)
+{
+	int ret;
+
+	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
+	ret = rcu_max_cpu;
+	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
+	return ret;
+}
 #define rcu_for_each_node_breadth_first(rsp, rnp) \
 	for ((rnp) = &(rsp)->node[0]; \
-	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+	     (rnp) <= per_cpu_ptr((rsp)->rda, rcu_get_max_cpu())->mynode; \
+	     (rnp)++)
 
 /*
  * Do a breadth-first scan of the non-leaf rcu_node structures for the
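
The hotplug-race handling above boils down to something like the sketch
below (hypothetical helper names, illustration only): snapshot the largest
online CPU before the scan, scan only that far, and if a later snapshot
differs, report an incomplete pass so the caller backdates jiffies_force_qs
and force_quiescent_state() promptly tries again.

/* Illustrative reduction of the bounded scan plus hotplug-race retry
 * (hypothetical names, not the actual patch). */
static int scan_leaves_bounded(struct rcu_state *rsp)
{
	int max_cpu = rcu_get_max_cpu();	/* snapshot before the scan */

	scan_leaf_nodes_covering(rsp, max_cpu);	/* hypothetical helper */

	/* A CPU above max_cpu came online mid-scan: pass was incomplete. */
	return max_cpu == rcu_get_max_cpu();
}

static void note_scan_result(struct rcu_state *rsp, int complete)
{
	if (!complete)
		rsp->jiffies_force_qs = jiffies - 1;	/* retry promptly */
}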




* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-22 15:35                         ` Mike Galbraith
@ 2012-03-22 20:24                           ` Dimitri Sivanich
  2012-03-23  4:48                             ` Mike Galbraith
  0 siblings, 1 reply; 32+ messages in thread
From: Dimitri Sivanich @ 2012-03-22 20:24 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: paulmck, linux-kernel

On Thu, Mar 22, 2012 at 04:35:33PM +0100, Mike Galbraith wrote:
> On Fri, 2012-03-16 at 12:28 -0500, Dimitri Sivanich wrote: 
> > On Fri, Mar 16, 2012 at 09:45:35AM +0100, Mike Galbraith wrote:
> > > On Fri, 2012-03-16 at 09:09 +0100, Mike Galbraith wrote: 
> > > > On Fri, 2012-03-16 at 08:27 +0100, Mike Galbraith wrote: 
> > > > > On Thu, 2012-03-15 at 12:59 -0500, Dimitri Sivanich wrote: 
> > > 
> > > > > > Could I try your 3.0 enterprise patch?
> > > > > 
> > > > > Sure, v3 below.
> > > > 
> > > > Erm, a bit of that went missing.  I'll put it back.
> > > 
> > > OK, box finally finished rebuild/boot.
> > > 
> > 
> > This patch also shows great improvement in the two
> > rcu_for_each_node_breadth_first() (nothing over 20 usec and most less than
> > 10 in initial testing).
> > 
> > However, there are spinlock holdoffs at the following tracebacks (my nmi
> > handler does work on the 3.0 kernel):
> > 
> > [  584.157019]  [<ffffffff8144c5a0>] nmi+0x20/0x30
> > [  584.157023]  [<ffffffff8144bc8a>] _raw_spin_lock_irqsave+0x1a/0x30
> > [  584.157026]  [<ffffffff810c5f18>] force_qs_rnp+0x58/0x170
> > [  584.157030]  [<ffffffff810c6192>] force_quiescent_state+0x162/0x1d0
> > [  584.157033]  [<ffffffff810c6c95>] __rcu_process_callbacks+0x165/0x200
> > [  584.157037]  [<ffffffff810c6d4d>] rcu_process_callbacks+0x1d/0x80
> > [  584.157041]  [<ffffffff81061eaf>] __do_softirq+0xef/0x220
> > [  584.157044]  [<ffffffff81454cbc>] call_softirq+0x1c/0x30
> > [  584.157048]  [<ffffffff810043a5>] do_softirq+0x65/0xa0
> > [  584.157051]  [<ffffffff81061c85>] irq_exit+0xb5/0xe0
> > [  584.157054]  [<ffffffff810212c8>] smp_apic_timer_interrupt+0x68/0xa0
> > [  584.157057]  [<ffffffff81454473>] apic_timer_interrupt+0x13/0x20
> > [  584.157061]  [<ffffffff8102b352>] native_safe_halt+0x2/0x10
> > [  584.157064]  [<ffffffff8100adf5>] default_idle+0x145/0x150
> > [  584.157067]  [<ffffffff810020c6>] cpu_idle+0x66/0xc0
> 
> Care to try this?  There's likely a better way to defeat ->qsmask == 0
> take/release all locks thingy, however, if Paul can safely bail in
> force_qs_rnp() in tweakable latency for big boxen patch, I should be
> able to safely (and shamelessly) steal that, and should someone hotplug
> a CPU, and we race, do the same thing bail for small boxen.

Tested on a 48 cpu UV system with an interrupt latency test on isolated
cpus and a moderate to heavy load on the rest of the system.

This patch appears to take care of all excessive (> 35 usec) RCU-based
latency in the 3.0 kernel on this particular system for this particular
setup.  Without the patch, I see many latencies on this system > 150 usec
(and some > 200 usec).

> 
> (RCU Sherrif Paul may foil that dastardly deed...)
> 
> I decided I should sit on the rmp_root lock in the first loop as well. 
> 
> ---
>  kernel/rcutree.c |   78 +++++++++++++++++++++++++++++++++++++++++++------------
>  kernel/rcutree.h |   16 +++++++++--
>  2 files changed, 76 insertions(+), 18 deletions(-)
> 
> --- a/kernel/rcutree.c
> +++ b/kernel/rcutree.c
> @@ -84,6 +84,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d
>  
>  static struct rcu_state *rcu_state;
>  
> +int rcu_max_cpu __read_mostly;	/* Largest # CPU that has ever been online. */
> +
>  /*
>   * The rcu_scheduler_active variable transitions from zero to one just
>   * before the first task is spawned.  So when this variable is zero, RCU
> @@ -827,25 +829,33 @@ rcu_start_gp(struct rcu_state *rsp, unsi
>  	struct rcu_node *rnp = rcu_get_root(rsp);
>  
>  	if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
> +		struct rcu_node *rnp_root = rnp;
> +
>  		if (cpu_needs_another_gp(rsp, rdp))
>  			rsp->fqs_need_gp = 1;
>  		if (rnp->completed == rsp->completed) {
> -			raw_spin_unlock_irqrestore(&rnp->lock, flags);
> +			raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
>  			return;
>  		}
> -		raw_spin_unlock(&rnp->lock);	 /* irqs remain disabled. */
>  
>  		/*
>  		 * Propagate new ->completed value to rcu_node structures
>  		 * so that other CPUs don't have to wait until the start
>  		 * of the next grace period to process their callbacks.
> +		 * We must hold the root rcu_node structure's ->lock
> +		 * across rcu_for_each_node_breadth_first() in order to
> +		 * synchronize with CPUs coming online for the first time.
>  		 */
>  		rcu_for_each_node_breadth_first(rsp, rnp) {
> +			if (rnp == rnp_root) {
> +				rnp->completed = rsp->completed;
> +				continue;
> +			}
>  			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
>  			rnp->completed = rsp->completed;
>  			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
>  		}
> -		local_irq_restore(flags);
> +		raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
>  		return;
>  	}
>  
> @@ -935,7 +945,7 @@ static void rcu_report_qs_rsp(struct rcu
>  		rsp->gp_max = gp_duration;
>  	rsp->completed = rsp->gpnum;
>  	rsp->signaled = RCU_GP_IDLE;
> -	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
> +	rcu_start_gp(rsp, flags);  /* releases root node's ->lock. */
>  }
>  
>  /*
> @@ -1310,45 +1320,57 @@ void rcu_check_callbacks(int cpu, int us
>   *
>   * The caller must have suppressed start of new grace periods.
>   */
> -static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
> +static int force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
>  {
>  	unsigned long bit;
> -	int cpu;
> +	int cpu = 0, max_cpu = rcu_max_cpu, done;
>  	unsigned long flags;
>  	unsigned long mask;
>  	struct rcu_node *rnp;
>  
>  	rcu_for_each_leaf_node(rsp, rnp) {
> -		mask = 0;
> +		if (cpu > max_cpu)
> +			break;
>  		raw_spin_lock_irqsave(&rnp->lock, flags);
>  		if (!rcu_gp_in_progress(rsp)) {
>  			raw_spin_unlock_irqrestore(&rnp->lock, flags);
> -			return;
> +			return 1;
>  		}
>  		if (rnp->qsmask == 0) {
>  			rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
>  			continue;
>  		}
> +
>  		cpu = rnp->grplo;
>  		bit = 1;
> +		mask = 0;
>  		for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
> +			if (cpu > max_cpu)
> +				break;
>  			if ((rnp->qsmask & bit) != 0 &&
>  			    f(per_cpu_ptr(rsp->rda, cpu)))
>  				mask |= bit;
>  		}
>  		if (mask != 0) {
> -
>  			/* rcu_report_qs_rnp() releases rnp->lock. */
>  			rcu_report_qs_rnp(mask, rsp, rnp, flags);
>  			continue;
>  		}
>  		raw_spin_unlock_irqrestore(&rnp->lock, flags);
>  	}
> +
>  	rnp = rcu_get_root(rsp);
> -	if (rnp->qsmask == 0) {
> -		raw_spin_lock_irqsave(&rnp->lock, flags);
> +	raw_spin_lock_irqsave(&rnp->lock, flags);
> +
> +	/* If we raced with hotplug, ask for a repeat. */
> +	done = max_cpu == rcu_get_max_cpu();
> +
> +	if (rnp->qsmask == 0)
>  		rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
> -	}
> +	else
> +		raw_spin_unlock_irqrestore(&rnp->lock, flags);
> +
> +	return done;
>  }
>  
>  /*
> @@ -1359,6 +1381,7 @@ static void force_quiescent_state(struct
>  {
>  	unsigned long flags;
>  	struct rcu_node *rnp = rcu_get_root(rsp);
> +	int retval;
>  
>  	if (!rcu_gp_in_progress(rsp))
>  		return;  /* No grace period in progress, nothing to force. */
> @@ -1390,21 +1413,24 @@ static void force_quiescent_state(struct
>  		raw_spin_unlock(&rnp->lock);  /* irqs remain disabled */
>  
>  		/* Record dyntick-idle state. */
> -		force_qs_rnp(rsp, dyntick_save_progress_counter);
> +		retval = force_qs_rnp(rsp, dyntick_save_progress_counter);
>  		raw_spin_lock(&rnp->lock);  /* irqs already disabled */
> -		if (rcu_gp_in_progress(rsp))
> +		if (rcu_gp_in_progress(rsp) && retval)
>  			rsp->signaled = RCU_FORCE_QS;
> +		else if (rcu_gp_in_progress(rsp))
> +			rsp->jiffies_force_qs = jiffies - 1;
>  		break;
>  
>  	case RCU_FORCE_QS:
>  
>  		/* Check dyntick-idle state, send IPI to laggarts. */
>  		raw_spin_unlock(&rnp->lock);  /* irqs remain disabled */
> -		force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
> +		retval = force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
>  
>  		/* Leave state in case more forcing is required. */
> -
>  		raw_spin_lock(&rnp->lock);  /* irqs already disabled */
> +		if (rcu_gp_in_progress(rsp) && !retval)
> +			rsp->jiffies_force_qs = jiffies - 1;
>  		break;
>  	}
>  	rsp->fqs_active = 0;
> @@ -1862,6 +1888,7 @@ rcu_init_percpu_data(int cpu, struct rcu
>  	unsigned long mask;
>  	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
>  	struct rcu_node *rnp = rcu_get_root(rsp);
> +	struct rcu_node *rnp_init;
>  
>  	/* Set up local state, ensuring consistent view of global state. */
>  	raw_spin_lock_irqsave(&rnp->lock, flags);
> @@ -1882,6 +1909,20 @@ rcu_init_percpu_data(int cpu, struct rcu
>  	/* Exclude any attempts to start a new GP on large systems. */
>  	raw_spin_lock(&rsp->onofflock);		/* irqs already disabled. */
>  
> +	/*
> +	 * Initialize any rcu_node structures that will see their first use.
> +	 * Note that rcu_max_cpu cannot change out from under us because the
> +	 * hotplug locks are held.
> +	 */
> +	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
> +	for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
> +	     rnp_init <= rdp->mynode;
> +	     rnp_init++) {
> +		rnp_init->gpnum = rsp->gpnum;
> +		rnp_init->completed = rsp->completed;
> +	}
> +	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */
> +
>  	/* Add CPU to rcu_node bitmasks. */
>  	rnp = rdp->mynode;
>  	mask = rdp->grpmask;
> @@ -1907,6 +1948,11 @@ static void __cpuinit rcu_prepare_cpu(in
>  	rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
>  	rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
>  	rcu_preempt_init_percpu_data(cpu);
> +	if (cpu > rcu_max_cpu) {
> +		smp_mb(); /* Initialization before rcu_max_cpu assignment. */
> +		rcu_max_cpu = cpu;
> +		smp_mb(); /* rcu_max_cpu assignment before later uses. */
> +	}
>  }
>  
>  /*
> --- a/kernel/rcutree.h
> +++ b/kernel/rcutree.h
> @@ -191,11 +191,23 @@ struct rcu_node {
>  
>  /*
>   * Do a full breadth-first scan of the rcu_node structures for the
> - * specified rcu_state structure.
> + * specified rcu_state structure.  The caller must hold either the
> + * ->onofflock or the root rcu_node structure's ->lock.
>   */
> +extern int rcu_max_cpu;
> +static inline int rcu_get_max_cpu(void)
> +{
> +	int ret;
> +
> +	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
> +	ret = rcu_max_cpu;
> +	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
> +	return ret;
> +}
>  #define rcu_for_each_node_breadth_first(rsp, rnp) \
>  	for ((rnp) = &(rsp)->node[0]; \
> -	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
> +	     (rnp) <= per_cpu_ptr((rsp)->rda, rcu_get_max_cpu())->mynode; \
> +	     (rnp)++)
>  
>  /*
>   * Do a breadth-first scan of the non-leaf rcu_node structures for the
> 


* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-22 20:24                           ` Dimitri Sivanich
@ 2012-03-23  4:48                             ` Mike Galbraith
  2012-03-23 19:23                               ` Paul E. McKenney
  0 siblings, 1 reply; 32+ messages in thread
From: Mike Galbraith @ 2012-03-23  4:48 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: paulmck, linux-kernel

On Thu, 2012-03-22 at 15:24 -0500, Dimitri Sivanich wrote: 
> On Thu, Mar 22, 2012 at 04:35:33PM +0100, Mike Galbraith wrote:

> > > This patch also shows great improvement in the two
> > > rcu_for_each_node_breadth_first() (nothing over 20 usec and most less than
> > > 10 in initial testing).
> > > 
> > > However, there are spinlock holdoffs at the following tracebacks (my nmi
> > > handler does work on the 3.0 kernel):
> > > 
> > > [  584.157019]  [<ffffffff8144c5a0>] nmi+0x20/0x30
> > > [  584.157023]  [<ffffffff8144bc8a>] _raw_spin_lock_irqsave+0x1a/0x30
> > > [  584.157026]  [<ffffffff810c5f18>] force_qs_rnp+0x58/0x170
> > > [  584.157030]  [<ffffffff810c6192>] force_quiescent_state+0x162/0x1d0
> > > [  584.157033]  [<ffffffff810c6c95>] __rcu_process_callbacks+0x165/0x200
> > > [  584.157037]  [<ffffffff810c6d4d>] rcu_process_callbacks+0x1d/0x80
> > > [  584.157041]  [<ffffffff81061eaf>] __do_softirq+0xef/0x220
> > > [  584.157044]  [<ffffffff81454cbc>] call_softirq+0x1c/0x30
> > > [  584.157048]  [<ffffffff810043a5>] do_softirq+0x65/0xa0
> > > [  584.157051]  [<ffffffff81061c85>] irq_exit+0xb5/0xe0
> > > [  584.157054]  [<ffffffff810212c8>] smp_apic_timer_interrupt+0x68/0xa0
> > > [  584.157057]  [<ffffffff81454473>] apic_timer_interrupt+0x13/0x20
> > > [  584.157061]  [<ffffffff8102b352>] native_safe_halt+0x2/0x10
> > > [  584.157064]  [<ffffffff8100adf5>] default_idle+0x145/0x150
> > > [  584.157067]  [<ffffffff810020c6>] cpu_idle+0x66/0xc0
> > 
> > Care to try this?  There's likely a better way to defeat ->qsmask == 0
> > take/release all locks thingy, however, if Paul can safely bail in
> > force_qs_rnp() in tweakable latency for big boxen patch, I should be
> > able to safely (and shamelessly) steal that, and should someone hotplug
> > a CPU, and we race, do the same thing bail for small boxen.
> 
> Tested on a 48 cpu UV system with an interrupt latency test on isolated
> cpus and a moderate to heavy load on the rest of the system.
> 
> This patch appears to take care of all excessive (> 35 usec) RCU-based
> latency in the 3.0 kernel on this particular system for this particular
> setup.  Without the patch, I see many latencies on this system > 150 usec
> (and some > 200 usec).

Figures.  I bet Paul has a better idea though.  Too bad we can't whack
those extra barriers, that would likely wipe RCU from your radar.

-Mike



* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-23  4:48                             ` Mike Galbraith
@ 2012-03-23 19:23                               ` Paul E. McKenney
  2012-04-11 11:04                                 ` Mike Galbraith
  0 siblings, 1 reply; 32+ messages in thread
From: Paul E. McKenney @ 2012-03-23 19:23 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Dimitri Sivanich, linux-kernel

On Fri, Mar 23, 2012 at 05:48:06AM +0100, Mike Galbraith wrote:
> On Thu, 2012-03-22 at 15:24 -0500, Dimitri Sivanich wrote: 
> > On Thu, Mar 22, 2012 at 04:35:33PM +0100, Mike Galbraith wrote:
> 
> > > > This patch also shows great improvement in the two
> > > > rcu_for_each_node_breadth_first() (nothing over 20 usec and most less than
> > > > 10 in initial testing).
> > > > 
> > > > However, there are spinlock holdoffs at the following tracebacks (my nmi
> > > > handler does work on the 3.0 kernel):
> > > > 
> > > > [  584.157019]  [<ffffffff8144c5a0>] nmi+0x20/0x30
> > > > [  584.157023]  [<ffffffff8144bc8a>] _raw_spin_lock_irqsave+0x1a/0x30
> > > > [  584.157026]  [<ffffffff810c5f18>] force_qs_rnp+0x58/0x170
> > > > [  584.157030]  [<ffffffff810c6192>] force_quiescent_state+0x162/0x1d0
> > > > [  584.157033]  [<ffffffff810c6c95>] __rcu_process_callbacks+0x165/0x200
> > > > [  584.157037]  [<ffffffff810c6d4d>] rcu_process_callbacks+0x1d/0x80
> > > > [  584.157041]  [<ffffffff81061eaf>] __do_softirq+0xef/0x220
> > > > [  584.157044]  [<ffffffff81454cbc>] call_softirq+0x1c/0x30
> > > > [  584.157048]  [<ffffffff810043a5>] do_softirq+0x65/0xa0
> > > > [  584.157051]  [<ffffffff81061c85>] irq_exit+0xb5/0xe0
> > > > [  584.157054]  [<ffffffff810212c8>] smp_apic_timer_interrupt+0x68/0xa0
> > > > [  584.157057]  [<ffffffff81454473>] apic_timer_interrupt+0x13/0x20
> > > > [  584.157061]  [<ffffffff8102b352>] native_safe_halt+0x2/0x10
> > > > [  584.157064]  [<ffffffff8100adf5>] default_idle+0x145/0x150
> > > > [  584.157067]  [<ffffffff810020c6>] cpu_idle+0x66/0xc0
> > > 
> > > Care to try this?  There's likely a better way to defeat ->qsmask == 0
> > > take/release all locks thingy, however, if Paul can safely bail in
> > > force_qs_rnp() in tweakable latency for big boxen patch, I should be
> > > able to safely (and shamelessly) steal that, and should someone hotplug
> > > a CPU, and we race, do the same thing bail for small boxen.
> > 
> > Tested on a 48 cpu UV system with an interrupt latency test on isolated
> > cpus and a moderate to heavy load on the rest of the system.
> > 
> > This patch appears to take care of all excessive (> 35 usec) RCU-based
> > latency in the 3.0 kernel on this particular system for this particular
> > setup.  Without the patch, I see many latencies on this system > 150 usec
> > (and some > 200 usec).
> 
> Figures.  I bet Paul has a better idea though.  Too bad we can't whack
> those extra barriers, that would likely wipe RCU from your radar.

Sorry for the silence -- was hit by the germs going around.  I do have
some concerns about some of the code, but very much appreciate the two
of you continuing on this in my absence!

							Thanx, Paul



* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-03-23 19:23                               ` Paul E. McKenney
@ 2012-04-11 11:04                                 ` Mike Galbraith
  2012-04-13 18:42                                   ` Paul E. McKenney
  0 siblings, 1 reply; 32+ messages in thread
From: Mike Galbraith @ 2012-04-11 11:04 UTC (permalink / raw)
  To: paulmck; +Cc: Dimitri Sivanich, linux-kernel

On Fri, 2012-03-23 at 12:23 -0700, Paul E. McKenney wrote:
> On Fri, Mar 23, 2012 at 05:48:06AM +0100, Mike Galbraith wrote:
> > On Thu, 2012-03-22 at 15:24 -0500, Dimitri Sivanich wrote: 
> > > On Thu, Mar 22, 2012 at 04:35:33PM +0100, Mike Galbraith wrote:

> > > > Care to try this?  There's likely a better way to defeat ->qsmask == 0
> > > > take/release all locks thingy, however, if Paul can safely bail in
> > > > force_qs_rnp() in tweakable latency for big boxen patch, I should be
> > > > able to safely (and shamelessly) steal that, and should someone hotplug
> > > > a CPU, and we race, do the same thing bail for small boxen.
> > > 
> > > Tested on a 48 cpu UV system with an interrupt latency test on isolated
> > > cpus and a moderate to heavy load on the rest of the system.
> > > 
> > > This patch appears to take care of all excessive (> 35 usec) RCU-based
> > > latency in the 3.0 kernel on this particular system for this particular
> > > setup.  Without the patch, I see many latencies on this system > 150 usec
> > > (and some > 200 usec).
> > 
> > Figures.  I bet Paul has a better idea though.  Too bad we can't whack
> > those extra barriers, that would likely wipe RCU from your radar.
> 
> Sorry for the silence -- was hit by the germs going around.  I do have
> some concerns about some of the code, but very much appreciate the two
> of you continuing on this in my absence!

Hi Paul (and Dimitri),

Just got back to this.  I changed the patch around to check for a
hotplug event in force_qs_rnp(), and should that happen, process any
freshly added CPUs immediately rather than tell the caller to restart
from scratch.  The rest of the delta is cosmetic, and there should be
zero performance change.

Does this change address any of your concerns? 

---
 kernel/rcutree.c |   71 +++++++++++++++++++++++++++++++++++++++++++++----------
 kernel/rcutree.h |   16 ++++++++++--
 2 files changed, 73 insertions(+), 14 deletions(-)

--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -84,6 +84,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d
 
 static struct rcu_state *rcu_state;
 
+int rcu_max_cpu __read_mostly;	/* Largest # CPU that has ever been online. */
+
 /*
  * The rcu_scheduler_active variable transitions from zero to one just
  * before the first task is spawned.  So when this variable is zero, RCU
@@ -827,25 +829,33 @@ rcu_start_gp(struct rcu_state *rsp, unsi
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
+		struct rcu_node *rnp_root = rnp;
+
 		if (cpu_needs_another_gp(rsp, rdp))
 			rsp->fqs_need_gp = 1;
 		if (rnp->completed == rsp->completed) {
-			raw_spin_unlock_irqrestore(&rnp->lock, flags);
+			raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
 			return;
 		}
-		raw_spin_unlock(&rnp->lock);	 /* irqs remain disabled. */
 
 		/*
 		 * Propagate new ->completed value to rcu_node structures
 		 * so that other CPUs don't have to wait until the start
 		 * of the next grace period to process their callbacks.
+		 * We must hold the root rcu_node structure's ->lock
+		 * across rcu_for_each_node_breadth_first() in order to
+		 * synchronize with CPUs coming online for the first time.
 		 */
 		rcu_for_each_node_breadth_first(rsp, rnp) {
+			if (rnp == rnp_root) {
+				rnp->completed = rsp->completed;
+				continue;
+			}
 			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
 			rnp->completed = rsp->completed;
 			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 		}
-		local_irq_restore(flags);
+		raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
 		return;
 	}
 
@@ -935,7 +945,7 @@ static void rcu_report_qs_rsp(struct rcu
 		rsp->gp_max = gp_duration;
 	rsp->completed = rsp->gpnum;
 	rsp->signaled = RCU_GP_IDLE;
-	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
+	rcu_start_gp(rsp, flags);  /* releases root node's ->lock. */
 }
 
 /*
@@ -1313,13 +1323,18 @@ void rcu_check_callbacks(int cpu, int us
 static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
 {
 	unsigned long bit;
-	int cpu;
+	int cpu, max_cpu = rcu_max_cpu, next = 0;
 	unsigned long flags;
 	unsigned long mask;
 	struct rcu_node *rnp;
 
+cpus_hotplugged:
 	rcu_for_each_leaf_node(rsp, rnp) {
-		mask = 0;
+		if (rnp->grplo > max_cpu)
+			break;
+		if(rnp->grphi < next)
+			continue;
+
 		raw_spin_lock_irqsave(&rnp->lock, flags);
 		if (!rcu_gp_in_progress(rsp)) {
 			raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1330,14 +1345,16 @@ static void force_qs_rnp(struct rcu_stat
 			continue;
 		}
 		cpu = rnp->grplo;
-		bit = 1;
-		for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
+		if (unlikely(cpu < next))
+			cpu = next;
+		for (bit = 1, mask = 0; cpu <= rnp->grphi; cpu++, bit <<= 1) {
+			if (cpu > max_cpu)
+				break;
 			if ((rnp->qsmask & bit) != 0 &&
 			    f(per_cpu_ptr(rsp->rda, cpu)))
 				mask |= bit;
 		}
 		if (mask != 0) {
-
 			/* rcu_report_qs_rnp() releases rnp->lock. */
 			rcu_report_qs_rnp(mask, rsp, rnp, flags);
 			continue;
@@ -1345,10 +1362,20 @@ static void force_qs_rnp(struct rcu_stat
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	}
 	rnp = rcu_get_root(rsp);
-	if (rnp->qsmask == 0) {
-		raw_spin_lock_irqsave(&rnp->lock, flags);
-		rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
+	raw_spin_lock_irqsave(&rnp->lock, flags);
+
+	/* Handle unlikely intervening hotplug event. */
+	next = ++max_cpu;
+	max_cpu = rcu_get_max_cpu();
+	if (unlikely(next <= max_cpu)) {
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
+		goto cpus_hotplugged;
 	}
+
+	if (rnp->qsmask == 0)
+		rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
+	else
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 }
 
 /*
@@ -1862,6 +1889,7 @@ rcu_init_percpu_data(int cpu, struct rcu
 	unsigned long mask;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_init;
 
 	/* Set up local state, ensuring consistent view of global state. */
 	raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1882,6 +1910,20 @@ rcu_init_percpu_data(int cpu, struct rcu
 	/* Exclude any attempts to start a new GP on large systems. */
 	raw_spin_lock(&rsp->onofflock);		/* irqs already disabled. */
 
+	/*
+	 * Initialize any rcu_node structures that will see their first use.
+	 * Note that rcu_max_cpu cannot change out from under us because the
+	 * hotplug locks are held.
+	 */
+	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
+	for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
+	     rnp_init <= rdp->mynode;
+	     rnp_init++) {
+		rnp_init->gpnum = rsp->gpnum;
+		rnp_init->completed = rsp->completed;
+	}
+	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */
+
 	/* Add CPU to rcu_node bitmasks. */
 	rnp = rdp->mynode;
 	mask = rdp->grpmask;
@@ -1907,6 +1949,11 @@ static void __cpuinit rcu_prepare_cpu(in
 	rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
 	rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
 	rcu_preempt_init_percpu_data(cpu);
+	if (cpu > rcu_max_cpu) {
+		smp_mb(); /* Initialization before rcu_max_cpu assignment. */
+		rcu_max_cpu = cpu;
+		smp_mb(); /* rcu_max_cpu assignment before later uses. */
+	}
 }
 
 /*
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -191,11 +191,23 @@ struct rcu_node {
 
 /*
  * Do a full breadth-first scan of the rcu_node structures for the
- * specified rcu_state structure.
+ * specified rcu_state structure.  The caller must hold either the
+ * ->onofflock or the root rcu_node structure's ->lock.
  */
+extern int rcu_max_cpu;
+static inline int rcu_get_max_cpu(void)
+{
+	int ret;
+
+	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
+	ret = rcu_max_cpu;
+	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
+	return ret;
+}
 #define rcu_for_each_node_breadth_first(rsp, rnp) \
 	for ((rnp) = &(rsp)->node[0]; \
-	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+	     (rnp) <= per_cpu_ptr((rsp)->rda, rcu_get_max_cpu())->mynode; \
+	     (rnp)++)
 
 /*
  * Do a breadth-first scan of the non-leaf rcu_node structures for the




* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-04-11 11:04                                 ` Mike Galbraith
@ 2012-04-13 18:42                                   ` Paul E. McKenney
  2012-04-14  5:42                                     ` Mike Galbraith
  0 siblings, 1 reply; 32+ messages in thread
From: Paul E. McKenney @ 2012-04-13 18:42 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Dimitri Sivanich, linux-kernel

On Wed, Apr 11, 2012 at 01:04:36PM +0200, Mike Galbraith wrote:
> On Fri, 2012-03-23 at 12:23 -0700, Paul E. McKenney wrote:
> > On Fri, Mar 23, 2012 at 05:48:06AM +0100, Mike Galbraith wrote:
> > > On Thu, 2012-03-22 at 15:24 -0500, Dimitri Sivanich wrote: 
> > > > On Thu, Mar 22, 2012 at 04:35:33PM +0100, Mike Galbraith wrote:
> 
> > > > > Care to try this?  There's likely a better way to defeat ->qsmask == 0
> > > > > take/release all locks thingy, however, if Paul can safely bail in
> > > > > force_qs_rnp() in tweakable latency for big boxen patch, I should be
> > > > > able to safely (and shamelessly) steal that, and should someone hotplug
> > > > > a CPU, and we race, do the same thing bail for small boxen.
> > > > 
> > > > Tested on a 48 cpu UV system with an interrupt latency test on isolated
> > > > cpus and a moderate to heavy load on the rest of the system.
> > > > 
> > > > This patch appears to take care of all excessive (> 35 usec) RCU-based
> > > > latency in the 3.0 kernel on this particular system for this particular
> > > > setup.  Without the patch, I see many latencies on this system > 150 usec
> > > > (and some > 200 usec).
> > > 
> > > Figures.  I bet Paul has a better idea though.  Too bad we can't whack
> > > those extra barriers, that would likely wipe RCU from your radar.
> > 
> > Sorry for the silence -- was hit by the germs going around.  I do have
> > some concerns about some of the code, but very much appreciate the two
> > of you continuing on this in my absence!
> 
> Hi Paul (and Dimitri),
> 
> Just got back to this.  I changed the patch around to check for a
> hotplug event in force_qs_rnp(), and should that happen, process any
> freshly added CPUs immediately rather than tell the caller to restart
> from scratch.  The rest of the delta is cosmetic, and there should be
> zero performance change.
> 
> Does this change address any of your concerns? 

Apologies for being slow to respond...

One of my main concerns was present in my original patch:  I now believe
that a given grace period needs to operate on the set of rcu_node
structures (and CPUs) that were present at the beginning of the grace
period.  Otherwise, things could get confused if a given CPU participated
in a later force_quiescent_state() state, but not in an earlier one.

I believe that the correct way to handle this is to squirrel the maximum
number of CPUs away in the rcu_state structure at the beginning of each
grace period and use that as the limit.
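
A minimal sketch of that approach, with an invented field name (max_cpu_snap)
and placement -- nothing below is from the posted patches:

	/* In struct rcu_state: */
	int max_cpu_snap;	/* Largest CPU # present when the current GP began. */

	/* In rcu_start_gp(), while still holding the root rcu_node ->lock: */
	rsp->max_cpu_snap = rcu_get_max_cpu();

	/*
	 * force_qs_rnp() and rcu_for_each_node_breadth_first() would then
	 * iterate up to rsp->max_cpu_snap rather than the live rcu_max_cpu,
	 * so a CPU coming online mid-GP is picked up by the next grace
	 * period instead of by a later fqs pass of the current one.
	 */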

I call out a few more below.

						Thanx, Paul

> ---
>  kernel/rcutree.c |   71 +++++++++++++++++++++++++++++++++++++++++++++----------
>  kernel/rcutree.h |   16 ++++++++++--
>  2 files changed, 73 insertions(+), 14 deletions(-)
> 
> --- a/kernel/rcutree.c
> +++ b/kernel/rcutree.c
> @@ -84,6 +84,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d
> 
>  static struct rcu_state *rcu_state;
> 
> +int rcu_max_cpu __read_mostly;	/* Largest # CPU that has ever been online. */
> +
>  /*
>   * The rcu_scheduler_active variable transitions from zero to one just
>   * before the first task is spawned.  So when this variable is zero, RCU
> @@ -827,25 +829,33 @@ rcu_start_gp(struct rcu_state *rsp, unsi
>  	struct rcu_node *rnp = rcu_get_root(rsp);
> 
>  	if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
> +		struct rcu_node *rnp_root = rnp;
> +
>  		if (cpu_needs_another_gp(rsp, rdp))
>  			rsp->fqs_need_gp = 1;
>  		if (rnp->completed == rsp->completed) {
> -			raw_spin_unlock_irqrestore(&rnp->lock, flags);
> +			raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
>  			return;
>  		}
> -		raw_spin_unlock(&rnp->lock);	 /* irqs remain disabled. */

Acquiring non-root rcu_node structure ->lock (in loop below) while
holding the root rcu_node lock results in deadlock in some configurations.

>  		/*
>  		 * Propagate new ->completed value to rcu_node structures
>  		 * so that other CPUs don't have to wait until the start
>  		 * of the next grace period to process their callbacks.
> +		 * We must hold the root rcu_node structure's ->lock
> +		 * across rcu_for_each_node_breadth_first() in order to
> +		 * synchronize with CPUs coming online for the first time.
>  		 */
>  		rcu_for_each_node_breadth_first(rsp, rnp) {
> +			if (rnp == rnp_root) {
> +				rnp->completed = rsp->completed;
> +				continue;
> +			}
>  			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
>  			rnp->completed = rsp->completed;
>  			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
>  		}
> -		local_irq_restore(flags);
> +		raw_spin_unlock_irqrestore(&rnp_root->lock, flags);
>  		return;
>  	}
> 
> @@ -935,7 +945,7 @@ static void rcu_report_qs_rsp(struct rcu
>  		rsp->gp_max = gp_duration;
>  	rsp->completed = rsp->gpnum;
>  	rsp->signaled = RCU_GP_IDLE;
> -	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
> +	rcu_start_gp(rsp, flags);  /* releases root node's ->lock. */
>  }
> 
>  /*
> @@ -1313,13 +1323,18 @@ void rcu_check_callbacks(int cpu, int us
>  static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
>  {
>  	unsigned long bit;
> -	int cpu;
> +	int cpu, max_cpu = rcu_max_cpu, next = 0;
>  	unsigned long flags;
>  	unsigned long mask;
>  	struct rcu_node *rnp;
> 
> +cpus_hotplugged:
>  	rcu_for_each_leaf_node(rsp, rnp) {
> -		mask = 0;

Doesn't this leave mask uninitialized?

> +		if (rnp->grplo > max_cpu)
> +			break;
> +		if(rnp->grphi < next)
> +			continue;
> +
>  		raw_spin_lock_irqsave(&rnp->lock, flags);
>  		if (!rcu_gp_in_progress(rsp)) {
>  			raw_spin_unlock_irqrestore(&rnp->lock, flags);
> @@ -1330,14 +1345,16 @@ static void force_qs_rnp(struct rcu_stat
>  			continue;
>  		}
>  		cpu = rnp->grplo;
> -		bit = 1;
> -		for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
> +		if (unlikely(cpu < next))
> +			cpu = next;
> +		for (bit = 1, mask = 0; cpu <= rnp->grphi; cpu++, bit <<= 1) {
> +			if (cpu > max_cpu)
> +				break;
>  			if ((rnp->qsmask & bit) != 0 &&
>  			    f(per_cpu_ptr(rsp->rda, cpu)))
>  				mask |= bit;
>  		}
>  		if (mask != 0) {
> -
>  			/* rcu_report_qs_rnp() releases rnp->lock. */
>  			rcu_report_qs_rnp(mask, rsp, rnp, flags);
>  			continue;
> @@ -1345,10 +1362,20 @@ static void force_qs_rnp(struct rcu_stat
>  		raw_spin_unlock_irqrestore(&rnp->lock, flags);
>  	}
>  	rnp = rcu_get_root(rsp);
> -	if (rnp->qsmask == 0) {
> -		raw_spin_lock_irqsave(&rnp->lock, flags);
> -		rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
> +	raw_spin_lock_irqsave(&rnp->lock, flags);
> +
> +	/* Handle unlikely intervening hotplug event. */
> +	next = ++max_cpu;
> +	max_cpu = rcu_get_max_cpu();
> +	if (unlikely(next <= max_cpu)) {
> +		raw_spin_unlock_irqrestore(&rnp->lock, flags);
> +		goto cpus_hotplugged;

I don't believe that we need this if we snapshot rcu_max_cpu in the
rcu_state structure at the beginning of each grace period.

>  	}
> +
> +	if (rnp->qsmask == 0)
> +		rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
> +	else
> +		raw_spin_unlock_irqrestore(&rnp->lock, flags);
>  }
> 
>  /*
> @@ -1862,6 +1889,7 @@ rcu_init_percpu_data(int cpu, struct rcu
>  	unsigned long mask;
>  	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
>  	struct rcu_node *rnp = rcu_get_root(rsp);
> +	struct rcu_node *rnp_init;
> 
>  	/* Set up local state, ensuring consistent view of global state. */
>  	raw_spin_lock_irqsave(&rnp->lock, flags);
> @@ -1882,6 +1910,20 @@ rcu_init_percpu_data(int cpu, struct rcu
>  	/* Exclude any attempts to start a new GP on large systems. */
>  	raw_spin_lock(&rsp->onofflock);		/* irqs already disabled. */
> 
> +	/*
> +	 * Initialize any rcu_node structures that will see their first use.
> +	 * Note that rcu_max_cpu cannot change out from under us because the
> +	 * hotplug locks are held.
> +	 */
> +	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
> +	for (rnp_init = per_cpu_ptr(rsp->rda, rcu_max_cpu)->mynode + 1;
> +	     rnp_init <= rdp->mynode;
> +	     rnp_init++) {
> +		rnp_init->gpnum = rsp->gpnum;
> +		rnp_init->completed = rsp->completed;
> +	}
> +	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */
> +
>  	/* Add CPU to rcu_node bitmasks. */
>  	rnp = rdp->mynode;
>  	mask = rdp->grpmask;
> @@ -1907,6 +1949,11 @@ static void __cpuinit rcu_prepare_cpu(in
>  	rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
>  	rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
>  	rcu_preempt_init_percpu_data(cpu);
> +	if (cpu > rcu_max_cpu) {
> +		smp_mb(); /* Initialization before rcu_max_cpu assignment. */
> +		rcu_max_cpu = cpu;
> +		smp_mb(); /* rcu_max_cpu assignment before later uses. */

If we make rcu_init_percpu_data() update a second new field in the
rcu_state structure, we can get rid of the memory barriers.
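
One way that might look, as a rough sketch only (field name invented here),
assuming the update happens inside the ->onofflock/root-lock region that the
hunk above already uses to initialize the new rcu_node structures, so that
readers holding either lock see a consistent value without explicit barriers:

	/* In struct rcu_state: */
	int max_cpu;	/* Largest CPU # ever online; updated under ->onofflock. */

	/* In rcu_init_percpu_data(), with ->onofflock (and the root
	 * rcu_node ->lock) already held: */
	if (cpu > rsp->max_cpu)
		rsp->max_cpu = cpu;

	/*
	 * rcu_for_each_node_breadth_first() and force_qs_rnp() would then read
	 * rsp->max_cpu while holding ->onofflock or the root rcu_node ->lock,
	 * and the smp_mb() pairs plus rcu_get_max_cpu() could go away.
	 */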

> +	}
>  }
> 
>  /*
> --- a/kernel/rcutree.h
> +++ b/kernel/rcutree.h
> @@ -191,11 +191,23 @@ struct rcu_node {
> 
>  /*
>   * Do a full breadth-first scan of the rcu_node structures for the
> - * specified rcu_state structure.
> + * specified rcu_state structure.  The caller must hold either the
> + * ->onofflock or the root rcu_node structure's ->lock.
>   */
> +extern int rcu_max_cpu;
> +static inline int rcu_get_max_cpu(void)
> +{
> +	int ret;
> +
> +	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
> +	ret = rcu_max_cpu;
> +	smp_mb();  /* Pairs with barriers in rcu_prepare_cpu(). */
> +	return ret;
> +}
>  #define rcu_for_each_node_breadth_first(rsp, rnp) \
>  	for ((rnp) = &(rsp)->node[0]; \
> -	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
> +	     (rnp) <= per_cpu_ptr((rsp)->rda, rcu_get_max_cpu())->mynode; \
> +	     (rnp)++)
> 
>  /*
>   * Do a breadth-first scan of the non-leaf rcu_node structures for the
> 
> 



* Re: [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online
  2012-04-13 18:42                                   ` Paul E. McKenney
@ 2012-04-14  5:42                                     ` Mike Galbraith
  0 siblings, 0 replies; 32+ messages in thread
From: Mike Galbraith @ 2012-04-14  5:42 UTC (permalink / raw)
  To: paulmck; +Cc: Dimitri Sivanich, linux-kernel

On Fri, 2012-04-13 at 11:42 -0700, Paul E. McKenney wrote: 
> On Wed, Apr 11, 2012 at 01:04:36PM +0200, Mike Galbraith wrote:

> > Hi Paul (and Dimitri),
> > 
> > Just got back to this.  I changed the patch around to check for a
> > hotplug event in force_qs_rnp(), and should that happen, process any
> > freshly added CPUs immediately rather than tell the caller to restart
> > from scratch.  The rest of the delta is cosmetic, and there should be
> > zero performance change.
> > 
> > Does this change address any of your concerns? 
> 
> Apologies for being slow to respond...

Hey, I'm grateful for any time you have to share.

> One of my main concerns was present in my original patch:  I now believe
> that a given grace period needs to operate on the set of rcu_node
> structures (and CPUs) that were present at the beginning of the grace
> period.  Otherwise, things could get confused if a given CPU participated
> in a later force_quiescent_state() state, but not in an earlier one.
> 
> I believe that the correct way to handle this is to squirrel the maximum
> number of CPUs away in the rcu_state structure at the beginning of each
> grace period and use that as the limit.
> 
> I call out a few more below.

Read it, got it.  The last node being partially initialized still causes
some head scratching, I'll ponder that more between explosions.  Thanks.

-Mike



end of thread, other threads:[~2012-04-14  5:43 UTC | newest]

Thread overview: 32+ messages
2012-03-14  0:24 [PATCH RFC] rcu: Limit GP initialization to CPUs that have been online Paul E. McKenney
2012-03-14  9:24 ` Mike Galbraith
2012-03-14 12:40   ` Mike Galbraith
2012-03-14 13:08     ` Dimitri Sivanich
2012-03-14 15:17       ` Paul E. McKenney
2012-03-14 16:56         ` Paul E. McKenney
2012-03-15  2:42           ` Mike Galbraith
2012-03-15  3:07             ` Mike Galbraith
2012-03-15 17:02               ` Paul E. McKenney
2012-03-15 17:21                 ` Dimitri Sivanich
2012-03-16  4:45                 ` Mike Galbraith
2012-03-15 17:59               ` Dimitri Sivanich
2012-03-16  7:27                 ` Mike Galbraith
2012-03-16  8:09                   ` Mike Galbraith
2012-03-16  8:45                     ` Mike Galbraith
2012-03-16 17:28                       ` Dimitri Sivanich
2012-03-16 17:51                         ` Paul E. McKenney
2012-03-16 17:56                           ` Dimitri Sivanich
2012-03-16 19:11                         ` Mike Galbraith
2012-03-22 15:35                         ` Mike Galbraith
2012-03-22 20:24                           ` Dimitri Sivanich
2012-03-23  4:48                             ` Mike Galbraith
2012-03-23 19:23                               ` Paul E. McKenney
2012-04-11 11:04                                 ` Mike Galbraith
2012-04-13 18:42                                   ` Paul E. McKenney
2012-04-14  5:42                                     ` Mike Galbraith
2012-03-15 17:58           ` Dimitri Sivanich
2012-03-15 18:23             ` Paul E. McKenney
2012-03-15 21:07               ` Paul E. McKenney
2012-03-16 15:46                 ` Dimitri Sivanich
2012-03-16 17:21                   ` Paul E. McKenney
2012-03-14 17:07         ` Mike Galbraith
