* RFC: Lower broadcast timer idle path lock contention
@ 2011-05-06 21:40 Andi Kleen
  2011-05-06 21:40 ` [PATCH 1/4] Move C3 stop test outside lock Andi Kleen
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: Andi Kleen @ 2011-05-06 21:40 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Chris Mason, Tim Chen, linux-kernel

This patchkit implements some ideas discussed in the earlier thread
to lower the lock contention of the broadcast timer idle path:

- Do the check for a timer that keeps running in C3 outside the lock
[Already in tip, but I'm including it to have a full patchkit]
- Change the global bitmask to per cpu variables to avoid false sharing
- Exploit that thread siblings keep the timer alive
[This patch is unfortunately quite hairy and needs very in-depth review]
This uses a lock, but only a local one per core.
- Only take the global lock when the timeout of the broadcast device
has to be set to an earlier time.

Longer term the global lock likely still needs to be eliminated, but 
this at least makes it less pressing.
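
In sketch form the pattern all four patches build on looks like this
(a standalone userspace model, with a pthread spinlock standing in for
the kernel's raw tick_broadcast_lock; the names are invented for the
sketch, not taken from the kernel):

#include <pthread.h>
#include <stdbool.h>

/* models tick_broadcast_lock; pthread_spin_init() assumed at startup */
static pthread_spinlock_t global_lock;
/* models the per cpu "does my timer stop in deep idle" condition */
static _Thread_local bool timer_stops_in_idle;

static void idle_enter(void)
{
	/* Fast path: only CPU-local state, no shared cache line, no lock */
	if (!timer_stops_in_idle)
		return;

	/* Slow path: global state has to change, so serialize */
	pthread_spin_lock(&global_lock);
	/* ... hand this CPU over to the broadcast device ... */
	pthread_spin_unlock(&global_lock);
}

Each patch moves more of the decision into the fast path above the lock.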

Comments/testing welcome. I haven't done a lot of scalability testing
so far; I just ran a stress tester to make sure I didn't miss any cases.

-Andi



* [PATCH 1/4] Move C3 stop test outside lock
  2011-05-06 21:40 RFC: Lower broadcast timer idle path lock contention Andi Kleen
@ 2011-05-06 21:40 ` Andi Kleen
  2011-05-06 21:40 ` [PATCH 2/4] Move oneshot broadcast mask to per cpu variables Andi Kleen
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 6+ messages in thread
From: Andi Kleen @ 2011-05-06 21:40 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Chris Mason, Tim Chen, linux-kernel, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Avoid taking locks in the idle path for systems where the timer
doesn't stop in C3.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 kernel/time/tick-broadcast.c |   11 ++++-------
 1 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index da800ff..2fc424d 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -456,23 +456,22 @@ void tick_broadcast_oneshot_control(unsigned long reason)
 	unsigned long flags;
 	int cpu;
 
-	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
-
 	/*
 	 * Periodic mode does not care about the enter/exit of power
 	 * states
 	 */
 	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
-		goto out;
+		return;
 
-	bc = tick_broadcast_device.evtdev;
 	cpu = smp_processor_id();
+	bc = tick_broadcast_device.evtdev;
 	td = &per_cpu(tick_cpu_device, cpu);
 	dev = td->evtdev;
 
 	if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
-		goto out;
+		return;
 
+	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
 	if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
 		if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
 			cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
@@ -489,8 +488,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)
 				tick_program_event(dev->next_event, 1);
 		}
 	}
-
-out:
 	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
 
-- 
1.7.4.4



* [PATCH 2/4] Move oneshot broadcast mask to per cpu variables
  2011-05-06 21:40 RFC: Lower broadcast timer idle path lock contention Andi Kleen
  2011-05-06 21:40 ` [PATCH 1/4] Move C3 stop test outside lock Andi Kleen
@ 2011-05-06 21:40 ` Andi Kleen
  2011-05-06 21:40 ` [PATCH 3/4] Avoid tick broadcast switch-overs for thread siblings Andi Kleen
  2011-05-06 21:40 ` [PATCH 4/4] Avoid broadcast time lock when not changing the timeout Andi Kleen
  3 siblings, 0 replies; 6+ messages in thread
From: Andi Kleen @ 2011-05-06 21:40 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Chris Mason, Tim Chen, linux-kernel, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Avoid a global cache line hotspot in the oneshot cpu mask. Maintain
this information in per cpu variables instead.
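
For illustration, a minimal standalone model of the layout change (the
64-byte line size and the MAX_CPUS bound are assumptions made for the
sketch, not values from the patch):

#include <stdalign.h>

#define CACHE_LINE_SIZE	64	/* assumed typical line size */
#define MAX_CPUS	256

/*
 * The old scheme packed all CPUs' bits into one shared bitmap, so every
 * idle enter/exit dirtied the same cache line for everybody. Giving
 * each CPU an aligned slot of its own removes that false sharing.
 */
struct cpu_state_model {
	alignas(CACHE_LINE_SIZE) int need_oneshot;
};

static struct cpu_state_model cpu_state[MAX_CPUS];

static void set_need_oneshot(int cpu, int on)
{
	/* writes touch only this CPU's cache line */
	cpu_state[cpu].need_oneshot = on;
}

The trade-off, visible in the timer_list.c hunk below, is that producing
the full mask for debugging now requires a scan over the online CPUs.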

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/tick.h         |    2 +-
 kernel/time/tick-broadcast.c |   40 ++++++++++++++++++++++++----------------
 kernel/time/timer_list.c     |   11 +++++++++--
 3 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index b232ccc..3df6e2e 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -90,7 +90,7 @@ extern struct tick_device *tick_get_broadcast_device(void);
 extern struct cpumask *tick_get_broadcast_mask(void);
 
 #  ifdef CONFIG_TICK_ONESHOT
-extern struct cpumask *tick_get_broadcast_oneshot_mask(void);
+extern void tick_get_broadcast_oneshot_mask(struct cpumask *);
 #  endif
 
 # endif /* BROADCAST */
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 2fc424d..92aba0b 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -358,15 +358,22 @@ int tick_resume_broadcast(void)
 
 #ifdef CONFIG_TICK_ONESHOT
 
-/* FIXME: use cpumask_var_t. */
-static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS);
+struct broadcast_cpu_state {
+	int need_oneshot;
+} ____cacheline_aligned;
+static DEFINE_PER_CPU(struct broadcast_cpu_state, state);
 
 /*
  * Exposed for debugging: see timer_list.c
  */
-struct cpumask *tick_get_broadcast_oneshot_mask(void)
+void tick_get_broadcast_oneshot_mask(struct cpumask *mask)
 {
-	return to_cpumask(tick_broadcast_oneshot_mask);
+	int i;
+
+	for_each_online_cpu (i) {
+		if (per_cpu(state, i).need_oneshot)
+			cpumask_set_cpu(i, mask);
+	}
 }
 
 static int tick_broadcast_set_event(ktime_t expires, int force)
@@ -388,7 +395,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
  */
 void tick_check_oneshot_broadcast(int cpu)
 {
-	if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) {
+	if (per_cpu(state, cpu).need_oneshot) {
 		struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
 
 		clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
@@ -411,7 +418,9 @@ again:
 	cpumask_clear(to_cpumask(tmpmask));
 	now = ktime_get();
 	/* Find all expired events */
-	for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) {
+	for_each_online_cpu(cpu) {
+		if (!per_cpu(state, cpu).need_oneshot)
+			continue;
 		td = &per_cpu(tick_cpu_device, cpu);
 		if (td->evtdev->next_event.tv64 <= now.tv64)
 			cpumask_set_cpu(cpu, to_cpumask(tmpmask));
@@ -473,16 +482,15 @@ void tick_broadcast_oneshot_control(unsigned long reason)
 
 	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
 	if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
-		if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
-			cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
+		if (!__get_cpu_var(state).need_oneshot) {
+			__get_cpu_var(state).need_oneshot = 1;
 			clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
 			if (dev->next_event.tv64 < bc->next_event.tv64)
 				tick_broadcast_set_event(dev->next_event, 1);
 		}
 	} else {
-		if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
-			cpumask_clear_cpu(cpu,
-					  tick_get_broadcast_oneshot_mask());
+		if (__get_cpu_var(state).need_oneshot) {
+			__get_cpu_var(state).need_oneshot = 0;
 			clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
 			if (dev->next_event.tv64 != KTIME_MAX)
 				tick_program_event(dev->next_event, 1);
@@ -498,7 +506,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
  */
 static void tick_broadcast_clear_oneshot(int cpu)
 {
-	cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
+	per_cpu(state, cpu).need_oneshot = 0;
 }
 
 static void tick_broadcast_init_next_event(struct cpumask *mask,
@@ -523,6 +531,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 	if (bc->event_handler != tick_handle_oneshot_broadcast) {
 		int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
 		int cpu = smp_processor_id();
+		int i;
 
 		bc->event_handler = tick_handle_oneshot_broadcast;
 		clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -538,9 +547,8 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 		 */
 		cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask());
 		cpumask_clear_cpu(cpu, to_cpumask(tmpmask));
-		cpumask_or(tick_get_broadcast_oneshot_mask(),
-			   tick_get_broadcast_oneshot_mask(),
-			   to_cpumask(tmpmask));
+		for_each_cpu (i, to_cpumask(tmpmask))
+			per_cpu(state, i).need_oneshot = 1;
 
 		if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
 			tick_broadcast_init_next_event(to_cpumask(tmpmask),
@@ -583,7 +591,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
 	 * Clear the broadcast mask flag for the dead cpu, but do not
 	 * stop the broadcast device!
 	 */
-	cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
+	per_cpu(state, cpu).need_oneshot = 0;
 
 	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 3258455..28964ed 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -235,14 +235,21 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
 static void timer_list_show_tickdevices(struct seq_file *m)
 {
 	int cpu;
+#ifdef CONFIG_TICK_ONESHOT
+	cpumask_var_t mask;
+#endif
 
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
 	print_tickdevice(m, tick_get_broadcast_device(), -1);
 	SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
 		   cpumask_bits(tick_get_broadcast_mask())[0]);
 #ifdef CONFIG_TICK_ONESHOT
-	SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
-		   cpumask_bits(tick_get_broadcast_oneshot_mask())[0]);
+	if (zalloc_cpumask_var(&mask, GFP_KERNEL)) {
+		tick_get_broadcast_oneshot_mask(mask);
+		SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
+			   cpumask_bits(mask)[0]);
+		free_cpumask_var(mask);
+	}
 #endif
 	SEQ_printf(m, "\n");
 #endif
-- 
1.7.4.4



* [PATCH 3/4] Avoid tick broadcast switch-overs for thread siblings
  2011-05-06 21:40 RFC: Lower broadcast timer idle path lock contention Andi Kleen
  2011-05-06 21:40 ` [PATCH 1/4] Move C3 stop test outside lock Andi Kleen
  2011-05-06 21:40 ` [PATCH 2/4] Move oneshot broadcast mask to per cpu variables Andi Kleen
@ 2011-05-06 21:40 ` Andi Kleen
  2011-05-07  5:32   ` Dave Kleikamp
  2011-05-06 21:40 ` [PATCH 4/4] Avoid broadcast time lock when not changing the timeout Andi Kleen
  3 siblings, 1 reply; 6+ messages in thread
From: Andi Kleen @ 2011-05-06 21:40 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Chris Mason, Tim Chen, linux-kernel, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

On SMT systems the thread siblings will keep the timer alive
in any power state. Teach the oneshot broadcast logic about this.

As long as any thread sibling is still running, keep using the local
timer device. When we actually switch over to broadcast we need
to use the earliest timer expiry of all the siblings.

This adds a new "slave" state: a slave is tied to another CPU.
When the other CPU goes idle too, switch all slaves over
to broadcast timing.

This lowers contention on the broadcast lock and
general overhead.
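
A rough model of the intended per-core state machine (a sketch of the
idea only, with invented names; the patch below is the actual and
considerably hairier implementation):

enum tick_state { RUNNING, SLAVE, ONESHOT };

/*
 * Sketch, called with the core-local lock held. *timeout enters as
 * this CPU's own expiry, as in the patch. A CPU entering idle becomes
 * a SLAVE while any sibling still runs; only when the whole core idles
 * do all of its SLAVEs flip to ONESHOT broadcast mode.
 */
static enum tick_state idle_enter(enum tick_state *core, int nr, int me,
				  long long *timeout,
				  const long long *next_event)
{
	int i, running = 0;

	for (i = 0; i < nr; i++) {
		if (i != me && core[i] == RUNNING)
			running++;
		/* the broadcast must honor the earliest expiry on the core */
		if (core[i] != RUNNING && next_event[i] < *timeout)
			*timeout = next_event[i];
	}

	if (running) {
		core[me] = SLAVE;	/* a sibling's timer still ticks for us */
		return SLAVE;
	}

	/* last thread of the core going idle: everyone needs broadcast */
	for (i = 0; i < nr; i++)
		if (core[i] == SLAVE)
			core[i] = ONESHOT;
	core[me] = ONESHOT;
	return ONESHOT;			/* caller now takes the global lock */
}

Exit is cheaper: un-idling a slave only clears its own per cpu flag,
which is why tick_sibling_active() handles that case locklessly.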

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 kernel/time/tick-broadcast.c |   97 +++++++++++++++++++++++++++++++++++++++---
 1 files changed, 90 insertions(+), 7 deletions(-)

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 92aba0b..c1587cb 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -358,10 +358,16 @@ int tick_resume_broadcast(void)
 
 #ifdef CONFIG_TICK_ONESHOT
 
+/* Lock on the first thread on a core coordinates state */
 struct broadcast_cpu_state {
+	int slave;
 	int need_oneshot;
+	raw_spinlock_t lock;
 } ____cacheline_aligned;
-static DEFINE_PER_CPU(struct broadcast_cpu_state, state);
+
+static DEFINE_PER_CPU(struct broadcast_cpu_state, state) = {
+	.lock = __RAW_SPIN_LOCK_UNLOCKED(lock)
+};
 
 /*
  * Exposed for debugging: see timer_list.c
@@ -454,6 +460,70 @@ again:
 	raw_spin_unlock(&tick_broadcast_lock);
 }
 
+#define for_each_sibling(i, cpu) for_each_cpu(i, topology_thread_cpumask(cpu))
+
+/* 
+ * When another thread sibling is alive our timer keeps ticking.
+ * Check for this here because it's much less expensive.
+ * When this happens the current CPU turns into a slave, tied
+ * to the still running CPU. When that also goes idle both
+ * become serviced by the broadcaster.
+ */
+static int tick_sibling_active(int cpu, ktime_t *timeout, int enter)
+{
+	int i, leader;
+	int running;
+	ktime_t n;
+
+	/* 
+ 	 * Exit can be done lockless because unidling 
+ 	 * does not affect others.
+ 	 */
+	if (!enter) {
+		int was_slave = __get_cpu_var(state).slave;
+		__get_cpu_var(state).slave = 0;
+		return was_slave;
+	}
+
+	leader = cpumask_first(topology_thread_cpumask(cpu));
+	running = 1;
+	raw_spin_lock(&per_cpu(state, leader).lock);
+	for_each_sibling(i, cpu) {
+		struct broadcast_cpu_state *s = &per_cpu(state, i);
+
+		n = per_cpu(tick_cpu_device, i).evtdev->next_event;
+		if (n.tv64 < timeout->tv64 && (s->slave || s->need_oneshot))
+			*timeout = n;
+		if (!s->slave && !s->need_oneshot)
+			running++;
+	}
+	__get_cpu_var(state).slave = running > 1;
+	raw_spin_unlock(&per_cpu(state, leader).lock);
+	return running > 1;
+}
+
+/* 
+ * Sync oneshot state with siblings.
+ */
+static void set_broadcast_sibling_state(int cpu, int enter)
+{
+	int i;
+
+	for_each_sibling(i, cpu) {
+		struct broadcast_cpu_state *s = &per_cpu(state, i);
+
+		if (enter && s->slave) {
+			s->need_oneshot = 1;
+			wmb();
+			s->slave = 0;
+		} else if (!enter && s->need_oneshot) {
+			s->slave = 1;
+			wmb();
+			s->need_oneshot = 0;
+		}
+	}
+}
+
 /*
  * Powerstate information: The system enters/leaves a state, where
  * affected devices might stop
@@ -464,7 +534,8 @@ void tick_broadcast_oneshot_control(unsigned long reason)
 	struct tick_device *td;
 	unsigned long flags;
 	int cpu;
-
+	ktime_t timeout;
+	
 	/*
 	 * Periodic mode does not care about the enter/exit of power
 	 * states
@@ -476,21 +547,28 @@ void tick_broadcast_oneshot_control(unsigned long reason)
 	bc = tick_broadcast_device.evtdev;
 	td = &per_cpu(tick_cpu_device, cpu);
 	dev = td->evtdev;
+	timeout = td->evtdev->next_event;
 
 	if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
 		return;
 
+	if (tick_sibling_active(cpu, &timeout,
+				reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER))
+		return;
+
 	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
 	if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
 		if (!__get_cpu_var(state).need_oneshot) {
-			__get_cpu_var(state).need_oneshot = 1;
+			/* Turn all slaves into oneshots */
+			set_broadcast_sibling_state(cpu, 1);
 			clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
-			if (dev->next_event.tv64 < bc->next_event.tv64)
-				tick_broadcast_set_event(dev->next_event, 1);
+			if (timeout.tv64 < bc->next_event.tv64)
+				tick_broadcast_set_event(timeout, 1);
 		}
 	} else {
 		if (__get_cpu_var(state).need_oneshot) {
-			__get_cpu_var(state).need_oneshot = 0;
+			/* Turn all oneshots into slaves */
+			set_broadcast_sibling_state(cpu, 0);
 			clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
 			if (dev->next_event.tv64 != KTIME_MAX)
 				tick_program_event(dev->next_event, 1);
@@ -506,7 +584,12 @@ void tick_broadcast_oneshot_control(unsigned long reason)
  */
 static void tick_broadcast_clear_oneshot(int cpu)
 {
-	per_cpu(state, cpu).need_oneshot = 0;
+	int i;
+
+	for_each_sibling (i, cpu) {
+		per_cpu(state, i).need_oneshot = 0;
+		per_cpu(state, i).slave = 0;
+	}
 }
 
 static void tick_broadcast_init_next_event(struct cpumask *mask,
-- 
1.7.4.4



* [PATCH 4/4] Avoid broadcast time lock when not changing the timeout
  2011-05-06 21:40 RFC: Lower broadcast timer idle path lock contention Andi Kleen
                   ` (2 preceding siblings ...)
  2011-05-06 21:40 ` [PATCH 3/4] Avoid tick broadcast switch-overs for thread siblings Andi Kleen
@ 2011-05-06 21:40 ` Andi Kleen
  3 siblings, 0 replies; 6+ messages in thread
From: Andi Kleen @ 2011-05-06 21:40 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Chris Mason, Tim Chen, linux-kernel, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Don't take the global lock when the new timeout is later than the
expiry already programmed into the broadcast device.

A later timeout never requires reprogramming the broadcast device,
and concurrent idlers only ever move its expiry earlier, so the
unlocked comparison is safe.
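
In sketch form (a standalone model, not the kernel code; the re-check
under the lock is an addition for the model):

#include <stdint.h>
#include <pthread.h>

/* models tick_broadcast_lock; pthread_spin_init() assumed at startup */
static pthread_spinlock_t lock;
static int64_t next_event = INT64_MAX;	/* models bc->next_event.tv64 */

static void maybe_set_timeout(int64_t to)
{
	/* Common case: our wakeup is no earlier than what is already
	   programmed, so nothing changes and no lock is needed. */
	if (to >= next_event)
		return;

	pthread_spin_lock(&lock);
	if (to < next_event)		/* re-check now that we hold the lock */
		next_event = to;	/* models tick_broadcast_set_event() */
	pthread_spin_unlock(&lock);
}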

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 kernel/time/tick-broadcast.c |   17 ++++++++++++-----
 1 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index c1587cb..4776742 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -524,6 +524,17 @@ static void set_broadcast_sibling_state(int cpu, int enter)
 	}
 }
 
+static void set_broadcast_timeout(ktime_t to, struct clock_event_device *bc)
+{
+	unsigned long flags;
+
+	if (to.tv64 < bc->next_event.tv64) {
+		raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+		tick_broadcast_set_event(to, 1);
+		raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+	}
+}
+
 /*
  * Powerstate information: The system enters/leaves a state, where
  * affected devices might stop
@@ -532,7 +543,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)
 {
 	struct clock_event_device *bc, *dev;
 	struct tick_device *td;
-	unsigned long flags;
 	int cpu;
 	ktime_t timeout;
 	
@@ -556,14 +566,12 @@ void tick_broadcast_oneshot_control(unsigned long reason)
 				reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER))
 		return;
 
-	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
 	if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
 		if (!__get_cpu_var(state).need_oneshot) {
 			/* Turn all slaves into oneshots */
 			set_broadcast_sibling_state(cpu, 1);
 			clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
-			if (timeout.tv64 < bc->next_event.tv64)
-				tick_broadcast_set_event(timeout, 1);
+			set_broadcast_timeout(timeout, bc);
 		}
 	} else {
 		if (__get_cpu_var(state).need_oneshot) {
@@ -574,7 +582,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)
 				tick_program_event(dev->next_event, 1);
 		}
 	}
-	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
 
 /*
-- 
1.7.4.4



* Re: [PATCH 3/4] Avoid tick broadcast switch-overs for thread siblings
  2011-05-06 21:40 ` [PATCH 3/4] Avoid tick broadcast switch-overs for thread siblings Andi Kleen
@ 2011-05-07  5:32   ` Dave Kleikamp
  0 siblings, 0 replies; 6+ messages in thread
From: Dave Kleikamp @ 2011-05-07  5:32 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Thomas Gleixner, Chris Mason, Tim Chen, linux-kernel, Andi Kleen

On 05/06/2011 04:40 PM, Andi Kleen wrote:
> From: Andi Kleen <ak@linux.intel.com>
>
> On SMT systems the thread siblings will keep the timer alive
> in any power state. Teach the oneshot broadcast logic about this.
>
> As long as any thread sibling is still running, keep using the local
> timer device. When we actually switch over to broadcast we need
> to use the earliest timer expiry of all the siblings.
>
> This adds a new "slave" state: a slave is tied to another CPU.
> When the other CPU goes idle too, switch all slaves over
> to broadcast timing.
>
> This lowers contention on the broadcast lock and
> general overhead.
>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>

This patch causes a 128-cpu system to hang during boot. I've got a busy 
weekend planned, so I might not get a chance to look at this much more 
before Monday.

I tried fixing the problems I found below, but it still doesn't make it 
all the way through the boot, so I'm missing something.

> ---
>   kernel/time/tick-broadcast.c |   97 +++++++++++++++++++++++++++++++++++++++---
>   1 files changed, 90 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
> index 92aba0b..c1587cb 100644
> --- a/kernel/time/tick-broadcast.c
> +++ b/kernel/time/tick-broadcast.c
> @@ -358,10 +358,16 @@ int tick_resume_broadcast(void)
>
>   #ifdef CONFIG_TICK_ONESHOT
>
> +/* Lock on the first thread on a core coordinates state */
>   struct broadcast_cpu_state {
> +	int slave;
>   	int need_oneshot;
> +	raw_spinlock_t lock;
>   } ____cacheline_aligned;
> -static DEFINE_PER_CPU(struct broadcast_cpu_state, state);
> +
> +static DEFINE_PER_CPU(struct broadcast_cpu_state, state) = {
> +	.lock = __RAW_SPIN_LOCK_UNLOCKED(lock)
> +};
>
>   /*
>    * Exposed for debugging: see timer_list.c
> @@ -454,6 +460,70 @@ again:
>   	raw_spin_unlock(&tick_broadcast_lock);
>   }
>
> +#define for_each_sibling(i, cpu) for_each_cpu(i, topology_thread_cpumask(cpu))
> +
> +/*
> + * When another thread sibling is alive our timer keeps ticking.
> + * Check for this here because it's much less expensive.
> + * When this happens the current CPU turns into a slave, tied
> + * to the still running CPU. When that also goes idle both
> + * become serviced by the broadcaster.
> + */
> +static int tick_sibling_active(int cpu, ktime_t *timeout, int enter)
> +{
> +	int i, leader;
> +	int running;
> +	ktime_t n;
> +
> +	/*
> + 	 * Exit can be done lockless because unidling
> + 	 * does not affect others.
> + 	 */
> +	if (!enter) {
> +		int was_slave = __get_cpu_var(state).slave;
> +		__get_cpu_var(state).slave = 0;
> +		return was_slave;
> +	}
> +
> +	leader = cpumask_first(topology_thread_cpumask(cpu));
> +	running = 1;

I don't understand this initialization. Won't the following loop
increment running for the calling cpu? Shouldn't it be initialized to 0?

> +	raw_spin_lock(&per_cpu(state, leader).lock);
> +	for_each_sibling(i, cpu) {
> +		struct broadcast_cpu_state *s = &per_cpu(state, i);
> +
> +		n = per_cpu(tick_cpu_device, i).evtdev->next_event;
> +		if (n.tv64 < timeout->tv64 && (s->slave || s->need_oneshot))
> +			*timeout = n;
> +		if (!s->slave && !s->need_oneshot)
> +			running++;
> +	}
> +	__get_cpu_var(state).slave = running > 1;
> +	raw_spin_unlock(&per_cpu(state, leader).lock);
> +	return running > 1;
> +}
> +
> +/*
> + * Sync oneshot state with siblings.
> + */
> +static void set_broadcast_sibling_state(int cpu, int enter)
> +{
> +	int i;
> +
> +	for_each_sibling(i, cpu) {
> +		struct broadcast_cpu_state *s = &per_cpu(state, i);
> +
> +		if (enter && s->slave) {
> +			s->need_oneshot = 1;
> +			wmb();
> +			s->slave = 0;
> +		} else if (!enter && s->need_oneshot) {
> +			s->slave = 1;
> +			wmb();
> +			s->need_oneshot = 0;
> +		}
> +	}
> +}
> +
>   /*
>    * Powerstate information: The system enters/leaves a state, where
>    * affected devices might stop
> @@ -464,7 +534,8 @@ void tick_broadcast_oneshot_control(unsigned long reason)
>   	struct tick_device *td;
>   	unsigned long flags;
>   	int cpu;
> -
> +	ktime_t timeout;
> +	
>   	/*
>   	 * Periodic mode does not care about the enter/exit of power
>   	 * states
> @@ -476,21 +547,28 @@ void tick_broadcast_oneshot_control(unsigned long reason)
>   	bc = tick_broadcast_device.evtdev;
>   	td = &per_cpu(tick_cpu_device, cpu);
>   	dev = td->evtdev;
> +	timeout = td->evtdev->next_event;
>
>   	if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
>   		return;
>
> +	if (tick_sibling_active(cpu, &timeout,
> +				reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER))
> +		return;
> +
>   	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
>   	if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
>   		if (!__get_cpu_var(state).need_oneshot) {
> -			__get_cpu_var(state).need_oneshot = 1;

Don't we still need to set need_oneshot here for this cpu?

> +			/* Turn all slaves into oneshots */
> +			set_broadcast_sibling_state(cpu, 1);
>   			clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
> -			if (dev->next_event.tv64 < bc->next_event.tv64)
> -				tick_broadcast_set_event(dev->next_event, 1);
> +			if (timeout.tv64 < bc->next_event.tv64)
> +				tick_broadcast_set_event(timeout, 1);
>   		}
>   	} else {
>   		if (__get_cpu_var(state).need_oneshot) {
> -			__get_cpu_var(state).need_oneshot = 0;

And don't we still need to clear it here?

> +			/* Turn all oneshots into slaves */
> +			set_broadcast_sibling_state(cpu, 0);
>   			clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
>   			if (dev->next_event.tv64 != KTIME_MAX)
>   				tick_program_event(dev->next_event, 1);
> @@ -506,7 +584,12 @@ void tick_broadcast_oneshot_control(unsigned long reason)
>    */
>   static void tick_broadcast_clear_oneshot(int cpu)
>   {
> -	per_cpu(state, cpu).need_oneshot = 0;
> +	int i;
> +
> +	for_each_sibling (i, cpu) {
> +		per_cpu(state, i).need_oneshot = 0;
> +		per_cpu(state, i).slave = 0;
> +	}
>   }
>
>   static void tick_broadcast_init_next_event(struct cpumask *mask,

