linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 RESEND/RFC] timer: make deferrable cpu unbound timers really not bound to a cpu
@ 2014-09-16  2:59 Joonwoo Park
  2014-09-23 18:33 ` Thomas Gleixner
  0 siblings, 1 reply; 8+ messages in thread
From: Joonwoo Park @ 2014-09-16  2:59 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: linux-kernel, Joonwoo Park, John Stultz, Tejun Heo

When a deferrable work (INIT_DEFERRABLE_WORK, etc.) is queued via
queue_delayed_work() it's probably intended to run the work item on any
CPU that isn't idle. However, we queue the work to run at a later time
by starting a deferrable timer that binds to whatever CPU the work is
queued on which is same with queue_delayed_work_on(smp_processor_id())
effectively.

As a result WORK_CPU_UNBOUND work items aren't really cpu unbound now.
In fact this is perfectly fine with UP kernel and also won't affect much a
system without dyntick with SMP kernel too as every cpus run timers
periodically.  But on SMP systems with dyntick current implementation leads
deferrable timers not very scalable because the timer's base which has
queued the deferrable timer won't wake up till next non-deferrable timer
expires even though there are possible other non idle cpus are running
which are able to run expired deferrable timers.

The deferrable work is a good example of the current implementation's
victim like below.

INIT_DEFERRABLE_WORK(&dwork, fn);
CPU 0                                 CPU 1
queue_delayed_work(wq, &dwork, HZ);
    queue_delayed_work_on(WORK_CPU_UNBOUND);
        ...
	__mod_timer() -> queues timer to the
			 current cpu's timer
			 base.
	...
tick_nohz_idle_enter() -> cpu enters idle.
A second later
cpu 0 is now in idle.                 cpu 1 exits idle or wasn't in idle so
                                      now it's in active but won't
cpu 0 won't wake up till next         handle cpu unbound deferrable timer
non-deferrable timer expires.         as it's in cpu 0's timer base.

To make all cpu unbound deferrable timers are scalable, introduce a common
timer base which is only for cpu unbound deferrable timers to make those
are indeed cpu unbound so that can be scheduled by any of non idle cpus.
This common timer fixes scalability issue of delayed work and all other cpu
unbound deferrable timer using implementations.

CC: Thomas Gleixner <tglx@linutronix.de>
CC: John Stultz <john.stultz@linaro.org>
CC: Tejun Heo <tj@kernel.org>
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
---
 Changes in v2:
 * Use kzalloc_node()/kzalloc()

 kernel/time/timer.c | 106 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 80 insertions(+), 26 deletions(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index aca5dfe..5313cb0 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -93,6 +93,9 @@ struct tvec_base {
 struct tvec_base boot_tvec_bases;
 EXPORT_SYMBOL(boot_tvec_bases);
 static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
+#ifdef CONFIG_SMP
+static struct tvec_base *tvec_base_deferral = &boot_tvec_bases;
+#endif
 
 /* Functions below help us manage 'deferrable' flag */
 static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
@@ -655,7 +658,14 @@ static inline void debug_assert_init(struct timer_list *timer)
 static void do_init_timer(struct timer_list *timer, unsigned int flags,
 			  const char *name, struct lock_class_key *key)
 {
-	struct tvec_base *base = __raw_get_cpu_var(tvec_bases);
+	struct tvec_base *base;
+
+#ifdef CONFIG_SMP
+	if (flags & TIMER_DEFERRABLE)
+		base = tvec_base_deferral;
+	else
+#endif
+		base = __raw_get_cpu_var(tvec_bases);
 
 	timer->entry.next = NULL;
 	timer->base = (void *)((unsigned long)base | flags);
@@ -777,26 +787,32 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 
 	debug_activate(timer, expires);
 
-	cpu = get_nohz_timer_target(pinned);
-	new_base = per_cpu(tvec_bases, cpu);
+#ifdef CONFIG_SMP
+	if (base != tvec_base_deferral) {
+#endif
+		cpu = get_nohz_timer_target(pinned);
+		new_base = per_cpu(tvec_bases, cpu);
 
-	if (base != new_base) {
-		/*
-		 * We are trying to schedule the timer on the local CPU.
-		 * However we can't change timer's base while it is running,
-		 * otherwise del_timer_sync() can't detect that the timer's
-		 * handler yet has not finished. This also guarantees that
-		 * the timer is serialized wrt itself.
-		 */
-		if (likely(base->running_timer != timer)) {
-			/* See the comment in lock_timer_base() */
-			timer_set_base(timer, NULL);
-			spin_unlock(&base->lock);
-			base = new_base;
-			spin_lock(&base->lock);
-			timer_set_base(timer, base);
+		if (base != new_base) {
+			/*
+			 * We are trying to schedule the timer on the local CPU.
+			 * However we can't change timer's base while it is
+			 * running, otherwise del_timer_sync() can't detect that
+			 * the timer's handler yet has not finished. This also
+			 * guarantees that the timer is serialized wrt itself.
+			 */
+			if (likely(base->running_timer != timer)) {
+				/* See the comment in lock_timer_base() */
+				timer_set_base(timer, NULL);
+				spin_unlock(&base->lock);
+				base = new_base;
+				spin_lock(&base->lock);
+				timer_set_base(timer, base);
+			}
 		}
+#ifdef CONFIG_SMP
 	}
+#endif
 
 	timer->expires = expires;
 	internal_add_timer(base, timer);
@@ -1161,15 +1177,20 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 /**
  * __run_timers - run all expired timers (if any) on this CPU.
  * @base: the timer vector to be processed.
+ * @try: try and just return if base's lock already acquired.
  *
  * This function cascades all vectors and executes all expired timer
  * vectors.
  */
-static inline void __run_timers(struct tvec_base *base)
+static inline void __run_timers(struct tvec_base *base, bool try)
 {
 	struct timer_list *timer;
 
-	spin_lock_irq(&base->lock);
+	if (!try)
+		spin_lock_irq(&base->lock);
+	else if (!spin_trylock_irq(&base->lock))
+		return;
+
 	if (catchup_timer_jiffies(base)) {
 		spin_unlock_irq(&base->lock);
 		return;
@@ -1400,8 +1421,17 @@ static void run_timer_softirq(struct softirq_action *h)
 
 	hrtimer_run_pending();
 
+#ifdef CONFIG_SMP
+	if (time_after_eq(jiffies, tvec_base_deferral->timer_jiffies))
+		/*
+		 * if other cpu is handling cpu unbound deferrable timer base,
+		 * current cpu doesn't need to handle it so pass try=true.
+		 */
+		__run_timers(tvec_base_deferral, true);
+#endif
+
 	if (time_after_eq(jiffies, base->timer_jiffies))
-		__run_timers(base);
+		__run_timers(base, false);
 }
 
 /*
@@ -1537,7 +1567,7 @@ static int init_timers_cpu(int cpu)
 {
 	int j;
 	struct tvec_base *base;
-	static char tvec_base_done[NR_CPUS];
+	static char tvec_base_done[NR_CPUS + 1];
 
 	if (!tvec_base_done[cpu]) {
 		static char boot_done;
@@ -1546,8 +1576,12 @@ static int init_timers_cpu(int cpu)
 			/*
 			 * The APs use this path later in boot
 			 */
-			base = kzalloc_node(sizeof(*base), GFP_KERNEL,
-					    cpu_to_node(cpu));
+			if (cpu != NR_CPUS)
+				base = kzalloc_node(sizeof(*base), GFP_KERNEL,
+						    cpu_to_node(cpu));
+			else
+				base = kzalloc(sizeof(*base), GFP_KERNEL);
+
 			if (!base)
 				return -ENOMEM;
 
@@ -1556,7 +1590,13 @@ static int init_timers_cpu(int cpu)
 				kfree(base);
 				return -ENOMEM;
 			}
-			per_cpu(tvec_bases, cpu) = base;
+
+			if (cpu != NR_CPUS)
+				per_cpu(tvec_bases, cpu) = base;
+#ifdef CONFIG_SMP
+			else
+				tvec_base_deferral = base;
+#endif
 		} else {
 			/*
 			 * This is for the boot CPU - we use compile-time
@@ -1571,7 +1611,12 @@ static int init_timers_cpu(int cpu)
 		tvec_base_done[cpu] = 1;
 		base->cpu = cpu;
 	} else {
-		base = per_cpu(tvec_bases, cpu);
+		if (cpu != NR_CPUS)
+			base = per_cpu(tvec_bases, cpu);
+#ifdef CONFIG_SMP
+		else
+			base = tvec_base_deferral;
+#endif
 	}
 
 
@@ -1679,6 +1724,15 @@ void __init init_timers(void)
 			       (void *)(long)smp_processor_id());
 	BUG_ON(err != NOTIFY_OK);
 
+#ifdef CONFIG_SMP
+	/*
+	 * initialize cpu unbound deferrable timer base only when CONFIG_SMP.
+	 * UP kernel handles the timers with cpu 0 timer base.
+	 */
+	err = init_timers_cpu(NR_CPUS);
+	BUG_ON(err);
+#endif
+
 	init_timer_stats();
 	register_cpu_notifier(&timers_nb);
 	open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 RESEND/RFC] timer: make deferrable cpu unbound timers really not bound to a cpu
  2014-09-16  2:59 [PATCH v2 RESEND/RFC] timer: make deferrable cpu unbound timers really not bound to a cpu Joonwoo Park
@ 2014-09-23 18:33 ` Thomas Gleixner
  2014-09-26 20:54   ` Joonwoo Park
  2015-03-20  0:10   ` Saravana Kannan
  0 siblings, 2 replies; 8+ messages in thread
From: Thomas Gleixner @ 2014-09-23 18:33 UTC (permalink / raw)
  To: Joonwoo Park; +Cc: linux-kernel, John Stultz, Tejun Heo

On Mon, 15 Sep 2014, Joonwoo Park wrote:
> +#ifdef CONFIG_SMP
> +static struct tvec_base *tvec_base_deferral = &boot_tvec_bases;
> +#endif

In principle I like the idea of a deferrable wheel, but this
implementation is going to go nowhere.

First of all making it SMP only is silly. The deferrable stuff is a
pain in other places as well.

But whats way worse is:

> +static inline void __run_timers(struct tvec_base *base, bool try)
>  {
>  	struct timer_list *timer;
>  
> -	spin_lock_irq(&base->lock);
> +	if (!try)
> +		spin_lock_irq(&base->lock);
> +	else if (!spin_trylock_irq(&base->lock))
> +		return;

Yuck. All cpus fighting about a single spinlock roughly at the same
time? You just created a proper thundering herd cacheline bouncing
issue. 

No way. We have already mechanisms in place to deal with such
problems, you just have to use them.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 RESEND/RFC] timer: make deferrable cpu unbound timers really not bound to a cpu
  2014-09-23 18:33 ` Thomas Gleixner
@ 2014-09-26 20:54   ` Joonwoo Park
  2015-03-20  0:10   ` Saravana Kannan
  1 sibling, 0 replies; 8+ messages in thread
From: Joonwoo Park @ 2014-09-26 20:54 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: linux-kernel, John Stultz, Tejun Heo

On Tue, Sep 23, 2014 at 08:33:34PM +0200, Thomas Gleixner wrote:
> On Mon, 15 Sep 2014, Joonwoo Park wrote:
> > +#ifdef CONFIG_SMP
> > +static struct tvec_base *tvec_base_deferral = &boot_tvec_bases;
> > +#endif
> 
> In principle I like the idea of a deferrable wheel, but this
> implementation is going to go nowhere.
> 
> First of all making it SMP only is silly. The deferrable stuff is a
> pain in other places as well.
> 
> But whats way worse is:
> 
> > +static inline void __run_timers(struct tvec_base *base, bool try)
> >  {
> >  	struct timer_list *timer;
> >  
> > -	spin_lock_irq(&base->lock);
> > +	if (!try)
> > +		spin_lock_irq(&base->lock);
> > +	else if (!spin_trylock_irq(&base->lock))
> > +		return;
> 
> Yuck. All cpus fighting about a single spinlock roughly at the same
> time? You just created a proper thundering herd cacheline bouncing
> issue. 

Since __run_timers() for deferrable wheel do spin_lock_try() always, none of cpus would spin but just return if spinlock is already acquired.

I agree with cacheline bouncing issue of timer base (maybe you're worried about coherency of spinlock too?).
The other approach I thought about is the way that waking up cpus which has expired deferrable timer from active cpu rather than having global deferrable wheel.
I didn't go with this way because this sounds conflict with the idea of 'deferrable' and consumes more power compare to the patch I proposed.  Cache prefetching isn't free of power consumption either though.
What do you think about this approach?

Or I think we can migrate expired deferrable timers from idle cpu to active cpus but I doubt if this good idea as migration seems expensive.

> 
> No way. We have already mechanisms in place to deal with such
> problems, you just have to use them.

The problem I'm trying to tackle down is a case that a driver needs a deferrable delayed_work to prevent from waking up cpus because of that timer while it's still in need of making sure scheduling the deferrable timer in time if any of cpus are active.

Would you mind shed some lights on me about the mechanisms you're referring to?
I thought about queuing cpu bound deferrable timer to all cpus and cancel all others when any of them got scheduled, but this overkill.

Thanks,
Joonwoo

> 
> Thanks,
> 
> 	tglx

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: Re: [PATCH v2 RESEND/RFC] timer: make deferrable cpu unbound timers really not bound to a cpu
  2014-09-23 18:33 ` Thomas Gleixner
  2014-09-26 20:54   ` Joonwoo Park
@ 2015-03-20  0:10   ` Saravana Kannan
  2015-03-24 19:36     ` Saravana Kannan
  1 sibling, 1 reply; 8+ messages in thread
From: Saravana Kannan @ 2015-03-20  0:10 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Joonwoo Park, linux-kernel, John Stultz, Tejun Heo

On 09/23/2014 11:33 AM, Thomas Gleixner wrote:
> On Mon, 15 Sep 2014, Joonwoo Park wrote:
>> +#ifdef CONFIG_SMP
>> +static struct tvec_base *tvec_base_deferral = &boot_tvec_bases;
>> +#endif
>
> In principle I like the idea of a deferrable wheel, but this
> implementation is going to go nowhere.

Hi Thomas,

To give some more context:

This bug is a serious pain in the a** for anyone using deferrable timers 
or deferrable workqueues today for some periodic work and don't care for 
which CPU the code runs in.

Couple of examples of such issues in existing code:

1) In a SMP system, CPUfreq governors (ondemand and conservative) end up 
queueing a deferrable work on every single CPU and the first one to run 
the deferrable workqueue cancels the work on all other CPUS, runs the 
work and then sets up a workqueue on all the CPUs again for the next 
sampling point.

2) Devfreq actually doesn't work well today because it doesn't do this 
nasty hack like CPUfreq. So, if the devfreq work happens to run on a CPU 
that's typically idle, then it doesn't run for a long time. I've 
actually seen logs where the devfreq polling interval is set to 20ms, 
but ends up running 800ms later.

Don't know how many other drivers are suffering from this bug. Yes, 
IMHO, this is a bug. When the timer is CPU unbound, anyone who hasn't 
looked at the actual timer code would assume that it'll run as long as 
any CPU is busy.

> First of all making it SMP only is silly. The deferrable stuff is a
> pain in other places as well.
>
> But whats way worse is:
>
>> +static inline void __run_timers(struct tvec_base *base, bool try)
>>   {
>>   	struct timer_list *timer;
>>
>> -	spin_lock_irq(&base->lock);
>> +	if (!try)
>> +		spin_lock_irq(&base->lock);
>> +	else if (!spin_trylock_irq(&base->lock))
>> +		return;
>
> Yuck. All cpus fighting about a single spinlock roughly at the same
> time? You just created a proper thundering herd cacheline bouncing
> issue.

I agree, This is not good. I think a simple way to fix this is to have 
only the CPU that's the jiffy owner to run through this deferrable timer 
list.

That should address any unnecessary cache bouncing issues. Would that be 
an acceptable solution to this?

>
> No way. We have already mechanisms in place to deal with such
> problems, you just have to use them.

I'm not sure which problem you are referring to here. Or what the 
already existing solutions are.

I don't think you were referring to the "deferrable timer getting 
delayed for long periods despite CPUs being active" problem, because I 
don't think we have a better solution than this patch (with the jiffy 
owner CPU fix). Asking every driver that doesn't care which CPU the work 
runs on to queue a work or set up a timer on every CPU is definitely not 
a good solution -- that's spreading the complexity to every other driver 
instead of fixing it in one place. And that causes unnecessary cache 
pollution too.

Thoughts? I would really like to see a fix for this go in soon so that 
we can simplify cpufreq and have devfreq and other drivers work correctly.

Thanks,
Saravana

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 RESEND/RFC] timer: make deferrable cpu unbound timers really not bound to a cpu
  2015-03-20  0:10   ` Saravana Kannan
@ 2015-03-24 19:36     ` Saravana Kannan
  2015-03-30 20:16       ` Joonwoo Park
  2015-04-28  2:39       ` [PATCH 2/2] " Joonwoo Park
  0 siblings, 2 replies; 8+ messages in thread
From: Saravana Kannan @ 2015-03-24 19:36 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Joonwoo Park, linux-kernel, John Stultz, Tejun Heo

On 03/19/2015 05:10 PM, Saravana Kannan wrote:
> On 09/23/2014 11:33 AM, Thomas Gleixner wrote:
>> On Mon, 15 Sep 2014, Joonwoo Park wrote:
>>> +#ifdef CONFIG_SMP
>>> +static struct tvec_base *tvec_base_deferral = &boot_tvec_bases;
>>> +#endif
>>
>> In principle I like the idea of a deferrable wheel, but this
>> implementation is going to go nowhere.
>
> Hi Thomas,
>
> To give some more context:
>
> This bug is a serious pain in the a** for anyone using deferrable timers
> or deferrable workqueues today for some periodic work and don't care for
> which CPU the code runs in.
>
> Couple of examples of such issues in existing code:
>
> 1) In a SMP system, CPUfreq governors (ondemand and conservative) end up
> queueing a deferrable work on every single CPU and the first one to run
> the deferrable workqueue cancels the work on all other CPUS, runs the
> work and then sets up a workqueue on all the CPUs again for the next
> sampling point.
>
> 2) Devfreq actually doesn't work well today because it doesn't do this
> nasty hack like CPUfreq. So, if the devfreq work happens to run on a CPU
> that's typically idle, then it doesn't run for a long time. I've
> actually seen logs where the devfreq polling interval is set to 20ms,
> but ends up running 800ms later.
>
> Don't know how many other drivers are suffering from this bug. Yes,
> IMHO, this is a bug. When the timer is CPU unbound, anyone who hasn't
> looked at the actual timer code would assume that it'll run as long as
> any CPU is busy.
>
>> First of all making it SMP only is silly. The deferrable stuff is a
>> pain in other places as well.
>>
>> But whats way worse is:
>>
>>> +static inline void __run_timers(struct tvec_base *base, bool try)
>>>   {
>>>       struct timer_list *timer;
>>>
>>> -    spin_lock_irq(&base->lock);
>>> +    if (!try)
>>> +        spin_lock_irq(&base->lock);
>>> +    else if (!spin_trylock_irq(&base->lock))
>>> +        return;
>>
>> Yuck. All cpus fighting about a single spinlock roughly at the same
>> time? You just created a proper thundering herd cacheline bouncing
>> issue.
>
> I agree, This is not good. I think a simple way to fix this is to have
> only the CPU that's the jiffy owner to run through this deferrable timer
> list.
>
> That should address any unnecessary cache bouncing issues. Would that be
> an acceptable solution to this?
>
>>
>> No way. We have already mechanisms in place to deal with such
>> problems, you just have to use them.
>
> I'm not sure which problem you are referring to here. Or what the
> already existing solutions are.
>
> I don't think you were referring to the "deferrable timer getting
> delayed for long periods despite CPUs being active" problem, because I
> don't think we have a better solution than this patch (with the jiffy
> owner CPU fix). Asking every driver that doesn't care which CPU the work
> runs on to queue a work or set up a timer on every CPU is definitely not
> a good solution -- that's spreading the complexity to every other driver
> instead of fixing it in one place. And that causes unnecessary cache
> pollution too.
>
> Thoughts? I would really like to see a fix for this go in soon so that
> we can simplify cpufreq and have devfreq and other drivers work correctly.
>
> Thanks,
> Saravana
>

Thomas,

Bump.

-Saravana

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2 RESEND/RFC] timer: make deferrable cpu unbound timers really not bound to a cpu
  2015-03-24 19:36     ` Saravana Kannan
@ 2015-03-30 20:16       ` Joonwoo Park
  2015-04-28  2:39       ` [PATCH 2/2] " Joonwoo Park
  1 sibling, 0 replies; 8+ messages in thread
From: Joonwoo Park @ 2015-03-30 20:16 UTC (permalink / raw)
  To: Thomas Gleixner, Saravana Kannan; +Cc: linux-kernel, John Stultz, Tejun Heo

Hi Thomas,

Please find patch v3 which makes only tick_do_timer_cpu to run deferral timer wheel to reduce cache bouncing and let me know what you think.

Thanks,
Joonwoo

>From 0c91f82a0b43b247f1ed310212ef3aada7ccc9f7 Mon Sep 17 00:00:00 2001
From: Joonwoo Park <joonwoop@codeaurora.org>
Date: Thu, 11 Sep 2014 15:34:25 -0700
Subject: [PATCH] timer: make deferrable cpu unbound timers really not bound to
 a cpu

When a deferrable work (INIT_DEFERRABLE_WORK, etc.) is queued via
queue_delayed_work() it's probably intended to run the work item on any
CPU that isn't idle. However, we queue the work to run at a later time
by starting a deferrable timer that binds to whatever CPU the work is
queued on which is same with queue_delayed_work_on(smp_processor_id())
effectively.

As a result WORK_CPU_UNBOUND work items aren't really cpu unbound now.
In fact this is perfectly fine with UP kernel and also won't affect much a
system without dyntick with SMP kernel too as every cpus run timers
periodically.  But on SMP systems with dyntick current implementation leads
deferrable timers not very scalable because the timer's base which has
queued the deferrable timer won't wake up till next non-deferrable timer
expires even though there are possible other non idle cpus are running
which are able to run expired deferrable timers.

The deferrable work is a good example of the current implementation's
victim like below.

INIT_DEFERRABLE_WORK(&dwork, fn);
CPU 0                                 CPU 1
queue_delayed_work(wq, &dwork, HZ);
    queue_delayed_work_on(WORK_CPU_UNBOUND);
        ...
	__mod_timer() -> queues timer to the
			 current cpu's timer
			 base.
	...
tick_nohz_idle_enter() -> cpu enters idle.
A second later
cpu 0 is now in idle.                 cpu 1 exits idle or wasn't in idle so
                                      now it's in active but won't
cpu 0 won't wake up till next         handle cpu unbound deferrable timer
non-deferrable timer expires.         as it's in cpu 0's timer base.

To make all cpu unbound deferrable timers are scalable, introduce a common
timer base which is only for cpu unbound deferrable timers to make those
are indeed cpu unbound so that can be scheduled by tick_do_timer_cpu.
This common timer fixes scalability issue of delayed work and all other cpu
unbound deferrable timer using implementations.

CC: Thomas Gleixner <tglx@linutronix.de>
CC: John Stultz <john.stultz@linaro.org>
CC: Tejun Heo <tj@kernel.org>
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
---
Changes in v3:
 * Make only tick_do_timer_cpu to run deferral timer wheel to reduce cache bouncing.

 kernel/time/timer.c | 94 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 71 insertions(+), 23 deletions(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2d3f5c5..59306fe 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -49,6 +49,8 @@
 #include <asm/timex.h>
 #include <asm/io.h>
 
+#include "tick-internal.h"
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/timer.h>
 
@@ -93,6 +95,9 @@ struct tvec_base {
 struct tvec_base boot_tvec_bases;
 EXPORT_SYMBOL(boot_tvec_bases);
 static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
+#ifdef CONFIG_SMP
+static struct tvec_base *tvec_base_deferral = &boot_tvec_bases;
+#endif
 
 /* Functions below help us manage 'deferrable' flag */
 static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
@@ -655,7 +660,14 @@ static inline void debug_assert_init(struct timer_list *timer)
 static void do_init_timer(struct timer_list *timer, unsigned int flags,
 			  const char *name, struct lock_class_key *key)
 {
-	struct tvec_base *base = raw_cpu_read(tvec_bases);
+	struct tvec_base *base;
+
+#ifdef CONFIG_SMP
+	if (flags & TIMER_DEFERRABLE)
+		base = tvec_base_deferral;
+	else
+#endif
+		base = raw_cpu_read(tvec_bases);
 
 	timer->entry.next = NULL;
 	timer->base = (void *)((unsigned long)base | flags);
@@ -777,26 +789,32 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 
 	debug_activate(timer, expires);
 
-	cpu = get_nohz_timer_target(pinned);
-	new_base = per_cpu(tvec_bases, cpu);
+#ifdef CONFIG_SMP
+	if (base != tvec_base_deferral) {
+#endif
+		cpu = get_nohz_timer_target(pinned);
+		new_base = per_cpu(tvec_bases, cpu);
 
-	if (base != new_base) {
-		/*
-		 * We are trying to schedule the timer on the local CPU.
-		 * However we can't change timer's base while it is running,
-		 * otherwise del_timer_sync() can't detect that the timer's
-		 * handler yet has not finished. This also guarantees that
-		 * the timer is serialized wrt itself.
-		 */
-		if (likely(base->running_timer != timer)) {
-			/* See the comment in lock_timer_base() */
-			timer_set_base(timer, NULL);
-			spin_unlock(&base->lock);
-			base = new_base;
-			spin_lock(&base->lock);
-			timer_set_base(timer, base);
+		if (base != new_base) {
+			/*
+			 * We are trying to schedule the timer on the local CPU.
+			 * However we can't change timer's base while it is
+			 * running, otherwise del_timer_sync() can't detect that
+			 * the timer's handler yet has not finished. This also
+			 * guarantees that the timer is serialized wrt itself.
+			 */
+			if (likely(base->running_timer != timer)) {
+				/* See the comment in lock_timer_base() */
+				timer_set_base(timer, NULL);
+				spin_unlock(&base->lock);
+				base = new_base;
+				spin_lock(&base->lock);
+				timer_set_base(timer, base);
+			}
 		}
+#ifdef CONFIG_SMP
 	}
+#endif
 
 	timer->expires = expires;
 	internal_add_timer(base, timer);
@@ -1399,6 +1417,12 @@ static void run_timer_softirq(struct softirq_action *h)
 
 	hrtimer_run_pending();
 
+#ifdef CONFIG_SMP
+	if (smp_processor_id() == tick_do_timer_cpu &&
+	    time_after_eq(jiffies, tvec_base_deferral->timer_jiffies))
+		__run_timers(tvec_base_deferral);
+#endif
+
 	if (time_after_eq(jiffies, base->timer_jiffies))
 		__run_timers(base);
 }
@@ -1536,7 +1560,7 @@ static int init_timers_cpu(int cpu)
 {
 	int j;
 	struct tvec_base *base;
-	static char tvec_base_done[NR_CPUS];
+	static char tvec_base_done[NR_CPUS + 1];
 
 	if (!tvec_base_done[cpu]) {
 		static char boot_done;
@@ -1545,8 +1569,12 @@ static int init_timers_cpu(int cpu)
 			/*
 			 * The APs use this path later in boot
 			 */
-			base = kzalloc_node(sizeof(*base), GFP_KERNEL,
-					    cpu_to_node(cpu));
+			if (cpu != NR_CPUS)
+				base = kzalloc_node(sizeof(*base), GFP_KERNEL,
+						    cpu_to_node(cpu));
+			else
+				base = kzalloc(sizeof(*base), GFP_KERNEL);
+
 			if (!base)
 				return -ENOMEM;
 
@@ -1555,7 +1583,13 @@ static int init_timers_cpu(int cpu)
 				kfree(base);
 				return -ENOMEM;
 			}
-			per_cpu(tvec_bases, cpu) = base;
+
+			if (cpu != NR_CPUS)
+				per_cpu(tvec_bases, cpu) = base;
+#ifdef CONFIG_SMP
+			else
+				tvec_base_deferral = base;
+#endif
 		} else {
 			/*
 			 * This is for the boot CPU - we use compile-time
@@ -1570,7 +1604,12 @@ static int init_timers_cpu(int cpu)
 		tvec_base_done[cpu] = 1;
 		base->cpu = cpu;
 	} else {
-		base = per_cpu(tvec_bases, cpu);
+		if (cpu != NR_CPUS)
+			base = per_cpu(tvec_bases, cpu);
+#ifdef CONFIG_SMP
+		else
+			base = tvec_base_deferral;
+#endif
 	}
 
 
@@ -1678,6 +1717,15 @@ void __init init_timers(void)
 			       (void *)(long)smp_processor_id());
 	BUG_ON(err != NOTIFY_OK);
 
+#ifdef CONFIG_SMP
+	/*
+	 * initialize cpu unbound deferrable timer base only when CONFIG_SMP.
+	 * UP kernel handles the timers with cpu 0 timer base.
+	 */
+	err = init_timers_cpu(NR_CPUS);
+	BUG_ON(err);
+#endif
+
 	init_timer_stats();
 	register_cpu_notifier(&timers_nb);
 	open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
-- 

On Tue, Mar 24, 2015 at 12:36:59PM -0700, Saravana Kannan wrote:
> On 03/19/2015 05:10 PM, Saravana Kannan wrote:
> >On 09/23/2014 11:33 AM, Thomas Gleixner wrote:
> >>On Mon, 15 Sep 2014, Joonwoo Park wrote:
> >>>+#ifdef CONFIG_SMP
> >>>+static struct tvec_base *tvec_base_deferral = &boot_tvec_bases;
> >>>+#endif
> >>
> >>In principle I like the idea of a deferrable wheel, but this
> >>implementation is going to go nowhere.
> >
> >Hi Thomas,
> >
> >To give some more context:
> >
> >This bug is a serious pain in the a** for anyone using deferrable timers
> >or deferrable workqueues today for some periodic work and don't care for
> >which CPU the code runs in.
> >
> >Couple of examples of such issues in existing code:
> >
> >1) In a SMP system, CPUfreq governors (ondemand and conservative) end up
> >queueing a deferrable work on every single CPU and the first one to run
> >the deferrable workqueue cancels the work on all other CPUS, runs the
> >work and then sets up a workqueue on all the CPUs again for the next
> >sampling point.
> >
> >2) Devfreq actually doesn't work well today because it doesn't do this
> >nasty hack like CPUfreq. So, if the devfreq work happens to run on a CPU
> >that's typically idle, then it doesn't run for a long time. I've
> >actually seen logs where the devfreq polling interval is set to 20ms,
> >but ends up running 800ms later.
> >
> >Don't know how many other drivers are suffering from this bug. Yes,
> >IMHO, this is a bug. When the timer is CPU unbound, anyone who hasn't
> >looked at the actual timer code would assume that it'll run as long as
> >any CPU is busy.
> >
> >>First of all making it SMP only is silly. The deferrable stuff is a
> >>pain in other places as well.
> >>
> >>But whats way worse is:
> >>
> >>>+static inline void __run_timers(struct tvec_base *base, bool try)
> >>>  {
> >>>      struct timer_list *timer;
> >>>
> >>>-    spin_lock_irq(&base->lock);
> >>>+    if (!try)
> >>>+        spin_lock_irq(&base->lock);
> >>>+    else if (!spin_trylock_irq(&base->lock))
> >>>+        return;
> >>
> >>Yuck. All cpus fighting about a single spinlock roughly at the same
> >>time? You just created a proper thundering herd cacheline bouncing
> >>issue.
> >
> >I agree, This is not good. I think a simple way to fix this is to have
> >only the CPU that's the jiffy owner to run through this deferrable timer
> >list.
> >
> >That should address any unnecessary cache bouncing issues. Would that be
> >an acceptable solution to this?
> >
> >>
> >>No way. We have already mechanisms in place to deal with such
> >>problems, you just have to use them.
> >
> >I'm not sure which problem you are referring to here. Or what the
> >already existing solutions are.
> >
> >I don't think you were referring to the "deferrable timer getting
> >delayed for long periods despite CPUs being active" problem, because I
> >don't think we have a better solution than this patch (with the jiffy
> >owner CPU fix). Asking every driver that doesn't care which CPU the work
> >runs on to queue a work or set up a timer on every CPU is definitely not
> >a good solution -- that's spreading the complexity to every other driver
> >instead of fixing it in one place. And that causes unnecessary cache
> >pollution too.
> >
> >Thoughts? I would really like to see a fix for this go in soon so that
> >we can simplify cpufreq and have devfreq and other drivers work correctly.
> >
> >Thanks,
> >Saravana
> >
> 
> Thomas,
> 
> Bump.
> 
> -Saravana
> 
> -- 
> The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
> a Linux Foundation Collaborative Project

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/2] timer: make deferrable cpu unbound timers really not bound to a cpu
  2015-03-24 19:36     ` Saravana Kannan
  2015-03-30 20:16       ` Joonwoo Park
@ 2015-04-28  2:39       ` Joonwoo Park
  2015-04-28  2:41         ` Joonwoo Park
  1 sibling, 1 reply; 8+ messages in thread
From: Joonwoo Park @ 2015-04-28  2:39 UTC (permalink / raw)
  To: tglx; +Cc: skannan, linux-kernel, Joonwoo Park, John Stultz, Tejun Heo

When a deferrable work (INIT_DEFERRABLE_WORK, etc.) is queued via
queue_delayed_work() it's probably intended to run the work item on any
CPU that isn't idle. However, we queue the work to run at a later time
by starting a deferrable timer that binds to whatever CPU the work is
queued on which is same with queue_delayed_work_on(smp_processor_id())
effectively.

As a result, WORK_CPU_UNBOUND work items aren't really cpu unbound now.
In fact this is perfectly fine on a UP kernel, and it also won't affect an
SMP system without dyntick much, as every cpu runs timers periodically.
But on SMP systems with dyntick, the current implementation makes
deferrable timers not very scalable, because the timer base which has
queued the deferrable timer won't wake up until the next non-deferrable
timer expires, even though other non-idle cpus may be running which
would be able to run the expired deferrable timers.

Deferrable work is a good example of a victim of the current
implementation, as illustrated below.

INIT_DEFERRABLE_WORK(&dwork, fn);
CPU 0                                 CPU 1
queue_delayed_work(wq, &dwork, HZ);
    queue_delayed_work_on(WORK_CPU_UNBOUND);
        ...
	__mod_timer() -> queues timer to the
			 current cpu's timer
			 base.
	...
tick_nohz_idle_enter() -> cpu enters idle.
A second later
cpu 0 is now in idle.                 cpu 1 exits idle or wasn't in idle so
                                      now it's in active but won't
cpu 0 won't wake up till next         handle cpu unbound deferrable timer
non-deferrable timer expires.         as it's in cpu 0's timer base.

To make all cpu unbound deferrable timers scalable, introduce a common
timer base which is used only for cpu unbound deferrable timers, making
them indeed cpu unbound so that they can be scheduled by
tick_do_timer_cpu.  This common timer base fixes the scalability issue of
delayed work and all other implementations using cpu unbound deferrable
timers.

CC: Thomas Gleixner <tglx@linutronix.de>
CC: John Stultz <john.stultz@linaro.org>
CC: Tejun Heo <tj@kernel.org>
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
---
Changes in v3:
 * Make only tick_do_timer_cpu to run deferral timer wheel to reduce cache bouncing.

Changes in v4:
 * Kill CONFIG_SMP ifdefry.
 * Allocate and initialize tvec_base_deferrable at compile time.
 * Pin pinned deferrable timer. 
 * s/deferral/deferrable/

 include/linux/timer.h |  14 ++++++-
 kernel/time/timer.c   | 103 ++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 97 insertions(+), 20 deletions(-)

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 8c5a197..45847ca 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -34,6 +34,9 @@ struct timer_list {
 };
 
 extern struct tvec_base boot_tvec_bases;
+#ifdef CONFIG_SMP
+extern struct tvec_base tvec_base_deferrable;
+#endif
 
 #ifdef CONFIG_LOCKDEP
 /*
@@ -70,12 +73,21 @@ extern struct tvec_base boot_tvec_bases;
 
 #define TIMER_FLAG_MASK			0x3LU
 
+#ifdef CONFIG_SMP
+#define __TIMER_BASE(_flags) \
+	((_flags) & TIMER_DEFERRABLE ? \
+	 (unsigned long)&tvec_base_deferrable + (_flags) : \
+	 (unsigned long)&boot_tvec_bases + (_flags))
+#else
+#define __TIMER_BASE(_flags) ((unsigned long)&boot_tvec_bases + (_flags))
+#endif
+
 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
 		.entry = { .prev = TIMER_ENTRY_STATIC },	\
 		.function = (_function),			\
 		.expires = (_expires),				\
 		.data = (_data),				\
-		.base = (void *)((unsigned long)&boot_tvec_bases + (_flags)), \
+		.base = (void *)(__TIMER_BASE(_flags)),		\
 		.slack = -1,					\
 		__TIMER_LOCKDEP_MAP_INITIALIZER(		\
 			__FILE__ ":" __stringify(__LINE__))	\
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index e5d5733c..133e94a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -49,6 +49,8 @@
 #include <asm/timex.h>
 #include <asm/io.h>
 
+#include "tick-internal.h"
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/timer.h>
 
@@ -103,6 +105,9 @@ struct tvec_base boot_tvec_bases;
 EXPORT_SYMBOL(boot_tvec_bases);
 
 static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
+#ifdef CONFIG_SMP
+struct tvec_base tvec_base_deferrable;
+#endif
 
 /* Functions below help us manage 'deferrable' flag */
 static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
@@ -662,10 +667,63 @@ static inline void debug_assert_init(struct timer_list *timer)
 	debug_timer_assert_init(timer);
 }
 
+#ifdef CONFIG_SMP
+static inline struct tvec_base *__get_timer_base(unsigned int flags)
+{
+	if (flags & TIMER_DEFERRABLE)
+		return &tvec_base_deferrable;
+	else
+		return raw_cpu_read(tvec_bases);
+}
+
+static inline bool is_deferrable_timer_base(struct tvec_base *base)
+{
+	return base == &tvec_base_deferrable;
+}
+
+static inline void __run_timers(struct tvec_base *base);
+static inline void __run_deferrable_timers(void)
+{
+	if (smp_processor_id() == tick_do_timer_cpu &&
+	    time_after_eq(jiffies, tvec_base_deferrable.timer_jiffies))
+		__run_timers(&tvec_base_deferrable);
+}
+
+static void __init init_timer_cpu(struct tvec_base *base, int cpu);
+static inline void init_deferrable_timer(void)
+{
+	init_timer_cpu(&tvec_base_deferrable, NR_CPUS);
+}
+#else
+static inline struct tvec_base *__get_timer_base(unsigned int flags)
+{
+	return raw_cpu_read(tvec_bases);
+}
+
+static inline bool is_deferrable_timer_base(struct tvec_base *base)
+{
+	return false;
+}
+
+static inline void __run_deferrable_timers(void)
+{
+}
+
+static inline void init_deferrable_timer(void)
+{
+	/*
+	 * initialize cpu unbound deferrable timer base only when CONFIG_SMP.
+	 * UP kernel handles the timers with cpu 0 timer base.
+	 */
+}
+#endif
+
 static void do_init_timer(struct timer_list *timer, unsigned int flags,
 			  const char *name, struct lock_class_key *key)
 {
-	struct tvec_base *base = raw_cpu_read(tvec_bases);
+	struct tvec_base *base;
+
+	base = __get_timer_base(flags);
 
 	timer->entry.next = NULL;
 	timer->base = (void *)((unsigned long)base | flags);
@@ -787,24 +845,26 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 
 	debug_activate(timer, expires);
 
-	cpu = get_nohz_timer_target(pinned);
-	new_base = per_cpu(tvec_bases, cpu);
+	if (!is_deferrable_timer_base(base) || pinned == TIMER_PINNED) {
+		cpu = get_nohz_timer_target(pinned);
+		new_base = per_cpu(tvec_bases, cpu);
 
-	if (base != new_base) {
-		/*
-		 * We are trying to schedule the timer on the local CPU.
-		 * However we can't change timer's base while it is running,
-		 * otherwise del_timer_sync() can't detect that the timer's
-		 * handler yet has not finished. This also guarantees that
-		 * the timer is serialized wrt itself.
-		 */
-		if (likely(base->running_timer != timer)) {
-			/* See the comment in lock_timer_base() */
-			timer_set_base(timer, NULL);
-			spin_unlock(&base->lock);
-			base = new_base;
-			spin_lock(&base->lock);
-			timer_set_base(timer, base);
+		if (base != new_base) {
+			/*
+			 * We are trying to schedule the timer on the local CPU.
+			 * However we can't change timer's base while it is
+			 * running, otherwise del_timer_sync() can't detect that
+			 * the timer's handler yet has not finished. This also
+			 * guarantees that the timer is serialized wrt itself.
+			 */
+			if (likely(base->running_timer != timer)) {
+				/* See the comment in lock_timer_base() */
+				timer_set_base(timer, NULL);
+				spin_unlock(&base->lock);
+				base = new_base;
+				spin_lock(&base->lock);
+				timer_set_base(timer, base);
+			}
 		}
 	}
 
@@ -1411,6 +1471,8 @@ static void run_timer_softirq(struct softirq_action *h)
 
 	hrtimer_run_pending();
 
+	__run_deferrable_timers();
+
 	if (time_after_eq(jiffies, base->timer_jiffies))
 		__run_timers(base);
 }
@@ -1623,7 +1685,8 @@ static void __init init_timer_cpu(struct tvec_base *base, int cpu)
 	BUG_ON(base != tbase_get_base(base));
 
 	base->cpu = cpu;
-	per_cpu(tvec_bases, cpu) = base;
+	if (cpu != NR_CPUS)
+		per_cpu(tvec_bases, cpu) = base;
 	spin_lock_init(&base->lock);
 
 	for (j = 0; j < TVN_SIZE; j++) {
@@ -1655,6 +1718,8 @@ static void __init init_timer_cpus(void)
 
 		init_timer_cpu(base, cpu);
 	}
+
+	init_deferrable_timer();
 }
 
 void __init init_timers(void)
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/2] timer: make deferrable cpu unbound timers really not bound to a cpu
  2015-04-28  2:39       ` [PATCH 2/2] " Joonwoo Park
@ 2015-04-28  2:41         ` Joonwoo Park
  0 siblings, 0 replies; 8+ messages in thread
From: Joonwoo Park @ 2015-04-28  2:41 UTC (permalink / raw)
  To: tglx; +Cc: skannan, linux-kernel, John Stultz, Tejun Heo

Thomas, I made some cleanups.  I would much appreciate it if you could give me some feedback on this.

Thanks,
Joonwoo

On 04/27/2015 07:39 PM, Joonwoo Park wrote:
> When a deferrable work (INIT_DEFERRABLE_WORK, etc.) is queued via
> queue_delayed_work() it's probably intended to run the work item on any
> CPU that isn't idle. However, we queue the work to run at a later time
> by starting a deferrable timer that binds to whatever CPU the work is
> queued on which is same with queue_delayed_work_on(smp_processor_id())
> effectively.
> 
> As a result, WORK_CPU_UNBOUND work items aren't really cpu unbound now.
> In fact this is perfectly fine on a UP kernel, and it also won't affect an
> SMP system without dyntick much, as every cpu runs timers periodically.
> But on SMP systems with dyntick, the current implementation makes
> deferrable timers not very scalable, because the timer base which has
> queued the deferrable timer won't wake up until the next non-deferrable
> timer expires, even though other non-idle cpus may be running which
> would be able to run the expired deferrable timers.
> 
> Deferrable work is a good example of a victim of the current
> implementation, as illustrated below.
> 
> INIT_DEFERRABLE_WORK(&dwork, fn);
> CPU 0                                 CPU 1
> queue_delayed_work(wq, &dwork, HZ);
>     queue_delayed_work_on(WORK_CPU_UNBOUND);
>         ...
> 	__mod_timer() -> queues timer to the
> 			 current cpu's timer
> 			 base.
> 	...
> tick_nohz_idle_enter() -> cpu enters idle.
> A second later
> cpu 0 is now in idle.                 cpu 1 exits idle or wasn't in idle so
>                                       now it's in active but won't
> cpu 0 won't wake up till next         handle cpu unbound deferrable timer
> non-deferrable timer expires.         as it's in cpu 0's timer base.
> 
> To make all cpu unbound deferrable timers scalable, introduce a common
> timer base which is used only for cpu unbound deferrable timers, making
> them indeed cpu unbound so that they can be scheduled by
> tick_do_timer_cpu.  This common timer base fixes the scalability issue of
> delayed work and all other implementations using cpu unbound deferrable
> timers.
> 
> CC: Thomas Gleixner <tglx@linutronix.de>
> CC: John Stultz <john.stultz@linaro.org>
> CC: Tejun Heo <tj@kernel.org>
> Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
> ---
> Changes in v3:
>  * Make only tick_do_timer_cpu to run deferral timer wheel to reduce cache bouncing.
> 
> Changes in v4:
>  * Kill CONFIG_SMP ifdefry.
>  * Allocate and initialize tvec_base_deferrable at compile time.
>  * Pin pinned deferrable timer. 
>  * s/deferral/deferrable/
> 
>  include/linux/timer.h |  14 ++++++-
>  kernel/time/timer.c   | 103 ++++++++++++++++++++++++++++++++++++++++----------
>  2 files changed, 97 insertions(+), 20 deletions(-)
> 
> diff --git a/include/linux/timer.h b/include/linux/timer.h
> index 8c5a197..45847ca 100644
> --- a/include/linux/timer.h
> +++ b/include/linux/timer.h
> @@ -34,6 +34,9 @@ struct timer_list {
>  };
>  
>  extern struct tvec_base boot_tvec_bases;
> +#ifdef CONFIG_SMP
> +extern struct tvec_base tvec_base_deferrable;
> +#endif
>  
>  #ifdef CONFIG_LOCKDEP
>  /*
> @@ -70,12 +73,21 @@ extern struct tvec_base boot_tvec_bases;
>  
>  #define TIMER_FLAG_MASK			0x3LU
>  
> +#ifdef CONFIG_SMP
> +#define __TIMER_BASE(_flags) \
> +	((_flags) & TIMER_DEFERRABLE ? \
> +	 (unsigned long)&tvec_base_deferrable + (_flags) : \
> +	 (unsigned long)&boot_tvec_bases + (_flags))
> +#else
> +#define __TIMER_BASE(_flags) ((unsigned long)&boot_tvec_bases + (_flags))
> +#endif
> +
>  #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
>  		.entry = { .prev = TIMER_ENTRY_STATIC },	\
>  		.function = (_function),			\
>  		.expires = (_expires),				\
>  		.data = (_data),				\
> -		.base = (void *)((unsigned long)&boot_tvec_bases + (_flags)), \
> +		.base = (void *)(__TIMER_BASE(_flags)),		\
>  		.slack = -1,					\
>  		__TIMER_LOCKDEP_MAP_INITIALIZER(		\
>  			__FILE__ ":" __stringify(__LINE__))	\
> diff --git a/kernel/time/timer.c b/kernel/time/timer.c
> index e5d5733c..133e94a 100644
> --- a/kernel/time/timer.c
> +++ b/kernel/time/timer.c
> @@ -49,6 +49,8 @@
>  #include <asm/timex.h>
>  #include <asm/io.h>
>  
> +#include "tick-internal.h"
> +
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/timer.h>
>  
> @@ -103,6 +105,9 @@ struct tvec_base boot_tvec_bases;
>  EXPORT_SYMBOL(boot_tvec_bases);
>  
>  static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
> +#ifdef CONFIG_SMP
> +struct tvec_base tvec_base_deferrable;
> +#endif
>  
>  /* Functions below help us manage 'deferrable' flag */
>  static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
> @@ -662,10 +667,63 @@ static inline void debug_assert_init(struct timer_list *timer)
>  	debug_timer_assert_init(timer);
>  }
>  
> +#ifdef CONFIG_SMP
> +static inline struct tvec_base *__get_timer_base(unsigned int flags)
> +{
> +	if (flags & TIMER_DEFERRABLE)
> +		return &tvec_base_deferrable;
> +	else
> +		return raw_cpu_read(tvec_bases);
> +}
> +
> +static inline bool is_deferrable_timer_base(struct tvec_base *base)
> +{
> +	return base == &tvec_base_deferrable;
> +}
> +
> +static inline void __run_timers(struct tvec_base *base);
> +static inline void __run_deferrable_timers(void)
> +{
> +	if (smp_processor_id() == tick_do_timer_cpu &&
> +	    time_after_eq(jiffies, tvec_base_deferrable.timer_jiffies))
> +		__run_timers(&tvec_base_deferrable);
> +}
> +
> +static void __init init_timer_cpu(struct tvec_base *base, int cpu);
> +static inline void init_deferrable_timer(void)
> +{
> +	init_timer_cpu(&tvec_base_deferrable, NR_CPUS);
> +}
> +#else
> +static inline struct tvec_base *__get_timer_base(unsigned int flags)
> +{
> +	return raw_cpu_read(tvec_bases);
> +}
> +
> +static inline bool is_deferrable_timer_base(struct tvec_base *base)
> +{
> +	return false;
> +}
> +
> +static inline void __run_deferrable_timers(void)
> +{
> +}
> +
> +static inline void init_deferrable_timer(void)
> +{
> +	/*
> +	 * initialize cpu unbound deferrable timer base only when CONFIG_SMP.
> +	 * UP kernel handles the timers with cpu 0 timer base.
> +	 */
> +}
> +#endif
> +
>  static void do_init_timer(struct timer_list *timer, unsigned int flags,
>  			  const char *name, struct lock_class_key *key)
>  {
> -	struct tvec_base *base = raw_cpu_read(tvec_bases);
> +	struct tvec_base *base;
> +
> +	base = __get_timer_base(flags);
>  
>  	timer->entry.next = NULL;
>  	timer->base = (void *)((unsigned long)base | flags);
> @@ -787,24 +845,26 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
>  
>  	debug_activate(timer, expires);
>  
> -	cpu = get_nohz_timer_target(pinned);
> -	new_base = per_cpu(tvec_bases, cpu);
> +	if (!is_deferrable_timer_base(base) || pinned == TIMER_PINNED) {
> +		cpu = get_nohz_timer_target(pinned);
> +		new_base = per_cpu(tvec_bases, cpu);
>  
> -	if (base != new_base) {
> -		/*
> -		 * We are trying to schedule the timer on the local CPU.
> -		 * However we can't change timer's base while it is running,
> -		 * otherwise del_timer_sync() can't detect that the timer's
> -		 * handler yet has not finished. This also guarantees that
> -		 * the timer is serialized wrt itself.
> -		 */
> -		if (likely(base->running_timer != timer)) {
> -			/* See the comment in lock_timer_base() */
> -			timer_set_base(timer, NULL);
> -			spin_unlock(&base->lock);
> -			base = new_base;
> -			spin_lock(&base->lock);
> -			timer_set_base(timer, base);
> +		if (base != new_base) {
> +			/*
> +			 * We are trying to schedule the timer on the local CPU.
> +			 * However we can't change timer's base while it is
> +			 * running, otherwise del_timer_sync() can't detect that
> +			 * the timer's handler yet has not finished. This also
> +			 * guarantees that the timer is serialized wrt itself.
> +			 */
> +			if (likely(base->running_timer != timer)) {
> +				/* See the comment in lock_timer_base() */
> +				timer_set_base(timer, NULL);
> +				spin_unlock(&base->lock);
> +				base = new_base;
> +				spin_lock(&base->lock);
> +				timer_set_base(timer, base);
> +			}
>  		}
>  	}
>  
> @@ -1411,6 +1471,8 @@ static void run_timer_softirq(struct softirq_action *h)
>  
>  	hrtimer_run_pending();
>  
> +	__run_deferrable_timers();
> +
>  	if (time_after_eq(jiffies, base->timer_jiffies))
>  		__run_timers(base);
>  }
> @@ -1623,7 +1685,8 @@ static void __init init_timer_cpu(struct tvec_base *base, int cpu)
>  	BUG_ON(base != tbase_get_base(base));
>  
>  	base->cpu = cpu;
> -	per_cpu(tvec_bases, cpu) = base;
> +	if (cpu != NR_CPUS)
> +		per_cpu(tvec_bases, cpu) = base;
>  	spin_lock_init(&base->lock);
>  
>  	for (j = 0; j < TVN_SIZE; j++) {
> @@ -1655,6 +1718,8 @@ static void __init init_timer_cpus(void)
>  
>  		init_timer_cpu(base, cpu);
>  	}
> +
> +	init_deferrable_timer();
>  }
>  
>  void __init init_timers(void)
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2015-04-28  2:41 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-09-16  2:59 [PATCH v2 RESEND/RFC] timer: make deferrable cpu unbound timers really not bound to a cpu Joonwoo Park
2014-09-23 18:33 ` Thomas Gleixner
2014-09-26 20:54   ` Joonwoo Park
2015-03-20  0:10   ` Saravana Kannan
2015-03-24 19:36     ` Saravana Kannan
2015-03-30 20:16       ` Joonwoo Park
2015-04-28  2:39       ` [PATCH 2/2] " Joonwoo Park
2015-04-28  2:41         ` Joonwoo Park

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).