linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC/PATCH] PM / Sleep: Timer quiesce in freeze state
@ 2014-10-21 15:15 Li, Aubrey
  2014-10-24 15:36 ` Peter Zijlstra
  0 siblings, 1 reply; 28+ messages in thread
From: Li, Aubrey @ 2014-10-21 15:15 UTC (permalink / raw)
  To: peterz, Rafael J. Wysocki, Brown, Len, alan, Thomas Gleixner,
	H. Peter Anvin
  Cc: linux-kernel, linux-pm@vger.kernel.org >> Linux PM list

The patch is based on v3.17, merged with Rafael's pm+acpi-3.18-rc1 tag from
linux-pm.git tree.

The patch is based on the patch PeterZ initially wrote.
---
Freeze is a general power saving state in which processes are frozen, devices
are suspended and CPUs are in idle state. However, when the system enters
freeze state, a few timers keep ticking and hence consume power
unnecessarily. The observed timer events in freeze state are:
- tick_sched_timer
- watchdog lockup detector
- realtime scheduler period timer

The system power consumption in freeze state will be reduced significantly
if we quiesce these timers.

On the Baytrail-T (ASUS_T100) platform, when the system is frozen to the low
power idle state (S0ix), quiescing these timers saves 29.8% power
(94.48 mW -> 66.32 mW).

The patch is also tested on:
- Sandy Bridge-EP system, both RTC alarm and power button are able to wake
  the system up from freeze state.
- HP laptop EliteBook 8460p, both RTC alarm and power button are able to
  wake the system up from freeze state.

Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Alan Cox <alan@linux.intel.com>
---
 arch/x86/kernel/apic/apic.c        |   8 ++
 drivers/cpuidle/cpuidle.c          |  12 +++
 kernel/power/suspend.c             | 175 +++++++++++++++++++++++++++++++++++--
 kernel/time/timekeeping.c          |   4 +-
 kernel/time/timekeeping_internal.h |   3 +
 5 files changed, 193 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 6776027..f2bb645 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -917,6 +917,14 @@ static void local_apic_timer_interrupt(void)
 	 */
 	inc_irq_stat(apic_timer_irqs);
 
+	/*
+	 * if timekeeping is suspended, the clock event device will be
+	 * suspended as well, so we are not supposed to invoke the event
+	 * handler of clock event device.
+	 */
+	if (unlikely(timekeeping_suspended))
+		return;
+
 	evt->event_handler(evt);
 }
 
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index ee9df5e..8f84f40 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -119,6 +119,18 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 	ktime_t time_start, time_end;
 	s64 diff;
 
+	/*
+	 * under the scenario of use deepest idle state, the timekeeping
+	 * could be suspended as well as the clock source device, so we
+	 * bypass the idle counter update for this case
+	 */
+	if (unlikely(use_deepest_state)) {
+		entered_state = target_state->enter(dev, drv, index);
+		if (!cpuidle_state_is_coupled(dev, drv, entered_state))
+			local_irq_enable();
+		return entered_state;
+	}
+
 	trace_cpu_idle_rcuidle(index, dev->cpu);
 	time_start = ktime_get();
 
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4ca9a33..e58d880 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -28,16 +28,20 @@
 #include <linux/ftrace.h>
 #include <trace/events/power.h>
 #include <linux/compiler.h>
+#include <linux/stop_machine.h>
+#include <linux/clockchips.h>
+#include <linux/hrtimer.h>
 
 #include "power.h"
+#include "../time/tick-internal.h"
+#include "../time/timekeeping_internal.h"
 
 const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
 const char *pm_states[PM_SUSPEND_MAX];
 
 static const struct platform_suspend_ops *suspend_ops;
 static const struct platform_freeze_ops *freeze_ops;
-static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
-static bool suspend_freeze_wake;
+static int suspend_freeze_wake;
 
 void freeze_set_ops(const struct platform_freeze_ops *ops)
 {
@@ -48,22 +52,179 @@ void freeze_set_ops(const struct platform_freeze_ops *ops)
 
 static void freeze_begin(void)
 {
-	suspend_freeze_wake = false;
+	suspend_freeze_wake = -1;
 }
 
-static void freeze_enter(void)
+enum freezer_state {
+	FREEZER_NONE,
+	FREEZER_PICK_TK,
+	FREEZER_SUSPEND_CLKEVT,
+	FREEZER_SUSPEND_TK,
+	FREEZER_IDLE,
+	FREEZER_RESUME_TK,
+	FREEZER_RESUME_CLKEVT,
+	FREEZER_EXIT,
+};
+
+struct freezer_data {
+	int			thread_num;
+	atomic_t		thread_ack;
+	enum freezer_state	state;
+};
+
+static void set_state(struct freezer_data *fd, enum freezer_state state)
+{
+	/* set ack counter */
+	atomic_set(&fd->thread_ack, fd->thread_num);
+	/* guarantee the write ordering between ack counter and state */
+	smp_wmb();
+	fd->state = state;
+}
+
+static void ack_state(struct freezer_data *fd)
+{
+	if (atomic_dec_and_test(&fd->thread_ack))
+		set_state(fd, fd->state + 1);
+}
+
+static void freezer_pick_tk(int cpu)
+{
+	if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) {
+		static DEFINE_SPINLOCK(lock);
+
+		spin_lock(&lock);
+		if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+			tick_do_timer_cpu = cpu;
+		spin_unlock(&lock);
+	}
+}
+
+static void freezer_suspend_clkevt(int cpu)
+{
+	if (tick_do_timer_cpu == cpu)
+		return;
+
+	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+}
+
+static void freezer_suspend_tk(int cpu)
 {
+	if (tick_do_timer_cpu != cpu)
+		return;
+
+	timekeeping_suspend();
+
 	cpuidle_use_deepest_state(true);
 	cpuidle_resume();
-	wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+}
+
+static void freezer_idle(int cpu)
+{
+	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+
+	stop_critical_timings();
+
+	while (suspend_freeze_wake == -1) {
+		int next_state;
+
+		/*
+		 * interrupt must be disabled before cpu enters idle
+		 */
+		local_irq_disable();
+
+		next_state = cpuidle_select(drv, dev);
+		if (next_state < 0) {
+			arch_cpu_idle();
+			continue;
+		}
+		/*
+		 * cpuidle_enter will return with interrupt enabled
+		 */
+		cpuidle_enter(drv, dev, next_state);
+	}
+
+	if (suspend_freeze_wake == cpu)
+		kick_all_cpus_sync();
+
+	start_critical_timings();
+}
+
+static void freezer_resume_tk(int cpu)
+{
+	if (tick_do_timer_cpu != cpu)
+		return;
+
 	cpuidle_pause();
 	cpuidle_use_deepest_state(false);
+
+	local_irq_disable();
+	timekeeping_resume();
+	local_irq_enable();
+}
+
+static void freezer_resume_clkevt(int cpu)
+{
+	if (tick_do_timer_cpu == cpu)
+		return;
+
+	touch_softlockup_watchdog();
+	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
+	local_irq_disable();
+	hrtimers_resume();
+	local_irq_enable();
+}
+
+typedef void (*freezer_fn)(int);
+
+static freezer_fn freezer_func[FREEZER_EXIT] = {
+	NULL,
+	freezer_pick_tk,
+	freezer_suspend_clkevt,
+	freezer_suspend_tk,
+	freezer_idle,
+	freezer_resume_tk,
+	freezer_resume_clkevt,
+};
+
+static int freezer_stopper_fn(void *arg)
+{
+	struct freezer_data *fd = arg;
+	enum freezer_state state = FREEZER_NONE;
+	int cpu = smp_processor_id();
+
+	do {
+		cpu_relax();
+		if (fd->state != state) {
+			state = fd->state;
+			if (freezer_func[state])
+				(*freezer_func[state])(cpu);
+			ack_state(fd);
+		}
+	} while (fd->state != FREEZER_EXIT);
+
+	return 0;
+}
+
+static void freeze_enter(void)
+{
+	struct freezer_data fd;
+
+	get_online_cpus();
+
+	fd.thread_num = num_online_cpus();
+	set_state(&fd, FREEZER_PICK_TK);
+
+	__stop_machine(freezer_stopper_fn, &fd, cpu_online_mask);
+
+	put_online_cpus();
 }
 
 void freeze_wake(void)
 {
-	suspend_freeze_wake = true;
-	wake_up(&suspend_freeze_wait_head);
+	if (suspend_freeze_wake != -1)
+		return;
+	suspend_freeze_wake = smp_processor_id();
 }
 EXPORT_SYMBOL_GPL(freeze_wake);
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ec1791f..23d8feb 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1114,7 +1114,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
  * xtime/wall_to_monotonic/jiffies/etc are
  * still managed by arch specific suspend/resume code.
  */
-static void timekeeping_resume(void)
+void timekeeping_resume(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	struct clocksource *clock = tk->tkr.clock;
@@ -1195,7 +1195,7 @@ static void timekeeping_resume(void)
 	hrtimers_resume();
 }
 
-static int timekeeping_suspend(void)
+int timekeeping_suspend(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned long flags;
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 4ea005a..ed7a574 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -26,4 +26,7 @@ static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
 }
 #endif
 
+extern int timekeeping_suspend(void);
+extern void timekeeping_resume(void);
+
 #endif /* _TIMEKEEPING_INTERNAL_H */
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [RFC/PATCH] PM / Sleep: Timer quiesce in freeze state
  2014-10-21 15:15 [RFC/PATCH] PM / Sleep: Timer quiesce in freeze state Li, Aubrey
@ 2014-10-24 15:36 ` Peter Zijlstra
  2014-10-27  6:27   ` Li, Aubrey
  2014-10-28  4:39   ` [RFC/PATCH] " Li, Aubrey
  0 siblings, 2 replies; 28+ messages in thread
From: Peter Zijlstra @ 2014-10-24 15:36 UTC (permalink / raw)
  To: Li, Aubrey
  Cc: Rafael J. Wysocki, Brown, Len, alan, Thomas Gleixner,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On Tue, Oct 21, 2014 at 11:15:10PM +0800, Li, Aubrey wrote:
> +++ b/arch/x86/kernel/apic/apic.c
> @@ -917,6 +917,14 @@ static void local_apic_timer_interrupt(void)
>  	 */
>  	inc_irq_stat(apic_timer_irqs);
>  
> +	/*
> +	 * if timekeeping is suspended, the clock event device will be
> +	 * suspended as well, so we are not supposed to invoke the event
> +	 * handler of clock event device.
> +	 */
> +	if (unlikely(timekeeping_suspended))
> +		return;
> +
>  	evt->event_handler(evt);
>  }
>  

How would this even happen? Didn't we just suspend the lapic?

> diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
> index 4ca9a33..e58d880 100644
> --- a/kernel/power/suspend.c
> +++ b/kernel/power/suspend.c
> @@ -28,16 +28,20 @@
>  #include <linux/ftrace.h>
>  #include <trace/events/power.h>
>  #include <linux/compiler.h>
> +#include <linux/stop_machine.h>
> +#include <linux/clockchips.h>
> +#include <linux/hrtimer.h>
>  
>  #include "power.h"
> +#include "../time/tick-internal.h"
> +#include "../time/timekeeping_internal.h"
>  
>  const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
>  const char *pm_states[PM_SUSPEND_MAX];
>  
>  static const struct platform_suspend_ops *suspend_ops;
>  static const struct platform_freeze_ops *freeze_ops;
> -static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
> -static bool suspend_freeze_wake;
> +static int suspend_freeze_wake;
>  
>  void freeze_set_ops(const struct platform_freeze_ops *ops)
>  {
> @@ -48,22 +52,179 @@ void freeze_set_ops(const struct platform_freeze_ops *ops)
>  
>  static void freeze_begin(void)
>  {
> -	suspend_freeze_wake = false;
> +	suspend_freeze_wake = -1;
>  }
>  
> -static void freeze_enter(void)
> +enum freezer_state {
> +	FREEZER_NONE,
> +	FREEZER_PICK_TK,
> +	FREEZER_SUSPEND_CLKEVT,
> +	FREEZER_SUSPEND_TK,
> +	FREEZER_IDLE,
> +	FREEZER_RESUME_TK,
> +	FREEZER_RESUME_CLKEVT,
> +	FREEZER_EXIT,
> +};
> +
> +struct freezer_data {
> +	int			thread_num;
> +	atomic_t		thread_ack;
> +	enum freezer_state	state;
> +};
> +
> +static void set_state(struct freezer_data *fd, enum freezer_state state)
> +{
> +	/* set ack counter */
> +	atomic_set(&fd->thread_ack, fd->thread_num);
> +	/* guarantee the write ordering between ack counter and state */
> +	smp_wmb();
> +	fd->state = state;
> +}
> +
> +static void ack_state(struct freezer_data *fd)
> +{
> +	if (atomic_dec_and_test(&fd->thread_ack))
> +		set_state(fd, fd->state + 1);
> +}
> +
> +static void freezer_pick_tk(int cpu)
> +{
> +	if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) {
> +		static DEFINE_SPINLOCK(lock);
> +
> +		spin_lock(&lock);
> +		if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
> +			tick_do_timer_cpu = cpu;
> +		spin_unlock(&lock);
> +	}
> +}
> +
> +static void freezer_suspend_clkevt(int cpu)
> +{
> +	if (tick_do_timer_cpu == cpu)
> +		return;
> +
> +	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
> +}
> +
> +static void freezer_suspend_tk(int cpu)
>  {
> +	if (tick_do_timer_cpu != cpu)
> +		return;
> +

I had a note here that this might be broken for clocksource drivers that
have suspend/resume methods. You seem to have 'lost' that note, is that
because you found it isn't a problem?

> +	timekeeping_suspend();
> +
>  	cpuidle_use_deepest_state(true);
>  	cpuidle_resume();
> -	wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
> +}
> +
> +static void freezer_idle(int cpu)
> +{
> +	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
> +	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
> +
> +	stop_critical_timings();
> +
> +	while (suspend_freeze_wake == -1) {
> +		int next_state;
> +
> +		/*
> +		 * interrupt must be disabled before cpu enters idle
> +		 */
> +		local_irq_disable();
> +
> +		next_state = cpuidle_select(drv, dev);
> +		if (next_state < 0) {
> +			arch_cpu_idle();
> +			continue;
> +		}
> +		/*
> +		 * cpuidle_enter will return with interrupt enabled
> +		 */
> +		cpuidle_enter(drv, dev, next_state);
> +	}
> +
> +	if (suspend_freeze_wake == cpu)
> +		kick_all_cpus_sync();
> +

So I disabled IRQs here

> +	start_critical_timings();
> +}
> +
> +static void freezer_resume_tk(int cpu)
> +{
> +	if (tick_do_timer_cpu != cpu)
> +		return;
> +
>  	cpuidle_pause();
>  	cpuidle_use_deepest_state(false);
> +

Such that they would still be disabled here

> +	local_irq_disable();
> +	timekeeping_resume();
> +	local_irq_enable();
> +}
> +
> +static void freezer_resume_clkevt(int cpu)
> +{
> +	if (tick_do_timer_cpu == cpu)
> +		return;
> +
> +	touch_softlockup_watchdog();
> +	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);

And here.

> +	local_irq_disable();
> +	hrtimers_resume();
> +	local_irq_enable();
> +}
> +
> +typedef void (*freezer_fn)(int);
> +
> +static freezer_fn freezer_func[FREEZER_EXIT] = {
> +	NULL,
> +	freezer_pick_tk,
> +	freezer_suspend_clkevt,
> +	freezer_suspend_tk,
> +	freezer_idle,
> +	freezer_resume_tk,
> +	freezer_resume_clkevt,
> +};

Because this is a stop_machine callback, which are nominally run with
IRQs disabled.

> +static int freezer_stopper_fn(void *arg)
> +{
> +	struct freezer_data *fd = arg;
> +	enum freezer_state state = FREEZER_NONE;
> +	int cpu = smp_processor_id();
> +
> +	do {
> +		cpu_relax();
> +		if (fd->state != state) {
> +			state = fd->state;
> +			if (freezer_func[state])
> +				(*freezer_func[state])(cpu);
> +			ack_state(fd);
> +		}
> +	} while (fd->state != FREEZER_EXIT);
> +	return 0;
> +}

Now I suppose the problem is with cpuidle_pause() which needs IPIs to
complete? Do we really need cpuidle_pause() there?
cpuidle_uninstall_handlers() seems like a daft thing to call just about
there.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC/PATCH] PM / Sleep: Timer quiesce in freeze state
  2014-10-24 15:36 ` Peter Zijlstra
@ 2014-10-27  6:27   ` Li, Aubrey
  2014-10-27  7:28     ` Peter Zijlstra
  2014-10-27  7:44     ` Peter Zijlstra
  2014-10-28  4:39   ` [RFC/PATCH] " Li, Aubrey
  1 sibling, 2 replies; 28+ messages in thread
From: Li, Aubrey @ 2014-10-27  6:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Rafael J. Wysocki, Brown, Len, alan, Thomas Gleixner,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On 2014/10/24 23:36, Peter Zijlstra wrote:
> On Tue, Oct 21, 2014 at 11:15:10PM +0800, Li, Aubrey wrote:
>> +++ b/arch/x86/kernel/apic/apic.c
>> @@ -917,6 +917,14 @@ static void local_apic_timer_interrupt(void)
>>  	 */
>>  	inc_irq_stat(apic_timer_irqs);
>>  
>> +	/*
>> +	 * if timekeeping is suspended, the clock event device will be
>> +	 * suspended as well, so we are not supposed to invoke the event
>> +	 * handler of clock event device.
>> +	 */
>> +	if (unlikely(timekeeping_suspended))
>> +		return;
>> +
>>  	evt->event_handler(evt);
>>  }
>>  
> 
> How would this even happen? Didn't we just suspend the lapic?

There are two race conditions in my mind.

The first one occurs after interrupts are disabled and before we
suspend the lapic. If an apic timer interrupt fires in this window, it
stays pending because interrupts are disabled. Then we suspend
timekeeping, enter idle, and exit idle with interrupts re-enabled, so the
timer interrupt is handled while timekeeping is suspended. So we will at
least hit WARN_ON(timekeeping_suspended) in ktime_get().

The other occurs after timekeeping_suspended = 1 and before we suspend
the lapic. If an apic timer interrupt occurs in this window, we will hit
the WARN in ktime_get() as above.

> 
>> diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
>> index 4ca9a33..e58d880 100644
>> --- a/kernel/power/suspend.c
>> +++ b/kernel/power/suspend.c
>> @@ -28,16 +28,20 @@
>>  #include <linux/ftrace.h>
>>  #include <trace/events/power.h>
>>  #include <linux/compiler.h>
>> +#include <linux/stop_machine.h>
>> +#include <linux/clockchips.h>
>> +#include <linux/hrtimer.h>
>>  
>>  #include "power.h"
>> +#include "../time/tick-internal.h"
>> +#include "../time/timekeeping_internal.h"
>>  
>>  const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
>>  const char *pm_states[PM_SUSPEND_MAX];
>>  
>>  static const struct platform_suspend_ops *suspend_ops;
>>  static const struct platform_freeze_ops *freeze_ops;
>> -static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
>> -static bool suspend_freeze_wake;
>> +static int suspend_freeze_wake;
>>  
>>  void freeze_set_ops(const struct platform_freeze_ops *ops)
>>  {
>> @@ -48,22 +52,179 @@ void freeze_set_ops(const struct platform_freeze_ops *ops)
>>  
>>  static void freeze_begin(void)
>>  {
>> -	suspend_freeze_wake = false;
>> +	suspend_freeze_wake = -1;
>>  }
>>  
>> -static void freeze_enter(void)
>> +enum freezer_state {
>> +	FREEZER_NONE,
>> +	FREEZER_PICK_TK,
>> +	FREEZER_SUSPEND_CLKEVT,
>> +	FREEZER_SUSPEND_TK,
>> +	FREEZER_IDLE,
>> +	FREEZER_RESUME_TK,
>> +	FREEZER_RESUME_CLKEVT,
>> +	FREEZER_EXIT,
>> +};
>> +
>> +struct freezer_data {
>> +	int			thread_num;
>> +	atomic_t		thread_ack;
>> +	enum freezer_state	state;
>> +};
>> +
>> +static void set_state(struct freezer_data *fd, enum freezer_state state)
>> +{
>> +	/* set ack counter */
>> +	atomic_set(&fd->thread_ack, fd->thread_num);
>> +	/* guarantee the write ordering between ack counter and state */
>> +	smp_wmb();
>> +	fd->state = state;
>> +}
>> +
>> +static void ack_state(struct freezer_data *fd)
>> +{
>> +	if (atomic_dec_and_test(&fd->thread_ack))
>> +		set_state(fd, fd->state + 1);
>> +}
>> +
>> +static void freezer_pick_tk(int cpu)
>> +{
>> +	if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) {
>> +		static DEFINE_SPINLOCK(lock);
>> +
>> +		spin_lock(&lock);
>> +		if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
>> +			tick_do_timer_cpu = cpu;
>> +		spin_unlock(&lock);
>> +	}
>> +}
>> +
>> +static void freezer_suspend_clkevt(int cpu)
>> +{
>> +	if (tick_do_timer_cpu == cpu)
>> +		return;
>> +
>> +	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
>> +}
>> +
>> +static void freezer_suspend_tk(int cpu)
>>  {
>> +	if (tick_do_timer_cpu != cpu)
>> +		return;
>> +
> 
> I had a note here that this might be broken for clocksource drivers that
> have suspend/resume methods. You seem to have 'lost' that note, is that
> because you found it isn't a problem?
> 
I don't see a problem as long as we do not access the clock source
before it resumes. But I think my testing didn't cover the case where
clock source drivers have suspend/resume methods. Can you please give
more details about what you are worried about?

>> +	timekeeping_suspend();
>> +
>>  	cpuidle_use_deepest_state(true);
>>  	cpuidle_resume();
>> -	wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
>> +}
>> +
>> +static void freezer_idle(int cpu)
>> +{
>> +	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
>> +	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
>> +
>> +	stop_critical_timings();
>> +
>> +	while (suspend_freeze_wake == -1) {
>> +		int next_state;
>> +
>> +		/*
>> +		 * interrupt must be disabled before cpu enters idle
>> +		 */
>> +		local_irq_disable();
>> +
>> +		next_state = cpuidle_select(drv, dev);
>> +		if (next_state < 0) {
>> +			arch_cpu_idle();
>> +			continue;
>> +		}
>> +		/*
>> +		 * cpuidle_enter will return with interrupt enabled
>> +		 */
>> +		cpuidle_enter(drv, dev, next_state);
>> +	}
>> +
>> +	if (suspend_freeze_wake == cpu)
>> +		kick_all_cpus_sync();
>> +
> 
> So I disabled IRQs here
> 
>> +	start_critical_timings();
>> +}
>> +
>> +static void freezer_resume_tk(int cpu)
>> +{
>> +	if (tick_do_timer_cpu != cpu)
>> +		return;
>> +
>>  	cpuidle_pause();
>>  	cpuidle_use_deepest_state(false);
>> +
> 
> Such that they would still be disabled here
> 
>> +	local_irq_disable();
>> +	timekeeping_resume();
>> +	local_irq_enable();
>> +}
>> +
>> +static void freezer_resume_clkevt(int cpu)
>> +{
>> +	if (tick_do_timer_cpu == cpu)
>> +		return;
>> +
>> +	touch_softlockup_watchdog();
>> +	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
> 
> And here.
> 
>> +	local_irq_disable();
>> +	hrtimers_resume();
>> +	local_irq_enable();
>> +}
>> +
>> +typedef void (*freezer_fn)(int);
>> +
>> +static freezer_fn freezer_func[FREEZER_EXIT] = {
>> +	NULL,
>> +	freezer_pick_tk,
>> +	freezer_suspend_clkevt,
>> +	freezer_suspend_tk,
>> +	freezer_idle,
>> +	freezer_resume_tk,
>> +	freezer_resume_clkevt,
>> +};
> 
> Because this is a stop_machine callback, which are nominally run with
> IRQs disabled.

Let me double confirm and address this concern in a later mail.
> 
>> +static int freezer_stopper_fn(void *arg)
>> +{
>> +	struct freezer_data *fd = arg;
>> +	enum freezer_state state = FREEZER_NONE;
>> +	int cpu = smp_processor_id();
>> +
>> +	do {
>> +		cpu_relax();
>> +		if (fd->state != state) {
>> +			state = fd->state;
>> +			if (freezer_func[state])
>> +				(*freezer_func[state])(cpu);
>> +			ack_state(fd);
>> +		}
>> +	} while (fd->state != FREEZER_EXIT);
>> +	return 0;
>> +}
> 
> Now I suppose the problem is with cpu_pause() which needs IPIs to
> complete? Do we really need cpuidle_pause() there?
> cpuidle_uninstall_handlers() seems like a daft thing to call just about
> there.

Please check the log of 8651f97bd951d0bb1c10fa24e3fa3455193f3548.
Rafael should know more about this question than I do.

Thanks,
-Aubrey
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC/PATCH] PM / Sleep: Timer quiesce in freeze state
  2014-10-27  6:27   ` Li, Aubrey
@ 2014-10-27  7:28     ` Peter Zijlstra
  2014-10-28  4:32       ` Li, Aubrey
  2014-10-27  7:44     ` Peter Zijlstra
  1 sibling, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2014-10-27  7:28 UTC (permalink / raw)
  To: Li, Aubrey
  Cc: Rafael J. Wysocki, Brown, Len, alan, Thomas Gleixner,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On Mon, Oct 27, 2014 at 02:27:27PM +0800, Li, Aubrey wrote:
> > Now I suppose the problem is with cpu_pause() which needs IPIs to
> > complete? Do we really need cpuidle_pause() there?
> > cpuidle_uninstall_handlers() seems like a daft thing to call just about
> > there.
> 
> Please check the log of 8651f97bd951d0bb1c10fa24e3fa3455193f3548.
> Rafael should know more this question than me.

That changelog explains it's complete bollocks to do it here. We _want_
to enter and/or remain in deep idle states.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC/PATCH] PM / Sleep: Timer quiesce in freeze state
  2014-10-27  6:27   ` Li, Aubrey
  2014-10-27  7:28     ` Peter Zijlstra
@ 2014-10-27  7:44     ` Peter Zijlstra
  2014-10-28  7:52       ` Li, Aubrey
  1 sibling, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2014-10-27  7:44 UTC (permalink / raw)
  To: Li, Aubrey
  Cc: Rafael J. Wysocki, Brown, Len, alan, Thomas Gleixner,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On Mon, Oct 27, 2014 at 02:27:27PM +0800, Li, Aubrey wrote:
> >> +static void freezer_suspend_tk(int cpu)
> >>  {
> >> +	if (tick_do_timer_cpu != cpu)
> >> +		return;
> >> +
> > 
> > I had a note here that this might be broken for clocksource drivers that
> > have suspend/resume methods. You seem to have 'lost' that note, is that
> > because you found it isn't a problem?
> > 
> I don't see it's a problem as long as we do not refer the clock source
> before it resumes. But I think my testing didn't cover that case that
> clock source drivers have suspend/resume methods. Can you please give
> more details what do you worry about?

I can't seem to recall :/ Maybe I conflated clocksource and clockevents
and figured we'd need to run the clocksource suspend callback on each
cpu.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC/PATCH] PM / Sleep: Timer quiesce in freeze state
  2014-10-27  7:28     ` Peter Zijlstra
@ 2014-10-28  4:32       ` Li, Aubrey
  2014-10-28  8:29         ` Peter Zijlstra
  0 siblings, 1 reply; 28+ messages in thread
From: Li, Aubrey @ 2014-10-28  4:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Rafael J. Wysocki, Brown, Len, alan, Thomas Gleixner,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On 2014/10/27 15:28, Peter Zijlstra wrote:
> On Mon, Oct 27, 2014 at 02:27:27PM +0800, Li, Aubrey wrote:
>>> Now I suppose the problem is with cpu_pause() which needs IPIs to
>>> complete? Do we really need cpuidle_pause() there?
>>> cpuidle_uninstall_handlers() seems like a daft thing to call just about
>>> there.
>>
>> Please check the log of 8651f97bd951d0bb1c10fa24e3fa3455193f3548.
>> Rafael should know more this question than me.
> 
> That changelog explains its complete bollocks to do it here. We _want_
> to enter and/or remain in deep idle states.

cpuidle_resume() will be called at the end of dpm_resume_noirq(). So we
still are able to enter deep idle states after resume.

Thanks,
-Aubrey

> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 
> 


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC/PATCH] PM / Sleep: Timer quiesce in freeze state
  2014-10-24 15:36 ` Peter Zijlstra
  2014-10-27  6:27   ` Li, Aubrey
@ 2014-10-28  4:39   ` Li, Aubrey
  2014-10-28  8:25     ` Peter Zijlstra
  1 sibling, 1 reply; 28+ messages in thread
From: Li, Aubrey @ 2014-10-28  4:39 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Rafael J. Wysocki, Brown, Len, alan, Thomas Gleixner,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On 2014/10/24 23:36, Peter Zijlstra wrote:
>> +
>> +static void freezer_idle(int cpu)
>> +{
>> +	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
>> +	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
>> +
>> +	stop_critical_timings();
>> +
>> +	while (suspend_freeze_wake == -1) {
>> +		int next_state;
>> +
>> +		/*
>> +		 * interrupt must be disabled before cpu enters idle
>> +		 */
>> +		local_irq_disable();
>> +
>> +		next_state = cpuidle_select(drv, dev);
>> +		if (next_state < 0) {
>> +			arch_cpu_idle();
>> +			continue;
>> +		}
>> +		/*
>> +		 * cpuidle_enter will return with interrupt enabled
>> +		 */
>> +		cpuidle_enter(drv, dev, next_state);
>> +	}
>> +
>> +	if (suspend_freeze_wake == cpu)
>> +		kick_all_cpus_sync();
>> +
> 
> So I disabled IRQs here
> 
>> +	start_critical_timings();
>> +}
>> +
>> +static void freezer_resume_tk(int cpu)
>> +{
>> +	if (tick_do_timer_cpu != cpu)
>> +		return;
>> +
>>  	cpuidle_pause();
>>  	cpuidle_use_deepest_state(false);
>> +
> 
> Such that they would still be disabled here
> 
>> +	local_irq_disable();
>> +	timekeeping_resume();
>> +	local_irq_enable();
>> +}
>> +
>> +static void freezer_resume_clkevt(int cpu)
>> +{
>> +	if (tick_do_timer_cpu == cpu)
>> +		return;
>> +
>> +	touch_softlockup_watchdog();
>> +	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
> 
> And here.
> 
>> +	local_irq_disable();
>> +	hrtimers_resume();
>> +	local_irq_enable();
>> +}
>> +
>> +typedef void (*freezer_fn)(int);
>> +
>> +static freezer_fn freezer_func[FREEZER_EXIT] = {
>> +	NULL,
>> +	freezer_pick_tk,
>> +	freezer_suspend_clkevt,
>> +	freezer_suspend_tk,
>> +	freezer_idle,
>> +	freezer_resume_tk,
>> +	freezer_resume_clkevt,
>> +};
> 
> Because this is a stop_machine callback, which are nominally run with
> IRQs disabled.
> 
>> +static int freezer_stopper_fn(void *arg)
>> +{
>> +	struct freezer_data *fd = arg;
>> +	enum freezer_state state = FREEZER_NONE;
>> +	int cpu = smp_processor_id();
>> +
>> +	do {
>> +		cpu_relax();
>> +		if (fd->state != state) {
>> +			state = fd->state;
>> +			if (freezer_func[state])
>> +				(*freezer_func[state])(cpu);
>> +			ack_state(fd);
>> +		}
>> +	} while (fd->state != FREEZER_EXIT);
>> +	return 0;
>> +}
> 
> Now I suppose the problem is with cpu_pause() which needs IPIs to
> complete? 

Yes, cpuidle_pause() will invoke SMP IPI functions, which need interrupts
to be enabled. That is why I changed the irq ops as above. Actually, I have
an earlier version that moves cpuidle_pause()/cpuidle_resume() out of
stop_machine(); might that be a better solution?

Thanks,
-Aubrey


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC/PATCH] PM / Sleep: Timer quiesce in freeze state
  2014-10-27  7:44     ` Peter Zijlstra
@ 2014-10-28  7:52       ` Li, Aubrey
  2014-10-28  8:25         ` Peter Zijlstra
  0 siblings, 1 reply; 28+ messages in thread
From: Li, Aubrey @ 2014-10-28  7:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Rafael J. Wysocki, Brown, Len, alan, Thomas Gleixner,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On 2014/10/27 15:44, Peter Zijlstra wrote:
> On Mon, Oct 27, 2014 at 02:27:27PM +0800, Li, Aubrey wrote:
>>>> +static void freezer_suspend_tk(int cpu)
>>>>  {
>>>> +	if (tick_do_timer_cpu != cpu)
>>>> +		return;
>>>> +
>>>
>>> I had a note here that this might be broken for clocksource drivers that
>>> have suspend/resume methods. You seem to have 'lost' that note, is that
>>> because you found it isn't a problem?
>>>
>> I don't see it's a problem as long as we do not refer the clock source
>> before it resumes. But I think my testing didn't cover that case that
>> clock source drivers have suspend/resume methods. Can you please give
>> more details what do you worry about?
> 
> I can't seem to recall :/ Maybe I conflated clocksource and clockevents
> and figured we'd need to run the clocksource suspend callback on each
> cpu.

Neither clocksource nor clockevents is a per-CPU device, so why do we need
to run their suspend callback on *each* CPU?

Thanks,
-Aubrey

> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 
> 


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC/PATCH] PM / Sleep: Timer quiesce in freeze state
  2014-10-28  7:52       ` Li, Aubrey
@ 2014-10-28  8:25         ` Peter Zijlstra
  2014-10-28 23:22           ` Li, Aubrey
  0 siblings, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2014-10-28  8:25 UTC (permalink / raw)
  To: Li, Aubrey
  Cc: Rafael J. Wysocki, Brown, Len, alan, Thomas Gleixner,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On Tue, Oct 28, 2014 at 03:52:17PM +0800, Li, Aubrey wrote:

> Both clocksource and clockevents are not per-cpu device, why do we need
> to run their suspend callback on *each* cpu?

Uhm, you mean to say we don't use per-cpu timer lists and per-cpu timer
hardware for clockevents then?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC/PATCH] PM / Sleep: Timer quiesce in freeze state
  2014-10-28  4:39   ` [RFC/PATCH] " Li, Aubrey
@ 2014-10-28  8:25     ` Peter Zijlstra
  0 siblings, 0 replies; 28+ messages in thread
From: Peter Zijlstra @ 2014-10-28  8:25 UTC (permalink / raw)
  To: Li, Aubrey
  Cc: Rafael J. Wysocki, Brown, Len, alan, Thomas Gleixner,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On Tue, Oct 28, 2014 at 12:39:53PM +0800, Li, Aubrey wrote:

> > Now I suppose the problem is with cpu_pause() which needs IPIs to
> > complete? 
> 
> Yes, cpu_pause() will invoke smp IPI functions which need interrupt is
> enabled. So I changed irq ops like above. Actually, I have an early
> version to move cpuidle_pause()/cpuidle_resume() out of stop_machine(),
> that might be a better solution?

I think you can simply remove them altogether, they're nonsense.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC/PATCH] PM / Sleep: Timer quiesce in freeze state
  2014-10-28  4:32       ` Li, Aubrey
@ 2014-10-28  8:29         ` Peter Zijlstra
  2014-10-28 22:46           ` Li, Aubrey
  0 siblings, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2014-10-28  8:29 UTC (permalink / raw)
  To: Li, Aubrey
  Cc: Rafael J. Wysocki, Brown, Len, alan, Thomas Gleixner,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On Tue, Oct 28, 2014 at 12:32:16PM +0800, Li, Aubrey wrote:
> On 2014/10/27 15:28, Peter Zijlstra wrote:
> > On Mon, Oct 27, 2014 at 02:27:27PM +0800, Li, Aubrey wrote:
> >>> Now I suppose the problem is with cpu_pause() which needs IPIs to
> >>> complete? Do we really need cpuidle_pause() there?
> >>> cpuidle_uninstall_handlers() seems like a daft thing to call just about
> >>> there.
> >>
> >> Please check the log of 8651f97bd951d0bb1c10fa24e3fa3455193f3548.
> >> Rafael should know more this question than me.
> > 
> > That changelog explains its complete bollocks to do it here. We _want_
> > to enter and/or remain in deep idle states.
> 
> cpuidle_resume() will be called at the end of dpm_resume_noirq(). So we
> still are able to enter deep idle states after resume.

cpuidle_resume is absolute crap, as is cpuidle_suspend for that matter
-- in this case.

The only reason we needed cpuidle_suspend is because some BIOS shat its
pants when some CPUs were in higher C states while trying to do the S3
thing. We're not going to use S states or BIOS calls _at_all_, so no
point in kicking CPUs out of their deep C states.

Read that changelog you referred me to again.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC/PATCH] PM / Sleep: Timer quiesce in freeze state
  2014-10-28  8:29         ` Peter Zijlstra
@ 2014-10-28 22:46           ` Li, Aubrey
  2014-10-29  8:21             ` Peter Zijlstra
  0 siblings, 1 reply; 28+ messages in thread
From: Li, Aubrey @ 2014-10-28 22:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Rafael J. Wysocki, Brown, Len, alan, Thomas Gleixner,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On 2014/10/28 16:29, Peter Zijlstra wrote:
> On Tue, Oct 28, 2014 at 12:32:16PM +0800, Li, Aubrey wrote:
>> On 2014/10/27 15:28, Peter Zijlstra wrote:
>>> On Mon, Oct 27, 2014 at 02:27:27PM +0800, Li, Aubrey wrote:
>>>>> Now I suppose the problem is with cpu_pause() which needs IPIs to
>>>>> complete? Do we really need cpuidle_pause() there?
>>>>> cpuidle_uninstall_handlers() seems like a daft thing to call just about
>>>>> there.
>>>>
>>>> Please check the log of 8651f97bd951d0bb1c10fa24e3fa3455193f3548.
>>>> Rafael should know more this question than me.
>>>
>>> That changelog explains its complete bollocks to do it here. We _want_
>>> to enter and/or remain in deep idle states.
>>
>> cpuidle_resume() will be called at the end of dpm_resume_noirq(). So we
>> still are able to enter deep idle states after resume.
> 
> cpuidle_resume is absolute crap, as is cpuidle_suspend for that matter
> -- in this case.
> 
> The only reason we needed cpuidle_suspend is because some BIOS shat its
> pants when some CPUs were in higher C states while trying to do the S3
> thing. We're not going to use S states or BIOS calls _at_all_, so no
> point in kicking CPUs out of their deep C states.

We already kicked CPUs out of their deep C states in dpm_suspend_noirq().

We pause cpuidle in dpm_suspend_noirq() and resume cpuidle in
dpm_resume_noirq(), so currently we can't enter deep C-states during S
states. That is intentional, as a workaround for some buggy BIOSes.

However, for the freeze state the intention is different: we always want
to enter the *deepest* C-state every time we enter freeze.
So we need cpuidle_resume() to make sure deep C-states are available in freeze.

So back to your question in another email,

> I think you can simply remove them altogether, they're nonsense.

We need them to resume cpuidle in freeze.

Thanks,
-Aubrey

> 
> Read that changelog you referred me to again.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 
> 


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC/PATCH] PM / Sleep: Timer quiesce in freeze state
  2014-10-28  8:25         ` Peter Zijlstra
@ 2014-10-28 23:22           ` Li, Aubrey
  2014-10-29  8:24             ` Peter Zijlstra
  0 siblings, 1 reply; 28+ messages in thread
From: Li, Aubrey @ 2014-10-28 23:22 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Rafael J. Wysocki, Brown, Len, alan, Thomas Gleixner,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On 2014/10/28 16:25, Peter Zijlstra wrote:
> On Tue, Oct 28, 2014 at 03:52:17PM +0800, Li, Aubrey wrote:
> 
>> Both clocksource and clockevents are not per-cpu device, why do we need
>> to run their suspend callback on *each* cpu?
> 
> Uhm, you mean to say we don't use per-cpu timer lists and per-cpu timer
> hardware for clockevents then?
> 

From the OS level, the tick device is currently a per-cpu implementation, while
clocksource and clockevent devices are global devices.

We already stop tick by clockevents_notify(suspend) on each cpu, that
addresses per-cpu timer list.

And we already call clocksource_suspend() and clockevents_suspend() in
timekeeping_suspend() on the tick timer CPU. Yes, we don't suspend the
per-cpu timer hardware on x86, because x86 does not have a lapic timer
suspend implementation. If we need to implement this, I think we can do
the cross-CPU calls in clocksource/clockevents suspend(), but I don't
see any need to do this now.

so, I think we are okay now, :)

Thanks,
-Aubrey

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC/PATCH] PM / Sleep: Timer quiesce in freeze state
  2014-10-28 22:46           ` Li, Aubrey
@ 2014-10-29  8:21             ` Peter Zijlstra
  2014-10-29 15:09               ` Li, Aubrey
  0 siblings, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2014-10-29  8:21 UTC (permalink / raw)
  To: Li, Aubrey
  Cc: Rafael J. Wysocki, Brown, Len, alan, Thomas Gleixner,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On Wed, Oct 29, 2014 at 06:46:03AM +0800, Li, Aubrey wrote:
> On 2014/10/28 16:29, Peter Zijlstra wrote:
> > On Tue, Oct 28, 2014 at 12:32:16PM +0800, Li, Aubrey wrote:
> >> On 2014/10/27 15:28, Peter Zijlstra wrote:
> >>> On Mon, Oct 27, 2014 at 02:27:27PM +0800, Li, Aubrey wrote:
> >>>>> Now I suppose the problem is with cpu_pause() which needs IPIs to
> >>>>> complete? Do we really need cpuidle_pause() there?
> >>>>> cpuidle_uninstall_handlers() seems like a daft thing to call just about
> >>>>> there.
> >>>>
> >>>> Please check the log of 8651f97bd951d0bb1c10fa24e3fa3455193f3548.
> >>>> Rafael should know more this question than me.
> >>>
> >>> That changelog explains its complete bollocks to do it here. We _want_
> >>> to enter and/or remain in deep idle states.
> >>
> >> cpuidle_resume() will be called at the end of dpm_resume_noirq(). So we
> >> still are able to enter deep idle states after resume.
> > 
> > cpuidle_resume is absolute crap, as is cpuidle_suspend for that matter
> > -- in this case.
> > 
> > The only reason we needed cpuidle_suspend is because some BIOS shat its
> > pants when some CPUs were in higher C states while trying to do the S3
> > thing. We're not going to use S states or BIOS calls _at_all_, so no
> > point in kicking CPUs out of their deep C states.
> 
> We already kicked CPUs out of their deep C states in dpm_suspend_noirq().
> 
> We pause cpuidle in dpm_suspend_noirq() and resume cpuidle in
> dpm_resume_noirq(), so currently we can't enter deep c-state during S
> states. That's an intention for some buggy BIOS.

And work arounds for buggy crap hardware should not be in generic code.
They should be in the platform drivers associated with said crap bugs.

But I think I see what you're saying, we're going through this dpm_ crap
even for suspend to idle, which is wrong too.

> However, for freeze state, there is another intention that we want
> always to enter the *deepest* c-state every time we enter freeze.
> So we need cpuidle_resume() to make sure we have deep cstate  in freeze.
> 
> So back to your question in another email,
> 
> > I think you can simply remove them altogether, they're nonsense.
> 
> We need them to resume cpuidle in freeze.

So you can do cpuidle_resume() before we do the stop machine dance, but
ideally it'd all be ripped out from generic code and stuffed into
the platform drivers where it belongs. But at the very least it should
be isolated to the S3 path, I bet suspend to disk doesn't care either.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC/PATCH] PM / Sleep: Timer quiesce in freeze state
  2014-10-28 23:22           ` Li, Aubrey
@ 2014-10-29  8:24             ` Peter Zijlstra
  2014-10-30  2:58               ` [PATCH v2] " Li, Aubrey
  0 siblings, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2014-10-29  8:24 UTC (permalink / raw)
  To: Li, Aubrey
  Cc: Rafael J. Wysocki, Brown, Len, alan, Thomas Gleixner,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On Wed, Oct 29, 2014 at 07:22:35AM +0800, Li, Aubrey wrote:
> On 2014/10/28 16:25, Peter Zijlstra wrote:
> > On Tue, Oct 28, 2014 at 03:52:17PM +0800, Li, Aubrey wrote:
> > 
> >> Both clocksource and clockevents are not per-cpu device, why do we need
> >> to run their suspend callback on *each* cpu?
> > 
> > Uhm, you mean to say we don't use per-cpu timer lists and per-cpu timer
> > hardware for clockevents then?
> > 
> 
> From OS level, currently tick device is per-cpu implementation while
> clocksource and clockevent devices are global device.
> 
> We already stop tick by clockevents_notify(suspend) on each cpu, that
> addresses per-cpu timer list.

Right, I know. But I was saying I might have confused myself between
events and sources while going through that call chain, thereby
(mistakenly) thinking the source suspend code needed more than the 1
cpu.

It's easy to confuse yourself trying to reverse engineer that opaque
callchain :-)

> And, we already call clocksource_suspend() and clockevents_suspend() in
> timekeeping_suspend() on the tick timer CPU. Yes, we didn't suspend
> per-cpu timer hardware on x86 because x86 does not have lapic timer
> suspend implementation. If we need to implement this, I think we can do
> the cross-CPU calls in clocksource/clockevents suspend(), but I didn't
> see any necessary we need to do this now.
> 
> so, I think we are okay now, :)

Right, I tend to agree, we'll find out quickly enough once those
platforms will try this code anyhow ;-)

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [RFC/PATCH] PM / Sleep: Timer quiesce in freeze state
  2014-10-29  8:21             ` Peter Zijlstra
@ 2014-10-29 15:09               ` Li, Aubrey
  0 siblings, 0 replies; 28+ messages in thread
From: Li, Aubrey @ 2014-10-29 15:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Rafael J. Wysocki, Brown, Len, alan, Thomas Gleixner,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On 2014/10/29 16:21, Peter Zijlstra wrote:
> On Wed, Oct 29, 2014 at 06:46:03AM +0800, Li, Aubrey wrote:
>> On 2014/10/28 16:29, Peter Zijlstra wrote:
>>> On Tue, Oct 28, 2014 at 12:32:16PM +0800, Li, Aubrey wrote:
>>>> On 2014/10/27 15:28, Peter Zijlstra wrote:
>>>>> On Mon, Oct 27, 2014 at 02:27:27PM +0800, Li, Aubrey wrote:
>>>>>>> Now I suppose the problem is with cpu_pause() which needs IPIs to
>>>>>>> complete? Do we really need cpuidle_pause() there?
>>>>>>> cpuidle_uninstall_handlers() seems like a daft thing to call just about
>>>>>>> there.
>>>>>>
>>>>>> Please check the log of 8651f97bd951d0bb1c10fa24e3fa3455193f3548.
>>>>>> Rafael should know more this question than me.
>>>>>
>>>>> That changelog explains its complete bollocks to do it here. We _want_
>>>>> to enter and/or remain in deep idle states.
>>>>
>>>> cpuidle_resume() will be called at the end of dpm_resume_noirq(). So we
>>>> still are able to enter deep idle states after resume.
>>>
>>> cpuidle_resume is absolute crap, as is cpuidle_suspend for that matter
>>> -- in this case.
>>>
>>> The only reason we needed cpuidle_suspend is because some BIOS shat its
>>> pants when some CPUs were in higher C states while trying to do the S3
>>> thing. We're not going to use S states or BIOS calls _at_all_, so no
>>> point in kicking CPUs out of their deep C states.
>>
>> We already kicked CPUs out of their deep C states in dpm_suspend_noirq().
>>
>> We pause cpuidle in dpm_suspend_noirq() and resume cpuidle in
>> dpm_resume_noirq(), so currently we can't enter deep c-state during S
>> states. That's an intention for some buggy BIOS.
> 
> And work arounds for buggy crap hardware should not be in generic code.
> They should be in the platform drivers associated with said crap bugs.
> 
> But I think I see what you're saying, we're going through this dpm_ crap
> even for suspend to idle, which is wrong too.
> 
>> However, for freeze state, there is another intention that we want
>> always to enter the *deepest* c-state every time we enter freeze.
>> So we need cpuidle_resume() to make sure we have deep cstate  in freeze.
>>
>> So back to your question in another email,
>>
>>> I think you can simply remove them altogether, they're nonsense.
>>
>> We need them to resume cpuidle in freeze.
> 
> So you can do cpuidle_resume() before we do the stop machine dance,

I agree, this can leave the current behavior unchanged. I'll send out a
refined version soon.

> but ideally it'd all be ripped out from generic code and stuffed into
> the platform drivers where it belongs. But at the very least it should
> be isolated to the S3 path, I bet suspend to disk doesn't care either.
> 
I think this is a good question for Rafael; I have asked it before.

Thanks,
-Aubrey

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v2] PM / Sleep: Timer quiesce in freeze state
  2014-10-29  8:24             ` Peter Zijlstra
@ 2014-10-30  2:58               ` Li, Aubrey
  2014-11-08  2:05                 ` Rafael J. Wysocki
  2014-11-12 21:09                 ` Thomas Gleixner
  0 siblings, 2 replies; 28+ messages in thread
From: Li, Aubrey @ 2014-10-30  2:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Rafael J. Wysocki, Brown, Len, alan, Thomas Gleixner,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

The patch is based on v3.17, merged with Rafael's pm+acpi-3.18-rc1 tag from
linux-pm.git tree.

The patch is based on the patch PeterZ initially wrote.
---
Freeze is a general power-saving state in which processes are frozen, devices
are suspended and CPUs are idle. However, when the system enters the
freeze state, a few timers keep ticking and hence consume more
power unnecessarily. The observed timer events in freeze state are:
- tick_sched_timer
- watchdog lockup detector
- realtime scheduler period timer

The system power consumption in freeze state will be reduced significantly
if we quiesce these timers.

On the Baytrail-T (ASUS_T100) platform, when the system is frozen into the
low-power idle state (S0ix), quiescing these timers saves 29.8% power
(94.48 mW -> 66.32 mW).

The patch is also tested on:
- Sandybridge-EP system, both RTC alarm and power button are able to wake
  the system up from freeze state.
- HP laptop EliteBook 8460p, both RTC alarm and power button are able to
  wake the system up from freeze state.

Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Alan Cox <alan@linux.intel.com>
---
 arch/x86/kernel/apic/apic.c        |   8 ++
 drivers/cpuidle/cpuidle.c          |  12 +++
 kernel/power/suspend.c             | 185 +++++++++++++++++++++++++++++++++++--
 kernel/time/timekeeping.c          |   4 +-
 kernel/time/timekeeping_internal.h |   3 +
 5 files changed, 204 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 6776027..f2bb645 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -917,6 +917,14 @@ static void local_apic_timer_interrupt(void)
 	 */
 	inc_irq_stat(apic_timer_irqs);
 
+	/*
+	 * if timekeeping is suspended, the clock event device will be
+	 * suspended as well, so we are not supposed to invoke the event
+	 * handler of clock event device.
+	 */
+	if (unlikely(timekeeping_suspended))
+		return;
+
 	evt->event_handler(evt);
 }
 
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index ee9df5e..8f84f40 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -119,6 +119,18 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 	ktime_t time_start, time_end;
 	s64 diff;
 
+	/*
+	 * under the scenario of use deepest idle state, the timekeeping
+	 * could be suspended as well as the clock source device, so we
+	 * bypass the idle counter update for this case
+	 */
+	if (unlikely(use_deepest_state)) {
+		entered_state = target_state->enter(dev, drv, index);
+		if (!cpuidle_state_is_coupled(dev, drv, entered_state))
+			local_irq_enable();
+		return entered_state;
+	}
+
 	trace_cpu_idle_rcuidle(index, dev->cpu);
 	time_start = ktime_get();
 
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4ca9a33..660fd15 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -28,16 +28,20 @@
 #include <linux/ftrace.h>
 #include <trace/events/power.h>
 #include <linux/compiler.h>
+#include <linux/stop_machine.h>
+#include <linux/clockchips.h>
+#include <linux/hrtimer.h>
 
 #include "power.h"
+#include "../time/tick-internal.h"
+#include "../time/timekeeping_internal.h"
 
 const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
 const char *pm_states[PM_SUSPEND_MAX];
 
 static const struct platform_suspend_ops *suspend_ops;
 static const struct platform_freeze_ops *freeze_ops;
-static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
-static bool suspend_freeze_wake;
+static int suspend_freeze_wake;
 
 void freeze_set_ops(const struct platform_freeze_ops *ops)
 {
@@ -48,22 +52,191 @@ void freeze_set_ops(const struct platform_freeze_ops *ops)
 
 static void freeze_begin(void)
 {
-	suspend_freeze_wake = false;
+	suspend_freeze_wake = -1;
+}
+
+enum freezer_state {
+	FREEZER_NONE,
+	FREEZER_PICK_TK,
+	FREEZER_SUSPEND_CLKEVT,
+	FREEZER_SUSPEND_TK,
+	FREEZER_IDLE,
+	FREEZER_RESUME_TK,
+	FREEZER_RESUME_CLKEVT,
+	FREEZER_EXIT,
+};
+
+struct freezer_data {
+	int			thread_num;
+	atomic_t		thread_ack;
+	enum freezer_state	state;
+};
+
+static void set_state(struct freezer_data *fd, enum freezer_state state)
+{
+	/* set ack counter */
+	atomic_set(&fd->thread_ack, fd->thread_num);
+	/* guarantee the write ordering between ack counter and state */
+	smp_wmb();
+	fd->state = state;
+}
+
+static void ack_state(struct freezer_data *fd)
+{
+	if (atomic_dec_and_test(&fd->thread_ack))
+		set_state(fd, fd->state + 1);
+}
+
+static void freezer_pick_tk(int cpu)
+{
+	if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) {
+		static DEFINE_SPINLOCK(lock);
+
+		spin_lock(&lock);
+		if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+			tick_do_timer_cpu = cpu;
+		spin_unlock(&lock);
+	}
+}
+
+static void freezer_suspend_clkevt(int cpu)
+{
+	if (tick_do_timer_cpu == cpu)
+		return;
+
+	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+}
+
+static void freezer_suspend_tk(int cpu)
+{
+	if (tick_do_timer_cpu != cpu)
+		return;
+
+	timekeeping_suspend();
+
+}
+
+static void freezer_idle(int cpu)
+{
+	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+
+	stop_critical_timings();
+
+	while (suspend_freeze_wake == -1) {
+		int next_state;
+
+		/*
+		 * interrupt must be disabled before cpu enters idle
+		 */
+		local_irq_disable();
+
+		next_state = cpuidle_select(drv, dev);
+		if (next_state < 0) {
+			arch_cpu_idle();
+			continue;
+		}
+		/*
+		 * cpuidle_enter will return with interrupt enabled
+		 */
+		cpuidle_enter(drv, dev, next_state);
+	}
+
+	if (suspend_freeze_wake == cpu)
+		kick_all_cpus_sync();
+
+	/*
+	 * We disable interrupt here for the rest of resume operations
+	 */
+	local_irq_disable();
+	start_critical_timings();
+}
+
+static void freezer_resume_tk(int cpu)
+{
+	if (tick_do_timer_cpu != cpu)
+		return;
+
+	timekeeping_resume();
+}
+
+static void freezer_resume_clkevt(int cpu)
+{
+	if (tick_do_timer_cpu == cpu) {
+		/*
+		 * Turn on the interrupt on the tick timer CPU as freezer
+		 * tasks are finished.
+		 */
+		local_irq_enable();
+		return;
+	}
+
+	touch_softlockup_watchdog();
+	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
+	hrtimers_resume();
+	/*
+	 * Turn on the interrupt on the non-tick-timer CPUs as freezer
+	 * tasks are finished
+	 */
+	local_irq_enable();
+}
+
+typedef void (*freezer_fn)(int);
+
+static freezer_fn freezer_func[FREEZER_EXIT] = {
+	NULL,
+	freezer_pick_tk,
+	freezer_suspend_clkevt,
+	freezer_suspend_tk,
+	freezer_idle,
+	freezer_resume_tk,
+	freezer_resume_clkevt,
+};
+
+static int freezer_stopper_fn(void *arg)
+{
+	struct freezer_data *fd = arg;
+	enum freezer_state state = FREEZER_NONE;
+	int cpu = smp_processor_id();
+
+	do {
+		cpu_relax();
+		if (fd->state != state) {
+			state = fd->state;
+			if (freezer_func[state])
+				(*freezer_func[state])(cpu);
+			ack_state(fd);
+		}
+	} while (fd->state != FREEZER_EXIT);
+
+	return 0;
 }
 
 static void freeze_enter(void)
 {
+	struct freezer_data fd;
+
 	cpuidle_use_deepest_state(true);
 	cpuidle_resume();
-	wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+
+	get_online_cpus();
+
+	fd.thread_num = num_online_cpus();
+	set_state(&fd, FREEZER_PICK_TK);
+
+	__stop_machine(freezer_stopper_fn, &fd, cpu_online_mask);
+
+	put_online_cpus();
+
 	cpuidle_pause();
 	cpuidle_use_deepest_state(false);
 }
 
 void freeze_wake(void)
 {
-	suspend_freeze_wake = true;
-	wake_up(&suspend_freeze_wait_head);
+	if (suspend_freeze_wake != -1)
+		return;
+	suspend_freeze_wake = smp_processor_id();
 }
 EXPORT_SYMBOL_GPL(freeze_wake);
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ec1791f..23d8feb 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1114,7 +1114,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
  * xtime/wall_to_monotonic/jiffies/etc are
  * still managed by arch specific suspend/resume code.
  */
-static void timekeeping_resume(void)
+void timekeeping_resume(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	struct clocksource *clock = tk->tkr.clock;
@@ -1195,7 +1195,7 @@ static void timekeeping_resume(void)
 	hrtimers_resume();
 }
 
-static int timekeeping_suspend(void)
+int timekeeping_suspend(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned long flags;
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 4ea005a..ed7a574 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -26,4 +26,7 @@ static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
 }
 #endif
 
+extern int timekeeping_suspend(void);
+extern void timekeeping_resume(void);
+
 #endif /* _TIMEKEEPING_INTERNAL_H */
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH v2] PM / Sleep: Timer quiesce in freeze state
  2014-10-30  2:58               ` [PATCH v2] " Li, Aubrey
@ 2014-11-08  2:05                 ` Rafael J. Wysocki
  2014-11-10 11:49                   ` Peter Zijlstra
  2014-11-12 21:09                 ` Thomas Gleixner
  1 sibling, 1 reply; 28+ messages in thread
From: Rafael J. Wysocki @ 2014-11-08  2:05 UTC (permalink / raw)
  To: Li, Aubrey, Peter Zijlstra, Thomas Gleixner
  Cc: Brown, Len, alan, H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On Thursday, October 30, 2014 10:58:23 AM Li, Aubrey wrote:
> The patch is based on v3.17, merged with Rafael's pm+acpi-3.18-rc1 tag from
> linux-pm.git tree.
> 
> The patch is based on the patch PeterZ initially wrote.
> ---
> Freeze is a general power saving state that processes are frozen, devices
> are suspended and CPUs are in idle state. However, when the system enters
> freeze state, there are a few timers keep ticking and hence consumes more
> power unnecessarily. The observed timer events in freeze state are:
> - tick_sched_timer
> - watchdog lockup detector
> - realtime scheduler period timer
> 
> The system power consumption in freeze state will be reduced significantly
> if we quiesce these timers.
> 
> On Baytrail-T(ASUS_T100) platform, when the system is freezed to low power
> idle state(S0ix), quiescing these timers saves 29.8% power(94.48mw -> 66.32mw).
> 
> The patch is also tested on:
> - Sandybrdige-EP system, both RTC alarm and power button are able to wake
>   the system up from freeze state.
> - HP laptop EliteBook 8460p, both RTC alarm and power button are able to
>   wake the system up from freeze state.
> 
> Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
> Signed-off-by: Peter Zijlstra <peterz@infradead.org>
> Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> Cc: Len Brown <len.brown@intel.com>
> Cc: Alan Cox <alan@linux.intel.com>

Peter, Thomas, any comments here?

> ---
>  arch/x86/kernel/apic/apic.c        |   8 ++
>  drivers/cpuidle/cpuidle.c          |  12 +++
>  kernel/power/suspend.c             | 185 +++++++++++++++++++++++++++++++++++--
>  kernel/time/timekeeping.c          |   4 +-
>  kernel/time/timekeeping_internal.h |   3 +
>  5 files changed, 204 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> index 6776027..f2bb645 100644
> --- a/arch/x86/kernel/apic/apic.c
> +++ b/arch/x86/kernel/apic/apic.c
> @@ -917,6 +917,14 @@ static void local_apic_timer_interrupt(void)
>  	 */
>  	inc_irq_stat(apic_timer_irqs);
>  
> +	/*
> +	 * if timekeeping is suspended, the clock event device will be
> +	 * suspended as well, so we are not supposed to invoke the event
> +	 * handler of clock event device.
> +	 */
> +	if (unlikely(timekeeping_suspended))
> +		return;
> +
>  	evt->event_handler(evt);
>  }
>  
> diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
> index ee9df5e..8f84f40 100644
> --- a/drivers/cpuidle/cpuidle.c
> +++ b/drivers/cpuidle/cpuidle.c
> @@ -119,6 +119,18 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
>  	ktime_t time_start, time_end;
>  	s64 diff;
>  
> +	/*
> +	 * under the scenario of use deepest idle state, the timekeeping
> +	 * could be suspended as well as the clock source device, so we
> +	 * bypass the idle counter update for this case
> +	 */
> +	if (unlikely(use_deepest_state)) {
> +		entered_state = target_state->enter(dev, drv, index);
> +		if (!cpuidle_state_is_coupled(dev, drv, entered_state))
> +			local_irq_enable();
> +		return entered_state;
> +	}
> +
>  	trace_cpu_idle_rcuidle(index, dev->cpu);
>  	time_start = ktime_get();
>  
> diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
> index 4ca9a33..660fd15 100644
> --- a/kernel/power/suspend.c
> +++ b/kernel/power/suspend.c
> @@ -28,16 +28,20 @@
>  #include <linux/ftrace.h>
>  #include <trace/events/power.h>
>  #include <linux/compiler.h>
> +#include <linux/stop_machine.h>
> +#include <linux/clockchips.h>
> +#include <linux/hrtimer.h>
>  
>  #include "power.h"
> +#include "../time/tick-internal.h"
> +#include "../time/timekeeping_internal.h"
>  
>  const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
>  const char *pm_states[PM_SUSPEND_MAX];
>  
>  static const struct platform_suspend_ops *suspend_ops;
>  static const struct platform_freeze_ops *freeze_ops;
> -static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
> -static bool suspend_freeze_wake;
> +static int suspend_freeze_wake;
>  
>  void freeze_set_ops(const struct platform_freeze_ops *ops)
>  {
> @@ -48,22 +52,191 @@ void freeze_set_ops(const struct platform_freeze_ops *ops)
>  
>  static void freeze_begin(void)
>  {
> -	suspend_freeze_wake = false;
> +	suspend_freeze_wake = -1;
> +}
> +
> +enum freezer_state {
> +	FREEZER_NONE,
> +	FREEZER_PICK_TK,
> +	FREEZER_SUSPEND_CLKEVT,
> +	FREEZER_SUSPEND_TK,
> +	FREEZER_IDLE,
> +	FREEZER_RESUME_TK,
> +	FREEZER_RESUME_CLKEVT,
> +	FREEZER_EXIT,
> +};
> +
> +struct freezer_data {
> +	int			thread_num;
> +	atomic_t		thread_ack;
> +	enum freezer_state	state;
> +};
> +
> +static void set_state(struct freezer_data *fd, enum freezer_state state)
> +{
> +	/* set ack counter */
> +	atomic_set(&fd->thread_ack, fd->thread_num);
> +	/* guarantee the write ordering between ack counter and state */
> +	smp_wmb();
> +	fd->state = state;
> +}
> +
> +static void ack_state(struct freezer_data *fd)
> +{
> +	if (atomic_dec_and_test(&fd->thread_ack))
> +		set_state(fd, fd->state + 1);
> +}
> +
> +static void freezer_pick_tk(int cpu)
> +{
> +	if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) {
> +		static DEFINE_SPINLOCK(lock);
> +
> +		spin_lock(&lock);
> +		if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
> +			tick_do_timer_cpu = cpu;
> +		spin_unlock(&lock);
> +	}
> +}
> +
> +static void freezer_suspend_clkevt(int cpu)
> +{
> +	if (tick_do_timer_cpu == cpu)
> +		return;
> +
> +	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
> +}
> +
> +static void freezer_suspend_tk(int cpu)
> +{
> +	if (tick_do_timer_cpu != cpu)
> +		return;
> +
> +	timekeeping_suspend();
> +
> +}
> +
> +static void freezer_idle(int cpu)
> +{
> +	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
> +	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
> +
> +	stop_critical_timings();
> +
> +	while (suspend_freeze_wake == -1) {
> +		int next_state;
> +
> +		/*
> +		 * interrupt must be disabled before cpu enters idle
> +		 */
> +		local_irq_disable();
> +
> +		next_state = cpuidle_select(drv, dev);
> +		if (next_state < 0) {
> +			arch_cpu_idle();
> +			continue;
> +		}
> +		/*
> +		 * cpuidle_enter will return with interrupt enabled
> +		 */
> +		cpuidle_enter(drv, dev, next_state);
> +	}
> +
> +	if (suspend_freeze_wake == cpu)
> +		kick_all_cpus_sync();
> +
> +	/*
> +	 * We disable interrupt here for the rest of resume operations
> +	 */
> +	local_irq_disable();
> +	start_critical_timings();
> +}
> +
> +static void freezer_resume_tk(int cpu)
> +{
> +	if (tick_do_timer_cpu != cpu)
> +		return;
> +
> +	timekeeping_resume();
> +}
> +
> +static void freezer_resume_clkevt(int cpu)
> +{
> +	if (tick_do_timer_cpu == cpu) {
> +		/*
> +		 * Turn on the interrupt on the tick timer CPU as freezer
> +		 * tasks are finished.
> +		 */
> +		local_irq_enable();
> +		return;
> +	}
> +
> +	touch_softlockup_watchdog();
> +	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
> +	hrtimers_resume();
> +	/*
> +	 * Turn on the interrupt on the non-tick-timer CPUs as freezer
> +	 * tasks are finished
> +	 */
> +	local_irq_enable();
> +}
> +
> +typedef void (*freezer_fn)(int);
> +
> +static freezer_fn freezer_func[FREEZER_EXIT] = {
> +	NULL,
> +	freezer_pick_tk,
> +	freezer_suspend_clkevt,
> +	freezer_suspend_tk,
> +	freezer_idle,
> +	freezer_resume_tk,
> +	freezer_resume_clkevt,
> +};
> +
> +static int freezer_stopper_fn(void *arg)
> +{
> +	struct freezer_data *fd = arg;
> +	enum freezer_state state = FREEZER_NONE;
> +	int cpu = smp_processor_id();
> +
> +	do {
> +		cpu_relax();
> +		if (fd->state != state) {
> +			state = fd->state;
> +			if (freezer_func[state])
> +				(*freezer_func[state])(cpu);
> +			ack_state(fd);
> +		}
> +	} while (fd->state != FREEZER_EXIT);
> +
> +	return 0;
>  }
>  
>  static void freeze_enter(void)
>  {
> +	struct freezer_data fd;
> +
>  	cpuidle_use_deepest_state(true);
>  	cpuidle_resume();
> -	wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
> +
> +	get_online_cpus();
> +
> +	fd.thread_num = num_online_cpus();
> +	set_state(&fd, FREEZER_PICK_TK);
> +
> +	__stop_machine(freezer_stopper_fn, &fd, cpu_online_mask);
> +
> +	put_online_cpus();
> +
>  	cpuidle_pause();
>  	cpuidle_use_deepest_state(false);
>  }
>  
>  void freeze_wake(void)
>  {
> -	suspend_freeze_wake = true;
> -	wake_up(&suspend_freeze_wait_head);
> +	if (suspend_freeze_wake != -1)
> +		return;
> +	suspend_freeze_wake = smp_processor_id();
>  }
>  EXPORT_SYMBOL_GPL(freeze_wake);
>  
> diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
> index ec1791f..23d8feb 100644
> --- a/kernel/time/timekeeping.c
> +++ b/kernel/time/timekeeping.c
> @@ -1114,7 +1114,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
>   * xtime/wall_to_monotonic/jiffies/etc are
>   * still managed by arch specific suspend/resume code.
>   */
> -static void timekeeping_resume(void)
> +void timekeeping_resume(void)
>  {
>  	struct timekeeper *tk = &tk_core.timekeeper;
>  	struct clocksource *clock = tk->tkr.clock;
> @@ -1195,7 +1195,7 @@ static void timekeeping_resume(void)
>  	hrtimers_resume();
>  }
>  
> -static int timekeeping_suspend(void)
> +int timekeeping_suspend(void)
>  {
>  	struct timekeeper *tk = &tk_core.timekeeper;
>  	unsigned long flags;
> diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
> index 4ea005a..ed7a574 100644
> --- a/kernel/time/timekeeping_internal.h
> +++ b/kernel/time/timekeeping_internal.h
> @@ -26,4 +26,7 @@ static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
>  }
>  #endif
>  
> +extern int timekeeping_suspend(void);
> +extern void timekeeping_resume(void);
> +
>  #endif /* _TIMEKEEPING_INTERNAL_H */
> 

-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2] PM / Sleep: Timer quiesce in freeze state
  2014-11-08  2:05                 ` Rafael J. Wysocki
@ 2014-11-10 11:49                   ` Peter Zijlstra
  0 siblings, 0 replies; 28+ messages in thread
From: Peter Zijlstra @ 2014-11-10 11:49 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Li, Aubrey, Thomas Gleixner, Brown, Len, alan, H. Peter Anvin,
	linux-kernel, linux-pm@vger.kernel.org >> Linux PM list

On Sat, Nov 08, 2014 at 03:05:56AM +0100, Rafael J. Wysocki wrote:
> Peter, Thomas, any comments here?

I'm fine with this, but Thomas needs to ack; let's give him a few more
days to reply with this reminder.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2] PM / Sleep: Timer quiesce in freeze state
  2014-10-30  2:58               ` [PATCH v2] " Li, Aubrey
  2014-11-08  2:05                 ` Rafael J. Wysocki
@ 2014-11-12 21:09                 ` Thomas Gleixner
  2014-11-13  1:37                   ` Peter Zijlstra
  1 sibling, 1 reply; 28+ messages in thread
From: Thomas Gleixner @ 2014-11-12 21:09 UTC (permalink / raw)
  To: Li, Aubrey
  Cc: Peter Zijlstra, Rafael J. Wysocki, Brown, Len, alan,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On Thu, 30 Oct 2014, Li, Aubrey wrote:

> Freeze is a general power saving state that processes are frozen, devices
> are suspended and CPUs are in idle state. However, when the system enters
> freeze state, there are a few timers keep ticking and hence consumes more
> power unnecessarily. The observed timer events in freeze state are:
> - tick_sched_timer
> - watchdog lockup detector
> - realtime scheduler period timer
> 
> The system power consumption in freeze state will be reduced significantly
> if we quiesce these timers.

So the obvious question is why don't we quiesce these timers by telling
the subsystems which manage these timers to shut them down?

I really want a proper answer for this in the first place, but let me
look at the proposed "solution" as well.

> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> index 6776027..f2bb645 100644
> --- a/arch/x86/kernel/apic/apic.c
> +++ b/arch/x86/kernel/apic/apic.c
> @@ -917,6 +917,14 @@ static void local_apic_timer_interrupt(void)
>  	 */
>  	inc_irq_stat(apic_timer_irqs);
>  
> +	/*
> +	 * if timekeeping is suspended, the clock event device will be
> +	 * suspended as well, so we are not supposed to invoke the event
> +	 * handler of clock event device.
> +	 */
> +	if (unlikely(timekeeping_suspended))
> +		return;

Why do you need that if you already suspended the clock event device?
The above comment does not explain that at all.

So if there is a proper reason to do so, we rather do the following in
tick_suspend():

	td->evtdev.real_handler = td->evtdev.event_handler;
	td->evtdev.event_handler = clockevents_handle_noop;

and restore that on resume instead of sprinkling if (tk_suspended)
checks all over the place. x86/apic is probably not the only one which
wants that treatment.

But before we do that we want a proper explanation why the interrupt
fires at all. The lack of explanation clearly documents that this is a
'hacked it into submission' approach.

> diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
> index 4ca9a33..660fd15 100644
> --- a/kernel/power/suspend.c
> +++ b/kernel/power/suspend.c
> @@ -28,16 +28,20 @@
>  #include <linux/ftrace.h>
>  #include <trace/events/power.h>
>  #include <linux/compiler.h>
> +#include <linux/stop_machine.h>
> +#include <linux/clockchips.h>
> +#include <linux/hrtimer.h>
>  
>  #include "power.h"
> +#include "../time/tick-internal.h"
> +#include "../time/timekeeping_internal.h"

Eew.
  
> +static void freezer_pick_tk(int cpu)
> +{
> +	if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) {
> +		static DEFINE_SPINLOCK(lock);
> +
> +		spin_lock(&lock);
> +		if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
> +			tick_do_timer_cpu = cpu;
> +		spin_unlock(&lock);
> +	}
> +}
> +static void freezer_suspend_clkevt(int cpu)
> +{
> +	if (tick_do_timer_cpu == cpu)
> +		return;
> +
> +	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
> +}
> +
> +static void freezer_suspend_tk(int cpu)
> +{
> +	if (tick_do_timer_cpu != cpu)
> +		return;
> +
> +	timekeeping_suspend();
> +
> +}

So you export the world and some more from timekeeping and the tick
code and fiddle with it randomly just to do:

1) Suspend clock event devices
2) Suspend timekeeping
3) Resume timekeeping
4) Resume clock event devices

And for that you kick the frozen cpus out of idle into the
stomp_machine task and let them enter deep idle from there.

stomp_machine() is in 99% of all use cases a clear indicator for a
complete design failure.

It's not that hard to solve that problem, w/o stomp_machine and w/o
all the tick_do_timer_cpu mess.

1) Run the freeze code until freeze_enter()

2) Prevent CPU hotplug and switch state.

   That tells the cpu idle code to enter the deepest idle state and
   also tells the clock events code about the desire to freeze
   everything.

   clock_events_set_freeze_state(true);

   And let that be:

   clock_events_set_freeze_state(bool on)
   {
	raw_spin_lock_irq(&clockevents_lock);
	if (on)
		tobefrozen_cpus = num_online_cpus();
	idle_freeze = on;
	raw_spin_unlock_irq(&clockevents_lock);
   }

   So the generic idle task needs a check like this:

   if (idle_should_freeze())
      	frozen_idle();

   with the implementation:

   bool idle_should_freeze()
   {
	return clock_events_get_freeze_state();
   }

   which resolves to:

   bool clock_events_get_freeze_state()
   {
        /*
	 * Lockfree access because it does not matter.
	 *
	 * See below at CLOCK_EVT_NOTIFY_FREEZE
	 */
	return idle_freeze;
   }

4) Kick all cpus out of idle, so they enter the deep idle state via
   frozen_idle()

   frozen_idle()
   {
   	if (clock_events_notify(CLOCK_EVT_NOTIFY_FREEZE))
	      return;

	while (idle_should_freeze())
	      magic_frozen_idle();

	clock_events_notify(CLOCK_EVT_NOTIFY_UNFREEZE);
   }

   Let clock_events_notify() have these new cases:

   CLOCK_EVT_NOTIFY_FREEZE:
	ret = tick_freeze();
	break;

   CLOCK_EVT_NOTIFY_UNFREEZE:
	tick_unfreeze();
	break;

   and

   tick_freeze()
   {
	/*
	 * This is serialized against a concurrent wakeup
	 * via clockevents_lock!
	 */
	if (!idle_freeze)
	   return -EBUSY;

	if (--tobefrozen_cpus) {
	   tick_suspend();
	} else {
	   /*
	    * Needs to be a separate interface due to
	    * clockevents_lock being held in clock_events_notify()
	    */
	   timekeeping_freeze();
	}
   }

   and

   tick_unfreeze()
   {
	if (!timekeeping_frozen)
	   tick_resume();
	else
	   timekeeping_unfreeze();
   }

   and the wakeup notification wants to have a proper interface as
   well:

   wakeup_the_whole_thing()
   {
	do_whatever_unfreeze_needs();

	clock_events_set_freeze_state(false);
   }

5) Reenable cpu hotplug when the freezer task returns.

No stomp_machine, no tick_do_timer_cpu() abuse. All nicely serialized
via clockevents_lock.

All abortable at any given point in time and not dependent on running
through another state machine nested into the stomp_machine() state
machine.

Thoughts?

Thanks,

	tglx




^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2] PM / Sleep: Timer quiesce in freeze state
  2014-11-12 21:09                 ` Thomas Gleixner
@ 2014-11-13  1:37                   ` Peter Zijlstra
  2014-11-13  2:20                     ` Li, Aubrey
  2014-11-13  9:10                     ` Thomas Gleixner
  0 siblings, 2 replies; 28+ messages in thread
From: Peter Zijlstra @ 2014-11-13  1:37 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Li, Aubrey, Rafael J. Wysocki, Brown, Len, alan, H. Peter Anvin,
	linux-kernel, linux-pm@vger.kernel.org >> Linux PM list

On Wed, Nov 12, 2014 at 10:09:47PM +0100, Thomas Gleixner wrote:
> On Thu, 30 Oct 2014, Li, Aubrey wrote:
> 
> > Freeze is a general power saving state that processes are frozen, devices
> > are suspended and CPUs are in idle state. However, when the system enters
> > freeze state, there are a few timers keep ticking and hence consumes more
> > power unnecessarily. The observed timer events in freeze state are:
> > - tick_sched_timer
> > - watchdog lockup detector
> > - realtime scheduler period timer
> > 
> > The system power consumption in freeze state will be reduced significantly
> > if we quiesce these timers.
> 
> So the obvious question is why dont we quiesce these timers by telling
> the subsystems which manage these timers to shut them down?
> 
> I really want a proper answer for this in the first place, but let me
> look at the proposed "solution" as well.

Two arguments here:

 1) the current suspend modes don't care, so if this suspend mode starts
 to care, it's likely to 'break' in the future simply because people
 never cared about timers.

 2) there could be userland timers, userland is frozen but they'll still
 have their timers set and those can and will fire.

But sure, we can add suspend notifiers to stuff to shut down timers; I
should have a patch for at least one of the offenders somewhere. But I
really think that we should not be looking at the individual timers for
this, none of the other suspend modes care about active timers.

> But before we do that we want a proper explanation why the interrupt
> fires at all. The lack of explanation cleary documents that this is a
> 'hacked it into submission' approach.

From what I remember it's the waking interrupt that ends up in the
timekeeping code, Li should have a backtrace somewhere.

> > +#include "../time/tick-internal.h"
> > +#include "../time/timekeeping_internal.h"
> 
> Eew.

I knew you'd love that :-)

> So you export the world and some more from timekeeping and the tick
> code and fiddle with it randomly just to do:
> 
> 1) Suspend clock event devices
> 2) Suspend timekeeping
> 3) Resume timekeeping
> 4) Resume clock event devices

Sure, we can add some exports and clean that up, but..

> stomp_machine() is in 99% of all use cases a clear indicator for a
> complete design failure.

>    So the generic idle task needs a check like this:
> 
>    if (idle_should_freeze())
>       	frozen_idle();

So that is adding extra code to fairly common/hot paths just for this
one extra special case. I tried to avoid doing that.

But I suppose we can try and merge that with the offline case and guard
both special cases with a single variable or so.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2] PM / Sleep: Timer quiesce in freeze state
  2014-11-13  1:37                   ` Peter Zijlstra
@ 2014-11-13  2:20                     ` Li, Aubrey
  2014-11-13  9:19                       ` Thomas Gleixner
  2014-11-13  9:10                     ` Thomas Gleixner
  1 sibling, 1 reply; 28+ messages in thread
From: Li, Aubrey @ 2014-11-13  2:20 UTC (permalink / raw)
  To: Peter Zijlstra, Thomas Gleixner
  Cc: Rafael J. Wysocki, Brown, Len, alan, H. Peter Anvin,
	linux-kernel, linux-pm@vger.kernel.org >> Linux PM list

On 2014/11/13 9:37, Peter Zijlstra wrote:
> On Wed, Nov 12, 2014 at 10:09:47PM +0100, Thomas Gleixner wrote:
>> On Thu, 30 Oct 2014, Li, Aubrey wrote:
>>
>>> Freeze is a general power saving state that processes are frozen, devices
>>> are suspended and CPUs are in idle state. However, when the system enters
>>> freeze state, there are a few timers keep ticking and hence consumes more
>>> power unnecessarily. The observed timer events in freeze state are:
>>> - tick_sched_timer
>>> - watchdog lockup detector
>>> - realtime scheduler period timer
>>>
>>> The system power consumption in freeze state will be reduced significantly
>>> if we quiesce these timers.
>>
>> So the obvious question is why dont we quiesce these timers by telling
>> the subsystems which manage these timers to shut them down?
>>
>> I really want a proper answer for this in the first place, but let me
>> look at the proposed "solution" as well.
> 
> Two arguments here:
> 
>  1) the current suspend modes don't care, so if this suspend mode starts
>  to care, its likely to 'break' in the future simply because people
>  never cared about timers.
> 
>  2) there could be userland timers, userland is frozen but they'll still
>  have their timers set and those can and will fire.
> 
> But sure, we can add suspend notifiers to stuff to shut down timers; I
> should have a patch for at least one of the offenders somewhere. But I
> really think that we should not be looking at the individual timers for
> this, none of the other suspend modes care about active timers.
> 
>> But before we do that we want a proper explanation why the interrupt
>> fires at all. The lack of explanation cleary documents that this is a
>> 'hacked it into submission' approach.
> 
>>From what I remember its the waking interrupt that ends up in the
> timekeeping code, Li should have a backtrace somwhere.

There are two race conditions:

The first one occurs after interrupts are disabled and before we
suspend lapic. In this time slot, if an apic timer interrupt occurs, the
interrupt stays pending because interrupts are disabled. Then we
suspend timekeeping, enter idle, and exit idle with interrupts
re-enabled, so the timer interrupt is handled while timekeeping is
suspended.

The other occurs after timekeeping_suspended = 1 and before we suspend
lapic. In this time slot, if an apic timer interrupt occurs, we invoke the
timer interrupt handler while timekeeping is suspended, as above.

Thanks,
-Aubrey
> 
>>> +#include "../time/tick-internal.h"
>>> +#include "../time/timekeeping_internal.h"
>>
>> Eew.
> 
> I knew you'd love that :-)
> 
>> So you export the world and some more from timekeeping and the tick
>> code and fiddle with it randomly just to do:
>>
>> 1) Suspend clock event devices
>> 2) Suspend timekeeping
>> 3) Resume timekeeping
>> 4) Resume clock event devices
> 
> Sure, we can add some exports and clean that up, but..
> 
>> stomp_machine() is in 99% of all use cases a clear indicator for a
>> complete design failure.
> 
>>    So the generic idle task needs a check like this:
>>
>>    if (idle_should_freeze())
>>       	frozen_idle();
> 
> So that is adding extra code to fairly common/hot paths just for this
> one extra special case. I tried to avoid doing that.
> 
> But I suppose we can try and merge that with the offline case and guard
> both special cases with a single variable or so.
> 
> 


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2] PM / Sleep: Timer quiesce in freeze state
  2014-11-13  1:37                   ` Peter Zijlstra
  2014-11-13  2:20                     ` Li, Aubrey
@ 2014-11-13  9:10                     ` Thomas Gleixner
  2014-11-13 10:47                       ` Li, Aubrey
  1 sibling, 1 reply; 28+ messages in thread
From: Thomas Gleixner @ 2014-11-13  9:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Li, Aubrey, Rafael J. Wysocki, Brown, Len, alan, H. Peter Anvin,
	linux-kernel, linux-pm@vger.kernel.org >> Linux PM list

On Thu, 13 Nov 2014, Peter Zijlstra wrote:
> On Wed, Nov 12, 2014 at 10:09:47PM +0100, Thomas Gleixner wrote:
> But sure, we can add suspend notifiers to stuff to shut down timers; I
> should have a patch for at least one of the offenders somewhere. But I
> really think that we should not be looking at the individual timers for
> this, none of the other suspend modes care about active timers.

Fair enough.
 
> > But before we do that we want a proper explanation why the interrupt
> > fires at all. The lack of explanation cleary documents that this is a
> > 'hacked it into submission' approach.
> 
> >From what I remember its the waking interrupt that ends up in the
> timekeeping code, Li should have a backtrace somwhere.

I can imagine what happens :)

> > stomp_machine() is in 99% of all use cases a clear indicator for a
> > complete design failure.
> 
> >    So the generic idle task needs a check like this:
> > 
> >    if (idle_should_freeze())
> >       	frozen_idle();
> 
> So that is adding extra code to fairly common/hot paths just for this
> one extra special case. I tried to avoid doing that.

idle enter is not that much of a hot path, really.
 
Thanks,

	tglx

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2] PM / Sleep: Timer quiesce in freeze state
  2014-11-13  2:20                     ` Li, Aubrey
@ 2014-11-13  9:19                       ` Thomas Gleixner
  2014-11-13 10:50                         ` Li, Aubrey
  0 siblings, 1 reply; 28+ messages in thread
From: Thomas Gleixner @ 2014-11-13  9:19 UTC (permalink / raw)
  To: Li, Aubrey
  Cc: Peter Zijlstra, Rafael J. Wysocki, Brown, Len, alan,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On Thu, 13 Nov 2014, Li, Aubrey wrote:
> On 2014/11/13 9:37, Peter Zijlstra wrote:
> > On Wed, Nov 12, 2014 at 10:09:47PM +0100, Thomas Gleixner wrote:
> >> On Thu, 30 Oct 2014, Li, Aubrey wrote:
> >>
> >>> Freeze is a general power saving state that processes are frozen, devices
> >>> are suspended and CPUs are in idle state. However, when the system enters
> >>> freeze state, there are a few timers keep ticking and hence consumes more
> >>> power unnecessarily. The observed timer events in freeze state are:
> >>> - tick_sched_timer
> >>> - watchdog lockup detector
> >>> - realtime scheduler period timer
> >>>
> >>> The system power consumption in freeze state will be reduced significantly
> >>> if we quiesce these timers.
> >>
> >> So the obvious question is why dont we quiesce these timers by telling
> >> the subsystems which manage these timers to shut them down?
> >>
> >> I really want a proper answer for this in the first place, but let me
> >> look at the proposed "solution" as well.
> > 
> > Two arguments here:
> > 
> >  1) the current suspend modes don't care, so if this suspend mode starts
> >  to care, its likely to 'break' in the future simply because people
> >  never cared about timers.
> > 
> >  2) there could be userland timers, userland is frozen but they'll still
> >  have their timers set and those can and will fire.
> > 
> > But sure, we can add suspend notifiers to stuff to shut down timers; I
> > should have a patch for at least one of the offenders somewhere. But I
> > really think that we should not be looking at the individual timers for
> > this, none of the other suspend modes care about active timers.
> > 
> >> But before we do that we want a proper explanation why the interrupt
> >> fires at all. The lack of explanation cleary documents that this is a
> >> 'hacked it into submission' approach.
> > 
> >>From what I remember its the waking interrupt that ends up in the
> > timekeeping code, Li should have a backtrace somwhere.
> 
> There are two race conditions:
> 
> The first one occurs after the interrupt is disabled and before we
> suspend lapic. In this time slot, if apic timer interrupt occurs, the
> interrupt is pending there because the interrupt is disabled. Then we
> suspend timekeeping, and then we enter idle and exit idle with interrupt
> re-enabled, the timer interrupt is handled with timekeeping is
> suspended.
> 
> The other occurs after timekeeping_suspended = 1 and before we suspend
> lapic. In this time slot, if apic timer interrupt occurs, we invoke the
> timer interrupt while timekeeping is suspended as above.

And that race exists for every implementation and is not at all apic
timer specific. So we fix it at the core and not at some random place
in the architecture code.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2] PM / Sleep: Timer quiesce in freeze state
  2014-11-13  9:10                     ` Thomas Gleixner
@ 2014-11-13 10:47                       ` Li, Aubrey
  2014-11-13 13:06                         ` Thomas Gleixner
  0 siblings, 1 reply; 28+ messages in thread
From: Li, Aubrey @ 2014-11-13 10:47 UTC (permalink / raw)
  To: Thomas Gleixner, Peter Zijlstra
  Cc: Rafael J. Wysocki, Brown, Len, alan, H. Peter Anvin,
	linux-kernel, linux-pm@vger.kernel.org >> Linux PM list

On 2014/11/13 17:10, Thomas Gleixner wrote:
> On Thu, 13 Nov 2014, Peter Zijlstra wrote:
>> On Wed, Nov 12, 2014 at 10:09:47PM +0100, Thomas Gleixner wrote:
>> But sure, we can add suspend notifiers to stuff to shut down timers; I
>> should have a patch for at least one of the offenders somewhere. But I
>> really think that we should not be looking at the individual timers for
>> this, none of the other suspend modes care about active timers.
> 
> Fair enough.
>  

If you are okay with the current method to suspend timekeeping entirely,
then we can go further to fix the remaining concerns.

Thanks,
-Aubrey

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2] PM / Sleep: Timer quiesce in freeze state
  2014-11-13  9:19                       ` Thomas Gleixner
@ 2014-11-13 10:50                         ` Li, Aubrey
  0 siblings, 0 replies; 28+ messages in thread
From: Li, Aubrey @ 2014-11-13 10:50 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Rafael J. Wysocki, Brown, Len, alan,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On 2014/11/13 17:19, Thomas Gleixner wrote:
> On Thu, 13 Nov 2014, Li, Aubrey wrote:
>> On 2014/11/13 9:37, Peter Zijlstra wrote:
>>> On Wed, Nov 12, 2014 at 10:09:47PM +0100, Thomas Gleixner wrote:
>>>> On Thu, 30 Oct 2014, Li, Aubrey wrote:
>>>>
>>>>> Freeze is a general power saving state that processes are frozen, devices
>>>>> are suspended and CPUs are in idle state. However, when the system enters
>>>>> freeze state, there are a few timers keep ticking and hence consumes more
>>>>> power unnecessarily. The observed timer events in freeze state are:
>>>>> - tick_sched_timer
>>>>> - watchdog lockup detector
>>>>> - realtime scheduler period timer
>>>>>
>>>>> The system power consumption in freeze state will be reduced significantly
>>>>> if we quiesce these timers.
>>>>
>>>> So the obvious question is why dont we quiesce these timers by telling
>>>> the subsystems which manage these timers to shut them down?
>>>>
>>>> I really want a proper answer for this in the first place, but let me
>>>> look at the proposed "solution" as well.
>>>
>>> Two arguments here:
>>>
>>>  1) the current suspend modes don't care, so if this suspend mode starts
>>>  to care, its likely to 'break' in the future simply because people
>>>  never cared about timers.
>>>
>>>  2) there could be userland timers, userland is frozen but they'll still
>>>  have their timers set and those can and will fire.
>>>
>>> But sure, we can add suspend notifiers to stuff to shut down timers; I
>>> should have a patch for at least one of the offenders somewhere. But I
>>> really think that we should not be looking at the individual timers for
>>> this, none of the other suspend modes care about active timers.
>>>
>>>> But before we do that we want a proper explanation why the interrupt
>>>> fires at all. The lack of explanation cleary documents that this is a
>>>> 'hacked it into submission' approach.
>>>
>>> >From what I remember its the waking interrupt that ends up in the
>>> timekeeping code, Li should have a backtrace somwhere.
>>
>> There are two race conditions:
>>
>> The first one occurs after the interrupt is disabled and before we
>> suspend lapic. In this time slot, if apic timer interrupt occurs, the
>> interrupt is pending there because the interrupt is disabled. Then we
>> suspend timekeeping, and then we enter idle and exit idle with interrupt
>> re-enabled, the timer interrupt is handled with timekeeping is
>> suspended.
>>
>> The other occurs after timekeeping_suspended = 1 and before we suspend
>> lapic. In this time slot, if apic timer interrupt occurs, we invoke the
>> timer interrupt while timekeeping is suspended as above.
> 
> And that race exists for every implementation and is not at all apic
> timer specific. So we fix it at the core and not at some random place
> in the architecture code.
> 
You're right, will refine this in the next patch version.

Thanks,
-Aubrey

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2] PM / Sleep: Timer quiesce in freeze state
  2014-11-13 10:47                       ` Li, Aubrey
@ 2014-11-13 13:06                         ` Thomas Gleixner
  2014-11-14  7:58                           ` Li, Aubrey
  0 siblings, 1 reply; 28+ messages in thread
From: Thomas Gleixner @ 2014-11-13 13:06 UTC (permalink / raw)
  To: Li, Aubrey
  Cc: Peter Zijlstra, Rafael J. Wysocki, Brown, Len, alan,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On Thu, 13 Nov 2014, Li, Aubrey wrote:

> On 2014/11/13 17:10, Thomas Gleixner wrote:
> > On Thu, 13 Nov 2014, Peter Zijlstra wrote:
> >> On Wed, Nov 12, 2014 at 10:09:47PM +0100, Thomas Gleixner wrote:
> >> But sure, we can add suspend notifiers to stuff to shut down timers; I
> >> should have a patch for at least one of the offenders somewhere. But I
> >> really think that we should not be looking at the individual timers for
> >> this, none of the other suspend modes care about active timers.
> > 
> > Fair enough.
> >  
> 
> If you are okay with the current method to suspend timekeeping entirely,
> then we can go further to fix the rest concerns.

I'm fine with that when it's done proper :)

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2] PM / Sleep: Timer quiesce in freeze state
  2014-11-13 13:06                         ` Thomas Gleixner
@ 2014-11-14  7:58                           ` Li, Aubrey
  0 siblings, 0 replies; 28+ messages in thread
From: Li, Aubrey @ 2014-11-14  7:58 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Rafael J. Wysocki, Brown, Len, alan,
	H. Peter Anvin, linux-kernel,
	linux-pm@vger.kernel.org >> Linux PM list

On 2014/11/13 21:06, Thomas Gleixner wrote:
> On Thu, 13 Nov 2014, Li, Aubrey wrote:
> 
>> On 2014/11/13 17:10, Thomas Gleixner wrote:
>>> On Thu, 13 Nov 2014, Peter Zijlstra wrote:
>>>> On Wed, Nov 12, 2014 at 10:09:47PM +0100, Thomas Gleixner wrote:
>>>> But sure, we can add suspend notifiers to stuff to shut down timers; I
>>>> should have a patch for at least one of the offenders somewhere. But I
>>>> really think that we should not be looking at the individual timers for
>>>> this, none of the other suspend modes care about active timers.
>>>
>>> Fair enough.
>>>  
>>
>> If you are okay with the current method to suspend timekeeping entirely,
>> then we can go further to fix the rest concerns.
> 
> I'm fine with that when it's done proper :)
> 

Sure, thanks for the suggestion, let me try my best to make you happy, ;)

> Thanks,
> 
> 	tglx
> 
> 


^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2014-11-14  7:58 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-10-21 15:15 [RFC/PATCH] PM / Sleep: Timer quiesce in freeze state Li, Aubrey
2014-10-24 15:36 ` Peter Zijlstra
2014-10-27  6:27   ` Li, Aubrey
2014-10-27  7:28     ` Peter Zijlstra
2014-10-28  4:32       ` Li, Aubrey
2014-10-28  8:29         ` Peter Zijlstra
2014-10-28 22:46           ` Li, Aubrey
2014-10-29  8:21             ` Peter Zijlstra
2014-10-29 15:09               ` Li, Aubrey
2014-10-27  7:44     ` Peter Zijlstra
2014-10-28  7:52       ` Li, Aubrey
2014-10-28  8:25         ` Peter Zijlstra
2014-10-28 23:22           ` Li, Aubrey
2014-10-29  8:24             ` Peter Zijlstra
2014-10-30  2:58               ` [PATCH v2] " Li, Aubrey
2014-11-08  2:05                 ` Rafael J. Wysocki
2014-11-10 11:49                   ` Peter Zijlstra
2014-11-12 21:09                 ` Thomas Gleixner
2014-11-13  1:37                   ` Peter Zijlstra
2014-11-13  2:20                     ` Li, Aubrey
2014-11-13  9:19                       ` Thomas Gleixner
2014-11-13 10:50                         ` Li, Aubrey
2014-11-13  9:10                     ` Thomas Gleixner
2014-11-13 10:47                       ` Li, Aubrey
2014-11-13 13:06                         ` Thomas Gleixner
2014-11-14  7:58                           ` Li, Aubrey
2014-10-28  4:39   ` [RFC/PATCH] " Li, Aubrey
2014-10-28  8:25     ` Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).