All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3]PM/Sleep: Timer quiesce in freeze state
@ 2014-12-09  3:01 Li, Aubrey
  2015-01-14  0:24 ` Li, Aubrey
  0 siblings, 1 reply; 29+ messages in thread
From: Li, Aubrey @ 2014-12-09  3:01 UTC (permalink / raw)
  To: Thomas Gleixner, Peter Zijlstra, Rafael J. Wysocki, Brown, Len,
	alan, LKML, Linux PM list

The patch is based on v3.18.

Freeze is a general power saving state in which processes are frozen, devices
are suspended and CPUs are in idle state. However, when the system enters
freeze state, there are a few timers that keep ticking and hence consume more
power unnecessarily. The observed timer events in freeze state are:
- tick_sched_timer
- watchdog lockup detector
- realtime scheduler period timer

The system power consumption in freeze state will be reduced significantly
if we quiesce these timers.

The patch is tested on:
- Sandybridge-EP system, both RTC alarm and power button are able to wake
  the system up from freeze state.
- HP laptop EliteBook 8460p, both RTC alarm and power button are able to
  wake the system up from freeze state.
- Baytrail-T(ASUS_T100) platform, power button is able to wake the system
  up from freeze state.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Alan Cox <alan@linux.intel.com>
---
 drivers/cpuidle/cpuidle.c   | 13 ++++++++
 include/linux/clockchips.h  |  4 +++
 include/linux/suspend.h     |  4 +++
 include/linux/timekeeping.h |  2 ++
 kernel/power/suspend.c      | 50 ++++++++++++++++++++++++++---
 kernel/sched/idle.c         | 45 ++++++++++++++++++++++++++
 kernel/softirq.c            |  5 +--
 kernel/time/clockevents.c   | 13 ++++++++
 kernel/time/tick-common.c   | 53 +++++++++++++++++++++++++++++++
 kernel/time/tick-internal.h |  3 ++
 kernel/time/timekeeping.c   | 77 +++++++++++++++++++++++++++++++++------------
 11 files changed, 243 insertions(+), 26 deletions(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 125150d..b9a3ada 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -20,6 +20,7 @@
 #include <linux/hrtimer.h>
 #include <linux/module.h>
 #include <trace/events/power.h>
+#include <linux/suspend.h>
 
 #include "cpuidle.h"
 
@@ -119,6 +120,18 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 	ktime_t time_start, time_end;
 	s64 diff;
 
+	/*
+	 * under the freeze scenario, the timekeeping is suspended
+	 * as well as the clock source device, so we bypass the idle
+	 * counter update in freeze idle
+	 */
+	if (in_freeze()) {
+		entered_state = target_state->enter(dev, drv, index);
+		if (!cpuidle_state_is_coupled(dev, drv, entered_state))
+			local_irq_enable();
+		return entered_state;
+	}
+
 	trace_cpu_idle_rcuidle(index, dev->cpu);
 	time_start = ktime_get();
 
diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 2e4cb67..d118e0b 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -18,6 +18,9 @@ enum clock_event_nofitiers {
 	CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
 	CLOCK_EVT_NOTIFY_SUSPEND,
 	CLOCK_EVT_NOTIFY_RESUME,
+	CLOCK_EVT_NOTIFY_FREEZE_PREPARE,
+	CLOCK_EVT_NOTIFY_FREEZE,
+	CLOCK_EVT_NOTIFY_UNFREEZE,
 	CLOCK_EVT_NOTIFY_CPU_DYING,
 	CLOCK_EVT_NOTIFY_CPU_DEAD,
 };
@@ -95,6 +98,7 @@ enum clock_event_mode {
  */
 struct clock_event_device {
 	void			(*event_handler)(struct clock_event_device *);
+	void			(*real_handler)(struct clock_event_device *);
 	int			(*set_next_event)(unsigned long evt,
 						  struct clock_event_device *);
 	int			(*set_next_ktime)(ktime_t expires,
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 3388c1b..86a651c 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -203,6 +203,8 @@ extern void suspend_set_ops(const struct platform_suspend_ops *ops);
 extern int suspend_valid_only_mem(suspend_state_t state);
 extern void freeze_set_ops(const struct platform_freeze_ops *ops);
 extern void freeze_wake(void);
+extern bool in_freeze(void);
+extern bool idle_should_freeze(void);
 
 /**
  * arch_suspend_disable_irqs - disable IRQs for suspend
@@ -230,6 +232,8 @@ static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {}
 static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; }
 static inline void freeze_set_ops(const struct platform_freeze_ops *ops) {}
 static inline void freeze_wake(void) {}
+static inline bool in_freeze(void) { return false; }
+static inline bool idle_should_freeze(void) { return false; }
 #endif /* !CONFIG_SUSPEND */
 
 /* struct pbe is used for creating lists of pages that should be restored
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 1caa6b0..07957a9 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -5,6 +5,8 @@
 
 void timekeeping_init(void);
 extern int timekeeping_suspended;
+extern void timekeeping_freeze(void);
+extern void timekeeping_unfreeze(void);
 
 /*
  * Get and set timeofday
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c347e3c..6467fb8 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -28,6 +28,7 @@
 #include <linux/ftrace.h>
 #include <trace/events/power.h>
 #include <linux/compiler.h>
+#include <linux/clockchips.h>
 
 #include "power.h"
 
@@ -37,7 +38,15 @@ const char *pm_states[PM_SUSPEND_MAX];
 static const struct platform_suspend_ops *suspend_ops;
 static const struct platform_freeze_ops *freeze_ops;
 static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
-static bool suspend_freeze_wake;
+
+/* freeze state machine */
+enum freeze_state {
+	FREEZE_STATE_NONE,      /* not in freeze */
+	FREEZE_STATE_ENTER,     /* enter freeze */
+	FREEZE_STATE_WAKE,      /* in freeze wakeup context */
+};
+
+static enum freeze_state suspend_freeze_state;
 
 void freeze_set_ops(const struct platform_freeze_ops *ops)
 {
@@ -46,23 +55,56 @@ void freeze_set_ops(const struct platform_freeze_ops *ops)
 	unlock_system_sleep();
 }
 
+bool in_freeze(void)
+{
+	return (suspend_freeze_state > FREEZE_STATE_NONE);
+}
+EXPORT_SYMBOL_GPL(in_freeze);
+
+bool idle_should_freeze(void)
+{
+	return (suspend_freeze_state == FREEZE_STATE_ENTER);
+}
+EXPORT_SYMBOL_GPL(idle_should_freeze);
+
 static void freeze_begin(void)
 {
-	suspend_freeze_wake = false;
+	suspend_freeze_state = FREEZE_STATE_NONE;
 }
 
 static void freeze_enter(void)
 {
+	suspend_freeze_state = FREEZE_STATE_ENTER;
+	get_online_cpus();
 	cpuidle_use_deepest_state(true);
 	cpuidle_resume();
-	wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+	clockevents_notify(CLOCK_EVT_NOTIFY_FREEZE_PREPARE, NULL);
+	/*
+	 * push all the CPUs into freeze idle loop
+	 */
+	wake_up_all_idle_cpus();
+	printk(KERN_INFO "PM: suspend to idle\n");
+	/*
+	 * put the current CPU into wait queue so that this CPU
+	 * is able to enter freeze idle loop as well
+	 */
+	wait_event(suspend_freeze_wait_head,
+		(suspend_freeze_state == FREEZE_STATE_WAKE));
+	printk(KERN_INFO "PM: resume from freeze\n");
 	cpuidle_pause();
 	cpuidle_use_deepest_state(false);
+	put_online_cpus();
+	suspend_freeze_state = FREEZE_STATE_NONE;
 }
 
 void freeze_wake(void)
 {
-	suspend_freeze_wake = true;
+	if (!in_freeze())
+		return;
+	/*
+	 * wake freeze task up
+	 */
+	suspend_freeze_state = FREEZE_STATE_WAKE;
 	wake_up(&suspend_freeze_wait_head);
 }
 EXPORT_SYMBOL_GPL(freeze_wake);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c47fce7..f28f8cb 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -7,6 +7,7 @@
 #include <linux/tick.h>
 #include <linux/mm.h>
 #include <linux/stackprotector.h>
+#include <linux/suspend.h>
 
 #include <asm/tlb.h>
 
@@ -182,6 +183,45 @@ exit_idle:
 }
 
 /*
+ * cpu idle freeze function
+ */
+static void cpu_idle_freeze(void)
+{
+	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+
+	/*
+	 * suspend tick, the last CPU suspend timekeeping
+	 */
+	clockevents_notify(CLOCK_EVT_NOTIFY_FREEZE, NULL);
+	/*
+	 * idle loop here if idle_should_freeze()
+	 */
+	while (idle_should_freeze()) {
+		int next_state;
+		/*
+		 * interrupt must be disabled before cpu enters idle
+		 */
+		local_irq_disable();
+
+		next_state = cpuidle_select(drv, dev);
+		if (next_state < 0) {
+			arch_cpu_idle();
+			continue;
+		}
+		/*
+		 * cpuidle_enter will return with interrupt enabled
+		 */
+		cpuidle_enter(drv, dev, next_state);
+	}
+
+	/*
+	 * resume tick, the first wakeup CPU resume timekeeping
+	 */
+	clockevents_notify(CLOCK_EVT_NOTIFY_UNFREEZE, NULL);
+}
+
+/*
  * Generic idle loop implementation
  *
  * Called with polling cleared.
@@ -208,6 +248,11 @@ static void cpu_idle_loop(void)
 			if (cpu_is_offline(smp_processor_id()))
 				arch_cpu_idle_dead();
 
+			if (idle_should_freeze()) {
+				cpu_idle_freeze();
+				continue;
+			}
+
 			local_irq_disable();
 			arch_cpu_idle_enter();
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0699add..a231bf6 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -26,6 +26,7 @@
 #include <linux/smpboot.h>
 #include <linux/tick.h>
 #include <linux/irq.h>
+#include <linux/suspend.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/irq.h>
@@ -321,7 +322,7 @@ asmlinkage __visible void do_softirq(void)
 void irq_enter(void)
 {
 	rcu_irq_enter();
-	if (is_idle_task(current) && !in_interrupt()) {
+	if (is_idle_task(current) && !in_interrupt() && !in_freeze()) {
 		/*
 		 * Prevent raise_softirq from needlessly waking up ksoftirqd
 		 * here, as softirq will be serviced on return from interrupt.
@@ -364,7 +365,7 @@ static inline void tick_irq_exit(void)
 
 	/* Make sure that timer wheel updates are propagated */
 	if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
-		if (!in_interrupt())
+		if (!in_interrupt() && !in_freeze())
 			tick_nohz_irq_exit();
 	}
 #endif
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 5544990..6d9a4a3 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -17,6 +17,7 @@
 #include <linux/module.h>
 #include <linux/smp.h>
 #include <linux/device.h>
+#include <linux/suspend.h>
 
 #include "tick-internal.h"
 
@@ -579,6 +580,18 @@ int clockevents_notify(unsigned long reason, void *arg)
 		tick_resume();
 		break;
 
+	case CLOCK_EVT_NOTIFY_FREEZE_PREPARE:
+		tick_freeze_prepare();
+		break;
+
+	case CLOCK_EVT_NOTIFY_FREEZE:
+		tick_freeze();
+		break;
+
+	case CLOCK_EVT_NOTIFY_UNFREEZE:
+		tick_unfreeze();
+		break;
+
 	case CLOCK_EVT_NOTIFY_CPU_DEAD:
 		tick_shutdown_broadcast_oneshot(arg);
 		tick_shutdown_broadcast(arg);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 7efeedf..0bbc886 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -19,6 +19,7 @@
 #include <linux/profile.h>
 #include <linux/sched.h>
 #include <linux/module.h>
+#include <linux/suspend.h>
 
 #include <asm/irq_regs.h>
 
@@ -51,6 +52,16 @@ ktime_t tick_period;
 int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
 
 /*
+ * Tick device is per CPU device, when we freeze the timekeeping stuff, we
+ * want to freeze the tick device on all of the online CPUs.
+ *
+ * tick_freeze_target_depth is a counter used for freezing tick device, the
+ * initial value of it is online CPU number. When it is counted down to ZERO,
+ * all of the tick devices are freezed.
+ */
+static unsigned int tick_freeze_target_depth;
+
+/*
  * Debugging: see timer_list.c
  */
 struct tick_device *tick_get_device(int cpu)
@@ -375,15 +386,21 @@ void tick_shutdown(unsigned int *cpup)
 void tick_suspend(void)
 {
 	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+	struct clock_event_device *dev = td->evtdev;
 
+	dev->real_handler = dev->event_handler;
+	dev->event_handler = clockevents_handle_noop;
 	clockevents_shutdown(td->evtdev);
 }
 
 void tick_resume(void)
 {
 	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+	struct clock_event_device *dev = td->evtdev;
 	int broadcast = tick_resume_broadcast();
 
+	dev->event_handler = dev->real_handler;
+	dev->real_handler = NULL;
 	clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
 
 	if (!broadcast) {
@@ -394,6 +411,42 @@ void tick_resume(void)
 	}
 }
 
+void tick_freeze_prepare(void)
+{
+	tick_freeze_target_depth = num_online_cpus();
+}
+
+void tick_freeze(void)
+{
+	/*
+	 * This is serialized against a concurrent wakeup
+	 * via clockevents_lock
+	 */
+	tick_freeze_target_depth--;
+	tick_suspend();
+
+	/*
+	 * the last tick_suspend CPU suspends timekeeping
+	 */
+	if (!tick_freeze_target_depth)
+		timekeeping_freeze();
+}
+
+void tick_unfreeze(void)
+{
+	/*
+	 * the first wakeup CPU resumes timekeeping
+	 */
+	if (timekeeping_suspended) {
+		timekeeping_unfreeze();
+		touch_softlockup_watchdog();
+		tick_resume();
+		hrtimers_resume();
+	} else {
+		tick_resume();
+	}
+}
+
 /**
  * tick_init - initialize the tick control
  */
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 366aeb4..8b5bab6 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -27,6 +27,9 @@ extern void tick_handover_do_timer(int *cpup);
 extern void tick_shutdown(unsigned int *cpup);
 extern void tick_suspend(void);
 extern void tick_resume(void);
+extern void tick_freeze_prepare(void);
+extern void tick_freeze(void);
+extern void tick_unfreeze(void);
 extern bool tick_check_replacement(struct clock_event_device *curdev,
 				   struct clock_event_device *newdev);
 extern void tick_install_replacement(struct clock_event_device *dev);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ec1791f..a11065f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1107,14 +1107,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
 	clock_was_set();
 }
 
-/**
- * timekeeping_resume - Resumes the generic timekeeping subsystem.
- *
- * This is for the generic clocksource timekeeping.
- * xtime/wall_to_monotonic/jiffies/etc are
- * still managed by arch specific suspend/resume code.
- */
-static void timekeeping_resume(void)
+static void timekeeping_resume_compensate_time(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	struct clocksource *clock = tk->tkr.clock;
@@ -1127,9 +1120,6 @@ static void timekeeping_resume(void)
 	read_persistent_clock(&tmp);
 	ts_new = timespec_to_timespec64(tmp);
 
-	clockevents_resume();
-	clocksource_resume();
-
 	raw_spin_lock_irqsave(&timekeeper_lock, flags);
 	write_seqcount_begin(&tk_core.seq);
 
@@ -1186,16 +1176,9 @@ static void timekeeping_resume(void)
 	timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
 	write_seqcount_end(&tk_core.seq);
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
-
-	touch_softlockup_watchdog();
-
-	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-
-	/* Resume hrtimers */
-	hrtimers_resume();
 }
 
-static int timekeeping_suspend(void)
+static void timekeeping_suspend_get_time(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned long flags;
@@ -1242,11 +1225,65 @@ static int timekeeping_suspend(void)
 	timekeeping_update(tk, TK_MIRROR);
 	write_seqcount_end(&tk_core.seq);
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+}
 
-	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+/*
+ * The following operations:
+ *	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+ *	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
+ * are moved out of
+ *	timekeeping_freeze() and timekeeping_unfreeze()
+ * and, replaced by
+ *	tick_suspend() and tick_resume()
+ * and, put into:
+ *	tick_freeze() and tick_unfreeze()
+ * so we avoid clockevents_lock multiple access
+ */
+void timekeeping_freeze(void)
+{
+	/*
+	 * clockevents_lock being held
+	 */
+	timekeeping_suspend_get_time();
 	clocksource_suspend();
 	clockevents_suspend();
+}
 
+void timekeeping_unfreeze(void)
+{
+	/*
+	 * clockevents_lock being held
+	 */
+	clockevents_resume();
+	clocksource_resume();
+	timekeeping_resume_compensate_time();
+}
+
+/**
+ * timekeeping_resume - Resumes the generic timekeeping subsystem.
+ *
+ * This is for the generic clocksource timekeeping.
+ * xtime/wall_to_monotonic/jiffies/etc are
+ * still managed by arch specific suspend/resume code.
+ */
+static void timekeeping_resume(void)
+{
+	clockevents_resume();
+	clocksource_resume();
+	timekeeping_resume_compensate_time();
+
+	touch_softlockup_watchdog();
+	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
+	/* Resume hrtimers */
+	hrtimers_resume();
+}
+
+static int timekeeping_suspend(void)
+{
+	timekeeping_suspend_get_time();
+	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+	clocksource_suspend();
+	clockevents_suspend();
 	return 0;
 }
 
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2014-12-09  3:01 [PATCH v3]PM/Sleep: Timer quiesce in freeze state Li, Aubrey
@ 2015-01-14  0:24 ` Li, Aubrey
  2015-01-19 15:24   ` Rafael J. Wysocki
  0 siblings, 1 reply; 29+ messages in thread
From: Li, Aubrey @ 2015-01-14  0:24 UTC (permalink / raw)
  To: Thomas Gleixner, Peter Zijlstra, Rafael J. Wysocki, Brown, Len,
	alan, LKML, Linux PM list

Happy New Year, can you please take a look at this patch now?

Thanks,
-Aubrey

On 2014/12/9 11:01, Li, Aubrey wrote:
> The patch is based on v3.18.
> 
> Freeze is a general power saving state that processes are frozen, devices
> are suspended and CPUs are in idle state. However, when the system enters
> freeze state, there are a few timers keep ticking and hence consumes more
> power unnecessarily. The observed timer events in freeze state are:
> - tick_sched_timer
> - watchdog lockup detector
> - realtime scheduler period timer
> 
> The system power consumption in freeze state will be reduced significantly
> if we quiesce these timers.
> 
> The patch is tested on:
> - Sandybrdige-EP system, both RTC alarm and power button are able to wake
>   the system up from freeze state.
> - HP laptop EliteBook 8460p, both RTC alarm and power button are able to
>   wake the system up from freeze state.
> - Baytrail-T(ASUS_T100) platform, power button is able to wake the system
>   up from freeze state.
> 
> Suggested-by: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> Cc: Len Brown <len.brown@intel.com>
> Cc: Alan Cox <alan@linux.intel.com>
> ---
>  drivers/cpuidle/cpuidle.c   | 13 ++++++++
>  include/linux/clockchips.h  |  4 +++
>  include/linux/suspend.h     |  4 +++
>  include/linux/timekeeping.h |  2 ++
>  kernel/power/suspend.c      | 50 ++++++++++++++++++++++++++---
>  kernel/sched/idle.c         | 45 ++++++++++++++++++++++++++
>  kernel/softirq.c            |  5 +--
>  kernel/time/clockevents.c   | 13 ++++++++
>  kernel/time/tick-common.c   | 53 +++++++++++++++++++++++++++++++
>  kernel/time/tick-internal.h |  3 ++
>  kernel/time/timekeeping.c   | 77 +++++++++++++++++++++++++++++++++------------
>  11 files changed, 243 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
> index 125150d..b9a3ada 100644
> --- a/drivers/cpuidle/cpuidle.c
> +++ b/drivers/cpuidle/cpuidle.c
> @@ -20,6 +20,7 @@
>  #include <linux/hrtimer.h>
>  #include <linux/module.h>
>  #include <trace/events/power.h>
> +#include <linux/suspend.h>
>  
>  #include "cpuidle.h"
>  
> @@ -119,6 +120,18 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
>  	ktime_t time_start, time_end;
>  	s64 diff;
>  
> +	/*
> +	 * under the freeze scenario, the timekeeping is suspended
> +	 * as well as the clock source device, so we bypass the idle
> +	 * counter update in freeze idle
> +	 */
> +	if (in_freeze()) {
> +		entered_state = target_state->enter(dev, drv, index);
> +		if (!cpuidle_state_is_coupled(dev, drv, entered_state))
> +			local_irq_enable();
> +		return entered_state;
> +	}
> +
>  	trace_cpu_idle_rcuidle(index, dev->cpu);
>  	time_start = ktime_get();
>  
> diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
> index 2e4cb67..d118e0b 100644
> --- a/include/linux/clockchips.h
> +++ b/include/linux/clockchips.h
> @@ -18,6 +18,9 @@ enum clock_event_nofitiers {
>  	CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
>  	CLOCK_EVT_NOTIFY_SUSPEND,
>  	CLOCK_EVT_NOTIFY_RESUME,
> +	CLOCK_EVT_NOTIFY_FREEZE_PREPARE,
> +	CLOCK_EVT_NOTIFY_FREEZE,
> +	CLOCK_EVT_NOTIFY_UNFREEZE,
>  	CLOCK_EVT_NOTIFY_CPU_DYING,
>  	CLOCK_EVT_NOTIFY_CPU_DEAD,
>  };
> @@ -95,6 +98,7 @@ enum clock_event_mode {
>   */
>  struct clock_event_device {
>  	void			(*event_handler)(struct clock_event_device *);
> +	void			(*real_handler)(struct clock_event_device *);
>  	int			(*set_next_event)(unsigned long evt,
>  						  struct clock_event_device *);
>  	int			(*set_next_ktime)(ktime_t expires,
> diff --git a/include/linux/suspend.h b/include/linux/suspend.h
> index 3388c1b..86a651c 100644
> --- a/include/linux/suspend.h
> +++ b/include/linux/suspend.h
> @@ -203,6 +203,8 @@ extern void suspend_set_ops(const struct platform_suspend_ops *ops);
>  extern int suspend_valid_only_mem(suspend_state_t state);
>  extern void freeze_set_ops(const struct platform_freeze_ops *ops);
>  extern void freeze_wake(void);
> +extern bool in_freeze(void);
> +extern bool idle_should_freeze(void);
>  
>  /**
>   * arch_suspend_disable_irqs - disable IRQs for suspend
> @@ -230,6 +232,8 @@ static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {}
>  static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; }
>  static inline void freeze_set_ops(const struct platform_freeze_ops *ops) {}
>  static inline void freeze_wake(void) {}
> +static inline bool in_freeze(void) { return false; }
> +static inline bool idle_should_freeze(void) { return false; }
>  #endif /* !CONFIG_SUSPEND */
>  
>  /* struct pbe is used for creating lists of pages that should be restored
> diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
> index 1caa6b0..07957a9 100644
> --- a/include/linux/timekeeping.h
> +++ b/include/linux/timekeeping.h
> @@ -5,6 +5,8 @@
>  
>  void timekeeping_init(void);
>  extern int timekeeping_suspended;
> +extern void timekeeping_freeze(void);
> +extern void timekeeping_unfreeze(void);
>  
>  /*
>   * Get and set timeofday
> diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
> index c347e3c..6467fb8 100644
> --- a/kernel/power/suspend.c
> +++ b/kernel/power/suspend.c
> @@ -28,6 +28,7 @@
>  #include <linux/ftrace.h>
>  #include <trace/events/power.h>
>  #include <linux/compiler.h>
> +#include <linux/clockchips.h>
>  
>  #include "power.h"
>  
> @@ -37,7 +38,15 @@ const char *pm_states[PM_SUSPEND_MAX];
>  static const struct platform_suspend_ops *suspend_ops;
>  static const struct platform_freeze_ops *freeze_ops;
>  static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
> -static bool suspend_freeze_wake;
> +
> +/* freeze state machine */
> +enum freeze_state {
> +	FREEZE_STATE_NONE,      /* not in freeze */
> +	FREEZE_STATE_ENTER,     /* enter freeze */
> +	FREEZE_STATE_WAKE,      /* in freeze wakeup context */
> +};
> +
> +static enum freeze_state suspend_freeze_state;
>  
>  void freeze_set_ops(const struct platform_freeze_ops *ops)
>  {
> @@ -46,23 +55,56 @@ void freeze_set_ops(const struct platform_freeze_ops *ops)
>  	unlock_system_sleep();
>  }
>  
> +bool in_freeze(void)
> +{
> +	return (suspend_freeze_state > FREEZE_STATE_NONE);
> +}
> +EXPORT_SYMBOL_GPL(in_freeze);
> +
> +bool idle_should_freeze(void)
> +{
> +	return (suspend_freeze_state == FREEZE_STATE_ENTER);
> +}
> +EXPORT_SYMBOL_GPL(idle_should_freeze);
> +
>  static void freeze_begin(void)
>  {
> -	suspend_freeze_wake = false;
> +	suspend_freeze_state = FREEZE_STATE_NONE;
>  }
>  
>  static void freeze_enter(void)
>  {
> +	suspend_freeze_state = FREEZE_STATE_ENTER;
> +	get_online_cpus();
>  	cpuidle_use_deepest_state(true);
>  	cpuidle_resume();
> -	wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
> +	clockevents_notify(CLOCK_EVT_NOTIFY_FREEZE_PREPARE, NULL);
> +	/*
> +	 * push all the CPUs into freeze idle loop
> +	 */
> +	wake_up_all_idle_cpus();
> +	printk(KERN_INFO "PM: suspend to idle\n");
> +	/*
> +	 * put the current CPU into wait queue so that this CPU
> +	 * is able to enter freeze idle loop as well
> +	 */
> +	wait_event(suspend_freeze_wait_head,
> +		(suspend_freeze_state == FREEZE_STATE_WAKE));
> +	printk(KERN_INFO "PM: resume from freeze\n");
>  	cpuidle_pause();
>  	cpuidle_use_deepest_state(false);
> +	put_online_cpus();
> +	suspend_freeze_state = FREEZE_STATE_NONE;
>  }
>  
>  void freeze_wake(void)
>  {
> -	suspend_freeze_wake = true;
> +	if (!in_freeze())
> +		return;
> +	/*
> +	 * wake freeze task up
> +	 */
> +	suspend_freeze_state = FREEZE_STATE_WAKE;
>  	wake_up(&suspend_freeze_wait_head);
>  }
>  EXPORT_SYMBOL_GPL(freeze_wake);
> diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
> index c47fce7..f28f8cb 100644
> --- a/kernel/sched/idle.c
> +++ b/kernel/sched/idle.c
> @@ -7,6 +7,7 @@
>  #include <linux/tick.h>
>  #include <linux/mm.h>
>  #include <linux/stackprotector.h>
> +#include <linux/suspend.h>
>  
>  #include <asm/tlb.h>
>  
> @@ -182,6 +183,45 @@ exit_idle:
>  }
>  
>  /*
> + * cpu idle freeze function
> + */
> +static void cpu_idle_freeze(void)
> +{
> +	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
> +	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
> +
> +	/*
> +	 * suspend tick, the last CPU suspend timekeeping
> +	 */
> +	clockevents_notify(CLOCK_EVT_NOTIFY_FREEZE, NULL);
> +	/*
> +	 * idle loop here if idle_should_freeze()
> +	 */
> +	while (idle_should_freeze()) {
> +		int next_state;
> +		/*
> +		 * interrupt must be disabled before cpu enters idle
> +		 */
> +		local_irq_disable();
> +
> +		next_state = cpuidle_select(drv, dev);
> +		if (next_state < 0) {
> +			arch_cpu_idle();
> +			continue;
> +		}
> +		/*
> +		 * cpuidle_enter will return with interrupt enabled
> +		 */
> +		cpuidle_enter(drv, dev, next_state);
> +	}
> +
> +	/*
> +	 * resume tick, the first wakeup CPU resume timekeeping
> +	 */
> +	clockevents_notify(CLOCK_EVT_NOTIFY_UNFREEZE, NULL);
> +}
> +
> +/*
>   * Generic idle loop implementation
>   *
>   * Called with polling cleared.
> @@ -208,6 +248,11 @@ static void cpu_idle_loop(void)
>  			if (cpu_is_offline(smp_processor_id()))
>  				arch_cpu_idle_dead();
>  
> +			if (idle_should_freeze()) {
> +				cpu_idle_freeze();
> +				continue;
> +			}
> +
>  			local_irq_disable();
>  			arch_cpu_idle_enter();
>  
> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index 0699add..a231bf6 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -26,6 +26,7 @@
>  #include <linux/smpboot.h>
>  #include <linux/tick.h>
>  #include <linux/irq.h>
> +#include <linux/suspend.h>
>  
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/irq.h>
> @@ -321,7 +322,7 @@ asmlinkage __visible void do_softirq(void)
>  void irq_enter(void)
>  {
>  	rcu_irq_enter();
> -	if (is_idle_task(current) && !in_interrupt()) {
> +	if (is_idle_task(current) && !in_interrupt() && !in_freeze()) {
>  		/*
>  		 * Prevent raise_softirq from needlessly waking up ksoftirqd
>  		 * here, as softirq will be serviced on return from interrupt.
> @@ -364,7 +365,7 @@ static inline void tick_irq_exit(void)
>  
>  	/* Make sure that timer wheel updates are propagated */
>  	if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
> -		if (!in_interrupt())
> +		if (!in_interrupt() && !in_freeze())
>  			tick_nohz_irq_exit();
>  	}
>  #endif
> diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
> index 5544990..6d9a4a3 100644
> --- a/kernel/time/clockevents.c
> +++ b/kernel/time/clockevents.c
> @@ -17,6 +17,7 @@
>  #include <linux/module.h>
>  #include <linux/smp.h>
>  #include <linux/device.h>
> +#include <linux/suspend.h>
>  
>  #include "tick-internal.h"
>  
> @@ -579,6 +580,18 @@ int clockevents_notify(unsigned long reason, void *arg)
>  		tick_resume();
>  		break;
>  
> +	case CLOCK_EVT_NOTIFY_FREEZE_PREPARE:
> +		tick_freeze_prepare();
> +		break;
> +
> +	case CLOCK_EVT_NOTIFY_FREEZE:
> +		tick_freeze();
> +		break;
> +
> +	case CLOCK_EVT_NOTIFY_UNFREEZE:
> +		tick_unfreeze();
> +		break;
> +
>  	case CLOCK_EVT_NOTIFY_CPU_DEAD:
>  		tick_shutdown_broadcast_oneshot(arg);
>  		tick_shutdown_broadcast(arg);
> diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> index 7efeedf..0bbc886 100644
> --- a/kernel/time/tick-common.c
> +++ b/kernel/time/tick-common.c
> @@ -19,6 +19,7 @@
>  #include <linux/profile.h>
>  #include <linux/sched.h>
>  #include <linux/module.h>
> +#include <linux/suspend.h>
>  
>  #include <asm/irq_regs.h>
>  
> @@ -51,6 +52,16 @@ ktime_t tick_period;
>  int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
>  
>  /*
> + * Tick device is per CPU device, when we freeze the timekeeping stuff, we
> + * want to freeze the tick device on all of the online CPUs.
> + *
> + * tick_freeze_target_depth is a counter used for freezing tick device, the
> + * initial value of it is online CPU number. When it is counted down to ZERO,
> + * all of the tick devices are freezed.
> + */
> +static unsigned int tick_freeze_target_depth;
> +
> +/*
>   * Debugging: see timer_list.c
>   */
>  struct tick_device *tick_get_device(int cpu)
> @@ -375,15 +386,21 @@ void tick_shutdown(unsigned int *cpup)
>  void tick_suspend(void)
>  {
>  	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
> +	struct clock_event_device *dev = td->evtdev;
>  
> +	dev->real_handler = dev->event_handler;
> +	dev->event_handler = clockevents_handle_noop;
>  	clockevents_shutdown(td->evtdev);
>  }
>  
>  void tick_resume(void)
>  {
>  	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
> +	struct clock_event_device *dev = td->evtdev;
>  	int broadcast = tick_resume_broadcast();
>  
> +	dev->event_handler = dev->real_handler;
> +	dev->real_handler = NULL;
>  	clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
>  
>  	if (!broadcast) {
> @@ -394,6 +411,42 @@ void tick_resume(void)
>  	}
>  }
>  
> +void tick_freeze_prepare(void)
> +{
> +	tick_freeze_target_depth = num_online_cpus();
> +}
> +
> +void tick_freeze(void)
> +{
> +	/*
> +	 * This is serialized against a concurrent wakeup
> +	 * via clockevents_lock
> +	 */
> +	tick_freeze_target_depth--;
> +	tick_suspend();
> +
> +	/*
> +	 * the last tick_suspend CPU suspends timekeeping
> +	 */
> +	if (!tick_freeze_target_depth)
> +		timekeeping_freeze();
> +}
> +
> +void tick_unfreeze(void)
> +{
> +	/*
> +	 * the first wakeup CPU resumes timekeeping
> +	 */
> +	if (timekeeping_suspended) {
> +		timekeeping_unfreeze();
> +		touch_softlockup_watchdog();
> +		tick_resume();
> +		hrtimers_resume();
> +	} else {
> +		tick_resume();
> +	}
> +}
> +
>  /**
>   * tick_init - initialize the tick control
>   */
> diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
> index 366aeb4..8b5bab6 100644
> --- a/kernel/time/tick-internal.h
> +++ b/kernel/time/tick-internal.h
> @@ -27,6 +27,9 @@ extern void tick_handover_do_timer(int *cpup);
>  extern void tick_shutdown(unsigned int *cpup);
>  extern void tick_suspend(void);
>  extern void tick_resume(void);
> +extern void tick_freeze_prepare(void);
> +extern void tick_freeze(void);
> +extern void tick_unfreeze(void);
>  extern bool tick_check_replacement(struct clock_event_device *curdev,
>  				   struct clock_event_device *newdev);
>  extern void tick_install_replacement(struct clock_event_device *dev);
> diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
> index ec1791f..a11065f 100644
> --- a/kernel/time/timekeeping.c
> +++ b/kernel/time/timekeeping.c
> @@ -1107,14 +1107,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
>  	clock_was_set();
>  }
>  
> -/**
> - * timekeeping_resume - Resumes the generic timekeeping subsystem.
> - *
> - * This is for the generic clocksource timekeeping.
> - * xtime/wall_to_monotonic/jiffies/etc are
> - * still managed by arch specific suspend/resume code.
> - */
> -static void timekeeping_resume(void)
> +static void timekeeping_resume_compensate_time(void)
>  {
>  	struct timekeeper *tk = &tk_core.timekeeper;
>  	struct clocksource *clock = tk->tkr.clock;
> @@ -1127,9 +1120,6 @@ static void timekeeping_resume(void)
>  	read_persistent_clock(&tmp);
>  	ts_new = timespec_to_timespec64(tmp);
>  
> -	clockevents_resume();
> -	clocksource_resume();
> -
>  	raw_spin_lock_irqsave(&timekeeper_lock, flags);
>  	write_seqcount_begin(&tk_core.seq);
>  
> @@ -1186,16 +1176,9 @@ static void timekeeping_resume(void)
>  	timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
>  	write_seqcount_end(&tk_core.seq);
>  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
> -
> -	touch_softlockup_watchdog();
> -
> -	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
> -
> -	/* Resume hrtimers */
> -	hrtimers_resume();
>  }
>  
> -static int timekeeping_suspend(void)
> +static void timekeeping_suspend_get_time(void)
>  {
>  	struct timekeeper *tk = &tk_core.timekeeper;
>  	unsigned long flags;
> @@ -1242,11 +1225,65 @@ static int timekeeping_suspend(void)
>  	timekeeping_update(tk, TK_MIRROR);
>  	write_seqcount_end(&tk_core.seq);
>  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
> +}
>  
> -	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
> +/*
> + * The following operations:
> + *	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
> + *	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
> + * are moved out of
> + *	timekeeping_freeze() and timekeeping_unfreeze()
> + * and, replaced by
> + *	tick_suspend() and tick_resume()
> + * and, put into:
> + *	tick_freeze() and tick_unfreeze()
> + * so we avoid clockevents_lock multiple access
> + */
> +void timekeeping_freeze(void)
> +{
> +	/*
> +	 * clockevents_lock being held
> +	 */
> +	timekeeping_suspend_get_time();
>  	clocksource_suspend();
>  	clockevents_suspend();
> +}
>  
> +void timekeeping_unfreeze(void)
> +{
> +	/*
> +	 * clockevents_lock being held
> +	 */
> +	clockevents_resume();
> +	clocksource_resume();
> +	timekeeping_resume_compensate_time();
> +}
> +
> +/**
> + * timekeeping_resume - Resumes the generic timekeeping subsystem.
> + *
> + * This is for the generic clocksource timekeeping.
> + * xtime/wall_to_monotonic/jiffies/etc are
> + * still managed by arch specific suspend/resume code.
> + */
> +static void timekeeping_resume(void)
> +{
> +	clockevents_resume();
> +	clocksource_resume();
> +	timekeeping_resume_compensate_time();
> +
> +	touch_softlockup_watchdog();
> +	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
> +	/* Resume hrtimers */
> +	hrtimers_resume();
> +}
> +
> +static int timekeeping_suspend(void)
> +{
> +	timekeeping_suspend_get_time();
> +	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
> +	clocksource_suspend();
> +	clockevents_suspend();
>  	return 0;
>  }
>  
> 


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-01-14  0:24 ` Li, Aubrey
@ 2015-01-19 15:24   ` Rafael J. Wysocki
  2015-01-22 10:15     ` Thomas Gleixner
  0 siblings, 1 reply; 29+ messages in thread
From: Rafael J. Wysocki @ 2015-01-19 15:24 UTC (permalink / raw)
  To: Li, Aubrey, Thomas Gleixner, Peter Zijlstra
  Cc: Brown, Len, alan, LKML, Linux PM list

On Wednesday, January 14, 2015 08:24:04 AM Li, Aubrey wrote:
> Happy New Year, can you please take a look at this patch now?

Thomas, Peter, any comments?


> On 2014/12/9 11:01, Li, Aubrey wrote:
> > The patch is based on v3.18.
> > 
> > Freeze is a general power saving state that processes are frozen, devices
> > are suspended and CPUs are in idle state. However, when the system enters
> > freeze state, there are a few timers that keep ticking and hence consume more
> > power unnecessarily. The observed timer events in freeze state are:
> > - tick_sched_timer
> > - watchdog lockup detector
> > - realtime scheduler period timer
> > 
> > The system power consumption in freeze state will be reduced significantly
> > if we quiesce these timers.
> > 
> > The patch is tested on:
> > - Sandybridge-EP system, both RTC alarm and power button are able to wake
> >   the system up from freeze state.
> > - HP laptop EliteBook 8460p, both RTC alarm and power button are able to
> >   wake the system up from freeze state.
> > - Baytrail-T(ASUS_T100) platform, power button is able to wake the system
> >   up from freeze state.
> > 
> > Suggested-by: Thomas Gleixner <tglx@linutronix.de>
> > Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> > Cc: Len Brown <len.brown@intel.com>
> > Cc: Alan Cox <alan@linux.intel.com>
> > ---
> >  drivers/cpuidle/cpuidle.c   | 13 ++++++++
> >  include/linux/clockchips.h  |  4 +++
> >  include/linux/suspend.h     |  4 +++
> >  include/linux/timekeeping.h |  2 ++
> >  kernel/power/suspend.c      | 50 ++++++++++++++++++++++++++---
> >  kernel/sched/idle.c         | 45 ++++++++++++++++++++++++++
> >  kernel/softirq.c            |  5 +--
> >  kernel/time/clockevents.c   | 13 ++++++++
> >  kernel/time/tick-common.c   | 53 +++++++++++++++++++++++++++++++
> >  kernel/time/tick-internal.h |  3 ++
> >  kernel/time/timekeeping.c   | 77 +++++++++++++++++++++++++++++++++------------
> >  11 files changed, 243 insertions(+), 26 deletions(-)
> > 
> > diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
> > index 125150d..b9a3ada 100644
> > --- a/drivers/cpuidle/cpuidle.c
> > +++ b/drivers/cpuidle/cpuidle.c
> > @@ -20,6 +20,7 @@
> >  #include <linux/hrtimer.h>
> >  #include <linux/module.h>
> >  #include <trace/events/power.h>
> > +#include <linux/suspend.h>
> >  
> >  #include "cpuidle.h"
> >  
> > @@ -119,6 +120,18 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
> >  	ktime_t time_start, time_end;
> >  	s64 diff;
> >  
> > +	/*
> > +	 * under the freeze scenario, the timekeeping is suspended
> > +	 * as well as the clock source device, so we bypass the idle
> > +	 * counter update in freeze idle
> > +	 */
> > +	if (in_freeze()) {
> > +		entered_state = target_state->enter(dev, drv, index);
> > +		if (!cpuidle_state_is_coupled(dev, drv, entered_state))
> > +			local_irq_enable();
> > +		return entered_state;
> > +	}
> > +
> >  	trace_cpu_idle_rcuidle(index, dev->cpu);
> >  	time_start = ktime_get();
> >  
> > diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
> > index 2e4cb67..d118e0b 100644
> > --- a/include/linux/clockchips.h
> > +++ b/include/linux/clockchips.h
> > @@ -18,6 +18,9 @@ enum clock_event_nofitiers {
> >  	CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
> >  	CLOCK_EVT_NOTIFY_SUSPEND,
> >  	CLOCK_EVT_NOTIFY_RESUME,
> > +	CLOCK_EVT_NOTIFY_FREEZE_PREPARE,
> > +	CLOCK_EVT_NOTIFY_FREEZE,
> > +	CLOCK_EVT_NOTIFY_UNFREEZE,
> >  	CLOCK_EVT_NOTIFY_CPU_DYING,
> >  	CLOCK_EVT_NOTIFY_CPU_DEAD,
> >  };
> > @@ -95,6 +98,7 @@ enum clock_event_mode {
> >   */
> >  struct clock_event_device {
> >  	void			(*event_handler)(struct clock_event_device *);
> > +	void			(*real_handler)(struct clock_event_device *);
> >  	int			(*set_next_event)(unsigned long evt,
> >  						  struct clock_event_device *);
> >  	int			(*set_next_ktime)(ktime_t expires,
> > diff --git a/include/linux/suspend.h b/include/linux/suspend.h
> > index 3388c1b..86a651c 100644
> > --- a/include/linux/suspend.h
> > +++ b/include/linux/suspend.h
> > @@ -203,6 +203,8 @@ extern void suspend_set_ops(const struct platform_suspend_ops *ops);
> >  extern int suspend_valid_only_mem(suspend_state_t state);
> >  extern void freeze_set_ops(const struct platform_freeze_ops *ops);
> >  extern void freeze_wake(void);
> > +extern bool in_freeze(void);
> > +extern bool idle_should_freeze(void);
> >  
> >  /**
> >   * arch_suspend_disable_irqs - disable IRQs for suspend
> > @@ -230,6 +232,8 @@ static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {}
> >  static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; }
> >  static inline void freeze_set_ops(const struct platform_freeze_ops *ops) {}
> >  static inline void freeze_wake(void) {}
> > +static inline bool in_freeze(void) { return false; }
> > +static inline bool idle_should_freeze(void) { return false; }
> >  #endif /* !CONFIG_SUSPEND */
> >  
> >  /* struct pbe is used for creating lists of pages that should be restored
> > diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
> > index 1caa6b0..07957a9 100644
> > --- a/include/linux/timekeeping.h
> > +++ b/include/linux/timekeeping.h
> > @@ -5,6 +5,8 @@
> >  
> >  void timekeeping_init(void);
> >  extern int timekeeping_suspended;
> > +extern void timekeeping_freeze(void);
> > +extern void timekeeping_unfreeze(void);
> >  
> >  /*
> >   * Get and set timeofday
> > diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
> > index c347e3c..6467fb8 100644
> > --- a/kernel/power/suspend.c
> > +++ b/kernel/power/suspend.c
> > @@ -28,6 +28,7 @@
> >  #include <linux/ftrace.h>
> >  #include <trace/events/power.h>
> >  #include <linux/compiler.h>
> > +#include <linux/clockchips.h>
> >  
> >  #include "power.h"
> >  
> > @@ -37,7 +38,15 @@ const char *pm_states[PM_SUSPEND_MAX];
> >  static const struct platform_suspend_ops *suspend_ops;
> >  static const struct platform_freeze_ops *freeze_ops;
> >  static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
> > -static bool suspend_freeze_wake;
> > +
> > +/* freeze state machine */
> > +enum freeze_state {
> > +	FREEZE_STATE_NONE,      /* not in freeze */
> > +	FREEZE_STATE_ENTER,     /* enter freeze */
> > +	FREEZE_STATE_WAKE,      /* in freeze wakeup context */
> > +};
> > +
> > +static enum freeze_state suspend_freeze_state;
> >  
> >  void freeze_set_ops(const struct platform_freeze_ops *ops)
> >  {
> > @@ -46,23 +55,56 @@ void freeze_set_ops(const struct platform_freeze_ops *ops)
> >  	unlock_system_sleep();
> >  }
> >  
> > +bool in_freeze(void)
> > +{
> > +	return (suspend_freeze_state > FREEZE_STATE_NONE);
> > +}
> > +EXPORT_SYMBOL_GPL(in_freeze);
> > +
> > +bool idle_should_freeze(void)
> > +{
> > +	return (suspend_freeze_state == FREEZE_STATE_ENTER);
> > +}
> > +EXPORT_SYMBOL_GPL(idle_should_freeze);
> > +
> >  static void freeze_begin(void)
> >  {
> > -	suspend_freeze_wake = false;
> > +	suspend_freeze_state = FREEZE_STATE_NONE;
> >  }
> >  
> >  static void freeze_enter(void)
> >  {
> > +	suspend_freeze_state = FREEZE_STATE_ENTER;
> > +	get_online_cpus();
> >  	cpuidle_use_deepest_state(true);
> >  	cpuidle_resume();
> > -	wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
> > +	clockevents_notify(CLOCK_EVT_NOTIFY_FREEZE_PREPARE, NULL);
> > +	/*
> > +	 * push all the CPUs into freeze idle loop
> > +	 */
> > +	wake_up_all_idle_cpus();
> > +	printk(KERN_INFO "PM: suspend to idle\n");
> > +	/*
> > +	 * put the current CPU into wait queue so that this CPU
> > +	 * is able to enter freeze idle loop as well
> > +	 */
> > +	wait_event(suspend_freeze_wait_head,
> > +		(suspend_freeze_state == FREEZE_STATE_WAKE));
> > +	printk(KERN_INFO "PM: resume from freeze\n");
> >  	cpuidle_pause();
> >  	cpuidle_use_deepest_state(false);
> > +	put_online_cpus();
> > +	suspend_freeze_state = FREEZE_STATE_NONE;
> >  }
> >  
> >  void freeze_wake(void)
> >  {
> > -	suspend_freeze_wake = true;
> > +	if (!in_freeze())
> > +		return;
> > +	/*
> > +	 * wake freeze task up
> > +	 */
> > +	suspend_freeze_state = FREEZE_STATE_WAKE;
> >  	wake_up(&suspend_freeze_wait_head);
> >  }
> >  EXPORT_SYMBOL_GPL(freeze_wake);
> > diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
> > index c47fce7..f28f8cb 100644
> > --- a/kernel/sched/idle.c
> > +++ b/kernel/sched/idle.c
> > @@ -7,6 +7,7 @@
> >  #include <linux/tick.h>
> >  #include <linux/mm.h>
> >  #include <linux/stackprotector.h>
> > +#include <linux/suspend.h>
> >  
> >  #include <asm/tlb.h>
> >  
> > @@ -182,6 +183,45 @@ exit_idle:
> >  }
> >  
> >  /*
> > + * cpu idle freeze function
> > + */
> > +static void cpu_idle_freeze(void)
> > +{
> > +	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
> > +	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
> > +
> > +	/*
> > +	 * suspend tick, the last CPU suspend timekeeping
> > +	 */
> > +	clockevents_notify(CLOCK_EVT_NOTIFY_FREEZE, NULL);
> > +	/*
> > +	 * idle loop here if idle_should_freeze()
> > +	 */
> > +	while (idle_should_freeze()) {
> > +		int next_state;
> > +		/*
> > +		 * interrupt must be disabled before cpu enters idle
> > +		 */
> > +		local_irq_disable();
> > +
> > +		next_state = cpuidle_select(drv, dev);
> > +		if (next_state < 0) {
> > +			arch_cpu_idle();
> > +			continue;
> > +		}
> > +		/*
> > +		 * cpuidle_enter will return with interrupt enabled
> > +		 */
> > +		cpuidle_enter(drv, dev, next_state);
> > +	}
> > +
> > +	/*
> > +	 * resume tick, the first wakeup CPU resume timekeeping
> > +	 */
> > +	clockevents_notify(CLOCK_EVT_NOTIFY_UNFREEZE, NULL);
> > +}
> > +
> > +/*
> >   * Generic idle loop implementation
> >   *
> >   * Called with polling cleared.
> > @@ -208,6 +248,11 @@ static void cpu_idle_loop(void)
> >  			if (cpu_is_offline(smp_processor_id()))
> >  				arch_cpu_idle_dead();
> >  
> > +			if (idle_should_freeze()) {
> > +				cpu_idle_freeze();
> > +				continue;
> > +			}
> > +
> >  			local_irq_disable();
> >  			arch_cpu_idle_enter();
> >  
> > diff --git a/kernel/softirq.c b/kernel/softirq.c
> > index 0699add..a231bf6 100644
> > --- a/kernel/softirq.c
> > +++ b/kernel/softirq.c
> > @@ -26,6 +26,7 @@
> >  #include <linux/smpboot.h>
> >  #include <linux/tick.h>
> >  #include <linux/irq.h>
> > +#include <linux/suspend.h>
> >  
> >  #define CREATE_TRACE_POINTS
> >  #include <trace/events/irq.h>
> > @@ -321,7 +322,7 @@ asmlinkage __visible void do_softirq(void)
> >  void irq_enter(void)
> >  {
> >  	rcu_irq_enter();
> > -	if (is_idle_task(current) && !in_interrupt()) {
> > +	if (is_idle_task(current) && !in_interrupt() && !in_freeze()) {
> >  		/*
> >  		 * Prevent raise_softirq from needlessly waking up ksoftirqd
> >  		 * here, as softirq will be serviced on return from interrupt.
> > @@ -364,7 +365,7 @@ static inline void tick_irq_exit(void)
> >  
> >  	/* Make sure that timer wheel updates are propagated */
> >  	if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
> > -		if (!in_interrupt())
> > +		if (!in_interrupt() && !in_freeze())
> >  			tick_nohz_irq_exit();
> >  	}
> >  #endif
> > diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
> > index 5544990..6d9a4a3 100644
> > --- a/kernel/time/clockevents.c
> > +++ b/kernel/time/clockevents.c
> > @@ -17,6 +17,7 @@
> >  #include <linux/module.h>
> >  #include <linux/smp.h>
> >  #include <linux/device.h>
> > +#include <linux/suspend.h>
> >  
> >  #include "tick-internal.h"
> >  
> > @@ -579,6 +580,18 @@ int clockevents_notify(unsigned long reason, void *arg)
> >  		tick_resume();
> >  		break;
> >  
> > +	case CLOCK_EVT_NOTIFY_FREEZE_PREPARE:
> > +		tick_freeze_prepare();
> > +		break;
> > +
> > +	case CLOCK_EVT_NOTIFY_FREEZE:
> > +		tick_freeze();
> > +		break;
> > +
> > +	case CLOCK_EVT_NOTIFY_UNFREEZE:
> > +		tick_unfreeze();
> > +		break;
> > +
> >  	case CLOCK_EVT_NOTIFY_CPU_DEAD:
> >  		tick_shutdown_broadcast_oneshot(arg);
> >  		tick_shutdown_broadcast(arg);
> > diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
> > index 7efeedf..0bbc886 100644
> > --- a/kernel/time/tick-common.c
> > +++ b/kernel/time/tick-common.c
> > @@ -19,6 +19,7 @@
> >  #include <linux/profile.h>
> >  #include <linux/sched.h>
> >  #include <linux/module.h>
> > +#include <linux/suspend.h>
> >  
> >  #include <asm/irq_regs.h>
> >  
> > @@ -51,6 +52,16 @@ ktime_t tick_period;
> >  int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
> >  
> >  /*
> > + * Tick device is per CPU device, when we freeze the timekeeping stuff, we
> > + * want to freeze the tick device on all of the online CPUs.
> > + *
> > + * tick_freeze_target_depth is a counter used for freezing tick device, the
> > + * initial value of it is online CPU number. When it is counted down to ZERO,
> > + * all of the tick devices are freezed.
> > + */
> > +static unsigned int tick_freeze_target_depth;
> > +
> > +/*
> >   * Debugging: see timer_list.c
> >   */
> >  struct tick_device *tick_get_device(int cpu)
> > @@ -375,15 +386,21 @@ void tick_shutdown(unsigned int *cpup)
> >  void tick_suspend(void)
> >  {
> >  	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
> > +	struct clock_event_device *dev = td->evtdev;
> >  
> > +	dev->real_handler = dev->event_handler;
> > +	dev->event_handler = clockevents_handle_noop;
> >  	clockevents_shutdown(td->evtdev);
> >  }
> >  
> >  void tick_resume(void)
> >  {
> >  	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
> > +	struct clock_event_device *dev = td->evtdev;
> >  	int broadcast = tick_resume_broadcast();
> >  
> > +	dev->event_handler = dev->real_handler;
> > +	dev->real_handler = NULL;
> >  	clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
> >  
> >  	if (!broadcast) {
> > @@ -394,6 +411,42 @@ void tick_resume(void)
> >  	}
> >  }
> >  
> > +void tick_freeze_prepare(void)
> > +{
> > +	tick_freeze_target_depth = num_online_cpus();
> > +}
> > +
> > +void tick_freeze(void)
> > +{
> > +	/*
> > +	 * This is serialized against a concurrent wakeup
> > +	 * via clockevents_lock
> > +	 */
> > +	tick_freeze_target_depth--;
> > +	tick_suspend();
> > +
> > +	/*
> > +	 * the last tick_suspend CPU suspends timekeeping
> > +	 */
> > +	if (!tick_freeze_target_depth)
> > +		timekeeping_freeze();
> > +}
> > +
> > +void tick_unfreeze(void)
> > +{
> > +	/*
> > +	 * the first wakeup CPU resumes timekeeping
> > +	 */
> > +	if (timekeeping_suspended) {
> > +		timekeeping_unfreeze();
> > +		touch_softlockup_watchdog();
> > +		tick_resume();
> > +		hrtimers_resume();
> > +	} else {
> > +		tick_resume();
> > +	}
> > +}
> > +
> >  /**
> >   * tick_init - initialize the tick control
> >   */
> > diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
> > index 366aeb4..8b5bab6 100644
> > --- a/kernel/time/tick-internal.h
> > +++ b/kernel/time/tick-internal.h
> > @@ -27,6 +27,9 @@ extern void tick_handover_do_timer(int *cpup);
> >  extern void tick_shutdown(unsigned int *cpup);
> >  extern void tick_suspend(void);
> >  extern void tick_resume(void);
> > +extern void tick_freeze_prepare(void);
> > +extern void tick_freeze(void);
> > +extern void tick_unfreeze(void);
> >  extern bool tick_check_replacement(struct clock_event_device *curdev,
> >  				   struct clock_event_device *newdev);
> >  extern void tick_install_replacement(struct clock_event_device *dev);
> > diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
> > index ec1791f..a11065f 100644
> > --- a/kernel/time/timekeeping.c
> > +++ b/kernel/time/timekeeping.c
> > @@ -1107,14 +1107,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
> >  	clock_was_set();
> >  }
> >  
> > -/**
> > - * timekeeping_resume - Resumes the generic timekeeping subsystem.
> > - *
> > - * This is for the generic clocksource timekeeping.
> > - * xtime/wall_to_monotonic/jiffies/etc are
> > - * still managed by arch specific suspend/resume code.
> > - */
> > -static void timekeeping_resume(void)
> > +static void timekeeping_resume_compensate_time(void)
> >  {
> >  	struct timekeeper *tk = &tk_core.timekeeper;
> >  	struct clocksource *clock = tk->tkr.clock;
> > @@ -1127,9 +1120,6 @@ static void timekeeping_resume(void)
> >  	read_persistent_clock(&tmp);
> >  	ts_new = timespec_to_timespec64(tmp);
> >  
> > -	clockevents_resume();
> > -	clocksource_resume();
> > -
> >  	raw_spin_lock_irqsave(&timekeeper_lock, flags);
> >  	write_seqcount_begin(&tk_core.seq);
> >  
> > @@ -1186,16 +1176,9 @@ static void timekeeping_resume(void)
> >  	timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
> >  	write_seqcount_end(&tk_core.seq);
> >  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
> > -
> > -	touch_softlockup_watchdog();
> > -
> > -	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
> > -
> > -	/* Resume hrtimers */
> > -	hrtimers_resume();
> >  }
> >  
> > -static int timekeeping_suspend(void)
> > +static void timekeeping_suspend_get_time(void)
> >  {
> >  	struct timekeeper *tk = &tk_core.timekeeper;
> >  	unsigned long flags;
> > @@ -1242,11 +1225,65 @@ static int timekeeping_suspend(void)
> >  	timekeeping_update(tk, TK_MIRROR);
> >  	write_seqcount_end(&tk_core.seq);
> >  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
> > +}
> >  
> > -	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
> > +/*
> > + * The following operations:
> > + *	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
> > + *	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
> > + * are moved out of
> > + *	timekeeping_freeze() and timekeeping_unfreeze()
> > + * and, replaced by
> > + *	tick_suspend() and tick_resume()
> > + * and, put into:
> > + *	tick_freeze() and tick_unfreeze()
> > + * so we avoid clockevents_lock multiple access
> > + */
> > +void timekeeping_freeze(void)
> > +{
> > +	/*
> > +	 * clockevents_lock being held
> > +	 */
> > +	timekeeping_suspend_get_time();
> >  	clocksource_suspend();
> >  	clockevents_suspend();
> > +}
> >  
> > +void timekeeping_unfreeze(void)
> > +{
> > +	/*
> > +	 * clockevents_lock being held
> > +	 */
> > +	clockevents_resume();
> > +	clocksource_resume();
> > +	timekeeping_resume_compensate_time();
> > +}
> > +
> > +/**
> > + * timekeeping_resume - Resumes the generic timekeeping subsystem.
> > + *
> > + * This is for the generic clocksource timekeeping.
> > + * xtime/wall_to_monotonic/jiffies/etc are
> > + * still managed by arch specific suspend/resume code.
> > + */
> > +static void timekeeping_resume(void)
> > +{
> > +	clockevents_resume();
> > +	clocksource_resume();
> > +	timekeeping_resume_compensate_time();
> > +
> > +	touch_softlockup_watchdog();
> > +	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
> > +	/* Resume hrtimers */
> > +	hrtimers_resume();
> > +}
> > +
> > +static int timekeeping_suspend(void)
> > +{
> > +	timekeeping_suspend_get_time();
> > +	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
> > +	clocksource_suspend();
> > +	clockevents_suspend();
> >  	return 0;
> >  }
> >  
> > 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-01-19 15:24   ` Rafael J. Wysocki
@ 2015-01-22 10:15     ` Thomas Gleixner
  2015-01-26  8:44       ` Li, Aubrey
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2015-01-22 10:15 UTC (permalink / raw)
  To: Li, Aubrey, Rafael J. Wysocki
  Cc: Peter Zijlstra, Brown, Len, alan, LKML, Linux PM list

On Tue, 9 Dec 2014, Li, Aubrey wrote:
> diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
> index 2e4cb67..d118e0b 100644
> --- a/include/linux/clockchips.h
> +++ b/include/linux/clockchips.h
> @@ -18,6 +18,9 @@ enum clock_event_nofitiers {
>  	CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
>  	CLOCK_EVT_NOTIFY_SUSPEND,
>  	CLOCK_EVT_NOTIFY_RESUME,
> +	CLOCK_EVT_NOTIFY_FREEZE_PREPARE,
> +	CLOCK_EVT_NOTIFY_FREEZE,
> +	CLOCK_EVT_NOTIFY_UNFREEZE,

Can we please stop adding more crap to that notifier thing? I rather
see that go away than being expanded.

> @@ -95,6 +98,7 @@ enum clock_event_mode {
>   */
>  struct clock_event_device {
>  	void			(*event_handler)(struct clock_event_device *);
> +	void			(*real_handler)(struct clock_event_device *);

This is really not the place to put this. We have the hotpath
functions together at the beginning of the struct and not some random
stuff used once in a while. Think about cache lines.

A proper explanation why you need this at all is missing.

>  /*
> + * cpu idle freeze function
> + */

How is that comment helpful?

Not at all. But its simpler to add pointless comments than to document
the important and non obvious stuff, right?

> +static void cpu_idle_freeze(void)
> +{
> +	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
> +	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
> +
> +	/*
> +	 * suspend tick, the last CPU suspend timekeeping
> +	 */
> +	clockevents_notify(CLOCK_EVT_NOTIFY_FREEZE, NULL);
> +	/*
> +	 * idle loop here if idle_should_freeze()
> +	 */
> +	while (idle_should_freeze()) {
> +		int next_state;
> +		/*
> +		 * interrupt must be disabled before cpu enters idle
> +		 */
> +		local_irq_disable();
> +
> +		next_state = cpuidle_select(drv, dev);
> +		if (next_state < 0) {
> +			arch_cpu_idle();
> +			continue;
> +		}
> +		/*
> +		 * cpuidle_enter will return with interrupt enabled
> +		 */
> +		cpuidle_enter(drv, dev, next_state);

How is that supposed to work?

If timekeeping is not yet unfrozen, then any interrupt handling code
which calls anything time related is going to hit lala land.

You must guarantee that timekeeping is unfrozen before any interrupt
is handled. If you cannot guarantee that, you cannot freeze
timekeeping ever.

The cpu local tick device is less critical, but it happens to work by
chance, not by design.

>  void irq_enter(void)
>  {
>  	rcu_irq_enter();
> -	if (is_idle_task(current) && !in_interrupt()) {
> +	if (is_idle_task(current) && !in_interrupt() && !in_freeze()) {
>  		/*
>  		 * Prevent raise_softirq from needlessly waking up ksoftirqd
>  		 * here, as softirq will be serviced on return from interrupt.
> @@ -364,7 +365,7 @@ static inline void tick_irq_exit(void)
>  
>  	/* Make sure that timer wheel updates are propagated */
>  	if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
> -		if (!in_interrupt())
> +		if (!in_interrupt() && !in_freeze())
>  			tick_nohz_irq_exit();

We keep adding conditional over conditionals to the irq_enter/exit
hotpath. Can we please find some more intelligent solution for this?

> @@ -375,15 +386,21 @@ void tick_shutdown(unsigned int *cpup)
>  void tick_suspend(void)
>  {
>  	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
> +	struct clock_event_device *dev = td->evtdev;
>  
> +	dev->real_handler = dev->event_handler;
> +	dev->event_handler = clockevents_handle_noop;

Lacks a proper explanation. What's the point of this exercise?

> +void tick_freeze_prepare(void)
> +{
> +	tick_freeze_target_depth = num_online_cpus();

So we have a 'notifier' callback just to store the number of online
cpus? That's beyond silly. What's wrong with having a frozen counter
and comparing that to num_online_cpus()?

> +void tick_freeze(void)
> +{
> +	/*
> +	 * This is serialized against a concurrent wakeup
> +	 * via clockevents_lock
> +	 */
> +	tick_freeze_target_depth--;
> +	tick_suspend();
> +
> +	/*
> +	 * the last tick_suspend CPU suspends timekeeping
> +	 */
> +	if (!tick_freeze_target_depth)
> +		timekeeping_freeze();
> +}
> +
> +void tick_unfreeze(void)
> +{
> +	/*
> +	 * the first wakeup CPU resumes timekeeping
> +	 */
> +	if (timekeeping_suspended) {
> +		timekeeping_unfreeze();
> +		touch_softlockup_watchdog();
> +		tick_resume();
> +		hrtimers_resume();
> +	} else {
> +		tick_resume();
> +	}
> +}

This is really horrible. We now have basically the same code for
freeze/unfreeze and suspend/resume just in different files and with
slightly different functionality. And we have that just because you
want to (ab)use clockevents_lock for serialization.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-01-22 10:15     ` Thomas Gleixner
@ 2015-01-26  8:44       ` Li, Aubrey
  2015-01-26  9:40         ` Thomas Gleixner
  0 siblings, 1 reply; 29+ messages in thread
From: Li, Aubrey @ 2015-01-26  8:44 UTC (permalink / raw)
  To: Thomas Gleixner, Rafael J. Wysocki
  Cc: Peter Zijlstra, Brown, Len, alan, LKML, Linux PM list

Hi Thomas,

Thanks for the comments, my feedback below:

On 2015/1/22 18:15, Thomas Gleixner wrote:
> On Tue, 9 Dec 2014, Li, Aubrey wrote:
>> diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
>> index 2e4cb67..d118e0b 100644
>> --- a/include/linux/clockchips.h
>> +++ b/include/linux/clockchips.h
>> @@ -18,6 +18,9 @@ enum clock_event_nofitiers {
>>  	CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
>>  	CLOCK_EVT_NOTIFY_SUSPEND,
>>  	CLOCK_EVT_NOTIFY_RESUME,
>> +	CLOCK_EVT_NOTIFY_FREEZE_PREPARE,
>> +	CLOCK_EVT_NOTIFY_FREEZE,
>> +	CLOCK_EVT_NOTIFY_UNFREEZE,
> 
> Can we please stop adding more crap to that notifier thing? I rather
> see that go away than being expanded.

Are you referring to FREEZE_PREPARE, or to removing all of the FREEZE stuff entirely?

What's the disadvantage of adding more notifier?

> 
>> @@ -95,6 +98,7 @@ enum clock_event_mode {
>>   */
>>  struct clock_event_device {
>>  	void			(*event_handler)(struct clock_event_device *);
>> +	void			(*real_handler)(struct clock_event_device *);
> 
> This is really not the place to put this. We have the hotpath
> functions together at the beginning of the struct and not some random
> stuff used once in a while. Think about cache lines.

okay, will move to the tail.

> 
> A proper explanation why you need this at all is missing.
> 
Thanks, will add some comments here.

>>  /*
>> + * cpu idle freeze function
>> + */
> 
> How is that comment helpful?
> 
> Not at all. But its simpler to add pointless comments than to document
> the important and non obvious stuff, right?

okay.

> 
>> +static void cpu_idle_freeze(void)
>> +{
>> +	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
>> +	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
>> +
>> +	/*
>> +	 * suspend tick, the last CPU suspend timekeeping
>> +	 */
>> +	clockevents_notify(CLOCK_EVT_NOTIFY_FREEZE, NULL);
>> +	/*
>> +	 * idle loop here if idle_should_freeze()
>> +	 */
>> +	while (idle_should_freeze()) {
>> +		int next_state;
>> +		/*
>> +		 * interrupt must be disabled before cpu enters idle
>> +		 */
>> +		local_irq_disable();
>> +
>> +		next_state = cpuidle_select(drv, dev);
>> +		if (next_state < 0) {
>> +			arch_cpu_idle();
>> +			continue;
>> +		}
>> +		/*
>> +		 * cpuidle_enter will return with interrupt enabled
>> +		 */
>> +		cpuidle_enter(drv, dev, next_state);
> 
> How is that supposed to work?
> 
> If timekeeping is not yet unfrozen, then any interrupt handling code
> which calls anything time related is going to hit lala land.
> 
> You must guarantee that timekeeping is unfrozen before any interrupt
> is handled. If you cannot guarantee that, you cannot freeze
> timekeeping ever.
> 
> The cpu local tick device is less critical, but it happens to work by
> chance, not by design.

There are two ways to guarantee this: the first way is, disable interrupt
before timekeeping frozen and enable interrupt after timekeeping is
unfrozen. However, we need to handle wakeup handler before unfreeze
timekeeping to wake freeze task up from wait queue.

So we have to go the other way, the other way is, we ignore time related
calls during freeze, like what I added in irq_enter below.

Or, we need to re-implement freeze wait and wake up mechanism?
> 
>>  void irq_enter(void)
>>  {
>>  	rcu_irq_enter();
>> -	if (is_idle_task(current) && !in_interrupt()) {
>> +	if (is_idle_task(current) && !in_interrupt() && !in_freeze()) {
>>  		/*
>>  		 * Prevent raise_softirq from needlessly waking up ksoftirqd
>>  		 * here, as softirq will be serviced on return from interrupt.
>> @@ -364,7 +365,7 @@ static inline void tick_irq_exit(void)
>>  
>>  	/* Make sure that timer wheel updates are propagated */
>>  	if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
>> -		if (!in_interrupt())
>> +		if (!in_interrupt() && !in_freeze())
>>  			tick_nohz_irq_exit();
> 
> We keep adding conditional over conditionals to the irq_enter/exit
> hotpath. Can we please find some more intelligent solution for this?

I need more time to consider this, will get back to you if I have an idea.

> 
>> @@ -375,15 +386,21 @@ void tick_shutdown(unsigned int *cpup)
>>  void tick_suspend(void)
>>  {
>>  	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
>> +	struct clock_event_device *dev = td->evtdev;
>>  
>> +	dev->real_handler = dev->event_handler;
>> +	dev->event_handler = clockevents_handle_noop;
> 
> Lacks a proper explanation. What's the point of this exercise?

Yeah, will add comments in the next version.

> 
>> +void tick_freeze_prepare(void)
>> +{
>> +	tick_freeze_target_depth = num_online_cpus();
> 
> So we have a 'notifier' callback just to store the number of online
> cpus? That's beyond silly. What's wrong with having a frozen counter
> and comparing that to num_online_cpus()?
> 
It looks like a notifier is unpopular, I was trying to avoid a global
variable across different kernel modules. But yes, I can change. May I
know the disadvantage of notifier callback?

>> +void tick_freeze(void)
>> +{
>> +	/*
>> +	 * This is serialized against a concurrent wakeup
>> +	 * via clockevents_lock
>> +	 */
>> +	tick_freeze_target_depth--;
>> +	tick_suspend();
>> +
>> +	/*
>> +	 * the last tick_suspend CPU suspends timekeeping
>> +	 */
>> +	if (!tick_freeze_target_depth)
>> +		timekeeping_freeze();
>> +}
>> +
>> +void tick_unfreeze(void)
>> +{
>> +	/*
>> +	 * the first wakeup CPU resumes timekeeping
>> +	 */
>> +	if (timekeeping_suspended) {
>> +		timekeeping_unfreeze();
>> +		touch_softlockup_watchdog();
>> +		tick_resume();
>> +		hrtimers_resume();
>> +	} else {
>> +		tick_resume();
>> +	}
>> +}
> 
> This is really horrible. We now have basically the same code for
> freeze/unfreeze and suspend/resume just in different files and with
> slightly different functionality.

Let me try to see if I can re-org these functions. Besides the
reduplicated code, do you see anything broken of the implementation?


> And we have that just because you want to (ab)use clockevents_lock for
> serialization.

The V2 patch was using stop machine mechanism according to Peter's
suggestion. This V3 patch, IIRC, clockevents_lock was your suggestion.
Do you have any better idea now?

Thanks,
-Aubrey
> 
> Thanks,
> 
> 	tglx


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-01-26  8:44       ` Li, Aubrey
@ 2015-01-26  9:40         ` Thomas Gleixner
  2015-01-26 14:21           ` Rafael J. Wysocki
                             ` (2 more replies)
  0 siblings, 3 replies; 29+ messages in thread
From: Thomas Gleixner @ 2015-01-26  9:40 UTC (permalink / raw)
  To: Li, Aubrey
  Cc: Rafael J. Wysocki, Peter Zijlstra, Brown, Len, alan, LKML, Linux PM list

On Mon, 26 Jan 2015, Li, Aubrey wrote:
> On 2015/1/22 18:15, Thomas Gleixner wrote:
> > Can we please stop adding more crap to that notifier thing? I rather
> > see that go away than being expanded.
> 
> Are you referring to FREEZE_PREPARE or remove all of FREEZE staff at all?
> 
> What's the disadvantage of adding more notifier?

clockevents_notify() is not a notifier. It's a multiplex call and I
want to get rid of it and replace it with explicit functions.

> >> +		/*
> >> +		 * cpuidle_enter will return with interrupt enabled
> >> +		 */
> >> +		cpuidle_enter(drv, dev, next_state);
> > 
> > How is that supposed to work?
> > 
> > If timekeeping is not yet unfrozen, then any interrupt handling code
> > which calls anything time related is going to hit lala land.
> > 
> > You must guarantee that timekeeping is unfrozen before any interrupt
> > is handled. If you cannot guarantee that, you cannot freeze
> > timekeeping ever.
> > 
> > The cpu local tick device is less critical, but it happens to work by
> > chance, not by design.
> 
> There are two way to guarantee this: the first way is, disable interrupt
> before timekeeping frozen and enable interrupt after timekeeping is
> unfrozen. However, we need to handle wakeup handler before unfreeze
> timekeeping to wake freeze task up from wait queue.
> 
> So we have to go the other way, the other way is, we ignore time related
> calls during freeze, like what I added in irq_enter below.

Groan. You just do not call in irq_enter/exit(), but what prevents any
interrupt handler or whatever to call into the time/timer code after
interrupts got reenabled?

Nothing. 

> Or, we need to re-implement freeze wait and wake up mechanism?

You need to make sure in the low level idle implementation that this
cannot happen.

tick_freeze()
{
	raw_spin_lock(&tick_freeze_lock);
	tick_frozen++;
	if (tick_frozen == num_online_cpus())
		timekeeping_suspend();
	else
		tick_suspend_local();
	raw_spin_unlock(&tick_freeze_lock);
}

tick_unfreeze()
{
	raw_spin_lock(&tick_freeze_lock);
	if (tick_frozen == num_online_cpus())
		timekeeping_resume();
	else
		tick_resume_local();
	tick_frozen--;
	raw_spin_unlock(&tick_freeze_lock);
}

idle_freeze()
{
	local_irq_disable();

	tick_freeze();

	/* Must keep interrupts disabled! */
       	go_deep_idle()

	tick_unfreeze();

	local_irq_enable();
}

That's the only way you can do it proper, everything else will just be
a horrible mess of bandaids and duct tape.

So that does not need any of the irq_enter/exit conditionals, it does
not need the real_handler hack. It just works.

The only remaining issue might be an NMI calling into
ktime_get_mono_fast_ns() before timekeeping is resumed. It's probably a
non-issue on x86/tsc, but it might be a problem on other platforms
which turn off devices, clocks, etc. It's not rocket science to prevent
that.

> It looks like a notifier is unpopular, I was trying to avoid a global
> variable across different kernel modules. But yes, I can change. May I
> know the disadvantage of notifier callback?

You do not need a global variable at all. See above.

> The V2 patch was using stop machine mechanism according to Peter's
> suggestion. This V3 patch, IIRC, clockevents_lock was your suggestion.

A suggestion does not mean that you should follow it blindly. If you
see that the result is horrible and not feasible then you should
notice yourself, think about different approaches and discuss that.

I'm about to post a series which gets rid of clockevents_notify() and
distangles the stuff so the above becomes possible.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-01-26 14:21           ` Rafael J. Wysocki
@ 2015-01-26 14:15             ` Thomas Gleixner
  2015-01-26 14:45               ` Rafael J. Wysocki
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2015-01-26 14:15 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Li, Aubrey, Peter Zijlstra, Brown, Len, alan, LKML, Linux PM list

On Mon, 26 Jan 2015, Rafael J. Wysocki wrote:

> On Monday, January 26, 2015 10:40:24 AM Thomas Gleixner wrote:
> > On Mon, 26 Jan 2015, Li, Aubrey wrote:
> > > On 2015/1/22 18:15, Thomas Gleixner wrote:
> > > > Can we please stop adding more crap to that notifier thing? I rather
> > > > see that go away than being expanded.
> > > 
> > > Are you referring to FREEZE_PREPARE or remove all of FREEZE staff at all?
> > > 
> > > What's the disadvantage of adding more notifier?
> > 
> > clockevents_notify() is not a notifier. Its a multiplex call and I
> > want to get rid of it and replace it with explicit functions.
> 
> OK, so perhaps we need to move _SUSPEND/_RESUME out of there to start with?
> 
> As far as I can say, clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL) and
> clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL) are each only called from
> one place and moreover, since they are in syscore_ops, we don't need any
> locking around them.
> 
> So what about the patch below?

I'm cleaning up the whole replacement of notify. The stuff below is
part of it.

>  
> -	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
> +	tick_suspend();
> +	tick_suspend_broadcast();

That's exactly the stuff I don't want to see. Blind code
move. tick_suspend_broadcast() wants to be called from tick_suspend().

Still compiling and testing a gazillion of combinations.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-01-26  9:40         ` Thomas Gleixner
@ 2015-01-26 14:21           ` Rafael J. Wysocki
  2015-01-26 14:15             ` Thomas Gleixner
  2015-01-26 14:41           ` Rafael J. Wysocki
  2015-01-29 22:20           ` Rafael J. Wysocki
  2 siblings, 1 reply; 29+ messages in thread
From: Rafael J. Wysocki @ 2015-01-26 14:21 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Li, Aubrey, Peter Zijlstra, Brown, Len, alan, LKML, Linux PM list

On Monday, January 26, 2015 10:40:24 AM Thomas Gleixner wrote:
> On Mon, 26 Jan 2015, Li, Aubrey wrote:
> > On 2015/1/22 18:15, Thomas Gleixner wrote:
> > > Can we please stop adding more crap to that notifier thing? I rather
> > > see that go away than being expanded.
> > 
> > Are you referring to FREEZE_PREPARE or remove all of FREEZE staff at all?
> > 
> > What's the disadvantage of adding more notifier?
> 
> clockevents_notify() is not a notifier. Its a multiplex call and I
> want to get rid of it and replace it with explicit functions.

OK, so perhaps we need to move _SUSPEND/_RESUME out of there to start with?

As far as I can say, clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL) and
clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL) are each only called from
one place and moreover, since they are in syscore_ops, we don't need any
locking around them.

So what about the patch below?


---
 include/linux/clockchips.h |    2 --
 kernel/time/clockevents.c  |    9 ---------
 kernel/time/timekeeping.c  |    6 ++++--
 3 files changed, 4 insertions(+), 13 deletions(-)

Index: linux-pm/include/linux/clockchips.h
===================================================================
--- linux-pm.orig/include/linux/clockchips.h
+++ linux-pm/include/linux/clockchips.h
@@ -16,8 +16,6 @@ enum clock_event_nofitiers {
 	CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
 	CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
 	CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
-	CLOCK_EVT_NOTIFY_SUSPEND,
-	CLOCK_EVT_NOTIFY_RESUME,
 	CLOCK_EVT_NOTIFY_CPU_DYING,
 	CLOCK_EVT_NOTIFY_CPU_DEAD,
 };
Index: linux-pm/kernel/time/clockevents.c
===================================================================
--- linux-pm.orig/kernel/time/clockevents.c
+++ linux-pm/kernel/time/clockevents.c
@@ -570,15 +570,6 @@ int clockevents_notify(unsigned long rea
 		tick_handover_do_timer(arg);
 		break;
 
-	case CLOCK_EVT_NOTIFY_SUSPEND:
-		tick_suspend();
-		tick_suspend_broadcast();
-		break;
-
-	case CLOCK_EVT_NOTIFY_RESUME:
-		tick_resume();
-		break;
-
 	case CLOCK_EVT_NOTIFY_CPU_DEAD:
 		tick_shutdown_broadcast_oneshot(arg);
 		tick_shutdown_broadcast(arg);
Index: linux-pm/kernel/time/timekeeping.c
===================================================================
--- linux-pm.orig/kernel/time/timekeeping.c
+++ linux-pm/kernel/time/timekeeping.c
@@ -1245,7 +1245,7 @@ static void timekeeping_resume(void)
 
 	touch_softlockup_watchdog();
 
-	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
+	tick_resume();
 
 	/* Resume hrtimers */
 	hrtimers_resume();
@@ -1299,7 +1299,9 @@ static int timekeeping_suspend(void)
 	write_seqcount_end(&tk_core.seq);
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
 
-	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+	tick_suspend();
+	tick_suspend_broadcast();
+
 	clocksource_suspend();
 	clockevents_suspend();
 


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-01-26 14:41           ` Rafael J. Wysocki
@ 2015-01-26 14:24             ` Thomas Gleixner
  2015-01-26 14:50               ` Rafael J. Wysocki
  2015-01-27  8:03             ` Li, Aubrey
  1 sibling, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2015-01-26 14:24 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Li, Aubrey, Peter Zijlstra, Brown, Len, alan, LKML, Linux PM list

On Mon, 26 Jan 2015, Rafael J. Wysocki wrote:
> On Monday, January 26, 2015 10:40:24 AM Thomas Gleixner wrote:
> > The only remaining issue might be a NMI calling into
> > ktime_get_mono_fast_ns() before timekeeping is resumed. Its probably a
> > non issue on x86/tsc, but it might be a problem on other platforms
> > which turn off devices, clocks, It's not rocket science to prevent
> > that.
> 
> I don't see any users of ktime_get_mono_fast_ns() at all, unless some non-trivial
> macros are involved.  At least grepping for it only returns the definition,
> declarations and the line in trace.c.

You can trace in NMI and perf is going to use ktime_get_mono_fast_ns()
eventually as well.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-01-26 14:50               ` Rafael J. Wysocki
@ 2015-01-26 14:34                 ` Thomas Gleixner
  2015-01-26 15:04                   ` Rafael J. Wysocki
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2015-01-26 14:34 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Li, Aubrey, Peter Zijlstra, Brown, Len, alan, LKML, Linux PM list

On Mon, 26 Jan 2015, Rafael J. Wysocki wrote:
> On Monday, January 26, 2015 03:24:27 PM Thomas Gleixner wrote:
> > On Mon, 26 Jan 2015, Rafael J. Wysocki wrote:
> > > On Monday, January 26, 2015 10:40:24 AM Thomas Gleixner wrote:
> > > > The only remaining issue might be a NMI calling into
> > > > ktime_get_mono_fast_ns() before timekeeping is resumed. Its probably a
> > > > non issue on x86/tsc, but it might be a problem on other platforms
> > > > which turn off devices, clocks, It's not rocket science to prevent
> > > > that.
> > > 
> > > I don't see any users of ktime_get_mono_fast_ns() at all, unless some non-trivial
> > > macros are involved.  At least grepping for it only returns the definition,
> > > declarations and the line in trace.c.
> > 
> > You can trace in NMI and perf is going to use ktime_get_mono_fast_ns()
> > eventually as well.
> 
> So I'm not sure how to intercept that to be honest.  Any hints?

If we suspend timekeeping we can make the fast timekeeper point at a
dummy readout function which returns the time at suspend, if the clock
source does not resume automagically.

As I said, TSC should be a non-issue, but other stuff might be. We
could make it conditional on CLOCK_SOURCE_SUSPEND_NONSTOP perhaps.

Thanks,

	tglx



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-01-26  9:40         ` Thomas Gleixner
  2015-01-26 14:21           ` Rafael J. Wysocki
@ 2015-01-26 14:41           ` Rafael J. Wysocki
  2015-01-26 14:24             ` Thomas Gleixner
  2015-01-27  8:03             ` Li, Aubrey
  2015-01-29 22:20           ` Rafael J. Wysocki
  2 siblings, 2 replies; 29+ messages in thread
From: Rafael J. Wysocki @ 2015-01-26 14:41 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Li, Aubrey, Peter Zijlstra, Brown, Len, alan, LKML, Linux PM list

On Monday, January 26, 2015 10:40:24 AM Thomas Gleixner wrote:
> On Mon, 26 Jan 2015, Li, Aubrey wrote:
> > On 2015/1/22 18:15, Thomas Gleixner wrote:

[...]

> > >> +		/*
> > >> +		 * cpuidle_enter will return with interrupt enabled
> > >> +		 */
> > >> +		cpuidle_enter(drv, dev, next_state);
> > > 
> > > How is that supposed to work?
> > > 
> > > If timekeeping is not yet unfrozen, then any interrupt handling code
> > > which calls anything time related is going to hit lala land.
> > > 
> > > You must guarantee that timekeeping is unfrozen before any interrupt
> > > is handled. If you cannot guarantee that, you cannot freeze
> > > timekeeping ever.
> > > 
> > > The cpu local tick device is less critical, but it happens to work by
> > > chance, not by design.
> > 
> > There are two way to guarantee this: the first way is, disable interrupt
> > before timekeeping frozen and enable interrupt after timekeeping is
> > unfrozen. However, we need to handle wakeup handler before unfreeze
> > timekeeping to wake freeze task up from wait queue.
> > 
> > So we have to go the other way, the other way is, we ignore time related
> > calls during freeze, like what I added in irq_enter below.
> 
> Groan. You just do not call in irq_enter/exit(), but what prevents any
> interrupt handler or whatever to call into the time/timer code after
> interrupts got reenabled?
> 
> Nothing. 
> 
> > Or, we need to re-implement freeze wait and wake up mechanism?
> 
> You need to make sure in the low level idle implementation that this
> cannot happen.
> 
> tick_freeze()
> {
> 	raw_spin_lock(&tick_freeze_lock);
> 	tick_frozen++;
> 	if (tick_frozen == num_online_cpus())
> 		timekeeping_suspend();
> 	else
> 		tick_suspend_local();
> 	raw_spin_unlock(&tick_freeze_lock);
> }
> 
> tick_unfreeze()
> {
> 	raw_spin_lock(&tick_freeze_lock);
> 	if (tick_frozen == num_online_cpus())
> 		timekeeping_resume();
> 	else
> 		tick_resume_local();
> 	tick_frozen--;
> 	raw_spin_unlock(&tick_freeze_lock);
> }
> 
> idle_freeze()
> {
> 	local_irq_disable();
> 
> 	tick_freeze();
> 
> 	/* Must keep interrupts disabled! */
>        	go_deep_idle()
> 
> 	tick_unfreeze();
> 
> 	local_irq_enable();
> }
> 
> That's the only way you can do it proper, everything else will just be
> a horrible mess of bandaids and duct tape.
> 
> So that does not need any of the irq_enter/exit conditionals, it does
> not need the real_handler hack. It just works.

As long as go_deep_idle() above does not enable interrupts.  This means we won't
be able to use some C-states for suspend-to-idle (halt-induced C1 on some x86,
for one example), but that's not a very big deal.

> The only remaining issue might be a NMI calling into
> ktime_get_mono_fast_ns() before timekeeping is resumed. Its probably a
> non issue on x86/tsc, but it might be a problem on other platforms
> which turn off devices, clocks, It's not rocket science to prevent
> that.

I don't see any users of ktime_get_mono_fast_ns() at all, unless some non-trivial
macros are involved.  At least grepping for it only returns the definition,
declarations and the line in trace.c.

Rafael


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-01-26 14:15             ` Thomas Gleixner
@ 2015-01-26 14:45               ` Rafael J. Wysocki
  2015-01-27  7:12                 ` Li, Aubrey
  0 siblings, 1 reply; 29+ messages in thread
From: Rafael J. Wysocki @ 2015-01-26 14:45 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Li, Aubrey, Peter Zijlstra, Brown, Len, alan, LKML, Linux PM list

On Monday, January 26, 2015 03:15:43 PM Thomas Gleixner wrote:
> On Mon, 26 Jan 2015, Rafael J. Wysocki wrote:
> 
> > On Monday, January 26, 2015 10:40:24 AM Thomas Gleixner wrote:
> > > On Mon, 26 Jan 2015, Li, Aubrey wrote:
> > > > On 2015/1/22 18:15, Thomas Gleixner wrote:
> > > > > Can we please stop adding more crap to that notifier thing? I rather
> > > > > see that go away than being expanded.
> > > > 
> > > > Are you referring to FREEZE_PREPARE or remove all of FREEZE staff at all?
> > > > 
> > > > What's the disadvantage of adding more notifier?
> > > 
> > > clockevents_notify() is not a notifier. Its a multiplex call and I
> > > want to get rid of it and replace it with explicit functions.
> > 
> > OK, so perhaps we need to move _SUSPEND/_RESUME out of there to start with?
> > 
> > As far as I can say, clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL) and
> > clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL) are each only called from
> > one place and moreover, since they are in syscore_ops, we don't need any
> > locking around them.
> > 
> > So what about the patch below?
> 
> I'm cleaning up the whole replacement of notify. The stuff below is
> part of it.
> 
> >  
> > -	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
> > +	tick_suspend();
> > +	tick_suspend_broadcast();
> 
> That's exactly the stuff I don't want to see. Blind code
> move.

At least it's clear what the patch does. :-)

> tick_suspend_broadcast() wants to be called from tick_suspend().

OK

> Still compiling and testing a gazillion of combinations.

OK, so it looks like we need to wait with the suspend to idle changes until
this lands.

Rafael


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-01-26 14:24             ` Thomas Gleixner
@ 2015-01-26 14:50               ` Rafael J. Wysocki
  2015-01-26 14:34                 ` Thomas Gleixner
  0 siblings, 1 reply; 29+ messages in thread
From: Rafael J. Wysocki @ 2015-01-26 14:50 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Li, Aubrey, Peter Zijlstra, Brown, Len, alan, LKML, Linux PM list

On Monday, January 26, 2015 03:24:27 PM Thomas Gleixner wrote:
> On Mon, 26 Jan 2015, Rafael J. Wysocki wrote:
> > On Monday, January 26, 2015 10:40:24 AM Thomas Gleixner wrote:
> > > The only remaining issue might be a NMI calling into
> > > ktime_get_mono_fast_ns() before timekeeping is resumed. Its probably a
> > > non issue on x86/tsc, but it might be a problem on other platforms
> > > which turn off devices, clocks, It's not rocket science to prevent
> > > that.
> > 
> > I don't see any users of ktime_get_mono_fast_ns() at all, unless some non-trivial
> > macros are involved.  At least grepping for it only returns the definition,
> > declarations and the line in trace.c.
> 
> You can trace in NMI and perf is going to use ktime_get_mono_fast_ns()
> eventually as well.

So I'm not sure how to intercept that to be honest.  Any hints?

Rafael


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-01-26 14:34                 ` Thomas Gleixner
@ 2015-01-26 15:04                   ` Rafael J. Wysocki
  0 siblings, 0 replies; 29+ messages in thread
From: Rafael J. Wysocki @ 2015-01-26 15:04 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Li, Aubrey, Peter Zijlstra, Brown, Len, alan, LKML, Linux PM list

On Monday, January 26, 2015 03:34:22 PM Thomas Gleixner wrote:
> On Mon, 26 Jan 2015, Rafael J. Wysocki wrote:
> > On Monday, January 26, 2015 03:24:27 PM Thomas Gleixner wrote:
> > > On Mon, 26 Jan 2015, Rafael J. Wysocki wrote:
> > > > On Monday, January 26, 2015 10:40:24 AM Thomas Gleixner wrote:
> > > > > The only remaining issue might be a NMI calling into
> > > > > ktime_get_mono_fast_ns() before timekeeping is resumed. Its probably a
> > > > > non issue on x86/tsc, but it might be a problem on other platforms
> > > > > which turn off devices, clocks, It's not rocket science to prevent
> > > > > that.
> > > > 
> > > > I don't see any users of ktime_get_mono_fast_ns() at all, unless some non-trivial
> > > > macros are involved.  At least grepping for it only returns the definition,
> > > > declarations and the line in trace.c.
> > > 
> > > You can trace in NMI and perf is going to use ktime_get_mono_fast_ns()
> > > eventually as well.
> > 
> > So I'm not sure how to intercept that to be honest.  Any hints?
> 
> If we suspend timekeeping we can make the fast timekeeper point at a
> dummy readout function which returns the time at suspend, if the clock
> source does not resume automagically.
> 
> As I said TSC should be a non issue, but other stuff might be. We
> could make it conditional on CLOCK_SOURCE_SUSPEND_NONSTOP perhaps.

OK, thanks!

Rafael


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-01-26 14:45               ` Rafael J. Wysocki
@ 2015-01-27  7:12                 ` Li, Aubrey
  0 siblings, 0 replies; 29+ messages in thread
From: Li, Aubrey @ 2015-01-27  7:12 UTC (permalink / raw)
  To: Rafael J. Wysocki, Thomas Gleixner
  Cc: Peter Zijlstra, Brown, Len, alan, LKML, Linux PM list

On 2015/1/26 22:45, Rafael J. Wysocki wrote:
> On Monday, January 26, 2015 03:15:43 PM Thomas Gleixner wrote:
>> On Mon, 26 Jan 2015, Rafael J. Wysocki wrote:
>>
>>> On Monday, January 26, 2015 10:40:24 AM Thomas Gleixner wrote:
>>>> On Mon, 26 Jan 2015, Li, Aubrey wrote:
>>>>> On 2015/1/22 18:15, Thomas Gleixner wrote:
>>>>>> Can we please stop adding more crap to that notifier thing? I rather
>>>>>> see that go away than being expanded.
>>>>>
>>>>> Are you referring to FREEZE_PREPARE or remove all of FREEZE staff at all?
>>>>>
>>>>> What's the disadvantage of adding more notifier?
>>>>
>>>> clockevents_notify() is not a notifier. Its a multiplex call and I
>>>> want to get rid of it and replace it with explicit functions.
>>>
>>> OK, so perhaps we need to move _SUSPEND/_RESUME out of there to start with?
>>>
>>> As far as I can say, clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL) and
>>> clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL) are each only called from
>>> one place and moreover, since they are in syscore_ops, we don't need any
>>> locking around them.
>>>
>>> So what about the patch below?
>>
>> I'm cleaning up the whole replacement of notify. The stuff below is
>> part of it.
>>
>>>  
>>> -	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
>>> +	tick_suspend();
>>> +	tick_suspend_broadcast();
>>
>> That's exactly the stuff I don't want to see. Blind code
>> move.
> 
> At least it's clear what the patch does. :-)
> 
>> tick_suspend_broadcast() wants to be called from tick_suspend().
> 
> OK
> 
>> Still compiling and testing a gazillion of combinations.
> 
> OK, so it looks like we need to wait with the suspend to idle changes until
> this lands.

Please cc the patches to me when you post. I'll refine the patch after that.

Thanks,
-Aubrey

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-01-26 14:41           ` Rafael J. Wysocki
  2015-01-26 14:24             ` Thomas Gleixner
@ 2015-01-27  8:03             ` Li, Aubrey
  2015-01-27 15:10               ` Rafael J. Wysocki
  1 sibling, 1 reply; 29+ messages in thread
From: Li, Aubrey @ 2015-01-27  8:03 UTC (permalink / raw)
  To: Rafael J. Wysocki, Thomas Gleixner
  Cc: Peter Zijlstra, Brown, Len, alan, LKML, Linux PM list

On 2015/1/26 22:41, Rafael J. Wysocki wrote:
> On Monday, January 26, 2015 10:40:24 AM Thomas Gleixner wrote:
>> On Mon, 26 Jan 2015, Li, Aubrey wrote:
>>> On 2015/1/22 18:15, Thomas Gleixner wrote:
> 
> [...]
> 
>>>>> +		/*
>>>>> +		 * cpuidle_enter will return with interrupt enabled
>>>>> +		 */
>>>>> +		cpuidle_enter(drv, dev, next_state);
>>>>
>>>> How is that supposed to work?
>>>>
>>>> If timekeeping is not yet unfrozen, then any interrupt handling code
>>>> which calls anything time related is going to hit lala land.
>>>>
>>>> You must guarantee that timekeeping is unfrozen before any interrupt
>>>> is handled. If you cannot guarantee that, you cannot freeze
>>>> timekeeping ever.
>>>>
>>>> The cpu local tick device is less critical, but it happens to work by
>>>> chance, not by design.
>>>
>>> There are two way to guarantee this: the first way is, disable interrupt
>>> before timekeeping frozen and enable interrupt after timekeeping is
>>> unfrozen. However, we need to handle wakeup handler before unfreeze
>>> timekeeping to wake freeze task up from wait queue.
>>>
>>> So we have to go the other way, the other way is, we ignore time related
>>> calls during freeze, like what I added in irq_enter below.
>>
>> Groan. You just do not call in irq_enter/exit(), but what prevents any
>> interrupt handler or whatever to call into the time/timer code after
>> interrupts got reenabled?
>>
>> Nothing. 
>>
>>> Or, we need to re-implement freeze wait and wake up mechanism?
>>
>> You need to make sure in the low level idle implementation that this
>> cannot happen.
>>
>> tick_freeze()
>> {
>> 	raw_spin_lock(&tick_freeze_lock);
>> 	tick_frozen++;
>> 	if (tick_frozen == num_online_cpus())
>> 		timekeeping_suspend();
>> 	else
>> 		tick_suspend_local();
>> 	raw_spin_unlock(&tick_freeze_lock);
>> }
>>
>> tick_unfreeze()
>> {
>> 	raw_spin_lock(&tick_freeze_lock);
>> 	if (tick_frozen == num_online_cpus())
>> 		timekeeping_resume();
>> 	else
>> 		tick_resume_local();
>> 	tick_frozen--;
>> 	raw_spin_unlock(&tick_freeze_lock);
>> }
>>
>> idle_freeze()
>> {
>> 	local_irq_disable();
>>
>> 	tick_freeze();
>>
>> 	/* Must keep interrupts disabled! */
>>        	go_deep_idle()
>>
>> 	tick_unfreeze();
>>
>> 	local_irq_enable();
>> }
>>
>> That's the only way you can do it proper, everything else will just be
>> a horrible mess of bandaids and duct tape.
>>
>> So that does not need any of the irq_enter/exit conditionals, it does
>> not need the real_handler hack. It just works.
> 
> As long as go_deep_idle() above does not enable interrupts.  This means we won't
> be able to use some C-states for suspend-to-idle (hald-induced C1 on some x86
> for one example), but that's not a very big deal.

Does the legacy ACPI system I/O method to enter C2/C3 need interrupts
enabled as well?

Do we need some platform ops to cover those legacy platforms? Different
platforms would go down different branches here.

Thanks,
-Aubrey

> 
>> The only remaining issue might be a NMI calling into
>> ktime_get_mono_fast_ns() before timekeeping is resumed. Its probably a
>> non issue on x86/tsc, but it might be a problem on other platforms
>> which turn off devices, clocks, It's not rocket science to prevent
>> that.
> 
> I don't see any users of ktime_get_mono_fast_ns() at all, unless some non-trivial
> macros are involved.  At least grepping for it only returns the definition,
> declarations and the line in trace.c.
> 
> Rafael
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 
> 


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-01-27  8:03             ` Li, Aubrey
@ 2015-01-27 15:10               ` Rafael J. Wysocki
  2015-01-28  0:17                 ` Li, Aubrey
  0 siblings, 1 reply; 29+ messages in thread
From: Rafael J. Wysocki @ 2015-01-27 15:10 UTC (permalink / raw)
  To: Li, Aubrey
  Cc: Thomas Gleixner, Peter Zijlstra, Brown, Len, alan, LKML, Linux PM list

On Tuesday, January 27, 2015 04:03:29 PM Li, Aubrey wrote:
> On 2015/1/26 22:41, Rafael J. Wysocki wrote:
> > On Monday, January 26, 2015 10:40:24 AM Thomas Gleixner wrote:
> >> On Mon, 26 Jan 2015, Li, Aubrey wrote:
> >>> On 2015/1/22 18:15, Thomas Gleixner wrote:
> > 
> > [...]
> > 
> >>>>> +		/*
> >>>>> +		 * cpuidle_enter will return with interrupt enabled
> >>>>> +		 */
> >>>>> +		cpuidle_enter(drv, dev, next_state);
> >>>>
> >>>> How is that supposed to work?
> >>>>
> >>>> If timekeeping is not yet unfrozen, then any interrupt handling code
> >>>> which calls anything time related is going to hit lala land.
> >>>>
> >>>> You must guarantee that timekeeping is unfrozen before any interrupt
> >>>> is handled. If you cannot guarantee that, you cannot freeze
> >>>> timekeeping ever.
> >>>>
> >>>> The cpu local tick device is less critical, but it happens to work by
> >>>> chance, not by design.
> >>>
> >>> There are two way to guarantee this: the first way is, disable interrupt
> >>> before timekeeping frozen and enable interrupt after timekeeping is
> >>> unfrozen. However, we need to handle wakeup handler before unfreeze
> >>> timekeeping to wake freeze task up from wait queue.
> >>>
> >>> So we have to go the other way, the other way is, we ignore time related
> >>> calls during freeze, like what I added in irq_enter below.
> >>
> >> Groan. You just do not call in irq_enter/exit(), but what prevents any
> >> interrupt handler or whatever to call into the time/timer code after
> >> interrupts got reenabled?
> >>
> >> Nothing. 
> >>
> >>> Or, we need to re-implement freeze wait and wake up mechanism?
> >>
> >> You need to make sure in the low level idle implementation that this
> >> cannot happen.
> >>
> >> tick_freeze()
> >> {
> >> 	raw_spin_lock(&tick_freeze_lock);
> >> 	tick_frozen++;
> >> 	if (tick_frozen == num_online_cpus())
> >> 		timekeeping_suspend();
> >> 	else
> >> 		tick_suspend_local();
> >> 	raw_spin_unlock(&tick_freeze_lock);
> >> }
> >>
> >> tick_unfreeze()
> >> {
> >> 	raw_spin_lock(&tick_freeze_lock);
> >> 	if (tick_frozen == num_online_cpus())
> >> 		timekeeping_resume();
> >> 	else
> >> 		tick_resume_local();
> >> 	tick_frozen--;
> >> 	raw_spin_unlock(&tick_freeze_lock);
> >> }
> >>
> >> idle_freeze()
> >> {
> >> 	local_irq_disable();
> >>
> >> 	tick_freeze();
> >>
> >> 	/* Must keep interrupts disabled! */
> >>        	go_deep_idle()
> >>
> >> 	tick_unfreeze();
> >>
> >> 	local_irq_enable();
> >> }
> >>
> >> That's the only way you can do it proper, everything else will just be
> >> a horrible mess of bandaids and duct tape.
> >>
> >> So that does not need any of the irq_enter/exit conditionals, it does
> >> not need the real_handler hack. It just works.
> > 
> > As long as go_deep_idle() above does not enable interrupts.  This means we won't
> > be able to use some C-states for suspend-to-idle (hald-induced C1 on some x86
> > for one example), but that's not a very big deal.
> 
> Does the legacy ACPI system IO method to enter C2/C3 need interrupt
> enabled as well?
> 
> Do we need some platform ops to cover those legacy platforms? Different
> platform go different branch here.

No, we don't.

I think this needs to be addressed in a different way overall.  If you don't
mind, I'd like to prepare my own version of the patch at this point.  That
likely will be simpler than trying to explain what I'd like to do and I guess
I'll need a few iterations to get something acceptable anyway.

Rafael


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-01-27 15:10               ` Rafael J. Wysocki
@ 2015-01-28  0:17                 ` Li, Aubrey
  0 siblings, 0 replies; 29+ messages in thread
From: Li, Aubrey @ 2015-01-28  0:17 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Thomas Gleixner, Peter Zijlstra, Brown, Len, alan, LKML, Linux PM list

On 2015/1/27 23:10, Rafael J. Wysocki wrote:
> On Tuesday, January 27, 2015 04:03:29 PM Li, Aubrey wrote:
>> On 2015/1/26 22:41, Rafael J. Wysocki wrote:
>>> On Monday, January 26, 2015 10:40:24 AM Thomas Gleixner wrote:
>>>> On Mon, 26 Jan 2015, Li, Aubrey wrote:
>>>>> On 2015/1/22 18:15, Thomas Gleixner wrote:
>>>
>>> [...]
>>>
>>>>>>> +		/*
>>>>>>> +		 * cpuidle_enter will return with interrupt enabled
>>>>>>> +		 */
>>>>>>> +		cpuidle_enter(drv, dev, next_state);
>>>>>>
>>>>>> How is that supposed to work?
>>>>>>
>>>>>> If timekeeping is not yet unfrozen, then any interrupt handling code
>>>>>> which calls anything time related is going to hit lala land.
>>>>>>
>>>>>> You must guarantee that timekeeping is unfrozen before any interrupt
>>>>>> is handled. If you cannot guarantee that, you cannot freeze
>>>>>> timekeeping ever.
>>>>>>
>>>>>> The cpu local tick device is less critical, but it happens to work by
>>>>>> chance, not by design.
>>>>>
>>>>> There are two way to guarantee this: the first way is, disable interrupt
>>>>> before timekeeping frozen and enable interrupt after timekeeping is
>>>>> unfrozen. However, we need to handle wakeup handler before unfreeze
>>>>> timekeeping to wake freeze task up from wait queue.
>>>>>
>>>>> So we have to go the other way, the other way is, we ignore time related
>>>>> calls during freeze, like what I added in irq_enter below.
>>>>
>>>> Groan. You just do not call in irq_enter/exit(), but what prevents any
>>>> interrupt handler or whatever to call into the time/timer code after
>>>> interrupts got reenabled?
>>>>
>>>> Nothing. 
>>>>
>>>>> Or, we need to re-implement freeze wait and wake up mechanism?
>>>>
>>>> You need to make sure in the low level idle implementation that this
>>>> cannot happen.
>>>>
>>>> tick_freeze()
>>>> {
>>>> 	raw_spin_lock(&tick_freeze_lock);
>>>> 	tick_frozen++;
>>>> 	if (tick_frozen == num_online_cpus())
>>>> 		timekeeping_suspend();
>>>> 	else
>>>> 		tick_suspend_local();
>>>> 	raw_spin_unlock(&tick_freeze_lock);
>>>> }
>>>>
>>>> tick_unfreeze()
>>>> {
>>>> 	raw_spin_lock(&tick_freeze_lock);
>>>> 	if (tick_frozen == num_online_cpus())
>>>> 		timekeeping_resume();
>>>> 	else
>>>> 		tick_resume_local();
>>>> 	tick_frozen--;
>>>> 	raw_spin_unlock(&tick_freeze_lock);
>>>> }
>>>>
>>>> idle_freeze()
>>>> {
>>>> 	local_irq_disable();
>>>>
>>>> 	tick_freeze();
>>>>
>>>> 	/* Must keep interrupts disabled! */
>>>>        	go_deep_idle()
>>>>
>>>> 	tick_unfreeze();
>>>>
>>>> 	local_irq_enable();
>>>> }
>>>>
>>>> That's the only way you can do it proper, everything else will just be
>>>> a horrible mess of bandaids and duct tape.
>>>>
>>>> So that does not need any of the irq_enter/exit conditionals, it does
>>>> not need the real_handler hack. It just works.
>>>
>>> As long as go_deep_idle() above does not enable interrupts.  This means we won't
>>> be able to use some C-states for suspend-to-idle (halt-induced C1 on some x86
>>> for one example), but that's not a very big deal.
>>
>> Does the legacy ACPI system IO method to enter C2/C3 need interrupt
>> enabled as well?
>>
>> Do we need some platform ops to cover those legacy platforms? Different
>> platform go different branch here.
> 
> No, we don't.
> 
> I think this needs to be addressed in a different way overall.  If you don't
> mind, I'd like to prepare my own version of the patch at this point.  That
> likely will be simpler than trying to explain what I'd like to do and I guess
> I'll need a few iterations to get something acceptable anyway.

Sure, please go ahead and just keep me posted.

Thanks,
-Aubrey

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-01-26  9:40         ` Thomas Gleixner
  2015-01-26 14:21           ` Rafael J. Wysocki
  2015-01-26 14:41           ` Rafael J. Wysocki
@ 2015-01-29 22:20           ` Rafael J. Wysocki
  2015-02-06  1:20             ` [Update] " Rafael J. Wysocki
  2 siblings, 1 reply; 29+ messages in thread
From: Rafael J. Wysocki @ 2015-01-29 22:20 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Li, Aubrey, Peter Zijlstra, Brown, Len, Alan Cox, LKML, Linux PM list

On Monday, January 26, 2015 10:40:24 AM Thomas Gleixner wrote:
> On Mon, 26 Jan 2015, Li, Aubrey wrote:
> > On 2015/1/22 18:15, Thomas Gleixner wrote:
> > > Can we please stop adding more crap to that notifier thing? I rather
> > > see that go away than being expanded.
> > 
> > Are you referring to FREEZE_PREPARE or remove all of FREEZE staff at all?
> > 
> > What's the disadvantage of adding more notifier?
> 
> clockevents_notify() is not a notifier. Its a multiplex call and I
> want to get rid of it and replace it with explicit functions.
> 
> > >> +		/*
> > >> +		 * cpuidle_enter will return with interrupt enabled
> > >> +		 */
> > >> +		cpuidle_enter(drv, dev, next_state);
> > > 
> > > How is that supposed to work?
> > > 
> > > If timekeeping is not yet unfrozen, then any interrupt handling code
> > > which calls anything time related is going to hit lala land.
> > > 
> > > You must guarantee that timekeeping is unfrozen before any interrupt
> > > is handled. If you cannot guarantee that, you cannot freeze
> > > timekeeping ever.
> > > 
> > > The cpu local tick device is less critical, but it happens to work by
> > > chance, not by design.
> > 
> > There are two way to guarantee this: the first way is, disable interrupt
> > before timekeeping frozen and enable interrupt after timekeeping is
> > unfrozen. However, we need to handle wakeup handler before unfreeze
> > timekeeping to wake freeze task up from wait queue.
> > 
> > So we have to go the other way, the other way is, we ignore time related
> > calls during freeze, like what I added in irq_enter below.
> 
> Groan. You just do not call in irq_enter/exit(), but what prevents any
> interrupt handler or whatever to call into the time/timer code after
> interrupts got reenabled?
> 
> Nothing. 
> 
> > Or, we need to re-implement freeze wait and wake up mechanism?
> 
> You need to make sure in the low level idle implementation that this
> cannot happen.
> 
> tick_freeze()
> {
> 	raw_spin_lock(&tick_freeze_lock);
> 	tick_frozen++;
> 	if (tick_frozen == num_online_cpus())
> 		timekeeping_suspend();
> 	else
> 		tick_suspend_local();
> 	raw_spin_unlock(&tick_freeze_lock);
> }
> 
> tick_unfreeze()
> {
> 	raw_spin_lock(&tick_freeze_lock);
> 	if (tick_frozen == num_online_cpus())
> 		timekeeping_resume();
> 	else
> 		tick_resume_local();
> 	tick_frozen--;
> 	raw_spin_unlock(&tick_freeze_lock);
> }
> 
> idle_freeze()
> {
> 	local_irq_disable();
> 
> 	tick_freeze();
> 
> 	/* Must keep interrupts disabled! */
>        	go_deep_idle()
> 
> 	tick_unfreeze();
> 
> 	local_irq_enable();
> }
> 
> That's the only way you can do it proper, everything else will just be
> a horrible mess of bandaids and duct tape.
> 
> So that does not need any of the irq_enter/exit conditionals, it does
> not need the real_handler hack. It just works.
> 
> The only remaining issue might be a NMI calling into
> ktime_get_mono_fast_ns() before timekeeping is resumed. Its probably a
> non issue on x86/tsc, but it might be a problem on other platforms
> which turn off devices, clocks, etc. It's not rocket science to prevent
> that.

So the patch below is my version of the $subject one trying to take the
above suggestion into account.  It does not address the ktime_get_mono_fast_ns()
in NMI case at this stage, because quite frankly I'd prefer to get the core
changes right in the first place.

The new ->enter_freeze callback is there in struct cpuidle_state, because we
generally cannot trust the existing ->enter callbacks to keep interrupts
disabled all the time.  Some of them enable interrupts temporarily and
some idle entry methods enable interrupts automatically on exit.  So
->enter_freeze is required to keep interrupts disabled and it is safe to
suspend/resume timekeeping around it.

The rest is reasonably straightforward.  If suspend-to-idle is requested,
cpuidle_idle_call() calls cpuidle_enter_freeze() that bypasses the governor
and selects the deepest suitable state.  If ->enter_freeze is available,
it will be used along with tick_freeze()/tick_unfreeze() and the normal
idle entry code path will be used otherwise.

I'm not quite sure if rcu_idle_enter/exit() needs to be done around that.
The reason why it isn't in the current patch is because timekeeping_resume()
was very unhappy about being part of the extended grace period and complained
about that very loudly.

Which of course means that the patch has been tested (lightly, with a hacked
ACPI cpuidle driver) and that's why it still uses tick_suspend_broadcast()
in tick_freeze().

Please let me know what you think.

Rafael


---
 drivers/cpuidle/cpuidle.c |   83 +++++++++++++++++++++++++++++++---------------
 include/linux/cpuidle.h   |    8 +++-
 include/linux/suspend.h   |    2 +
 include/linux/tick.h      |    2 +
 kernel/power/suspend.c    |   37 ++++++++++++++++----
 kernel/sched/idle.c       |    9 ++++
 kernel/time/tick-common.c |   50 +++++++++++++++++++++++++++
 kernel/time/timekeeping.c |    4 +-
 kernel/time/timekeeping.h |    2 +
 9 files changed, 160 insertions(+), 37 deletions(-)

Index: linux-pm/kernel/power/suspend.c
===================================================================
--- linux-pm.orig/kernel/power/suspend.c
+++ linux-pm/kernel/power/suspend.c
@@ -37,7 +37,20 @@ const char *pm_states[PM_SUSPEND_MAX];
 static const struct platform_suspend_ops *suspend_ops;
 static const struct platform_freeze_ops *freeze_ops;
 static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
-static bool suspend_freeze_wake;
+
+/* Suspend-to-idle state machine. */
+enum freeze_state {
+	FREEZE_STATE_NONE,      /* Not suspended/suspending. */
+	FREEZE_STATE_ENTER,     /* Enter suspend-to-idle. */
+	FREEZE_STATE_WAKE,      /* Wake up from suspend-to-idle. */
+};
+
+static enum freeze_state suspend_freeze_state;
+
+bool idle_should_freeze(void)
+{
+	return suspend_freeze_state == FREEZE_STATE_ENTER;
+}
 
 void freeze_set_ops(const struct platform_freeze_ops *ops)
 {
@@ -48,22 +61,32 @@ void freeze_set_ops(const struct platfor
 
 static void freeze_begin(void)
 {
-	suspend_freeze_wake = false;
+	suspend_freeze_state = FREEZE_STATE_NONE;
 }
 
 static void freeze_enter(void)
 {
-	cpuidle_use_deepest_state(true);
+	suspend_freeze_state = FREEZE_STATE_ENTER;
+	get_online_cpus();
 	cpuidle_resume();
-	wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+	/* Push all the CPUs into the idle loop. */
+	wake_up_all_idle_cpus();
+	pr_debug("PM: suspend-to-idle\n");
+	/* Make the current CPU wait so it can enter the idle loop too. */
+	wait_event(suspend_freeze_wait_head,
+		   suspend_freeze_state == FREEZE_STATE_WAKE);
+	pr_debug("PM: resume from suspend-to-idle\n");
 	cpuidle_pause();
-	cpuidle_use_deepest_state(false);
+	put_online_cpus();
+	suspend_freeze_state = FREEZE_STATE_NONE;
 }
 
 void freeze_wake(void)
 {
-	suspend_freeze_wake = true;
-	wake_up(&suspend_freeze_wait_head);
+	if (suspend_freeze_state > FREEZE_STATE_NONE) {
+		suspend_freeze_state = FREEZE_STATE_WAKE;
+		wake_up(&suspend_freeze_wait_head);
+	}
 }
 EXPORT_SYMBOL_GPL(freeze_wake);
 
Index: linux-pm/include/linux/suspend.h
===================================================================
--- linux-pm.orig/include/linux/suspend.h
+++ linux-pm/include/linux/suspend.h
@@ -203,6 +203,7 @@ extern void suspend_set_ops(const struct
 extern int suspend_valid_only_mem(suspend_state_t state);
 extern void freeze_set_ops(const struct platform_freeze_ops *ops);
 extern void freeze_wake(void);
+extern bool idle_should_freeze(void);
 
 /**
  * arch_suspend_disable_irqs - disable IRQs for suspend
@@ -230,6 +231,7 @@ static inline void suspend_set_ops(const
 static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; }
 static inline void freeze_set_ops(const struct platform_freeze_ops *ops) {}
 static inline void freeze_wake(void) {}
+static inline bool idle_should_freeze(void) { return false; }
 #endif /* !CONFIG_SUSPEND */
 
 /* struct pbe is used for creating lists of pages that should be restored
Index: linux-pm/kernel/sched/idle.c
===================================================================
--- linux-pm.orig/kernel/sched/idle.c
+++ linux-pm/kernel/sched/idle.c
@@ -7,6 +7,7 @@
 #include <linux/tick.h>
 #include <linux/mm.h>
 #include <linux/stackprotector.h>
+#include <linux/suspend.h>
 
 #include <asm/tlb.h>
 
@@ -96,6 +97,13 @@ static void cpuidle_idle_call(void)
 	 */
 	stop_critical_timings();
 
+	if (idle_should_freeze()) {
+		cpuidle_enter_freeze();
+		local_irq_enable();
+		start_critical_timings();
+		return;
+	}
+
 	/*
 	 * Tell the RCU framework we are entering an idle section,
 	 * so no more rcu read side critical sections and one more
@@ -220,6 +228,7 @@ static void cpu_idle_loop(void)
 			 * know that the IPI is going to arrive right
 			 * away
 			 */
+
 			if (cpu_idle_force_poll || tick_check_broadcast_expired())
 				cpu_idle_poll();
 			else
Index: linux-pm/drivers/cpuidle/cpuidle.c
===================================================================
--- linux-pm.orig/drivers/cpuidle/cpuidle.c
+++ linux-pm/drivers/cpuidle/cpuidle.c
@@ -19,6 +19,8 @@
 #include <linux/ktime.h>
 #include <linux/hrtimer.h>
 #include <linux/module.h>
+#include <linux/suspend.h>
+#include <linux/tick.h>
 #include <trace/events/power.h>
 
 #include "cpuidle.h"
@@ -32,7 +34,6 @@ LIST_HEAD(cpuidle_detected_devices);
 static int enabled_devices;
 static int off __read_mostly;
 static int initialized __read_mostly;
-static bool use_deepest_state __read_mostly;
 
 int cpuidle_disabled(void)
 {
@@ -66,36 +67,23 @@ int cpuidle_play_dead(void)
 }
 
 /**
- * cpuidle_use_deepest_state - Enable/disable the "deepest idle" mode.
- * @enable: Whether enable or disable the feature.
- *
- * If the "deepest idle" mode is enabled, cpuidle will ignore the governor and
- * always use the state with the greatest exit latency (out of the states that
- * are not disabled).
- *
- * This function can only be called after cpuidle_pause() to avoid races.
- */
-void cpuidle_use_deepest_state(bool enable)
-{
-	use_deepest_state = enable;
-}
-
-/**
- * cpuidle_find_deepest_state - Find the state of the greatest exit latency.
- * @drv: cpuidle driver for a given CPU.
- * @dev: cpuidle device for a given CPU.
+ * cpuidle_find_deepest_state - Find deepest state meeting specific conditions.
+ * @drv: cpuidle driver for the given CPU.
+ * @dev: cpuidle device for the given CPU.
+ * @freeze: Whether or not the state should be suitable for suspend-to-idle.
  */
 static int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
-				      struct cpuidle_device *dev)
+				      struct cpuidle_device *dev, bool freeze)
 {
 	unsigned int latency_req = 0;
-	int i, ret = CPUIDLE_DRIVER_STATE_START - 1;
+	int i, ret = freeze ? -1 : CPUIDLE_DRIVER_STATE_START - 1;
 
 	for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
 		struct cpuidle_state *s = &drv->states[i];
 		struct cpuidle_state_usage *su = &dev->states_usage[i];
 
-		if (s->disabled || su->disable || s->exit_latency <= latency_req)
+		if (s->disabled || su->disable || s->exit_latency <= latency_req
+		    || (freeze && !s->enter_freeze))
 			continue;
 
 		latency_req = s->exit_latency;
@@ -104,6 +92,52 @@ static int cpuidle_find_deepest_state(st
 	return ret;
 }
 
+static void enter_freeze_proper(struct cpuidle_driver *drv,
+				struct cpuidle_device *dev, int index)
+{
+	tick_freeze();
+	drv->states[index].enter_freeze(dev, drv, index);
+	tick_unfreeze();
+}
+
+/**
+ * cpuidle_enter_freeze - Enter an idle state suitable for suspend-to-idle.
+ *
+ * If there are states with the ->enter_freeze callback, find the deepest of
+ * them and enter it with frozen tick.  Otherwise, find the deepest state
+ * available and enter it normally.
+ */
+void cpuidle_enter_freeze(void)
+{
+	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+	int index;
+
+	/*
+	 * Find the deepest state with ->enter_freeze present, which guarantees
+	 * that interrupts won't be enabled when it exits and allows the tick to
+	 * be frozen safely.
+	 */
+	index = cpuidle_find_deepest_state(drv, dev, true);
+	if (index >= 0) {
+		enter_freeze_proper(drv, dev, index);
+		return;
+	}
+
+	/*
+	 * It is not safe to freeze the tick, find the deepest state available
+	 * at all and try to enter it normally.
+	 */
+	index = cpuidle_find_deepest_state(drv, dev, false);
+	if (index >= 0)
+		cpuidle_enter(drv, dev, index);
+	else
+		arch_cpu_idle();
+
+	/* Interrupts are enabled again here. */
+	local_irq_disable();
+}
+
 /**
  * cpuidle_enter_state - enter the state and update stats
  * @dev: cpuidle device for this cpu
@@ -166,9 +200,6 @@ int cpuidle_select(struct cpuidle_driver
 	if (!drv || !dev || !dev->enabled)
 		return -EBUSY;
 
-	if (unlikely(use_deepest_state))
-		return cpuidle_find_deepest_state(drv, dev);
-
 	return cpuidle_curr_governor->select(drv, dev);
 }
 
@@ -200,7 +231,7 @@ int cpuidle_enter(struct cpuidle_driver
  */
 void cpuidle_reflect(struct cpuidle_device *dev, int index)
 {
-	if (cpuidle_curr_governor->reflect && !unlikely(use_deepest_state))
+	if (cpuidle_curr_governor->reflect)
 		cpuidle_curr_governor->reflect(dev, index);
 }
 
Index: linux-pm/include/linux/cpuidle.h
===================================================================
--- linux-pm.orig/include/linux/cpuidle.h
+++ linux-pm/include/linux/cpuidle.h
@@ -50,6 +50,10 @@ struct cpuidle_state {
 			int index);
 
 	int (*enter_dead) (struct cpuidle_device *dev, int index);
+
+	void (*enter_freeze) (struct cpuidle_device *dev,
+			      struct cpuidle_driver *drv,
+			      int index);
 };
 
 /* Idle State Flags */
@@ -141,7 +145,7 @@ extern void cpuidle_resume(void);
 extern int cpuidle_enable_device(struct cpuidle_device *dev);
 extern void cpuidle_disable_device(struct cpuidle_device *dev);
 extern int cpuidle_play_dead(void);
-extern void cpuidle_use_deepest_state(bool enable);
+extern void cpuidle_enter_freeze(void);
 
 extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev);
 #else
@@ -174,7 +178,7 @@ static inline int cpuidle_enable_device(
 {return -ENODEV; }
 static inline void cpuidle_disable_device(struct cpuidle_device *dev) { }
 static inline int cpuidle_play_dead(void) {return -ENODEV; }
-static inline void cpuidle_use_deepest_state(bool enable) {}
+static inline void cpuidle_enter_freeze(void) { }
 static inline struct cpuidle_driver *cpuidle_get_cpu_driver(
 	struct cpuidle_device *dev) {return NULL; }
 #endif
Index: linux-pm/kernel/time/tick-common.c
===================================================================
--- linux-pm.orig/kernel/time/tick-common.c
+++ linux-pm/kernel/time/tick-common.c
@@ -394,6 +394,56 @@ void tick_resume(void)
 	}
 }
 
+static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
+static unsigned int tick_freeze_depth;
+
+/**
+ * tick_freeze - Suspend the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the last online CPU executing the function and if so,
+ * suspend timekeeping.  Otherwise suspend the local tick.
+ *
+ * Call with interrupts disabled.  Must be balanced with %tick_unfreeze().
+ * Interrupts must not be enabled before the subsequent %tick_unfreeze().
+ */
+void tick_freeze(void)
+{
+	raw_spin_lock(&tick_freeze_lock);
+
+	tick_freeze_depth++;
+	if (tick_freeze_depth == num_online_cpus()) {
+		timekeeping_suspend();
+	} else {
+		tick_suspend();
+		tick_suspend_broadcast();
+	}
+
+	raw_spin_unlock(&tick_freeze_lock);
+}
+
+/**
+ * tick_unfreeze - Resume the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the first CPU executing the function and if so, resume
+ * timekeeping.  Otherwise resume the local tick.
+ *
+ * Call with interrupts disabled.  Must be balanced with %tick_freeze().
+ * Interrupts must not be enabled after the preceding %tick_freeze().
+ */
+void tick_unfreeze(void)
+{
+	raw_spin_lock(&tick_freeze_lock);
+
+	if (tick_freeze_depth == num_online_cpus())
+		timekeeping_resume();
+	else
+		tick_resume();
+
+	tick_freeze_depth--;
+
+	raw_spin_unlock(&tick_freeze_lock);
+}
+
 /**
  * tick_init - initialize the tick control
  */
Index: linux-pm/include/linux/tick.h
===================================================================
--- linux-pm.orig/include/linux/tick.h
+++ linux-pm/include/linux/tick.h
@@ -226,5 +226,7 @@ static inline void tick_nohz_task_switch
 		__tick_nohz_task_switch(tsk);
 }
 
+extern void tick_freeze(void);
+extern void tick_unfreeze(void);
 
 #endif
Index: linux-pm/kernel/time/timekeeping.c
===================================================================
--- linux-pm.orig/kernel/time/timekeeping.c
+++ linux-pm/kernel/time/timekeeping.c
@@ -1170,7 +1170,7 @@ void timekeeping_inject_sleeptime64(stru
  * xtime/wall_to_monotonic/jiffies/etc are
  * still managed by arch specific suspend/resume code.
  */
-static void timekeeping_resume(void)
+void timekeeping_resume(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	struct clocksource *clock = tk->tkr.clock;
@@ -1251,7 +1251,7 @@ static void timekeeping_resume(void)
 	hrtimers_resume();
 }
 
-static int timekeeping_suspend(void)
+int timekeeping_suspend(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned long flags;
Index: linux-pm/kernel/time/timekeeping.h
===================================================================
--- linux-pm.orig/kernel/time/timekeeping.h
+++ linux-pm/kernel/time/timekeeping.h
@@ -16,5 +16,7 @@ extern int timekeeping_inject_offset(str
 extern s32 timekeeping_get_tai_offset(void);
 extern void timekeeping_set_tai_offset(s32 tai_offset);
 extern void timekeeping_clocktai(struct timespec *ts);
+extern int timekeeping_suspend(void);
+extern void timekeeping_resume(void);
 
 #endif


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [Update] Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-01-29 22:20           ` Rafael J. Wysocki
@ 2015-02-06  1:20             ` Rafael J. Wysocki
  2015-02-06 16:14               ` Peter Zijlstra
  2015-02-09  2:54               ` [Update 2x] " Rafael J. Wysocki
  0 siblings, 2 replies; 29+ messages in thread
From: Rafael J. Wysocki @ 2015-02-06  1:20 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Li, Aubrey, Peter Zijlstra, Brown, Len, Alan Cox, LKML, Linux PM list


On Thursday, January 29, 2015 11:20:10 PM Rafael J. Wysocki wrote:
> On Monday, January 26, 2015 10:40:24 AM Thomas Gleixner wrote:
> > On Mon, 26 Jan 2015, Li, Aubrey wrote:
> > > On 2015/1/22 18:15, Thomas Gleixner wrote:

[cut]

> > 
> > You need to make sure in the low level idle implementation that this
> > cannot happen.
> > 
> > tick_freeze()
> > {
> > 	raw_spin_lock(&tick_freeze_lock);
> > 	tick_frozen++;
> > 	if (tick_frozen == num_online_cpus())
> > 		timekeeping_suspend();
> > 	else
> > 		tick_suspend_local();
> > 	raw_spin_unlock(&tick_freeze_lock);
> > }
> > 
> > tick_unfreeze()
> > {
> > 	raw_spin_lock(&tick_freeze_lock);
> > 	if (tick_frozen == num_online_cpus())
> > 		timekeeping_resume();
> > 	else
> > 		tick_resume_local();
> > 	tick_frozen--;
> > 	raw_spin_unlock(&tick_freeze_lock);
> > }
> > 
> > idle_freeze()
> > {
> > 	local_irq_disable();
> > 
> > 	tick_freeze();
> > 
> > 	/* Must keep interrupts disabled! */
> >        	go_deep_idle()
> > 
> > 	tick_unfreeze();
> > 
> > 	local_irq_enable();
> > }
> > 
> > That's the only way you can do it proper, everything else will just be
> > a horrible mess of bandaids and duct tape.
> > 
> > So that does not need any of the irq_enter/exit conditionals, it does
> > not need the real_handler hack. It just works.
> > 
> > The only remaining issue might be a NMI calling into
> > ktime_get_mono_fast_ns() before timekeeping is resumed. Its probably a
> > non issue on x86/tsc, but it might be a problem on other platforms
> > which turn off devices, clocks, etc. It's not rocket science to prevent
> > that.
> 
> So the patch below is my version of the $subject one trying to take the
> above suggestion into account.  It does not address the ktime_get_mono_fast_ns()
> in NMI case at this stage, because quite frankly I'd prefer to get the core
> changes right in the first place.
> 
> The new ->enter_freeze callback is there in struct cpuidle_state, because we
> generally cannot trust the existing ->enter callbacks to keep interrupts
> disabled all the time.  Some of them enable interrupts temporarily and
> some idle entry methods enable interrupts automatically on exit.  So
> ->enter_freeze is required to keep interrupts disabled and it is safe to
> suspend/resume timekeeping around it.
> 
> The rest is reasonably straightforward.  If suspend-to-idle is requested,
> cpuidle_idle_call() calls cpuidle_enter_freeze() that bypasses the governor
> and selects the deepest suitable state.  If ->enter_freeze is available,
> it will be used along with tick_freeze()/tick_unfreeze() and the normal
> idle entry code path will be used otherwise.

In the meantime I learned how to tell RCU that it actually should track stuff
inside of the idle loop, so cpuidle_enter_freeze() can go under the
rcu_idle_enter/exit() now which makes it look more like "real" idle.

Also I noticed that freeze_enter() could lose a wakeup event if it happened
between dpm_suspend_noirq() and the point where suspend_freeze_state is set,
so that race should be closed now.

In my testing (with a hacked ACPI cpuidle driver) I get a number of wakeups
(usually no more than 30) in enter_freeze_proper() per suspend-to-idle cycle,
but that number doesn't seem to depend on the length of sleep, so my theory
is that this happens early until things relax sufficiently.

ktime_get_mono_fast_ns() is not covered yet, but I'm going to get to that next.

The current patch is appended, please let me know what you think.


---
 drivers/cpuidle/cpuidle.c |   88 ++++++++++++++++++++++++++++++++--------------
 include/linux/cpuidle.h   |    8 +++-
 include/linux/suspend.h   |    2 +
 include/linux/tick.h      |    2 +
 kernel/power/suspend.c    |   55 +++++++++++++++++++++++++---
 kernel/sched/idle.c       |    7 +++
 kernel/time/tick-common.c |   50 ++++++++++++++++++++++++++
 kernel/time/timekeeping.c |    4 +-
 kernel/time/timekeeping.h |    2 +
 9 files changed, 181 insertions(+), 37 deletions(-)

Index: linux-pm/kernel/power/suspend.c
===================================================================
--- linux-pm.orig/kernel/power/suspend.c
+++ linux-pm/kernel/power/suspend.c
@@ -37,7 +37,21 @@ const char *pm_states[PM_SUSPEND_MAX];
 static const struct platform_suspend_ops *suspend_ops;
 static const struct platform_freeze_ops *freeze_ops;
 static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
-static bool suspend_freeze_wake;
+
+/* Suspend-to-idle state machine. */
+enum freeze_state {
+	FREEZE_STATE_NONE,      /* Not suspended/suspending. */
+	FREEZE_STATE_ENTER,     /* Enter suspend-to-idle. */
+	FREEZE_STATE_WAKE,      /* Wake up from suspend-to-idle. */
+};
+
+static enum freeze_state suspend_freeze_state;
+static DEFINE_SPINLOCK(suspend_freeze_lock);
+
+bool idle_should_freeze(void)
+{
+	return suspend_freeze_state == FREEZE_STATE_ENTER;
+}
 
 void freeze_set_ops(const struct platform_freeze_ops *ops)
 {
@@ -48,22 +62,49 @@ void freeze_set_ops(const struct platfor
 
 static void freeze_begin(void)
 {
-	suspend_freeze_wake = false;
+	suspend_freeze_state = FREEZE_STATE_NONE;
 }
 
 static void freeze_enter(void)
 {
-	cpuidle_use_deepest_state(true);
+	spin_lock_irq(&suspend_freeze_lock);
+	if (pm_wakeup_pending())
+		goto out;
+
+	suspend_freeze_state = FREEZE_STATE_ENTER;
+	spin_unlock_irq(&suspend_freeze_lock);
+
+	get_online_cpus();
 	cpuidle_resume();
-	wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+
+	/* Push all the CPUs into the idle loop. */
+	wake_up_all_idle_cpus();
+	pr_debug("PM: suspend-to-idle\n");
+	/* Make the current CPU wait so it can enter the idle loop too. */
+	wait_event(suspend_freeze_wait_head,
+		   suspend_freeze_state == FREEZE_STATE_WAKE);
+	pr_debug("PM: resume from suspend-to-idle\n");
+
 	cpuidle_pause();
-	cpuidle_use_deepest_state(false);
+	put_online_cpus();
+
+	spin_lock_irq(&suspend_freeze_lock);
+
+ out:
+	suspend_freeze_state = FREEZE_STATE_NONE;
+	spin_unlock_irq(&suspend_freeze_lock);
 }
 
 void freeze_wake(void)
 {
-	suspend_freeze_wake = true;
-	wake_up(&suspend_freeze_wait_head);
+	unsigned long flags;
+
+	spin_lock_irqsave(&suspend_freeze_lock, flags);
+	if (suspend_freeze_state > FREEZE_STATE_NONE) {
+		suspend_freeze_state = FREEZE_STATE_WAKE;
+		wake_up(&suspend_freeze_wait_head);
+	}
+	spin_unlock_irqrestore(&suspend_freeze_lock, flags);
 }
 EXPORT_SYMBOL_GPL(freeze_wake);
 
Index: linux-pm/include/linux/suspend.h
===================================================================
--- linux-pm.orig/include/linux/suspend.h
+++ linux-pm/include/linux/suspend.h
@@ -203,6 +203,7 @@ extern void suspend_set_ops(const struct
 extern int suspend_valid_only_mem(suspend_state_t state);
 extern void freeze_set_ops(const struct platform_freeze_ops *ops);
 extern void freeze_wake(void);
+extern bool idle_should_freeze(void);
 
 /**
  * arch_suspend_disable_irqs - disable IRQs for suspend
@@ -230,6 +231,7 @@ static inline void suspend_set_ops(const
 static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; }
 static inline void freeze_set_ops(const struct platform_freeze_ops *ops) {}
 static inline void freeze_wake(void) {}
+static inline bool idle_should_freeze(void) { return false; }
 #endif /* !CONFIG_SUSPEND */
 
 /* struct pbe is used for creating lists of pages that should be restored
Index: linux-pm/kernel/sched/idle.c
===================================================================
--- linux-pm.orig/kernel/sched/idle.c
+++ linux-pm/kernel/sched/idle.c
@@ -7,6 +7,7 @@
 #include <linux/tick.h>
 #include <linux/mm.h>
 #include <linux/stackprotector.h>
+#include <linux/suspend.h>
 
 #include <asm/tlb.h>
 
@@ -103,6 +104,12 @@ static void cpuidle_idle_call(void)
 	 */
 	rcu_idle_enter();
 
+	if (idle_should_freeze()) {
+		cpuidle_enter_freeze();
+		local_irq_enable();
+		goto exit_idle;
+	}
+
 	/*
 	 * Ask the cpuidle framework to choose a convenient idle state.
 	 * Fall back to the default arch idle method on errors.
Index: linux-pm/drivers/cpuidle/cpuidle.c
===================================================================
--- linux-pm.orig/drivers/cpuidle/cpuidle.c
+++ linux-pm/drivers/cpuidle/cpuidle.c
@@ -19,6 +19,8 @@
 #include <linux/ktime.h>
 #include <linux/hrtimer.h>
 #include <linux/module.h>
+#include <linux/suspend.h>
+#include <linux/tick.h>
 #include <trace/events/power.h>
 
 #include "cpuidle.h"
@@ -32,7 +34,6 @@ LIST_HEAD(cpuidle_detected_devices);
 static int enabled_devices;
 static int off __read_mostly;
 static int initialized __read_mostly;
-static bool use_deepest_state __read_mostly;
 
 int cpuidle_disabled(void)
 {
@@ -66,36 +67,23 @@ int cpuidle_play_dead(void)
 }
 
 /**
- * cpuidle_use_deepest_state - Enable/disable the "deepest idle" mode.
- * @enable: Whether enable or disable the feature.
- *
- * If the "deepest idle" mode is enabled, cpuidle will ignore the governor and
- * always use the state with the greatest exit latency (out of the states that
- * are not disabled).
- *
- * This function can only be called after cpuidle_pause() to avoid races.
- */
-void cpuidle_use_deepest_state(bool enable)
-{
-	use_deepest_state = enable;
-}
-
-/**
- * cpuidle_find_deepest_state - Find the state of the greatest exit latency.
- * @drv: cpuidle driver for a given CPU.
- * @dev: cpuidle device for a given CPU.
+ * cpuidle_find_deepest_state - Find deepest state meeting specific conditions.
+ * @drv: cpuidle driver for the given CPU.
+ * @dev: cpuidle device for the given CPU.
+ * @freeze: Whether or not the state should be suitable for suspend-to-idle.
  */
 static int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
-				      struct cpuidle_device *dev)
+				      struct cpuidle_device *dev, bool freeze)
 {
 	unsigned int latency_req = 0;
-	int i, ret = CPUIDLE_DRIVER_STATE_START - 1;
+	int i, ret = freeze ? -1 : CPUIDLE_DRIVER_STATE_START - 1;
 
 	for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
 		struct cpuidle_state *s = &drv->states[i];
 		struct cpuidle_state_usage *su = &dev->states_usage[i];
 
-		if (s->disabled || su->disable || s->exit_latency <= latency_req)
+		if (s->disabled || su->disable || s->exit_latency <= latency_req
+		    || (freeze && !s->enter_freeze))
 			continue;
 
 		latency_req = s->exit_latency;
@@ -104,6 +92,57 @@ static int cpuidle_find_deepest_state(st
 	return ret;
 }
 
+static void enter_freeze_proper(struct cpuidle_driver *drv,
+				struct cpuidle_device *dev, int index)
+{
+	tick_freeze();
+	drv->states[index].enter_freeze(dev, drv, index);
+	/*
+	 * timekeeping_resume() that will be called by tick_unfreeze() for the
+	 * last CPU executing it calls functions containing RCU read-side
+	 * critical sections, so tell RCU about that.
+	 */
+	RCU_NONIDLE(tick_unfreeze());
+}
+
+/**
+ * cpuidle_enter_freeze - Enter an idle state suitable for suspend-to-idle.
+ *
+ * If there are states with the ->enter_freeze callback, find the deepest of
+ * them and enter it with frozen tick.  Otherwise, find the deepest state
+ * available and enter it normally.
+ */
+void cpuidle_enter_freeze(void)
+{
+	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+	int index;
+
+	/*
+	 * Find the deepest state with ->enter_freeze present, which guarantees
+	 * that interrupts won't be enabled when it exits and allows the tick to
+	 * be frozen safely.
+	 */
+	index = cpuidle_find_deepest_state(drv, dev, true);
+	if (index >= 0) {
+		enter_freeze_proper(drv, dev, index);
+		return;
+	}
+
+	/*
+	 * It is not safe to freeze the tick, find the deepest state available
+	 * at all and try to enter it normally.
+	 */
+	index = cpuidle_find_deepest_state(drv, dev, false);
+	if (index >= 0)
+		cpuidle_enter(drv, dev, index);
+	else
+		arch_cpu_idle();
+
+	/* Interrupts are enabled again here. */
+	local_irq_disable();
+}
+
 /**
  * cpuidle_enter_state - enter the state and update stats
  * @dev: cpuidle device for this cpu
@@ -166,9 +205,6 @@ int cpuidle_select(struct cpuidle_driver
 	if (!drv || !dev || !dev->enabled)
 		return -EBUSY;
 
-	if (unlikely(use_deepest_state))
-		return cpuidle_find_deepest_state(drv, dev);
-
 	return cpuidle_curr_governor->select(drv, dev);
 }
 
@@ -200,7 +236,7 @@ int cpuidle_enter(struct cpuidle_driver
  */
 void cpuidle_reflect(struct cpuidle_device *dev, int index)
 {
-	if (cpuidle_curr_governor->reflect && !unlikely(use_deepest_state))
+	if (cpuidle_curr_governor->reflect)
 		cpuidle_curr_governor->reflect(dev, index);
 }
 
Index: linux-pm/include/linux/cpuidle.h
===================================================================
--- linux-pm.orig/include/linux/cpuidle.h
+++ linux-pm/include/linux/cpuidle.h
@@ -50,6 +50,10 @@ struct cpuidle_state {
 			int index);
 
 	int (*enter_dead) (struct cpuidle_device *dev, int index);
+
+	void (*enter_freeze) (struct cpuidle_device *dev,
+			      struct cpuidle_driver *drv,
+			      int index);
 };
 
 /* Idle State Flags */
@@ -141,7 +145,7 @@ extern void cpuidle_resume(void);
 extern int cpuidle_enable_device(struct cpuidle_device *dev);
 extern void cpuidle_disable_device(struct cpuidle_device *dev);
 extern int cpuidle_play_dead(void);
-extern void cpuidle_use_deepest_state(bool enable);
+extern void cpuidle_enter_freeze(void);
 
 extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev);
 #else
@@ -174,7 +178,7 @@ static inline int cpuidle_enable_device(
 {return -ENODEV; }
 static inline void cpuidle_disable_device(struct cpuidle_device *dev) { }
 static inline int cpuidle_play_dead(void) {return -ENODEV; }
-static inline void cpuidle_use_deepest_state(bool enable) {}
+static inline void cpuidle_enter_freeze(void) { }
 static inline struct cpuidle_driver *cpuidle_get_cpu_driver(
 	struct cpuidle_device *dev) {return NULL; }
 #endif
Index: linux-pm/kernel/time/tick-common.c
===================================================================
--- linux-pm.orig/kernel/time/tick-common.c
+++ linux-pm/kernel/time/tick-common.c
@@ -394,6 +394,56 @@ void tick_resume(void)
 	}
 }
 
+static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
+static unsigned int tick_freeze_depth;
+
+/**
+ * tick_freeze - Suspend the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the last online CPU executing the function and if so,
+ * suspend timekeeping.  Otherwise suspend the local tick.
+ *
+ * Call with interrupts disabled.  Must be balanced with %tick_unfreeze().
+ * Interrupts must not be enabled before the subsequent %tick_unfreeze().
+ */
+void tick_freeze(void)
+{
+	raw_spin_lock(&tick_freeze_lock);
+
+	tick_freeze_depth++;
+	if (tick_freeze_depth == num_online_cpus()) {
+		timekeeping_suspend();
+	} else {
+		tick_suspend();
+		tick_suspend_broadcast();
+	}
+
+	raw_spin_unlock(&tick_freeze_lock);
+}
+
+/**
+ * tick_unfreeze - Resume the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the first CPU executing the function and if so, resume
+ * timekeeping.  Otherwise resume the local tick.
+ *
+ * Call with interrupts disabled.  Must be balanced with %tick_freeze().
+ * Interrupts must not be enabled after the preceding %tick_freeze().
+ */
+void tick_unfreeze(void)
+{
+	raw_spin_lock(&tick_freeze_lock);
+
+	if (tick_freeze_depth == num_online_cpus())
+		timekeeping_resume();
+	else
+		tick_resume();
+
+	tick_freeze_depth--;
+
+	raw_spin_unlock(&tick_freeze_lock);
+}
+
 /**
  * tick_init - initialize the tick control
  */
Index: linux-pm/include/linux/tick.h
===================================================================
--- linux-pm.orig/include/linux/tick.h
+++ linux-pm/include/linux/tick.h
@@ -226,5 +226,7 @@ static inline void tick_nohz_task_switch
 		__tick_nohz_task_switch(tsk);
 }
 
+extern void tick_freeze(void);
+extern void tick_unfreeze(void);
 
 #endif
Index: linux-pm/kernel/time/timekeeping.c
===================================================================
--- linux-pm.orig/kernel/time/timekeeping.c
+++ linux-pm/kernel/time/timekeeping.c
@@ -1170,7 +1170,7 @@ void timekeeping_inject_sleeptime64(stru
  * xtime/wall_to_monotonic/jiffies/etc are
  * still managed by arch specific suspend/resume code.
  */
-static void timekeeping_resume(void)
+void timekeeping_resume(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	struct clocksource *clock = tk->tkr.clock;
@@ -1251,7 +1251,7 @@ static void timekeeping_resume(void)
 	hrtimers_resume();
 }
 
-static int timekeeping_suspend(void)
+int timekeeping_suspend(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned long flags;
Index: linux-pm/kernel/time/timekeeping.h
===================================================================
--- linux-pm.orig/kernel/time/timekeeping.h
+++ linux-pm/kernel/time/timekeeping.h
@@ -16,5 +16,7 @@ extern int timekeeping_inject_offset(str
 extern s32 timekeeping_get_tai_offset(void);
 extern void timekeeping_set_tai_offset(s32 tai_offset);
 extern void timekeeping_clocktai(struct timespec *ts);
+extern int timekeeping_suspend(void);
+extern void timekeeping_resume(void);
 
 #endif


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Update] Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-02-06  1:20             ` [Update] " Rafael J. Wysocki
@ 2015-02-06 16:14               ` Peter Zijlstra
  2015-02-06 18:29                 ` Peter Zijlstra
  2015-02-09  2:54               ` [Update 2x] " Rafael J. Wysocki
  1 sibling, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2015-02-06 16:14 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Thomas Gleixner, Li, Aubrey, Brown, Len, Alan Cox, LKML, Linux PM list

On Fri, Feb 06, 2015 at 02:20:13AM +0100, Rafael J. Wysocki wrote:
>  void freeze_wake(void)
>  {
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&suspend_freeze_lock, flags);
> +	if (suspend_freeze_state > FREEZE_STATE_NONE) {
> +		suspend_freeze_state = FREEZE_STATE_WAKE;
> +		wake_up(&suspend_freeze_wait_head);
> +	}
> +	spin_unlock_irqrestore(&suspend_freeze_lock, flags);
>  }


> +static void enter_freeze_proper(struct cpuidle_driver *drv,
> +				struct cpuidle_device *dev, int index)
> +{
> +	tick_freeze();
> +	drv->states[index].enter_freeze(dev, drv, index);
> +	/*
> +	 * timekeeping_resume() that will be called by tick_unfreeze() for the
> +	 * last CPU executing it calls functions containing RCU read-side
> +	 * critical sections, so tell RCU about that.
> +	 */
> +	RCU_NONIDLE(tick_unfreeze());
> +}

So I'm a wee bit confused; if we use an enter_freeze() state that keeps
interrupts disabled; who is going to call the freeze_wake() thing?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Update] Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-02-06 16:14               ` Peter Zijlstra
@ 2015-02-06 18:29                 ` Peter Zijlstra
  2015-02-06 22:36                   ` Rafael J. Wysocki
  0 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2015-02-06 18:29 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Thomas Gleixner, Li, Aubrey, Brown, Len, Alan Cox, LKML, Linux PM list

On Fri, Feb 06, 2015 at 05:14:54PM +0100, Peter Zijlstra wrote:
> On Fri, Feb 06, 2015 at 02:20:13AM +0100, Rafael J. Wysocki wrote:
> >  void freeze_wake(void)
> >  {
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&suspend_freeze_lock, flags);
> > +	if (suspend_freeze_state > FREEZE_STATE_NONE) {
> > +		suspend_freeze_state = FREEZE_STATE_WAKE;
> > +		wake_up(&suspend_freeze_wait_head);
> > +	}
> > +	spin_unlock_irqrestore(&suspend_freeze_lock, flags);
> >  }
> 
> 
> > +static void enter_freeze_proper(struct cpuidle_driver *drv,
> > +				struct cpuidle_device *dev, int index)
> > +{
> > +	tick_freeze();
> > +	drv->states[index].enter_freeze(dev, drv, index);
> > +	/*
> > +	 * timekeeping_resume() that will be called by tick_unfreeze() for the
> > +	 * last CPU executing it calls functions containing RCU read-side
> > +	 * critical sections, so tell RCU about that.
> > +	 */
> > +	RCU_NONIDLE(tick_unfreeze());
> > +}
> 
> So I'm a wee bit confused; if we use an enter_freeze() state that keeps
> interrupts disabled; who is going to call the freeze_wake() thing?

Ah, I think I see, so we wake up, keep the interrupt pending, re-enable
the tick and time and everybody, then re-enable interrupts, take the
interrupt and go around the idle loop to find we need a reschedule etc..

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Update] Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-02-06 18:29                 ` Peter Zijlstra
@ 2015-02-06 22:36                   ` Rafael J. Wysocki
  2015-02-09  9:49                     ` Peter Zijlstra
  0 siblings, 1 reply; 29+ messages in thread
From: Rafael J. Wysocki @ 2015-02-06 22:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Li, Aubrey, Brown, Len, Alan Cox, LKML, Linux PM list

On Friday, February 06, 2015 07:29:22 PM Peter Zijlstra wrote:
> On Fri, Feb 06, 2015 at 05:14:54PM +0100, Peter Zijlstra wrote:
> > On Fri, Feb 06, 2015 at 02:20:13AM +0100, Rafael J. Wysocki wrote:
> > >  void freeze_wake(void)
> > >  {
> > > +	unsigned long flags;
> > > +
> > > +	spin_lock_irqsave(&suspend_freeze_lock, flags);
> > > +	if (suspend_freeze_state > FREEZE_STATE_NONE) {
> > > +		suspend_freeze_state = FREEZE_STATE_WAKE;
> > > +		wake_up(&suspend_freeze_wait_head);
> > > +	}
> > > +	spin_unlock_irqrestore(&suspend_freeze_lock, flags);
> > >  }
> > 
> > 
> > > +static void enter_freeze_proper(struct cpuidle_driver *drv,
> > > +				struct cpuidle_device *dev, int index)
> > > +{
> > > +	tick_freeze();
> > > +	drv->states[index].enter_freeze(dev, drv, index);
> > > +	/*
> > > +	 * timekeeping_resume() that will be called by tick_unfreeze() for the
> > > +	 * last CPU executing it calls functions containing RCU read-side
> > > +	 * critical sections, so tell RCU about that.
> > > +	 */
> > > +	RCU_NONIDLE(tick_unfreeze());
> > > +}
> > 
> > So I'm a wee bit confused; if we use an enter_freeze() state that keeps
> > interrupts disabled; who is going to call the freeze_wake() thing?
> 
> Ah, I think I see, so we wake up, keep the interrupt pending, re-enable
> the tick and time and everybody, then re-enable interrupts, take the
> interrupt and go around the idle loop to find we need a reschedule etc..

Exactly.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [Update 2x] Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-02-06  1:20             ` [Update] " Rafael J. Wysocki
  2015-02-06 16:14               ` Peter Zijlstra
@ 2015-02-09  2:54               ` Rafael J. Wysocki
  2015-02-09 15:20                 ` Peter Zijlstra
  2015-02-09 15:44                 ` Peter Zijlstra
  1 sibling, 2 replies; 29+ messages in thread
From: Rafael J. Wysocki @ 2015-02-09  2:54 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Li, Aubrey, Peter Zijlstra, Brown, Len, Alan Cox, LKML, Linux PM list

On Friday, February 06, 2015 02:20:13 AM Rafael J. Wysocki wrote:
> 
> On Thursday, January 29, 2015 11:20:10 PM Rafael J. Wysocki wrote:
> > On Monday, January 26, 2015 10:40:24 AM Thomas Gleixner wrote:
> > > On Mon, 26 Jan 2015, Li, Aubrey wrote:
> > > > On 2015/1/22 18:15, Thomas Gleixner wrote:
> 
> [cut]
> 
> > > 
> > > You need to make sure in the low level idle implementation that this
> > > cannot happen.
> > > 
> > > tick_freeze()
> > > {
> > > 	raw_spin_lock(&tick_freeze_lock);
> > > 	tick_frozen++;
> > > 	if (tick_frozen == num_online_cpus())
> > > 		timekeeping_suspend();
> > > 	else
> > > 		tick_suspend_local();
> > > 	raw_spin_unlock(&tick_freeze_lock);
> > > }
> > > 
> > > tick_unfreeze()
> > > {
> > > 	raw_spin_lock(&tick_freeze_lock);
> > > 	if (tick_frozen == num_online_cpus())
> > > 		timekeeping_resume();
> > > 	else
> > > 		tick_resume_local();
> > > 	tick_frozen--;
> > > 	raw_spin_unlock(&tick_freeze_lock);
> > > }
> > > 
> > > idle_freeze()
> > > {
> > > 	local_irq_disable();
> > > 
> > > 	tick_freeze();
> > > 
> > > 	/* Must keep interrupts disabled! */
> > >        	go_deep_idle()
> > > 
> > > 	tick_unfreeze();
> > > 
> > > 	local_irq_enable();
> > > }
> > > 
> > > That's the only way you can do it proper, everything else will just be
> > > a horrible mess of bandaids and duct tape.
> > > 
> > > So that does not need any of the irq_enter/exit conditionals, it does
> > > not need the real_handler hack. It just works.
> > > 
> > > The only remaining issue might be a NMI calling into
> > > ktime_get_mono_fast_ns() before timekeeping is resumed. Its probably a
> > > non issue on x86/tsc, but it might be a problem on other platforms
> > > which turn off devices, clocks, It's not rocket science to prevent
> > > that.
> > 
> > So the patch below is my version of the $subject one trying to take the
> > above suggestion into account.  It does not address the ktime_get_mono_fast_ns()
> > in NMI case at this stage, because quite frankly I'd prefer to get the core
> > changes right in the first place.
> > 
> > The new ->enter_freeze callback is there in struct cpuidle_state, because we
> > generally cannot trust the existing ->enter callbacks to keep interrupts
> > disabled all the time.  Some of them enable interrupts temporarily and
> > some idle entry methods enable interrupts automatically on exit.  So
> > ->enter_freeze is required to keep interrupts disabled and it is safe to
> > suspend/resume timekeeping around it.
> > 
> > The rest is reasonably straightforward.  If suspend-to-idle is requested,
> > cpuidle_idle_call() calls cpuidle_enter_freeze() that bypasses the governor
> > and selects the deepest suitable state.  If ->enter_freeze is available,
> > it will be used along with tick_freeze()/tick_unfreeze() and the normal
> > idle entry code path will be used otherwise.
> 
> In the meantime I learned how to tell RCU that it actually should track stuff
> inside of the idle loop, so cpuidle_enter_freeze() can go under the
> rcu_idle_enter/exit() now which makes it look more like "real" idle.
> 
> Also I noticed that freeze_enter() could lose a wakeup event if it happened
> between dpm_suspend_noirq() and the point where suspend_freeze_state is set,
> so that race should be closed now.
> 
> In my testing (with a hacked ACPI cpuidle driver) I get a number of wakeups
> (usually no more than 30) in enter_freeze_proper() per suspend-to-idle cycle,
> but that number doesn't seem to depend on the length of sleep, so my theory
> is that this happens early until things relax sufficiently.
> 
> ktime_get_mono_fast_ns() is not covered yet, but I'm going to get to that next.

I'm not sure if this is what you meant, but here it goes.

The idea is to set up a dummy readout base for the fast timekeeper during
timekeeping_suspend() such that it will always return the same number of cycles.
After the last timekeeping_update() in timekeeping_suspend() we read the
clocksource and store the result as cycles_at_suspend.  The readout base
from the current timekeeper is copied onto the dummy and the ->read pointer
of the dummy is set to a routine unconditionally returning cycles_at_suspend.
Next, the dummy is passed to update_fast_timekeeper() (which has been modified
slightly to accept the readout base instead of a whole timekeeper).

Then, if I'm not mistaken, ktime_get_mono_fast_ns() should work until the
subsequent timekeeping_resume() and then the proper readout base for the
fast timekeeper will be restored by the timekeeping_update() called right
after we've cleared timekeeping_suspended.

Complete patch with that modification is appended.  In the next few days I'm
going to split it into smaller parts and send along with cpuidle driver
patches implementing ->enter_freeze.

Please let me know what you think.

Rafael


---
 drivers/cpuidle/cpuidle.c |   88 ++++++++++++++++++++++++++++++++--------------
 include/linux/cpuidle.h   |    8 +++-
 include/linux/suspend.h   |    2 +
 include/linux/tick.h      |    2 +
 kernel/power/suspend.c    |   55 +++++++++++++++++++++++++---
 kernel/sched/idle.c       |   16 ++++++++
 kernel/time/tick-common.c |   50 ++++++++++++++++++++++++++
 kernel/time/timekeeping.c |   35 +++++++++++++-----
 kernel/time/timekeeping.h |    2 +
 9 files changed, 213 insertions(+), 45 deletions(-)

Index: linux-pm/kernel/power/suspend.c
===================================================================
--- linux-pm.orig/kernel/power/suspend.c
+++ linux-pm/kernel/power/suspend.c
@@ -37,7 +37,21 @@ const char *pm_states[PM_SUSPEND_MAX];
 static const struct platform_suspend_ops *suspend_ops;
 static const struct platform_freeze_ops *freeze_ops;
 static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
-static bool suspend_freeze_wake;
+
+/* Suspend-to-idle state machine. */
+enum freeze_state {
+	FREEZE_STATE_NONE,      /* Not suspended/suspending. */
+	FREEZE_STATE_ENTER,     /* Enter suspend-to-idle. */
+	FREEZE_STATE_WAKE,      /* Wake up from suspend-to-idle. */
+};
+
+static enum freeze_state __read_mostly suspend_freeze_state;
+static DEFINE_SPINLOCK(suspend_freeze_lock);
+
+bool idle_should_freeze(void)
+{
+	return suspend_freeze_state == FREEZE_STATE_ENTER;
+}
 
 void freeze_set_ops(const struct platform_freeze_ops *ops)
 {
@@ -48,22 +62,49 @@ void freeze_set_ops(const struct platfor
 
 static void freeze_begin(void)
 {
-	suspend_freeze_wake = false;
+	suspend_freeze_state = FREEZE_STATE_NONE;
 }
 
 static void freeze_enter(void)
 {
-	cpuidle_use_deepest_state(true);
+	spin_lock_irq(&suspend_freeze_lock);
+	if (pm_wakeup_pending())
+		goto out;
+
+	suspend_freeze_state = FREEZE_STATE_ENTER;
+	spin_unlock_irq(&suspend_freeze_lock);
+
+	get_online_cpus();
 	cpuidle_resume();
-	wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+
+	/* Push all the CPUs into the idle loop. */
+	wake_up_all_idle_cpus();
+	pr_debug("PM: suspend-to-idle\n");
+	/* Make the current CPU wait so it can enter the idle loop too. */
+	wait_event(suspend_freeze_wait_head,
+		   suspend_freeze_state == FREEZE_STATE_WAKE);
+	pr_debug("PM: resume from suspend-to-idle\n");
+
 	cpuidle_pause();
-	cpuidle_use_deepest_state(false);
+	put_online_cpus();
+
+	spin_lock_irq(&suspend_freeze_lock);
+
+ out:
+	suspend_freeze_state = FREEZE_STATE_NONE;
+	spin_unlock_irq(&suspend_freeze_lock);
 }
 
 void freeze_wake(void)
 {
-	suspend_freeze_wake = true;
-	wake_up(&suspend_freeze_wait_head);
+	unsigned long flags;
+
+	spin_lock_irqsave(&suspend_freeze_lock, flags);
+	if (suspend_freeze_state > FREEZE_STATE_NONE) {
+		suspend_freeze_state = FREEZE_STATE_WAKE;
+		wake_up(&suspend_freeze_wait_head);
+	}
+	spin_unlock_irqrestore(&suspend_freeze_lock, flags);
 }
 EXPORT_SYMBOL_GPL(freeze_wake);
 
Index: linux-pm/include/linux/suspend.h
===================================================================
--- linux-pm.orig/include/linux/suspend.h
+++ linux-pm/include/linux/suspend.h
@@ -203,6 +203,7 @@ extern void suspend_set_ops(const struct
 extern int suspend_valid_only_mem(suspend_state_t state);
 extern void freeze_set_ops(const struct platform_freeze_ops *ops);
 extern void freeze_wake(void);
+extern bool idle_should_freeze(void);
 
 /**
  * arch_suspend_disable_irqs - disable IRQs for suspend
@@ -230,6 +231,7 @@ static inline void suspend_set_ops(const
 static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; }
 static inline void freeze_set_ops(const struct platform_freeze_ops *ops) {}
 static inline void freeze_wake(void) {}
+static inline bool idle_should_freeze(void) { return false; }
 #endif /* !CONFIG_SUSPEND */
 
 /* struct pbe is used for creating lists of pages that should be restored
Index: linux-pm/kernel/sched/idle.c
===================================================================
--- linux-pm.orig/kernel/sched/idle.c
+++ linux-pm/kernel/sched/idle.c
@@ -7,6 +7,7 @@
 #include <linux/tick.h>
 #include <linux/mm.h>
 #include <linux/stackprotector.h>
+#include <linux/suspend.h>
 
 #include <asm/tlb.h>
 
@@ -104,6 +105,21 @@ static void cpuidle_idle_call(void)
 	rcu_idle_enter();
 
 	/*
+	 * Suspend-to-idle ("freeze") is a system state in which all user space
+	 * has been frozen, all I/O devices have been suspended and the only
+	 * activity happens here and in interrupts (if any).  In that case bypass
+	 * the cpuidle governor and go straight for the deepest idle state
+	 * available.  Possibly also suspend the local tick and the entire
+	 * timekeeping to prevent timer interrupts from kicking us out of idle
+	 * until a proper wakeup interrupt happens.
+	 */
+	if (idle_should_freeze()) {
+		cpuidle_enter_freeze();
+		local_irq_enable();
+		goto exit_idle;
+	}
+
+	/*
 	 * Ask the cpuidle framework to choose a convenient idle state.
 	 * Fall back to the default arch idle method on errors.
 	 */
Index: linux-pm/drivers/cpuidle/cpuidle.c
===================================================================
--- linux-pm.orig/drivers/cpuidle/cpuidle.c
+++ linux-pm/drivers/cpuidle/cpuidle.c
@@ -19,6 +19,8 @@
 #include <linux/ktime.h>
 #include <linux/hrtimer.h>
 #include <linux/module.h>
+#include <linux/suspend.h>
+#include <linux/tick.h>
 #include <trace/events/power.h>
 
 #include "cpuidle.h"
@@ -32,7 +34,6 @@ LIST_HEAD(cpuidle_detected_devices);
 static int enabled_devices;
 static int off __read_mostly;
 static int initialized __read_mostly;
-static bool use_deepest_state __read_mostly;
 
 int cpuidle_disabled(void)
 {
@@ -66,36 +67,23 @@ int cpuidle_play_dead(void)
 }
 
 /**
- * cpuidle_use_deepest_state - Enable/disable the "deepest idle" mode.
- * @enable: Whether enable or disable the feature.
- *
- * If the "deepest idle" mode is enabled, cpuidle will ignore the governor and
- * always use the state with the greatest exit latency (out of the states that
- * are not disabled).
- *
- * This function can only be called after cpuidle_pause() to avoid races.
- */
-void cpuidle_use_deepest_state(bool enable)
-{
-	use_deepest_state = enable;
-}
-
-/**
- * cpuidle_find_deepest_state - Find the state of the greatest exit latency.
- * @drv: cpuidle driver for a given CPU.
- * @dev: cpuidle device for a given CPU.
+ * cpuidle_find_deepest_state - Find deepest state meeting specific conditions.
+ * @drv: cpuidle driver for the given CPU.
+ * @dev: cpuidle device for the given CPU.
+ * @freeze: Whether or not the state should be suitable for suspend-to-idle.
  */
 static int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
-				      struct cpuidle_device *dev)
+				      struct cpuidle_device *dev, bool freeze)
 {
 	unsigned int latency_req = 0;
-	int i, ret = CPUIDLE_DRIVER_STATE_START - 1;
+	int i, ret = freeze ? -1 : CPUIDLE_DRIVER_STATE_START - 1;
 
 	for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
 		struct cpuidle_state *s = &drv->states[i];
 		struct cpuidle_state_usage *su = &dev->states_usage[i];
 
-		if (s->disabled || su->disable || s->exit_latency <= latency_req)
+		if (s->disabled || su->disable || s->exit_latency <= latency_req
+		    || (freeze && !s->enter_freeze))
 			continue;
 
 		latency_req = s->exit_latency;
@@ -104,6 +92,57 @@ static int cpuidle_find_deepest_state(st
 	return ret;
 }
 
+static void enter_freeze_proper(struct cpuidle_driver *drv,
+				struct cpuidle_device *dev, int index)
+{
+	tick_freeze();
+	drv->states[index].enter_freeze(dev, drv, index);
+	/*
+	 * timekeeping_resume() that will be called by tick_unfreeze() for the
+	 * last CPU executing it calls functions containing RCU read-side
+	 * critical sections, so tell RCU about that.
+	 */
+	RCU_NONIDLE(tick_unfreeze());
+}
+
+/**
+ * cpuidle_enter_freeze - Enter an idle state suitable for suspend-to-idle.
+ *
+ * If there are states with the ->enter_freeze callback, find the deepest of
+ * them and enter it with frozen tick.  Otherwise, find the deepest state
+ * available and enter it normally.
+ */
+void cpuidle_enter_freeze(void)
+{
+	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+	int index;
+
+	/*
+	 * Find the deepest state with ->enter_freeze present, which guarantees
+	 * that interrupts won't be enabled when it exits and allows the tick to
+	 * be frozen safely.
+	 */
+	index = cpuidle_find_deepest_state(drv, dev, true);
+	if (index >= 0) {
+		enter_freeze_proper(drv, dev, index);
+		return;
+	}
+
+	/*
+	 * It is not safe to freeze the tick, find the deepest state available
+	 * at all and try to enter it normally.
+	 */
+	index = cpuidle_find_deepest_state(drv, dev, false);
+	if (index >= 0)
+		cpuidle_enter(drv, dev, index);
+	else
+		arch_cpu_idle();
+
+	/* Interrupts are enabled again here. */
+	local_irq_disable();
+}
+
 /**
  * cpuidle_enter_state - enter the state and update stats
  * @dev: cpuidle device for this cpu
@@ -166,9 +205,6 @@ int cpuidle_select(struct cpuidle_driver
 	if (!drv || !dev || !dev->enabled)
 		return -EBUSY;
 
-	if (unlikely(use_deepest_state))
-		return cpuidle_find_deepest_state(drv, dev);
-
 	return cpuidle_curr_governor->select(drv, dev);
 }
 
@@ -200,7 +236,7 @@ int cpuidle_enter(struct cpuidle_driver
  */
 void cpuidle_reflect(struct cpuidle_device *dev, int index)
 {
-	if (cpuidle_curr_governor->reflect && !unlikely(use_deepest_state))
+	if (cpuidle_curr_governor->reflect)
 		cpuidle_curr_governor->reflect(dev, index);
 }
 
Index: linux-pm/include/linux/cpuidle.h
===================================================================
--- linux-pm.orig/include/linux/cpuidle.h
+++ linux-pm/include/linux/cpuidle.h
@@ -50,6 +50,10 @@ struct cpuidle_state {
 			int index);
 
 	int (*enter_dead) (struct cpuidle_device *dev, int index);
+
+	void (*enter_freeze) (struct cpuidle_device *dev,
+			      struct cpuidle_driver *drv,
+			      int index);
 };
 
 /* Idle State Flags */
@@ -141,7 +145,7 @@ extern void cpuidle_resume(void);
 extern int cpuidle_enable_device(struct cpuidle_device *dev);
 extern void cpuidle_disable_device(struct cpuidle_device *dev);
 extern int cpuidle_play_dead(void);
-extern void cpuidle_use_deepest_state(bool enable);
+extern void cpuidle_enter_freeze(void);
 
 extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev);
 #else
@@ -174,7 +178,7 @@ static inline int cpuidle_enable_device(
 {return -ENODEV; }
 static inline void cpuidle_disable_device(struct cpuidle_device *dev) { }
 static inline int cpuidle_play_dead(void) {return -ENODEV; }
-static inline void cpuidle_use_deepest_state(bool enable) {}
+static inline void cpuidle_enter_freeze(void) { }
 static inline struct cpuidle_driver *cpuidle_get_cpu_driver(
 	struct cpuidle_device *dev) {return NULL; }
 #endif
Index: linux-pm/kernel/time/tick-common.c
===================================================================
--- linux-pm.orig/kernel/time/tick-common.c
+++ linux-pm/kernel/time/tick-common.c
@@ -394,6 +394,56 @@ void tick_resume(void)
 	}
 }
 
+static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
+static unsigned int tick_freeze_depth;
+
+/**
+ * tick_freeze - Suspend the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the last online CPU executing the function and if so,
+ * suspend timekeeping.  Otherwise suspend the local tick.
+ *
+ * Call with interrupts disabled.  Must be balanced with %tick_unfreeze().
+ * Interrupts must not be enabled before the subsequent %tick_unfreeze().
+ */
+void tick_freeze(void)
+{
+	raw_spin_lock(&tick_freeze_lock);
+
+	tick_freeze_depth++;
+	if (tick_freeze_depth == num_online_cpus()) {
+		timekeeping_suspend();
+	} else {
+		tick_suspend();
+		tick_suspend_broadcast();
+	}
+
+	raw_spin_unlock(&tick_freeze_lock);
+}
+
+/**
+ * tick_unfreeze - Resume the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the first CPU executing the function and if so, resume
+ * timekeeping.  Otherwise resume the local tick.
+ *
+ * Call with interrupts disabled.  Must be balanced with %tick_freeze().
+ * Interrupts must not be enabled after the preceding %tick_freeze().
+ */
+void tick_unfreeze(void)
+{
+	raw_spin_lock(&tick_freeze_lock);
+
+	if (tick_freeze_depth == num_online_cpus())
+		timekeeping_resume();
+	else
+		tick_resume();
+
+	tick_freeze_depth--;
+
+	raw_spin_unlock(&tick_freeze_lock);
+}
+
 /**
  * tick_init - initialize the tick control
  */
Index: linux-pm/include/linux/tick.h
===================================================================
--- linux-pm.orig/include/linux/tick.h
+++ linux-pm/include/linux/tick.h
@@ -226,5 +226,7 @@ static inline void tick_nohz_task_switch
 		__tick_nohz_task_switch(tsk);
 }
 
+extern void tick_freeze(void);
+extern void tick_unfreeze(void);
 
 #endif
Index: linux-pm/kernel/time/timekeeping.c
===================================================================
--- linux-pm.orig/kernel/time/timekeeping.c
+++ linux-pm/kernel/time/timekeeping.c
@@ -230,9 +230,7 @@ static inline s64 timekeeping_get_ns_raw
 
 /**
  * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
- * @tk:		The timekeeper from which we take the update
- * @tkf:	The fast timekeeper to update
- * @tbase:	The time base for the fast timekeeper (mono/raw)
+ * @tkr: Timekeeping readout base from which we take the update
  *
  * We want to use this from any context including NMI and tracing /
  * instrumenting the timekeeping code itself.
@@ -244,11 +242,11 @@ static inline s64 timekeeping_get_ns_raw
  * smp_wmb();	<- Ensure that the last base[1] update is visible
  * tkf->seq++;
  * smp_wmb();	<- Ensure that the seqcount update is visible
- * update(tkf->base[0], tk);
+ * update(tkf->base[0], tkr);
  * smp_wmb();	<- Ensure that the base[0] update is visible
  * tkf->seq++;
  * smp_wmb();	<- Ensure that the seqcount update is visible
- * update(tkf->base[1], tk);
+ * update(tkf->base[1], tkr);
  *
  * The reader side does:
  *
@@ -269,7 +267,7 @@ static inline s64 timekeeping_get_ns_raw
  * slightly wrong timestamp (a few nanoseconds). See
  * @ktime_get_mono_fast_ns.
  */
-static void update_fast_timekeeper(struct timekeeper *tk)
+static void update_fast_timekeeper(struct tk_read_base *tkr)
 {
 	struct tk_read_base *base = tk_fast_mono.base;
 
@@ -277,7 +275,7 @@ static void update_fast_timekeeper(struc
 	raw_write_seqcount_latch(&tk_fast_mono.seq);
 
 	/* Update base[0] */
-	memcpy(base, &tk->tkr, sizeof(*base));
+	memcpy(base, tkr, sizeof(*base));
 
 	/* Force readers back to base[0] */
 	raw_write_seqcount_latch(&tk_fast_mono.seq);
@@ -462,7 +460,7 @@ static void timekeeping_update(struct ti
 		memcpy(&shadow_timekeeper, &tk_core.timekeeper,
 		       sizeof(tk_core.timekeeper));
 
-	update_fast_timekeeper(tk);
+	update_fast_timekeeper(&tk->tkr);
 }
 
 /**
@@ -1170,7 +1168,7 @@ void timekeeping_inject_sleeptime64(stru
  * xtime/wall_to_monotonic/jiffies/etc are
  * still managed by arch specific suspend/resume code.
  */
-static void timekeeping_resume(void)
+void timekeeping_resume(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	struct clocksource *clock = tk->tkr.clock;
@@ -1251,9 +1249,18 @@ static void timekeeping_resume(void)
 	hrtimers_resume();
 }
 
-static int timekeeping_suspend(void)
+static struct tk_read_base tkr_dummy;
+static cycle_t cycles_at_suspend;
+
+static cycle_t dummy_clock_read(struct clocksource *cs)
+{
+	return cycles_at_suspend;
+}
+
+int timekeeping_suspend(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
+	struct clocksource *clock = tk->tkr.clock;
 	unsigned long flags;
 	struct timespec64		delta, delta_delta;
 	static struct timespec64	old_delta;
@@ -1296,6 +1303,14 @@ static int timekeeping_suspend(void)
 	}
 
 	timekeeping_update(tk, TK_MIRROR);
+
+	if (!(clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP)) {
+		memcpy(&tkr_dummy, &tk->tkr, sizeof(tkr_dummy));
+		cycles_at_suspend = tk->tkr.read(clock);
+		tkr_dummy.read = dummy_clock_read;
+		update_fast_timekeeper(&tkr_dummy);
+	}
+
 	write_seqcount_end(&tk_core.seq);
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
 
Index: linux-pm/kernel/time/timekeeping.h
===================================================================
--- linux-pm.orig/kernel/time/timekeeping.h
+++ linux-pm/kernel/time/timekeeping.h
@@ -16,5 +16,7 @@ extern int timekeeping_inject_offset(str
 extern s32 timekeeping_get_tai_offset(void);
 extern void timekeeping_set_tai_offset(s32 tai_offset);
 extern void timekeeping_clocktai(struct timespec *ts);
+extern int timekeeping_suspend(void);
+extern void timekeeping_resume(void);
 
 #endif


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Update] Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-02-06 22:36                   ` Rafael J. Wysocki
@ 2015-02-09  9:49                     ` Peter Zijlstra
  2015-02-09 14:50                       ` Rafael J. Wysocki
  0 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2015-02-09  9:49 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Thomas Gleixner, Li, Aubrey, Brown, Len, Alan Cox, LKML, Linux PM list

On Fri, Feb 06, 2015 at 11:36:12PM +0100, Rafael J. Wysocki wrote:
> On Friday, February 06, 2015 07:29:22 PM Peter Zijlstra wrote:

> > > So I'm a wee bit confused; if we use an enter_freeze() state that keeps
> > > interrupts disabled; who is going to call the freeze_wake() thing?
> > 
> > Ah, I think I see, so we wake up, keep the interrupt pending, re-enable
> > the tick and time and everybody, then re-enable interrupts, take the
> > interrupt and go around the idle loop to find we need a reschedule etc..
> 
> Exactly.

So x86 mwait can do this; what other archs can 'sleep' and keep
interrupts disabled?

It looks like the ARM WFI thing wakes on pending interrupts and doesn't
actually require interrupts to be enabled, so that too would work.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Update] Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-02-09  9:49                     ` Peter Zijlstra
@ 2015-02-09 14:50                       ` Rafael J. Wysocki
  0 siblings, 0 replies; 29+ messages in thread
From: Rafael J. Wysocki @ 2015-02-09 14:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Li, Aubrey, Brown, Len, Alan Cox, LKML, Linux PM list

On Monday, February 09, 2015 10:49:26 AM Peter Zijlstra wrote:
> On Fri, Feb 06, 2015 at 11:36:12PM +0100, Rafael J. Wysocki wrote:
> > On Friday, February 06, 2015 07:29:22 PM Peter Zijlstra wrote:
> 
> > > > So I'm a wee bit confused; if we use an enter_freeze() state that keeps
> > > > interrupts disabled; who is going to call the freeze_wake() thing?
> > > 
> > > Ah, I think I see, so we wake up, keep the interrupt pending, re-enable
> > > the tick and time and everybody, then re-enable interrupts, take the
> > > interrupt and go around the idle loop to find we need a reschedule etc..
> > 
> > Exactly.
> 
> So x86 mwait can do this;

Which is a big enough target already as far as I'm concerned. :-)

> what other archs can 'sleep' and keep interrupts disabled?

The IO port based entry method on old(ish) x86 (like my Toshiba test-bed laptop)
keeps interrupts disabled too and that should cover ia64 (if they have ever
cared about anything more than C1 anyway).

There seem to be some entry methods that keep interrupts disabled on Power too.

> It looks like the ARM WFI thing wakes on pending interrupts and doesn't
> actually require interrupts to be enabled, so that too would work.

Yes, it would.  Moreover, for ARM that can do WFI only and nothing more than
that it would be much better than full suspend, because the whole CPU offline
dance we do then is a pure time loss for them.

The newfangled PSCI stuff should work too AFAICS.


-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Update 2x] Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-02-09  2:54               ` [Update 2x] " Rafael J. Wysocki
@ 2015-02-09 15:20                 ` Peter Zijlstra
  2015-02-09 15:44                 ` Peter Zijlstra
  1 sibling, 0 replies; 29+ messages in thread
From: Peter Zijlstra @ 2015-02-09 15:20 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Thomas Gleixner, Li, Aubrey, Brown, Len, Alan Cox, LKML,
	Linux PM list, John Stultz

On Mon, Feb 09, 2015 at 03:54:22AM +0100, Rafael J. Wysocki wrote:
> > > > The only remaining issue might be a NMI calling into
> > > > ktime_get_mono_fast_ns() before timekeeping is resumed. Its probably a
> > > > non issue on x86/tsc, but it might be a problem on other platforms
> > > > which turn off devices, clocks, It's not rocket science to prevent
> > > > that.

> I'm not sure if this is what you meant, but here it goes.
> 
> The idea is to set up a dummy readout base for the fast timekeeper during
> timekeeping_suspend() such that it will always return the same number of cycles.
> After the last timekeeping_update() in timekeeping_suspend() we read the
> clocksource and store the result as cycles_at_suspend.  The readout base
> from the current timekeeper is copied onto the dummy and the ->read pointer
> of the dummy is set to a routine unconditionally returning cycles_at_suspend.
> Next, the dummy is passed to update_fast_timekeeper() (which has been modified
> slightly to accept the readout base instead of a whole timekeeper).
> 
> Then, if I'm not mistaken, ktime_get_mono_fast_ns() should work until the
> subsequent timekeeping_resume() and then the proper readout base for the
> fast timekeeper will be restored by the timekeeping_update() called right
> after we've cleared timekeeping_suspended.


> Index: linux-pm/kernel/time/timekeeping.c
> ===================================================================
> --- linux-pm.orig/kernel/time/timekeeping.c
> +++ linux-pm/kernel/time/timekeeping.c
> @@ -230,9 +230,7 @@ static inline s64 timekeeping_get_ns_raw
>  
>  /**
>   * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
> - * @tk:		The timekeeper from which we take the update
> - * @tkf:	The fast timekeeper to update
> - * @tbase:	The time base for the fast timekeeper (mono/raw)
> + * @tkr: Timekeeping readout base from which we take the update
>   *
>   * We want to use this from any context including NMI and tracing /
>   * instrumenting the timekeeping code itself.
> @@ -244,11 +242,11 @@ static inline s64 timekeeping_get_ns_raw
>   * smp_wmb();	<- Ensure that the last base[1] update is visible
>   * tkf->seq++;
>   * smp_wmb();	<- Ensure that the seqcount update is visible
> - * update(tkf->base[0], tk);
> + * update(tkf->base[0], tkr);
>   * smp_wmb();	<- Ensure that the base[0] update is visible
>   * tkf->seq++;
>   * smp_wmb();	<- Ensure that the seqcount update is visible
> - * update(tkf->base[1], tk);
> + * update(tkf->base[1], tkr);
>   *
>   * The reader side does:
>   *
> @@ -269,7 +267,7 @@ static inline s64 timekeeping_get_ns_raw
>   * slightly wrong timestamp (a few nanoseconds). See
>   * @ktime_get_mono_fast_ns.
>   */
> -static void update_fast_timekeeper(struct timekeeper *tk)
> +static void update_fast_timekeeper(struct tk_read_base *tkr)
>  {
>  	struct tk_read_base *base = tk_fast_mono.base;
>  
> @@ -277,7 +275,7 @@ static void update_fast_timekeeper(struc
>  	raw_write_seqcount_latch(&tk_fast_mono.seq);
>  
>  	/* Update base[0] */
> -	memcpy(base, &tk->tkr, sizeof(*base));
> +	memcpy(base, tkr, sizeof(*base));
>  
>  	/* Force readers back to base[0] */
>  	raw_write_seqcount_latch(&tk_fast_mono.seq);
> @@ -462,7 +460,7 @@ static void timekeeping_update(struct ti
>  		memcpy(&shadow_timekeeper, &tk_core.timekeeper,
>  		       sizeof(tk_core.timekeeper));
>  
> -	update_fast_timekeeper(tk);
> +	update_fast_timekeeper(&tk->tkr);
>  }
>  
>  /**

> @@ -1251,9 +1249,18 @@ static void timekeeping_resume(void)
>  	hrtimers_resume();
>  }
>  
> -static int timekeeping_suspend(void)
> +static struct tk_read_base tkr_dummy;
> +static cycle_t cycles_at_suspend;
> +
> +static cycle_t dummy_clock_read(struct clocksource *cs)
> +{
> +	return cycles_at_suspend;
> +}
> +
> +int timekeeping_suspend(void)
>  {
>  	struct timekeeper *tk = &tk_core.timekeeper;
> +	struct clocksource *clock = tk->tkr.clock;
>  	unsigned long flags;
>  	struct timespec64		delta, delta_delta;
>  	static struct timespec64	old_delta;
> @@ -1296,6 +1303,14 @@ static int timekeeping_suspend(void)
>  	}
>  
>  	timekeeping_update(tk, TK_MIRROR);
> +
> +	if (!(clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP)) {
> +		memcpy(&tkr_dummy, &tk->tkr, sizeof(tkr_dummy));
> +		cycles_at_suspend = tk->tkr.read(clock);
> +		tkr_dummy.read = dummy_clock_read;
> +		update_fast_timekeeper(&tkr_dummy);
> +	}
> +
>  	write_seqcount_end(&tk_core.seq);
>  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

Yeah, I think that that should work. John any objections?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Update 2x] Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-02-09  2:54               ` [Update 2x] " Rafael J. Wysocki
  2015-02-09 15:20                 ` Peter Zijlstra
@ 2015-02-09 15:44                 ` Peter Zijlstra
  2015-02-09 23:57                   ` Rafael J. Wysocki
  1 sibling, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2015-02-09 15:44 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Thomas Gleixner, Li, Aubrey, Brown, Len, Alan Cox, LKML, Linux PM list

On Mon, Feb 09, 2015 at 03:54:22AM +0100, Rafael J. Wysocki wrote:
> Complete patch with that modification is appended.  In the next few days I'm
> going to split it into smaller parts and send along with cpuidle driver
> patches implementing ->enter_freeze.
> 
> Please let me know what you think.

> @@ -104,6 +105,21 @@ static void cpuidle_idle_call(void)
>  	rcu_idle_enter();
>  
>  	/*
> +	 * Suspend-to-idle ("freeze") is a system state in which all user space
> +	 * has been frozen, all I/O devices have been suspended and the only
> > +	 * activity happens here and in interrupts (if any).  In that case bypass
> > +	 * the cpuidle governor and go straight for the deepest idle state
> +	 * available.  Possibly also suspend the local tick and the entire
> +	 * timekeeping to prevent timer interrupts from kicking us out of idle
> +	 * until a proper wakeup interrupt happens.
> +	 */
> +	if (idle_should_freeze()) {
> +		cpuidle_enter_freeze();
> +		local_irq_enable();
> +		goto exit_idle;
> +	}
> +
> +	/*
>  	 * Ask the cpuidle framework to choose a convenient idle state.
>  	 * Fall back to the default arch idle method on errors.
>  	 */

I was hoping to not have to put that into the regular idle path; say
maybe share a single special branch with the play-dead call. People seem
to start complaining about the total amount of time it takes to just
'run' the idle path.

Now I don't think we can do that, because we need the
arch_cpu_idle_enter() nonsense for the one but not the other; also all
this really only makes sense in the cpuidle context, so nothing to be
done about that.

In any case, you could make that:

static inline bool idle_should_freeze(void)
{
	return unlikely(suspend_freeze_state == FREEZE_STATE_ENTER);
}

which should help a bit I suppose.

> +static void enter_freeze_proper(struct cpuidle_driver *drv,
> +                               struct cpuidle_device *dev, int index)
> +{
> +       tick_freeze();
> +       drv->states[index].enter_freeze(dev, drv, index);

This is slightly different from cpuidle_enter() in that it does not
consider the coupled states nonsense, is that on purpose? And if so,
does that want a comment?

> +       /*
> +        * timekeeping_resume() that will be called by tick_unfreeze() for the
> +        * last CPU executing it calls functions containing RCU read-side
> +        * critical sections, so tell RCU about that.
> +        */
> +       RCU_NONIDLE(tick_unfreeze());
> +}


But over all it looks fine to me.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [Update 2x] Re: [PATCH v3]PM/Sleep: Timer quiesce in freeze state
  2015-02-09 15:44                 ` Peter Zijlstra
@ 2015-02-09 23:57                   ` Rafael J. Wysocki
  0 siblings, 0 replies; 29+ messages in thread
From: Rafael J. Wysocki @ 2015-02-09 23:57 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Li, Aubrey, Brown, Len, Alan Cox, LKML, Linux PM list

On Monday, February 09, 2015 04:44:08 PM Peter Zijlstra wrote:
> On Mon, Feb 09, 2015 at 03:54:22AM +0100, Rafael J. Wysocki wrote:
> > Complete patch with that modification is appended.  In the next few days I'm
> > going to split it into smaller parts and send along with cpuidle driver
> > patches implementing ->enter_freeze.
> > 
> > Please let me know what you think.
> 
> > @@ -104,6 +105,21 @@ static void cpuidle_idle_call(void)
> >  	rcu_idle_enter();
> >  
> >  	/*
> > +	 * Suspend-to-idle ("freeze") is a system state in which all user space
> > +	 * has been frozen, all I/O devices have been suspended and the only
> > > +	 * activity happens here and in interrupts (if any).  In that case bypass
> > > +	 * the cpuidle governor and go straight for the deepest idle state
> > +	 * available.  Possibly also suspend the local tick and the entire
> > +	 * timekeeping to prevent timer interrupts from kicking us out of idle
> > +	 * until a proper wakeup interrupt happens.
> > +	 */
> > +	if (idle_should_freeze()) {
> > +		cpuidle_enter_freeze();
> > +		local_irq_enable();
> > +		goto exit_idle;
> > +	}
> > +
> > +	/*
> >  	 * Ask the cpuidle framework to choose a convenient idle state.
> >  	 * Fall back to the default arch idle method on errors.
> >  	 */
> 
> I was hoping to not have to put that into the regular idle path; say
> maybe share a single special branch with the play-dead call. People seem
> to start complaining about the total amount of time it takes to just
> 'run' the idle path.

Well, this check really only replaces one in cpuidle_select() and another
one in cpuidle_reflect() and they both are called from the idle path, so
that's a net gain rather. :-)

> Now I don't think we can do that, because we need the
> arch_cpu_idle_enter() nonsense for the one but not the other; also all
> this really only makes sense in the cpuidle context, so nothing to be
> done about that.

Agreed.

> In any case, you could make that:
> 
> static inline bool idle_should_freeze(void)
> {
> 	return unlikely(suspend_freeze_state == FREEZE_STATE_ENTER);
> }
> 
> which should help a bit I suppose.

Won't hurt at least.

> > +static void enter_freeze_proper(struct cpuidle_driver *drv,
> > +                               struct cpuidle_device *dev, int index)
> > +{
> > +       tick_freeze();
> > +       drv->states[index].enter_freeze(dev, drv, index);
> 
> This is slightly different from cpuidle_enter() in that it does not
> consider the coupled states nonsense, is that on purpose? And if so,
> does that want a comment?

Yes, it is on purpose and the reason why is because the coupled thing re-enables
interrupts (unconditionally).  [And it contains a comment about that which does
not make sense to me now that I read it.]

I can add a comment about that, though.

> > +       /*
> > +        * timekeeping_resume() that will be called by tick_unfreeze() for the
> > +        * last CPU executing it calls functions containing RCU read-side
> > +        * critical sections, so tell RCU about that.
> > +        */
> > +       RCU_NONIDLE(tick_unfreeze());
> > +}
> 
> 
> But over all it looks fine to me.

Cool, thanks!


^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2015-02-09 23:34 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-12-09  3:01 [PATCH v3]PM/Sleep: Timer quiesce in freeze state Li, Aubrey
2015-01-14  0:24 ` Li, Aubrey
2015-01-19 15:24   ` Rafael J. Wysocki
2015-01-22 10:15     ` Thomas Gleixner
2015-01-26  8:44       ` Li, Aubrey
2015-01-26  9:40         ` Thomas Gleixner
2015-01-26 14:21           ` Rafael J. Wysocki
2015-01-26 14:15             ` Thomas Gleixner
2015-01-26 14:45               ` Rafael J. Wysocki
2015-01-27  7:12                 ` Li, Aubrey
2015-01-26 14:41           ` Rafael J. Wysocki
2015-01-26 14:24             ` Thomas Gleixner
2015-01-26 14:50               ` Rafael J. Wysocki
2015-01-26 14:34                 ` Thomas Gleixner
2015-01-26 15:04                   ` Rafael J. Wysocki
2015-01-27  8:03             ` Li, Aubrey
2015-01-27 15:10               ` Rafael J. Wysocki
2015-01-28  0:17                 ` Li, Aubrey
2015-01-29 22:20           ` Rafael J. Wysocki
2015-02-06  1:20             ` [Update] " Rafael J. Wysocki
2015-02-06 16:14               ` Peter Zijlstra
2015-02-06 18:29                 ` Peter Zijlstra
2015-02-06 22:36                   ` Rafael J. Wysocki
2015-02-09  9:49                     ` Peter Zijlstra
2015-02-09 14:50                       ` Rafael J. Wysocki
2015-02-09  2:54               ` [Update 2x] " Rafael J. Wysocki
2015-02-09 15:20                 ` Peter Zijlstra
2015-02-09 15:44                 ` Peter Zijlstra
2015-02-09 23:57                   ` Rafael J. Wysocki

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.