* [PATCH v2] hardlockup: detect hard lockups without NMIs using secondary cpus
@ 2013-01-11 21:51 Colin Cross
  2013-01-14 23:49 ` Andrew Morton
  2013-01-15  0:13 ` Frederic Weisbecker
  0 siblings, 2 replies; 14+ messages in thread
From: Colin Cross @ 2013-01-11 21:51 UTC (permalink / raw)
  To: linux-kernel
  Cc: Andrew Morton, Don Zickus, Ingo Molnar, Thomas Gleixner,
	liu chuansheng, linux-arm-kernel, Russell King - ARM Linux,
	Frederic Weisbecker, Tony Lindgren, Colin Cross

Emulate NMIs on systems where they are not available by using timer
interrupts on other cpus.  Each cpu will use its softlockup hrtimer
to check that the next cpu is processing hrtimer interrupts by
verifying that a counter is increasing.

This patch is useful on systems where the hardlockup detector is not
available due to a lack of NMIs, for example most ARM SoCs.
Without this patch any cpu stuck with interrupts disabled can
cause a hardware watchdog reset with no debugging information,
but with this patch the kernel can detect the lockup and panic,
which can result in useful debugging info.

Signed-off-by: Colin Cross <ccross@android.com>
---
 include/linux/nmi.h |    5 ++-
 kernel/watchdog.c   |  123 ++++++++++++++++++++++++++++++++++++++++++++++++--
 lib/Kconfig.debug   |   14 +++++-
 3 files changed, 135 insertions(+), 7 deletions(-)

Changes since v1:
    renamed variables to clarify when referring to the next cpu
    factored out function to get next cpu
    fixed int vs unsigned int mismatches
    fixed race condition when offlining cpus

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index db50840..c8f8aa0 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -14,8 +14,11 @@
  * may be used to reset the timeout - for code which intentionally
  * disables interrupts for a long time. This call is stateless.
  */
-#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR_NMI)
 #include <asm/nmi.h>
+#endif
+
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
 extern void touch_nmi_watchdog(void);
 #else
 static inline void touch_nmi_watchdog(void)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 75a2ab3..61a0595 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -44,6 +44,11 @@
 static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
+#endif
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static cpumask_t __read_mostly watchdog_cpus;
+#endif
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
 #endif
 
@@ -179,7 +184,7 @@ void touch_softlockup_watchdog_sync(void)
 	__raw_get_cpu_var(watchdog_touch_ts) = 0;
 }
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
 /* watchdog detector functions */
 static int is_hardlockup(void)
 {
@@ -193,6 +198,76 @@ static int is_hardlockup(void)
 }
 #endif
 
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static unsigned int watchdog_next_cpu(unsigned int cpu)
+{
+	cpumask_t cpus = watchdog_cpus;
+	unsigned int next_cpu;
+
+	next_cpu = cpumask_next(cpu, &cpus);
+	if (next_cpu >= nr_cpu_ids)
+		next_cpu = cpumask_first(&cpus);
+
+	if (next_cpu == cpu)
+		return nr_cpu_ids;
+
+	return next_cpu;
+}
+
+static int is_hardlockup_other_cpu(unsigned int cpu)
+{
+	unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
+
+	if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
+		return 1;
+
+	per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+	return 0;
+}
+
+static void watchdog_check_hardlockup_other_cpu(void)
+{
+	unsigned int next_cpu;
+
+	/*
+	 * Test for hardlockups every 3 samples.  The sample period is
+	 *  watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
+	 *  watchdog_thresh (over by 20%).
+	 */
+	if (__this_cpu_read(hrtimer_interrupts) % 3 != 0)
+		return;
+
+	/* check for a hardlockup on the next cpu */
+	next_cpu = watchdog_next_cpu(smp_processor_id());
+	if (next_cpu >= nr_cpu_ids)
+		return;
+
+	smp_rmb();
+
+	if (per_cpu(watchdog_nmi_touch, next_cpu) == true) {
+		per_cpu(watchdog_nmi_touch, next_cpu) = false;
+		return;
+	}
+
+	if (is_hardlockup_other_cpu(next_cpu)) {
+		/* only warn once */
+		if (per_cpu(hard_watchdog_warn, next_cpu) == true)
+			return;
+
+		if (hardlockup_panic)
+			panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu);
+		else
+			WARN(1, "Watchdog detected hard LOCKUP on cpu %u", next_cpu);
+
+		per_cpu(hard_watchdog_warn, next_cpu) = true;
+	} else {
+		per_cpu(hard_watchdog_warn, next_cpu) = false;
+	}
+}
+#else
+static inline void watchdog_check_hardlockup_other_cpu(void) { return; }
+#endif
+
 static int is_softlockup(unsigned long touch_ts)
 {
 	unsigned long now = get_timestamp(smp_processor_id());
@@ -204,7 +279,7 @@ static int is_softlockup(unsigned long touch_ts)
 	return 0;
 }
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
 
 static struct perf_event_attr wd_hw_attr = {
 	.type		= PERF_TYPE_HARDWARE,
@@ -252,7 +327,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
 	__this_cpu_write(hard_watchdog_warn, false);
 	return;
 }
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_NMI */
 
 static void watchdog_interrupt_count(void)
 {
@@ -272,6 +347,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	/* kick the hardlockup detector */
 	watchdog_interrupt_count();
 
+	/* test for hardlockups on the next cpu */
+	watchdog_check_hardlockup_other_cpu();
+
 	/* kick the softlockup detector */
 	wake_up_process(__this_cpu_read(softlockup_watchdog));
 
@@ -396,7 +474,7 @@ static void watchdog(unsigned int cpu)
 	__touch_watchdog();
 }
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
 /*
  * People like the simple clean cpu node info on boot.
  * Reduce the watchdog noise by only printing messages
@@ -472,9 +550,44 @@ static void watchdog_nmi_disable(unsigned int cpu)
 	return;
 }
 #else
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static int watchdog_nmi_enable(unsigned int cpu)
+{
+	/*
+	 * The new cpu will be marked online before the first hrtimer interrupt
+	 * runs on it.  If another cpu tests for a hardlockup on the new cpu
+	 * before it has run its first hrtimer, it will get a false positive.
+	 * Touch the watchdog on the new cpu to delay the first check for at
+	 * least 3 sampling periods to guarantee one hrtimer has run on the new
+	 * cpu.
+	 */
+	per_cpu(watchdog_nmi_touch, cpu) = true;
+	smp_wmb();
+	cpumask_set_cpu(cpu, &watchdog_cpus);
+	return 0;
+}
+
+static void watchdog_nmi_disable(unsigned int cpu)
+{
+	unsigned int next_cpu = watchdog_next_cpu(cpu);
+
+	/*
+	 * Offlining this cpu will cause the cpu before this one to start
+	 * checking the one after this one.  If this cpu just finished checking
+	 * the next cpu and updating hrtimer_interrupts_saved, and then the
+	 * previous cpu checks it within one sample period, it will trigger a
+	 * false positive.  Touch the watchdog on the next cpu to prevent it.
+	 */
+	if (next_cpu < nr_cpu_ids)
+		per_cpu(watchdog_nmi_touch, next_cpu) = true;
+	smp_wmb();
+	cpumask_clear_cpu(cpu, &watchdog_cpus);
+}
+#else
 static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
 static void watchdog_nmi_disable(unsigned int cpu) { return; }
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_NMI */
 
 /* prepare/enable/disable routines */
 /* sysctl functions */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index aaf8baf..f7c4859 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -191,15 +191,27 @@ config LOCKUP_DETECTOR
 	  The overhead should be minimal.  A periodic hrtimer runs to
 	  generate interrupts and kick the watchdog task every 4 seconds.
 	  An NMI is generated every 10 seconds or so to check for hardlockups.
+	  If NMIs are not available on the platform, every 12 seconds the
+	  hrtimer interrupt on one cpu will be used to check for hardlockups
+	  on the next cpu.
 
 	  The frequency of hrtimer and NMI events and the soft and hard lockup
 	  thresholds can be controlled through the sysctl watchdog_thresh.
 
-config HARDLOCKUP_DETECTOR
+config HARDLOCKUP_DETECTOR_NMI
 	def_bool y
 	depends on LOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG
 	depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI
 
+config HARDLOCKUP_DETECTOR_OTHER_CPU
+	def_bool y
+	depends on LOCKUP_DETECTOR && SMP
+	depends on !HARDLOCKUP_DETECTOR_NMI && !HAVE_NMI_WATCHDOG
+
+config HARDLOCKUP_DETECTOR
+	def_bool y
+	depends on HARDLOCKUP_DETECTOR_NMI || HARDLOCKUP_DETECTOR_OTHER_CPU
+
 config BOOTPARAM_HARDLOCKUP_PANIC
 	bool "Panic (Reboot) On Hard Lockups"
 	depends on HARDLOCKUP_DETECTOR
-- 
1.7.7.3



* Re: [PATCH v2] hardlockup: detect hard lockups without NMIs using secondary cpus
  2013-01-11 21:51 [PATCH v2] hardlockup: detect hard lockups without NMIs using secondary cpus Colin Cross
@ 2013-01-14 23:49 ` Andrew Morton
  2013-01-15  0:19   ` Colin Cross
  2013-01-15  0:13 ` Frederic Weisbecker
  1 sibling, 1 reply; 14+ messages in thread
From: Andrew Morton @ 2013-01-14 23:49 UTC (permalink / raw)
  To: Colin Cross
  Cc: linux-kernel, Don Zickus, Ingo Molnar, Thomas Gleixner,
	liu chuansheng, linux-arm-kernel, Russell King - ARM Linux,
	Frederic Weisbecker, Tony Lindgren

On Fri, 11 Jan 2013 13:51:48 -0800
Colin Cross <ccross@android.com> wrote:

> Emulate NMIs on systems where they are not available by using timer
> interrupts on other cpus.  Each cpu will use its softlockup hrtimer
> to check that the next cpu is processing hrtimer interrupts by
> verifying that a counter is increasing.

Seems sensible.

> This patch is useful on systems where the hardlockup detector is not
> available due to a lack of NMIs, for example most ARM SoCs.
> Without this patch any cpu stuck with interrupts disabled can
> cause a hardware watchdog reset with no debugging information,
> but with this patch the kernel can detect the lockup and panic,
> which can result in useful debugging info.

But we don't get the target cpu's stack, yes?  That's a pretty big loss.

>
> ...
>
> +#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
> +static unsigned int watchdog_next_cpu(unsigned int cpu)
> +{
> +	cpumask_t cpus = watchdog_cpus;

cpumask_t can be tremendously huge and putting one on the stack is
risky.  Can we use watchdog_cpus directly here?  Perhaps with a lock? 
or take a copy into a static local, with a lock?

> +	unsigned int next_cpu;
> +
> +	next_cpu = cpumask_next(cpu, &cpus);
> +	if (next_cpu >= nr_cpu_ids)
> +		next_cpu = cpumask_first(&cpus);
> +
> +	if (next_cpu == cpu)
> +		return nr_cpu_ids;
> +
> +	return next_cpu;
> +}
> +
> +static int is_hardlockup_other_cpu(unsigned int cpu)
> +{
> +	unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
> +
> +	if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
> +		return 1;
> +
> +	per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
> +	return 0;
> +}

This could return a bool type.

> +static void watchdog_check_hardlockup_other_cpu(void)
> +{
> +	unsigned int next_cpu;
> +
> +	/*
> +	 * Test for hardlockups every 3 samples.  The sample period is
> +	 *  watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
> +	 *  watchdog_thresh (over by 20%).
> +	 */
> +	if (__this_cpu_read(hrtimer_interrupts) % 3 != 0)
> +		return;

The hardwired interval Seems Wrong.  watchdog_thresh is tunable at runtime.

The comment could do with some fleshing out.  *why* do we want to test
at an interval "slightly over watchdog_thresh"?  What's going on here?

> +	/* check for a hardlockup on the next cpu */
> +	next_cpu = watchdog_next_cpu(smp_processor_id());
> +	if (next_cpu >= nr_cpu_ids)
> +		return;
> +
> +	smp_rmb();

Mystery barrier (always) needs an explanatory comment, please.

> +	if (per_cpu(watchdog_nmi_touch, next_cpu) == true) {
> +		per_cpu(watchdog_nmi_touch, next_cpu) = false;
> +		return;
> +	}

I wonder if a well-timed CPU plug/unplug could result in two CPUs
simultaneously checking one other CPU's state.

> +	if (is_hardlockup_other_cpu(next_cpu)) {
> +		/* only warn once */
> +		if (per_cpu(hard_watchdog_warn, next_cpu) == true)
> +			return;
> +
> +		if (hardlockup_panic)
> +			panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu);
> +		else
> +			WARN(1, "Watchdog detected hard LOCKUP on cpu %u", next_cpu);

I suggest we use messages here which make it clear to people who read
kernel output that this was triggered by hrtimers, not by NMI.  Most
importantly because people will need to know that the CPU which locked
up is *not this CPU* and that any backtrace from the reporting CPU is
misleading.

Also, there was never any sense in making the LOCKUP all-caps ;)

> +		per_cpu(hard_watchdog_warn, next_cpu) = true;
> +	} else {
> +		per_cpu(hard_watchdog_warn, next_cpu) = false;
> +	}
> +}
> +#else
> +static inline void watchdog_check_hardlockup_other_cpu(void) { return; }
> +#endif
> +
>
> ...
>
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -191,15 +191,27 @@ config LOCKUP_DETECTOR
>  	  The overhead should be minimal.  A periodic hrtimer runs to
>  	  generate interrupts and kick the watchdog task every 4 seconds.
>  	  An NMI is generated every 10 seconds or so to check for hardlockups.
> +	  If NMIs are not available on the platform, every 12 seconds the

hm.  Is the old "4 seconds" still true/accurate/complete?

> +	  hrtimer interrupt on one cpu will be used to check for hardlockups
> +	  on the next cpu.
>  
>  	  The frequency of hrtimer and NMI events and the soft and hard lockup
>  	  thresholds can be controlled through the sysctl watchdog_thresh.
>  
> -config HARDLOCKUP_DETECTOR
> +config HARDLOCKUP_DETECTOR_NMI
>  	def_bool y
>  	depends on LOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG
>  	depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI

Confused.  I'd have expected this to depend on HAVE_NMI_WATCHDOG,
rather than -no-that.  What does "HAVE_NMI_WATCHDOG" actually mean and
what's happening here?

>
> ...
>



* Re: [PATCH v2] hardlockup: detect hard lockups without NMIs using secondary cpus
  2013-01-11 21:51 [PATCH v2] hardlockup: detect hard lockups without NMIs using secondary cpus Colin Cross
  2013-01-14 23:49 ` Andrew Morton
@ 2013-01-15  0:13 ` Frederic Weisbecker
  2013-01-15  0:22   ` Colin Cross
  2013-01-15 16:32   ` Paul E. McKenney
  1 sibling, 2 replies; 14+ messages in thread
From: Frederic Weisbecker @ 2013-01-15  0:13 UTC (permalink / raw)
  To: Colin Cross
  Cc: linux-kernel, Andrew Morton, Don Zickus, Ingo Molnar,
	Thomas Gleixner, liu chuansheng, linux-arm-kernel,
	Russell King - ARM Linux, Tony Lindgren

2013/1/11 Colin Cross <ccross@android.com>:
> Emulate NMIs on systems where they are not available by using timer
> interrupts on other cpus.  Each cpu will use its softlockup hrtimer
> to check that the next cpu is processing hrtimer interrupts by
> verifying that a counter is increasing.
>
> This patch is useful on systems where the hardlockup detector is not
> available due to a lack of NMIs, for example most ARM SoCs.
> Without this patch any cpu stuck with interrupts disabled can
> cause a hardware watchdog reset with no debugging information,
> but with this patch the kernel can detect the lockup and panic,
> which can result in useful debugging info.
>
> Signed-off-by: Colin Cross <ccross@android.com>

I believe this is pretty much what the RCU stall detector does
already: checks for other CPUs being responsive. The only difference
is on how it checks that. For RCU it's about checking for CPUs
reporting quiescent states when requested to do so. In your case it's
about ensuring the hrtimer interrupt is well handled.

One thing you can do is to enqueue an RCU callback (call_rcu()) every
minute so you can force other CPUs to report quiescent states
periodically and thus check for lockups.
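
To make that concrete, a minimal sketch (the names are made up for
illustration, this is not an existing kernel API) could re-arm a
call_rcu() from a delayed work item so a grace period is forced
periodically:

/* needs <linux/rcupdate.h> and <linux/workqueue.h> */
static void stall_kick_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(stall_kick_work, stall_kick_fn);
static struct rcu_head stall_kick_rcu;

static void stall_kick_done(struct rcu_head *head)
{
	/* grace period completed; arm the next check */
	schedule_delayed_work(&stall_kick_work, 60 * HZ);
}

static void stall_kick_fn(struct work_struct *work)
{
	/* force all cpus to pass through a quiescent state */
	call_rcu(&stall_kick_rcu, stall_kick_done);
}

Kicked off once at init with schedule_delayed_work(&stall_kick_work,
60 * HZ), this periodically forces a grace period so a stuck cpu has
a chance to show up as an RCU stall.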

Now you'll face the same problem in the end: if you don't have NMIs,
you won't have a very useful report.


* Re: [PATCH v2] hardlockup: detect hard lockups without NMIs using secondary cpus
  2013-01-14 23:49 ` Andrew Morton
@ 2013-01-15  0:19   ` Colin Cross
  2013-01-15  0:25     ` Andrew Morton
  2013-01-15  1:40     ` Colin Cross
  0 siblings, 2 replies; 14+ messages in thread
From: Colin Cross @ 2013-01-15  0:19 UTC (permalink / raw)
  To: Andrew Morton
  Cc: lkml, Don Zickus, Ingo Molnar, Thomas Gleixner, liu chuansheng,
	linux-arm-kernel, Russell King - ARM Linux, Frederic Weisbecker,
	Tony Lindgren

On Mon, Jan 14, 2013 at 3:49 PM, Andrew Morton
<akpm@linux-foundation.org> wrote:
> On Fri, 11 Jan 2013 13:51:48 -0800
> Colin Cross <ccross@android.com> wrote:
>
>> Emulate NMIs on systems where they are not available by using timer
>> interrupts on other cpus.  Each cpu will use its softlockup hrtimer
>> to check that the next cpu is processing hrtimer interrupts by
>> verifying that a counter is increasing.
>
> Seems sensible.
>
>> This patch is useful on systems where the hardlockup detector is not
>> available due to a lack of NMIs, for example most ARM SoCs.
>> Without this patch any cpu stuck with interrupts disabled can
>> cause a hardware watchdog reset with no debugging information,
>> but with this patch the kernel can detect the lockup and panic,
>> which can result in useful debugging info.
>
> But we don't get the target cpu's stack, yes?  That's a pretty big loss.

It's a huge loss, but it's still useful.  For one, it can separate
"linux locked up one cpu" bugs from "the whole cpu complex stopped
responding" bugs, which are much more common than you would hope on
ARM cpus.  Also, as a separate patch I'm hoping to add reading the
DBGPCSR register of both cpus during panic, which will at least give
you the PC of the cpu that is stuck.

>>
>> ...
>>
>> +#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
>> +static unsigned int watchdog_next_cpu(unsigned int cpu)
>> +{
>> +     cpumask_t cpus = watchdog_cpus;
>
> cpumask_t can be tremendously huge and putting one on the stack is
> risky.  Can we use watchdog_cpus directly here?  Perhaps with a lock?
> or take a copy into a static local, with a lock?

Sure, I can use a lock around it.  I'm used to very small numbers of cpus.

>> +     unsigned int next_cpu;
>> +
>> +     next_cpu = cpumask_next(cpu, &cpus);
>> +     if (next_cpu >= nr_cpu_ids)
>> +             next_cpu = cpumask_first(&cpus);
>> +
>> +     if (next_cpu == cpu)
>> +             return nr_cpu_ids;
>> +
>> +     return next_cpu;
>> +}
>> +
>> +static int is_hardlockup_other_cpu(unsigned int cpu)
>> +{
>> +     unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
>> +
>> +     if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
>> +             return 1;
>> +
>> +     per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
>> +     return 0;
>> +}
>
> This could return a bool type.

I based it on is_hardlockup, but I can convert it to a bool.

>> +static void watchdog_check_hardlockup_other_cpu(void)
>> +{
>> +     unsigned int next_cpu;
>> +
>> +     /*
>> +      * Test for hardlockups every 3 samples.  The sample period is
>> +      *  watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
>> +      *  watchdog_thresh (over by 20%).
>> +      */
>> +     if (__this_cpu_read(hrtimer_interrupts) % 3 != 0)
>> +             return;
>
> The hardwired interval Seems Wrong.  watchdog_thresh is tunable at runtime.
>
> The comment could do with some fleshing out.  *why* do we want to test
> at an interval "slightly over watchdog_thresh"?  What's going on here?

I'll reword it.  We don't want to be slightly over watchdog_thresh,
ideally we would be exactly at watchdog_thresh.  However, since this
relies on the hrtimer interrupts that are scheduled at watchdog_thresh
* 2 / 5, there is no multiple of hrtimer_interrupts that will result
in watchdog_thresh.  watchdog_thresh * 2 / 5 * 3 (watchdog_thresh *
1.2) is the closest I can get to testing for a hardlockup once every
watchdog_thresh seconds.
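
For example, with the default watchdog_thresh of 10 seconds the
hrtimer fires every 10 * 2 / 5 = 4 seconds, so checking every 3rd
interrupt tests the next cpu roughly every 12 seconds, which is where
the "every 12 seconds" in the Kconfig help text comes from.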

>> +     /* check for a hardlockup on the next cpu */
>> +     next_cpu = watchdog_next_cpu(smp_processor_id());
>> +     if (next_cpu >= nr_cpu_ids)
>> +             return;
>> +
>> +     smp_rmb();
>
> Mystery barrier (always) needs an explanatory comment, please.

OK.  Will add:  smp_rmb matches smp_wmb in watchdog_nmi_enable and
watchdog_nmi_disable to ensure watchdog_nmi_touch is updated for a cpu
before any other cpu sees it is online.
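
In code, the pairing would look roughly like this (the code is as in
v2, the comments are the ones I intend to add):

	/* watchdog_nmi_enable(), on the hotplug path: */
	per_cpu(watchdog_nmi_touch, cpu) = true;
	/* publish the touch before the cpu appears in watchdog_cpus;
	 * pairs with smp_rmb() in watchdog_check_hardlockup_other_cpu() */
	smp_wmb();
	cpumask_set_cpu(cpu, &watchdog_cpus);

	/* watchdog_check_hardlockup_other_cpu(), on the checking cpu: */
	next_cpu = watchdog_next_cpu(smp_processor_id());
	if (next_cpu >= nr_cpu_ids)
		return;
	/* pairs with the smp_wmb() above: if this cpu saw next_cpu in
	 * watchdog_cpus, it must also see next_cpu's touch flag */
	smp_rmb();
	if (per_cpu(watchdog_nmi_touch, next_cpu) == true) {
		per_cpu(watchdog_nmi_touch, next_cpu) = false;
		return;
	}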

>> +     if (per_cpu(watchdog_nmi_touch, next_cpu) == true) {
>> +             per_cpu(watchdog_nmi_touch, next_cpu) = false;
>> +             return;
>> +     }
>
> I wonder if a well-timed CPU plug/unplug could result in two CPUs
> simultaneously checking one other CPU's state.

It can, which is why I set watchdog_nmi_touch for a cpu when it comes
online or when the previous cpu goes offline.  That should prevent a
false positive by preventing the first cpu that checks from updating
hrtimer_interrupts_saved.

>> +     if (is_hardlockup_other_cpu(next_cpu)) {
>> +             /* only warn once */
>> +             if (per_cpu(hard_watchdog_warn, next_cpu) == true)
>> +                     return;
>> +
>> +             if (hardlockup_panic)
>> +                     panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu);
>> +             else
>> +                     WARN(1, "Watchdog detected hard LOCKUP on cpu %u", next_cpu);
>
> I suggest we use messages here which make it clear to people who read
> kernel output that this was triggered by hrtimers, not by NMI.  Most
> importantly because people will need to know that the CPU which locked
> up is *not this CPU* and that any backtrace from the reporting CPU is
> misleading.
>
> Also, there was never any sense in making the LOCKUP all-caps ;)

I'll change to "hrtimer watchdog on cpu %u detected hard lockup on cpu
%u".  Given the discussion above about the lack of backtraces on the
affected cpu, I'll also change the WARN to pr_warn, since the
backtrace of the warning cpu is misleading.  The panic will still get
the backtrace of the detecting cpu, but hopefully the clearer message
will avoid confusing people.
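
Roughly (untested, just to show the intended shape):

	if (is_hardlockup_other_cpu(next_cpu)) {
		/* only warn once */
		if (per_cpu(hard_watchdog_warn, next_cpu) == true)
			return;

		/*
		 * No NMI on this platform, so there is no backtrace from
		 * the locked-up cpu; name both cpus to avoid confusion.
		 */
		if (hardlockup_panic)
			panic("hrtimer watchdog on cpu %u detected hard lockup on cpu %u",
			      smp_processor_id(), next_cpu);
		else
			pr_warn("hrtimer watchdog on cpu %u detected hard lockup on cpu %u\n",
				smp_processor_id(), next_cpu);

		per_cpu(hard_watchdog_warn, next_cpu) = true;
	} else {
		per_cpu(hard_watchdog_warn, next_cpu) = false;
	}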

>> +             per_cpu(hard_watchdog_warn, next_cpu) = true;
>> +     } else {
>> +             per_cpu(hard_watchdog_warn, next_cpu) = false;
>> +     }
>> +}
>> +#else
>> +static inline void watchdog_check_hardlockup_other_cpu(void) { return; }
>> +#endif
>> +
>>
>> ...
>>
>> --- a/lib/Kconfig.debug
>> +++ b/lib/Kconfig.debug
>> @@ -191,15 +191,27 @@ config LOCKUP_DETECTOR
>>         The overhead should be minimal.  A periodic hrtimer runs to
>>         generate interrupts and kick the watchdog task every 4 seconds.
>>         An NMI is generated every 10 seconds or so to check for hardlockups.
>> +       If NMIs are not available on the platform, every 12 seconds the
>
> hm.  Is the old "4 seconds" still true/accurate/complete?

Yes, unless watchdog_thresh is changed on the command line or at run
time.  I can update the message to clarify that 4 seconds is the
default.

>> +       hrtimer interrupt on one cpu will be used to check for hardlockups
>> +       on the next cpu.
>>
>>         The frequency of hrtimer and NMI events and the soft and hard lockup
>>         thresholds can be controlled through the sysctl watchdog_thresh.
>>
>> -config HARDLOCKUP_DETECTOR
>> +config HARDLOCKUP_DETECTOR_NMI
>>       def_bool y
>>       depends on LOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG
>>       depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI
>
> Confused.  I'd have expected this to depend on HAVE_NMI_WATCHDOG,
> rather than -no-that.  What does "HAVE_NMI_WATCHDOG" actually mean and
> what's happening here?

HAVE_NMI_WATCHDOG used to be called ARCH_HAS_NMI_WATCHDOG, which was a
little clearer.  I believe it means that the architecture has its own
NMI watchdog implementation, and doesn't require this perf-based one.


* Re: [PATCH v2] hardlockup: detect hard lockups without NMIs using secondary cpus
  2013-01-15  0:13 ` Frederic Weisbecker
@ 2013-01-15  0:22   ` Colin Cross
  2013-01-15  0:25     ` Frederic Weisbecker
  2013-01-15 16:32   ` Paul E. McKenney
  1 sibling, 1 reply; 14+ messages in thread
From: Colin Cross @ 2013-01-15  0:22 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: lkml, Andrew Morton, Don Zickus, Ingo Molnar, Thomas Gleixner,
	liu chuansheng, linux-arm-kernel, Russell King - ARM Linux,
	Tony Lindgren

On Mon, Jan 14, 2013 at 4:13 PM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> 2013/1/11 Colin Cross <ccross@android.com>:
>> Emulate NMIs on systems where they are not available by using timer
>> interrupts on other cpus.  Each cpu will use its softlockup hrtimer
>> to check that the next cpu is processing hrtimer interrupts by
>> verifying that a counter is increasing.
>>
>> This patch is useful on systems where the hardlockup detector is not
>> available due to a lack of NMIs, for example most ARM SoCs.
>> Without this patch any cpu stuck with interrupts disabled can
>> cause a hardware watchdog reset with no debugging information,
>> but with this patch the kernel can detect the lockup and panic,
>> which can result in useful debugging info.
>>
>> Signed-off-by: Colin Cross <ccross@android.com>
>
> I believe this is pretty much what the RCU stall detector does
> already: checks for other CPUs being responsive. The only difference
> is on how it checks that. For RCU it's about checking for CPUs
> reporting quiescent states when requested to do so. In your case it's
> about ensuring the hrtimer interrupt is well handled.
>
> One thing you can do is to enqueue an RCU callback (call_rcu()) every
> minute so you can force other CPUs to report quiescent states
> periodically and thus check for lockups.

That's a good point, I'll take a look at using that.  A minute is too
long, some SoCs have maximum HW watchdog periods of under 30 seconds,
but a call_rcu every 10-20 seconds might be sufficient.

> Now you'll face the same problem in the end: if you don't have NMIs,
> you won't have a very useful report.

Yes, but it's still better than a silent reset.


* Re: [PATCH v2] hardlockup: detect hard lockups without NMIs using secondary cpus
  2013-01-15  0:19   ` Colin Cross
@ 2013-01-15  0:25     ` Andrew Morton
  2013-01-15  0:30       ` Colin Cross
  2013-01-15  1:40     ` Colin Cross
  1 sibling, 1 reply; 14+ messages in thread
From: Andrew Morton @ 2013-01-15  0:25 UTC (permalink / raw)
  To: Colin Cross
  Cc: lkml, Don Zickus, Ingo Molnar, Thomas Gleixner, liu chuansheng,
	linux-arm-kernel, Russell King - ARM Linux, Frederic Weisbecker,
	Tony Lindgren

On Mon, 14 Jan 2013 16:19:23 -0800
Colin Cross <ccross@android.com> wrote:

> >> +static void watchdog_check_hardlockup_other_cpu(void)
> >> +{
> >> +     unsigned int next_cpu;
> >> +
> >> +     /*
> >> +      * Test for hardlockups every 3 samples.  The sample period is
> >> +      *  watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
> >> +      *  watchdog_thresh (over by 20%).
> >> +      */
> >> +     if (__this_cpu_read(hrtimer_interrupts) % 3 != 0)
> >> +             return;
> >
> > The hardwired interval Seems Wrong.  watchdog_thresh is tunable at runtime.
> >
> > The comment could do with some fleshing out.  *why* do we want to test
> > at an interval "slightly over watchdog_thresh"?  What's going on here?
> 
> I'll reword it.  We don't want to be slightly over watchdog_thresh,
> ideally we would be exactly at watchdog_thresh.  However, since this
> relies on the hrtimer interrupts that are scheduled at watchdog_thresh
> * 2 / 5, there is no multiple of hrtimer_interrupts that will result
> in watchdog_thresh.  watchdog_thresh * 2 / 5 * 3 (watchdog_thresh *
> 1.2) is the closest I can get to testing for a hardlockup once every
> watchdog_thresh seconds.

It needs more than rewording, doesn't it?  What happens if watchdog_thresh is
altered at runtime?


* Re: [PATCH v2] hardlockup: detect hard lockups without NMIs using secondary cpus
  2013-01-15  0:22   ` Colin Cross
@ 2013-01-15  0:25     ` Frederic Weisbecker
  2013-01-15  1:53       ` Colin Cross
  0 siblings, 1 reply; 14+ messages in thread
From: Frederic Weisbecker @ 2013-01-15  0:25 UTC (permalink / raw)
  To: Colin Cross
  Cc: lkml, Andrew Morton, Don Zickus, Ingo Molnar, Thomas Gleixner,
	liu chuansheng, linux-arm-kernel, Russell King - ARM Linux,
	Tony Lindgren

2013/1/15 Colin Cross <ccross@android.com>:
> On Mon, Jan 14, 2013 at 4:13 PM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
>> I believe this is pretty much what the RCU stall detector does
>> already: checks for other CPUs being responsive. The only difference
>> is on how it checks that. For RCU it's about checking for CPUs
>> reporting quiescent states when requested to do so. In your case it's
>> about ensuring the hrtimer interrupt is well handled.
>>
>> One thing you can do is to enqueue an RCU callback (call_rcu()) every
>> minute so you can force other CPUs to report quiescent states
>> periodically and thus check for lockups.
>
> That's a good point, I'll take a look at using that.  A minute is too
> long, some SoCs have maximum HW watchdog periods of under 30 seconds,
> but a call_rcu every 10-20 seconds might be sufficient.

Sure. And you can tune CONFIG_RCU_CPU_STALL_TIMEOUT accordingly.


* Re: [PATCH v2] hardlockup: detect hard lockups without NMIs using secondary cpus
  2013-01-15  0:25     ` Andrew Morton
@ 2013-01-15  0:30       ` Colin Cross
  2013-01-23  2:38         ` Colin Cross
  0 siblings, 1 reply; 14+ messages in thread
From: Colin Cross @ 2013-01-15  0:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: lkml, Don Zickus, Ingo Molnar, Thomas Gleixner, liu chuansheng,
	linux-arm-kernel, Russell King - ARM Linux, Frederic Weisbecker,
	Tony Lindgren

On Mon, Jan 14, 2013 at 4:25 PM, Andrew Morton
<akpm@linux-foundation.org> wrote:
> On Mon, 14 Jan 2013 16:19:23 -0800
> Colin Cross <ccross@android.com> wrote:
>
>> >> +static void watchdog_check_hardlockup_other_cpu(void)
>> >> +{
>> >> +     unsigned int next_cpu;
>> >> +
>> >> +     /*
>> >> +      * Test for hardlockups every 3 samples.  The sample period is
>> >> +      *  watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
>> >> +      *  watchdog_thresh (over by 20%).
>> >> +      */
>> >> +     if (__this_cpu_read(hrtimer_interrupts) % 3 != 0)
>> >> +             return;
>> >
>> > The hardwired interval Seems Wrong.  watchdog_thresh is tunable at runtime.
>> >
>> > The comment could do with some fleshing out.  *why* do we want to test
>> > at an interval "slightly over watchdog_thresh"?  What's going on here?
>>
>> I'll reword it.  We don't want to be slightly over watchdog_thresh,
>> ideally we would be exactly at watchdog_thresh.  However, since this
>> relies on the hrtimer interrupts that are scheduled at watchdog_thresh
>> * 2 / 5, there is no multiple of hrtimer_interrupts that will result
>> in watchdog_thresh.  watchdog_thresh * 2 / 5 * 3 (watchdog_thresh *
>> 1.2) is the closest I can get to testing for a hardlockup once every
>> watchdog_thresh seconds.
>
> It needs more than rewording, doesn't it?  What happens if watchdog_thresh is
> altered at runtime?

I'm not sure what you mean.  If watchdog_thresh changes, the next
hrtimer interrupt on each cpu will move the following hrtimer
interrupt forward by the new watchdog_thresh * 2 / 5.  There may be a
single cycle of watchdog checks at an intermediate period, but nothing
bad should happen.

This code doesn't ever deal with watchdog_thresh directly, it is only
counting hrtimer interrupts.  3 hrtimer interrupts is always a
reasonable approximation of watchdog_thresh.


* Re: [PATCH v2] hardlockup: detect hard lockups without NMIs using secondary cpus
  2013-01-15  0:19   ` Colin Cross
  2013-01-15  0:25     ` Andrew Morton
@ 2013-01-15  1:40     ` Colin Cross
  1 sibling, 0 replies; 14+ messages in thread
From: Colin Cross @ 2013-01-15  1:40 UTC (permalink / raw)
  To: Andrew Morton
  Cc: lkml, Don Zickus, Ingo Molnar, Thomas Gleixner, liu chuansheng,
	linux-arm-kernel, Russell King - ARM Linux, Frederic Weisbecker,
	Tony Lindgren

On Mon, Jan 14, 2013 at 4:19 PM, Colin Cross <ccross@android.com> wrote:
> On Mon, Jan 14, 2013 at 3:49 PM, Andrew Morton
> <akpm@linux-foundation.org> wrote:
>> On Fri, 11 Jan 2013 13:51:48 -0800
>> Colin Cross <ccross@android.com> wrote:
>>
>>> Emulate NMIs on systems where they are not available by using timer
>>> interrupts on other cpus.  Each cpu will use its softlockup hrtimer
>>> to check that the next cpu is processing hrtimer interrupts by
>>> verifying that a counter is increasing.
>>
>> Seems sensible.
>>
>>> This patch is useful on systems where the hardlockup detector is not
>>> available due to a lack of NMIs, for example most ARM SoCs.
>>> Without this patch any cpu stuck with interrupts disabled can
>>> cause a hardware watchdog reset with no debugging information,
>>> but with this patch the kernel can detect the lockup and panic,
>>> which can result in useful debugging info.
>>
>> But we don't get the target cpu's stack, yes?  That's a pretty big loss.
>
> It's a huge loss, but it's still useful.  For one, it can separate
> "linux locked up one cpu" bugs from "the whole cpu complex stopped
> responding" bugs, which are much more common than you would hope on
> ARM cpus.  Also, as a separate patch I'm hoping to add reading the
> DBGPCSR register of both cpus during panic, which will at least give
> you the PC of the cpu that is stuck.
>
>>>
>>> ...
>>>
>>> +#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
>>> +static unsigned int watchdog_next_cpu(unsigned int cpu)
>>> +{
>>> +     cpumask_t cpus = watchdog_cpus;
>>
>> cpumask_t can be tremendously huge and putting one on the stack is
>> risky.  Can we use watchdog_cpus directly here?  Perhaps with a lock?
>> or take a copy into a static local, with a lock?
>
> Sure, I can use a lock around it.  I'm used to very small numbers of cpus.
>

On second thought, I'm just going to remove the local copy and read
the global directly.  watchdog_cpus is updated with atomic bitmask
operations, so there is no erroneous value that could be returned when
referencing the global directly that couldn't also occur with a
slightly different order of updates.  The local copy is also not
completely atomic, since the bitmask could span multiple words.  All
intermediate values during multiple sequential updates should already
be handled by setting watchdog_nmi_touch on the appropriate cpus
during watchdog_nmi_enable and watchdog_nmi_disable.
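
With the local copy gone, watchdog_next_cpu() becomes roughly:

static unsigned int watchdog_next_cpu(unsigned int cpu)
{
	unsigned int next_cpu;

	next_cpu = cpumask_next(cpu, &watchdog_cpus);
	if (next_cpu >= nr_cpu_ids)
		next_cpu = cpumask_first(&watchdog_cpus);

	if (next_cpu == cpu)
		return nr_cpu_ids;

	return next_cpu;
}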


* Re: [PATCH v2] hardlockup: detect hard lockups without NMIs using secondary cpus
  2013-01-15  0:25     ` Frederic Weisbecker
@ 2013-01-15  1:53       ` Colin Cross
  2013-01-15  2:48         ` Frederic Weisbecker
  0 siblings, 1 reply; 14+ messages in thread
From: Colin Cross @ 2013-01-15  1:53 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: lkml, Andrew Morton, Don Zickus, Ingo Molnar, Thomas Gleixner,
	liu chuansheng, linux-arm-kernel, Russell King - ARM Linux,
	Tony Lindgren

On Mon, Jan 14, 2013 at 4:25 PM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> 2013/1/15 Colin Cross <ccross@android.com>:
>> On Mon, Jan 14, 2013 at 4:13 PM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
>>> I believe this is pretty much what the RCU stall detector does
>>> already: checks for other CPUs being responsive. The only difference
>>> is on how it checks that. For RCU it's about checking for CPUs
>>> reporting quiescent states when requested to do so. In your case it's
>>> about ensuring the hrtimer interrupt is well handled.
>>>
>>> One thing you can do is to enqueue an RCU callback (call_rcu()) every
>>> minute so you can force other CPUs to report quiescent states
>>> periodically and thus check for lockups.
>>
>> That's a good point, I'll take a look at using that.  A minute is too
>> long, some SoCs have maximum HW watchdog periods of under 30 seconds,
>> but a call_rcu every 10-20 seconds might be sufficient.
>
> Sure. And you can tune CONFIG_RCU_CPU_STALL_TIMEOUT accordingly.

After considering this, I think the hrtimer watchdog is more useful.
RCU stalls are not usually panic events, and I wouldn't want to add a
panic on every RCU stall.  The lack of stack traces on the affected
cpu makes a panic important.  I'm planning to add an ARM DBGPCSR panic
handler, which will be able to dump the PC of a stuck cpu even if it
is not responding to interrupts.  kexec or kgdb on panic might also
allow some inspection of the stack on the stuck cpu.

Failing to process interrupts is a much more serious event than an RCU
stall, and being able to detect them separately may be very valuable
for debugging.


* Re: [PATCH v2] hardlockup: detect hard lockups without NMIs using secondary cpus
  2013-01-15  1:53       ` Colin Cross
@ 2013-01-15  2:48         ` Frederic Weisbecker
  2013-01-15  3:26           ` Colin Cross
  0 siblings, 1 reply; 14+ messages in thread
From: Frederic Weisbecker @ 2013-01-15  2:48 UTC (permalink / raw)
  To: Colin Cross
  Cc: lkml, Andrew Morton, Don Zickus, Ingo Molnar, Thomas Gleixner,
	liu chuansheng, linux-arm-kernel, Russell King - ARM Linux,
	Tony Lindgren

2013/1/15 Colin Cross <ccross@android.com>:
> On Mon, Jan 14, 2013 at 4:25 PM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
>> 2013/1/15 Colin Cross <ccross@android.com>:
>>> On Mon, Jan 14, 2013 at 4:13 PM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
>>>> I believe this is pretty much what the RCU stall detector does
>>>> already: checks for other CPUs being responsive. The only difference
>>>> is on how it checks that. For RCU it's about checking for CPUs
>>>> reporting quiescent states when requested to do so. In your case it's
>>>> about ensuring the hrtimer interrupt is well handled.
>>>>
>>>> One thing you can do is to enqueue an RCU callback (call_rcu()) every
>>>> minute so you can force other CPUs to report quiescent states
>>>> periodically and thus check for lockups.
>>>
>>> That's a good point, I'll take a look at using that.  A minute is too
>>> long, some SoCs have maximum HW watchdog periods of under 30 seconds,
>>> but a call_rcu every 10-20 seconds might be sufficient.
>>
>> Sure. And you can tune CONFIG_RCU_CPU_STALL_TIMEOUT accordingly.
>
> After considering this, I think the hrtimer watchdog is more useful.
> RCU stalls are not usually panic events, and I wouldn't want to add a
> panic on every RCU stall.  The lack of stack traces on the affected
> cpu makes a panic important.  I'm planning to add an ARM DBGPCSR panic
> handler, which will be able to dump the PC of a stuck cpu even if it
> is not responding to interrupts.  kexec or kgdb on panic might also
> allow some inspection of the stack on the stuck cpu.
>
> Failing to process interrupts is a much more serious event than an RCU
> stall, and being able to detect them separately may be very valuable
> for debugging.

RCU stalls can happen for different reasons: softlockup (failure to
schedule another task), hardlockup (failure to process interrupts), or
a bug in RCU itself. But if you have a hardlockup, it will report it.

Now why do you need a panic in any case? I don't know DBGPCSR, is this
a breakpoint register? How do you plan to use it remotely from the CPU
that detects the lockup?


* Re: [PATCH v2] hardlockup: detect hard lockups without NMIs using secondary cpus
  2013-01-15  2:48         ` Frederic Weisbecker
@ 2013-01-15  3:26           ` Colin Cross
  0 siblings, 0 replies; 14+ messages in thread
From: Colin Cross @ 2013-01-15  3:26 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: lkml, Andrew Morton, Don Zickus, Ingo Molnar, Thomas Gleixner,
	liu chuansheng, linux-arm-kernel, Russell King - ARM Linux,
	Tony Lindgren

On Mon, Jan 14, 2013 at 6:48 PM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> 2013/1/15 Colin Cross <ccross@android.com>:
>> On Mon, Jan 14, 2013 at 4:25 PM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
>>> 2013/1/15 Colin Cross <ccross@android.com>:
>>>> On Mon, Jan 14, 2013 at 4:13 PM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
>>>>> I believe this is pretty much what the RCU stall detector does
>>>>> already: checks for other CPUs being responsive. The only difference
>>>>> is on how it checks that. For RCU it's about checking for CPUs
>>>>> reporting quiescent states when requested to do so. In your case it's
>>>>> about ensuring the hrtimer interrupt is well handled.
>>>>>
>>>>> One thing you can do is to enqueue an RCU callback (call_rcu()) every
>>>>> minute so you can force other CPUs to report quiescent states
>>>>> periodically and thus check for lockups.
>>>>
>>>> That's a good point, I'll take a look at using that.  A minute is too
>>>> long, some SoCs have maximum HW watchdog periods of under 30 seconds,
>>>> but a call_rcu every 10-20 seconds might be sufficient.
>>>
>>> Sure. And you can tune CONFIG_RCU_CPU_STALL_TIMEOUT accordingly.
>>
>> After considering this, I think the hrtimer watchdog is more useful.
>> RCU stalls are not usually panic events, and I wouldn't want to add a
>> panic on every RCU stall.  The lack of stack traces on the affected
>> cpu makes a panic important.  I'm planning to add an ARM DBGPCSR panic
>> handler, which will be able to dump the PC of a stuck cpu even if it
>> is not responding to interrupts.  kexec or kgdb on panic might also
>> allow some inspection of the stack on the stuck cpu.
>>
>> Failing to process interrupts is a much more serious event than an RCU
>> stall, and being able to detect them separately may be very valuable
>> for debugging.
>
> RCU stalls can happen for different reasons: softlockup (failure to
> schedule another task), hardlockup (failure to process interrupts), or
> a bug in RCU itself. But if you have a hardlockup, it will report it.

It will report it, but in the same way that it reports a less serious
issue, and in this case with zero debugging information since the
affected cpu won't dump its backtrace.  Better than nothing, but not
as useful as a panic can be.

> Now why do you need a panic in any case? I don't know DBGPCSR, is this
> a breakpoint register? How do you plan to use it remotely from the CPU
> that detects the lockup?

Panics can trigger extra debugging tools, like the kexec and kgdb
examples I mentioned earlier.

DBGPCSR is the "DeBuG Program Counter Sampling Register".  It is a
memory mapped register available on many (all?) ARM Cortex cpus that
returns a recent PC value for the cpu.  I have used it along with this
patch, and it produces very useful information.
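
Very roughly, the panic handler would look something like this
(dbg_base[] and DBGPCSR_OFFSET are placeholders here; the real values
come from the SoC's debug register layout):

static void __iomem *dbg_base[NR_CPUS];	/* mapped debug banks, per cpu */

static int dump_stuck_cpu_pc(struct notifier_block *nb,
			     unsigned long event, void *unused)
{
	int cpu;

	for_each_online_cpu(cpu) {
		if (cpu == smp_processor_id() || !dbg_base[cpu])
			continue;
		pr_emerg("cpu %d recent PC: 0x%08x\n", cpu,
			 readl(dbg_base[cpu] + DBGPCSR_OFFSET));
	}
	return NOTIFY_OK;
}

static struct notifier_block dump_stuck_cpu_pc_nb = {
	.notifier_call = dump_stuck_cpu_pc,
};

registered on panic_notifier_list with atomic_notifier_chain_register().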


* Re: [PATCH v2] hardlockup: detect hard lockups without NMIs using secondary cpus
  2013-01-15  0:13 ` Frederic Weisbecker
  2013-01-15  0:22   ` Colin Cross
@ 2013-01-15 16:32   ` Paul E. McKenney
  1 sibling, 0 replies; 14+ messages in thread
From: Paul E. McKenney @ 2013-01-15 16:32 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Colin Cross, linux-kernel, Andrew Morton, Don Zickus,
	Ingo Molnar, Thomas Gleixner, liu chuansheng, linux-arm-kernel,
	Russell King - ARM Linux, Tony Lindgren

On Tue, Jan 15, 2013 at 01:13:10AM +0100, Frederic Weisbecker wrote:
> 2013/1/11 Colin Cross <ccross@android.com>:
> > Emulate NMIs on systems where they are not available by using timer
> > interrupts on other cpus.  Each cpu will use its softlockup hrtimer
> > to check that the next cpu is processing hrtimer interrupts by
> > verifying that a counter is increasing.
> >
> > This patch is useful on systems where the hardlockup detector is not
> > available due to a lack of NMIs, for example most ARM SoCs.
> > Without this patch any cpu stuck with interrupts disabled can
> > cause a hardware watchdog reset with no debugging information,
> > but with this patch the kernel can detect the lockup and panic,
> > which can result in useful debugging info.
> >
> > Signed-off-by: Colin Cross <ccross@android.com>
> 
> I believe this is pretty much what the RCU stall detector does
> already: checks for other CPUs being responsive. The only difference
> is on how it checks that. For RCU it's about checking for CPUs
> reporting quiescent states when requested to do so. In your case it's
> about ensuring the hrtimer interrupt is well handled.
> 
> One thing you can do is to enqueue an RCU callback (call_rcu()) every
> minute so you can force other CPUs to report quiescent states
> periodically and thus check for lockups.

This would work in all but one case, and that is where RCU believes
that the non-responsive CPU is in dyntick-idle mode.  In that case,
RCU would not be expecting it to respond and would therefore ignore
any non-responsiveness.

> Now you'll face the same problem in the end: if you don't have NMIs,
> you won't have a very useful report.

Indeed, I must confess that I have thus far chickened out on solving
the general NMI problem.  The RCU stall detector does try to look at
stacks remotely in some cases, but this is often unreliable, and some
architectures seem to refuse to produce a remote stack trace.

							Thanx, Paul



* Re: [PATCH v2] hardlockup: detect hard lockups without NMIs using secondary cpus
  2013-01-15  0:30       ` Colin Cross
@ 2013-01-23  2:38         ` Colin Cross
  0 siblings, 0 replies; 14+ messages in thread
From: Colin Cross @ 2013-01-23  2:38 UTC (permalink / raw)
  To: Andrew Morton
  Cc: lkml, Don Zickus, Ingo Molnar, Thomas Gleixner, liu chuansheng,
	linux-arm-kernel, Russell King - ARM Linux, Frederic Weisbecker,
	Tony Lindgren

On Mon, Jan 14, 2013 at 4:30 PM, Colin Cross <ccross@android.com> wrote:
> On Mon, Jan 14, 2013 at 4:25 PM, Andrew Morton
> <akpm@linux-foundation.org> wrote:
>> On Mon, 14 Jan 2013 16:19:23 -0800
>> Colin Cross <ccross@android.com> wrote:
>>
>>> >> +static void watchdog_check_hardlockup_other_cpu(void)
>>> >> +{
>>> >> +     unsigned int next_cpu;
>>> >> +
>>> >> +     /*
>>> >> +      * Test for hardlockups every 3 samples.  The sample period is
>>> >> +      *  watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
>>> >> +      *  watchdog_thresh (over by 20%).
>>> >> +      */
>>> >> +     if (__this_cpu_read(hrtimer_interrupts) % 3 != 0)
>>> >> +             return;
>>> >
>>> > The hardwired interval Seems Wrong.  watchdog_thresh is tunable at runtime.
>>> >
>>> > The comment could do with some fleshing out.  *why* do we want to test
>>> > at an interval "slightly over watchdog_thresh"?  What's going on here?
>>>
>>> I'll reword it.  We don't want to be slightly over watchdog_thresh,
>>> ideally we would be exactly at watchdog_thresh.  However, since this
>>> relies on the hrtimer interrupts that are scheduled at watchdog_thresh
>>> * 2 / 5, there is no multiple of hrtimer_interrupts that will result
>>> in watchdog_thresh.  watchdog_thresh * 2 / 5 * 3 (watchdog_thresh *
>>> 1.2) is the closest I can get to testing for a hardlockup once every
>>> watchdog_thresh seconds.
>>
>> It needs more than rewording, doesn't it?  What happens if watchdog_thresh is
>> altered at runtime?
>
> I'm not sure what you mean.  If watchdog_thresh changes, the next
> hrtimer interrupt on each cpu will move the following hrtimer
> interrupt forward by the new watchdog_thresh * 2 / 5.  There may be a
> single cycle of watchdog checks at an intermediate period, but nothing
> bad should happen.
>
> This code doesn't ever deal with watchdog_thresh directly, it is only
> counting hrtimer interrupts.  3 hrtimer interrupts is always a
> reasonable approximation of watchdog_thresh.

I think I see the race you were referring to.  When the hrtimer
interrupt fires on the first cpu after changing the watchdog thresh,
it will see the new value and start firing the timer at the new
rate.  The other cpus may not see the new value for as long as the old
sample period.  If the new threshold is more than 3 times lower than
the old value, one cpu could easily get 3 hrtimer interrupts while
another cpu doesn't get any, triggering the lockup detector.

The exact same issue is already present in the existing NMI detector,
which is conceptually very similar to this one.  It has two
asynchronous timers, the NMI perf counter and the hrtimer.  If one
timer sees the new threshold before the other, their relative rates
will be wrong and the lockup detector may fire.  Setting
watchdog_nmi_touch is not good enough, as one timer could fire often
enough to trigger the hardlockup detector twice.  The only fix I can
think of is to set a timestamp far enough in the future to catch all
the ordering problems, and ignore lockups until sched_clock() is
higher than the timestamp.  I think it would need to be a delay of
twice the larger of the old and new watchdog_thresh values.
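
Something along these lines (names made up, only to illustrate):

static u64 lockup_check_resume;	/* sched_clock() time to resume checks */

/* called from the watchdog_thresh sysctl handler */
static void watchdog_thresh_changed(unsigned int old_thresh,
				    unsigned int new_thresh)
{
	u64 grace = (u64)max(old_thresh, new_thresh) * 2 * NSEC_PER_SEC;

	lockup_check_resume = sched_clock() + grace;
}

/* ...and at the top of each hardlockup check: */
	if (sched_clock() < lockup_check_resume)
		return;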

But it's worse than that: the NMI perf counter is never updated when
watchdog_thresh changes, so raising watchdog_thresh more than 2.5x may
trigger the NMI detector.

I'll send a separate patch for the first problem, but I don't have
anything to test resetting the NMI counter on so I'll leave that issue
for someone else.

