All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v4 1/2] Track hard and soft "short lockups" or "stalls."
@ 2011-08-04 22:13 Alex Neronskiy
  2011-08-04 22:13 ` [PATCH v4 2/2] Output stall traces in /proc Alex Neronskiy
  0 siblings, 1 reply; 12+ messages in thread
From: Alex Neronskiy @ 2011-08-04 22:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Don Zickus, Mandeep Singh Baines, Alex Neronskiy

Enable the lockup watchdog to not only respond to lockups beyond a
certain threshold, but also keep track of the worst sub-threshold
lockup (called a "stall") seen so far. The stalls are communicated
to userspace via /proc/sys/kernel/softstall_worst
and hardstall_worst, as well as through the kernel log, where a
stack trace is also included. Hard lockups are checked for more
frequently to collect additional data. Soft stalls are described
by the length of time between successive instances of scheduling
of the watchdog thread, and hard stalls are described by the
number of times a hard lockup check found that no interrupts had
been recorded. Some thresholds on these are set, to attempt to
prevent floods of useless data.

Signed-off-by: Alex Neronskiy <zakmagnus@chromium.org>
---
The motivation here is to be able to gather more data about just how
"extreme" latency conditions get inside the kernel, without
necessarily triggering a threshold and potentially setting off a
panic.

 include/linux/nmi.h |    6 ++-
 kernel/sysctl.c     |   18 +++++++-
 kernel/watchdog.c   |  124 ++++++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 129 insertions(+), 19 deletions(-)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 2d304ef..57cda21 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -48,9 +48,13 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *);
 u64 hw_nmi_get_sample_period(int watchdog_thresh);
 extern int watchdog_enabled;
 extern int watchdog_thresh;
+extern unsigned long worst_softstall;
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+extern unsigned long worst_hardstall;
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
 struct ctl_table;
 extern int proc_dowatchdog(struct ctl_table *, int ,
 			   void __user *, size_t *, loff_t *);
-#endif
+#endif /* CONFIG_LOCKUP_DETECTOR */
 
 #endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d65b5..a392952 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -767,7 +767,23 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-#endif
+	{
+		.procname       = "softstall_worst",
+		.data           = &worst_softstall,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+	},
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+	{
+		.procname       = "hardstall_worst",
+		.data           = &worst_hardstall,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+	},
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+#endif /* CONFIG_LOCKUP_DETECTOR */
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 	{
 		.procname       = "unknown_nmi_panic",
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 36491cd..8dd9ed4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -29,6 +29,8 @@
 
 int watchdog_enabled = 1;
 int __read_mostly watchdog_thresh = 10;
+unsigned long worst_softstall;
+static DEFINE_SPINLOCK(softstall_lock);
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
 static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -40,8 +42,29 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_missed);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
-#endif
+
+unsigned long worst_hardstall;
+static DEFINE_SPINLOCK(hardstall_lock);
+
+/* The number of consecutive hard check failures before it's a lockup */
+#define hardlockup_thresh (5)
+
+/*
+ * The minimum expected number of missed interrupts in a normal hard
+ * lockup check.
+ */
+#define hardstall_thresh (2)
+
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
+/*
+ * The number of milliseconds by which a stall must be worse than the
+ * current worst in order to be recorded. This prevents bursts of near-
+ * identical stalls.
+ */
+#define softstall_diff_thresh (100)
 
 /* boot commands */
 /*
@@ -98,19 +121,28 @@ __setup("nosoftlockup", nosoftlockup_setup);
  * the thresholds with a factor: we make the soft threshold twice the amount of
  * time the hard threshold is.
  */
-static int get_softlockup_thresh(void)
+static inline int get_softlockup_thresh(void)
 {
 	return watchdog_thresh * 2;
 }
 
 /*
- * Returns seconds, approximately.  We don't need nanosecond
+ * This is just the period of the timer function. A "stall" shorter than
+ * this is the expected behavior and is therefore not noteworthy.
+ */
+static inline int get_softstall_thresh(void)
+{
+	return 1000 / 5 * get_softlockup_thresh();
+}
+
+/*
+ * Returns milliseconds, approximately.  We don't need nanosecond
  * resolution, and we don't need to waste time with a big divide when
- * 2^30ns == 1.074s.
+ * 2^20ns == 1.049ms.
  */
 static unsigned long get_timestamp(int this_cpu)
 {
-	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
+	return cpu_clock(this_cpu) >> 20LL;  /* 2^20 ~= 10^6 */
 }
 
 static unsigned long get_sample_period(void)
@@ -176,25 +208,79 @@ void touch_softlockup_watchdog_sync(void)
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 /* watchdog detector functions */
-static int is_hardlockup(void)
+static void update_hardstall(unsigned long stall, int this_cpu)
+{
+	int update_stall = 0;
+
+	if (stall > hardstall_thresh && stall > worst_hardstall) {
+		unsigned long flags;
+		spin_lock_irqsave(&hardstall_lock, flags);
+		update_stall = stall > worst_hardstall;
+		if (update_stall)
+			worst_hardstall = stall;
+		spin_unlock_irqrestore(&hardstall_lock, flags);
+	}
+
+	if (update_stall) {
+		printk(KERN_WARNING "LOCKUP may be in progress! "
+			"Worst hard stall seen on CPU#%d: %lu interrupts missed\n",
+			this_cpu, stall);
+		dump_stack();
+	}
+}
+
+static int is_hardlockup(int this_cpu)
 {
 	unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
+	unsigned long hrint_saved = __this_cpu_read(hrtimer_interrupts_saved);
+	unsigned long ints_missed = 0;
+
+	__this_cpu_write(hrtimer_interrupts_saved, hrint);
 
-	if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
+	if (hrint_saved == hrint)
+		ints_missed = per_cpu(hrtimer_interrupts_missed, this_cpu)++;
+	else
+		__this_cpu_write(hrtimer_interrupts_missed, 0);
+
+	if (ints_missed >= hardlockup_thresh)
 		return 1;
 
-	__this_cpu_write(hrtimer_interrupts_saved, hrint);
+	update_hardstall(ints_missed, this_cpu);
 	return 0;
 }
 #endif
 
-static int is_softlockup(unsigned long touch_ts)
+static void update_softstall(unsigned long stall, int this_cpu)
 {
-	unsigned long now = get_timestamp(smp_processor_id());
+	int update_stall = 0;
+	if (stall > get_softstall_thresh() &&
+			stall > worst_softstall + softstall_diff_thresh) {
+		unsigned long flags;
+		spin_lock_irqsave(&softstall_lock, flags);
+		update_stall = stall > worst_softstall + softstall_diff_thresh;
+		if (update_stall)
+			worst_softstall = stall;
+		spin_unlock_irqrestore(&softstall_lock, flags);
+	}
+
+	if (update_stall) {
+		printk(KERN_WARNING "LOCKUP may be in progress! "
+				"Worst soft stall seen on CPU#%d: %lums\n",
+				this_cpu, stall);
+		dump_stack();
+	}
+}
+
+static int is_softlockup(unsigned long touch_ts, int this_cpu)
+{
+	unsigned long now = get_timestamp(this_cpu);
+	unsigned long stall = now - touch_ts;
 
 	/* Warn about unreasonable delays: */
-	if (time_after(now, touch_ts + get_softlockup_thresh()))
-		return now - touch_ts;
+	if (time_after(now, touch_ts + 1000 * get_softlockup_thresh()))
+		return stall;
+
+	update_softstall(stall, this_cpu);
 
 	return 0;
 }
@@ -214,6 +300,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
 		 struct perf_sample_data *data,
 		 struct pt_regs *regs)
 {
+	int lockup;
+	int this_cpu = smp_processor_id();
+
 	/* Ensure the watchdog never gets throttled */
 	event->hw.interrupts = 0;
 
@@ -228,9 +317,8 @@ static void watchdog_overflow_callback(struct perf_event *event,
 	 * fired multiple times before we overflow'd.  If it hasn't
 	 * then this is a good indication the cpu is stuck
 	 */
-	if (is_hardlockup()) {
-		int this_cpu = smp_processor_id();
-
+	lockup = is_hardlockup(this_cpu);
+	if (lockup) {
 		/* only print hardlockups once */
 		if (__this_cpu_read(hard_watchdog_warn) == true)
 			return;
@@ -261,6 +349,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
 	struct pt_regs *regs = get_irq_regs();
 	int duration;
+	int this_cpu = smp_processor_id();
 
 	/* kick the hardlockup detector */
 	watchdog_interrupt_count();
@@ -290,14 +379,14 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	 * indicate it is getting cpu time.  If it hasn't then
 	 * this is a good indication some task is hogging the cpu
 	 */
-	duration = is_softlockup(touch_ts);
+	duration = is_softlockup(touch_ts, this_cpu);
 	if (unlikely(duration)) {
 		/* only warn once */
 		if (__this_cpu_read(soft_watchdog_warn) == true)
 			return HRTIMER_RESTART;
 
 		printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
-			smp_processor_id(), duration,
+			this_cpu, duration >> 10L,
 			current->comm, task_pid_nr(current));
 		print_modules();
 		print_irqtrace_events(current);
@@ -371,6 +460,7 @@ static int watchdog_nmi_enable(int cpu)
 
 	wd_attr = &wd_hw_attr;
 	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+	do_div(wd_attr->sample_period, hardlockup_thresh);
 
 	/* Try to register using hardware perf events */
 	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
-- 
1.7.3.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v4 2/2] Output stall traces in /proc
  2011-08-04 22:13 [PATCH v4 1/2] Track hard and soft "short lockups" or "stalls." Alex Neronskiy
@ 2011-08-04 22:13 ` Alex Neronskiy
  2011-08-05 13:40   ` Don Zickus
  0 siblings, 1 reply; 12+ messages in thread
From: Alex Neronskiy @ 2011-08-04 22:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Ingo Molnar, Don Zickus, Mandeep Singh Baines, Alex Neronskiy

Instead of using the log, use files in /proc for trace output.
Printing to the log can result in watchdog touches, distorting the
very events being measured. Additionally, the information will not
distract from lockups when users view the log.

Signed-off-by: Alex Neronskiy <zakmagnus@chromium.org>
---
 kernel/watchdog.c |  107 ++++++++++++++++++++++++++++++++++++++++++----------
 1 files changed, 86 insertions(+), 21 deletions(-)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 8dd9ed4..9cdc6bb 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -23,13 +23,24 @@
 #include <linux/notifier.h>
 #include <linux/module.h>
 #include <linux/sysctl.h>
+#include <linux/stacktrace.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
 
 #include <asm/irq_regs.h>
 #include <linux/perf_event.h>
 
+#define STALL_MAX_TRACE_DEPTH 50
 int watchdog_enabled = 1;
 int __read_mostly watchdog_thresh = 10;
 unsigned long worst_softstall;
+static unsigned long softstall_trace_entries[STALL_MAX_TRACE_DEPTH];
+static struct stack_trace softstall_trace = {
+	.nr_entries = 0,
+	.max_entries = STALL_MAX_TRACE_DEPTH,
+	.entries = softstall_trace_entries,
+	.skip = 0
+};
 static DEFINE_SPINLOCK(softstall_lock);
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -46,6 +57,13 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_missed);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
 
 unsigned long worst_hardstall;
+static unsigned long hardstall_trace_entries[STALL_MAX_TRACE_DEPTH];
+static struct stack_trace hardstall_trace = {
+	.nr_entries = 0,
+	.max_entries = STALL_MAX_TRACE_DEPTH,
+	.entries = hardstall_trace_entries,
+	.skip = 0
+};
 static DEFINE_SPINLOCK(hardstall_lock);
 
 /* The number of consecutive hard check failures before it's a lockup */
@@ -210,23 +228,16 @@ void touch_softlockup_watchdog_sync(void)
 /* watchdog detector functions */
 static void update_hardstall(unsigned long stall, int this_cpu)
 {
-	int update_stall = 0;
-
 	if (stall > hardstall_thresh && stall > worst_hardstall) {
 		unsigned long flags;
 		spin_lock_irqsave(&hardstall_lock, flags);
-		update_stall = stall > worst_hardstall;
-		if (update_stall)
+		if (stall > worst_hardstall) {
 			worst_hardstall = stall;
+			hardstall_trace.nr_entries = 0;
+			save_stack_trace(&hardstall_trace);
+		}
 		spin_unlock_irqrestore(&hardstall_lock, flags);
 	}
-
-	if (update_stall) {
-		printk(KERN_WARNING "LOCKUP may be in progress!"
-			"Worst hard stall seen on CPU#%d: %lu interrupts missed\n",
-			this_cpu, stall);
-		dump_stack();
-	}
 }
 
 static int is_hardlockup(int this_cpu)
@@ -252,23 +263,17 @@ static int is_hardlockup(int this_cpu)
 
 static void update_softstall(unsigned long stall, int this_cpu)
 {
-	int update_stall = 0;
 	if (stall > get_softstall_thresh() &&
 			stall > worst_softstall + softstall_diff_thresh) {
 		unsigned long flags;
 		spin_lock_irqsave(&softstall_lock, flags);
-		update_stall = stall > worst_softstall + softstall_diff_thresh;
-		if (update_stall)
+		if (stall > worst_softstall + softstall_diff_thresh) {
 			worst_softstall = stall;
+			softstall_trace.nr_entries = 0;
+			save_stack_trace(&softstall_trace);
+		}
 		spin_unlock_irqrestore(&softstall_lock, flags);
 	}
-
-	if (update_stall) {
-		printk(KERN_WARNING "LOCKUP may be in progress!"
-				"Worst soft stall seen on CPU#%d: %lums\n",
-				this_cpu, stall);
-		dump_stack();
-	}
 }
 
 static int is_softlockup(unsigned long touch_ts, int this_cpu)
@@ -667,10 +672,63 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
 	.notifier_call = cpu_callback
 };
 
+static int show_stall_trace(struct seq_file *f, void *v)
+{
+	int i, end;
+	struct stack_trace *trace = f->private;
+	spinlock_t *lock;
+	unsigned long flags;
+
+	if (trace == &softstall_trace)
+		lock = &softstall_lock;
+	else
+		lock = &hardstall_lock;
+
+	end = trace->nr_entries;
+	/* don't bother printing the trailing value */
+	if (end < trace->max_entries)
+		end--;
+
+	spin_lock_irqsave(lock, flags);
+	for (i = 0; i < end; i++) {
+		seq_printf(f, "[<%pK>] %pS\n", (void *)trace->entries[i],
+				(void *)trace->entries[i]);
+	}
+	spin_unlock_irqrestore(lock, flags);
+
+	return 0;
+}
+
+static int softstall_trace_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_stall_trace, &softstall_trace);
+}
+
+static int hardstall_trace_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_stall_trace, &hardstall_trace);
+}
+
+static struct file_operations softstall_trace_ops = {
+	.open = softstall_trace_open,
+	.read = seq_read,
+	.write = seq_write,
+	.llseek = seq_lseek,
+	.release = seq_release
+};
+static struct file_operations hardstall_trace_ops = {
+	.open = hardstall_trace_open,
+	.read = seq_read,
+	.write = seq_write,
+	.llseek = seq_lseek,
+	.release = seq_release
+};
+
 void __init lockup_detector_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
+	struct proc_dir_entry *entry;
 
 	err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
 	WARN_ON(notifier_to_errno(err));
@@ -678,5 +736,12 @@ void __init lockup_detector_init(void)
 	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
 	register_cpu_notifier(&cpu_nfb);
 
+	entry = create_proc_entry("softstall_trace", 0444, NULL);
+	if (entry)
+		entry->proc_fops = &softstall_trace_ops;
+	entry = create_proc_entry("hardstall_trace", 0444, NULL);
+	if (entry)
+		entry->proc_fops = &hardstall_trace_ops;
+
 	return;
 }
-- 
1.7.3.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v4 2/2] Output stall traces in /proc
  2011-08-04 22:13 ` [PATCH v4 2/2] Output stall traces in /proc Alex Neronskiy
@ 2011-08-05 13:40   ` Don Zickus
  2011-08-05 17:12     ` Alex Neronskiy
  0 siblings, 1 reply; 12+ messages in thread
From: Don Zickus @ 2011-08-05 13:40 UTC (permalink / raw)
  To: Alex Neronskiy; +Cc: linux-kernel, Ingo Molnar, Mandeep Singh Baines

On Thu, Aug 04, 2011 at 03:13:11PM -0700, Alex Neronskiy wrote:
> Instead of using the log, use files in /proc for trace output.
> Printing to the log can result in watchdog touches, distorting the
> very events being measured. Additionally, the information will not
> distract from lockups when users view the log.

Interesting.  I didn't realize how easy it was to save those traces.  I
think this is a better approach because it hides this stuff in a proc
file.

I'll have to play with it a little bit, but I was wondering what happens when
you have multiple stack traces coming in.  Before when it was printing to
the console, it was probably a bit of professional courtesy to keep the
warnings to a minimum.  But if you are going to use the proc fs, we can
probably be a little noisier.  So I had two questions

If you don't get to the proc fs fast enough, are you overwriting the
shorter stall stack trace with a slightly longer stall stack trace.  It
seems the 

> +			hardstall_trace.nr_entries = 0;

would reset things and you would lose the original worst_stall.

Do we want to increment the worst stall time now?  Shouldn't we just stay
at the current threshold and continue to print the same stack trace?  To
me that is the definition of 'worst stall'.  This might contradict
something I said earlier, I don't know (if it does I apologize).  Instead
if someone wants to 'skip' that 'worst stall' they can possibly just
increment the threshold?

Or another idea I just had (it adds work to this), perhaps create a sample
frequency for the lockup timers.  What I mean by that is, currently the
timer fires 5 times (or 2.5 for hardlockups) before the NMI triggers.  We
could create a knob that allows us to sample at 10 or 20 or whatever.
Then next time the hrtimer goes off it would just reload the new
frequency.

This would allow better resolution on the worst stall I believe, but it
involves more knobs now. So the knobs would look like

watchdog_thresh - length of time before NMI fires (currently 10)
watchdog_sample_freq - number of times hrtimer fires before NMI fires
			(currently 5, well 2.5)
watchdog_worst_stall_hard - number of sample_freq before printing warning
				(currently 2 I think)


Because you converted to ms resolution, it is easier to divide the
frequency up.  Of course if you increase the sample frequency, you would
probably have to increase the worst stall number too.

The downside is the power management folks would whine that it drains
laptop batteries.  But Matthew Garrett sits a few cubes down from me, so I
can probably have him help me hook into the idle thread to quiesce the
timers.

Thoughts?

code comments below
> 
> Signed-off-by: Alex Neronskiy <zakmagnus@chromium.org>
> ---
>  kernel/watchdog.c |  107 ++++++++++++++++++++++++++++++++++++++++++----------
>  1 files changed, 86 insertions(+), 21 deletions(-)
> 
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 8dd9ed4..9cdc6bb 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -23,13 +23,24 @@
>  #include <linux/notifier.h>
>  #include <linux/module.h>
>  #include <linux/sysctl.h>
> +#include <linux/stacktrace.h>
> +#include <linux/seq_file.h>
> +#include <linux/proc_fs.h>
>  
>  #include <asm/irq_regs.h>
>  #include <linux/perf_event.h>
>  
> +#define STALL_MAX_TRACE_DEPTH 50
>  int watchdog_enabled = 1;
>  int __read_mostly watchdog_thresh = 10;
>  unsigned long worst_softstall;
> +static unsigned long softstall_trace_entries[STALL_MAX_TRACE_DEPTH];
> +static struct stack_trace softstall_trace = {
> +	.nr_entries = 0,
> +	.max_entries = STALL_MAX_TRACE_DEPTH,
> +	.entries = softstall_trace_entries,
> +	.skip = 0
> +};
>  static DEFINE_SPINLOCK(softstall_lock);
>  
>  static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
> @@ -46,6 +57,13 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_missed);
>  static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
>  
>  unsigned long worst_hardstall;
> +static unsigned long hardstall_trace_entries[STALL_MAX_TRACE_DEPTH];
> +static struct stack_trace hardstall_trace = {
> +	.nr_entries = 0,
> +	.max_entries = STALL_MAX_TRACE_DEPTH,
> +	.entries = hardstall_trace_entries,
> +	.skip = 0
> +};
>  static DEFINE_SPINLOCK(hardstall_lock);
>  
>  /* The number of consecutive hard check failures before it's a lockup */
> @@ -210,23 +228,16 @@ void touch_softlockup_watchdog_sync(void)
>  /* watchdog detector functions */
>  static void update_hardstall(unsigned long stall, int this_cpu)
>  {
> -	int update_stall = 0;
> -
>  	if (stall > hardstall_thresh && stall > worst_hardstall) {
>  		unsigned long flags;
>  		spin_lock_irqsave(&hardstall_lock, flags);
> -		update_stall = stall > worst_hardstall;
> -		if (update_stall)
> +		if (stall > worst_hardstall) {
>  			worst_hardstall = stall;
> +			hardstall_trace.nr_entries = 0;
> +			save_stack_trace(&hardstall_trace);
> +		}
>  		spin_unlock_irqrestore(&hardstall_lock, flags);
>  	}
> -
> -	if (update_stall) {
> -		printk(KERN_WARNING "LOCKUP may be in progress!"
> -			"Worst hard stall seen on CPU#%d: %lu interrupts missed\n",
> -			this_cpu, stall);
> -		dump_stack();
> -	}
>  }
>  
>  static int is_hardlockup(int this_cpu)
> @@ -252,23 +263,17 @@ static int is_hardlockup(int this_cpu)
>  
>  static void update_softstall(unsigned long stall, int this_cpu)
>  {
> -	int update_stall = 0;
>  	if (stall > get_softstall_thresh() &&
>  			stall > worst_softstall + softstall_diff_thresh) {
>  		unsigned long flags;
>  		spin_lock_irqsave(&softstall_lock, flags);
> -		update_stall = stall > worst_softstall + softstall_diff_thresh;
> -		if (update_stall)
> +		if (stall > worst_softstall + softstall_diff_thresh) {
>  			worst_softstall = stall;
> +			softstall_trace.nr_entries = 0;
> +			save_stack_trace(&softstall_trace);
> +		}
>  		spin_unlock_irqrestore(&softstall_lock, flags);
>  	}
> -
> -	if (update_stall) {
> -		printk(KERN_WARNING "LOCKUP may be in progress!"
> -				"Worst soft stall seen on CPU#%d: %lums\n",
> -				this_cpu, stall);
> -		dump_stack();
> -	}
>  }
>  
>  static int is_softlockup(unsigned long touch_ts, int this_cpu)
> @@ -667,10 +672,63 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
>  	.notifier_call = cpu_callback
>  };
>  
> +static int show_stall_trace(struct seq_file *f, void *v)
> +{
> +	int i, end;
> +	struct stack_trace *trace = f->private;
> +	spinlock_t *lock;

This lock is defined locally, I don't think that is what you want.  Locks
need to be shared variables.  You might want to use a mutex here so it
need to be a shared variable.  You might want to use a mutex here so it
can sleep in case the stack trace is long.  It is only called from the
user context I believe so it should be fine.

> +	unsigned long flags;
> +
> +	if (trace == &softstall_trace)
> +		lock = &softstall_lock;
> +	else
> +		lock = &hardstall_lock;
> +
> +	end = trace->nr_entries;
> +	/* don't bother printing the trailing value */
> +	if (end < trace->max_entries)
> +		end--;
> +
> +	spin_lock_irqsave(lock, flags);
> +	for (i = 0; i < end; i++) {
> +		seq_printf(f, "[<%pK>] %pS\n", (void *)trace->entries[i],
> +				(void *)trace->entries[i]);
> +	}
> +	spin_unlock_irqrestore(lock, flags);
> +
> +	return 0;
> +}
> +
> +static int softstall_trace_open(struct inode *inode, struct file *file)
> +{
> +	return single_open(file, show_stall_trace, &softstall_trace);
> +}
> +
> +static int hardstall_trace_open(struct inode *inode, struct file *file)
> +{
> +	return single_open(file, show_stall_trace, &hardstall_trace);
> +}
> +
> +static struct file_operations softstall_trace_ops = {
> +	.open = softstall_trace_open,
> +	.read = seq_read,
> +	.write = seq_write,
> +	.llseek = seq_lseek,
> +	.release = seq_release
> +};
> +static struct file_operations hardstall_trace_ops = {
> +	.open = hardstall_trace_open,
> +	.read = seq_read,
> +	.write = seq_write,
> +	.llseek = seq_lseek,
> +	.release = seq_release
> +};
> +
>  void __init lockup_detector_init(void)
>  {
>  	void *cpu = (void *)(long)smp_processor_id();
>  	int err;
> +	struct proc_dir_entry *entry;
>  
>  	err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
>  	WARN_ON(notifier_to_errno(err));
> @@ -678,5 +736,12 @@ void __init lockup_detector_init(void)
>  	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
>  	register_cpu_notifier(&cpu_nfb);
>  
> +	entry = create_proc_entry("softstall_trace", 0444, NULL);
> +	if (entry)
> +		entry->proc_fops = &softstall_trace_ops;
> +	entry = create_proc_entry("hardstall_trace", 0444, NULL);
> +	if (entry)
> +		entry->proc_fops = &hardstall_trace_ops;

Where do these get created?  Under /proc?  I don't think that will be
allowed.  I would think this is a debug feature and we should create
something under /sys/kernel/debug like lockup_detector and stick these files in
there.

Cheers,
Don

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v4 2/2] Output stall traces in /proc
  2011-08-05 13:40   ` Don Zickus
@ 2011-08-05 17:12     ` Alex Neronskiy
  2011-08-05 18:43       ` Don Zickus
  0 siblings, 1 reply; 12+ messages in thread
From: Alex Neronskiy @ 2011-08-05 17:12 UTC (permalink / raw)
  To: Don Zickus; +Cc: linux-kernel, Ingo Molnar, Mandeep Singh Baines

On Fri, Aug 5, 2011 at 6:40 AM, Don Zickus <dzickus@redhat.com> wrote:
> I'll have to play with a little bit, but I was wondering what happens when
> you have multiple stack traces coming in.  Before when it was printing to
> the console, it was probably a bit of professional courtesy to keep the
> warnings to a minimum.  But if you are going to use the proc fs, we can
> probably be a little noisier.  So I had two questions
>
> If you don't get to the proc fs fast enough, are you overwriting the
> shorter stall stack trace with a slightly longer stall stack trace.  It
> seems the
>
>> +                     hardstall_trace.nr_entries = 0;
>
> would reset things and you would lose the original worst_stall.
>
> Do we want to increment the worst stall time now?  Shouldn't we just stay
> at the current threshold and continue to print the same stack trace?  To
> me that is the definition of 'worst stall'.  This might contradict
> something I said earlier, I don't know (if it does I apologize).  Instead
> if someone wants to 'skip' that 'worst stall' they can possibly just
> increment the threshold?
I don't understand your notion of "worst." If there is a stall and a
slightly longer stall, then the longer one is the worst one.

I do see what you're saying about affording more noise. If we do what
you're saying (if I understand correctly), then the "worst" will not
really be the worst at all, but instead a purely user-tunable
threshold. The way the code is now, the user can already set their own
threshold, but the threshold will also adjust itself automatically.
The question becomes, "what is the best use case to support by
default?" For example, if a system is allowed to operate for a while
and then the user goes to check the stalls, then if we can only tell
them about one stall, the most informative one is probably the worst
one that happened during that period of operation. Based on your
suggestion, what they would be shown instead is the most recent stall.
After checking the stalls, the user can reset the worst/threshold to 0
or whatever else they would like and then check again later after
another while.

If the user is willing to micromanage these settings and basically
react to every stall, then they would benefit most from the threshold
never automatically adjusting itself. However, if they only check once
in a while, perhaps not even in response to any particular event, then
they could be better off if the kernel did some simple filtering on
its own and gave them back the most "interesting" piece of
information.

> Or another idea I just had (it adds work to this), perhaps create a sample
> frequency for the lockup timers.  What I mean by that is, currently the
> timer fires 5 times (or 2.5 for hardlockups) before the NMI triggers.  We
> could create a knob that allows us to sample at 10 or 20 or whatever.
> Then next time the hrtimer goes off it would just reload the new
> frequency.
>
> This would allow better resolution on the worst stall I believe, but it
> invovles more knobs now. So the knobs would look like
>
> watchdog_thresh - length of time before NMI fires (currently 10)
> watchdog_sample_freq - number of times hrtimer fires before NMI fires
>                        (currently 5, well 2.5)
> watchdog_worst_stall_hard - number of sample_freq before printing warning
>                                (currently 2 I think)
>
>
> Because you converted to ms resolution, it is easier to divide the
> frequency up.  Of course if you increase the sample frequency, you would
> probably have to increase the worst stall number too.
>
> The downside is the power management folks would whine that it drains
> laptop batteries.  But Matthew Garret sits a few cubes down from me, so I
> can probably have him help me hook into the idle thread to quiesce the
> timers.
>
> Thoughts?
Sounds useful. Doing this stuff excessively may waste not only power
but maybe even CPU time? Anyway, nothing wrong with making it
adjustable, as far as I can tell. However, I'd like it if at least the
basic foundation could get accepted soon, if only so I can pull it
down to Chromium OS ASAP.

> This lock is defined locally, I don't think that is what you want.  Locks
> need to be a shared variable.  You might want to use a mutex here so it
> can sleep in case the stack trace is long.  It is only called from the
> user context I believe so it should be fine.
Doesn't it just use one of the locks defined at the top of the file?
This lock protects the stack trace data structure, so it's not read
and written at once. This also means it is taken up by the kernel
during interrupts.

> Where do these get created?  Under /proc?  I don't think that will be
> allowed.  I would think this is a debug feature and we should create
> something under /sys/kernel/debug like lockup_detector and stick these files in
> there.
Okay, and I suppose I'll shove the *stall_worst files into the same
place (currently they're in /proc/sys/kernel/).

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v4 2/2] Output stall traces in /proc
  2011-08-05 17:12     ` Alex Neronskiy
@ 2011-08-05 18:43       ` Don Zickus
  2011-08-05 21:26         ` ZAK Magnus
  0 siblings, 1 reply; 12+ messages in thread
From: Don Zickus @ 2011-08-05 18:43 UTC (permalink / raw)
  To: Alex Neronskiy; +Cc: linux-kernel, Ingo Molnar, Mandeep Singh Baines

On Fri, Aug 05, 2011 at 10:12:15AM -0700, Alex Neronskiy wrote:
> On Fri, Aug 5, 2011 at 6:40 AM, Don Zickus <dzickus@redhat.com> wrote:
> > I'll have to play with a little bit, but I was wondering what happens when
> > you have multiple stack traces coming in.  Before when it was printing to
> > the console, it was probably a bit of professional courtesy to keep the
> > warnings to a minimum.  But if you are going to use the proc fs, we can
> > probably be a little noisier.  So I had two questions
> >
> > If you don't get to the proc fs fast enough, are you overwriting the
> > shorter stall stack trace with a slightly longer stall stack trace.  It
> > seems the
> >
> >> +                     hardstall_trace.nr_entries = 0;
> >
> > would reset things and you would lose the original worst_stall.
> >
> > Do we want to increment the worst stall time now?  Shouldn't we just stay
> > at the current threshold and continue to print the same stack trace?  To
> > me that is the definition of 'worst stall'.  This might contradict
> > something I said earlier, I don't know (if it does I apologize).  Instead
> > if someone wants to 'skip' that 'worst stall' they can possibly just
> > increment the threshold?
> I don't understand your notion of "worst." If there is a stall and a
> slightly longer stall, then the longer one is the worst one.

Yeah, I don't know why I had it my head this morning that the shorter the
stall the more important it is.  That was pretty dumb.

> 
> I do see what you're saying about affording more noise. If we do what
> you're saying (if I understand correctly), then the "worst" will not
> really be the worst at all, but instead a purely user-tunable
> threshold. The way the code is now, the user can already set their own
> threshold, but the threshold will also adjust itself automatically.
> The question becomes, "what is the best use case to support by
> default?" For example, if a system is allowed to operate for a while
> and then the user goes to check the stalls, then if we can only tell
> them about one stall, the most informative one is probably the worst
> one that happened during that period of operation. Based on your
> suggestion, what they would be shown instead is the most recent stall.
> After checking the stalls, the user can reset the worst/threshold to 0
> or whatever else they would like and then check again later after
> another while.

I understand why you were resetting the stack trace all the time now.

> 
> If the user is willing to micromanage these settings and basically
> react to every stall, then they would benefit most from the threshold
> never automatically adjusting itself. However, if they only check once
> in a while, perhaps not even in response to any particular event, then
> they could be better off if the kernel did some simple filtering on
> its own and gave them back the most "interesting" piece of
> information.

Right.

> 
> > Or another idea I just had (it adds work to this), perhaps create a sample
> > frequency for the lockup timers.  What I mean by that is, currently the
> > timer fires 5 times (or 2.5 for hardlockups) before the NMI triggers.  We
> > could create a knob that allows us to sample at 10 or 20 or whatever.
> > Then next time the hrtimer goes off it would just reload the new
> > frequency.
> >
> > This would allow better resolution on the worst stall I believe, but it
> > involves more knobs now. So the knobs would look like
> >
> > watchdog_thresh - length of time before NMI fires (currently 10)
> > watchdog_sample_freq - number of times hrtimer fires before NMI fires
> >                        (currently 5, well 2.5)
> > watchdog_worst_stall_hard - number of sample_freq before printing warning
> >                                (currently 2 I think)
> >
> >
> > Because you converted to ms resolution, it is easier to divide the
> > frequency up.  Of course if you increase the sample frequency, you would
> > probably have to increase the worst stall number too.
> >
> > The downside is the power management folks would whine that it drains
> laptop batteries.  But Matthew Garrett sits a few cubes down from me, so I
> > can probably have him help me hook into the idle thread to quiesce the
> > timers.
> >
> > Thoughts?
> Sounds useful. Doing this stuff excessively may waste not only power
> but maybe even CPU time? Anyway, nothing wrong with making it
> adjustable, as far as I can tell. However, I'd like it if at least the
> basic foundation could get accepted soon, if only so I can pull it
> down to Chromium OS ASAP.

Well, the locking needs to be cleaned up *see below*.

> 
> > This lock is defined locally, I don't think that is what you want.  Locks
> > need to be a shared variable.  You might want to use a mutex here so it
> > can sleep in case the stack trace is long.  It is only called from the
> > user context I believe so it should be fine.
> Doesn't it just use one of the locks defined at the top of the file?
> This lock protects the stack trace data structure, so it's not read
> and written at once. This also means it is taken up by the kernel
> during interrupts.

I missed that you defined that as a pointer to a spinlock and assigned it
later.  I see what you are doing now, but I am not a fan of it because you
are now using the same spinlock in both the NMI context and the userspace
context.  This can cause deadlocks if something got screwed up in the
seq_printf functions or produced a very large amount of data.  Normally
you don't want to do that.

What others have done like perf and the APEI error handling is use
something called irq_work_queue(??).  Basically you would capture the
trace in the NMI context, put it on an irq_work_queue and in the
interrupt context save it to your global trace variable.  Then you could
put spin_lock_irqsave inside the proc sys function and the work queue
function and not have any potential deadlocks.

The softstall case should be ok though.

> 
> > Where do these get created?  Under /proc?  I don't think that will be
> > allowed.  I would think this is a debug feature and we should create
> > something under /sys/kernel/debug like lockup_detector and stick these files in
> > there.
> Okay, and I suppose I'll shove the *stall_worst files into the same
> place (currently they're in /proc/sys/kernel/).

I think that would make sense too.

Cheers,
Don


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v4 2/2] Output stall traces in /proc
  2011-08-05 18:43       ` Don Zickus
@ 2011-08-05 21:26         ` ZAK Magnus
  2011-08-08 20:19           ` Don Zickus
  0 siblings, 1 reply; 12+ messages in thread
From: ZAK Magnus @ 2011-08-05 21:26 UTC (permalink / raw)
  To: Don Zickus; +Cc: linux-kernel, Ingo Molnar, Mandeep Singh Baines

On Fri, Aug 5, 2011 at 11:43 AM, Don Zickus <dzickus@redhat.com> wrote:
> I missed that you defined that as a pointer to a spinlock and assigned it
> later.  I see what you are doing now, but I am not a fan of it because you
> are now using the same spinlock in both the NMI context and the userspace
> context.  This can cause deadlocks if something got screwed up in the
> seq_printf functions or produced a very large amount of data.  Normally
> you don't want to do that.
>
> What others have done like perf and the APEI error handling is use
> something called irq_work_queue(??).  Basically you would capture the
> trace in the NMI context, put it on an irq_work_queue and in the
> interrupt context save it to your global trace variable.  Then you could
> put spin_lock_irqsave inside the proc sys function and the work queue
> function and not have any potential deadlocks.
Work queue? Okay. The worker thread still needs a lock in order to
share the intermediate buffer with the NMI context, though. Any chance
of something screwing up in the middle of copying that structure,
causing a stall and deadlocking with the NMI?

Or maybe the intermediate buffer should be dynamically allocated. That
would work without a lock, although it seems slightly inefficient.

Regarding the lock between the work queue thread and the system call,
maybe that should become a mutex instead, since it's all outside of
interrupt context at that point?

> The softstall case should be ok though.
Why's that? The soft stall traces are not written in a NMI context but
just in a regular interrupt context, right? Doesn't that pose similar
problems?


These are weird rare corner cases anyway, right? Maybe the simplest
thing could be to let the interrupts only try_lock(), so they might
sometimes fail to record a stall, but it would be a pretty big
coincidence.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v4 2/2] Output stall traces in /proc
  2011-08-05 21:26         ` ZAK Magnus
@ 2011-08-08 20:19           ` Don Zickus
  2011-08-08 20:52             ` Peter Zijlstra
  0 siblings, 1 reply; 12+ messages in thread
From: Don Zickus @ 2011-08-08 20:19 UTC (permalink / raw)
  To: ZAK Magnus, peterz; +Cc: linux-kernel, Ingo Molnar, Mandeep Singh Baines

(adding Peter to the discussion)

On Fri, Aug 05, 2011 at 02:26:12PM -0700, ZAK Magnus wrote:
> On Fri, Aug 5, 2011 at 11:43 AM, Don Zickus <dzickus@redhat.com> wrote:
> > I missed that you defined that as a pointer to a spinlock and assigned it
> > later.  I see what you are doing now, but I am not a fan of it because you
> > are now using the same spinlock in both the NMI context and the userspace
> > context.  This can cause deadlocks if something got screwed up in the
> > seq_printf functions or produced a very large amount of data.  Normally
> > you don't want to do that.
> >
> > What others have done like perf and the APEI error handling is use
> > something called irq_work_queue(??).  Basically you would capture the
> > trace in the NMI context, put it on an irq_work_queue and in the
> > interrupt context save it to your global trace variable.  Then you could
> > put spin_lock_irqsave inside the proc sys function and the work queue
> > function and not have any potential deadlocks.
> Work queue? Okay. The worker thread still needs a lock in order to

not work_queue, irq_work_queue.

> share the intermediate buffer with the NMI context, though. Any chance
> of something screwing up in the middle of copying that structure,
> causing a stall and deadlocking with the NMI?

I believe irq_work_queue uses cmpxchg for all its locking and just swaps
entries on to a linked list?

> 
> Or maybe the intermediate buffer should be dynamically allocated. That
> would work without a lock, although it seems slightly inefficient.

Peter,

How does the irq_work_queue work such that you can save info in the NMI
context and safely pass it to the irq context for processing? 

> 
> Regarding the lock between the work queue thread and the system call,
> maybe that should become a mutex instead, since it's all outside of
> interrupt context at that point?

No it is still in the irq context.

Peter,

If we want to expose data captured in the NMI context through the procfs,
I assume we can pass that info along using irq_work_queue.  But then when
reading from procfs do we just lock the data with 'spin_lock_irq' to block
the irq_work_queue from manipulating the data?  (note we are expecting
data to be overwritten with fresh data, not serialized out like
trace/perf).

Cheers,
Don

> 
> > The softstall case should be ok though.
> Why's that? The soft stall traces are not written in a NMI context but
> just in a regular interrupt context, right? Doesn't that pose similar
> problems?
> 
> 
> These are weird rare corner cases anyway, right? Maybe the simplest
> thing could be to let the interrupts only try_lock(), so they might
> sometimes fail to record a stall, but it would be a pretty big
> coincidence.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v4 2/2] Output stall traces in /proc
  2011-08-08 20:19           ` Don Zickus
@ 2011-08-08 20:52             ` Peter Zijlstra
  2011-08-08 21:37               ` Don Zickus
  0 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2011-08-08 20:52 UTC (permalink / raw)
  To: Don Zickus; +Cc: ZAK Magnus, linux-kernel, Ingo Molnar, Mandeep Singh Baines

On Mon, 2011-08-08 at 16:19 -0400, Don Zickus wrote:

> I believe irq_work_queue uses cmpxchg for all its locking and just swaps
> entries on to a linked list?

yeah, cmpxchg to add entries, xchg to splice the whole list out.

> > 
> > Or maybe the intermediate buffer should be dynamically allocated. That
> > would work without a lock, although it seems slightly inefficient.
> 
> Peter,
> 
> How does the irq_work_queue work such that you can save info in the NMI
> context and safely pass it to the irq context for processing? 

It doesn't cover that part. It assumes a pre-allocated struct irq_work
exists and can enqueue that on a list, if its already enqueued, nothing
to do (irq_work_queue() returns that state).

Typically irq_work would be embedded in a larger structure.


> > Regarding the lock between the work queue thread and the system call,
> > maybe that should become a mutex instead, since it's all outside of
> > interrupt context at that point?
> 
> No it is still in the irq context.

Right, irq_work is typically run from hardirq context, either through
some self-IPI, raised at irq_work_queue() or in the fallback case from
the timer interrupt.

> Peter,
> 
> If we want to expose data captured in the NMI context through the procfs,
> I assume we can pass that info along using irq_work_queue.  But then when
> reading from procfs do we just lock the data with 'spin_lock_irq' to block
> the irq_work_queue from manipulating the data?  (note we are expecting
> data to be overwritten with fresh data, not serialized out like
> trace/perf).

Since its the NMI context that's generating the data, disabling IRQs
will obviously not do much good.

Also, if you've got a procfs output, what do you need irq_work for?

Depending on the type of data and kind of data loss you can incur, would
a static per-cpu buffer be ok? You can guard it by a single bit, if
cleared NMI can write, if set NMI will skip over and lose the data.

Then your procfs routine can set the bit, dump out the latest version of
the data and clear the bit again. If you always want the NMI thing to be
able to write, use two buffers, if you 'lock' then one at a time,
there's always one writable.

All you need is atomic bitops :-)

> > > The softstall case should be ok though.
> > Why's that? The soft stall traces are not written in a NMI context but
> > just in a regular interrupt context, right? Doesn't that pose similar
> > problems?

You should be able to do stack traces from NMI context, that's what perf
does after all.

> > These are weird rare corner cases anyway, right? Maybe the simplest
> > thing could be to let the interrupts only try_lock(), so they might
> > sometimes fail to record a stall, but it would be a pretty big
> > coincidence.

Sure you can do a trylock. I'm still not quite sure what you want the
irq_work for.. there's no guarantee the interrupt runs immediately after
the NMI, it could be the NMI is in the middle of a irq disabled region,
or the whole thing is ran on an architecture without
arch_irq_work_raise().

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v4 2/2] Output stall traces in /proc
  2011-08-08 20:52             ` Peter Zijlstra
@ 2011-08-08 21:37               ` Don Zickus
  2011-08-08 23:34                 ` Alex Neronskiy
  0 siblings, 1 reply; 12+ messages in thread
From: Don Zickus @ 2011-08-08 21:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: ZAK Magnus, linux-kernel, Ingo Molnar, Mandeep Singh Baines

On Mon, Aug 08, 2011 at 10:52:51PM +0200, Peter Zijlstra wrote:
> Since its the NMI context that's generating the data, disabling IRQs
> will obviously not do much good.
> 
> Also, if you've got a procfs output, what do you need irq_work for?
> 
> Depending on the type of data and kind of data loss you can incur, would
> a static per-cpu buffer be ok? You can guard it by a single bit, if
> cleared NMI can write, if set NMI will skip over and loose the data.
> 
> Then your procfs routine can set the bit, dump out the latest version of
> the data and clear the bit again. If you always want the NMI thing to be
> able to write, use two buffers, if you 'lock' then one at a time,
> there's always one writable.
> 
> All you need is atomic bitops :-)
> 
> > > > The softstall case should be ok though.
> > > Why's that? The soft stall traces are not written in a NMI context but
> > > just in a regular interrupt context, right? Doesn't that pose similar
> > > problems?
> 
> You should be able to do stack traces from NMI context, that's what perf
> does after all.
> 
> > > These are weird rare corner cases anyway, right? Maybe the simplest
> > > thing could be to let the interrupts only try_lock(), so they might
> > > sometimes fail to record a stall, but it would be a pretty big
> > > coincidence.
> 
> Sure you can do a trylock. I'm still not quite sure what you want the
> irq_work for.. there's no guarantee the interrupt runs immediately after
> the NMI, it could be the NMI is in the middle of a irq disabled region,
> or the whole thing is ran on an architecture without
> arch_irq_work_raise().

Maybe irq_work isn't what we needed.  I just wasn't smart enough to figure
out how to make sure we can write data in an NMI context and read it in a
normal context.  I supposed the whole swapping buffers could work and is
simpler.

Basically, Zak was working on a way to save stack traces of potential
hard/soft lockups (ones that lockup for a minimum amount of time but
un-lockup before actually triggering anything).  The idea was to get
visibility of who is spending time doing something wrong before it is too late.

Cheers,
Don


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v4 2/2] Output stall traces in /proc
  2011-08-08 21:37               ` Don Zickus
@ 2011-08-08 23:34                 ` Alex Neronskiy
  2011-08-09 21:08                   ` Don Zickus
  0 siblings, 1 reply; 12+ messages in thread
From: Alex Neronskiy @ 2011-08-08 23:34 UTC (permalink / raw)
  To: Don Zickus
  Cc: Peter Zijlstra, linux-kernel, Ingo Molnar, Mandeep Singh Baines

On Mon, Aug 8, 2011 at 2:37 PM, Don Zickus <dzickus@redhat.com> wrote:
> Maybe irq_work isn't what we needed.  I just wasn't smart enough to figure
> out how to make sure we can write data in an NMI context and read it in a
> normal context.  I supposed the whole swapping buffers could work and is
> simpler.
So each of the buffers has its own lock, right? If a lock protects a
pair of buffers, then: A reader takes the lock, and a writing NMI
comes in and writes to the non-readable buffer and swaps the two. The
reader still has the lock. Another NMI comes in, sees that the lock is
unavailable, and writes to the "backup" buffer, which is actually the
one the reader is still reading from. Bad corrupted read results.

Either way, I don't see how to make the idea work safely for one pair
of buffers shared by multiple CPU's. It works one-pair-per-CPU, but
that's not how the current design is. I guess it would need to
add/remove files every time a processor is added/removed, and there
have to be some other changes too, obviously. What do you think, Don?
Should this be a per-CPU thing, instead of global worst?

Other alternatives:
1. Try lock. The interrupts will simply not save their data if they
are locked out by readers.
2. Work queue + dynamic allocation. No, I don't mean irq_work. This
works, doesn't it? But it's a bit wasteful of cycles and memory.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v4 2/2] Output stall traces in /proc
  2011-08-08 23:34                 ` Alex Neronskiy
@ 2011-08-09 21:08                   ` Don Zickus
  2011-08-09 21:44                     ` Alex Neronskiy
  0 siblings, 1 reply; 12+ messages in thread
From: Don Zickus @ 2011-08-09 21:08 UTC (permalink / raw)
  To: Alex Neronskiy
  Cc: Peter Zijlstra, linux-kernel, Ingo Molnar, Mandeep Singh Baines

On Mon, Aug 08, 2011 at 04:34:33PM -0700, Alex Neronskiy wrote:
> On Mon, Aug 8, 2011 at 2:37 PM, Don Zickus <dzickus@redhat.com> wrote:
> > Maybe irq_work isn't what we needed.  I just wasn't smart enough to figure
> > out how to make sure we can write data in an NMI context and read it in a
> > normal context.  I supposed the whole swapping buffers could work and is
> > simpler.
> So each of the buffers has its own lock, right? If a lock protects a
> pair of buffers, then: A reader takes the lock, and a writing NMI
> comes in and writes to the non-readable buffer and swaps the two. The
> reader still has the lock. Another NMI comes in, sees that the lock is
> unavailable, and writes to the "backup" buffer, which is actually the
> one the reader is still reading from. Bad corrupted read results.

Actually it should just overwrite the non-readable buffer as that data is
now stale and useless.  The reader can atomically set which buffer is
being read.  The only problem is once you read it, you lose it.

> 
> Either way, I don't see how to make the idea work safely for one pair
> of buffers shared by multiple CPU's. It works one-pair-per-CPU, but
> that's not how the current design is. I guess it would need to
> add/remove files every time a processor is added/removed, and there
> have to be some other changes too, obviously. What do you think, Don?
> Should this be a per-CPU thing, instead of global worst?

Well, looking at the code again, I think the spin_locks in the NMI handler
will block the other cpus from writing to the page at the same time.  So
it gets serialized that way, I think.  The next trick is to do something
with procfs like swapping buffers successfully.

I am trying to think how that would work, but I guess if you use the
cmpxchg macros then

procfs could cmpxchg a READ_BIT on the buffer and if successful (no
WRITE_BIT), then proceed to read the buffer.  Otherwise use the other
buffer.  A lock would have to be used to serialize access on the procfs.

The NMI code could do the same with a WRITE_BIT and if successful (no
READ_BIT), then proceed to write the buffer.  Otherwise use the other
buffer.  Because the NMI is serialized only one write could go on at any
one time.

I was reading the kernel/irq_work.c code to generate the above idea.

Not sure if it works.  Thoughts?

Cheers,
Don

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v4 2/2] Output stall traces in /proc
  2011-08-09 21:08                   ` Don Zickus
@ 2011-08-09 21:44                     ` Alex Neronskiy
  0 siblings, 0 replies; 12+ messages in thread
From: Alex Neronskiy @ 2011-08-09 21:44 UTC (permalink / raw)
  To: Don Zickus
  Cc: Peter Zijlstra, linux-kernel, Ingo Molnar, Mandeep Singh Baines

On Tue, Aug 9, 2011 at 2:08 PM, Don Zickus <dzickus@redhat.com> wrote:
> Well, looking at the code again, I think the spin_locks in the NMI handler
> will block the other cpus from writing to the page at the same time.  So
> it gets serialized that way, I think.  The next trick is to do something
> with procfs like swapping buffers successfully.
>
> I am trying to think how that would work, but I guess if you use the
> cmpxchg macros then
>
> procfs could cmpxchg a READ_BIT on the buffer and if successful (no
> WRITE_BIT), then proceed to read the buffer.  Otherwise use the other
> buffer.  A lock would have to be used to serialize access on the procfs.
>
> The NMI code could do the same with a WRITE_BIT and if successful (no
> READ_BIT), then proceed to write the buffer.  Otherwise use the other
> buffer.  Because the NMI is serialized only one write could go on at any
> one time.
>
> I was reading the kernel/irq_work.c code to generate the above idea.
>
> Not sure if it works.  Thoughts?
>
> Cheers,
> Don
>
I don't think that the reader should use the other buffer if it finds
contention. Who knows what that other buffer contains? Maybe something
really stale, maybe nothing. However, I think it's okay for the reader
to wait to be able to obtain the lock from the writer; just not the
other way around. I have a similar idea sketched out and I'll try and
get a patch out soon. Basically, the reader will be really simple and
just obtain the lock on whatever buffer it's told and then read that
buffer and unlock. The worst that will happen is that it will read a
slightly stale value, and that's only because it read "before" the
buffers switched. The writer, meanwhile, merely tries to lock a
buffer, and if it fails, sets the other one as the writable one (this
state is not shared with the reader). Then it locks that one
(guaranteed no contention), writes out the data, tells the reader to
read from the new buffer from now on, and unlocks. Both readers and
writers are serialized.

It's probably pretty hard to follow that description. Hopefully the
patch will be clearer.

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2011-08-09 21:44 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-08-04 22:13 [PATCH v4 1/2] Track hard and soft "short lockups" or "stalls." Alex Neronskiy
2011-08-04 22:13 ` [PATCH v4 2/2] Output stall traces in /proc Alex Neronskiy
2011-08-05 13:40   ` Don Zickus
2011-08-05 17:12     ` Alex Neronskiy
2011-08-05 18:43       ` Don Zickus
2011-08-05 21:26         ` ZAK Magnus
2011-08-08 20:19           ` Don Zickus
2011-08-08 20:52             ` Peter Zijlstra
2011-08-08 21:37               ` Don Zickus
2011-08-08 23:34                 ` Alex Neronskiy
2011-08-09 21:08                   ` Don Zickus
2011-08-09 21:44                     ` Alex Neronskiy

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.