linux-kernel.vger.kernel.org archive mirror
* [PATCH v6 1/2] Track hard and soft "short lockups" or "stalls."
@ 2011-08-10 18:02 Alex Neronskiy
  2011-08-10 18:02 ` [PATCH v6 2/2] Output stall data in debugfs Alex Neronskiy
  2011-08-11 19:09 ` [PATCH v6 1/2] Track hard and soft "short lockups" or "stalls." Peter Zijlstra
  0 siblings, 2 replies; 21+ messages in thread
From: Alex Neronskiy @ 2011-08-10 18:02 UTC (permalink / raw)
  To: linux-kernel
  Cc: peterz, Ingo Molnar, Don Zickus, Mandeep Singh Baines, Alex Neronskiy

From: Alex Neronskiy <zakmagnus@chromium.com>

Enable the lockup watchdog to not only respond to lockups beyond a
certain threshold, but also keep track of the worst sub-threshold
lockup (called a "stall") seen so far. The stalls are communicated
to userspace via /proc/sys/kernel/softstall_worst
and hardstall_worst, as well as through the kernel log, where a
stack trace is also included. Hard lockups are checked for more
frequently to collect additional data. Soft stalls are described
by the length of time between successive instances of scheduling
of the watchdog thread, and hard stalls are described by the
number of times a hard lockup check found that no interrupts had
been recorded. Thresholds are applied to both in an attempt to
prevent floods of useless data.

Signed-off-by: Alex Neronskiy <zakmagnus@chromium.com>
---
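A minimal sketch of how the new interface could be read from userspace
(illustration only, not part of the patch; it only assumes the
softstall_worst sysctl exists as described above):

#include <stdio.h>

/* Print the worst soft stall recorded so far, in milliseconds. */
int main(void)
{
	unsigned long worst_ms = 0;
	FILE *f = fopen("/proc/sys/kernel/softstall_worst", "r");

	if (!f) {
		perror("softstall_worst");
		return 1;
	}
	if (fscanf(f, "%lu", &worst_ms) == 1)
		printf("worst soft stall so far: %lu ms\n", worst_ms);
	fclose(f);
	return 0;
}
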
 include/linux/nmi.h |    6 ++-
 kernel/sysctl.c     |   18 +++++++-
 kernel/watchdog.c   |  124 ++++++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 129 insertions(+), 19 deletions(-)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 2d304ef..57cda21 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -48,9 +48,13 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *);
 u64 hw_nmi_get_sample_period(int watchdog_thresh);
 extern int watchdog_enabled;
 extern int watchdog_thresh;
+extern unsigned long worst_softstall;
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+extern unsigned long worst_hardstall;
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
 struct ctl_table;
 extern int proc_dowatchdog(struct ctl_table *, int ,
 			   void __user *, size_t *, loff_t *);
-#endif
+#endif /* CONFIG_LOCKUP_DETECTOR */
 
 #endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d65b5..a392952 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -767,7 +767,23 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-#endif
+	{
+		.procname       = "softstall_worst",
+		.data           = &worst_softstall,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+	},
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+	{
+		.procname       = "hardstall_worst",
+		.data           = &worst_hardstall,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+	},
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+#endif /* CONFIG_LOCKUP_DETECTOR */
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 	{
 		.procname       = "unknown_nmi_panic",
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 36491cd..8dd9ed4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -29,6 +29,8 @@
 
 int watchdog_enabled = 1;
 int __read_mostly watchdog_thresh = 10;
+unsigned long worst_softstall;
+static DEFINE_SPINLOCK(softstall_lock);
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
 static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -40,8 +42,29 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_missed);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
-#endif
+
+unsigned long worst_hardstall;
+static DEFINE_SPINLOCK(hardstall_lock);
+
+/* The number of consecutive hard check failures before it's a lockup */
+#define hardlockup_thresh (5)
+
+/*
+ * The minimum expected number of missed interrupts in a normal hard
+ * lockup check.
+ */
+#define hardstall_thresh (2)
+
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
+/*
+ * The number of milliseconds by which a stall must be worse than the
+ * current worst in order to be recorded. This prevents bursts of near-
+ * identical stalls.
+ */
+#define softstall_diff_thresh (100)
 
 /* boot commands */
 /*
@@ -98,19 +121,28 @@ __setup("nosoftlockup", nosoftlockup_setup);
  * the thresholds with a factor: we make the soft threshold twice the amount of
  * time the hard threshold is.
  */
-static int get_softlockup_thresh(void)
+static inline int get_softlockup_thresh(void)
 {
 	return watchdog_thresh * 2;
 }
 
 /*
- * Returns seconds, approximately.  We don't need nanosecond
+ * This is just the period of the timer function. A "stall" shorter than
+ * this is the expected behavior and is therefore not noteworthy.
+ */
+static inline int get_softstall_thresh(void)
+{
+	return 1000 / 5 * get_softlockup_thresh();
+}
+
+/*
+ * Returns milliseconds, approximately.  We don't need nanosecond
  * resolution, and we don't need to waste time with a big divide when
- * 2^30ns == 1.074s.
+ * 2^20ns == 1.049ms.
  */
 static unsigned long get_timestamp(int this_cpu)
 {
-	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
+	return cpu_clock(this_cpu) >> 20LL;  /* 2^20 ~= 10^6 */
 }
 
 static unsigned long get_sample_period(void)
@@ -176,25 +208,79 @@ void touch_softlockup_watchdog_sync(void)
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 /* watchdog detector functions */
-static int is_hardlockup(void)
+static void update_hardstall(unsigned long stall, int this_cpu)
+{
+	int update_stall = 0;
+
+	if (stall > hardstall_thresh && stall > worst_hardstall) {
+		unsigned long flags;
+		spin_lock_irqsave(&hardstall_lock, flags);
+		update_stall = stall > worst_hardstall;
+		if (update_stall)
+			worst_hardstall = stall;
+		spin_unlock_irqrestore(&hardstall_lock, flags);
+	}
+
+	if (update_stall) {
+		printk(KERN_WARNING "LOCKUP may be in progress!"
+			"Worst hard stall seen on CPU#%d: %lu interrupts missed\n",
+			this_cpu, stall);
+		dump_stack();
+	}
+}
+
+static int is_hardlockup(int this_cpu)
 {
 	unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
+	unsigned long hrint_saved = __this_cpu_read(hrtimer_interrupts_saved);
+	unsigned long ints_missed = 0;
+
+	__this_cpu_write(hrtimer_interrupts_saved, hrint);
 
-	if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
+	if (hrint_saved == hrint)
+		ints_missed = per_cpu(hrtimer_interrupts_missed, this_cpu)++;
+	else
+		__this_cpu_write(hrtimer_interrupts_missed, 0);
+
+	if (ints_missed >= hardlockup_thresh)
 		return 1;
 
-	__this_cpu_write(hrtimer_interrupts_saved, hrint);
+	update_hardstall(ints_missed, this_cpu);
 	return 0;
 }
 #endif
 
-static int is_softlockup(unsigned long touch_ts)
+static void update_softstall(unsigned long stall, int this_cpu)
 {
-	unsigned long now = get_timestamp(smp_processor_id());
+	int update_stall = 0;
+	if (stall > get_softstall_thresh() &&
+			stall > worst_softstall + softstall_diff_thresh) {
+		unsigned long flags;
+		spin_lock_irqsave(&softstall_lock, flags);
+		update_stall = stall > worst_softstall + softstall_diff_thresh;
+		if (update_stall)
+			worst_softstall = stall;
+		spin_unlock_irqrestore(&softstall_lock, flags);
+	}
+
+	if (update_stall) {
+		printk(KERN_WARNING "LOCKUP may be in progress!"
+				"Worst soft stall seen on CPU#%d: %lums\n",
+				this_cpu, stall);
+		dump_stack();
+	}
+}
+
+static int is_softlockup(unsigned long touch_ts, int this_cpu)
+{
+	unsigned long now = get_timestamp(this_cpu);
+	unsigned long stall = now - touch_ts;
 
 	/* Warn about unreasonable delays: */
-	if (time_after(now, touch_ts + get_softlockup_thresh()))
-		return now - touch_ts;
+	if (time_after(now, touch_ts + 1000 * get_softlockup_thresh()))
+		return stall;
+
+	update_softstall(stall, this_cpu);
 
 	return 0;
 }
@@ -214,6 +300,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
 		 struct perf_sample_data *data,
 		 struct pt_regs *regs)
 {
+	int lockup;
+	int this_cpu = smp_processor_id();
+
 	/* Ensure the watchdog never gets throttled */
 	event->hw.interrupts = 0;
 
@@ -228,9 +317,8 @@ static void watchdog_overflow_callback(struct perf_event *event,
 	 * fired multiple times before we overflow'd.  If it hasn't
 	 * then this is a good indication the cpu is stuck
 	 */
-	if (is_hardlockup()) {
-		int this_cpu = smp_processor_id();
-
+	lockup = is_hardlockup(this_cpu);
+	if (lockup) {
 		/* only print hardlockups once */
 		if (__this_cpu_read(hard_watchdog_warn) == true)
 			return;
@@ -261,6 +349,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
 	struct pt_regs *regs = get_irq_regs();
 	int duration;
+	int this_cpu = smp_processor_id();
 
 	/* kick the hardlockup detector */
 	watchdog_interrupt_count();
@@ -290,14 +379,14 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	 * indicate it is getting cpu time.  If it hasn't then
 	 * this is a good indication some task is hogging the cpu
 	 */
-	duration = is_softlockup(touch_ts);
+	duration = is_softlockup(touch_ts, this_cpu);
 	if (unlikely(duration)) {
 		/* only warn once */
 		if (__this_cpu_read(soft_watchdog_warn) == true)
 			return HRTIMER_RESTART;
 
 		printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
-			smp_processor_id(), duration,
+			this_cpu, duration >> 10L,
 			current->comm, task_pid_nr(current));
 		print_modules();
 		print_irqtrace_events(current);
@@ -371,6 +460,7 @@ static int watchdog_nmi_enable(int cpu)
 
 	wd_attr = &wd_hw_attr;
 	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+	do_div(wd_attr->sample_period, hardlockup_thresh);
 
 	/* Try to register using hardware perf events */
 	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
-- 
1.7.3.1



* [PATCH v6 2/2] Output stall data in debugfs
  2011-08-10 18:02 [PATCH v6 1/2] Track hard and soft "short lockups" or "stalls." Alex Neronskiy
@ 2011-08-10 18:02 ` Alex Neronskiy
  2011-08-11 18:48   ` Andi Kleen
                     ` (3 more replies)
  2011-08-11 19:09 ` [PATCH v6 1/2] Track hard and soft "short lockups" or "stalls." Peter Zijlstra
  1 sibling, 4 replies; 21+ messages in thread
From: Alex Neronskiy @ 2011-08-10 18:02 UTC (permalink / raw)
  To: linux-kernel
  Cc: peterz, Ingo Molnar, Don Zickus, Mandeep Singh Baines,
	Alex Neronskiy, Alex Neronskiy

From: Alex Neronskiy <zakmagnus@chromium.com>

Instead of using the log, use debugfs for output of both stall
lengths and stack traces. Printing to the log can result in
watchdog touches, distorting the very events being measured.
Additionally, the information will not distract from lockups
when users view the log.

A two-buffer system is used to ensure that the trace information
can always be recorded without contention.

Signed-off-by: Alex Neronskiy <zakmagnus@chromium.org>
---
I added comments to help explain how the two-buffer system
works. Also, compared to the last version, I removed some
redundant locking and global variables.
I also changed the volatile *_read_ind variables to
instead have read memory barriers after loads. I want to
avoid the local copy of the variable just being treated
as an alias for the global variable, read at different
times and yielding different values. I'm not sure what the
correct way to ensure this is.
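One possibility might be the ACCESS_ONCE() idiom; a minimal sketch
(illustrative only, the helper below is made up and not in the patch):

static struct stack_trace *current_soft_trace(void)
{
	/* Take a single snapshot of the shared index so the compiler
	 * cannot re-read soft_read_ind later and hand back a different
	 * buffer than the one we lock.
	 */
	int idx = ACCESS_ONCE(soft_read_ind);

	return &softstall_traces[idx];
}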

 kernel/sysctl.c   |   16 ----
 kernel/watchdog.c |  225 +++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 200 insertions(+), 41 deletions(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a392952..b8620a2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -767,22 +767,6 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-	{
-		.procname       = "softstall_worst",
-		.data           = &worst_softstall,
-		.maxlen         = sizeof(unsigned long),
-		.mode           = 0644,
-		.proc_handler   = proc_doulongvec_minmax,
-	},
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-	{
-		.procname       = "hardstall_worst",
-		.data           = &worst_hardstall,
-		.maxlen         = sizeof(unsigned long),
-		.mode           = 0644,
-		.proc_handler   = proc_doulongvec_minmax,
-	},
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
 #endif /* CONFIG_LOCKUP_DETECTOR */
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 	{
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 8dd9ed4..b908935 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -23,14 +23,35 @@
 #include <linux/notifier.h>
 #include <linux/module.h>
 #include <linux/sysctl.h>
+#include <linux/stacktrace.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
 
 #include <asm/irq_regs.h>
 #include <linux/perf_event.h>
 
+#define STALL_SOFT (0)
+#define STALL_HARD (1)
+
+#define STALL_MAX_TRACE_DEPTH (50)
 int watchdog_enabled = 1;
 int __read_mostly watchdog_thresh = 10;
 unsigned long worst_softstall;
-static DEFINE_SPINLOCK(softstall_lock);
+
+/* For output to debugfs, there are two buffers, each with their own
+ * lock. Only one can ever be locked at a time. The buffers are written
+ * to in interrupt context, but read in a system call, so the writers
+ * can't wait for the readers. Thus, if a writer is excluded from using
+ * a buffer, it will instead switch to the other, uncontended one.
+ * After it's done, it will indicate that the two buffers have switched
+ * roles.
+ */
+static unsigned long softstall_trace_entries[STALL_MAX_TRACE_DEPTH * 2];
+static struct stack_trace softstall_traces[2];
+static spinlock_t softstall_locks[2];
+static DEFINE_SPINLOCK(softstall_read_lock);
+static DEFINE_SPINLOCK(softstall_write_lock);
+static int soft_read_ind;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
 static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -46,7 +67,12 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_missed);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
 
 unsigned long worst_hardstall;
-static DEFINE_SPINLOCK(hardstall_lock);
+static unsigned long hardstall_trace_entries[STALL_MAX_TRACE_DEPTH * 2];
+static struct stack_trace hardstall_traces[2];
+static spinlock_t hardstall_locks[2];
+static DEFINE_SPINLOCK(hardstall_read_lock);
+static DEFINE_SPINLOCK(hardstall_write_lock);
+static int hard_read_ind;
 
 /* The number of consecutive hard check failures before it's a lockup */
 #define hardlockup_thresh (5)
@@ -210,22 +236,27 @@ void touch_softlockup_watchdog_sync(void)
 /* watchdog detector functions */
 static void update_hardstall(unsigned long stall, int this_cpu)
 {
-	int update_stall = 0;
-
 	if (stall > hardstall_thresh && stall > worst_hardstall) {
 		unsigned long flags;
-		spin_lock_irqsave(&hardstall_lock, flags);
-		update_stall = stall > worst_hardstall;
-		if (update_stall)
+		spin_lock_irqsave(&hardstall_write_lock, flags);
+		if (stall > worst_hardstall) {
+			int write_ind = hard_read_ind;
+			int locked = spin_trylock(&hardstall_locks[write_ind]);
+			/* cannot wait, so if there's contention,
+			 * switch buffers */
+			if (!locked)
+				write_ind = !write_ind;
+
 			worst_hardstall = stall;
-		spin_unlock_irqrestore(&hardstall_lock, flags);
-	}
+			hardstall_traces[write_ind].nr_entries = 0;
+			save_stack_trace(&hardstall_traces[write_ind]);
 
-	if (update_stall) {
-		printk(KERN_WARNING "LOCKUP may be in progress!"
-			"Worst hard stall seen on CPU#%d: %lu interrupts missed\n",
-			this_cpu, stall);
-		dump_stack();
+			/* tell readers to use the new buffer from now on */
+			hard_read_ind = write_ind;
+			if (locked)
+				spin_unlock(&hardstall_locks[write_ind]);
+		}
+		spin_unlock_irqrestore(&hardstall_write_lock, flags);
 	}
 }
 
@@ -252,22 +283,28 @@ static int is_hardlockup(int this_cpu)
 
 static void update_softstall(unsigned long stall, int this_cpu)
 {
-	int update_stall = 0;
 	if (stall > get_softstall_thresh() &&
 			stall > worst_softstall + softstall_diff_thresh) {
 		unsigned long flags;
-		spin_lock_irqsave(&softstall_lock, flags);
-		update_stall = stall > worst_softstall + softstall_diff_thresh;
-		if (update_stall)
+		spin_lock_irqsave(&softstall_write_lock, flags);
+		if (stall > worst_softstall + softstall_diff_thresh) {
+			int write_ind = soft_read_ind;
+			int locked = spin_trylock(&softstall_locks[write_ind]);
+			/* cannot wait, so if there's contention,
+			 * switch buffers */
+			if (!locked)
+				write_ind = !write_ind;
+
 			worst_softstall = stall;
-		spin_unlock_irqrestore(&softstall_lock, flags);
-	}
+			softstall_traces[write_ind].nr_entries = 0;
+			save_stack_trace(&softstall_traces[write_ind]);
 
-	if (update_stall) {
-		printk(KERN_WARNING "LOCKUP may be in progress!"
-				"Worst soft stall seen on CPU#%d: %lums\n",
-				this_cpu, stall);
-		dump_stack();
+			/* tell readers to use the new buffer from now on */
+			soft_read_ind = write_ind;
+			if (locked)
+				spin_unlock(&softstall_locks[write_ind]);
+		}
+		spin_unlock_irqrestore(&softstall_write_lock, flags);
 	}
 }
 
@@ -667,6 +704,86 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
 	.notifier_call = cpu_callback
 };
 
+static int show_stall_trace(struct seq_file *f, void *v)
+{
+	int i, end, buf_ind;
+	int type = (int) f->private;
+	spinlock_t *lock;
+	struct stack_trace *trace;
+	unsigned long flags;
+
+	/* The buf_ind is saved locally because the global read_ind
+	 * variable can be changed at any time. Using a stale value
+	 * is okay but reading inconsistent values is not, as it can
+	 * result in locking one buffer and reading the other.
+	 * For this reason, reads are also serialized (via the
+	 * read_lock); otherwise two readers can lock both buffers
+	 * at once, and then a writer will not be able to write
+	 * without contention.
+	 */
+	if (type == STALL_SOFT) {
+		spin_lock_irqsave(&softstall_read_lock, flags);
+
+		buf_ind = soft_read_ind;
+		rmb();
+		lock = &softstall_locks[buf_ind];
+		trace = &softstall_traces[buf_ind];
+	} else {
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+		spin_lock_irqsave(&hardstall_read_lock, flags);
+
+		buf_ind = hard_read_ind;
+		rmb();
+		lock = &hardstall_locks[buf_ind];
+		trace = &hardstall_traces[buf_ind];
+#endif
+	}
+
+	end = trace->nr_entries;
+	/* don't bother printing the trailing value */
+	if (end < trace->max_entries)
+		end--;
+
+	spin_lock(lock); /* excludes writers */
+	for (i = 0; i < end; i++) {
+		seq_printf(f, "[<%pK>] %pS\n", (void *)trace->entries[i],
+				(void *)trace->entries[i]);
+	}
+	spin_unlock(lock);
+
+	if (type == STALL_SOFT)
+		spin_unlock_irqrestore(&softstall_read_lock, flags);
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+	else
+		spin_unlock_irqrestore(&hardstall_read_lock, flags);
+#endif
+
+	return 0;
+}
+
+static int softstall_trace_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_stall_trace, (void *)STALL_SOFT);
+}
+
+static int hardstall_trace_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_stall_trace, (void *)STALL_HARD);
+}
+
+static const struct file_operations softstall_trace_ops = {
+	.open = softstall_trace_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release
+};
+static const struct file_operations hardstall_trace_ops = {
+	.open = hardstall_trace_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release
+};
+
 void __init lockup_detector_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
@@ -678,5 +795,63 @@ void __init lockup_detector_init(void)
 	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
 	register_cpu_notifier(&cpu_nfb);
 
+	softstall_traces[0].nr_entries = 0;
+	softstall_traces[0].max_entries = STALL_MAX_TRACE_DEPTH;
+	softstall_traces[0].skip = 0;
+	softstall_traces[0].entries = softstall_trace_entries;
+	softstall_traces[1].nr_entries = 0;
+	softstall_traces[1].max_entries = STALL_MAX_TRACE_DEPTH;
+	softstall_traces[1].skip = 0;
+	softstall_traces[1].entries = softstall_trace_entries
+		+ STALL_MAX_TRACE_DEPTH;
+
+	spin_lock_init(&softstall_locks[0]);
+	spin_lock_init(&softstall_locks[1]);
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+	hardstall_traces[0].nr_entries = 0;
+	hardstall_traces[0].max_entries = STALL_MAX_TRACE_DEPTH;
+	hardstall_traces[0].skip = 0;
+	hardstall_traces[0].entries = hardstall_trace_entries;
+	hardstall_traces[1].nr_entries = 0;
+	hardstall_traces[1].max_entries = STALL_MAX_TRACE_DEPTH;
+	hardstall_traces[1].skip = 0;
+	hardstall_traces[1].entries = hardstall_trace_entries
+		+ STALL_MAX_TRACE_DEPTH;
+
+	spin_lock_init(&hardstall_locks[0]);
+	spin_lock_init(&hardstall_locks[1]);
+#endif
+
 	return;
 }
+
+/* Must be called after debugfs_init() */
+static int __init lockup_detector_debugfs_init(void)
+{
+	struct dentry *parent = debugfs_create_dir("lockup_watchdog", NULL);
+	if (parent) {
+		debugfs_create_file("softstall_trace", 0444, parent, NULL,
+				&softstall_trace_ops);
+		if (sizeof(worst_softstall) == sizeof(u32))
+			debugfs_create_u32("softstall_worst", 0644, parent,
+					(u32 *) &worst_softstall);
+		else if (sizeof(worst_softstall) == sizeof(u64))
+			debugfs_create_u64("softstall_worst", 0644, parent,
+					(u64 *) &worst_softstall);
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+		debugfs_create_file("hardstall_trace", 0444, parent, NULL,
+				&hardstall_trace_ops);
+		if (sizeof(worst_hardstall) == sizeof(u32))
+			debugfs_create_u32("hardstall_worst", 0644, parent,
+					(u32 *) &worst_hardstall);
+		else if (sizeof(worst_hardstall) == sizeof(u64))
+			debugfs_create_u64("hardstall_worst", 0644, parent,
+					(u64 *) &worst_hardstall);
+#endif
+	}
+
+	return 0;
+}
+postcore_initcall(lockup_detector_debugfs_init);
-- 
1.7.3.1



* Re: [PATCH v6 2/2] Output stall data in debugfs
  2011-08-10 18:02 ` [PATCH v6 2/2] Output stall data in debugfs Alex Neronskiy
@ 2011-08-11 18:48   ` Andi Kleen
  2011-08-11 19:04     ` Don Zickus
  2011-08-11 19:13   ` Peter Zijlstra
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 21+ messages in thread
From: Andi Kleen @ 2011-08-11 18:48 UTC (permalink / raw)
  To: Alex Neronskiy
  Cc: linux-kernel, peterz, Ingo Molnar, Don Zickus,
	Mandeep Singh Baines, Alex Neronskiy

Alex Neronskiy <zakmagnus@chromium.org> writes:

> From: Alex Neronskiy <zakmagnus@chromium.com>
>
> Instead of using the log, use debugfs for output of both stall
> lengths and stack traces. Printing to the log can result in
> watchdog touches, 

Why? Because of printk being slow or something else?

The first could probably be worked around, especially since you
already have "two buffers".

> distorting the very events being measured.
> Additionally, the information will not distract from lockups
> when users view the log.
>
> A two-buffer system is used to ensure that the trace information
> can always be recorded without contention.

This implies that kernel bug reports will often not contain the 
back trace, right? Seems like a bad thing to me because it will
make bug reports worse.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only


* Re: [PATCH v6 2/2] Output stall data in debugfs
  2011-08-11 18:48   ` Andi Kleen
@ 2011-08-11 19:04     ` Don Zickus
  2011-08-11 21:05       ` Andi Kleen
  0 siblings, 1 reply; 21+ messages in thread
From: Don Zickus @ 2011-08-11 19:04 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Alex Neronskiy, linux-kernel, peterz, Ingo Molnar,
	Mandeep Singh Baines, Alex Neronskiy

On Thu, Aug 11, 2011 at 11:48:26AM -0700, Andi Kleen wrote:
> Alex Neronskiy <zakmagnus@chromium.org> writes:
> 
> > From: Alex Neronskiy <zakmagnus@chromium.com>
> >
> > Instead of using the log, use debugfs for output of both stall
> > lengths and stack traces. Printing to the log can result in
> > watchdog touches, 
> 
> Why? Because of printk being slow or something else?

No, because the serial console driver does a touch_nmi_watchdog().  So if
we are trying to output debug info _before_ the lockup detector goes off,
we effectively shoot ourselves in the foot by resetting the lockup detector
every time we print something.  Hence we are trying to capture the data and
output it using another interface.

> 
> The first could be probably workarounded, especially if you
> already have "two buffers"
> 
> > distorting the very events being measured.
> > Additionally, the information will not distract from lockups
> > when users view the log.
> >
> > A two-buffer system is used to ensure that the trace information
> > can always be recorded without contention.
> 
> This implies that kernel bug reports will often not contain the 
> back trace, right? Seems like a bad thing to me because it will
> make bug reports worse.

These are debug traces.  The real lockup traces will still print to the
console as they do today.  Nothing will change from that perspective.

What these patches do is give you insight into what part of your system is
coming close but not causing lockups.  If we have the hardlockup detector
set to warn or panic after 5 seconds of no interrupts, then these patches
can give you backtraces after 3 or 4 seconds (the offending code might re-enable
interrupts after 4 seconds so no lockup occurs, but it may still be worth
noting).  It's just a way to gather heuristics on system behaviour
regarding lockups.

Cheers,
Don

> 
> -Andi
> 
> -- 
> ak@linux.intel.com -- Speaking for myself only


* Re: [PATCH v6 1/2] Track hard and soft "short lockups" or "stalls."
  2011-08-10 18:02 [PATCH v6 1/2] Track hard and soft "short lockups" or "stalls." Alex Neronskiy
  2011-08-10 18:02 ` [PATCH v6 2/2] Output stall data in debugfs Alex Neronskiy
@ 2011-08-11 19:09 ` Peter Zijlstra
  1 sibling, 0 replies; 21+ messages in thread
From: Peter Zijlstra @ 2011-08-11 19:09 UTC (permalink / raw)
  To: Alex Neronskiy
  Cc: linux-kernel, Ingo Molnar, Don Zickus, Mandeep Singh Baines,
	Alex Neronskiy

On Wed, 2011-08-10 at 11:02 -0700, Alex Neronskiy wrote:
> @@ -767,7 +767,23 @@ static struct ctl_table kern_table[] = {
>                 .extra1         = &zero,
>                 .extra2         = &one,
>         },
> -#endif
> +       {
> +               .procname       = "softstall_worst",
> +               .data           = &worst_softstall,
> +               .maxlen         = sizeof(unsigned long),
> +               .mode           = 0644,
> +               .proc_handler   = proc_doulongvec_minmax,
> +       },
> +#ifdef CONFIG_HARDLOCKUP_DETECTOR
> +       {
> +               .procname       = "hardstall_worst",
> +               .data           = &worst_hardstall,
> +               .maxlen         = sizeof(unsigned long),
> +               .mode           = 0644,
> +               .proc_handler   = proc_doulongvec_minmax,
> +       },
> +#endif /* CONFIG_HARDLOCKUP_DETECTOR */
> +#endif /* CONFIG_LOCKUP_DETECTOR */ 

And patch 2 does:


> +++ b/kernel/sysctl.c
> @@ -767,22 +767,6 @@ static struct ctl_table kern_table[] = {
>                 .extra1         = &zero,
>                 .extra2         = &one,
>         },
> -       {
> -               .procname       = "softstall_worst",
> -               .data           = &worst_softstall,
> -               .maxlen         = sizeof(unsigned long),
> -               .mode           = 0644,
> -               .proc_handler   = proc_doulongvec_minmax,
> -       },
> -#ifdef CONFIG_HARDLOCKUP_DETECTOR
> -       {
> -               .procname       = "hardstall_worst",
> -               .data           = &worst_hardstall,
> -               .maxlen         = sizeof(unsigned long),
> -               .mode           = 0644,
> -               .proc_handler   = proc_doulongvec_minmax,
> -       },
> -#endif /* CONFIG_HARDLOCKUP_DETECTOR */
>  #endif /* CONFIG_LOCKUP_DETECTOR */
> 

Makes one wonder wtf patch 1 exists for..


* Re: [PATCH v6 2/2] Output stall data in debugfs
  2011-08-10 18:02 ` [PATCH v6 2/2] Output stall data in debugfs Alex Neronskiy
  2011-08-11 18:48   ` Andi Kleen
@ 2011-08-11 19:13   ` Peter Zijlstra
  2011-08-11 19:35   ` Peter Zijlstra
  2011-08-11 19:43   ` Peter Zijlstra
  3 siblings, 0 replies; 21+ messages in thread
From: Peter Zijlstra @ 2011-08-11 19:13 UTC (permalink / raw)
  To: Alex Neronskiy
  Cc: linux-kernel, Ingo Molnar, Don Zickus, Mandeep Singh Baines,
	Alex Neronskiy

On Wed, 2011-08-10 at 11:02 -0700, Alex Neronskiy wrote:

> +static unsigned long softstall_trace_entries[STALL_MAX_TRACE_DEPTH * 2];
> +static struct stack_trace softstall_traces[2];
> +static spinlock_t softstall_locks[2];
> +static DEFINE_SPINLOCK(softstall_read_lock);
> +static DEFINE_SPINLOCK(softstall_write_lock);
> +static int soft_read_ind;
>  
>  static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
>  static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
> @@ -46,7 +67,12 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_missed);
>  static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
>  
>  unsigned long worst_hardstall;
> +static unsigned long hardstall_trace_entries[STALL_MAX_TRACE_DEPTH * 2];
> +static struct stack_trace hardstall_traces[2];
> +static spinlock_t hardstall_locks[2];
> +static DEFINE_SPINLOCK(hardstall_read_lock);
> +static DEFINE_SPINLOCK(hardstall_write_lock);
> +static int hard_read_ind; 

You really gotta love typing, but the rest of us know to use structures
for this.


* Re: [PATCH v6 2/2] Output stall data in debugfs
  2011-08-10 18:02 ` [PATCH v6 2/2] Output stall data in debugfs Alex Neronskiy
  2011-08-11 18:48   ` Andi Kleen
  2011-08-11 19:13   ` Peter Zijlstra
@ 2011-08-11 19:35   ` Peter Zijlstra
  2011-08-11 20:10     ` Alex Neronskiy
  2011-08-12  9:06     ` Ingo Molnar
  2011-08-11 19:43   ` Peter Zijlstra
  3 siblings, 2 replies; 21+ messages in thread
From: Peter Zijlstra @ 2011-08-11 19:35 UTC (permalink / raw)
  To: Alex Neronskiy
  Cc: linux-kernel, Ingo Molnar, Don Zickus, Mandeep Singh Baines,
	Alex Neronskiy

On Wed, 2011-08-10 at 11:02 -0700, Alex Neronskiy wrote:
> @@ -210,22 +236,27 @@ void touch_softlockup_watchdog_sync(void)
>  /* watchdog detector functions */
>  static void update_hardstall(unsigned long stall, int this_cpu)
>  {
>         if (stall > hardstall_thresh && stall > worst_hardstall) {
>                 unsigned long flags;
> +               spin_lock_irqsave(&hardstall_write_lock, flags);
> +               if (stall > worst_hardstall) {
> +                       int write_ind = hard_read_ind;
> +                       int locked = spin_trylock(&hardstall_locks[write_ind]);
> +                       /* cannot wait, so if there's contention,
> +                        * switch buffers */
> +                       if (!locked)
> +                               write_ind = !write_ind;
> +
>                         worst_hardstall = stall;
> +                       hardstall_traces[write_ind].nr_entries = 0;
> +                       save_stack_trace(&hardstall_traces[write_ind]);
>  
> +                       /* tell readers to use the new buffer from now on */
> +                       hard_read_ind = write_ind;
> +                       if (locked)
> +                               spin_unlock(&hardstall_locks[write_ind]);
> +               }
> +               spin_unlock_irqrestore(&hardstall_write_lock, flags);
>         }
>  } 

That must be the most convoluted locking I've seen in a while.. OMG!

What's wrong with something like:

static void update_stall(struct stall *s, unsigned long stall)
{
	if (stall <= s->worst)
		return;

again:
	if (!raw_spin_trylock(&s->lock[s->idx])) {
		s->idx ^= 1;
		goto again;
	}

	if (stall <= s->worst)
		goto unlock;

	s->worst = stall;
	s->trace[s->idx].nr_entries = 0;
	save_stack_trace(&s->trace[s->idx]);

unlock:
	raw_spin_unlock(&s->lock[s->idx]);
}


And have your read side do:


static void show_stall_trace(struct seq_file *f, void *v)
{
	struct stall *s = f->private;
	int i, idx = ACCESS_ONCE(s->idx);

	mutex_lock(&stall_mutex);

	raw_spin_lock(&s->lock[idx]);
	seq_printf(f, "stall: %d\n", s->worst);
	for (i = 0; i < s->trace[idx].nr_entries; i++) {
		seq_printf(f, "[<%pK>] %pS\n", 
			(void *)s->trace->entries[i],
			(void *)s->trace->entries[i]);
	}
	raw_spin_unlock(&s->lock[idx]);

	mutex_unlock(&stall_mutex);
}


Yes it's racy on s->worst, but who cares (if you do care you can keep a
copy in s->delay[idx] or so). Also, it might be better to not do the
spinlock but simply use an atomic bitop to set an in-use flag; there is
no reason to disable preemption over the seq_printf() loop.
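Something like this, say (rough sketch only, names invented):

static unsigned long stall_buf_busy;	/* one bit per stall buffer */

static bool stall_buf_tryget(int idx)
{
	/* Readers mark the buffer busy without disabling preemption;
	 * a writer that sees the bit set just switches to the other buffer.
	 */
	return !test_and_set_bit(idx, &stall_buf_busy);
}

static void stall_buf_put(int idx)
{
	clear_bit(idx, &stall_buf_busy);
}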




* Re: [PATCH v6 2/2] Output stall data in debugfs
  2011-08-10 18:02 ` [PATCH v6 2/2] Output stall data in debugfs Alex Neronskiy
                     ` (2 preceding siblings ...)
  2011-08-11 19:35   ` Peter Zijlstra
@ 2011-08-11 19:43   ` Peter Zijlstra
  2011-08-11 19:51     ` Alex Neronskiy
  3 siblings, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2011-08-11 19:43 UTC (permalink / raw)
  To: Alex Neronskiy
  Cc: linux-kernel, Ingo Molnar, Don Zickus, Mandeep Singh Baines,
	Alex Neronskiy

zakmagnus seems to bounce, so who am I talking to anyway..

On Thu, 2011-08-11 at 21:35 +0200, Peter Zijlstra wrote:
> 
> static void show_stall_trace(struct seq_file *f, void *v)
> {
>         struct stall *s = f->private;
>         int i, idx = ACCESS_ONCE(s->idx);
> 
>         mutex_lock(&stall_mutex);
> 
>         raw_spin_lock(&s->lock[idx]);
>         seq_printf(f, "stall: %d\n", s->worst);
>         for (i = 0; i < s->trace[idx].nr_entries; i++) {
>                 seq_printf(f, "[<%pK>] %pS\n", 
>                         (void *)s->trace->entries[i],
>                         (void *)s->trace->entries[i]);
>         }
>         raw_spin_unlock(&s->lock[idx]);
> 
>         mutex_unlock(&stall_mutex);
> }
> 
> 
> Yes its racy on s->worst, but who cares (if you do care you can keep a
> copy in s->delay[idx] or so). Also, it might be better to not do the
> spinlock but simply use an atomic bitop to set an in-use flag, there is
> no reason to disable preemption over the seq_printf() loop. 

That also cures another problem you have: a seq_file buffer is only 1
page large, so you should be using the seq_file iterator interface and print
one line at a time... now clearly that won't work with preemption
disabled either.




* Re: [PATCH v6 2/2] Output stall data in debugfs
  2011-08-11 19:43   ` Peter Zijlstra
@ 2011-08-11 19:51     ` Alex Neronskiy
  2011-08-11 19:57       ` Peter Zijlstra
  0 siblings, 1 reply; 21+ messages in thread
From: Alex Neronskiy @ 2011-08-11 19:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Ingo Molnar, Don Zickus, Mandeep Singh Baines,
	Alex Neronskiy

On Thu, Aug 11, 2011 at 12:43 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> zakmagnus seems to bounce, so who am I talking to anyway..
>
> On Thu, 2011-08-11 at 21:35 +0200, Peter Zijlstra wrote:
>>
>> static void show_stall_trace(struct seq_file *f, void *v)
>> {
>>         struct stall *s = f->private;
>>         int i, idx = ACCESS_ONCE(s->idx);
>>
>>         mutex_lock(&stall_mutex);
>>
>>         raw_spin_lock(&s->lock[idx]);
>>         seq_printf(f, "stall: %d\n", s->worst);
>>         for (i = 0; i < s->trace[idx].nr_entries; i++) {
>>                 seq_printf(f, "[<%pK>] %pS\n",
>>                         (void *)s->trace->entries[i],
>>                         (void *)s->trace->entries[i]);
>>         }
>>         raw_spin_unlock(&s->lock[idx]);
>>
>>         mutex_unlock(&stall_mutex);
>> }
>>
>>
>> Yes its racy on s->worst, but who cares (if you do care you can keep a
>> copy in s->delay[idx] or so). Also, it might be better to not do the
>> spinlock but simply use an atomic bitop to set an in-use flag, there is
>> no reason to disable preemption over the seq_printf() loop.
>
> That also cures another problem you have, a seq_file buffer is only 1
> page large, you should be using the seqfile iterator interface and print
> one line at a time.. now clearly that won't work with preemption
> disabled either.
I didn't want to hold on to a lock while the lines were iterated over,
and instead just dump it in one go. I guess it's not a big deal to do
that, though. Nonetheless, is one page not enough?


* Re: [PATCH v6 2/2] Output stall data in debugfs
  2011-08-11 19:51     ` Alex Neronskiy
@ 2011-08-11 19:57       ` Peter Zijlstra
  0 siblings, 0 replies; 21+ messages in thread
From: Peter Zijlstra @ 2011-08-11 19:57 UTC (permalink / raw)
  To: Alex Neronskiy
  Cc: linux-kernel, Ingo Molnar, Don Zickus, Mandeep Singh Baines,
	Alex Neronskiy

On Thu, 2011-08-11 at 12:51 -0700, Alex Neronskiy wrote:
> I didn't want to hold on to a lock while the lines were iterated over,
> and instead just dump it in one go. I guess it's not a big deal to do
> that, though. Nonetheless, is one page not enough? 

The "[<%pK>] %pS\n" that's 6bytes + 8bytes + ~64bytes making 78 bytes at
worst (assuming we have a symbol length max of 64 that is), that yields
4096/78 = 52, so yeah, just about enough since your trace depth is
limited at 50. But its close enough to make me feel uncomfortable.




* Re: [PATCH v6 2/2] Output stall data in debugfs
  2011-08-11 19:35   ` Peter Zijlstra
@ 2011-08-11 20:10     ` Alex Neronskiy
  2011-08-11 20:23       ` Peter Zijlstra
  2011-08-12  9:06     ` Ingo Molnar
  1 sibling, 1 reply; 21+ messages in thread
From: Alex Neronskiy @ 2011-08-11 20:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Ingo Molnar, Don Zickus, Mandeep Singh Baines

Please don't send email to zakmagnus@chromium.com. That does not
exist. The correct address is zakmagnus@chromium.org. I messed up my
own email address somewhere somehow.

On Thu, Aug 11, 2011 at 12:35 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, 2011-08-10 at 11:02 -0700, Alex Neronskiy wrote:
>> @@ -210,22 +236,27 @@ void touch_softlockup_watchdog_sync(void)
>>  /* watchdog detector functions */
>>  static void update_hardstall(unsigned long stall, int this_cpu)
>>  {
>>         if (stall > hardstall_thresh && stall > worst_hardstall) {
>>                 unsigned long flags;
>> +               spin_lock_irqsave(&hardstall_write_lock, flags);
>> +               if (stall > worst_hardstall) {
>> +                       int write_ind = hard_read_ind;
>> +                       int locked = spin_trylock(&hardstall_locks[write_ind]);
>> +                       /* cannot wait, so if there's contention,
>> +                        * switch buffers */
>> +                       if (!locked)
>> +                               write_ind = !write_ind;
>> +
>>                         worst_hardstall = stall;
>> +                       hardstall_traces[write_ind].nr_entries = 0;
>> +                       save_stack_trace(&hardstall_traces[write_ind]);
>>
>> +                       /* tell readers to use the new buffer from now on */
>> +                       hard_read_ind = write_ind;
>> +                       if (locked)
>> +                               spin_unlock(&hardstall_locks[write_ind]);
>> +               }
>> +               spin_unlock_irqrestore(&hardstall_write_lock, flags);
>>         }
>>  }
>
> That must be the most convoluted locking I've seen in a while.. OMG!
>
> What's wrong with something like:
>
> static void update_stall(struct stall *s, unsigned long stall)
> {
>        if (stall <= s->worst)
>                return;
>
> again:
>        if (!raw_spin_trylock(&s->lock[s->idx])) {
>                s->idx ^= 1;
>                goto again;
>        }
>
>        if (stall <= s->worst)
>                goto unlock;
>
>        s->worst = stall;
>        s->trace[s->idx].nr_entries = 0;
>        save_stack_trace(&s->trace[s->idx]);
>
> unlock:
>        raw_spin_unlock(&s->lock[s->idx]);
> }
>
>
> And have your read side do:
>
>
> static void show_stall_trace(struct seq_file *f, void *v)
> {
>        struct stall *s = f->private;
>        int i, idx = ACCESS_ONCE(s->idx);
>
>        mutex_lock(&stall_mutex);
>
>        raw_spin_lock(&s->lock[idx]);
>        seq_printf(f, "stall: %d\n", s->worst);
>        for (i = 0; i < s->trace[idx].nr_entries; i++) {
>                seq_printf(f, "[<%pK>] %pS\n",
>                        (void *)s->trace->entries[i],
>                        (void *)s->trace->entries[i]);
>        }
>        raw_spin_unlock(&s->lock[idx]);
>
>        mutex_unlock(&stall_mutex);
> }
>
>
> Yes its racy on s->worst, but who cares (if you do care you can keep a
> copy in s->delay[idx] or so). Also, it might be better to not do the
> spinlock but simply use an atomic bitop to set an in-use flag, there is
> no reason to disable preemption over the seq_printf() loop.
One change here is to use raw_spin functions. Okay, sure. Another is
to use a mutex instead of a spinlock among the readers. Makes a lot of
sense.

Another change is to allow concurrent writers. The readers are
serialized but the writers are concurrent; isn't that a strange
design? The way the "main" index is changed also looks problematic. A
writer will switch the index before anything useful is even known to
be in the buffer, and then a reader can go ahead and get that lock and
read something potentially very old and misleading. I don't think
that's okay.


* Re: [PATCH v6 2/2] Output stall data in debugfs
  2011-08-11 20:10     ` Alex Neronskiy
@ 2011-08-11 20:23       ` Peter Zijlstra
  2011-08-11 20:31         ` Alex Neronskiy
  0 siblings, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2011-08-11 20:23 UTC (permalink / raw)
  To: Alex Neronskiy
  Cc: linux-kernel, Ingo Molnar, Don Zickus, Mandeep Singh Baines

On Thu, 2011-08-11 at 13:10 -0700, Alex Neronskiy wrote:
> 
> Another change is to allow concurrent writers. The readers are
> serialized but the writers are concurrent; isn't that a strange
> design? The way the "main" index is changed also looks problematic. A
> writer will switch the index before anything useful is even known to
> be in the buffer, and then a reader can go ahead and get that lock and
> read something potentially very old and misleading. I don't think
> that's okay. 

Ah, right, yeah, stick another lock in there... One does worry about the
whole writer concurrency thing though; isn't it likely all cpus will
tickle the thing in quick succession? Putting a global lock in that path
isn't good... always think of the poor sod with the 4096 cpu machine.

Also, is all of this really useful? The hardlockup watchdog is useful
when you mess up bad, but other than that I've never found it to be
useful at all.

I mean, we're at the point where a PREEMPT=y kernel has a pretty decent
latency and the PREEMPT_RT kernels live at ~30us. So wth are you
measuring?


* Re: [PATCH v6 2/2] Output stall data in debugfs
  2011-08-11 20:23       ` Peter Zijlstra
@ 2011-08-11 20:31         ` Alex Neronskiy
  2011-08-11 21:17           ` Peter Zijlstra
  0 siblings, 1 reply; 21+ messages in thread
From: Alex Neronskiy @ 2011-08-11 20:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Ingo Molnar, Don Zickus, Mandeep Singh Baines

On Thu, Aug 11, 2011 at 1:23 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Thu, 2011-08-11 at 13:10 -0700, Alex Neronskiy wrote:
> Ah, right, yeah, stick another lock in there.. One does worry about the
> whole writer concurrency thing though, isn't it likely all cpus will
> tickle the thing in quick succession? Putting a global lock in that path
> isn't good,.. always think of the poor sod with the 4096 cpu machine.
Is it common to disable interrupts on all 4096 CPUs at once?

> Also, is all of this really useful? The hardlockup watchdog is useful
> when you mess up bad, but other than that I've never found it to be
> useful at all.
>
> I mean, we're at the point where a PREEMPT=y kernel has a pretty decent
> latency and the PREEMPT_RT kernels live at ~30us. So wth are you
> measuring?
Well, not all kernels have PREEMPT. Chromebook kernels don't, for example.


* Re: [PATCH v6 2/2] Output stall data in debugfs
  2011-08-11 19:04     ` Don Zickus
@ 2011-08-11 21:05       ` Andi Kleen
  0 siblings, 0 replies; 21+ messages in thread
From: Andi Kleen @ 2011-08-11 21:05 UTC (permalink / raw)
  To: Don Zickus
  Cc: Andi Kleen, Alex Neronskiy, linux-kernel, peterz, Ingo Molnar,
	Mandeep Singh Baines, Alex Neronskiy

> No because the serial console driver does a touch_nmi_watchdog().  So if
> we are trying to output debug info _before_ the lockup detector goes off,
> we effectively shoot ourselves in the foot by reseting the lockup detector
> everytime we print something.  Hence we are trying to capture the data and
> output using another interface.

Well surely that's fixable in the serial driver? Maybe set some global
on that CPU to disable it.

-Andi


* Re: [PATCH v6 2/2] Output stall data in debugfs
  2011-08-11 20:31         ` Alex Neronskiy
@ 2011-08-11 21:17           ` Peter Zijlstra
  2011-08-11 22:02             ` Alex Neronskiy
  2011-08-11 23:00             ` Mandeep Singh Baines
  0 siblings, 2 replies; 21+ messages in thread
From: Peter Zijlstra @ 2011-08-11 21:17 UTC (permalink / raw)
  To: Alex Neronskiy
  Cc: linux-kernel, Ingo Molnar, Don Zickus, Mandeep Singh Baines

On Thu, 2011-08-11 at 13:31 -0700, Alex Neronskiy wrote:
> 
> > I mean, we're at the point where a PREEMPT=y kernel has a pretty decent
> > latency and the PREEMPT_RT kernels live at ~30us. So wth are you
> > measuring?
> Well, not all kernels have PREEMPT. Chromebook kernels don't, for example.

Can one infer from that statement that the purpose is trying to measure
non-preempt latency? Why not use the tracer build for that purpose?

Still, the watchdog ticks at 1s intervals or so; anything that takes that
long, even on voluntary preemption kernels, is quite insane.




* Re: [PATCH v6 2/2] Output stall data in debugfs
  2011-08-11 21:17           ` Peter Zijlstra
@ 2011-08-11 22:02             ` Alex Neronskiy
  2011-08-11 23:00             ` Mandeep Singh Baines
  1 sibling, 0 replies; 21+ messages in thread
From: Alex Neronskiy @ 2011-08-11 22:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Ingo Molnar, Don Zickus, Mandeep Singh Baines

On Thu, Aug 11, 2011 at 2:17 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Thu, 2011-08-11 at 13:31 -0700, Alex Neronskiy wrote:
>>
>> > I mean, we're at the point where a PREEMPT=y kernel has a pretty decent
>> > latency and the PREEMPT_RT kernels live at ~30us. So wth are you
>> > measuring?
>> Well, not all kernels have PREEMPT. Chromebook kernels don't, for example.
>
> Can one infer from that statement that the purpose is trying to measure
> non preempt latency? Why not use the tracer build for that purpose?
>
> Still the watchdog ticks at 1s intervals or so, anything that takes that
> long, even on voluntary preemption kernels is quite insane.
I think Don grasped the sentiment well about a month ago when he
commented on the first version of the patch, so I'll just resend his
words:
On Wed, Jul 13, 2011 at 6:43 AM, Don Zickus <dzickus@redhat.com> wrote:
> I like the idea of this patch.  It definitely helps with better statistics
> when trying to set the appropriate values for a soft/hard lockup as
> opposed to pulling numbers from the air.


* Re: [PATCH v6 2/2] Output stall data in debugfs
  2011-08-11 21:17           ` Peter Zijlstra
  2011-08-11 22:02             ` Alex Neronskiy
@ 2011-08-11 23:00             ` Mandeep Singh Baines
  2011-08-12 12:14               ` Peter Zijlstra
  1 sibling, 1 reply; 21+ messages in thread
From: Mandeep Singh Baines @ 2011-08-11 23:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Alex Neronskiy, linux-kernel, Ingo Molnar, Don Zickus,
	Mandeep Singh Baines

Hi Peter,

Peter Zijlstra (peterz@infradead.org) wrote:
> On Thu, 2011-08-11 at 13:31 -0700, Alex Neronskiy wrote:
> > 
> > > I mean, we're at the point where a PREEMPT=y kernel has a pretty decent
> > > latency and the PREEMPT_RT kernels live at ~30us. So wth are you
> > > measuring?

In an earlier patch in the series, Alex was looking into using the timer
for getting stack traces but using a different time source (TSC, jiffies) for
measuring the worst latency seen so far.

Since you know when the watchdog should have run, you can measure the
difference between when it did run and when it should have. This would allow
you to measure latency down to the ms or lower.
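Roughly, the idea would be something like this (just a sketch to
illustrate, not code from the patch; the names are made up):

static DEFINE_PER_CPU(u64, expected_fire_ns);

static void note_watchdog_latency(u64 period_ns)
{
	u64 now = local_clock();
	u64 expected = __this_cpu_read(expected_fire_ns);

	/* How far past its deadline did the timer actually run? */
	if (expected && now > expected)
		pr_debug("watchdog ran %llu ns late\n",
			 (unsigned long long)(now - expected));

	__this_cpu_write(expected_fire_ns, now + period_ns);
}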

> > Well, not all kernels have PREEMPT. Chromebook kernels don't, for example.
> 
> Can one infer from that statement that the purpose is trying to measure
> non preempt latency? ...
> 

Our "current" plan is to see if we can get away with PREEMPT_VOLUNTARY
and then fix any issues that we find by adding pre-emption points and
finer grain locking (to enable pre-emption points) where needed.
We were hoping to use this patch to find those issues.

Right now we have softlockup_thresh set to 10 seconds. But we have no idea
whether this is too high or not. Ideally, you'd want to set the threshold
as low as possible but not so low that you start panicking the system
on temporary stalls.

Let's say the worst stall that ever happens on our system is < 1s and
any stall that is > 1s is really a lockup. With this patch, we could
say that with confidence and push the threshold down from 10 to 2.
The quicker we can detect a lockup, the better. We minimize downtime
and get the machine back up in no time (our boot is only 8 seconds).

But I suspect that there are stalls. In Alex's first run with the patch,
he saw a few lockups which we are now investigating. They were mostly
in the suspend/resume and boot/shutdown paths. One I'm confident
is a bug and we'll send a patch upstream shortly.

> ... Why not use the tracer build for that purpose?
> 

PREEMPT_TRACER is awesome for the lab or development machines but
it adds too much overhead to use in production. It adds a pretty big
overhead every time you context switch. With PREEMPT, the overhead
would be even more.

We're not 100% sure we shouldn't use PREEMPT instead of PREEMPT_VOLUNTARY.
If we had the data from this patch, we might see that latency is too
high and we might have to consider PREEMPT instead.

> Still the watchdog ticks at 1s intervals or so, anything that takes that
> long, even on voluntary preemption kernels is quite insane.
> 
> 

The timer period is 1/5 the period of the threshold. The minimum threshold
is 1s. So if we set the threshold to 1s, we could get a stack trace
for a 200 ms stall. That would rock.

We would love to set the threshold to 1s and know that we aren't seeing any
stalls above 200ms. That would give us at least 800 ms of breathing room.
But right now, we have no idea how much breathing room there is.

Regards,
Mandeep


* Re: [PATCH v6 2/2] Output stall data in debugfs
  2011-08-11 19:35   ` Peter Zijlstra
  2011-08-11 20:10     ` Alex Neronskiy
@ 2011-08-12  9:06     ` Ingo Molnar
  2011-08-12 19:46       ` Alex Neronskiy
  1 sibling, 1 reply; 21+ messages in thread
From: Ingo Molnar @ 2011-08-12  9:06 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Alex Neronskiy, linux-kernel, Don Zickus, Mandeep Singh Baines,
	Alex Neronskiy


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Wed, 2011-08-10 at 11:02 -0700, Alex Neronskiy wrote:
> > @@ -210,22 +236,27 @@ void touch_softlockup_watchdog_sync(void)
> >  /* watchdog detector functions */
> >  static void update_hardstall(unsigned long stall, int this_cpu)
> >  {
> >         if (stall > hardstall_thresh && stall > worst_hardstall) {
> >                 unsigned long flags;
> > +               spin_lock_irqsave(&hardstall_write_lock, flags);
> > +               if (stall > worst_hardstall) {
> > +                       int write_ind = hard_read_ind;
> > +                       int locked = spin_trylock(&hardstall_locks[write_ind]);
> > +                       /* cannot wait, so if there's contention,
> > +                        * switch buffers */
> > +                       if (!locked)
> > +                               write_ind = !write_ind;
> > +
> >                         worst_hardstall = stall;
> > +                       hardstall_traces[write_ind].nr_entries = 0;
> > +                       save_stack_trace(&hardstall_traces[write_ind]);
> >  
> > +                       /* tell readers to use the new buffer from now on */
> > +                       hard_read_ind = write_ind;
> > +                       if (locked)
> > +                               spin_unlock(&hardstall_locks[write_ind]);
> > +               }
> > +               spin_unlock_irqrestore(&hardstall_write_lock, flags);
> >         }
> >  } 
> 
> That must be the most convoluted locking I've seen in a while.. OMG!

Well, but there are conceptual problems at the higher levels: the 
concept of recording a worst-case (or best-case) latency is not 
limited to the comparatively minor use case of soft-watchdog stalls.

We have numerous tracers in ftrace that output their own kinds of 
min/max latencies, with associated stack trace signatures.

So the right approach would *not* be to add yet another 
special-purpose debugfs variant for this, but to integrate this 
capability into perf tracing. That way it would be useful for:

 - soft stalls
 - irq service latencies
 - irq disable latencies
 - preempt disable latencies
 - wakeup latencies
 - and much more: it could be used for just about any event that 
   measures some sort of latency.

To implement it I'd first suggest adding a TRACE_EVENT() for the 
softwatchdog latencies, and then look at how a stack-trace attached 
to the worst-case latency could be emitted via the perf ring-buffer.

We do something very, very similar for callchains already, so all the 
low level machinery is already there.
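E.g. a tracepoint along these lines (rough sketch only, the event name
and fields are just an illustration):

TRACE_EVENT(softstall,

	TP_PROTO(int cpu, unsigned long delay_ms),

	TP_ARGS(cpu, delay_ms),

	TP_STRUCT__entry(
		__field(int,		cpu)
		__field(unsigned long,	delay_ms)
	),

	TP_fast_assign(
		__entry->cpu = cpu;
		__entry->delay_ms = delay_ms;
	),

	TP_printk("cpu=%d delay=%lu ms", __entry->cpu, __entry->delay_ms)
);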

Alex, would you be interested in taking a stab at this approach? Such 
an approach looks a *lot* more palatable from an upstream merge point 
of view and it would give you all the functionality that the current 
patches are providing you (and more).

Thanks,

	Ingo


* Re: [PATCH v6 2/2] Output stall data in debugfs
  2011-08-11 23:00             ` Mandeep Singh Baines
@ 2011-08-12 12:14               ` Peter Zijlstra
  2011-08-16 19:32                 ` Mandeep Singh Baines
  0 siblings, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2011-08-12 12:14 UTC (permalink / raw)
  To: Mandeep Singh Baines
  Cc: Alex Neronskiy, linux-kernel, Ingo Molnar, Don Zickus

On Thu, 2011-08-11 at 16:00 -0700, Mandeep Singh Baines wrote:
> Hi Peter,
> 
> Peter Zijlstra (peterz@infradead.org) wrote:
> > On Thu, 2011-08-11 at 13:31 -0700, Alex Neronskiy wrote:
> > > 
> > > > I mean, we're at the point where a PREEMPT=y kernel has a pretty decent
> > > > latency and the PREEMPT_RT kernels live at ~30us. So wth are you
> > > > measuring?
> 
> In an earlier patch in the series, Alex was looking into using the timer
> for getting stack traces but using a different time source (TSC, jiffies) for
> measuring the worst latency seen so far.
> 
> Since you know when the watchdog should have run, you can measure the
> difference between when it did run and when it should have. This would allow
> you to be able to measure latency down to the ms or lower.

But that's an arse backwards way of measuring latency. And we already
have an implementation that does it the right way around; it's the
preempt-off and irq-off latency tracer in ftrace.

They instrument the preempt/irq disable/enable sites and simply trace
everything in between, getting you an exact measurement and overview of
wth the kernel thought it was doing during that time.

That is much much more useful than a random stacktrace at some point.

In fact it is the tool the -rt kernel uses to find and break these
latencies.

> > > Well, not all kernels have PREEMPT. Chromebook kernels don't, for example.
> > 
> > Can one infer from that statement that the purpose is trying to measure
> > non preempt latency? ...
> > 
> 
> Our "current" plan is to see if we can get away with PREEMPT_VOLUNTARY
> and then fix any issues that we find by adding pre-emption points and
> finer grain locking (to enable pre-emption points) where needed.
> We were hoping to use this patch to find those issues.

Then you're doing it wrong. Use the ftrace infrastructure that's in
place for this purpose, the -rt kernel has been using that for many many
years to do exactly that.

> Right now we have softlockup_thresh set to 10 seconds. But we have no idea
> whether this is too high or not. Ideally, you'd want to set the threshold
> as low as possible, but not so low that you start panicking the system
> on temporary stalls.

I can't remember the last time the softlockup thing said anything
useful. Just disable the thing; it's useless.

> Let's say the worst stall that ever happens on our system is < 1s and
> any stall that is > 1s is really a lockup. With this patch, we could
> say that with confidence and push the threshold down from 10 to 2.
> The quicker we can detect a lockup, the better. We minimize downtime
> and get the machine up in no time (our boot is only 8 seconds).

/me fails to grasp the relation between stalls and boot time. Esp
softlockup crap.

> But I suspect that there are stalls. In Alex's first run with the patch,
> he saw a few lockups which we are now investigating. They were mostly
> in the suspend/resume and boot/shutdown paths. One I'm confident
> is a bug and we'll send a patch upstream shortly.

A stall is mostly people forgetting to tickle the watchdog timer; I'm
very sure these paths actually work (the moment they don't, people start
to complain rather loudly).

Now optimizing those paths is a whole different game, you want a trace
and profile for that again, seeing if it does something particularly
stupid (trace) and making the more expensive things less so (profile).

> > ... Why not use the tracer build for that purpose?
> > 
> 
> PREEMPT_TRACER is awesome for the lab or development machines but
> it adds too much overhead to use in production. It adds a pretty big
> overhead every time you context switch. With PREEMPT, the overhead
> would be even higher.

I don't think context switches are the problem, but simply that you
effectively run with the function tracer enabled.

Also, isn't chromium basically a very limited (aka useless) environment?
You should be able to get a fairly good coverage of chrome (the single
app on there afaik) in pre-production testing.

> The timer period is 1/5 the period of the threshold. The minimum threshold
> is 1s. So if we set the threshold to 1s, we could get a stack trace
> for a 200 ms stall. That would rock.

I'm still not sure of the value of stack traces for latencies; they simply
give a random snapshot of somewhere in the path (at best, you can hit
after the path with the watchdog approach).

> We would love to set the threshold to 1s and know that we aren't seeing any
> stalls above 200ms. That would give us at least 800 ms of breathing room.
> But right now, we have no idea how much breathing room there is.

OK, but why not run cyclictest? That is the timer sampling latency
detection tool of choice, another one of these -rt things we've been
using for donkey-years. Cyclictest can even run ftrace for you if you
want.



* Re: [PATCH v6 2/2] Output stall data in debugfs
  2011-08-12  9:06     ` Ingo Molnar
@ 2011-08-12 19:46       ` Alex Neronskiy
  0 siblings, 0 replies; 21+ messages in thread
From: Alex Neronskiy @ 2011-08-12 19:46 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, linux-kernel, Don Zickus, Mandeep Singh Baines

On Fri, Aug 12, 2011 at 2:06 AM, Ingo Molnar <mingo@elte.hu> wrote:
> Well, but there are conceptual problems at the higher levels: the
> concept of recording a worst-case (or best-case) latency is not
> limited to the comparatively minor usecase of soft-watchdog stalls.
>
> We have numerous tracers in ftrace that output their own kinds of
> min/max latencies, with associated stack trace signatures.
>
> So the right approach would *not* be to add yet another
> special-purpose debugfs variant for this, but to integrate this
> capability into perf tracing. That way it would be useful for:
>
>  - soft stalls
>  - irq service latencies
>  - irq disable latencies
>  - preempt disable latencies
>  - wakeup latencies
>  - and much more: it could be used for just about any event that
>   measures some sort of latency.
>
> To implement it i'd first suggest to add a TRACE_EVENT() for the
> softwatchdog latencies, and then look at how a stack-trace attached
> to the worst-case latency could be emitted via the perf ring-buffer.
>
> We do something very, very similar for callchains already, so all the
> low level machinery is already there.
>
> Alex, would you be interested in taking a stab at this approach? Such
> an approach looks a *lot* more palatable from an upstream merge point
> of view and it would give you all the functionality that the current
> patches are providing you (and more).
>
> Thanks,
>
>        Ingo

The best data comes from real-world use, so we want this used in
production. We felt that ftrace would add excessive overhead and bog
down performance, whereas this random sampling approach is basically
free, especially considering it's piggybacking on watchdogs which are
already running anyway. Is our approach misguided? Is this not as
cheap as we think? Is tracing not as expensive as we think?


* Re: [PATCH v6 2/2] Output stall data in debugfs
  2011-08-12 12:14               ` Peter Zijlstra
@ 2011-08-16 19:32                 ` Mandeep Singh Baines
  0 siblings, 0 replies; 21+ messages in thread
From: Mandeep Singh Baines @ 2011-08-16 19:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mandeep Singh Baines, Alex Neronskiy, linux-kernel, Ingo Molnar,
	Don Zickus

Peter Zijlstra (peterz@infradead.org) wrote:
> On Thu, 2011-08-11 at 16:00 -0700, Mandeep Singh Baines wrote:
> > Hi Peter,
> > 
> > Peter Zijlstra (peterz@infradead.org) wrote:
> > > On Thu, 2011-08-11 at 13:31 -0700, Alex Neronskiy wrote:
> > > > 
> > > > > I mean, we're at the point where a PREEMPT=y kernel has a pretty decent
> > > > > latency and the PREEMPT_RT kernels live at ~30us. So wth are you
> > > > > measuring?
> > 
> > In an earlier patch in the series, Alex was looking into using the timer
> > for getting stack traces but using a different time source (TSC, jiffies) for
> > measuring the worst latency seen so far.
> > 
> > Since you know when the watchdog should have run, you can measure the
> > difference between when it did run and when it should have. This would let
> > you measure latency down to the millisecond or lower.
> 
> But that's an arse backwards way of measuring latency. And we already
> have an implementation that does it the right way around: it's the
> preempt-off and irq-off latency tracer in ftrace.
> 

Agree. It's not perfect, but random sampling is super cheap and can be
turned on in production. It's analogous to PMU counters. They don't count
every single instruction like gprof does but given enough samples you
get a pretty good picture of what's going on.

> They instrument the preempt/irq disable/enable sites and simply trace
> everything in between, getting you an exact measurement and overview of
> wth the kernel thought it was doing during that time.
> 
> That is much much more useful than a random stacktrace at some point.
> 

I have found the softlockup stack traces quite useful. They normally
point you right inside the loop where you're stuck.

> In fact it is the tool the -rt kernel uses to find and break these
> latencies.
> 
> > > > Well, not all kernels have PREEMPT. Chromebook kernels don't, for example.
> > > 
> > > Can one infer from that statement that the purpose is trying to measure
> > > non preempt latency? ...
> > > 
> > 
> > Our "current" plan is to see if we can get away with PREEMPT_VOLUNTARY
> > and then fix any issues that we find by adding pre-emption points and
> > finer grain locking (to enable pre-emption points) where needed.
> > We were hoping to use this patch to find those issues.
> 
> Then you're doing it wrong. Use the ftrace infrastructure that's in
> place for this purpose, the -rt kernel has been using that for many many
> years to do exactly that.
> 

The ftrace infrastructure rocks. We can find a lot of issues with it on
our test devices. But crazy stuff happens in the field and I get bug
reports I never would have imagined. If I could run preempt tracing in
the field I would, but it just seems like too much overhead. Every time
the preempt counter goes to or leaves zero there is additional overhead,
and I suspect that happens thousands of times a second.
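
For reference, the overhead pattern being described is roughly the
following (a simplified sketch, not the mainline implementation; the
"demo_" names are made up):

#include <linux/percpu.h>
#include <linux/ftrace.h>

static DEFINE_PER_CPU(int, demo_preempt_count);	/* hypothetical counter */

static void demo_preempt_disable(void)
{
	/* counter leaves zero: tell the tracer a non-preemptible section began */
	if (this_cpu_inc_return(demo_preempt_count) == 1)
		trace_preempt_off(CALLER_ADDR0, CALLER_ADDR1);
}

static void demo_preempt_enable(void)
{
	/* counter returns to zero: tell the tracer the section ended */
	if (this_cpu_dec_return(demo_preempt_count) == 0)
		trace_preempt_on(CALLER_ADDR0, CALLER_ADDR1);
}

With the tracer enabled, every one of those zero <-> nonzero transitions
pays for the extra call, which is what makes it expensive in production.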

> > Right now we have softlockup_thresh set to 10 seconds. But we have no idea
> > whether this is too high or not. Ideally, you'd want to set the threshold
> > as low as possible, but not so low that you start panicking the system
> > on temporary stalls.
> 
> I can't remember the last time the softlockup thing said anything
> useful. Just disable the thing; it's useless.
> 

http://www.google.com/search?q=site%3Alkml.org+%22Stuck+for%22

A lot of these seem like legitimate bugs.

> > Let's say the worst stall that ever happens on our system is < 1s and
> > any stall that is > 1s is really a lockup. With this patch, we could
> > say that with confidence and push the threshold down from 10 to 2.
> > The quicker we can detect a lockup, the better. We minimize downtime
> > and get the machine up in no time (our boot is only 8 seconds).
> 
> /me fails to grasp the relation between stalls and boot time. Esp
> softlockup crap.
> 

Currently, we have the watchdog set to 5 seconds and enable panic on a
lockup. If we're stuck for 5 seconds, the machine is probably wedged. Boot
time is currently 8 seconds, so the time from a lockup to when the system
is usable again is 13 seconds. Crashes should be extremely rare, but
when they do happen, the shorter they are, the less upset the user is.

A nice testing benefit is that we can test over 200 crashes an hour.

> > But I suspect that there are stalls. In Alex's first run with the patch,
> > he saw a few lockups which we are now investigating. They were mostly
> > in the suspend/resume and boot/shutdown paths. One I'm confident
> > is a bug and we'll send a patch upstream shortly.
> 
> A stall is mostly people forgetting to tickle the watchdog timer; I'm
> very sure these paths actually work (the moment they don't, people start
> to complain rather loudly).
> 
> Now optimizing those paths is a whole different game, you want a trace
> and profile for that again, seeing if it does something particularly
> stupid (trace) and making the more expensive things less so (profile).
> 

Agree. But you won't profile unless you know there's a stall.

I've definitely come across issues where there were stalls we didn't
catch in our testing. Admittedly our testing was lacking. I remember
one stall where the handler for /proc/net/tcp would disable bottom
halves, iterate over the entire hash table and then re-enable
bottom halves. Imagine if you have a machine with 10000s of connections.
That was eventually fixed by using finer-grained locking.
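
The shape of that stall was roughly the following (hypothetical names,
not the actual net/ipv4 code): bottom halves stay disabled across the
whole hash-table walk, so with tens of thousands of connections the
softirqs are starved for the entire dump.

#include <linux/bottom_half.h>
#include <linux/seq_file.h>

#define DEMO_TABLE_SIZE	65536				/* hypothetical bucket count */

static void demo_dump_bucket(struct seq_file *seq, int bucket);	/* hypothetical */

static void demo_dump_all_connections(struct seq_file *seq)
{
	int i;

	local_bh_disable();
	for (i = 0; i < DEMO_TABLE_SIZE; i++)
		demo_dump_bucket(seq, i);	/* can take a long time */
	local_bh_enable();
}

The finer-grained fix amounts to disabling bottom halves (or taking a
lock) only around each bucket and re-enabling them in between, so
softirqs get a chance to run during the walk.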

Another issue I remember was where the route cache would get flushed
every few seconds, resulting in 10000s of call_rcu()s, which in turn
resulted in the RCU softirq running for a few seconds processing those requests.

> > > ... Why not use the tracer build for that purpose?
> > > 
> > 
> > PREEMPT_TRACER is awesome for the lab or development machines but
> > it adds too much overhead to use in production. It adds a pretty big
> > overhead every time you context switch. With PREEMPT, the overhead
> > would be even higher.
> 
> I don't think context switches are the problem, but simply that you
> effectively run with the function tracer enabled.
> 
> Also, isn't chromium basically a very limited (aka useless) environment?
> You should be able to get a fairly good coverage of chrome (the single
> app on there afaik) in pre-production testing.
> 

Yes, our software environment is limited, but in some ways it's
more complex than a server environment. We have to deal with video,
audio, WiFi, 3G, power, USB, etc. A lot of these drivers are just
out of staging and not all that mature. Whereas for a server the
I/O is limited to requests over a network, we need to handle
key presses, trackpad, lid open/close, suspend/resume, light sensors,
etc. I'm not saying it's more complex, but a consumer device has
its own set of unique complexities that make it hard to test the
entire state space.

> > The timer period is 1/5 the period of the threshold. The minimum threshold
> > is 1s. So if we set the threshold to 1s, we could get a stack trace
> > for a 200 ms stall. That would rock.
> 
> I'm still not sure of the value of stack traces for latencies; they simply
> give a random snapshot of somewhere in the path (at best, you can hit
> after the path with the watchdog approach).
> 

The softlockup stack trace is saved by the timer interrupt, which preempts
the locked-up code. So you see exactly where you're locked up.

> > We would love to set the threshold to 1s and know that we aren't seeing any
> > stalls above 200ms. That would give us at least 800 ms of breathing room.
> > But right now, we have no idea how much breathing room there is.
> 
> OK, but why not run cyclictest? That is the timer sampling latency
> detection tool of choice, another one of these -rt things we've been
> using for donkey-years. Cyclictest can even run ftrace for you if you
> want.
> 

Thanks much for this suggestion. cyclictest looks very cool. I'm going to
add it to our testing. I'll poke around rt.wiki.kernel.org for more good
stuff.

Regards,
Mandeep


