* [PATCH 7/7] sparc64: Add function graph tracer support.
From: David Miller @ 2010-04-13  6:43 UTC (permalink / raw)
  To: sparclinux


Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/Kconfig              |    2 +
 arch/sparc/kernel/Makefile      |    1 +
 arch/sparc/kernel/ftrace.c      |   58 ++++++++++++++++++++++++++++++++++++
 arch/sparc/kernel/irq_64.c      |    3 +-
 arch/sparc/kernel/kgdb_64.c     |    3 +-
 arch/sparc/kernel/pcr.c         |    3 +-
 arch/sparc/kernel/smp_64.c      |   11 ++++---
 arch/sparc/kernel/time_64.c     |    3 +-
 arch/sparc/kernel/vmlinux.lds.S |    1 +
 arch/sparc/lib/mcount.S         |   62 +++++++++++++++++++++++++++++++++++----
 10 files changed, 132 insertions(+), 15 deletions(-)

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 035304c..9908d47 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -37,6 +37,8 @@ config SPARC64
 	def_bool 64BIT
 	select ARCH_SUPPORTS_MSI
 	select HAVE_FUNCTION_TRACER
+	select HAVE_FUNCTION_GRAPH_TRACER
+	select HAVE_FUNCTION_GRAPH_FP_TEST
 	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
 	select HAVE_KRETPROBES
 	select HAVE_KPROBES
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index 1b35ed6..0c2dc1f 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -93,6 +93,7 @@ obj-$(CONFIG_KGDB)        += kgdb_$(BITS).o
 
 
 obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 
 obj-$(CONFIG_EARLYFB) += btext.o
 obj-$(CONFIG_STACKTRACE)     += stacktrace.o
diff --git a/arch/sparc/kernel/ftrace.c b/arch/sparc/kernel/ftrace.c
index 2ea6e4f..03ab022 100644
--- a/arch/sparc/kernel/ftrace.c
+++ b/arch/sparc/kernel/ftrace.c
@@ -91,3 +91,61 @@ int __init ftrace_dyn_arch_init(void *data)
 	return 0;
 }
 #endif
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern void ftrace_graph_call(void);
+
+int ftrace_enable_ftrace_graph_caller(void)
+{
+	unsigned long ip = (unsigned long)(&ftrace_graph_call);
+	u32 old, new;
+
+	old = *(u32 *) &ftrace_graph_call;
+	new = ftrace_call_replace(ip, (unsigned long) &ftrace_graph_caller);
+	return ftrace_modify_code(ip, old, new);
+}
+
+int ftrace_disable_ftrace_graph_caller(void)
+{
+	unsigned long ip = (unsigned long)(&ftrace_graph_call);
+	u32 old, new;
+
+	old = *(u32 *) &ftrace_graph_call;
+	new = ftrace_call_replace(ip, (unsigned long) &ftrace_stub);
+
+	return ftrace_modify_code(ip, old, new);
+}
+
+#endif /* !CONFIG_DYNAMIC_FTRACE */
+
+/*
+ * Hook the return address and push it in the stack of return addrs
+ * in current thread info.
+ */
+unsigned long prepare_ftrace_return(unsigned long parent,
+				    unsigned long self_addr,
+				    unsigned long frame_pointer)
+{
+	unsigned long return_hooker = (unsigned long) &return_to_handler;
+	struct ftrace_graph_ent trace;
+
+	if (unlikely(atomic_read(&current->tracing_graph_pause)))
+		return parent + 8UL;
+
+	if (ftrace_push_return_trace(parent, self_addr, &trace.depth,
+				     frame_pointer) == -EBUSY)
+		return parent + 8UL;
+
+	trace.func = self_addr;
+
+	/* Only trace if the calling function expects to */
+	if (!ftrace_graph_entry(&trace)) {
+		current->curr_ret_stack--;
+		return parent + 8UL;
+	}
+
+	return return_hooker;
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index e1cbdb9..af5c76c 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -20,6 +20,7 @@
 #include <linux/delay.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/ftrace.h>
 #include <linux/irq.h>
 
 #include <asm/ptrace.h>
@@ -721,7 +722,7 @@ static __attribute__((always_inline)) void restore_hardirq_stack(void *orig_sp)
 	__asm__ __volatile__("mov %0, %%sp" : : "r" (orig_sp));
 }
 
-void handler_irq(int irq, struct pt_regs *regs)
+void __irq_entry handler_irq(int irq, struct pt_regs *regs)
 {
 	unsigned long pstate, bucket_pa;
 	struct pt_regs *old_regs;
diff --git a/arch/sparc/kernel/kgdb_64.c b/arch/sparc/kernel/kgdb_64.c
index f5a0fd4..0a2bd0f 100644
--- a/arch/sparc/kernel/kgdb_64.c
+++ b/arch/sparc/kernel/kgdb_64.c
@@ -5,6 +5,7 @@
 
 #include <linux/kgdb.h>
 #include <linux/kdebug.h>
+#include <linux/ftrace.h>
 
 #include <asm/kdebug.h>
 #include <asm/ptrace.h>
@@ -108,7 +109,7 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
 }
 
 #ifdef CONFIG_SMP
-void smp_kgdb_capture_client(int irq, struct pt_regs *regs)
+void __irq_entry smp_kgdb_capture_client(int irq, struct pt_regs *regs)
 {
 	unsigned long flags;
 
diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c
index 2d94e7a..c4a6a50 100644
--- a/arch/sparc/kernel/pcr.c
+++ b/arch/sparc/kernel/pcr.c
@@ -8,6 +8,7 @@
 #include <linux/irq.h>
 
 #include <linux/perf_event.h>
+#include <linux/ftrace.h>
 
 #include <asm/pil.h>
 #include <asm/pcr.h>
@@ -34,7 +35,7 @@ unsigned int picl_shift;
  * Therefore in such situations we defer the work by signalling
  * a lower level cpu IRQ.
  */
-void deferred_pcr_work_irq(int irq, struct pt_regs *regs)
+void __irq_entry deferred_pcr_work_irq(int irq, struct pt_regs *regs)
 {
 	struct pt_regs *old_regs;
 
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index eb14844..1011fbf 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -22,6 +22,7 @@
 #include <linux/profile.h>
 #include <linux/bootmem.h>
 #include <linux/vmalloc.h>
+#include <linux/ftrace.h>
 #include <linux/cpu.h>
 
 #include <asm/head.h>
@@ -822,13 +823,13 @@ void arch_send_call_function_single_ipi(int cpu)
 		      &cpumask_of_cpu(cpu));
 }
 
-void smp_call_function_client(int irq, struct pt_regs *regs)
+void __irq_entry smp_call_function_client(int irq, struct pt_regs *regs)
 {
 	clear_softint(1 << irq);
 	generic_smp_call_function_interrupt();
 }
 
-void smp_call_function_single_client(int irq, struct pt_regs *regs)
+void __irq_entry smp_call_function_single_client(int irq, struct pt_regs *regs)
 {
 	clear_softint(1 << irq);
 	generic_smp_call_function_single_interrupt();
@@ -964,7 +965,7 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page)
 	put_cpu();
 }
 
-void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs)
+void __irq_entry smp_new_mmu_context_version_client(int irq, struct pt_regs *regs)
 {
 	struct mm_struct *mm;
 	unsigned long flags;
@@ -1148,7 +1149,7 @@ void smp_release(void)
  */
 extern void prom_world(int);
 
-void smp_penguin_jailcell(int irq, struct pt_regs *regs)
+void __irq_entry smp_penguin_jailcell(int irq, struct pt_regs *regs)
 {
 	clear_softint(1 << irq);
 
@@ -1364,7 +1365,7 @@ void smp_send_reschedule(int cpu)
 		      &cpumask_of_cpu(cpu));
 }
 
-void smp_receive_signal_client(int irq, struct pt_regs *regs)
+void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs)
 {
 	clear_softint(1 << irq);
 }
diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c
index f25858e..c7bbe6c 100644
--- a/arch/sparc/kernel/time_64.c
+++ b/arch/sparc/kernel/time_64.c
@@ -35,6 +35,7 @@
 #include <linux/clocksource.h>
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
+#include <linux/ftrace.h>
 
 #include <asm/oplib.h>
 #include <asm/timer.h>
@@ -717,7 +718,7 @@ static struct clock_event_device sparc64_clockevent = {
 };
 static DEFINE_PER_CPU(struct clock_event_device, sparc64_events);
 
-void timer_interrupt(int irq, struct pt_regs *regs)
+void __irq_entry timer_interrupt(int irq, struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 	unsigned long tick_mask = tick_ops->softint_mask;
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index 4e59925..c4f5758 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -46,6 +46,7 @@ SECTIONS
 		SCHED_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
+		IRQENTRY_TEXT
 		*(.gnu.warning)
 	} = 0
 	_etext = .;
diff --git a/arch/sparc/lib/mcount.S b/arch/sparc/lib/mcount.S
index 153c80e..3753e3c 100644
--- a/arch/sparc/lib/mcount.S
+++ b/arch/sparc/lib/mcount.S
@@ -26,22 +26,42 @@ mcount:
 #else
 	sethi		%hi(function_trace_stop), %g1
 	lduw		[%g1 + %lo(function_trace_stop)], %g2
-	brnz,pn		%g2, 1f
+	brnz,pn		%g2, 2f
 	 sethi		%hi(ftrace_trace_function), %g1
 	sethi		%hi(ftrace_stub), %g2
 	ldx		[%g1 + %lo(ftrace_trace_function)], %g1
 	or		%g2, %lo(ftrace_stub), %g2
 	cmp		%g1, %g2
 	be,pn		%icc, 1f
-	 mov		%i7, %g2
+	 mov		%i7, %g3
 	save		%sp, -128, %sp
-	mov		%g2, %o1
+	mov		%g3, %o1
 	jmpl		%g1, %o7
 	 mov		%i7, %o0
 	ret
 	 restore
 	/* not reached */
 1:
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	sethi		%hi(ftrace_graph_return), %g1
+	ldx		[%g1 + %lo(ftrace_graph_return)], %g3
+	cmp		%g2, %g3
+	bne,pn		%xcc, 5f
+	 sethi		%hi(ftrace_graph_entry_stub), %g2
+	sethi		%hi(ftrace_graph_entry), %g1
+	or		%g2, %lo(ftrace_graph_entry_stub), %g2
+	ldx		[%g1 + %lo(ftrace_graph_entry)], %g1
+	cmp		%g1, %g2
+	be,pt		%xcc, 2f
+	 nop
+5:	mov		%i7, %g2
+	mov		%fp, %g3
+	save		%sp, -128, %sp
+	mov		%g2, %l0
+	ba,pt		%xcc, ftrace_graph_caller
+	 mov		%g3, %l1
+#endif
+2:
 #endif
 #endif
 	retl
@@ -62,18 +82,48 @@ ftrace_stub:
 ftrace_caller:
 	sethi		%hi(function_trace_stop), %g1
 	mov		%i7, %g2
-	lduw		[%g1 + %lo(function_trace_stop)], %g3
-	brnz,pn		%g3, ftrace_stub
-	 nop
+	lduw		[%g1 + %lo(function_trace_stop)], %g1
+	brnz,pn		%g1, ftrace_stub
+	 mov		%fp, %g3
 	save		%sp, -128, %sp
 	mov		%g2, %o1
+	mov		%g2, %l0
+	mov		%g3, %l1
 	.globl		ftrace_call
 ftrace_call:
 	call		ftrace_stub
 	 mov		%i7, %o0
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	.globl		ftrace_graph_call
+ftrace_graph_call:
+	call		ftrace_stub
+	 nop
+#endif
 	ret
 	 restore
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	.size		ftrace_graph_call,.-ftrace_graph_call
+#endif
 	.size		ftrace_call,.-ftrace_call
 	.size		ftrace_caller,.-ftrace_caller
 #endif
 #endif
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+	mov		%l0, %o0
+	mov		%i7, %o1
+	call		prepare_ftrace_return
+	 mov		%l1, %o2
+	ret
+	 restore	%o0, -8, %i7
+END(ftrace_graph_caller)
+
+ENTRY(return_to_handler)
+	save		%sp, -128, %sp
+	call		ftrace_return_to_handler
+	 mov		%fp, %o0
+	jmpl		%o0 + 8, %g0
+	 restore
+END(return_to_handler)
+#endif
-- 
1.7.0.4



* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
From: Frederic Weisbecker @ 2010-04-13 19:18 UTC (permalink / raw)
  To: sparclinux

On Mon, Apr 12, 2010 at 11:43:00PM -0700, David Miller wrote:
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>  arch/sparc/Kconfig              |    2 +
>  arch/sparc/kernel/Makefile      |    1 +
>  arch/sparc/kernel/ftrace.c      |   58 ++++++++++++++++++++++++++++++++++++
>  arch/sparc/kernel/irq_64.c      |    3 +-
>  arch/sparc/kernel/kgdb_64.c     |    3 +-
>  arch/sparc/kernel/pcr.c         |    3 +-
>  arch/sparc/kernel/smp_64.c      |   11 ++++---
>  arch/sparc/kernel/time_64.c     |    3 +-
>  arch/sparc/kernel/vmlinux.lds.S |    1 +
>  arch/sparc/lib/mcount.S         |   62 +++++++++++++++++++++++++++++++++++----
>  10 files changed, 132 insertions(+), 15 deletions(-)



Works very well :)

 63)               |  do_sparc64_fault() {
 63)   6.153 us    |    down_read_trylock();
 63)   5.164 us    |    find_vma();
 63)               |    handle_mm_fault() {
 63)   2.966 us    |      add_preempt_count();
 63)   2.856 us    |      sub_preempt_count();
 63)               |      _raw_spin_lock() {
 63)   2.857 us    |        add_preempt_count();
 63)   3.076 us    |        add_preempt_count();
 63)   2.527 us    |        sub_preempt_count();
 63) + 23.513 us   |      }


I planned to add this support but got distracted by the
function tracer problems. And more importantly I'm not yet
familiar enough with sparc.

Thanks!
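
For anyone wanting to reproduce this kind of output, here is a minimal
userspace sketch. It assumes debugfs is mounted at /sys/kernel/debug and
that the kernel was built with CONFIG_FUNCTION_GRAPH_TRACER; the
write_str() helper is only part of the sketch, not a kernel or libc API.

#include <stdio.h>

#define TRACING "/sys/kernel/debug/tracing/"

/* Tiny helper, local to this sketch: write a string to a tracing file. */
static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	char line[512];
	FILE *trace;
	int n = 0;

	/* Select the function graph tracer. */
	if (write_str(TRACING "current_tracer", "function_graph")) {
		perror(TRACING "current_tracer");
		return 1;
	}

	/* Dump the first 40 lines of the global trace buffer. */
	trace = fopen(TRACING "trace", "r");
	if (!trace) {
		perror(TRACING "trace");
		return 1;
	}
	while (n++ < 40 && fgets(line, sizeof(line), trace))
		fputs(line, stdout);
	fclose(trace);

	/* Restore the default (no) tracer. */
	write_str(TRACING "current_tracer", "nop");
	return 0;
}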



* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
From: Rostedt @ 2010-04-13 19:39 UTC (permalink / raw)
  To: sparclinux



On Apr 13, 2010, at 3:18 PM, Frederic Weisbecker <fweisbec@gmail.com>  
wrote:

> On Mon, Apr 12, 2010 at 11:43:00PM -0700, David Miller wrote:
>>
>> Signed-off-by: David S. Miller <davem@davemloft.net>
>> ---
>> arch/sparc/Kconfig              |    2 +
>> arch/sparc/kernel/Makefile      |    1 +
>> arch/sparc/kernel/ftrace.c      |   58 +++++++++++++++++++++++++++++ 
>> +++++++
>> arch/sparc/kernel/irq_64.c      |    3 +-
>> arch/sparc/kernel/kgdb_64.c     |    3 +-
>> arch/sparc/kernel/pcr.c         |    3 +-
>> arch/sparc/kernel/smp_64.c      |   11 ++++---
>> arch/sparc/kernel/time_64.c     |    3 +-
>> arch/sparc/kernel/vmlinux.lds.S |    1 +
>> arch/sparc/lib/mcount.S         |   62 +++++++++++++++++++++++++++++ 
>> ++++++----
>> 10 files changed, 132 insertions(+), 15 deletions(-)
>
>
>
> Works very well :)
>
> 63)               |  do_sparc64_fault() {
> 63)   6.153 us    |    down_read_trylock();
> 63)   5.164 us    |    find_vma();
> 63)               |    handle_mm_fault() {
> 63)   2.966 us    |      add_preempt_count();
> 63)   2.856 us    |      sub_preempt_count();
> 63)               |      _raw_spin_lock() {
> 63)   2.857 us    |        add_preempt_count();
> 63)   3.076 us    |        add_preempt_count();
> 63)   2.527 us    |        sub_preempt_count();
> 63) + 23.513 us   |      }
>

I just had to comment on the CPU count.

Nice ;-)

-- Steve

>
> I planned to add this support but got distracted by the
> function tracer problems. And more importantly I'm not yet
> familiar enough with sparc.
>
> Thanks!
>


* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
From: Frederic Weisbecker @ 2010-04-13 19:45 UTC (permalink / raw)
  To: sparclinux

On Tue, Apr 13, 2010 at 03:39:21PM -0400, Rostedt wrote:
>
>
> On Apr 13, 2010, at 3:18 PM, Frederic Weisbecker <fweisbec@gmail.com>  
> wrote:
>
>> On Mon, Apr 12, 2010 at 11:43:00PM -0700, David Miller wrote:
>>>
>>> Signed-off-by: David S. Miller <davem@davemloft.net>
>>> ---
>>> arch/sparc/Kconfig              |    2 +
>>> arch/sparc/kernel/Makefile      |    1 +
>>> arch/sparc/kernel/ftrace.c      |   58 +++++++++++++++++++++++++++++ 
>>> +++++++
>>> arch/sparc/kernel/irq_64.c      |    3 +-
>>> arch/sparc/kernel/kgdb_64.c     |    3 +-
>>> arch/sparc/kernel/pcr.c         |    3 +-
>>> arch/sparc/kernel/smp_64.c      |   11 ++++---
>>> arch/sparc/kernel/time_64.c     |    3 +-
>>> arch/sparc/kernel/vmlinux.lds.S |    1 +
>>> arch/sparc/lib/mcount.S         |   62 +++++++++++++++++++++++++++++ 
>>> ++++++----
>>> 10 files changed, 132 insertions(+), 15 deletions(-)
>>
>>
>>
>> Works very well :)
>>
>> 63)               |  do_sparc64_fault() {
>> 63)   6.153 us    |    down_read_trylock();
>> 63)   5.164 us    |    find_vma();
>> 63)               |    handle_mm_fault() {
>> 63)   2.966 us    |      add_preempt_count();
>> 63)   2.856 us    |      sub_preempt_count();
>> 63)               |      _raw_spin_lock() {
>> 63)   2.857 us    |        add_preempt_count();
>> 63)   3.076 us    |        add_preempt_count();
>> 63)   2.527 us    |        sub_preempt_count();
>> 63) + 23.513 us   |      }
>>
>
> I just had to comment on the CPU count.
>
> Nice ;-)
>
> -- Steve


Heh yeah. The trace/trace_pipe in the main directory are basically
unusable there. Only per_cpu can make it :)
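
A minimal sketch of reading a single CPU's buffer through those per_cpu
files, assuming debugfs is mounted at /sys/kernel/debug and that this
kernel exposes per_cpu/cpuN/trace (adjust the path if the layout differs):

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	char path[128], line[512];
	int cpu = argc > 1 ? atoi(argv[1]) : 0;
	FILE *f;

	/* Per-CPU snapshot file; layout assumed, adjust if it differs. */
	snprintf(path, sizeof(path),
		 "/sys/kernel/debug/tracing/per_cpu/cpu%d/trace", cpu);

	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}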



* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
From: David Miller @ 2010-04-13 21:34 UTC (permalink / raw)
  To: sparclinux

From: Rostedt <rostedt@goodmis.org>
Date: Tue, 13 Apr 2010 15:39:21 -0400

>> Works very well :)
>>
>> 63)               |  do_sparc64_fault() {
>> 63)   6.153 us    |    down_read_trylock();
>> 63)   5.164 us    |    find_vma();
>> 63)               |    handle_mm_fault() {
>> 63)   2.966 us    |      add_preempt_count();
>> 63)   2.856 us    |      sub_preempt_count();
>> 63)               |      _raw_spin_lock() {
>> 63)   2.857 us    |        add_preempt_count();
>> 63)   3.076 us    |        add_preempt_count();
>> 63)   2.527 us    |        sub_preempt_count();
>> 63) + 23.513 us   |      }
>>
> 
> I just had to comment on the CPU count.
> 
> Nice ;-)

Neener, neener:

 101)               |          run_local_timers() {
 115) + 24.063 us   |            raise_softirq();
  69)               |          run_local_timers() {
  85) + 23.623 us   |            hrtimer_run_queues();
 103)               |          run_local_timers() {
  66) + 71.749 us   |          }
  91)               |          run_local_timers() {
 106) + 23.184 us   |            raise_softirq();
  65) + 29.117 us   |            raise_softirq();
  68) + 23.843 us   |            hrtimer_run_queues();
 112) + 23.623 us   |            hrtimer_run_queues();
 105) + 24.063 us   |            hrtimer_run_queues();
  17) + 25.161 us   |            raise_softirq();
  82) + 24.282 us   |            hrtimer_run_queues();
 100) + 72.628 us   |          }

BTW, one thing that drives me nuts on this machine is that catting the
'trace' file takes several seconds to start up.  Is it calling
stop_machine() or something else which is very expensive with high cpu
counts?


* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
From: David Miller @ 2010-04-13 21:35 UTC (permalink / raw)
  To: sparclinux

From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 13 Apr 2010 21:45:07 +0200

> Heh yeah. The trace/trace_pipe in the main directory are basically
> unusable there. Only per_cpu can make it :)

Does it delay for several seconds at the beginning when you 'cat' them
just like it does for me?


* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
From: Frederic Weisbecker @ 2010-04-13 21:51 UTC (permalink / raw)
  To: sparclinux

On Tue, Apr 13, 2010 at 02:35:37PM -0700, David Miller wrote:
> From: Frederic Weisbecker <fweisbec@gmail.com>
> Date: Tue, 13 Apr 2010 21:45:07 +0200
> 
> > Heh yeah. The trace/trace_pipe in the main directory are basically
> > unusable there. Only per_cpu can make it :)
> 
> Does it delay for several seconds at the beginning when you 'cat' them
> just like it does for me?


More than that actually, I tried and it just crashed :-)


[  344.012840] INFO: RCU detected CPU stalls: 23 37 41 (detected by 0, tE02 jiffies)
[  344.025991] CPU[0]: Unexpected SUN4V mondo error 6
[  344.032430] CPU[0]: Args were cnt(63) cpulist_pa(3ff00e040) mondo_block_pa(3ff00e000)
[  344.045283] CPU[0]: CPU list [ 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 65535 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 ]
[  344.084875] * CPU[  0]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[�^@:-2113896449]
[  344.098508]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[(null)]
[  344.112395]   CPU[  1]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.126399]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.140583]   CPU[  2]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.154895]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.169076]   CPU[  3]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.183521]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.197909]   CPU[  4]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.212263]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.226612]   CPU[  5]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.240553]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.254027]   CPU[  6]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.267131]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.280255]   CPU[  7]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.293348]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.306543]   CPU[  8]: TSTATE[0000009980001603] TPC[000000000042f51c] TNPC[000000000042f520] TASK[swapper:0]
[  344.319678]              TPC[__delay+0x2c/0x4c] O7[__delay+0x2c/0x4c] I7[do_raw_spin_lock+0xbc/0x120] RPC[_raw_spin_lock_irq+0x64/0x80]
[  344.332845]   CPU[  9]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.345914]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.359070]   CPU[ 10]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.372142]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.385268]   CPU[ 11]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.398325]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.411479]   CPU[ 12]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.424539]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.437692]   CPU[ 13]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.450748]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.463870]   CPU[ 14]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.476900]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.490005]   CPU[ 15]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.503061]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.516222]   CPU[ 16]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.529317]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.542513]   CPU[ 17]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.555608]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.568769]   CPU[ 18]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.581849]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.594985]   CPU[ 19]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.608051]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.621238]   CPU[ 20]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.634357]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.647597]   CPU[ 21]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.660763]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.673977]   CPU[ 22]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.687099]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.700291]   CPU[ 23]: TSTATE[0000000080001606] TPC[000000000042f4f4] TNPC[000000000042f4f8] TASK[swapper:0]
[  344.713429]              TPC[__delay+0x4/0x4c] O7[__delay+0x2c/0x4c] I7[do_raw_spin_lock+0xbc/0x120] RPC[_raw_spin_lock_irqsave+0x6c/0x88]
[  344.726659]   CPU[ 24]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.739760]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.752939]   CPU[ 25]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.766021]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.779151]   CPU[ 26]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.792185]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.805257]   CPU[ 27]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.818276]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.831423]   CPU[ 28]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.844424]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.857496]   CPU[ 29]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.870459]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.883482]   CPU[ 30]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.896482]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.909565]   CPU[ 31]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.922580]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.935690]   CPU[ 32]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.948716]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.961824]   CPU[ 33]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  344.974824]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  344.987877]   CPU[ 34]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  345.000852]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  345.013897]   CPU[ 35]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  345.026871]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  345.039967]   CPU[ 36]: TSTATE[0000004411001601] TPC[000000000042ca74] TNPC[000000000042ca78] TASK[swapper:0]
[  345.052993]              TPC[cpu_idle+0xd8/0x164] O7[cpu_idle+0xc4/0x164] I7[after_lock_tlb+0x1b4/0x1cc] RPC[clocksource_tick_read+0x10/0x20]
[  345.066257]   CPU[ 37]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.079270]              TPC[0] O7[0] I7[0] RPC[0]
[  345.085891]   CPU[ 38]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.098833]              TPC[0] O7[0] I7[0] RPC[0]
[  345.105400]   CPU[ 39]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.119523]              TPC[0] O7[0] I7[0] RPC[0]
[  345.126561]   CPU[ 40]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.139451]              TPC[0] O7[0] I7[0] RPC[0]
[  345.145944]   CPU[ 41]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.158607]              TPC[0] O7[0] I7[0] RPC[0]
[  345.165032]   CPU[ 42]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.177583]              TPC[0] O7[0] I7[0] RPC[0]
[  345.183933]   CPU[ 43]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.196318]              TPC[0] O7[0] I7[0] RPC[0]
[  345.202575]   CPU[ 44]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.214796]              TPC[0] O7[0] I7[0] RPC[0]
[  345.220984]   CPU[ 45]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.233051]              TPC[0] O7[0] I7[0] RPC[0]
[  345.239219]   CPU[ 46]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.251249]              TPC[0] O7[0] I7[0] RPC[0]
[  345.257386]   CPU[ 47]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.269344]              TPC[0] O7[0] I7[0] RPC[0]
[  345.275456]   CPU[ 48]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.287363]              TPC[0] O7[0] I7[0] RPC[0]
[  345.293462]   CPU[ 49]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.305399]              TPC[0] O7[0] I7[0] RPC[0]
[  345.311510]   CPU[ 50]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.323437]              TPC[0] O7[0] I7[0] RPC[0]
[  345.329550]   CPU[ 51]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.341503]              TPC[0] O7[0] I7[0] RPC[0]
[  345.347639]   CPU[ 52]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.359611]              TPC[0] O7[0] I7[0] RPC[0]
[  345.365736]   CPU[ 53]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.377683]              TPC[0] O7[0] I7[0] RPC[0]
[  345.383778]   CPU[ 54]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.395690]              TPC[0] O7[0] I7[0] RPC[0]
[  345.401807]   CPU[ 55]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.413781]              TPC[0] O7[0] I7[0] RPC[0]
[  345.419931]   CPU[ 56]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.431927]              TPC[0] O7[0] I7[0] RPC[0]
[  345.438068]   CPU[ 57]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.450041]              TPC[0] O7[0] I7[0] RPC[0]
[  345.456182]   CPU[ 58]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.468155]              TPC[0] O7[0] I7[0] RPC[0]
[  345.474271]   CPU[ 59]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.486216]              TPC[0] O7[0] I7[0] RPC[0]
[  345.492310]   CPU[ 60]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.504249]              TPC[0] O7[0] I7[0] RPC[0]
[  345.510388]   CPU[ 61]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.522387]              TPC[0] O7[0] I7[0] RPC[0]
[  345.528539]   CPU[ 62]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.540542]              TPC[0] O7[0] I7[0] RPC[0]
[  345.546689]   CPU[ 63]: TSTATE[0000000000000000] TPC[0000000000000000] TNPC[0000000000000000] TASK[NULL:-1]
[  345.558657]              TPC[0] O7[0] I7[0] RPC[0]
[  345.564659] CPU[0]: Unexpected SUN4V mondo error 6
[  345.570555] CPU[0]: Args were cnt(1) cpulist_pa(3ff00e040) mondo_block_pa(3ff00e000)
[  345.582283] CPU[0]: CPU list [ 37 ]
[  345.588358] FAULT[�^@:-2113896449]: 32-bit process reports 64-bit fault address [fffffffe094b6000]
[  345.600198] TSTATE: 0000000880e01607 TPC: 00000000004c9d58 TNPC: 00000000004c9d5c Y: 00000000    Tainted: G      D W 
[  345.612325] TPC: <tracing_record_cmdline+0x74/0x150>
[  345.618411] g0: fffff803f41f5220 g1: 000000000149607c g2: fffffffe0801fffc g3: c25e61f0c25e6088
[  345.630440] g4: 0000000000457a98 g5: fffff80011244000 g6: fffff803f6830000 g7: 0000000000457a98
[  345.642799] o0: 0000000000000001 o1: fffff803f08e0500 o2: 0000000000000003 o3: 0000000000000000
[  345.655381] o4: 0000000000000002 o5: 0000000000000000 sp: fffff803f6832b41 ret_pc: 00000000004c9d0c
[  345.668216] RPC: <tracing_record_cmdline+0x28/0x150>
[  345.674722] l0: 00000000008e23c0 l1: 0000000000457a98 l2: 0000000000462ec0 l3: 0000000000bf3d30
[  345.687603] l4: 000000000000e096 l5: 0000000000bf3d10 l6: 0000000000bf3d20 l7: 00000000008d0180
[  345.700812] i0: 0000000000457a98 i1: fffff803f08e0500 i2: 0000000000000001 i3: 0000000000000000
[  345.714324] i4: 0000000000000002 i5: 0000000000000000 i6: fffff803f6832c01 i7: 00000000004ce3a4
[  345.728341] I7: <probe_sched_wakeup+0x18/0xcc>
[  345.735308] Unable to handle kernel paging request at virtual address fffffffe094b6000
[  345.749377] Kernel unaligned access at TPC[44e4f4] unhandled_fault+0x4c/0xa8
[  345.756615] Unable to handle kernel paging request in mna handler
[  345.757001]  at virtual address c25e400082086382
[  345.770927] Kernel unaligned access at TPC[4334d8] kernel_mna_trap_fault+0x80/0x108
[  345.785055] Unable to handle kernel paging request in mna handler
[  345.785441]  at virtual address c25e400082086382
[  345.799386] Kernel unaligned access at TPC[4334d8] kernel_mna_trap_fault+0x80/0x108
[  345.813172] Unable to handle kernel paging request in mna handler
[  345.813558]  at virtual address c25e400082086382
[  345.827112] Kernel unaligned access at TPC[4334d8] kernel_mna_trap_fault+0x80/0x108
[  345.840680] Unable to handle kernel paging request in mna handler
[  345.841067]  at virtual address c25e400082086382
[  345.854636] Kernel unaligned access at TPC[4334d8] kernel_mna_trap_fault+0x80/0x108
[  345.868038] Unable to handle kernel paging request in mna handler
[  345.868423]  at virtual address c25e400082086382
[  345.881562] Unable to handle kernel paging request in mna handler
[  345.881949]  at virtual address c25e400082086382
[  345.894779] Unable to handle kernel paging request in mna handler
[  345.895166]  at virtual address c25e400082086382
[  345.908006] Unable to handle kernel paging request in mna handler
[  345.908394]  at virtual address c25e400082086382
[  345.920982] Unable to handle kernel paging request in mna handler
[  345.921375]  at virtual address fffffffffffff803
[  349.722230]               \|/ ____ \|/
[  349.722239]               "@'/ .. \`@"
[  349.722246]               /_| \__/ |_\
[  349.722254]                  \__U_/
[  349.745936] swapper(0): Kernel illegal instruction [#7]
[  349.751644] TSTATE: 0000004480e01602 TPC: 0000000000457b5c TNPC: 0000000000457b60 Y: 00000002    Tainted: G      D W 
[  349.762859] TPC: <enqueue_task+0x18/0x84>
[  349.768324] g0: 0000000000000002 g1: 0000000000000001 g2: 0000000000457d08 g3: fffff803f7c76fc1
[  349.779262] g4: fffff803f7c72900 g5: fffff80013444000 g6: fffff803f7c74000 g7: 0000000000000000
[  349.790308] o0: 00000000008f06e0 o1: fffff80013fc5c98 o2: 000000000076328c o3: 0000000000000000
[  349.801676] o4: 0000000000000002 o5: 0000000000000001 sp: fffff803f7c76f01 ret_pc: 0000000000457b4c
[  349.813239] RPC: <enqueue_task+0x8/0x84>
[  349.818953] l0: 0000000000000000 l1: 000000000000000e l2: ffffffffffffffff l3: 0000000000000000
[  349.830486] l4: fffff803f7c72900 l5: 0006000000000000 l6: 000000000000000e l7: 0000000080001002
[  349.842331] i0: fffff80013fc5c80 i1: fffff803f792a600 i2: 0000000000000001 i3: 0000000000000000
[  349.854441] i4: 0000000000000002 i5: 0000000000000000 i6: fffff803f7c76fc1 i7: 0000000000457d08
[  349.866888] I7: <activate_task+0x48/0x64>
[  349.873274] Caller[0000000000457d08]: activate_task+0x48/0x64
[  349.879855] Caller[0000000000463080]: try_to_wake_up+0x368/0x57c
[  349.886501] Caller[0000000000463304]: wake_up_process+0x18/0x2c
[  349.893098] Caller[0000000000492854]: hrtimer_wakeup+0x1c/0x2c
[  349.899590] Caller[0000000000492a4c]: __run_hrtimer+0x1e8/0x344
[  349.905973] Caller[0000000000492e1c]: hrtimer_interrupt+0xe0/0x1d8
[  349.912359] Caller[0000000000764534]: timer_interrupt+0xe0/0x11c
[  349.918642] Caller[0000000000426b04]: valid_addr_bitmap_patch+0xbc/0x238
[  349.924822] Caller[000000000042ca60]: cpu_idle+0xc4/0x164
[  349.930805] Caller[000000000075c318]: after_lock_tlb+0x1b4/0x1cc
[  349.936677] Caller[0000000000000000]: (null)
[  349.942263] Instruction DUMP: 901222e0  22ce8005  ca484db2 <da7c9876> c27660b8  c25e61f0  0ac0400d  05002e07  c25e6008 
[  349.953956] Kernel panic - not syncing: Aiee, killing interrupt handler!
[  349.959488] Call Trace:
[  349.964988]  [000000000075f960] panic+0x64/0x130
[  349.970469]  [0000000000472e2c] do_exit+0x70/0x77c
[  349.975756]  [0000000000427e84] die_if_kernel+0x2a8/0x2d4
[  349.980912]  [00000000004281dc] do_illegal_instruction+0x58/0x180
[  349.985940]  [0000000000420210] tl0_ill+0x10/0x20
[  349.990751]  [0000000000457b5c] enqueue_task+0x18/0x84
[  349.995410]  [0000000000457d08] activate_task+0x48/0x64
[  349.999903]  [0000000000463080] try_to_wake_up+0x368/0x57c
[  350.004233]  [0000000000463304] wake_up_process+0x18/0x2c
[  350.008568]  [0000000000492854] hrtimer_wakeup+0x1c/0x2c
[  350.012872]  [0000000000492a4c] __run_hrtimer+0x1e8/0x344
[  350.017180]  [0000000000492e1c] hrtimer_interrupt+0xe0/0x1d8
[  350.021502]  [0000000000764534] timer_interrupt+0xe0/0x11c
[  350.025828]  [0000000000426b04] valid_addr_bitmap_patch+0xbc/0x238
[  350.030195]  [000000000042ca74] cpu_idle+0xd8/0x164
[  350.034505]  [000000000075c318] after_lock_tlb+0x1b4/0x1cc
[  350.038920] Press Stop-A (L1-A) to return to the boot prom
[  354.379142] BUG: NMI Watchdog detected LOCKUP on CPU0, ip 0042f518, registers:
[  354.388171] TSTATE: 0000009980e01603 TPC: 000000000042f518 TNPC: 000000000042f51c Y: 00000000    Tainted: G      D W 
[  354.397811] TPC: <__delay+0x28/0x4c>
[  354.402429] g0: 000000000000000e g1: 00000000008e2440 g2: 00000000008e2440 g3: 0000000000000000
[  354.411707] g4: fffff803f6752ce0 g5: fffff80011244000 g6: fffff803f43c8000 g7: 0000000000000000
[  354.421574] o0: 0000000000000000 o1: fffff803f43c8400 o2: 00000000005ea904 o3: 0000000000000000
[  354.432139] o4: 0000000000000102 o5: 0000000000000008 sp: fffff803ffa6a981 ret_pc: 000000000042f51c
[  354.443299] RPC: <__delay+0x2c/0x4c>
[  354.448944] l0: 000000601f09927f l1: 00000000008e23c8 l2: 00000000005ea900 l3: 0000000000000400
[  354.460644] l4: 000000000000000e l5: 0000000000000002 l6: 0000000000000000 l7: 0000000000000008
[  354.473018] i0: 0000000000000001 i1: 0000000000877be8 i2: 00000000005e32dc i3: 0000000000000000
[  354.486104] i4: 0000000000000002 i5: 0000000000000001 i6: fffff803ffa6aa41 i7: 00000000005ea90c
[  354.499817] I7: <do_raw_spin_lock+0xbc/0x120>
[  354.506789] Call Trace:
[  354.513660]  [000000000043a7f4] perfctr_irq+0x248/0x3f4
[  354.520555]  [00000000004209f4] tl0_irq15+0x14/0x20
[  354.527184]  [000000000042f518] __delay+0x28/0x4c
[  354.533649]  [00000000005ea90c] do_raw_spin_lock+0xbc/0x120
[  354.540074]  [0000000000763410] _raw_spin_lock_irqsave+0x6c/0x88
[  354.546547]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  354.552910]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  354.558978]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  354.564780]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  354.570306]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  364.853704] BUG: NMI Watchdog detected LOCKUP on CPU46, ip 0042f174, registers:
[  364.863806] TSTATE: 0000000080e01604 TPC: 000000000042f174 TNPC: 000000000042f178 Y: 00000000    Tainted: G      D W 
[  364.874219] TPC: <stick_get_tick+0x4/0x18>
[  364.879332] g0: 0000000000001000 g1: 000000000042f170 g2: 00000000008e2440 g3: 0000000000000000
[  364.889400] g4: fffff803f69d80e0 g5: fffff80016e44000 g6: fffff803f69e4000 g7: 0000000000000000
[  364.899456] o0: 0000000000000002 o1: fffff803f69e4400 o2: 000000000042f538 o3: 0000000000000000
[  364.909482] o4: 000000000000004f o5: 0000000000000002 sp: fffff803ff8fb1a1 ret_pc: 00000000004209f4
[  364.919667] RPC: <tl0_irq15+0x14/0x20>
[  364.924821] l0: 0000000000001000 l1: 0000000080001603 l2: 000000000042f534 l3: 0000000000000400
[  364.935493] l4: 000000000000000e l5: 0000000000000002 l6: 0000000000000000 l7: 0000000000000008
[  364.946876] i0: 0000000000000080 i1: fffff803f69e4400 i2: 00000000005ea904 i3: 0000000000000000
[  364.958878] i4: 0000000000000102 i5: 00000000005c8c78 i6: fffff803ff8fb261 i7: 000000000042f508
[  364.971820] I7: <__delay+0x18/0x4c>
[  364.978506] Call Trace:
[  364.985232]  [000000000043a7f4] perfctr_irq+0x248/0x3f4
[  364.992013]  [00000000004209f4] tl0_irq15+0x14/0x20
[  364.998527]  [000000000042f174] stick_get_tick+0x4/0x18
[  365.004942]  [000000000042f508] __delay+0x18/0x4c
[  365.011157]  [00000000005ea90c] do_raw_spin_lock+0xbc/0x120
[  365.017473]  [0000000000763410] _raw_spin_lock_irqsave+0x6c/0x88
[  365.023707]  [00000000005c8c80] blk_rq_timed_out_timer+0x18/0xf0
[  365.029795]  [000000000047f0a0] run_timer_softirq+0x370/0x50c
[  365.035736]  [0000000000476b0c] __do_softirq+0x1c4/0x384
[  365.041495]  [000000000042a720] do_softirq+0x9c/0xdc
[  365.047059]  [00000000004764d0] irq_exit+0x74/0xdc
[  365.052368]  [000000000076453c] timer_interrupt+0xe8/0x11c
[  365.057586]  [0000000000426b04] valid_addr_bitmap_patch+0xbc/0x238
[  365.062701]  [000000000042ca74] cpu_idle+0xd8/0x164
[  365.067743]  [000000000075c318] after_lock_tlb+0x1b4/0x1cc
[  365.072767]  [0000000000000000] (null)
[  368.199367] BUG: NMI Watchdog detected LOCKUP on CPU8, ip 0042f178, registers:
[  368.209054] TSTATE: 0000009980e01604 TPC: 000000000042f178 TNPC: 000000000042f17c Y: 00000000    Tainted: G      D W 
[  368.218808] TPC: <stick_get_tick+0x8/0x18>
[  368.223475] g0: 0000000000001000 g1: 00000063deac43ff g2: 00000000008e2440 g3: 0000000000000000
[  368.232857] g4: fffff803f1c383a0 g5: fffff80012244000 g6: fffff803f0a30000 g7: 0000000000000000
[  368.242978] o0: 0000000000000001 o1: fffff803f0a30400 o2: 000000000042f530 o3: 0000000000000000
[  368.253788] o4: 000000000000004f o5: 0000000000000002 sp: fffff803f0a323d1 ret_pc: 00000000004209f4
[  368.265260] RPC: <tl0_irq15+0x14/0x20>
[  368.271042] l0: 0000000000001000 l1: 0000009980001603 l2: 000000000042f52c l3: 0000000000000400
[  368.282920] l4: 000000000000000e l5: 0000000000000000 l6: 0000000000000000 l7: 0000000000000008
[  368.295455] i0: 0000000000000000 i1: fffff803f0a30400 i2: 000000000042f4f4 i3: 0000000000000000
[  368.308751] i4: 0000000000000000 i5: 00000000005c3cd0 i6: fffff803f0a32491 i7: 000000000042f51c
[  368.322673] I7: <__delay+0x2c/0x4c>
[  368.329662] Call Trace:
[  368.336588]  [000000000043a7f4] perfctr_irq+0x248/0x3f4
[  368.343537]  [00000000004209f4] tl0_irq15+0x14/0x20
[  368.350231]  [000000000042f178] stick_get_tick+0x8/0x18
[  368.356841]  [000000000042f51c] __delay+0x2c/0x4c
[  368.363211]  [00000000005ea90c] do_raw_spin_lock+0xbc/0x120
[  368.369654]  [0000000000763388] _raw_spin_lock_irq+0x64/0x80
[  368.375995]  [00000000005c3d08] __make_request+0x40/0x604
[  368.382150]  [00000000005c1bd0] generic_make_request+0x51c/0x5a4
[  368.388183]  [00000000005c1d44] submit_bio+0xec/0xfc
[  368.394041]  [0000000000545df4] submit_bh+0x150/0x178
[  368.399691]  [0000000000549604] __block_write_full_page+0x29c/0x404
[  368.405236]  [0000000000549840] block_write_full_page_endio+0xd4/0xe8
[  368.410667]  [0000000000549874] block_write_full_page+0x20/0x34
[  368.415904]  [0000000000583f98] ext3_writeback_writepage+0xe0/0x1d0
[  368.421184]  [00000000004e91d4] __writepage+0x1c/0x54
[  368.426319]  [00000000004e9514] write_cache_pages+0x1d4/0x300
[  371.360598] BUG: NMI Watchdog detected LOCKUP on CPU0, ip 0042f520, registers:
[  371.370081] TSTATE: 0000009980001601 TPC: 000000000042f520 TNPC: 000000000042f170 Y: 00000000    Tainted: G      D W 
[  371.379609] TPC: <__delay+0x30/0x4c>
[  371.384111] g0: 0000000000000000 g1: 000000000042f170 g2: 00000000008e2440 g3: fffff803ff922561
[  371.393235] g4: fffff803f68eed00 g5: fffff80011244000 g6: fffff803f68f0000 g7: 0000000000bb0e18
[  371.403069] o0: 0000000012fda000 o1: 000000000046e9ec o2: fffff803ff922c08 o3: fffff803ff922561
[  371.413571] o4: 0000000000000000 o5: 0000001900000058 sp: fffff803ff922321 ret_pc: 000000000042f51c
[  371.424803] RPC: <__delay+0x2c/0x4c>
[  371.430468] l0: 00000064a73050ff l1: 00000000008e23c8 l2: fffff803f78edec0 l3: 0000000000000000
[  371.442169] l4: fffff80011dc2c88 l5: 0000000000000058 l6: 0000000000000000 l7: 0000000000000008
[  371.454500] i0: 0000000045707d40 i1: 000000000046e9ec i2: fffff803ff922561 i3: 0000000000045c34
[  371.467462] i4: 0000000000000000 i5: fffff803ff922b28 i6: fffff803ff9223e1 i7: 000000000042f548
[  371.481096] I7: <udelay+0xc/0x1c>
[  371.487949] Call Trace:
[  371.494725] Kernel panic - not syncing: Attempted to kill the idle task!
[  371.501741] Call Trace:
[  371.508588] Press Stop-A (L1-A) to return to the boot prom
[  379.500916] BUG: NMI Watchdog detected LOCKUP on CPU63, ip 0042f520, registers:
[  379.514625] TSTATE: 0000000080e01603 TPC: 000000000042f520 TNPC: 000000000042f170 Y: 00000000    Tainted: G      D W 
[  379.528626] TPC: <__delay+0x30/0x4c>
[  379.535357] g0: 000000000000000e g1: 000000000042f170 g2: 00000000008e2440 g3: fffff803f6d1ee41
[  379.548748] g4: fffff803f6d156a0 g5: fffff80019044000 g6: fffff803f6d1c000 g7: 0000000000cf3e80
[  379.562091] o0: 00000066ef6b447f o1: fffff803f6d1c400 o2: 00000000005ea904 o3: 0000000000000000
[  379.575670] o4: 0000000000000002 o5: fffff803f6d1fc48 sp: fffff803f6d1ee41 ret_pc: 000000000042f51c
[  379.589562] RPC: <__delay+0x2c/0x4c>
[  379.596306] l0: 00000066ef6b447f l1: 00000000008e23c8 l2: 00000000005ea900 l3: 0000000000000400
[  379.609707] l4: 000000000000000e l5: 0000000000000002 l6: 0000000000000000 l7: 0000000000000008
[  379.623163] i0: 0000000000000001 i1: fffff80019bc5c98 i2: 0000000000000000 i3: 0000000000000000
[  379.636819] i4: 0000000000000002 i5: 000000000046b8d8 i6: fffff803f6d1ef01 i7: 00000000005ea90c
[  379.651076] I7: <do_raw_spin_lock+0xbc/0x120>
[  379.658091] Call Trace:
[  379.664946]  [000000000043a7f4] perfctr_irq+0x248/0x3f4
[  379.671840]  [00000000004209f4] tl0_irq15+0x14/0x20
[  379.678477]  [000000000042f520] __delay+0x30/0x4c
[  379.684930]  [00000000005ea90c] do_raw_spin_lock+0xbc/0x120
[  379.691344]  [0000000000763280] _raw_spin_lock+0x58/0x74
[  379.697763]  [000000000046b8d8] scheduler_tick+0x48/0x358
[  379.704073]  [00000000004803d4] update_process_times+0x4c/0x64
[  379.710271]  [000000000049c210] tick_sched_timer+0xb4/0xdc
[  379.716295]  [0000000000492a4c] __run_hrtimer+0x1e8/0x344
[  379.722164]  [0000000000492e1c] hrtimer_interrupt+0xe0/0x1d8
[  379.727870]  [0000000000764534] timer_interrupt+0xe0/0x11c
[  379.733392]  [0000000000426b04] valid_addr_bitmap_patch+0xbc/0x238
[  379.738799]  [000000000042ca74] cpu_idle+0xd8/0x164
[  379.743964]  [000000000075c318] after_lock_tlb+0x1b4/0x1cc
[  379.749241]  [0000000000000000] (null)
[  379.754287] Kernel panic - not syncing: Aiee, killing interrupt handler!
[  379.759425] Call Trace:
[  379.764308]  [000000000075f960] panic+0x64/0x130
[  379.769088]  [0000000000472e2c] do_exit+0x70/0x77c
[  379.773683]  [000000000043a878] perfctr_irq+0x2cc/0x3f4
[  379.778170]  [00000000004209f4] tl0_irq15+0x14/0x20
[  379.782434]  [000000000042f520] __delay+0x30/0x4c
[  379.786508]  [00000000005ea90c] do_raw_spin_lock+0xbc/0x120
[  379.790632]  [0000000000763280] _raw_spin_lock+0x58/0x74
[  379.794745]  [000000000046b8d8] scheduler_tick+0x48/0x358
[  379.798870]  [00000000004803d4] update_process_times+0x4c/0x64
[  379.803028]  [000000000049c210] tick_sched_timer+0xb4/0xdc
[  379.807211]  [0000000000492a4c] __run_hrtimer+0x1e8/0x344
[  379.811379]  [0000000000492e1c] hrtimer_interrupt+0xe0/0x1d8
[  379.815557]  [0000000000764534] timer_interrupt+0xe0/0x11c
[  379.819713]  [0000000000426b04] valid_addr_bitmap_patch+0xbc/0x238
[  379.823892]  [000000000042ca74] cpu_idle+0xd8/0x164
[  379.827963]  [000000000075c318] after_lock_tlb+0x1b4/0x1cc
[  379.832187] Press Stop-A (L1-A) to return to the boot prom
[  380.330871] BUG: NMI Watchdog detected LOCKUP on CPU53, ip 0042f178, registers:
[  380.339479] TSTATE: 0000009980e01605 TPC: 000000000042f178 TNPC: 000000000042f17c Y: 00000000    Tainted: G      D W 
[  380.348735] TPC: <stick_get_tick+0x8/0x18>
[  380.353369] g0: 0000000000461494 g1: 00000067290c71ff g2: 00000000008e2440 g3: 0000000000000000
[  380.362648] g4: fffff803f6b2e7e0 g5: fffff80017c44000 g6: fffff803f6b30000 g7: 0000000000000000
[  380.372507] o0: 0000000000000002 o1: fffff803f6b30400 o2: 000000000042f4f8 o3: 0000000000000000
[  380.383136] o4: 000000000000004f o5: 0000000000000002 sp: fffff803f6b32cc1 ret_pc: 00000000004209f4
[  380.394519] RPC: <tl0_irq15+0x14/0x20>
[  380.400265] l0: 0000000000001000 l1: 0000000080001604 l2: 000000000042f4f4 l3: 0000000000000400
[  380.412106] l4: 000000000000000e l5: 0000000000000002 l6: 0000000000000000 l7: 0000000000000008
[  380.424652] i0: 0000000000000000 i1: fffff803f6b30400 i2: 000000000042f4f4 i3: 0000000000000000
[  380.437935] i4: 000000467c1cc03f i5: 00000046ee702859 i6: fffff803f6b32d81 i7: 000000000042f51c
[  380.451842] I7: <__delay+0x2c/0x4c>
[  380.458851] Call Trace:
[  380.465769]  [000000000043a7f4] perfctr_irq+0x248/0x3f4
[  380.472708]  [00000000004209f4] tl0_irq15+0x14/0x20
[  380.479391]  [000000000042f178] stick_get_tick+0x8/0x18
[  380.485978]  [000000000042f51c] __delay+0x2c/0x4c
[  380.492352]  [00000000005ea90c] do_raw_spin_lock+0xbc/0x120
[  380.498811]  [0000000000763410] _raw_spin_lock_irqsave+0x6c/0x88
[  380.505185]  [000000000045c384] __wake_up+0x10/0x48
[  380.511293]  [000000000046f774] printk_tick+0x70/0x80
[  380.517288]  [00000000004803c4] update_process_times+0x3c/0x64
[  380.523184]  [000000000049c210] tick_sched_timer+0xb4/0xdc
[  380.528889]  [0000000000492a4c] __run_hrtimer+0x1e8/0x344
[  380.534401]  [0000000000492e1c] hrtimer_interrupt+0xe0/0x1d8
[  380.539754]  [0000000000764534] timer_interrupt+0xe0/0x11c
[  380.544960]  [0000000000426b04] valid_addr_bitmap_patch+0xbc/0x238
[  380.550238]  [000000000042ca74] cpu_idle+0xd8/0x164
[  380.555309]  [000000000075c318] after_lock_tlb+0x1b4/0x1cc
[  380.560279] Kernel panic - not syncing: Aiee, killing interrupt handler!
[  380.565173] Call Trace:
[  380.569823]  [000000000075f960] panic+0x64/0x130
[  380.574369]  [0000000000472e2c] do_exit+0x70/0x77c
[  380.578723]  [000000000043a878] perfctr_irq+0x2cc/0x3f4
[  380.582958]  [00000000004209f4] tl0_irq15+0x14/0x20
[  380.586959]  [000000000042f178] stick_get_tick+0x8/0x18
[  380.591022]  [000000000042f51c] __delay+0x2c/0x4c
[  380.595032]  [00000000005ea90c] do_raw_spin_lock+0xbc/0x120
[  380.599112]  [0000000000763410] _raw_spin_lock_irqsave+0x6c/0x88
[  380.603209]  [000000000045c384] __wake_up+0x10/0x48
[  380.607234]  [000000000046f774] printk_tick+0x70/0x80
[  380.611345]  [00000000004803c4] update_process_times+0x3c/0x64
[  380.615518]  [000000000049c210] tick_sched_timer+0xb4/0xdc
[  380.619668]  [0000000000492a4c] __run_hrtimer+0x1e8/0x344
[  380.623790]  [0000000000492e1c] hrtimer_interrupt+0xe0/0x1d8
[  380.627920]  [0000000000764534] timer_interrupt+0xe0/0x11c
[  380.632014]  [0000000000426b04] valid_addr_bitmap_patch+0xbc/0x238
[  380.636207] Press Stop-A (L1-A) to return to the boot prom
[  381.932130] BUG: NMI Watchdog detected LOCKUP on CPU11, ip 0042f518, registers:
[  381.940854] TSTATE: 0000009980e01604 TPC: 000000000042f518 TNPC: 000000000042f51c Y: 00000000    Tainted: G      D W 
[  381.950254] TPC: <__delay+0x28/0x4c>
[  381.954924] g0: 000000000000000e g1: 00000000008e2440 g2: 00000000008e2440 g3: 0000000000000000
[  381.964403] g4: fffff803f7b62800 g5: fffff80012844000 g6: fffff803f7b64000 g7: 0000000000000000
[  381.974455] o0: 0000000000000000 o1: fffff803f7b64400 o2: 00000000005ea904 o3: 0000000000000000
[  381.985256] o4: 000000467c1cc03f o5: 00000046edc6e5bb




* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
From: Steven Rostedt @ 2010-04-13 21:52 UTC (permalink / raw)
  To: sparclinux

On Tue, 2010-04-13 at 14:34 -0700, David Miller wrote:
> From: Rostedt <rostedt@goodmis.org>

> Neener, neener:
> 

:-p

>  101)               |          run_local_timers() {
>  115) + 24.063 us   |            raise_softirq();
>   69)               |          run_local_timers() {
>   85) + 23.623 us   |            hrtimer_run_queues();
>  103)               |          run_local_timers() {
>   66) + 71.749 us   |          }
>   91)               |          run_local_timers() {
>  106) + 23.184 us   |            raise_softirq();
>   65) + 29.117 us   |            raise_softirq();
>   68) + 23.843 us   |            hrtimer_run_queues();
>  112) + 23.623 us   |            hrtimer_run_queues();
>  105) + 24.063 us   |            hrtimer_run_queues();
>   17) + 25.161 us   |            raise_softirq();
>   82) + 24.282 us   |            hrtimer_run_queues();
>  100) + 72.628 us   |          }
> 
> BTW, one thing that drives me nuts on this machine is that catting the
> 'trace' file takes several seconds to start up.  Is it calling
> stop_machine() or something else which is very expensive with high cpu
> counts?

No, it does not call stop_machine, but it stops all tracing on all cpus.
I wonder what is happening :-/

It does allocate an "iterator" that has per cpu descriptors.

Does "trace_pipe" give you the same issues?

-- Steve



^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (6 preceding siblings ...)
  2010-04-13 21:52 ` Steven Rostedt
@ 2010-04-13 21:56 ` David Miller
  2010-04-13 21:57 ` David Miller
                   ` (38 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-13 21:56 UTC (permalink / raw)
  To: sparclinux

From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 13 Apr 2010 23:51:46 +0200

> More than that actually, I tried and it just crashed :-)

Strange, something to debug :-)  I'll try to take a look
at what might cause this.



^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (7 preceding siblings ...)
  2010-04-13 21:56 ` David Miller
@ 2010-04-13 21:57 ` David Miller
  2010-04-13 21:57 ` Frederic Weisbecker
                   ` (37 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-13 21:57 UTC (permalink / raw)
  To: sparclinux

From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 13 Apr 2010 17:52:21 -0400

> On Tue, 2010-04-13 at 14:34 -0700, David Miller wrote:
>> BTW, one thing that drives me nuts on this machine is that catting the
>> 'trace' file takes several seconds to start up.  Is it calling
>> stop_machine() or something else which is very expensive with high cpu
>> counts?
> 
> No, it does not call stop_machine, but it stops all tracing on all cpus.
> I wonder what is happening :-/
> 
> It does allocate an "iterator" that has per cpu descriptors.
> 
> Does "trace_pipe" give you the same issues?

Ok, I'll try to track down the cause, thanks Steven.

'perf' works fine in this configuration so maybe that will
provide some clues.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (8 preceding siblings ...)
  2010-04-13 21:57 ` David Miller
@ 2010-04-13 21:57 ` Frederic Weisbecker
  2010-04-13 22:05 ` Frederic Weisbecker
                   ` (36 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: Frederic Weisbecker @ 2010-04-13 21:57 UTC (permalink / raw)
  To: sparclinux

On Tue, Apr 13, 2010 at 02:35:37PM -0700, David Miller wrote:
> From: Frederic Weisbecker <fweisbec@gmail.com>
> Date: Tue, 13 Apr 2010 21:45:07 +0200
> 
> > Heh yeah. The trace/trace_pipe in the main directory are basically
> > unusable there. Only per_cpu can make it :)
> 
> Does it delay for several seconds at the beginning when you 'cat' them
> just like it does for me?


Maybe the stall started before.

Enabling the tracer now gives me the same result:

echo function_graph > current_tracer


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (9 preceding siblings ...)
  2010-04-13 21:57 ` Frederic Weisbecker
@ 2010-04-13 22:05 ` Frederic Weisbecker
  2010-04-13 22:11 ` David Miller
                   ` (35 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: Frederic Weisbecker @ 2010-04-13 22:05 UTC (permalink / raw)
  To: sparclinux

On Tue, Apr 13, 2010 at 02:56:38PM -0700, David Miller wrote:
> From: Frederic Weisbecker <fweisbec@gmail.com>
> Date: Tue, 13 Apr 2010 23:51:46 +0200
> 
> > More than that actually, I tried and it just crashed :-)
> 
> Strange, something to debug :-)  I'll try to take a look
> at what might cause this.


Here is another, again by enabling the tracer.
One problem with the function graph tracer is that it tends
to infect the backtraces, since it overrides return addresses.

We have hooks to bypass that in x86, IIRC.


[  126.704213] hrtimer: interrupt took 8360145 ns
[  127.898433] ------------[ cut here ]------------
[  127.900406] WARNING: at kernel/sched.c:3568 sub_preempt_count+0x98/0xb8()
[  127.902768] Modules linked in: loop usbhid ehci_hcd ohci_hcd e1000e sg sr_mod cdrom
[  127.908524] Call Trace:
[  127.909615] ---[ end trace a6e237ec24a26f8b ]---
[  127.911796] FAULT[bash:2183]: 32-bit process reports 64-bit fault address [1dc5d706cc]
[  127.914233] TSTATE: 0000004480001603 TPC: 0000001dc5d706cc TNPC: 0000000000404b40 Y: 0000001d    Tainted: G        
W 
[  127.918356] TPC: <0x1dc5d706cc>
[  127.920107] g0: 0000000000000000 g1: 000000000000ee30 g2: 0000000000000004 g3: 04a0000000000000
[  127.923431] g4: fffff803f233d5c0 g5: fffff80011244000 g6: fffff803f0e4c000 g7: 0000001dc5a27c8f
[  127.926759] o0: 000000000000000e o1: 000000000000001b o2: 0000000000200000 o3: 000000000020009e
[  127.930243] o4: 0000000000000000 o5: fffff803ff992fe8 sp: fffff803ff992761 ret_pc: 0000001dc5cf88b7
[  127.933926] RPC: <0x1dc5cf88b7>
[  127.935311] l0: 000000000049dd30 l1: fffff803ec4c9ae0 l2: fffff803f78edec0 l3: 000000000000000e
[  127.938713] l4: fffff800153c2c88 l5: fffffffffff00000 l6: 00000000f8000000 l7: 00000000000fffff
[  127.942043] i0: 000000000049dd38 i1: 0000000000461494 i2: fffff803ff992971 i3: fffffffffff00000
[  127.945419] i4: 0000000001492088 i5: fffff803ff993008 i6: fffff803ff992831 i7: 0000001dc5d75f38
[  127.948925] I7: <0x1dc5d75f38>
[  127.950669] FAULT[bash:2183]: 32-bit process reports 64-bit fault address [1dc5d70000]
[  127.953950] TSTATE: 0000004480001606 TPC: 000000000044e350 TNPC: 000000000044e400 Y: 00000000    Tainted: G        
W 
[  127.957602] TPC: <get_fault_insn+0x40/0xf8>
[  127.959187] g0: 000000000044e318 g1: 0000000000000400 g2: 00000000000012a8 g3: 0000001dc5d706cc
[  127.962599] g4: fffff803f233d5c0 g5: fffff80011244000 g6: fffff803f0e4c000 g7: 0000000000000002
[  127.965623] o0: 00000000008f0500 o1: fffff803f233da68 o2: 0000000000000887 o3: 0000001dc5d706cc
[  127.968722] o4: 0000000000000000 o5: fffff803ff992bc8 sp: fffff803ff9923e1 ret_pc: 000000000044e318
[  127.972222] RPC: <get_fault_insn+0x8/0xf8>
[  127.974159] l0: 0000000000407b08 l1: fffff803ff992601 l2: 0000000000404b3c l3: 0000000000000400
[  127.976987] l4: 0000000000000000 l5: 000000000000001d l6: 0000000000000000 l7: 0000000000000008
[  127.980147] i0: fffff803ff992ec0 i1: 0000000000000000 i2: 0000000000404b40 i3: 0000000000000000
[  127.982703] i4: 0000000000000000 i5: fffff803ff992d28 i6: fffff803ff9924a1 i7: 000000000044eccc
[  127.985441] I7: <do_sparc64_fault+0x670/0x6d0>
[  127.986733] kernel BUG at kernel/sched.c:1164!
[  127.989690] Unable to handle kernel paging request at virtual address 0000001dc5d70000
[  127.992283]               \|/ ____ \|/
[  127.994536]               "@'/ .. \`@"
[  127.996649]               /_| \__/ |_\
[  127.998611]                  \__U_/
[  128.000931] tsk->{mm,active_mm}->context = 0000000000000677
[  128.002756] swapper(0): Kernel bad sw trap 5 [#1]
[  128.005625] tsk->{mm,active_mm}->pgd = fffff803f0f34000
[  128.007538] TSTATE: 0000004480e01607 TPC: 000000000045c40c TNPC: 000000000045c410 Y: 00000000    Tainted: G        
W 
[  128.013332]               \|/ ____ \|/
[  128.014336]               "@'/ .. \`@"
[  128.015289]               /_| \__/ |_\
[  128.016379]                  \__U_/
[  128.018249] TPC: <resched_task+0x50/0xb4>
[  128.020817] bash(2183): Oops [#2]
[  128.021929] g0: 0000000000000000 g1: 000000000000000a g2: 00000000000012a8 g3: fffff803ec624aa0
[  128.027022] TSTATE: 0000004480001606 TPC: 000000000044e350 TNPC: 000000000044e400 Y: 00000000    Tainted: G        
W 
[  128.029768] g4: fffff803f7de4480 g5: fffff80014444000 g6: fffff803f7df0000 g7: 0000000000000001
[  128.035114] TPC: <get_fault_insn+0x40/0xf8>
[  128.036524] o0: 0000000000853cf8 o1: 000000000000048c o2: fffff803ff9a3171 o3: 0000000000000000
[  128.041318] g0: 000000000044e318 g1: 0000000000000400 g2: 00000000000012a8 g3: 0000001dc5d706cc
[  128.043706] o4: 0000000000000002 o5: 0000000000000001 sp: fffff803ff9a2ff1 ret_pc: 00000000005e32dc
[  128.048627] g4: fffff803f233d5c0 g5: fffff80011244000 g6: fffff803f0e4c000 g7: 0000000000000002
[  128.051391] RPC: <ftrace_graph_caller+0x10/0x18>
[  128.054114] o0: 00000000008f0500 o1: fffff803f233da68 o2: 0000000000000887 o3: 0000001dc5d706cc
[  128.056845] l0: 00000000004631c4 l1: fffff803ff9a3171 l2: ffffffffffffffff l3: fffff803ff9a39a0
[  128.061499] o4: 0000000000000000 o5: fffff803ff992bc8 sp: fffff803ff9923e1 ret_pc: 000000000044e318
[  128.064283] l4: fffff803f7de4480 l5: 0006000000000000 l6: 000000000000000e l7: 00000000008d0180
[  128.069543] RPC: <get_fault_insn+0x8/0xf8>
[  128.070915] i0: fffff803f233d5c0 i1: fffff803f7e297a0 i2: 0000000000000001 i3: 0000000000000000
[  128.076055] l0: 0000000000407b08 l1: fffff803ff992601 l2: 0000000000404b3c l3: 0000000000000400
[  128.079017] i4: 0000000000000002 i5: 0000000000b84340 i6: fffff803ff9a30b1 i7: 00000000005e32dc
[  128.083496] l4: 0000000000000000 l5: 000000000000001d l6: 0000000000000000 l7: 0000000000000008
[  128.085823] I7: <ftrace_graph_caller+0x10/0x18>
[  128.088213] i0: fffff803ff992ec0 i1: 0000000000000000 i2: 0000000000404b40 i3: 0000000000000000
[  128.090874] Caller[00000000005e32dc]: ftrace_graph_caller+0x10/0x18
[  128.094238] i4: 0000000000000000 i5: fffff803ff992d28 i6: fffff803ff9924a1 i7: 000000000044eccc
[  128.096690] Caller[00000000005e32dc]: ftrace_graph_caller+0x10/0x18
[  128.100142] I7: <do_sparc64_fault+0x670/0x6d0>
[  128.101565] Caller[00000000005e32dc]: ftrace_graph_caller+0x10/0x18
[  128.104808] Instruction DUMP:Caller[00000000005e32dc]: ftrace_graph_caller+0x10/0x18
[  128.108516]  22c84004 Caller[00000000005e32dc]: ftrace_graph_caller+0x10/0x18
[  128.111996]  c2592238 Caller[00000000005e32dc]: ftrace_graph_caller+0x10/0x18
[  128.115527]  1068002d Caller[00000000005e32dc]: ftrace_graph_caller+0x10/0x18
[  128.118550] <f200c000>Caller[00000000005e32dc]: ftrace_graph_caller+0x10/0x18
[  128.121330]  8530f022 Caller[00000000005e32dc]: ftrace_graph_caller+0x10/0x18
[  128.124087]  8408a7ff Caller[00000000005e32dc]: ftrace_graph_caller+0x10/0x18
[  128.126775]  c2586048 Caller[0000000000426b04]: valid_addr_bitmap_patch+0xbc/0x238
[  128.129547]  8528b002 Caller[0000000000461428]: finish_task_switch+0x28/0x8c
[  128.132215]  c4004002 Caller[00000000005e32dc]: ftrace_graph_caller+0x10/0x18
[  128.134553] 
[  128.135119] Caller[000000000042cae8]: cpu_idle+0x14c/0x164
[  128.137717] Caller[000000000075c318]: after_lock_tlb+0x1b4/0x1cc
[  128.140182] Caller[0000000000000000]: (null)
[  128.141819] Instruction DUMP: 9210248c  7fff372b  901220f8 <91d02005> c25a6008  82086008  0ac84014  92026008  
4006199a 
[  128.147653] Kernel panic - not syncing: Aiee, killing interrupt handler!
[  128.148442] Call Trace:
[  128.148795]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  128.149401]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  128.150005]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  128.150612]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  128.151217]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  128.151823]  [000000000045c40c] resched_task+0x50/0xb4
[  128.152394]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  128.153004]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  128.153613]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  128.154217]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  128.154822]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  128.155427]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  128.156032]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  128.156640]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  128.157245]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  128.157850]  [00000000005e32dc] ftrace_graph_caller+0x10/0x18
[  128.158693] Press Stop-A (L1-A) to return to the boot prom
[  128.160671] Unable to handle kernel NULL pointer dereference
[  128.161055] tsk->{mm,active_mm}->context = 0000000000000677
[  128.161606] tsk->{mm,active_mm}->pgd = fffff803f0f34000
[  128.162121]               \|/ ____ \|/
[  128.162427]               "@'/ .. \`@"
[  128.162730]               /_| \__/ |_\
[  128.163035]                  \__U_/
[  128.163421] bash(2183): Oops [#3]
[  128.163849] TSTATE: 0000004480001603 TPC: 0000000000473078 TNPC: 00000000004af184 Y: 00000000    Tainted: G      
D W 
[  128.164835] TPC: <do_exit+0x2bc/0x77c>
[  128.165245] g0: 0000000000000000 g1: 0000000000000000 g2: 00000000000012a8 g3: 00000000008ff7b8
[  128.166036] g4: fffff803f233d5c0 g5: fffff80011244000 g6: fffff803f0e4c000 g7: 0000000000000002
[  128.166829] o0: 0000000000000001 o1: fffff803f6ff7c04 o2: 0000000000000001 o3: 0000000000000000
[  128.167621] o4: 0000000000855960 o5: 0000000000000001 sp: fffff803ff991ea1 ret_pc: 0000000000473074
[  128.168483] RPC: <do_exit+0x2b8/0x77c>
[  128.168896] l0: fffff803ff992be0 l1: fffff803f233d5c0 l2: fffff803f233d5c0 l3: 0000004480001606
[  128.169687] l4: 0000000000000000 l5: 00000000008f2700 l6: 0000000000000001 l7: 000000003b9aca00
[  128.170482] i0: 0000000000000009 i1: 0000000000000020 i2: 00000000c4004002 i3: 0000000000000020
[  128.171281] i4: 00000000000000ff i5: 000000000000000b i6: fffff803ff991f71 i7: 00000000005e32dc
[  128.172114] I7: <ftrace_graph_caller+0x10/0x18>
[  128.172592] Instruction DUMP: a4100011  c2586030  4000f044 <d0586048> 4001abd1  90100011  8207a7df  90100011  
c277a7e7 
[  128.174516] Fixing recursive fault but reboot is needed!


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (10 preceding siblings ...)
  2010-04-13 22:05 ` Frederic Weisbecker
@ 2010-04-13 22:11 ` David Miller
  2010-04-13 23:34 ` David Miller
                   ` (34 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-13 22:11 UTC (permalink / raw)
  To: sparclinux

From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 14 Apr 2010 00:05:43 +0200

> [  127.911796] FAULT[bash:2183]: 32-bit process reports 64-bit fault address [1dc5d706cc]

Any time I see this in a trace it usually means that the
register set of the cpu has been completely corrupted.

I just got a crash shortly after catting the 'trace' file
on my Niagara2 box too.  And I had never seen this before.

One thing that happened since I last tested is I did
a merge of Linus's tree into the sparc-2.6 tree I was
doing testing on.

So I wonder if some tracing et al. change in that merge
introduced this problem.

I'll take a look at the commits under kernel/trace for clues.
Thanks for the heads up about cleaning up the backtraces a la
x86.  I can't decide which is more useful for me, since at least
half of the time I'll want to see the real return address values
in the cpu registers. :-)


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (11 preceding siblings ...)
  2010-04-13 22:11 ` David Miller
@ 2010-04-13 23:34 ` David Miller
  2010-04-13 23:56 ` David Miller
                   ` (33 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-13 23:34 UTC (permalink / raw)
  To: sparclinux

From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 13 Apr 2010 17:52:21 -0400

> On Tue, 2010-04-13 at 14:34 -0700, David Miller wrote:
>> BTW, one thing that drives me nuts on this machine is that catting the
>> 'trace' file takes several seconds to start up.  Is it calling
>> stop_machine() or something else which is very expensive with high cpu
>> counts?
> 
> No, it does not call stop_machine, but it stops all tracing on all cpus.
> I wonder what is happening :-/
> 
> It does allocate an "iterator" that has per cpu descriptors.
> 
> Does "trace_pipe" give you the same issues?

Indeed, "trace_pipe" does not show the stall.

And when I 'perf' the stalling case, nothing interesting shows up.

Then I went snooping around the call path for this stuff and looked at
ring_buffer_read_start()

That does a synchronize_sched(), and this function is invoked for
every cpu when we open the "trace" file.

So this likely explains the delay.

This synchronize_sched() is meant to make sure that the buffer's cpu
has finished any pending ring buffer writes, and thus will see the new
->record_disabled setting.  It is only at that point that we can
safely call rb_iter_reset() and start reading.

But synchronize_sched() waits for an RCU grace period considering all
cpus, so this is overkill since we only care about writes to a
particular cpu's buffer, not all cpu buffers.

I think the criterion we are looking for is that "cpu X has returned
from a function", since that would guarantee that it left the tracing
code paths.  So maybe there's some clever way we can do this more
cheaply.
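
As a rough sketch of why it is so slow (simplified from memory, not the
exact kernel code; the loop and field names are only illustrative):

	/* Opening "trace" sets up one iterator per traced cpu. */
	for_each_tracing_cpu(cpu) {
		iter->buffer_iter[cpu] =
			ring_buffer_read_start(iter->tr->buffer, cpu);
		/*
		 * ring_buffer_read_start() roughly does:
		 *	atomic_inc(&cpu_buffer->record_disabled);
		 *	synchronize_sched();	<- one grace period per cpu
		 *	rb_iter_reset(iter);
		 * so the open() pays one full RCU-sched grace period per cpu.
		 */
	}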

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (12 preceding siblings ...)
  2010-04-13 23:34 ` David Miller
@ 2010-04-13 23:56 ` David Miller
  2010-04-14  1:59 ` David Miller
                   ` (32 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-13 23:56 UTC (permalink / raw)
  To: sparclinux

From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 14 Apr 2010 00:05:43 +0200

> [  126.704213] hrtimer: interrupt took 8360145 ns

Ok, I can trigger this and then I get watchdog timeouts saying cpu X
was stuck for 61 seconds etc.

This is just using the function tracer.

If I dump all the cpu regs (Frederic, this is obtained by using 'echo
"y" >/proc/sysrq-trigger' or BREAK+Y on console, you might find this
useful :-) then I see the stuck cpus are in the function tracer code
path, often in the ring buffer entry allocator.

I can "unstick" the machine if I am able to echo "0" into
tracing_enabled from one of my shells.

This is beginning to smell like a problem wherein we re-enter the
tracer from the tracer and for some reason we can't get out of the
cycle.  Maybe I forgot to annotate a helper function or file on
sparc64 to elide mcount calls.

Anyways, that is one possibility.

But, since we see corruptions sometimes too, this could also point to
some issue with the sparc64 specific ftrace assembler stubs and
instruction patching.

I'll dig further.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (13 preceding siblings ...)
  2010-04-13 23:56 ` David Miller
@ 2010-04-14  1:59 ` David Miller
  2010-04-14  9:04 ` David Miller
                   ` (31 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-14  1:59 UTC (permalink / raw)
  To: sparclinux

From: David Miller <davem@davemloft.net>
Date: Tue, 13 Apr 2010 16:56:39 -0700 (PDT)

> I'll dig further.

Ok Frederic, I think I figured it out.  We only save 128 bytes
of stack space in the mcount stubs, but that's not enough.

128 bytes is enough for the register window, but any function we call
is allowed to save the first six incoming arguments to pre-assigned
stack frame slots.

That means we have to allocate, at a minimum, 176 bytes of stack space
if we call any C functions.  (176 = 128 + (8 * 6))

These arg save stores only happen in functions where there is a
bit of register pressure, which is why we only sometimes see this.

The net result is that when we hit this, the parent's stack frame gets
written with garbage.  And that's why the crashes look so random.

I'll commit this to sparc-2.6 and push it out to Linus.  I'm beating
up a machine pretty hard with various tracers enabled and I haven't
seen any suspicious symptoms yet :-)

--------------------
sparc64: Allocate sufficient stack space in ftrace stubs.

128 bytes is sufficient for the register window save area, but the
calling conventions allow the callee to save up to 6 incoming argument
registers into the stack frame after the register window save area.

This means a minimal stack frame is 176 bytes (128 + (6 * 8)).

This fixes random crashes when using the function tracer.

Reported-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/lib/mcount.S |    8 ++++----
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/sparc/lib/mcount.S b/arch/sparc/lib/mcount.S
index 3753e3c..3ad6cbd 100644
--- a/arch/sparc/lib/mcount.S
+++ b/arch/sparc/lib/mcount.S
@@ -34,7 +34,7 @@ mcount:
 	cmp		%g1, %g2
 	be,pn		%icc, 1f
 	 mov		%i7, %g3
-	save		%sp, -128, %sp
+	save		%sp, -176, %sp
 	mov		%g3, %o1
 	jmpl		%g1, %o7
 	 mov		%i7, %o0
@@ -56,7 +56,7 @@ mcount:
 	 nop
 5:	mov		%i7, %g2
 	mov		%fp, %g3
-	save		%sp, -128, %sp
+	save		%sp, -176, %sp
 	mov		%g2, %l0
 	ba,pt		%xcc, ftrace_graph_caller
 	 mov		%g3, %l1
@@ -85,7 +85,7 @@ ftrace_caller:
 	lduw		[%g1 + %lo(function_trace_stop)], %g1
 	brnz,pn		%g1, ftrace_stub
 	 mov		%fp, %g3
-	save		%sp, -128, %sp
+	save		%sp, -176, %sp
 	mov		%g2, %o1
 	mov		%g2, %l0
 	mov		%g3, %l1
@@ -120,7 +120,7 @@ ENTRY(ftrace_graph_caller)
 END(ftrace_graph_caller)
 
 ENTRY(return_to_handler)
-	save		%sp, -128, %sp
+	save		%sp, -176, %sp
 	call		ftrace_return_to_handler
 	 mov		%fp, %o0
 	jmpl		%o0 + 8, %g0
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (14 preceding siblings ...)
  2010-04-14  1:59 ` David Miller
@ 2010-04-14  9:04 ` David Miller
  2010-04-14 15:29 ` Frederic Weisbecker
                   ` (30 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-14  9:04 UTC (permalink / raw)
  To: sparclinux

From: David Miller <davem@davemloft.net>
Date: Tue, 13 Apr 2010 18:59:31 -0700 (PDT)

> From: David Miller <davem@davemloft.net>
> Date: Tue, 13 Apr 2010 16:56:39 -0700 (PDT)
> 
>> I'll dig further.
> 
> Ok Frederic, I think I figured it out.  We only save 128 bytes
> of stack space in the mcount stubs, but that's not enough.

Ok, one more fix after this one.

The other problem I seem to be hitting is that we overrun the main
kernel stack with tracing enabled.

Let's run NMIs on the hardirq stack to abate this problem for now,
patch below which I'll also push to sparc-2.6

The only remaining issue I have is that one of the cpus can get stuck
in the scheduler load balancer.  Whatever NOHZ cpu gets selected to
balance idle domains can get stuck because the time it takes to
rebalance domains with tracing enabled exceeds the interval of the
scheduler tick, so it just balances endlessly and never makes any
forward progress from the initial timer interrupt.

I think I might jack up the scheduler domain min intervals to make
that less of an issue, but long term this expensive set of
calculations needs to be fixed.  I have made Peter Z. aware of this
and hopefully between him and me we can come up with a
better fix long-term.

--------------------
sparc64: Run NMIs on the hardirq stack.

Otherwise we can overflow the main stack with the function tracer
enabled.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/kernel/irq_64.c |   19 +------------------
 arch/sparc/kernel/kstack.h |   19 +++++++++++++++++++
 arch/sparc/kernel/nmi.c    |    7 +++++++
 3 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index 2b04c72..830d70a 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -47,6 +47,7 @@
 
 #include "entry.h"
 #include "cpumap.h"
+#include "kstack.h"
 
 #define NUM_IVECS	(IMAP_INR + 1)
 
@@ -713,24 +714,6 @@ void ack_bad_irq(unsigned int virt_irq)
 void *hardirq_stack[NR_CPUS];
 void *softirq_stack[NR_CPUS];
 
-static __attribute__((always_inline)) void *set_hardirq_stack(void)
-{
-	void *orig_sp, *sp = hardirq_stack[smp_processor_id()];
-
-	__asm__ __volatile__("mov %%sp, %0" : "=r" (orig_sp));
-	if (orig_sp < sp ||
-	    orig_sp > (sp + THREAD_SIZE)) {
-		sp += THREAD_SIZE - 192 - STACK_BIAS;
-		__asm__ __volatile__("mov %0, %%sp" : : "r" (sp));
-	}
-
-	return orig_sp;
-}
-static __attribute__((always_inline)) void restore_hardirq_stack(void *orig_sp)
-{
-	__asm__ __volatile__("mov %0, %%sp" : : "r" (orig_sp));
-}
-
 void __irq_entry handler_irq(int irq, struct pt_regs *regs)
 {
 	unsigned long pstate, bucket_pa;
diff --git a/arch/sparc/kernel/kstack.h b/arch/sparc/kernel/kstack.h
index 5247283..53dfb92 100644
--- a/arch/sparc/kernel/kstack.h
+++ b/arch/sparc/kernel/kstack.h
@@ -61,4 +61,23 @@ check_magic:
 
 }
 
+static inline __attribute__((always_inline)) void *set_hardirq_stack(void)
+{
+	void *orig_sp, *sp = hardirq_stack[smp_processor_id()];
+
+	__asm__ __volatile__("mov %%sp, %0" : "=r" (orig_sp));
+	if (orig_sp < sp ||
+	    orig_sp > (sp + THREAD_SIZE)) {
+		sp += THREAD_SIZE - 192 - STACK_BIAS;
+		__asm__ __volatile__("mov %0, %%sp" : : "r" (sp));
+	}
+
+	return orig_sp;
+}
+
+static inline __attribute__((always_inline)) void restore_hardirq_stack(void *orig_sp)
+{
+	__asm__ __volatile__("mov %0, %%sp" : : "r" (orig_sp));
+}
+
 #endif /* _KSTACK_H */
diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index 75a3d1a..a4bd7ba 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -23,6 +23,8 @@
 #include <asm/ptrace.h>
 #include <asm/pcr.h>
 
+#include "kstack.h"
+
 /* We don't have a real NMI on sparc64, but we can fake one
  * up using profiling counter overflow interrupts and interrupt
  * levels.
@@ -92,6 +94,7 @@ static void die_nmi(const char *str, struct pt_regs *regs, int do_panic)
 notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs)
 {
 	unsigned int sum, touched = 0;
+	void *orig_sp;
 
 	clear_softint(1 << irq);
 
@@ -99,6 +102,8 @@ notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs)
 
 	nmi_enter();
 
+	orig_sp = set_hardirq_stack();
+
 	if (notify_die(DIE_NMI, "nmi", regs, 0,
 		       pt_regs_trap_type(regs), SIGINT) == NOTIFY_STOP)
 		touched = 1;
@@ -124,6 +129,8 @@ notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs)
 		pcr_ops->write(pcr_enable);
 	}
 
+	restore_hardirq_stack(orig_sp);
+
 	nmi_exit();
 }
 
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (15 preceding siblings ...)
  2010-04-14  9:04 ` David Miller
@ 2010-04-14 15:29 ` Frederic Weisbecker
  2010-04-14 15:48 ` Frederic Weisbecker
                   ` (29 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: Frederic Weisbecker @ 2010-04-14 15:29 UTC (permalink / raw)
  To: sparclinux

On Wed, Apr 14, 2010 at 02:04:49AM -0700, David Miller wrote:
> From: David Miller <davem@davemloft.net>
> Date: Tue, 13 Apr 2010 18:59:31 -0700 (PDT)
> 
> > From: David Miller <davem@davemloft.net>
> > Date: Tue, 13 Apr 2010 16:56:39 -0700 (PDT)
> > 
> >> I'll dig further.
> > 
> > Ok Frederic, I think I figured it out.  We only save 128 bytes
> > of stack space in the mcount stubs, but that's not enough.
> 
> Ok, one more fix after this one.
> 
> The other problem I seem to be hitting is that we overrun the main
> kernel stack with tracing enabled.
> 
> Let's run NMIs on the hardirq stack to abate this problem for now,
> patch below which I'll also push to sparc-2.6
> 
> The only remaining issue I have is that one of the cpus can get stuck
> in the scheduler load balancer.  Whatever NOHZ cpu gets selected to
> balance idle domains can get stuck because the time it takes to
> rebalance domains with tracing enabled exceeds the interval of the
> scheduler tick, so it just balances endlessly and never makes any
> forward progress from the initial timer interrupt.


I observed the same thing yesterday: cpu 0 was stuck in the balancing
path.

Anyway, I'm testing your patches.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (16 preceding siblings ...)
  2010-04-14 15:29 ` Frederic Weisbecker
@ 2010-04-14 15:48 ` Frederic Weisbecker
  2010-04-14 23:08 ` David Miller
                   ` (28 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: Frederic Weisbecker @ 2010-04-14 15:48 UTC (permalink / raw)
  To: sparclinux

On Tue, Apr 13, 2010 at 06:59:31PM -0700, David Miller wrote:
> From: David Miller <davem@davemloft.net>
> Date: Tue, 13 Apr 2010 16:56:39 -0700 (PDT)
> 
> > I'll dig further.
> 
> Ok Frederic, I think I figured it out.  We only save 128 bytes
> of stack space in the mcount stubs, but that's not enough.
> 
> 128 bytes is enough for the register window, but any function we call
> is allowed to save the first six incoming arguments to pre-assigned
> stack frame slots.
> 
> That means we have to allocate, at a minimum, 176 bytes of stack space
> if we call any C functions.  (176 = 128 + (8 * 6))
> 
> These arg save stores only happen in functions where there is a
> bit of register pressure, that's why we only sometimes see this.
> 
> The net result is that when we hit this, the parent's stack frame gets
> written with garbage.  And that's why the crashes look so random.
> 
> I'll commit this to sparc-2.6 and push it out to Linus.  I'm beating
> up a machine pretty hard with various tracers enabled and I haven't
> seen any suspicious symptoms yet :-)
> 
> --------------------
> sparc64: Allocate sufficient stack space in ftrace stubs.
> 
> 128 bytes is sufficient for the register window save area, but the
> calling conventions allow the callee to save up to 6 incoming argument
> registers into the stack frame after the register window save area.
> 
> This means a minimal stack frame is 176 bytes (128 + (6 * 8)).
> 
> This fixes random crashes when using the function tracer.
> 
> Reported-by: Frederic Weisbecker <fweisbec@gmail.com>
> Signed-off-by: David S. Miller <davem@davemloft.net>


Ok, I've applied the two patches and they partially fix the
problems.

The crashes happen less often when I run a manual
loop of

	echo function_graph > current_tracer
	echo nop > current_tracer

But after some trials:

[  711.854617] hrtimer: interrupt took 8050861 ns
[  746.124873] ------------[ cut here ]------------
[  746.410622] WARNING: at kernel/sched.c:3568 ftrace_graph_caller+0x10/0x18()
[  746.697698] Modules linked in: loop usbhid ehci_hcd ohci_hcd e1000e sg sr_mod cdrom
[  747.034140] Call Trace:
[  747.036503]  [00000000005e331c] ftrace_graph_caller+0x10/0x18
[  747.040395]  [00000000005e331c] ftrace_graph_caller+0x10/0x18
[  747.044273] ---[ end trace 5213554e436b8b78 ]---
[  747.059057] Unable to handle kernel NULL pointer dereference
[  747.062799] tsk->{mm,active_mm}->context = 0000000000000661


Or another kind:

(note the hrtimer warnings are normal. This is a hang prevention
that was first added because of the function graph tracer but
eventually serves as a general protection for hrtimer. It's quite
similar to the balancing problem scheme: servicing the timers is
so slow that timers re-expire before we exit the servicing loop,
so we risk an endless loop).


[  100.159892] hrtimer: interrupt took 8747191 ns
[  128.534074] huh, entered softirq 1 TIMER 000000000047ed70 preempt_count 00000103, exited with 00000100?
[  129.128339] ------------[ cut here ]------------
[  129.450705] Unable to handle kernel NULL pointer dereference
[  129.599834] tsk->{mm,active_mm}->context = 000000000000067b
[  129.741548] tsk->{mm,active_mm}->pgd = fffff803f0fac000
[  129.880609]               \|/ ____ \|/
[  129.881840]               "@'/ .. \`@"
[  129.883098]               /_| \__/ |_\
[  129.884410]                  \__U_/
[  130.433709] ksoftirqd/35(109): Oops [#1]
[  130.568083] TSTATE: 0000000080e01607 TPC: 0000000000476570 TNPC: 0000000000476574 Y: 00000000    Not tainted
[  130.846883] TPC: <irq_exit+0xd4/0xdc>
[  130.980036] g0: 000000000049c19c g1: 0000000000000000 g2: 00000000000012a8 g3: 00000000008ff7b8
[  131.250929] g4: fffff803f7fad900 g5: fffff80011244000 g6: fffff803f7fd4000 g7: 0000000000000002
[  131.543092] o0: 0000000000000001 o1: 0000000000000000 o2: 0000000000000001 o3: fffff803ff9533e8
l NULL pointer dereference
[  132.598525] tsk->{mm,active_mm}->context = 0000000000000513
[  132.598549] tsk->{mm,active_mm}->pgd = fffff803f0ba8000
[  132.598573]               \|/ ____ \|/
[  132.598585]               "@'/ .. \`@"
[  132.598597]               /_| \__/ |_\
[  132.598611]                  \__U_/
[  132.598637] events/0(195): Oops [#3]
[  132.598683] TSTATE: 0000004480e01606 TPC: 0000000000466424 TNPC: 0000000000466428 Y: 00000000    Tainted: G      
D   
[  132.598772] TPC: <set_next_entity+0x10/0xa8>
[  132.598798] g0: 00000000008e7aa0 g1: 0000000000000001 g2: 00000000000012a8 g3: 00000000008ff7b8
[  132.598832] g4: 00000000008e7aa0 g5: fffff80011244000 g6: 00000000008cc000 g7: 0000000000000002
[  132.598861] o0: 00000000008f0d50 o1: 0000000000466520 o2: 00000000008cf401 o3: 0000000000000000
[  132.598890] o4: fffff803f6d60000 o5: 0000000000000000 sp: 00000000008cf281 ret_pc: 000000000046641c
[  132.598927] RPC: <set_next_entity+0x8/0xa8>
[  132.598948] l0: 00000000007602d0 l1: 00000000008cf401 l2: ffffffffffffffff l3: 0000000000000400
[  132.598977] l4: 00000000008e7aa0 l5: 0006000000000000 l6: 000000000000000e l7: 0000000000000008
[  132.599005] i0: fffff80011dc5d18 i1: 0000000000000000 i2: 0000000000000001 i3: 0000000000000000
[  132.599033] i4: 0000000000000002 i5: 0000000000000000 i6: 00000000008cf341 i7: 00000000004665d8
[  132.599067] I7: <pick_next_task_fair+0xc8/0xe8>
[  132.599084] Instruction DUMP: 110023c3  4005f3ad  90122150 <c2066038> 80a06000  02480011  84063f68  90100018  
7fffffa6 
[  132.599160] note: events/0[195] exited with preempt_count 2
[  131.546191] o4: 0000000000855960 o5: 0000000000000001 sp: fffff803ff952aa1 ret_pc: 0000000000476568
[  131.549532] RPC: <irq_exit+0xcc/0xdc>
[  131.551039] l0: 000000000000804e l1: 0000000000bf3d10 l2: 0000000000bf3d20 l3: 00000000008d0180
[  131.554218] l4: 000000000076457c l5: fffff800163c2c88 l6: 0000000000000001 l7: fffff803eccaa038
[  131.557243] i0: 0000000000000000 i1: 0000001616202980 i2: fffff803ff952b81 i3: 00000000005e3328
[  131.560247] i4: 0000000000000001 i5: fffff800163c2c88 i6: 0000000000000001 i7: fffff803eccaa038
[  131.564010] I7: <0xfffff803eccaa038>
[  131.565656] Instruction DUMP: 90102000  7fffabd9  90102001 <81cfe008> 01000000  9de3bf40  110023cd  4005b354  
901220c0 
[  131.575366] Unable to handle kernel NULL pointer dereference
[  131.577418] tsk->{mm,active_mm}->context = 000000000000067b
[  131.579556] tsk->{mm,active_mm}->pgd = fffff803f0fac000
[  131.581611]               \|/ ____ \|/
[  131.582683]               "@'/ .. \`@"
[  131.583668]               /_| \__/ |_\
[  131.584740]                  \__U_/
[  131.586339] ksoftirqd/35(109): Oops [#2]
[  131.588374] TSTATE: 0000004480e01604 TPC: 00000000004730b8 TNPC: 00000000004af1c4 Y: 00000000    Tainted: G      
D   
[  131.591849] TPC: <do_exit+0x2bc/0x77c>
[  131.593318] g0: 0000000000000000 g1: 0000000000000000 g2: 00000000000012a8 g3: 0000000000000015
[  131.596339] g4: fffff803f7fad900 g5: fffff80011244000 g6: fffff803f7fd4000 g7: 0000000000000002
[  131.599249] o0: 0000000000000001 o1: fffff803f7f52624 o2: 0000000000000001 o3: 0000000000000000
[  131.602059] o4: 0000000000855960 o5: 0000000000000001 sp: fffff803ff952561 ret_pc: 00000000004730b4
[  131.605016] RPC: <do_exit+0x2b8/0x77c>
[  131.606452] l0: fffff803ff9532a0 l1: fffff803f7fad900 l2: fffff803f7fad900 l3: 0000000080e01607
[  131.609228] l4: 0000000000000000 l5: 00000000008f2700 l6: 0000000000000001 l7: 000000003b9aca00
[  131.612125] i0: 0000000000000009 i1: 0000000000000020 i2: 00000000901220c0 i3: 0000000000000020
[  131.615054] i4: 00000000000000ff i5: 000000000000000b i6: fffff803ff952631 i7: 00000000005e331c
[  131.618120] I7: <ftrace_graph_caller+0x10/0x18>
[  131.619877] Instruction DUMP: a4100011  c2586030  4000f044 <d0586048> 4001abd1  90100011  8207a7df  90100011  
c277a7e7 
[  131.627682] Fixing recursive fault but reboot is needed!
[  132.598442] ------------[ cut hrnel NULL
p<4>>[  1 32329.48944] Unabointer de efehraenndclee <1>[  132.598499] kerne[  132.890947] WARNING: at 
kernel/trace/trace_functions_graph.c:144 ftrace_return_to_handler+0x94/0x128()
[  132.911378] Bad frame pointer: expected 8cefc1, received fffff803f6d630a1
[  132.911389]   from func get_fault_insn return to 44ed0c
[  132.925314] Modules linked in: loop usbhid ehci_hcd ohci_hcd e1000e sg sr_mod cdrom
[  132.939710] Call Trace:
[  132.946238]  [000000000046ece0] warn_slowpath_common+0x50/0x84
[  132.952799]  [000000000046ed74] warn_slowpath_fmt+0x34/0x44
[  132.959135]  [00000000004d03cc] ftrace_return_to_handler+0x94/0x128
[  132.965529]  [00000000005e3328] return_to_handler+0x4/0x1c
[  132.971834]  [00000000005e331c] ftrace_graph_caller+0x10/0x18
[  132.977974]  [00000000005e331c] ftrace_graph_caller+0x10/0x18
[  132.983840]  [000000000046427c] load_balance+0xcb8/0x111c
[  132.989524]  [00000000005e331c] ftrace_graph_caller+0x10/0x18
[  132.995075]  [0000000000489ab4] worker_thread+0x90/0x358
[  133.000470]  [000000000048e974] kthread+0x64/0x78
[  133.005645]  [000000000042ba68] kernel_thread+0x3c/0x54
[  133.010717]  [000000000048ea4c] kthreadd+0xc4/0x118
[  133.015704] ---[ end trace 04c19b1c514a2c20 ]---
[  133.020612] Kernel panic - not syncing: 
                                           [  133.025362] Call Trace:
[  133.029992]  [000000000075f9a0] panic+0x64/0x130
[  133.034638]  [00000000005e331c] ftrace_graph_caller+0x10/0x18
[  133.039272]  [000000000046427c] load_balance+0xcb8/0x111c
[  133.043798]  [00000000005e331c] ftrace_graph_caller+0x10/0x18
[  133.048261]  [0000000000489ab4] worker_thread+0x90/0x358
[  133.052606]  [000000000048e974] kthread+0x64/0x78
[  133.056797]  [000000000042ba68] kernel_thread+0x3c/0x54
[  133.060914]  [000000000048ea4c] kthreadd+0xc4/0x118



^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (17 preceding siblings ...)
  2010-04-14 15:48 ` Frederic Weisbecker
@ 2010-04-14 23:08 ` David Miller
  2010-04-16  9:12 ` David Miller
                   ` (27 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-14 23:08 UTC (permalink / raw)
  To: sparclinux

From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 14 Apr 2010 17:48:04 +0200

> Ok, I've applied the two patches and they partially fix the
> problems.
> 
> The crashes happen less often when I run a manual
> loop of
> 
> 	echo function_graph > current_tracer
> 	echo nop > current_tracer
> 
> But after some trials:

Thanks for testing, I still get weird crashes too during stress tests.

At this point I think either we're still getting stack overflow due to
deep call chains, or something that is supposed to be NMI safe
actually isn't :-)

Validating the stack overflow theory is pretty simple so I'll do some
tests tonight to check that out.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (18 preceding siblings ...)
  2010-04-14 23:08 ` David Miller
@ 2010-04-16  9:12 ` David Miller
  2010-04-16 15:44 ` Frederic Weisbecker
                   ` (26 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-16  9:12 UTC (permalink / raw)
  To: sparclinux


Hey Frederic, I just wanted you to know that I'm slowly but
surely trying to make progress on these crashes.

I'm trying various different things to narrow down the source of the
corruptions, so here's what I've done so far.

I did some things to eliminate various aspects of the function tracing
code paths, and see if the problem persists.

First, I made function_trace_call() unconditionally return
immediately.
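
Something like this, just to rule the callback body out (an illustrative
hack, not the actual patch I used):

	/* Short-circuit the ftrace callback: neither trace_function() nor
	 * the ring buffer ever gets touched. */
	static void function_trace_call(unsigned long ip, unsigned long parent_ip)
	{
		return;
	}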

Next, I restored function_trace_call() back to normal, and instead
made trace_function() return immediately.

I could not reproduce the corruptions in either of these cases with
the function tracer enabled in situations where I was guaranteed
normally to see a crash.

So the only part of the code paths left is the ring buffer and the
filling in of the entries.

Therefore, what I'm doing now is trying things like running various
hacked up variants of the ring buffer benchmark module while doing
things that usually trigger the bug (for me a "make -j128" is usually
enough) hoping I can trigger corruption.  No luck on that so far but
I'll keep trying this angle just to make sure.

BTW, I noticed that every single time we see the corruptions now, we
always see that "hrtimer: interrupt took xxx ns" message first.  I
have never seen the corruption messages without that reaching the logs
first.

Have you?

That might be an important clue, who knows...

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (19 preceding siblings ...)
  2010-04-16  9:12 ` David Miller
@ 2010-04-16 15:44 ` Frederic Weisbecker
  2010-04-16 20:47 ` David Miller
                   ` (25 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: Frederic Weisbecker @ 2010-04-16 15:44 UTC (permalink / raw)
  To: sparclinux

On Fri, Apr 16, 2010 at 02:12:32AM -0700, David Miller wrote:
> 
> Hey Frederic, I just wanted you to know that I'm slowly but
> surely trying to make progress on these crashes.
> 
> I'm trying various different things to narrow down the source of the
> corruptions, so here's what I've done so far.
> 
> I did some things to eliminate various aspects of the function tracing
> code paths, and see if the problem persists.
> 
> First, I made function_trace_call() unconditionally return
> immediately.
> 
> Next, I restored function_trace_call() back to normal, and instead
> made trace_function() return immediately.
> 
> I could not reproduce the corruptions in either of these cases with
> the function tracer enabled in situations where I was guaranteed
> normally to see a crash.
> 
> So the only part of the code paths left is the ring buffer and the
> filling in of the entries.
> 
> Therefore, what I'm doing now is trying things like running various
> hacked up variants of the ring buffer benchmark module while doing
> things that usually trigger the bug (for me a "make -j128" is usually
> enough) hoping I can trigger corruption.  No luck on that so far but
> I'll keep trying this angle just to make sure.
> 
> BTW, I noticed that every single time we see the corruptions now, we
> always see that "hrtimer: interrupt took xxx ns" message first.  I
> have never seen the corruption messages without that reaching the logs
> first.
> 
> Have you?
> 
> That might be an important clue, who knows...


Yep that's what I told you in my previous mail :)

"""(note the hrtimer warnings are normals. This is a hanging prevention
that has been added because of the function graph tracer first but
eventually serves as a general protection for hrtimer. It's about
similar to the balancing problem scheme: the time to service timers
is so slow that timers re-expire before we exit the servicing loop,
so we risk an endless loop)."""

This comes from the early days of the function graph tracer.
To work on it, I was sometimes using VirtualBox and the function
graph tracer and noticed it was making the system so slow that hrtimers
were hanging (in fact it was also partly provoked by guest switches).

Hence we've made this hang protection, but that's ok, hrtimer
can sort out this situation. Though if it happens too much,
some timers may often be delayed.

That said, it also means there is a problem, I think. It's normal
for it to happen in a guest, but not on a normal box. Maybe there is
contention in the tracer fast path that slows down the machine.

Do you have CONFIG_DEBUG_LOCKDEP enabled? This was one of the
sources of these contentions (fixed lately in -tip but for
.35).


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (20 preceding siblings ...)
  2010-04-16 15:44 ` Frederic Weisbecker
@ 2010-04-16 20:47 ` David Miller
  2010-04-16 22:51 ` David Miller
                   ` (24 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-16 20:47 UTC (permalink / raw)
  To: sparclinux

From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 16 Apr 2010 17:44:21 +0200

> """(note the hrtimer warnings are normals. This is a hanging prevention
> that has been added because of the function graph tracer first but
> eventually serves as a general protection for hrtimer. It's about
> similar to the balancing problem scheme: the time to service timers
> is so slow that timers re-expire before we exit the servicing loop,
> so we risk an endless loop)."""

I don't think it's normal in this case, I suspect we loop because
of some kind of corruption.

> That said, it also means there is a problem, I think. It's normal
> for it to happen in a guest, but not on a normal box. Maybe there is
> contention in the tracer fast path that slows down the machine.

I think it's looping not because of contention, but because of
corrupted memory/registers.

> Do you have CONFIG_DEBUG_LOCKDEP enabled? This was one of the
> sources of these contentions (fixed lately in -tip but for
> .35).

I'm using PROVE_LOCKING but not DEBUG_LOCKDEP.

Anyways, consistently my machine crashes with completely corrupted
registers in either irq_exit() or __do_softirq().  Usually we get an
unaligned access of some sort, either accessing the stack (because %fp
is garbage) or via an indirect call (usually because %i7 is garbage).

One thing that's interesting about the softirq path is that it uses
the softirq stack.  The only things that guard us from jumping onto the
softirq_stack are the tests done by do_softirq(), mainly
!in_interrupt() and whether we have softirqs pending.

What if preempt_count() got corrupted in such a way that we end up
evaluating in_interrupt() to zero when we shouldn't?

If that happens, and this makes us jump onto the top of softirq stack
of the current cpu multiple times, that could cause some wild
corruptions.
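
For reference, the guard in question is roughly this shape (a sketch
modeled on the sparc64 do_softirq() of this era, simplified and not
verbatim):

	void do_softirq(void)
	{
		unsigned long flags;

		if (in_interrupt())	/* relies on preempt_count() being sane */
			return;

		local_irq_save(flags);
		if (local_softirq_pending()) {
			void *orig_sp, *sp = softirq_stack[smp_processor_id()];

			/* Jump onto the per-cpu softirq stack... */
			sp += THREAD_SIZE - 192 - STACK_BIAS;
			__asm__ __volatile__("mov %%sp, %0\n\t"
					     "mov %1, %%sp"
					     : "=&r" (orig_sp) : "r" (sp));
			__do_softirq();
			/* ...and back. */
			__asm__ __volatile__("mov %0, %%sp" : : "r" (orig_sp));
		}
		local_irq_restore(flags);
	}

If in_interrupt() evaluates to zero when it shouldn't, two contexts can
land on the same per-cpu softirq stack and scribble over each other.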

Another thing I've noticed is that there appears to be some kind of
pattern to many of the register corruptions I've seen.  There is
a pattern of 64-bit values that often looks like this (in memory
order):

0xffffffffc3300000
0xffffffffc33000cc
0xffffffffc3d00000
0xffffffffc3d000cc
0xffffffffc4000000
0xffffffffc40000cc

and, from another trace:

0xffffffffc6100000
0xffffffffc61000cc
0xffffffffc6a00000
0xffffffffc6a000cc
0xffffffffc6e00000
0xffffffffc6e000cc

They look like some kind of descriptor.  The closest thing I could
find was the scatter-gather descriptors used by the Fusion mptsas
driver, but I can't find a way that the descriptors would be formed
exactly like the above, though it does come close.

For example, drivers/message/fusion/mptscsih.c:mptscsih_qcmd()
has this call:

		ioc->add_sge((char *)&pScsiReq->SGL,
			MPT_SGE_FLAGS_SSIMPLE_READ | 0,
			(dma_addr_t) -1);

which puts -1 into the address field, but this doesn't exactly line up
because the 32-bit SGE descriptors are in the order "flags" then
"address" not the other way around.

Ho hum... anyways, just looking for clues.  If those are mptsas
descriptors, then it would be consistent with how I've found that the
block I/O path seems to invariably be involved during the crashes.
In another trace (that time with PROVE_LOCKING disabled) I saw
the host->host_lock passed down into spin_lock_irqsave() being NULL.
And this was in the software interrupt handler.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (21 preceding siblings ...)
  2010-04-16 20:47 ` David Miller
@ 2010-04-16 22:51 ` David Miller
  2010-04-16 23:14 ` Frederic Weisbecker
                   ` (23 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-16 22:51 UTC (permalink / raw)
  To: sparclinux


Frederic, I forgot to mention last night that I also did a test
wherein I disabled all of the NMI stuff.  That made no difference
at all for the crashes, which frankly was a relief for me :-)

Anyways, just FYI...

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (22 preceding siblings ...)
  2010-04-16 22:51 ` David Miller
@ 2010-04-16 23:14 ` Frederic Weisbecker
  2010-04-16 23:17 ` David Miller
                   ` (22 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: Frederic Weisbecker @ 2010-04-16 23:14 UTC (permalink / raw)
  To: sparclinux

On Fri, Apr 16, 2010 at 01:47:01PM -0700, David Miller wrote:
> From: Frederic Weisbecker <fweisbec@gmail.com>
> Date: Fri, 16 Apr 2010 17:44:21 +0200
> 
> > """(note the hrtimer warnings are normals. This is a hanging prevention
> > that has been added because of the function graph tracer first but
> > eventually serves as a general protection for hrtimer. It's about
> > similar to the balancing problem scheme: the time to service timers
> > is so slow that timers re-expire before we exit the servicing loop,
> > so we risk an endless loop)."""
> 
> I don't think it's normal in this case, I suspect we loop because
> of some kind of corruption.
> 
> > That said, it also means there is a problem, I think. It's normal
> > for it to happen in a guest, but not on a normal box. Maybe there is
> > contention in the tracer fast path that slows down the machine.
> 
> I think it's looping not because of contention, but because of
> corrupted memory/registers.
> 
> > Do you have CONFIG_DEBUG_LOCKDEP enabled? This was one of the
> > sources of these contentions (fixed lately in -tip but for
> > .35).
> 
> I'm using PROVE_LOCKING but not DEBUG_LOCKDEP.
> 
> Anyways, consistently my machine crashes with completely corrupted
> registers in either irq_exit() or __do_softirq().  Usually we get an
> unaligned access of some sort, either accessing the stack (because %fp
> is garbage) or via an indirect call (usually because %i7 is garbage).
> 
> One thing that's interesting about the softirq path is that it uses
> the softirq stack.  The only things that guard us from jumping onto the
> softirq_stack are the tests done by do_softirq(), mainly
> !in_interrupt() and whether we have softirqs pending.
> 
> What if preempt_count() got corrupted in such a way that we end up
> evaluating in_interrupt() to zero when we shouldn't?
> 
> If that happens, and this makes us jump onto the top of softirq stack
> of the current cpu multiple times, that could cause some wild
> corruptions.
> 
> Another thing I've noticed is that there appears to be some kind of
> pattern to many of the register corruptions I've seen.  There is
> a pattern of 64-bit values that often looks like this (in memory
> order):
> 
> 0xffffffffc3300000
> 0xffffffffc33000cc
> 0xffffffffc3d00000
> 0xffffffffc3d000cc
> 0xffffffffc4000000
> 0xffffffffc40000cc
> 
> and, from another trace:
> 
> 0xffffffffc6100000
> 0xffffffffc61000cc
> 0xffffffffc6a00000
> 0xffffffffc6a000cc
> 0xffffffffc6e00000
> 0xffffffffc6e000cc
> 
> They look like some kind of descriptor.  The closest thing I could
> find was the scatter-gather descriptors used by the Fusion mptsas
> driver, but I can't find a way that the descriptors would be formed
> exactly like the above, though it does come close.
> 
> For example, drivers/message/fusion/mptscsih.c:mptscsih_qcmd()
> has this call:
> 
> 		ioc->add_sge((char *)&pScsiReq->SGL,
> 			MPT_SGE_FLAGS_SSIMPLE_READ | 0,
> 			(dma_addr_t) -1);
> 
> which puts -1 into the address field, but this doesn't exactly line up
> because the 32-bit SGE descriptors are in the order "flags" then
> "address" not the other way around.
> 
> Ho hum... anyways, just looking for clues.  If those are mptsas
> descriptors, then it would be consistent with how I've found that the
> block I/O path seems to invariably be involved during the crashes.
> In another trace (that time with PROVE_LOCKING disabled) I saw
> the host->host_lock passed down into spin_lock_irqsave() being NULL.
> And this was in the software interrupt handler.



Hmm, just a random idea: do you think it could be due to stack overflows?
Because the function graph eats more stack by digging into function graph
handlers, ring buffer, etc...

It digs further than is supposed to happen without tracing.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (23 preceding siblings ...)
  2010-04-16 23:14 ` Frederic Weisbecker
@ 2010-04-16 23:17 ` David Miller
  2010-04-17  7:51 ` David Miller
                   ` (21 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-16 23:17 UTC (permalink / raw)
  To: sparclinux

From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 17 Apr 2010 01:14:12 +0200

> Hmm, just a random idea: do you think it could be due to stack overflows?
> Because the function graph eats more stack by digging into function graph
> handlers, ring buffer, etc...
> 
> It digs further than is supposed to happen without tracing.

I have mcount checking for stack-overflow by hand in assembler
during these tests, it even knows about the irq stacks.

And those checks are not triggering.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (24 preceding siblings ...)
  2010-04-16 23:17 ` David Miller
@ 2010-04-17  7:51 ` David Miller
  2010-04-17 16:59 ` Frederic Weisbecker
                   ` (20 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-17  7:51 UTC (permalink / raw)
  To: sparclinux

From: David Miller <davem@davemloft.net>
Date: Fri, 16 Apr 2010 16:17:44 -0700 (PDT)

> From: Frederic Weisbecker <fweisbec@gmail.com>
> Date: Sat, 17 Apr 2010 01:14:12 +0200
> 
>> Hmm, just a random idea: do you think it could be due to stack overflows?
>> Because the function graph eats more stack by digging into function graph
>> handlers, ring buffer, etc...
>> 
>> It digs further than is supposed to happen without tracing.
> 
> I have mcount checking for stack-overflow by hand in assembler
> during these tests, it even knows about the irq stacks.
> 
> And those checks are not triggering.

Ugh...  hold on.

They're not triggering because I put the test assembler into mcount,
and dynamic ftrace patches the call sites to bypass mcount altogether
:-)

Doing real tests now, and I bet you're right.

That's pretty insane though, as we use 16K stacks on sparc64 and
the gcc I'm using has the minimum stack frame decreased down to
176 bytes (used to be 192).  I'd be interested to see what one
of these too-large backtraces looks like.

Hmmm, will it be in the scheduler load balancing code? :-)

Will report once I have some good triggers.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (25 preceding siblings ...)
  2010-04-17  7:51 ` David Miller
@ 2010-04-17 16:59 ` Frederic Weisbecker
  2010-04-17 17:22 ` Frederic Weisbecker
                   ` (19 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: Frederic Weisbecker @ 2010-04-17 16:59 UTC (permalink / raw)
  To: sparclinux

On Sat, Apr 17, 2010 at 12:51:09AM -0700, David Miller wrote:
> From: David Miller <davem@davemloft.net>
> Date: Fri, 16 Apr 2010 16:17:44 -0700 (PDT)
> 
> > From: Frederic Weisbecker <fweisbec@gmail.com>
> > Date: Sat, 17 Apr 2010 01:14:12 +0200
> > 
> >> Hmm, just a random idea: do you think it could be due to stack overflows?
> >> Because the function graph eats more stack by digging to function graph
> >> handlers, ring buffer, etc...
> >> 
> >> It diggs further than what is supposed to happen without tracing.
> > 
> > I have mcount checking for stack-overflow by hand in assembler
> > during these tests, it even knows about the irq stacks.
> > 
> > And those checks are not triggering.
> 
> Ugh...  hold on.
> 
> They're not triggering because I put the test assembler into mcount
> and dynamic ftrace patches the call sites to bypass mcount altogether
> :-)
> 
> Doing real tests now, and I bet you're right.
> 
> That's pretty insane though, as we use 16K stacks on sparc64 and
> the gcc I'm using has the minimum stack frame decreased down to
> 176 bytes (used to be 192).  I'd be interested to see what one
> of these too-large backtraces look like.
> 
> Hmmm, will it be in the scheduler load balancing code? :-)


Maybe, yeah :)

This could also be a tracing recursion somewhere.
One good way to find out is to put pause_graph_tracing()
and unpause_graph_tracing() at the very beginning and
end of the tracing paths.

/me goes to try this.
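
For reference, the pause/unpause pair is just a per-task counter that
the graph tracer entry code already checks before recording anything.
A minimal sketch of the idea (the real definitions live in the ftrace
headers):

	static inline void pause_graph_tracing(void)
	{
		atomic_inc(&current->tracing_graph_pause);
	}

	static inline void unpause_graph_tracing(void)
	{
		atomic_dec(&current->tracing_graph_pause);
	}

	/* Wrap a suspect region so that any nested graph-tracer entry
	 * from inside it is silently dropped: */
	pause_graph_tracing();
	suspect_code();			/* hypothetical */
	unpause_graph_tracing();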


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (26 preceding siblings ...)
  2010-04-17 16:59 ` Frederic Weisbecker
@ 2010-04-17 17:22 ` Frederic Weisbecker
  2010-04-17 21:24 ` David Miller
                   ` (18 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: Frederic Weisbecker @ 2010-04-17 17:22 UTC (permalink / raw)
  To: sparclinux

On Sat, Apr 17, 2010 at 06:59:25PM +0200, Frederic Weisbecker wrote:
> On Sat, Apr 17, 2010 at 12:51:09AM -0700, David Miller wrote:
> > From: David Miller <davem@davemloft.net>
> > Date: Fri, 16 Apr 2010 16:17:44 -0700 (PDT)
> > 
> > > From: Frederic Weisbecker <fweisbec@gmail.com>
> > > Date: Sat, 17 Apr 2010 01:14:12 +0200
> > > 
> > >> Hmm, just a random idea: do you think it could be due to stack overflows?
> > >> Because the function graph eats more stack by digging to function graph
> > >> handlers, ring buffer, etc...
> > >> 
> > >> It diggs further than what is supposed to happen without tracing.
> > > 
> > > I have mcount checking for stack-overflow by hand in assembler
> > > during these tests, it even knows about the irq stacks.
> > > 
> > > And those checks are not triggering.
> > 
> > Ugh...  hold on.
> > 
> > They're not triggering because I put the test assembler into mcount
> > and dynamic ftrace patches the call sites to bypass mcount altogether
> > :-)
> > 
> > Doing real tests now, and I bet you're right.
> > 
> > That's pretty insane though, as we use 16K stacks on sparc64 and
> > the gcc I'm using has the minimum stack frame decreased down to
> > 176 bytes (used to be 192).  I'd be interested to see what one
> > of these too-large backtraces look like.
> > 
> > Hmmm, will it be in the scheduler load balancing code? :-)
> 
> 
> May be yeah :)
> 
> This could be also a tracing recursion somewhere.
> One good way to know is to put pause_graph_tracing()
> and unpause_graph_tracing() in the very beginning and
> end of the tracing paths.
> 
> /me goes to try this.
> 


This looks like the true cause.

I tested the following patch and it fixes the issue after several
manual loops of:

	echo function_graph > current_tracer
	echo nop > current_tracer


Before, it was triggering after two trials.  Now it doesn't happen
after 10 iterations.

I'm going to hunt down the culprit; we certainly need a new
__notrace annotation somewhere.
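
The annotation in question is the kernel's notrace marker; roughly, it
expands to a compiler attribute that suppresses the mcount call for the
marked function, so that function can never re-enter the tracer.  A
sketch, with a hypothetical helper as the example:

	#define notrace	__attribute__((no_instrument_function))

	/* A helper that runs inside the tracing path itself must not be
	 * traced, otherwise it recurses into the tracer. */
	static notrace void helper_inside_tracing_path(void)
	{
		/* ... must never call back into the tracer ... */
	}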


diff --git a/arch/sparc/kernel/ftrace.c b/arch/sparc/kernel/ftrace.c
index 03ab022..3dc19b6 100644
--- a/arch/sparc/kernel/ftrace.c
+++ b/arch/sparc/kernel/ftrace.c
@@ -134,18 +134,24 @@ unsigned long prepare_ftrace_return(unsigned long parent,
        if (unlikely(atomic_read(&current->tracing_graph_pause)))
                return parent + 8UL;
 
+       pause_graph_tracing();
+
        if (ftrace_push_return_trace(parent, self_addr, &trace.depth,
-                                    frame_pointer) == -EBUSY)
+                                    frame_pointer) == -EBUSY) {
+               unpause_graph_tracing();
                return parent + 8UL;
+       }
 
        trace.func = self_addr;
 
        /* Only trace if the calling function expects to */
        if (!ftrace_graph_entry(&trace)) {
                current->curr_ret_stack--;
+               unpause_graph_tracing();
                return parent + 8UL;
        }
 
+       unpause_graph_tracing();
        return return_hooker;
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 9aed1a5..251a46a 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -163,6 +163,8 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
        struct ftrace_graph_ret trace;
        unsigned long ret;
 
+       pause_graph_tracing();
+
        ftrace_pop_return_trace(&trace, &ret, frame_pointer);
        trace.rettime = trace_clock_local();
        ftrace_graph_return(&trace);
@@ -176,6 +178,8 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
                ret = (unsigned long)panic;
        }
 
+       unpause_graph_tracing();
+
        return ret;
 }



^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (27 preceding siblings ...)
  2010-04-17 17:22 ` Frederic Weisbecker
@ 2010-04-17 21:24 ` David Miller
  2010-04-17 21:25 ` David Miller
                   ` (17 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-17 21:24 UTC (permalink / raw)
  To: sparclinux

From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 17 Apr 2010 18:59:25 +0200

> This could be also a tracing recursion somewhere.
> One good way to know is to put pause_graph_tracing()
> and unpause_graph_tracing() in the very beginning and
> end of the tracing paths.
> 
> /me goes to try this.

Last night I still couldn't get the stack overflow checks to trigger,
even once I accounted for dynamic ftrace patching, unfortunately.

But...

I added a tracing_off() call to the hrtimer deadlock avoidance case
and the machine doesn't crash after that triggers.

So I went to look at the per-cpu trace of the processor on which
the hrtimer deadlock avoidance triggered, and it has a looping
sequence of timestamps which looks really strange.  The other
per-cpu traces look just fine.

This is a major clue I think, so I'll keep snooping around trying
to dig more deeply into this.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (28 preceding siblings ...)
  2010-04-17 21:24 ` David Miller
@ 2010-04-17 21:25 ` David Miller
  2010-04-17 21:29 ` David Miller
                   ` (16 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-17 21:25 UTC (permalink / raw)
  To: sparclinux

From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 17 Apr 2010 19:22:21 +0200

> This looks like the true cause.
> 
> I tested the following patch and it fixes the issue after several
> manual loops of:
> 
> 	echo function_graph > current_tracer
> 	echo nop > current_tracer
> 
> 
> Before it was triggering after two trials. Now it doesn't happen
> after 10 iterations.
> 
> I'm going to hunt the culprit that causes this, we certainly need
> a new __notrace somewhere.

Hmmm, interesting... and see my other email about how my
per-cpu traces with the non-graph tracer look corrupted when
that hrtimer timeout triggers...

Let me know where this __notrace marker is needed, though; that
sounds interesting!

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (29 preceding siblings ...)
  2010-04-17 21:25 ` David Miller
@ 2010-04-17 21:29 ` David Miller
  2010-04-17 21:34 ` Frederic Weisbecker
                   ` (15 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-17 21:29 UTC (permalink / raw)
  To: sparclinux

From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 17 Apr 2010 19:22:21 +0200

> I tested the following patch and it fixes the issue after several
> manual loops of:

Wait, this is completely normal.

NMIs can come in at any time and nmi_enter() emits a trace via the
rcu_nmi_enter().

This happens on x86 too.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (30 preceding siblings ...)
  2010-04-17 21:29 ` David Miller
@ 2010-04-17 21:34 ` Frederic Weisbecker
  2010-04-17 21:38 ` Frederic Weisbecker
                   ` (14 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: Frederic Weisbecker @ 2010-04-17 21:34 UTC (permalink / raw)
  To: sparclinux

On Sat, Apr 17, 2010 at 02:29:06PM -0700, David Miller wrote:
> From: Frederic Weisbecker <fweisbec@gmail.com>
> Date: Sat, 17 Apr 2010 19:22:21 +0200
> 
> > I tested the following patch and it fixes the issue after several
> > manual loops of:
> 
> Wait, this is completely normal.
> 
> NMIs can come in at any time and nmi_enter() emits a trace via the
> rcu_nmi_enter().
> 
> This happens on x86 too.


Yeah, but I've dumped the recursion cases I could find and there
was no NMI path.

I haven't started the watchdog or perf; I guess NMIs don't trigger
in other cases, right?

For now, the only reentrancy I could find was irqs interrupting
the tracing path, which is no real clue.  That said, I have only
logged recursion on the trace entry path, not yet on the return
path.

I'm disabling the protections on entry, just to narrow down where
the recursion happens, in case it only occurs on return.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (31 preceding siblings ...)
  2010-04-17 21:34 ` Frederic Weisbecker
@ 2010-04-17 21:38 ` Frederic Weisbecker
  2010-04-17 21:38 ` David Miller
                   ` (13 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: Frederic Weisbecker @ 2010-04-17 21:38 UTC (permalink / raw)
  To: sparclinux

On Sat, Apr 17, 2010 at 02:29:06PM -0700, David Miller wrote:
> From: Frederic Weisbecker <fweisbec@gmail.com>
> Date: Sat, 17 Apr 2010 19:22:21 +0200
> 
> > I tested the following patch and it fixes the issue after several
> > manual loops of:
> 
> Wait, this is completely normal.
> 
> NMIs can come in at any time and nmi_enter() emits a trace via the
> rcu_nmi_enter().
> 
> This happens on x86 too.


Ok, I've just dropped the protection from the entry path and it runs
fine.

The problem happens on the function return path.

I'm going to do some dumps there.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (32 preceding siblings ...)
  2010-04-17 21:38 ` Frederic Weisbecker
@ 2010-04-17 21:38 ` David Miller
  2010-04-17 21:41 ` Frederic Weisbecker
                   ` (12 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-17 21:38 UTC (permalink / raw)
  To: sparclinux

From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 17 Apr 2010 23:34:15 +0200

> I haven't started the watchdog nor perf, I guess NMI don't trigger
> in other cases, right?

They do, for the NMI watchdog, every few seconds.

> For now, the only reentrancy I could find was irqs that interrupt
> the tracing path. Which means no good clue there. That said I
> have only logged recursivity on trace entry path, not yet
> on return.
> 
> I'm disabling the protections on entry, just to narrow down
> the recursivity place, in case it only happens on return.

No need to do so much work; when you hit this case, simply
disable tracing and dump_stack().  That way you'll see it
clearly.
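
Something along these lines at the point where you detect the
re-entrance (the condition is whatever check you already have there,
shown as a hypothetical helper):

	if (tracer_recursion_detected()) {	/* hypothetical check */
		tracing_off();	/* freeze the ring buffer for post-mortem */
		dump_stack();	/* show exactly how we re-entered */
	}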

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (33 preceding siblings ...)
  2010-04-17 21:38 ` David Miller
@ 2010-04-17 21:41 ` Frederic Weisbecker
  2010-04-18 15:31 ` Frederic Weisbecker
                   ` (11 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: Frederic Weisbecker @ 2010-04-17 21:41 UTC (permalink / raw)
  To: sparclinux

On Sat, Apr 17, 2010 at 02:38:37PM -0700, David Miller wrote:
> From: Frederic Weisbecker <fweisbec@gmail.com>
> Date: Sat, 17 Apr 2010 23:34:15 +0200
> 
> > I haven't started the watchdog nor perf, I guess NMI don't trigger
> > in other cases, right?
> 
> They do, for the NMI watchdog, every few seconds.
> 
> > For now, the only reentrancy I could find was irqs that interrupt
> > the tracing path. Which means no good clue there. That said I
> > have only logged recursivity on trace entry path, not yet
> > on return.
> > 
> > I'm disabling the protections on entry, just to narrow down
> > the recursivity place, in case it only happens on return.
> 
> No need to do so much work, when you hit this case simply
> disable tracing and dump_stack().  That way you'll see it
> clearly.



This is what I'm doing.  But I needed to know where the tracing
recursion happens: on tracing entry or on return.

And obviously it looks like it only happens on return, so now
I'm trying the dumps on the return path.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (34 preceding siblings ...)
  2010-04-17 21:41 ` Frederic Weisbecker
@ 2010-04-18 15:31 ` Frederic Weisbecker
  2010-04-18 21:19 ` David Miller
                   ` (10 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: Frederic Weisbecker @ 2010-04-18 15:31 UTC (permalink / raw)
  To: sparclinux

On Sat, Apr 17, 2010 at 02:38:37PM -0700, David Miller wrote:
> From: Frederic Weisbecker <fweisbec@gmail.com>
> Date: Sat, 17 Apr 2010 23:34:15 +0200
> 
> > I haven't started the watchdog nor perf, I guess NMI don't trigger
> > in other cases, right?
> 
> They do, for the NMI watchdog, every few seconds.
> 
> > For now, the only reentrancy I could find was irqs that interrupt
> > the tracing path. Which means no good clue there. That said I
> > have only logged recursivity on trace entry path, not yet
> > on return.
> > 
> > I'm disabling the protections on entry, just to narrow down
> > the recursivity place, in case it only happens on return.
> 
> No need to do so much work, when you hit this case simply
> disable tracing and dump_stack().  That way you'll see it
> clearly.


In fact it's quite hard to dump, because most of the dumps I get
are irrelevant (irqs that re-enter the tracing path).
And after some time, the dumps end up crashing.

All I could do was narrow down the source; everything works
fine with this patch:


diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 9aed1a5..cfcb863 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -287,7 +287,9 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 		__trace_graph_return(tr, trace, flags, pc);
 	}
 	atomic_dec(&data->disabled);
+	pause_graph_tracing();
 	local_irq_restore(flags);
+	unpause_graph_tracing();
 }
 
 void set_graph_array(struct trace_array *tr)


That reminds me a lot of the problems with NMIs, and I saw an NMI
path in one of the dumps, so I guess you were right after all: it
looks more like an NMI-related problem than a recursion.

I thought that if I didn't enable the watchdog I wouldn't have
NMIs, but I see some in /proc/interrupts.


^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (35 preceding siblings ...)
  2010-04-18 15:31 ` Frederic Weisbecker
@ 2010-04-18 21:19 ` David Miller
  2010-04-19  7:56 ` David Miller
                   ` (9 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-18 21:19 UTC (permalink / raw)
  To: sparclinux

From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 18 Apr 2010 17:31:24 +0200

> That reminds me badly the problems with NMIs, and I saw
> an NMI path in one of the dumps, so I guess you were
> right after all as it looks like more an NMIs related problem
> rather than a recursion.
> 
> I thought if I won't enable the watchdog I wouldn't have NMIs
> but I have some in /proc/interrupts

Maybe this patch helps your case, but I still get crashes even with
NMIs disabled; as I mentioned the other day, I've already tested
with them disabled.

To turn them off, simply make arch/sparc/kernel/pcr.c:pcr_arch_init()
return 0 immediately and you'll have no NMIs at all.
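
Something like this (debug-only, obviously not something to commit;
a sketch assuming the function keeps its existing signature):

	/* arch/sparc/kernel/pcr.c */
	int __init pcr_arch_init(void)
	{
		return 0;	/* skip PCR/NMI setup entirely for this test */
	}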

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (36 preceding siblings ...)
  2010-04-18 21:19 ` David Miller
@ 2010-04-19  7:56 ` David Miller
  2010-04-19  8:15 ` David Miller
                   ` (8 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-19  7:56 UTC (permalink / raw)
  To: sparclinux

From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 18 Apr 2010 17:31:24 +0200

> All I could do is narrowing down the source, everything
> happens well with this patch:
> 
> 
> diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
> index 9aed1a5..cfcb863 100644
> --- a/kernel/trace/trace_functions_graph.c
> +++ b/kernel/trace/trace_functions_graph.c
> @@ -287,7 +287,9 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
>  		__trace_graph_return(tr, trace, flags, pc);
>  	}
>  	atomic_dec(&data->disabled);
> +	pause_graph_tracing();
>  	local_irq_restore(flags);
> +	unpause_graph_tracing();
>  }
>  
>  void set_graph_array(struct trace_array *tr)

So I was tooling around, doing some disassembly and seeing what these
local_irq_*() paths look like...

From one perspective, I can't see how we can justify not using the
raw_*() variants in these critical tracer paths.  With lockdep or the
irqsoff tracer enabled, we can call various code paths that will
recurse into the tracer, especially if the debugging checks trigger.

And at this point we've decremented the ->disabled counter, so we will
absolutely not be able to detect tracer recursion triggered by this
local_irq_restore().

In fact, if we enter an error state wrt. trace_hardirqs_{on,off}() we
will likely just trigger the error check there all over again when we
re-enter the tracer and call local_irq_restore() once more.

And this would explain the crazy recursion we get.

Another idea is to decrement the ->disabled counter after the
local_irq_restore().  Yes, we might lose IRQ handler traces which
occur between the local_irq_restore() and the counter decrement, but
we would also be completely immune to recursion problems.
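
In trace_graph_return() terms, that reordering is simply the following
(a sketch of the idea, not a final fix):

	/* Keep ->disabled elevated across the local_irq_restore(), so
	 * any trace attempt it triggers is still rejected, and only
	 * then drop the reference. */
	local_irq_restore(flags);
	atomic_dec(&data->disabled);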

This was a great lead, Frederic; it probably explains the bulk of
our problems.  Thanks for narrowing it down like this!

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (37 preceding siblings ...)
  2010-04-19  7:56 ` David Miller
@ 2010-04-19  8:15 ` David Miller
  2010-04-19 19:52 ` Frederic Weisbecker
                   ` (7 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-19  8:15 UTC (permalink / raw)
  To: sparclinux

From: David Miller <davem@davemloft.net>
Date: Mon, 19 Apr 2010 00:56:13 -0700 (PDT)

> Another idea is to decrement the ->disabled counter after the
> local_irq_restore().  Yes, we might lose IRQ handler traces which
> occur between the local_irq_restore() and the counter decrment, but we
> would also be completely immune to recursion problems.

So I just gave this a try and I still get the crashes I've
been seeing with the non-graph function tracer, oh well :-)

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (38 preceding siblings ...)
  2010-04-19  8:15 ` David Miller
@ 2010-04-19 19:52 ` Frederic Weisbecker
  2010-04-19 19:56 ` David Miller
                   ` (6 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: Frederic Weisbecker @ 2010-04-19 19:52 UTC (permalink / raw)
  To: sparclinux

On Mon, Apr 19, 2010 at 01:15:55AM -0700, David Miller wrote:
> From: David Miller <davem@davemloft.net>
> Date: Mon, 19 Apr 2010 00:56:13 -0700 (PDT)
> 
> > Another idea is to decrement the ->disabled counter after the
> > local_irq_restore().  Yes, we might lose IRQ handler traces which
> > occur between the local_irq_restore() and the counter decrment, but we
> > would also be completely immune to recursion problems.
> 
> So I just gave this a try and I still get the crashes I've
> been seeing with the non-graph function tracer, oh well :-)


Are you using the pause_graph_tracing() thing on the function
tracer?  If so, it won't work, as it only applies to the function
graph tracer.


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (39 preceding siblings ...)
  2010-04-19 19:52 ` Frederic Weisbecker
@ 2010-04-19 19:56 ` David Miller
  2010-04-19 20:37 ` Frederic Weisbecker
                   ` (5 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-19 19:56 UTC (permalink / raw)
  To: sparclinux

From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 19 Apr 2010 21:52:47 +0200

> On Mon, Apr 19, 2010 at 01:15:55AM -0700, David Miller wrote:
>> From: David Miller <davem@davemloft.net>
>> Date: Mon, 19 Apr 2010 00:56:13 -0700 (PDT)
>> 
>> > Another idea is to decrement the ->disabled counter after the
>> > local_irq_restore().  Yes, we might lose IRQ handler traces which
>> > occur between the local_irq_restore() and the counter decrment, but we
>> > would also be completely immune to recursion problems.
>> 
>> So I just gave this a try and I still get the crashes I've
>> been seeing with the non-graph function tracer, oh well :-)
> 
> 
> Are you using the pause_graph_tracing() thing on the function
> tracer? If so it won't work as it only applies to the function
> graph tracer.

No, I simply moved the atomic_dec() below the local_irq_restore()
as stated in the first paragraph above :-)

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (40 preceding siblings ...)
  2010-04-19 19:56 ` David Miller
@ 2010-04-19 20:37 ` Frederic Weisbecker
  2010-04-20  5:51 ` David Miller
                   ` (4 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: Frederic Weisbecker @ 2010-04-19 20:37 UTC (permalink / raw)
  To: sparclinux

On Mon, Apr 19, 2010 at 12:56:41PM -0700, David Miller wrote:
> From: Frederic Weisbecker <fweisbec@gmail.com>
> Date: Mon, 19 Apr 2010 21:52:47 +0200
> 
> > On Mon, Apr 19, 2010 at 01:15:55AM -0700, David Miller wrote:
> >> From: David Miller <davem@davemloft.net>
> >> Date: Mon, 19 Apr 2010 00:56:13 -0700 (PDT)
> >> 
> >> > Another idea is to decrement the ->disabled counter after the
> >> > local_irq_restore().  Yes, we might lose IRQ handler traces which
> >> > occur between the local_irq_restore() and the counter decrment, but we
> >> > would also be completely immune to recursion problems.
> >> 
> >> So I just gave this a try and I still get the crashes I've
> >> been seeing with the non-graph function tracer, oh well :-)
> > 
> > 
> > Are you using the pause_graph_tracing() thing on the function
> > tracer? If so it won't work as it only applies to the function
> > graph tracer.
> 
> No, I simply moved the atomic_dec() below the local_irq_restore()
> as stated in the first paragraph above :-)


Ah right, read too fast :)


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (41 preceding siblings ...)
  2010-04-19 20:37 ` Frederic Weisbecker
@ 2010-04-20  5:51 ` David Miller
  2010-04-20  7:50 ` David Miller
                   ` (3 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-20  5:51 UTC (permalink / raw)
  To: sparclinux


Frederic, I think I got it.

When CONFIG_TRACE_IRQFLAGS is enabled, we can recurse forever in IRQ
traps, and this is exacerbated by the slowness of tracing as it makes
it more likely we'll hit the small sequence of instructions where we
erroneously enable irqs in the trap return path.

I'm doing my stress tests right now, but if you want to try it out
in parallel, comment out the:

	wrpr	%l4, %pil

enclosed in CONFIG_TRACE_IRQFLAGS in arch/sparc/kernel/rtrap_64.S

The issue here is that we normally restore the trap-time %pil further
along in the trap return sequence, but that is well past the point
where we can still invoke C functions properly.  Which means we can't
call trace_hardirqs_on() there.  So I was trying to do it early.

But this also means we restore the %pil before we hard disable all
interrupts by clearing PSTATE_IE in the %pstate register.  And this
is what creates the window of instructions where every IRQ return
can erroneously take another IRQ trap.

I have to think about why I might have done things this way to
make sure I fix this right.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (42 preceding siblings ...)
  2010-04-20  5:51 ` David Miller
@ 2010-04-20  7:50 ` David Miller
  2010-04-20 13:58 ` Frederic Weisbecker
                   ` (2 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-20  7:50 UTC (permalink / raw)
  To: sparclinux

From: David Miller <davem@davemloft.net>
Date: Mon, 19 Apr 2010 22:51:05 -0700 (PDT)

> I have to think about why I might have done things this way to
> make sure I fix this right.

Ok, I don't get crashes any more.  There are all kinds of issues
with the sw watchdog triggering because cpus get bogged down in
the scheduler rebalance code, but that's a separate set of
issues.

Here is the final fix I'll commit.

sparc64: Fix hardirq tracing in trap return path.

We can overflow the hardirq stack if we set the %pil this early;
just let the normal control flow do it.

This is fine as we are allowed to do the actual IRQ enable
at any point after we call trace_hardirqs_on.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/kernel/rtrap_64.S |   12 +++++++++++-
 1 files changed, 11 insertions(+), 1 deletions(-)

diff --git a/arch/sparc/kernel/rtrap_64.S b/arch/sparc/kernel/rtrap_64.S
index 83f1873..090b9e9 100644
--- a/arch/sparc/kernel/rtrap_64.S
+++ b/arch/sparc/kernel/rtrap_64.S
@@ -130,7 +130,17 @@ rtrap_xcall:
 		 nop
 		call			trace_hardirqs_on
 		 nop
-		wrpr			%l4, %pil
+		/* Do not actually set the %pil here.  We will do that
+		 * below after we clear PSTATE_IE in the %pstate register.
+		 * If we re-enable interrupts here, we can recurse down
+		 * the hardirq stack potentially endlessly, causing a
+		 * stack overflow.
+		 *
+		 * It is tempting to put this test and trace_hardirqs_on
+		 * call at the 'rt_continue' label, but that will not work
+		 * as that path hits unconditionally and we do not want to
+		 * execute this in NMI return paths, for example.
+		 */
 #endif
 rtrap_no_irq_enable:
 		andcc			%l1, TSTATE_PRIV, %l3
-- 
1.7.0.4



^ permalink raw reply related	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (43 preceding siblings ...)
  2010-04-20  7:50 ` David Miller
@ 2010-04-20 13:58 ` Frederic Weisbecker
  2010-04-20 21:17 ` David Miller
  2010-04-20 22:52 ` Steven Rostedt
  46 siblings, 0 replies; 48+ messages in thread
From: Frederic Weisbecker @ 2010-04-20 13:58 UTC (permalink / raw)
  To: sparclinux

On Tue, Apr 20, 2010 at 12:50:51AM -0700, David Miller wrote:
> From: David Miller <davem@davemloft.net>
> Date: Mon, 19 Apr 2010 22:51:05 -0700 (PDT)
> 
> > I have to think about why I might have done things this way to
> > make sure I fix this right.
> 
> Ok, I don't get crashes any more.  There are all kinds of issues
> with the sw watchdog triggering because cpus get slogged in
> the scheduler rebalance code, but that's a seperate set of
> issues.



I merged your latest sparc tree and the crashes are gone.

I'm really happy you've solved this. Thanks a lot for having spent
so much time on it!


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (44 preceding siblings ...)
  2010-04-20 13:58 ` Frederic Weisbecker
@ 2010-04-20 21:17 ` David Miller
  2010-04-20 22:52 ` Steven Rostedt
  46 siblings, 0 replies; 48+ messages in thread
From: David Miller @ 2010-04-20 21:17 UTC (permalink / raw)
  To: sparclinux

From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 20 Apr 2010 15:58:13 +0200

> I merged your latest sparc tree and the crashes are gone.
> 
> I'm really happy you've solved this. Thanks a lot for having spent
> so much time on it!

Thanks for testing and suffering through this bug hunt :-)

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 7/7] sparc64: Add function graph tracer support.
  2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
                   ` (45 preceding siblings ...)
  2010-04-20 21:17 ` David Miller
@ 2010-04-20 22:52 ` Steven Rostedt
  46 siblings, 0 replies; 48+ messages in thread
From: Steven Rostedt @ 2010-04-20 22:52 UTC (permalink / raw)
  To: sparclinux

On Tue, 2010-04-20 at 14:17 -0700, David Miller wrote:
> From: Frederic Weisbecker <fweisbec@gmail.com>
> Date: Tue, 20 Apr 2010 15:58:13 +0200
> 
> > I merged your latest sparc tree and the crashes are gone.
> > 
> > I'm really happy you've solved this. Thanks a lot for having spent
> > so much time on it!
> 
> Thanks for testing and suffering through this bug hunt :-)

Wow, I was having fun reading this CSI report ;-)

Good job guys!

-- Steve



^ permalink raw reply	[flat|nested] 48+ messages in thread

end of thread, other threads:[~2010-04-20 22:52 UTC | newest]

Thread overview: 48+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-04-13  6:43 [PATCH 7/7] sparc64: Add function graph tracer support David Miller
2010-04-13 19:18 ` Frederic Weisbecker
2010-04-13 19:39 ` Rostedt
2010-04-13 19:45 ` Frederic Weisbecker
2010-04-13 21:34 ` David Miller
2010-04-13 21:35 ` David Miller
2010-04-13 21:51 ` Frederic Weisbecker
2010-04-13 21:52 ` Steven Rostedt
2010-04-13 21:56 ` David Miller
2010-04-13 21:57 ` David Miller
2010-04-13 21:57 ` Frederic Weisbecker
2010-04-13 22:05 ` Frederic Weisbecker
2010-04-13 22:11 ` David Miller
2010-04-13 23:34 ` David Miller
2010-04-13 23:56 ` David Miller
2010-04-14  1:59 ` David Miller
2010-04-14  9:04 ` David Miller
2010-04-14 15:29 ` Frederic Weisbecker
2010-04-14 15:48 ` Frederic Weisbecker
2010-04-14 23:08 ` David Miller
2010-04-16  9:12 ` David Miller
2010-04-16 15:44 ` Frederic Weisbecker
2010-04-16 20:47 ` David Miller
2010-04-16 22:51 ` David Miller
2010-04-16 23:14 ` Frederic Weisbecker
2010-04-16 23:17 ` David Miller
2010-04-17  7:51 ` David Miller
2010-04-17 16:59 ` Frederic Weisbecker
2010-04-17 17:22 ` Frederic Weisbecker
2010-04-17 21:24 ` David Miller
2010-04-17 21:25 ` David Miller
2010-04-17 21:29 ` David Miller
2010-04-17 21:34 ` Frederic Weisbecker
2010-04-17 21:38 ` Frederic Weisbecker
2010-04-17 21:38 ` David Miller
2010-04-17 21:41 ` Frederic Weisbecker
2010-04-18 15:31 ` Frederic Weisbecker
2010-04-18 21:19 ` David Miller
2010-04-19  7:56 ` David Miller
2010-04-19  8:15 ` David Miller
2010-04-19 19:52 ` Frederic Weisbecker
2010-04-19 19:56 ` David Miller
2010-04-19 20:37 ` Frederic Weisbecker
2010-04-20  5:51 ` David Miller
2010-04-20  7:50 ` David Miller
2010-04-20 13:58 ` Frederic Weisbecker
2010-04-20 21:17 ` David Miller
2010-04-20 22:52 ` Steven Rostedt
