All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC 1/2] kernel patch for dump user space stack tool
@ 2012-04-11  8:07 Tu, Xiaobing
  2012-04-17  4:43 ` Lin Ming
  2012-04-20  9:44 ` Peter Zijlstra
  0 siblings, 2 replies; 18+ messages in thread
From: Tu, Xiaobing @ 2012-04-11  8:07 UTC (permalink / raw)
  To: akpm, mingo, rusty, a.p.zijlstra, linux-kernel, yanmin_zhang, rostedt
  Cc: Zuo, Jiao

From: xiaobing tu <xiaobing.tu@intel.com> 

Here is the kernel patch for this tool, The idea is to output user space stack call-chain from /proc/xxx/stack, currently, /proc/xxx/stack only output kernel stack call chain. We extend it to output user space call chain in hex format

Signed-off-by: yanmin zhang <yanmin_zhang@linux.intel.com>
Signed-off-by: xiaobing tu <xiaobing.tu@intel.com>
---
 arch/x86/kernel/stacktrace.c |   55 ++++++++++++++++++++++++++++++++++++++++++
 fs/proc/base.c               |   19 +++++++++++++-
 include/linux/stacktrace.h   |    5 +++-
 3 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index fdd0c64..d802f05 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -7,6 +7,7 @@
 #include <linux/stacktrace.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
+#include <linux/mm.h>
 #include <asm/stacktrace.h>
 
 static int save_stack_stack(void *data, char *name) @@ -144,3 +145,57 @@ void save_stack_trace_user(struct stack_trace *trace)
 		trace->entries[trace->nr_entries++] = ULONG_MAX;  }
 
+static inline void __save_stack_trace_user_task(struct task_struct *task,
+		struct stack_trace *trace)
+{
+	const struct pt_regs *regs = task_pt_regs(task);
+	const void __user *fp;
+	unsigned long addr;
+
+	if (task != current && task->state == TASK_RUNNING && task->on_cpu) {
+		/* To trap into kernel at least once */
+		smp_send_reschedule(task_cpu(task));
+	}
+
+	fp = (const void __user *)regs->bp;
+	if (trace->nr_entries < trace->max_entries)
+		trace->entries[trace->nr_entries++] = regs->ip;
+
+	while (trace->nr_entries < trace->max_entries) {
+		struct stack_frame_user frame;
+
+		frame.next_fp = NULL;
+		frame.ret_addr = 0;
+
+		addr = (unsigned long)fp;
+		if (!access_process_vm(task, addr, (void *)&frame,
+				sizeof(frame), 0))
+			break;
+		if ((unsigned long)fp < regs->sp)
+			break;
+		if (frame.ret_addr) {
+			trace->entries[trace->nr_entries++] =
+				frame.ret_addr;
+		}
+		if (fp == frame.next_fp)
+			break;
+		fp = frame.next_fp;
+	}
+}
+
+void save_stack_trace_user_task(struct task_struct *task,
+		struct stack_trace *trace)
+{
+	if (task == current || !task) {
+		save_stack_trace_user(trace);
+		return;
+	}
+
+	if (task->mm)
+		__save_stack_trace_user_task(task, trace);
+
+	if (trace->nr_entries < trace->max_entries)
+		trace->entries[trace->nr_entries++] = ULONG_MAX; } 
+EXPORT_SYMBOL_GPL(save_stack_trace_user_task);
+
diff --git a/fs/proc/base.c b/fs/proc/base.c index d4548dd..603e708 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -327,8 +327,25 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
 			seq_printf(m, "[<%pK>] %pS\n",
 				   (void *)entries[i], (void *)entries[i]);
 		}
-		unlock_trace(task);
+	} else
+		goto out;
+
+	trace.nr_entries	= 0;
+	trace.max_entries	= MAX_STACK_TRACE_DEPTH;
+	trace.entries		= entries;
+	trace.skip		= 0;
+
+	seq_printf(m, "userspace\n");
+
+	save_stack_trace_user_task(task, &trace);
+
+	for (i = 0; i < trace.nr_entries; i++) {
+		if (entries[i] != ULONG_MAX)
+			seq_printf(m, "%p\n", (void *)entries[i]);
 	}
+	unlock_trace(task);
+
+out:
 	kfree(entries);
 
 	return err;
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 115b570..65e9ae4 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -23,8 +23,11 @@ extern void print_stack_trace(struct stack_trace *trace, int spaces);
 
 #ifdef CONFIG_USER_STACKTRACE_SUPPORT
 extern void save_stack_trace_user(struct stack_trace *trace);
+extern void save_stack_trace_user_task(struct task_struct *task,
+		struct stack_trace *trace);
 #else
-# define save_stack_trace_user(trace)              do { } while (0)
+# define save_stack_trace_user(trace)			do { } while (0)
+# define save_stack_trace_user_task(task, trace)	do { } while (0)
 #endif
 
 #else
--
1.7.6

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 1/2] kernel patch for dump user space stack tool
  2012-04-11  8:07 [RFC 1/2] kernel patch for dump user space stack tool Tu, Xiaobing
@ 2012-04-17  4:43 ` Lin Ming
  2012-04-17 14:38   ` Tu, Xiaobing
  2012-04-20  9:44 ` Peter Zijlstra
  1 sibling, 1 reply; 18+ messages in thread
From: Lin Ming @ 2012-04-17  4:43 UTC (permalink / raw)
  To: Tu, Xiaobing
  Cc: akpm, mingo, rusty, a.p.zijlstra, linux-kernel, yanmin_zhang,
	rostedt, Zuo, Jiao

On Wed, Apr 11, 2012 at 4:07 PM, Tu, Xiaobing <xiaobing.tu@intel.com> wrote:
> From: xiaobing tu <xiaobing.tu@intel.com>
>
> Here is the kernel patch for this tool, The idea is to output user space stack call-chain from /proc/xxx/stack, currently, /proc/xxx/stack only output kernel stack call chain. We extend it to output user space call chain in hex format

The log is too long on a single line.
You should split it to multiple lines.

Lin Ming

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [RFC 1/2] kernel patch for dump user space stack tool
  2012-04-17  4:43 ` Lin Ming
@ 2012-04-17 14:38   ` Tu, Xiaobing
  0 siblings, 0 replies; 18+ messages in thread
From: Tu, Xiaobing @ 2012-04-17 14:38 UTC (permalink / raw)
  To: Lin Ming
  Cc: akpm, mingo, rusty, a.p.zijlstra, linux-kernel, yanmin_zhang,
	rostedt, Zuo, Jiao

Hi Ming
  Thank you for you kindly advice, I just resend the main for your suggestion. :)

Br
Xiaobing


-----Original Message-----
From: Lin Ming [mailto:mlin@ss.pku.edu.cn] 
Sent: Tuesday, April 17, 2012 12:43 PM
To: Tu, Xiaobing
Cc: akpm@linux-foundation.org; mingo@elte.hu; rusty@rustcorp.com.au; a.p.zijlstra@chello.nl; linux-kernel@vger.kernel.org; yanmin_zhang@linux.intel.com; rostedt@goodmis.org; Zuo, Jiao
Subject: Re: [RFC 1/2] kernel patch for dump user space stack tool

On Wed, Apr 11, 2012 at 4:07 PM, Tu, Xiaobing <xiaobing.tu@intel.com> wrote:
> From: xiaobing tu <xiaobing.tu@intel.com>
>
> Here is the kernel patch for this tool, The idea is to output user space stack call-chain from /proc/xxx/stack, currently, /proc/xxx/stack only output kernel stack call chain. We extend it to output user space call chain in hex format

The log is too long on a single line.
You should split it to multiple lines.

Lin Ming

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 1/2] kernel patch for dump user space stack tool
  2012-04-11  8:07 [RFC 1/2] kernel patch for dump user space stack tool Tu, Xiaobing
  2012-04-17  4:43 ` Lin Ming
@ 2012-04-20  9:44 ` Peter Zijlstra
  2012-04-24  1:30   ` Yanmin Zhang
  1 sibling, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2012-04-20  9:44 UTC (permalink / raw)
  To: Tu, Xiaobing
  Cc: akpm, mingo, rusty, linux-kernel, yanmin_zhang, rostedt, Zuo, Jiao

On Wed, 2012-04-11 at 08:07 +0000, Tu, Xiaobing wrote:
> From: xiaobing tu <xiaobing.tu@intel.com> 
> 
> Here is the kernel patch for this tool, The idea is to output user
> space stack call-chain from /proc/xxx/stack,
> currently, /proc/xxx/stack only output kernel stack call chain. We
> extend it to output user space call chain in hex format
> 
> Signed-off-by: yanmin zhang <yanmin_zhang@linux.intel.com>
> Signed-off-by: xiaobing tu <xiaobing.tu@intel.com>

Ok, so I don't like it.. for one I really don't see the need for this,
secondly the implementation is crappy, thirdly the interface is poor.

> ---
>  arch/x86/kernel/stacktrace.c |   55 ++++++++++++++++++++++++++++++++++++++++++
>  fs/proc/base.c               |   19 +++++++++++++-
>  include/linux/stacktrace.h   |    5 +++-
>  3 files changed, 77 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index fdd0c64..d802f05 100644
> --- a/arch/x86/kernel/stacktrace.c
> +++ b/arch/x86/kernel/stacktrace.c
> @@ -7,6 +7,7 @@
>  #include <linux/stacktrace.h>
>  #include <linux/module.h>
>  #include <linux/uaccess.h>
> +#include <linux/mm.h>
>  #include <asm/stacktrace.h>
>  
>  static int save_stack_stack(void *data, char *name) @@ -144,3 +145,57 @@ void save_stack_trace_user(struct stack_trace *trace)
>  		trace->entries[trace->nr_entries++] = ULONG_MAX;  }
>  
> +static inline void __save_stack_trace_user_task(struct task_struct *task,
> +		struct stack_trace *trace)
> +{
> +	const struct pt_regs *regs = task_pt_regs(task);
> +	const void __user *fp;
> +	unsigned long addr;
> +
> +	if (task != current && task->state == TASK_RUNNING && task->on_cpu) {
> +		/* To trap into kernel at least once */
> +		smp_send_reschedule(task_cpu(task));
> +	}

This doesn't make any sense at all..

> +
> +	fp = (const void __user *)regs->bp;
> +	if (trace->nr_entries < trace->max_entries)
> +		trace->entries[trace->nr_entries++] = regs->ip;
> +
> +	while (trace->nr_entries < trace->max_entries) {
> +		struct stack_frame_user frame;
> +
> +		frame.next_fp = NULL;
> +		frame.ret_addr = 0;
> +
> +		addr = (unsigned long)fp;
> +		if (!access_process_vm(task, addr, (void *)&frame,
> +				sizeof(frame), 0))
> +			break;
> +		if ((unsigned long)fp < regs->sp)
> +			break;
> +		if (frame.ret_addr) {
> +			trace->entries[trace->nr_entries++] =
> +				frame.ret_addr;
> +		}
> +		if (fp == frame.next_fp)
> +			break;
> +		fp = frame.next_fp;
> +	}
> +}
> +
> +void save_stack_trace_user_task(struct task_struct *task,
> +		struct stack_trace *trace)
> +{
> +	if (task == current || !task) {
> +		save_stack_trace_user(trace);
> +		return;
> +	}
> +
> +	if (task->mm)
> +		__save_stack_trace_user_task(task, trace);
> +
> +	if (trace->nr_entries < trace->max_entries)
> +		trace->entries[trace->nr_entries++] = ULONG_MAX; } 
> +EXPORT_SYMBOL_GPL(save_stack_trace_user_task);

There's already userspace stack walkers, don't reimplement them yet
again.

> diff --git a/fs/proc/base.c b/fs/proc/base.c index d4548dd..603e708 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -327,8 +327,25 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
>  			seq_printf(m, "[<%pK>] %pS\n",
>  				   (void *)entries[i], (void *)entries[i]);
>  		}
> -		unlock_trace(task);
> +	} else
> +		goto out;
> +
> +	trace.nr_entries	= 0;
> +	trace.max_entries	= MAX_STACK_TRACE_DEPTH;
> +	trace.entries		= entries;
> +	trace.skip		= 0;
> +
> +	seq_printf(m, "userspace\n");
> +
> +	save_stack_trace_user_task(task, &trace);
> +
> +	for (i = 0; i < trace.nr_entries; i++) {
> +		if (entries[i] != ULONG_MAX)
> +			seq_printf(m, "%p\n", (void *)entries[i]);
>  	}
> +	unlock_trace(task);
> +
> +out:

Writing out just the IPs means you have to have a stored snapshot
of /proc/$PID/maps around to make any sense of them. This seems a
relatively poor interface.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 1/2] kernel patch for dump user space stack tool
  2012-04-20  9:44 ` Peter Zijlstra
@ 2012-04-24  1:30   ` Yanmin Zhang
  2012-04-24 10:10     ` Peter Zijlstra
  2012-04-24 10:11     ` Peter Zijlstra
  0 siblings, 2 replies; 18+ messages in thread
From: Yanmin Zhang @ 2012-04-24  1:30 UTC (permalink / raw)
  To: Peter Zijlstra, Cong Wang
  Cc: Tu, Xiaobing, akpm, mingo, rusty, linux-kernel, rostedt, Zuo, Jiao

On Fri, 2012-04-20 at 11:44 +0200, Peter Zijlstra wrote:
> On Wed, 2012-04-11 at 08:07 +0000, Tu, Xiaobing wrote:
> > From: xiaobing tu <xiaobing.tu@intel.com> 
> > 
> > Here is the kernel patch for this tool, The idea is to output user
> > space stack call-chain from /proc/xxx/stack,
> > currently, /proc/xxx/stack only output kernel stack call chain. We
> > extend it to output user space call chain in hex format
> > 
> > Signed-off-by: yanmin zhang <yanmin_zhang@linux.intel.com>
> > Signed-off-by: xiaobing tu <xiaobing.tu@intel.com>
> 
> Ok, so I don't like it.. for one I really don't see the need for this,
Sorry. We didn't write down enough information about why we implemented
it. Cong asked the similar question and I explained it in separate emails.
https://lkml.org/lkml/2012/4/19/11
https://lkml.org/lkml/2012/4/19/27

> secondly the implementation is crappy,
I agree it's a little crappy. Other methods like ptrace is slow although
the codes look like clean. When ptrace is slow and might have other
bad impact, could we also say it's crappy?


>  thirdly the interface is poor.
> 
> > ---
> >  arch/x86/kernel/stacktrace.c |   55 ++++++++++++++++++++++++++++++++++++++++++
> >  fs/proc/base.c               |   19 +++++++++++++-
> >  include/linux/stacktrace.h   |    5 +++-
> >  3 files changed, 77 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index fdd0c64..d802f05 100644
> > --- a/arch/x86/kernel/stacktrace.c
> > +++ b/arch/x86/kernel/stacktrace.c
> > @@ -7,6 +7,7 @@
> >  #include <linux/stacktrace.h>
> >  #include <linux/module.h>
> >  #include <linux/uaccess.h>
> > +#include <linux/mm.h>
> >  #include <asm/stacktrace.h>
> >  
> >  static int save_stack_stack(void *data, char *name) @@ -144,3 +145,57 @@ void save_stack_trace_user(struct stack_trace *trace)
> >  		trace->entries[trace->nr_entries++] = ULONG_MAX;  }
> >  
> > +static inline void __save_stack_trace_user_task(struct task_struct *task,
> > +		struct stack_trace *trace)
> > +{
> > +	const struct pt_regs *regs = task_pt_regs(task);
> > +	const void __user *fp;
> > +	unsigned long addr;
> > +
> > +	if (task != current && task->state == TASK_RUNNING && task->on_cpu) {
> > +		/* To trap into kernel at least once */
> > +		smp_send_reschedule(task_cpu(task));
> > +	}
> 
> This doesn't make any sense at all..
ptrace could put the task to a either STOPPED or TRACED state.
But it's time-consuming.

> 
> > +
> > +	fp = (const void __user *)regs->bp;
> > +	if (trace->nr_entries < trace->max_entries)
> > +		trace->entries[trace->nr_entries++] = regs->ip;
> > +
> > +	while (trace->nr_entries < trace->max_entries) {
> > +		struct stack_frame_user frame;
> > +
> > +		frame.next_fp = NULL;
> > +		frame.ret_addr = 0;
> > +
> > +		addr = (unsigned long)fp;
> > +		if (!access_process_vm(task, addr, (void *)&frame,
> > +				sizeof(frame), 0))
> > +			break;
> > +		if ((unsigned long)fp < regs->sp)
> > +			break;
> > +		if (frame.ret_addr) {
> > +			trace->entries[trace->nr_entries++] =
> > +				frame.ret_addr;
> > +		}
> > +		if (fp == frame.next_fp)
> > +			break;
> > +		fp = frame.next_fp;
> > +	}
> > +}
> > +
> > +void save_stack_trace_user_task(struct task_struct *task,
> > +		struct stack_trace *trace)
> > +{
> > +	if (task == current || !task) {
> > +		save_stack_trace_user(trace);
> > +		return;
> > +	}
> > +
> > +	if (task->mm)
> > +		__save_stack_trace_user_task(task, trace);
> > +
> > +	if (trace->nr_entries < trace->max_entries)
> > +		trace->entries[trace->nr_entries++] = ULONG_MAX; } 
> > +EXPORT_SYMBOL_GPL(save_stack_trace_user_task);
> 
> There's already userspace stack walkers, don't reimplement them yet
> again.
Would you like to point out the workable userspace stack walker?
If there is, we would check if we could reuse it.

> 
> > diff --git a/fs/proc/base.c b/fs/proc/base.c index d4548dd..603e708 100644
> > --- a/fs/proc/base.c
> > +++ b/fs/proc/base.c
> > @@ -327,8 +327,25 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
> >  			seq_printf(m, "[<%pK>] %pS\n",
> >  				   (void *)entries[i], (void *)entries[i]);
> >  		}
> > -		unlock_trace(task);
> > +	} else
> > +		goto out;
> > +
> > +	trace.nr_entries	= 0;
> > +	trace.max_entries	= MAX_STACK_TRACE_DEPTH;
> > +	trace.entries		= entries;
> > +	trace.skip		= 0;
> > +
> > +	seq_printf(m, "userspace\n");
> > +
> > +	save_stack_trace_user_task(task, &trace);
> > +
> > +	for (i = 0; i < trace.nr_entries; i++) {
> > +		if (entries[i] != ULONG_MAX)
> > +			seq_printf(m, "%p\n", (void *)entries[i]);
> >  	}
> > +	unlock_trace(task);
> > +
> > +out:
> 
> Writing out just the IPs means you have to have a stored snapshot
> of /proc/$PID/maps around to make any sense of them. This seems a
> relatively poor interface.
Good question.

When implementing it, we had similar questions.
1) Could the parser get the correct data in time, especially when the process
is running fast and doesn't sleep?
2) If /proc/XXX/maps is changing, i.e. it mmaps/munmaps frequently, could
the parser parse data correctly? That's an issue indeed. But Most applications
don't do so.

Currently, in user space, we collect both the HEX stack data and maps,
then save them to a trace file. After all collection is done, we do
the symbol parsing. That mitigates 2) dramatically.

The safest way is to stop the process at a special state, like TASK_STOPPED
or TASK_TRACED. Then, get data and parse. Then, let the process restore. Such
way is used to check more detailed data like variables, and change them by gdb.
But it's too slow and might have bad impact on running system. You might say
it's debugging stuff and debugger should look for a good approach. I don't
think such speaking is to resolve the issue. We need provide more tools to
help developers.

Usage scenario:
A) Performance debug:
When debuggers want to do a quick checking, they don't care if the first
try could get the exact data as system is RUNNING. They would get the data
for many times. So above question 1) doesn't hurt the utilization.
B) Android ANR issue debug: We did root cause some ANR issues with our tools.


Thanks for the comments and sorry for taking you too much time.

Yanmin



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 1/2] kernel patch for dump user space stack tool
  2012-04-24  1:30   ` Yanmin Zhang
@ 2012-04-24 10:10     ` Peter Zijlstra
  2012-04-25  2:58       ` Yanmin Zhang
  2012-04-24 10:11     ` Peter Zijlstra
  1 sibling, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2012-04-24 10:10 UTC (permalink / raw)
  To: Yanmin Zhang
  Cc: Cong Wang, Tu, Xiaobing, akpm, mingo, rusty, linux-kernel,
	rostedt, Zuo, Jiao

On Tue, 2012-04-24 at 09:30 +0800, Yanmin Zhang wrote:
> > > +static inline void __save_stack_trace_user_task(struct task_struct *task,
> > > +           struct stack_trace *trace)
> > > +{
> > > +   const struct pt_regs *regs = task_pt_regs(task);
> > > +   const void __user *fp;
> > > +   unsigned long addr;
> > > +
> > > +   if (task != current && task->state == TASK_RUNNING && task->on_cpu) {
> > > +           /* To trap into kernel at least once */
> > > +           smp_send_reschedule(task_cpu(task));
> > > +   }
> > 
> > This doesn't make any sense at all..
> ptrace could put the task to a either STOPPED or TRACED state.
> But it's time-consuming. 

Yeah, but what is the above meant to achieve? it doesn't actually stop
the task or anything, it will just trap the remote cpu, by the time you
do your stack walk below the cpu might be running another task entirely
or you're walking a life stack with all the 'fun' issues that'll bring.



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 1/2] kernel patch for dump user space stack tool
  2012-04-24  1:30   ` Yanmin Zhang
  2012-04-24 10:10     ` Peter Zijlstra
@ 2012-04-24 10:11     ` Peter Zijlstra
  2012-04-25  2:44       ` Yanmin Zhang
  1 sibling, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2012-04-24 10:11 UTC (permalink / raw)
  To: Yanmin Zhang
  Cc: Cong Wang, Tu, Xiaobing, akpm, mingo, rusty, linux-kernel,
	rostedt, Zuo, Jiao

On Tue, 2012-04-24 at 09:30 +0800, Yanmin Zhang wrote:
> Would you like to point out the workable userspace stack walker?
> If there is, we would check if we could reuse it.
> 
> 
arch/x86/kernel/cpu/perf_event.c:perf_callchain_user(), it also deals
with compat stuffs.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 1/2] kernel patch for dump user space stack tool
  2012-04-24 10:11     ` Peter Zijlstra
@ 2012-04-25  2:44       ` Yanmin Zhang
  0 siblings, 0 replies; 18+ messages in thread
From: Yanmin Zhang @ 2012-04-25  2:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Cong Wang, Tu, Xiaobing, akpm, mingo, rusty, linux-kernel,
	rostedt, Zuo, Jiao

On Tue, 2012-04-24 at 12:11 +0200, Peter Zijlstra wrote:
> On Tue, 2012-04-24 at 09:30 +0800, Yanmin Zhang wrote:
> > Would you like to point out the workable userspace stack walker?
> > If there is, we would check if we could reuse it.
> > 
> > 
> arch/x86/kernel/cpu/perf_event.c:perf_callchain_user(), it also deals
> with compat stuffs.
Yes, it does. But it just collects the user stack call chain of
_current_ task.

When Xiaobing worked out the patch, we did think over if we could implement
it based on perf. We also checked ftrace. Both ftrace and perf collect
user stack of _current_ task.

Xiaobing wrote a similar tool based on ptrace one year ago and gave it up
as it was slow.

I was thinking if we could use the powerful symbol parsing capability of
perf to do the user space parse. I was busy and Xiaobing just changed
his old codes to work out a prototype quickly as other developers pushed hard
for the tool.

Yanmin



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 1/2] kernel patch for dump user space stack tool
  2012-04-24 10:10     ` Peter Zijlstra
@ 2012-04-25  2:58       ` Yanmin Zhang
  0 siblings, 0 replies; 18+ messages in thread
From: Yanmin Zhang @ 2012-04-25  2:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Cong Wang, Tu, Xiaobing, akpm, mingo, rusty, linux-kernel,
	rostedt, Zuo, Jiao

On Tue, 2012-04-24 at 12:10 +0200, Peter Zijlstra wrote:
> On Tue, 2012-04-24 at 09:30 +0800, Yanmin Zhang wrote:
> > > > +static inline void __save_stack_trace_user_task(struct task_struct *task,
> > > > +           struct stack_trace *trace)
> > > > +{
> > > > +   const struct pt_regs *regs = task_pt_regs(task);
> > > > +   const void __user *fp;
> > > > +   unsigned long addr;
> > > > +
> > > > +   if (task != current && task->state == TASK_RUNNING && task->on_cpu) {
> > > > +           /* To trap into kernel at least once */
> > > > +           smp_send_reschedule(task_cpu(task));
> > > > +   }
> > > 
> > > This doesn't make any sense at all..
> > ptrace could put the task to a either STOPPED or TRACED state.
> > But it's time-consuming. 
> 
> Yeah, but what is the above meant to achieve? it doesn't actually stop
> the task or anything, it will just trap the remote cpu, by the time you
> do your stack walk below the cpu might be running another task entirely
> or you're walking a life stack with all the 'fun' issues that'll bring.
When we access the user space stack, it's based on _task_, not cpu.

The IPI is to make sure the task could trap into kernel at least once,
so we could get its regs->bp. If the task is running on another cpu
for a long time, the regs->bp might be too old. I am also a little worried
about that if the task restores to user space to run quickly after the IPI,
regs->bp might be ruined. If it's true, we might get bad data, or couldn't
get useful data.

See below codes.
+       const struct pt_regs *regs = task_pt_regs(task);
Above pt_regs is task's, not current's.

+       const void __user *fp;
+       unsigned long addr;
+
+       if (task != current && task->state == TASK_RUNNING && task->on_cpu) {
+               /* To trap into kernel at least once */
+               smp_send_reschedule(task_cpu(task));
+       }
+
+       fp = (const void __user *)regs->bp;
+       if (trace->nr_entries < trace->max_entries)
+               trace->entries[trace->nr_entries++] = regs->ip;
+
+       while (trace->nr_entries < trace->max_entries) {
+               struct stack_frame_user frame;
+
+               frame.next_fp = NULL;
+               frame.ret_addr = 0;
+
+               addr = (unsigned long)fp;
+               if (!access_process_vm(task, addr, (void *)&frame,
+                               sizeof(frame), 0))
Above line would access the task's user space stack.

We implemented the tool based on real requirement and it's not perfect. So
we need your expertise help.

Thanks for the comments.

Yanmin



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 1/2] kernel patch for dump user space stack tool
  2012-04-20  9:54     ` Peter Zijlstra
@ 2012-04-24  2:19       ` Yanmin Zhang
  0 siblings, 0 replies; 18+ messages in thread
From: Yanmin Zhang @ 2012-04-24  2:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Cong Wang, Tu, Xiaobing, Lin Ming, akpm, mingo, rusty,
	linux-kernel, rostedt, Zuo, Jiao

On Fri, 2012-04-20 at 11:54 +0200, Peter Zijlstra wrote:
> On Thu, 2012-04-19 at 13:17 +0800, Yanmin Zhang wrote:
> > 1) We could collect the HEX-format call chain data and /proc/XXX/maps
> > of all the processes quickly, then parse them either after rebooting, or
> > after the issue is reported. It could catch the scene just at the time point
> > when the error happens. Our experiments shows the tool could collect the data
> > of all processes within 200ms.
> 
> No you can't, ever heard of address space randomization?
No. I googled it a moment ago. Here is my understanding.
ALSR is a security feature. OS arranges the mmap areas randomly. It means
the mmap space might be changed when the same executable runs twice.

Is my understanding correct?

Answer:
With our tool, we collect both user space stack data and /proc/XXX/maps,
and save them to a trace file. Then, parse them either immediately, or
after system reboots.

> 
> > 2) The new tool won't stop the processes and have less impact on them.
> > Considering a scenario of performance bottleneck investigation, statistics collection
> > shouldn't have big impact on running processes. 
> 
> Maybe.. on these tiny systems you're working on most tasks will not be
> runnable anyway since you only have 1 (maybe 2) cpus and what's running
> is your dumper process, so most everything isn't runnable, attaching and
> dumping stack of all tasks isn't really much more expensive than this.
I raised at least 2 usage scenarios. The one is Android OS ANR issue.
The other is the performance bottleneck investigation on _server_.
Android OS does run on a small system with 1 to 2 cpu (might with 4 cores,
but not popular now). It's not so simple like what you said to collect the
user space stacks of all processes by ptrace interface. We did the experiment
and the collection is time-consuming, sometimes even not endurable.

In addition, We extended the patch on our system to dump the user stacks of
all processes when system hangs. Current patch sent to LKML doesn't include it.

> the open/read/close you do on the proc files, along with the readdir
> etc.. are system-calls just like the ptrace alternative.
Good point.
1) With ptrace, there is a syscall when fetching only one call frame.
With our tool, there is only one mostly.
2) With ptrace, we need stop the processes. With Android OS on a small
system, it seems ok like what you said. But with performance tuning on
a large server, it's not ok.

> 
> > 3) It could support both i386 and x86-64. I tried pstack and it doesn't work
> > with x86-64.
> 
> Yeah, and you'll need to extend it to ARM/MIPS/etc..
It's a problem. We implement it on x86 firstly. If it's good, others would
port it to other platforms.

>  whereas there is
> plenty of userspace around that can already work on all those platforms
> -- if pstack cannot its weird, I'd think it would use all the regular
> binutils muck that already supports all the platforms.
Would you like to give me a pointer about the tools in binutils?

> 
> > 4) It follows /proc/XXX/stack interface and it's easy to use it.
> 
> Uhm, not so very much, see your ASLR issue. 


> Furthermore it requires all
> userspace be build with framepointers enabled -- which I think would be
> a good thing anyway -- but with which reality seems to disagree.
You are right indeed. The tool is for debugger and developers.

In addition, I am thinking if we might extend the tool to dump user stack
with blur (or not precise) data, which could just dump the stack from $esp.
The final symbol dump looks like the symbol lines with ? in the output of
dump_stack.
For example, we define the shorted distance between calling in the stack,
and check if the data in the stack maps in a real VMA.
With the blur data, developers could get some good hint at least. As you know,
sometimes, we couldn't get the source codes of some libraries and recompile
them.

Thanks for the kind comments.

Yanmin



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 1/2] kernel patch for dump user space stack tool
  2012-04-20  9:38     ` Peter Zijlstra
@ 2012-04-24  0:56       ` Yanmin Zhang
  0 siblings, 0 replies; 18+ messages in thread
From: Yanmin Zhang @ 2012-04-24  0:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Cong Wang, Tu, Xiaobing, Lin Ming, akpm, mingo, rusty,
	linux-kernel, rostedt, Zuo, Jiao

On Fri, 2012-04-20 at 11:38 +0200, Peter Zijlstra wrote:
> On Thu, 2012-04-19 at 13:17 +0800, Yanmin Zhang wrote:
> > Although not checking the source codes of pstack (sorry, I'm busy in debugging
> > many critical issues), I think pstack is based on ptrace interface, which means:
> > 1) It need traps into system for many times to collect call frames of one
> > task.
> > 2) It need send signal to the ptraced process to stop it. Such behavior
> > might have some impact if the ptraced process also processes many signals.
> 
> Yeah, but who cares.. its debugging stuff..
Real developers, real debuggers care it. End users don't care it.

> 
> > 3) The data parsing to get symbols might not be split from data collection.
> > I mean, it collects call frames of one process, then parses it; then collects the 2nd
> > task's. If there are many processes, it couldn't collect the data just at the monitor
> > time point. 
> 
> This is equally true for your silly patch.
Not true for my patch. We did many experiments. Originally, we used
the similar method like pstack based on ptrace and found it's
very slow to do so when getting all the stacks of all processes
when system reports an issue.

Anyway, thanks for taking your time to look at it.



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 1/2] kernel patch for dump user space stack tool
  2012-04-19  5:17   ` Yanmin Zhang
  2012-04-19  6:13     ` Cong Wang
  2012-04-20  9:38     ` Peter Zijlstra
@ 2012-04-20  9:54     ` Peter Zijlstra
  2012-04-24  2:19       ` Yanmin Zhang
  2 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2012-04-20  9:54 UTC (permalink / raw)
  To: Yanmin Zhang
  Cc: Cong Wang, Tu, Xiaobing, Lin Ming, akpm, mingo, rusty,
	linux-kernel, rostedt, Zuo, Jiao

On Thu, 2012-04-19 at 13:17 +0800, Yanmin Zhang wrote:
> 1) We could collect the HEX-format call chain data and /proc/XXX/maps
> of all the processes quickly, then parse them either after rebooting, or
> after the issue is reported. It could catch the scene just at the time point
> when the error happens. Our experiments shows the tool could collect the data
> of all processes within 200ms.

No you can't, ever heard of address space randomization?

> 2) The new tool won't stop the processes and have less impact on them.
> Considering a scenario of performance bottleneck investigation, statistics collection
> shouldn't have big impact on running processes. 

Maybe.. on these tiny systems you're working on most tasks will not be
runnable anyway since you only have 1 (maybe 2) cpus and what's running
is your dumper process, so most everything isn't runnable, attaching and
dumping stack of all tasks isn't really much more expensive than this.

the open/read/close you do on the proc files, along with the readdir
etc.. are system-calls just like the ptrace alternative.

> 3) It could support both i386 and x86-64. I tried pstack and it doesn't work
> with x86-64.

Yeah, and you'll need to extend it to ARM/MIPS/etc.. whereas there is
plenty of userspace around that can already work on all those platforms
-- if pstack cannot its weird, I'd think it would use all the regular
binutils muck that already supports all the platforms.

> 4) It follows /proc/XXX/stack interface and it's easy to use it.

Uhm, not so very much, see your ASLR issue. Furthermore it requires all
userspace be build with framepointers enabled -- which I think would be
a good thing anyway -- but with which reality seems to disagree.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 1/2] kernel patch for dump user space stack tool
  2012-04-19  5:17   ` Yanmin Zhang
  2012-04-19  6:13     ` Cong Wang
@ 2012-04-20  9:38     ` Peter Zijlstra
  2012-04-24  0:56       ` Yanmin Zhang
  2012-04-20  9:54     ` Peter Zijlstra
  2 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2012-04-20  9:38 UTC (permalink / raw)
  To: Yanmin Zhang
  Cc: Cong Wang, Tu, Xiaobing, Lin Ming, akpm, mingo, rusty,
	linux-kernel, rostedt, Zuo, Jiao

On Thu, 2012-04-19 at 13:17 +0800, Yanmin Zhang wrote:
> Although not checking the source codes of pstack (sorry, I'm busy in debugging
> many critical issues), I think pstack is based on ptrace interface, which means:
> 1) It need traps into system for many times to collect call frames of one
> task.
> 2) It need send signal to the ptraced process to stop it. Such behavior
> might have some impact if the ptraced process also processes many signals.

Yeah, but who cares.. its debugging stuff..

> 3) The data parsing to get symbols might not be split from data collection.
> I mean, it collects call frames of one process, then parses it; then collects the 2nd
> task's. If there are many processes, it couldn't collect the data just at the monitor
> time point. 

This is equally true for your silly patch.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 1/2] kernel patch for dump user space stack tool
  2012-04-19  6:13     ` Cong Wang
@ 2012-04-19  6:28       ` Yanmin Zhang
  0 siblings, 0 replies; 18+ messages in thread
From: Yanmin Zhang @ 2012-04-19  6:28 UTC (permalink / raw)
  To: Cong Wang
  Cc: Tu, Xiaobing, Lin Ming, akpm, mingo, rusty, a.p.zijlstra,
	linux-kernel, rostedt, Zuo, Jiao

On Thu, 2012-04-19 at 14:13 +0800, Cong Wang wrote:
> On 04/19/2012 01:17 PM, Yanmin Zhang wrote:
> > On Thu, 2012-04-19 at 11:50 +0800, Cong Wang wrote:
> >> On 04/17/2012 10:37 PM, Tu, Xiaobing wrote:
> >>> Resend the patch because of the log is too long on a single line.
> >>>
> >>> From: xiaobing tu<xiaobing.tu@intel.com>
> >>>
> >>> Here is the kernel patch for this tool, The idea is to output user space stack call-chain from
> >>> /proc/xxx/stack, currently, /proc/xxx/stack only output kernel stack call chain. We extend
> >>> it to output user space call chain in hex format
> >>>
> >>
> >> Can you teach me why we still need this as we have pstack?
> > Cong,
> >
> > Sorry for replying so late. Xiaobing told me you sent him email and I
> > didn't receive the 1st one you sent out.
> 
> 
> Based on the length of your reply and the description of the patch, you 
> hide lots of information in your patch description.
Indeed, we need add more info there.

> 
> >
> > I tried pstack and it does work. It means developers in the world wanted
> > the tool long long ago.
> >
> > Although not checking the source codes of pstack (sorry, I'm busy in debugging
> > many critical issues), I think pstack is based on ptrace interface, which means:
> > 1) It need traps into system for many times to collect call frames of one
> > task.
> > 2) It need send signal to the ptraced process to stop it. Such behavior
> > might have some impact if the ptraced process also processes many signals.
> > 3) The data parsing to get symbols might not be split from data collection.
> > I mean, it collects call frames of one process, then parses it; then collects the 2nd
> > task's. If there are many processes, it couldn't collect the data just at the monitor
> > time point.
> 
> 
> Yet another one who wants to "fix" ptrace. ;-)
Agree. But usually, it's hard to fix very old codes. Ptrace is used by gdb
and people don't touch the kernel part.

> 
> >
> > Why do we work out the tools? The original requirement is from real work.
> > We are enabling Android on Medfield. One typical error of Android is ANR.
> > When a process couldn't respond in 5 seconds, Android reports an ANR error,
> > and dumps JAVA call stack. However, it couldn't dump userspace lib (such like
> > bionic, written by C or C++). In addition, Android just dumps the stack of
> > the non-responding process. It doesn't dump stack of others. As binder is basic
> > framework in Android, processes communicate by binder in the model of client/server.
> > When one process is not responding quickly, maybe another process blocks it. We
> > need dump that process status.
> >
> > Many teams complained it's hard to debug such ANR issues, especially the ones which
> > are triggered at MTBF testing. Sometimes, an ANR happens after MTBF testing runs
> > for one week. Developers ask us to implement such tool over and over again.
> >
> > Besides ANR, sometimes, system might not respond to any user operation. Usually,
> > kernel or firmware would reset system. At that time, we also need get the call
> > chains of all the user space processes before system is reset.
> 
> 
> I am not familiar with Andriod at all, so a quick question is if this is 
> only for Andriod, why you introduce this for all? IOW, why not provide a 
> Kconfig?
Although working on Android, we think it might be useful to use the tool to resolve similar
issue. For example, I worked on performance tuning years ago and got headache why
there was performance drop on a large-scale server. From kernel part, I couldn't
find enough info to debug it. Eventually, I root caused some issues by gdb attach,
then manually checking the user space call chain. It's painful.

In addition, the new tool consists of kernel patch and user space parse tool.
The kernel patch is quite simple and shouldn't hurt system. It reuses
existing CONFIG_USER_STACKTRACE_SUPPORT.

> 
> BTW, I am sure you need to put the above paragraphs into your patch 
> description, to make it clear why the patch is needed.
It's a good idea definitely.

> 
> >
> > With our tool,
> > 1) We could collect the HEX-format call chain data and /proc/XXX/maps
> > of all the processes quickly, then parse them either after rebooting, or
> > after the issue is reported. It could catch the scene just at the time point
> > when the error happens. Our experiments shows the tool could collect the data
> > of all processes within 200ms.
> > 2) The new tool won't stop the processes and have less impact on them.
> > Considering a scenario of performance bottleneck investigation, statistics collection
> > shouldn't have big impact on running processes.
> > 3) It could support both i386 and x86-64. I tried pstack and it doesn't work
> > with x86-64.
> > 4) It follows /proc/XXX/stack interface and it's easy to use it.
> >
> > Besides this tool, we are considering to extend it to collect user space
> > call chain of current process from kernel when kernel detects some other
> > abnormal behavior.
> >
> 
> In my previous reply, I ran 'pstrack' on my x86-64 machine, don't 
> understand why you said it doesn't work with x86-64? I guess pstack 
> supports more than just x86, as ptrace is available in other arch's too.
Ok. I use the latest ubuntu on my workstation and apt-get to install
pstack without recompiling it. The default pstack executable reported
failure on 64bit os. I was wrong and might check pstack again.

Thanks for the information.

Yanmin





^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 1/2] kernel patch for dump user space stack tool
  2012-04-19  5:17   ` Yanmin Zhang
@ 2012-04-19  6:13     ` Cong Wang
  2012-04-19  6:28       ` Yanmin Zhang
  2012-04-20  9:38     ` Peter Zijlstra
  2012-04-20  9:54     ` Peter Zijlstra
  2 siblings, 1 reply; 18+ messages in thread
From: Cong Wang @ 2012-04-19  6:13 UTC (permalink / raw)
  To: Yanmin Zhang
  Cc: Tu, Xiaobing, Lin Ming, akpm, mingo, rusty, a.p.zijlstra,
	linux-kernel, rostedt, Zuo, Jiao

On 04/19/2012 01:17 PM, Yanmin Zhang wrote:
> On Thu, 2012-04-19 at 11:50 +0800, Cong Wang wrote:
>> On 04/17/2012 10:37 PM, Tu, Xiaobing wrote:
>>> Resend the patch because of the log is too long on a single line.
>>>
>>> From: xiaobing tu<xiaobing.tu@intel.com>
>>>
>>> Here is the kernel patch for this tool, The idea is to output user space stack call-chain from
>>> /proc/xxx/stack, currently, /proc/xxx/stack only output kernel stack call chain. We extend
>>> it to output user space call chain in hex format
>>>
>>
>> Can you teach me why we still need this as we have pstack?
> Cong,
>
> Sorry for replying so late. Xiaobing told me you sent him email and I
> didn't receive the 1st one you sent out.


Based on the length of your reply and the description of the patch, you 
hide lots of information in your patch description.

>
> I tried pstack and it does work. It means developers in the world wanted
> the tool long long ago.
>
> Although not checking the source codes of pstack (sorry, I'm busy in debugging
> many critical issues), I think pstack is based on ptrace interface, which means:
> 1) It need traps into system for many times to collect call frames of one
> task.
> 2) It need send signal to the ptraced process to stop it. Such behavior
> might have some impact if the ptraced process also processes many signals.
> 3) The data parsing to get symbols might not be split from data collection.
> I mean, it collects call frames of one process, then parses it; then collects the 2nd
> task's. If there are many processes, it couldn't collect the data just at the monitor
> time point.


Yet another one who wants to "fix" ptrace. ;-)

>
> Why do we work out the tools? The original requirement is from real work.
> We are enabling Android on Medfield. One typical error of Android is ANR.
> When a process couldn't respond in 5 seconds, Android reports an ANR error,
> and dumps JAVA call stack. However, it couldn't dump userspace lib (such like
> bionic, written by C or C++). In addition, Android just dumps the stack of
> the non-responding process. It doesn't dump stack of others. As binder is basic
> framework in Android, processes communicate by binder in the model of client/server.
> When one process is not responding quickly, maybe another process blocks it. We
> need dump that process status.
>
> Many teams complained it's hard to debug such ANR issues, especially the ones which
> are triggered at MTBF testing. Sometimes, an ANR happens after MTBF testing runs
> for one week. Developers ask us to implement such tool over and over again.
>
> Besides ANR, sometimes, system might not respond to any user operation. Usually,
> kernel or firmware would reset system. At that time, we also need get the call
> chains of all the user space processes before system is reset.


I am not familiar with Andriod at all, so a quick question is if this is 
only for Andriod, why you introduce this for all? IOW, why not provide a 
Kconfig?

BTW, I am sure you need to put the above paragraphs into your patch 
description, to make it clear why the patch is needed.

>
> With our tool,
> 1) We could collect the HEX-format call chain data and /proc/XXX/maps
> of all the processes quickly, then parse them either after rebooting, or
> after the issue is reported. It could catch the scene just at the time point
> when the error happens. Our experiments shows the tool could collect the data
> of all processes within 200ms.
> 2) The new tool won't stop the processes and have less impact on them.
> Considering a scenario of performance bottleneck investigation, statistics collection
> shouldn't have big impact on running processes.
> 3) It could support both i386 and x86-64. I tried pstack and it doesn't work
> with x86-64.
> 4) It follows /proc/XXX/stack interface and it's easy to use it.
>
> Besides this tool, we are considering to extend it to collect user space
> call chain of current process from kernel when kernel detects some other
> abnormal behavior.
>

In my previous reply, I ran 'pstrack' on my x86-64 machine, don't 
understand why you said it doesn't work with x86-64? I guess pstack 
supports more than just x86, as ptrace is available in other arch's too.

Thanks.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 1/2] kernel patch for dump user space stack tool
  2012-04-19  3:50 ` Cong Wang
@ 2012-04-19  5:17   ` Yanmin Zhang
  2012-04-19  6:13     ` Cong Wang
                       ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Yanmin Zhang @ 2012-04-19  5:17 UTC (permalink / raw)
  To: Cong Wang
  Cc: Tu, Xiaobing, Lin Ming, akpm, mingo, rusty, a.p.zijlstra,
	linux-kernel, rostedt, Zuo, Jiao

On Thu, 2012-04-19 at 11:50 +0800, Cong Wang wrote:
> On 04/17/2012 10:37 PM, Tu, Xiaobing wrote:
> > Resend the patch because of the log is too long on a single line.
> >
> > From: xiaobing tu<xiaobing.tu@intel.com>
> >
> > Here is the kernel patch for this tool, The idea is to output user space stack call-chain from
> > /proc/xxx/stack, currently, /proc/xxx/stack only output kernel stack call chain. We extend
> > it to output user space call chain in hex format
> >
> 
> Can you teach me why we still need this as we have pstack?
Cong,

Sorry for replying so late. Xiaobing told me you sent him email and I
didn't receive the 1st one you sent out.

I tried pstack and it does work. It means developers in the world wanted
the tool long long ago.

Although not checking the source codes of pstack (sorry, I'm busy in debugging
many critical issues), I think pstack is based on ptrace interface, which means:
1) It need traps into system for many times to collect call frames of one
task.
2) It need send signal to the ptraced process to stop it. Such behavior
might have some impact if the ptraced process also processes many signals.
3) The data parsing to get symbols might not be split from data collection.
I mean, it collects call frames of one process, then parses it; then collects the 2nd
task's. If there are many processes, it couldn't collect the data just at the monitor
time point.

Why do we work out the tools? The original requirement is from real work.
We are enabling Android on Medfield. One typical error of Android is ANR.
When a process couldn't respond in 5 seconds, Android reports an ANR error,
and dumps JAVA call stack. However, it couldn't dump userspace lib (such like
bionic, written by C or C++). In addition, Android just dumps the stack of
the non-responding process. It doesn't dump stack of others. As binder is basic
framework in Android, processes communicate by binder in the model of client/server.
When one process is not responding quickly, maybe another process blocks it. We
need dump that process status.

Many teams complained it's hard to debug such ANR issues, especially the ones which
are triggered at MTBF testing. Sometimes, an ANR happens after MTBF testing runs
for one week. Developers ask us to implement such tool over and over again.

Besides ANR, sometimes, system might not respond to any user operation. Usually,
kernel or firmware would reset system. At that time, we also need get the call
chains of all the user space processes before system is reset.

With our tool,
1) We could collect the HEX-format call chain data and /proc/XXX/maps
of all the processes quickly, then parse them either after rebooting, or
after the issue is reported. It could catch the scene just at the time point
when the error happens. Our experiments shows the tool could collect the data
of all processes within 200ms.
2) The new tool won't stop the processes and have less impact on them.
Considering a scenario of performance bottleneck investigation, statistics collection
shouldn't have big impact on running processes. 
3) It could support both i386 and x86-64. I tried pstack and it doesn't work
with x86-64.
4) It follows /proc/XXX/stack interface and it's easy to use it.

Besides this tool, we are considering to extend it to collect user space
call chain of current process from kernel when kernel detects some other
abnormal behavior.

Thanks for your kind comments and welcome to try it.

Yanmin

> 
> ~% pstack $$
> #0  0x00000036ae2365da in sigsuspend () from /lib64/libc.so.6
> #1  0x0000000000472c25 in signal_suspend ()
> #2  0x0000000000443323 in ?? ()
> #3  0x0000000000443be6 in waitjobs ()
> #4  0x000000000042b6d3 in ?? ()
> #5  0x000000000042c0bd in execlist ()
> #6  0x000000000042c64f in execode ()
> #7  0x000000000043cd32 in loop ()
> #8  0x000000000043fb36 in zsh_main ()
> #9  0x00000036ae22169d in __libc_start_main () from /lib64/libc.so.6
> #10 0x000000000040e571 in _start ()
> 
> Thanks!



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC 1/2] kernel patch for dump user space stack tool
  2012-04-17 14:37 Tu, Xiaobing
@ 2012-04-19  3:50 ` Cong Wang
  2012-04-19  5:17   ` Yanmin Zhang
  0 siblings, 1 reply; 18+ messages in thread
From: Cong Wang @ 2012-04-19  3:50 UTC (permalink / raw)
  To: Tu, Xiaobing
  Cc: Lin Ming, akpm, mingo, rusty, a.p.zijlstra, linux-kernel,
	yanmin_zhang, rostedt, Zuo, Jiao

On 04/17/2012 10:37 PM, Tu, Xiaobing wrote:
> Resend the patch because of the log is too long on a single line.
>
> From: xiaobing tu<xiaobing.tu@intel.com>
>
> Here is the kernel patch for this tool, The idea is to output user space stack call-chain from
> /proc/xxx/stack, currently, /proc/xxx/stack only output kernel stack call chain. We extend
> it to output user space call chain in hex format
>

Can you teach me why we still need this as we have pstack?

~% pstack $$
#0  0x00000036ae2365da in sigsuspend () from /lib64/libc.so.6
#1  0x0000000000472c25 in signal_suspend ()
#2  0x0000000000443323 in ?? ()
#3  0x0000000000443be6 in waitjobs ()
#4  0x000000000042b6d3 in ?? ()
#5  0x000000000042c0bd in execlist ()
#6  0x000000000042c64f in execode ()
#7  0x000000000043cd32 in loop ()
#8  0x000000000043fb36 in zsh_main ()
#9  0x00000036ae22169d in __libc_start_main () from /lib64/libc.so.6
#10 0x000000000040e571 in _start ()

Thanks!

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [RFC 1/2] kernel patch for dump user space stack tool
@ 2012-04-17 14:37 Tu, Xiaobing
  2012-04-19  3:50 ` Cong Wang
  0 siblings, 1 reply; 18+ messages in thread
From: Tu, Xiaobing @ 2012-04-17 14:37 UTC (permalink / raw)
  To: Lin Ming, Tu, Xiaobing, akpm, mingo, rusty, a.p.zijlstra,
	linux-kernel, yanmin_zhang, rostedt
  Cc: Zuo, Jiao

Resend the patch because of the log is too long on a single line.

From: xiaobing tu <xiaobing.tu@intel.com> 

Here is the kernel patch for this tool, The idea is to output user space stack call-chain from 
/proc/xxx/stack, currently, /proc/xxx/stack only output kernel stack call chain. We extend 
it to output user space call chain in hex format

Signed-off-by: yanmin zhang <yanmin_zhang@linux.intel.com>
Signed-off-by: xiaobing tu <xiaobing.tu@intel.com>
---
 arch/x86/kernel/stacktrace.c |   55 ++++++++++++++++++++++++++++++++++++++++++
 fs/proc/base.c               |   19 +++++++++++++-
 include/linux/stacktrace.h   |    5 +++-
 3 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index fdd0c64..d802f05 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -7,6 +7,7 @@
 #include <linux/stacktrace.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
+#include <linux/mm.h>
 #include <asm/stacktrace.h>
 
 static int save_stack_stack(void *data, char *name) @@ -144,3 +145,57 @@ void save_stack_trace_user(struct stack_trace *trace)
 		trace->entries[trace->nr_entries++] = ULONG_MAX;  }
 
+static inline void __save_stack_trace_user_task(struct task_struct *task,
+		struct stack_trace *trace)
+{
+	const struct pt_regs *regs = task_pt_regs(task);
+	const void __user *fp;
+	unsigned long addr;
+
+	if (task != current && task->state == TASK_RUNNING && task->on_cpu) {
+		/* To trap into kernel at least once */
+		smp_send_reschedule(task_cpu(task));
+	}
+
+	fp = (const void __user *)regs->bp;
+	if (trace->nr_entries < trace->max_entries)
+		trace->entries[trace->nr_entries++] = regs->ip;
+
+	while (trace->nr_entries < trace->max_entries) {
+		struct stack_frame_user frame;
+
+		frame.next_fp = NULL;
+		frame.ret_addr = 0;
+
+		addr = (unsigned long)fp;
+		if (!access_process_vm(task, addr, (void *)&frame,
+				sizeof(frame), 0))
+			break;
+		if ((unsigned long)fp < regs->sp)
+			break;
+		if (frame.ret_addr) {
+			trace->entries[trace->nr_entries++] =
+				frame.ret_addr;
+		}
+		if (fp == frame.next_fp)
+			break;
+		fp = frame.next_fp;
+	}
+}
+
+void save_stack_trace_user_task(struct task_struct *task,
+		struct stack_trace *trace)
+{
+	if (task == current || !task) {
+		save_stack_trace_user(trace);
+		return;
+	}
+
+	if (task->mm)
+		__save_stack_trace_user_task(task, trace);
+
+	if (trace->nr_entries < trace->max_entries)
+		trace->entries[trace->nr_entries++] = ULONG_MAX; } 
+EXPORT_SYMBOL_GPL(save_stack_trace_user_task);
+
diff --git a/fs/proc/base.c b/fs/proc/base.c index d4548dd..603e708 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -327,8 +327,25 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
 			seq_printf(m, "[<%pK>] %pS\n",
 				   (void *)entries[i], (void *)entries[i]);
 		}
-		unlock_trace(task);
+	} else
+		goto out;
+
+	trace.nr_entries	= 0;
+	trace.max_entries	= MAX_STACK_TRACE_DEPTH;
+	trace.entries		= entries;
+	trace.skip		= 0;
+
+	seq_printf(m, "userspace\n");
+
+	save_stack_trace_user_task(task, &trace);
+
+	for (i = 0; i < trace.nr_entries; i++) {
+		if (entries[i] != ULONG_MAX)
+			seq_printf(m, "%p\n", (void *)entries[i]);
 	}
+	unlock_trace(task);
+
+out:
 	kfree(entries);
 
 	return err;
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 115b570..65e9ae4 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -23,8 +23,11 @@ extern void print_stack_trace(struct stack_trace *trace, int spaces);
 
 #ifdef CONFIG_USER_STACKTRACE_SUPPORT
 extern void save_stack_trace_user(struct stack_trace *trace);
+extern void save_stack_trace_user_task(struct task_struct *task,
+		struct stack_trace *trace);
 #else
-# define save_stack_trace_user(trace)              do { } while (0)
+# define save_stack_trace_user(trace)			do { } while (0)
+# define save_stack_trace_user_task(task, trace)	do { } while (0)
 #endif
 
 #else
--
1.7.6

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2012-04-25  2:57 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-04-11  8:07 [RFC 1/2] kernel patch for dump user space stack tool Tu, Xiaobing
2012-04-17  4:43 ` Lin Ming
2012-04-17 14:38   ` Tu, Xiaobing
2012-04-20  9:44 ` Peter Zijlstra
2012-04-24  1:30   ` Yanmin Zhang
2012-04-24 10:10     ` Peter Zijlstra
2012-04-25  2:58       ` Yanmin Zhang
2012-04-24 10:11     ` Peter Zijlstra
2012-04-25  2:44       ` Yanmin Zhang
2012-04-17 14:37 Tu, Xiaobing
2012-04-19  3:50 ` Cong Wang
2012-04-19  5:17   ` Yanmin Zhang
2012-04-19  6:13     ` Cong Wang
2012-04-19  6:28       ` Yanmin Zhang
2012-04-20  9:38     ` Peter Zijlstra
2012-04-24  0:56       ` Yanmin Zhang
2012-04-20  9:54     ` Peter Zijlstra
2012-04-24  2:19       ` Yanmin Zhang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.