[PATCH 0/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 0/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
@ 2010-11-10 11:56 Jiri Olsa
  2010-11-10 11:56 ` [PATCH 1/2] " Jiri Olsa
  2010-11-10 11:56 ` [PATCH 2/2] tracing - fix recursive user stack trace Jiri Olsa
  0 siblings, 2 replies; 27+ messages in thread
From: Jiri Olsa @ 2010-11-10 11:56 UTC (permalink / raw)
  To: mingo, rostedt, andi, lwoodman; +Cc: linux-kernel

This provides a tracepoint to trace kernel pagefault event
for x86 and x86_64 architectures.

There's a problem with this tracepoint when having the
userstacktrace option enabled, since it might generated
the page fault itself. The 2/2 patch address this issue.

attached patches:
1/2 tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
2/2 tracing - fix recursive user stack trace

wbr,
jirka
---
 arch/x86/mm/fault.c         |   32 +++++++++++++++++++++-----------
 include/trace/events/kmem.h |   22 ++++++++++++++++++++++
 kernel/trace/trace.c        |   19 +++++++++++++++++++
 3 files changed, 62 insertions(+), 11 deletions(-)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 1/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-10 11:56 [PATCH 0/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64 Jiri Olsa
@ 2010-11-10 11:56 ` Jiri Olsa
  2010-11-10 13:29   ` Christoph Hellwig
  2010-11-10 11:56 ` [PATCH 2/2] tracing - fix recursive user stack trace Jiri Olsa
  1 sibling, 1 reply; 27+ messages in thread
From: Jiri Olsa @ 2010-11-10 11:56 UTC (permalink / raw)
  To: mingo, rostedt, andi, lwoodman; +Cc: linux-kernel, Jiri Olsa

This provides a tracepoint to trace kernel pagefault event.

When analyzing a vmcore resulting from a kernel failure, we
_often_ hypothesize that "there should have a pagefault event
just before this instruction" or similar.  Sometimes it means
that there should have a small delay between instructions that
extends a critical session and exposed a missing lock.  Since
there have been no evidence of kernel pagefault, it is quite
difficult to adopt the hypothesis.

If we can trace the kernel pagefault event, it will help narrow
the possible cause of failure and will accelerate the
investigation a lot.


Signed-off-by: Larry Woodman <lwoodman@redhat.com>
Signed-off-by: Jiri Olsa <jolsa@redhat.com>
---
 arch/x86/mm/fault.c         |   32 +++++++++++++++++++++-----------
 include/trace/events/kmem.h |   22 ++++++++++++++++++++++
 2 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7d90ceb..f776c45 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -12,6 +12,7 @@
 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
 #include <linux/perf_event.h>		/* perf_sw_event		*/
 #include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
+#include <trace/events/kmem.h>
 
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
@@ -944,17 +945,10 @@ static int fault_in_kernel_space(unsigned long address)
 	return address >= TASK_SIZE_MAX;
 }
 
-/*
- * This routine handles page faults.  It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
- */
-dotraplinkage void __kprobes
-do_page_fault(struct pt_regs *regs, unsigned long error_code)
+static inline void __do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long error_code)
 {
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
-	unsigned long address;
 	struct mm_struct *mm;
 	int fault;
 	int write = error_code & PF_WRITE;
@@ -964,9 +958,6 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	tsk = current;
 	mm = tsk->mm;
 
-	/* Get the faulting address: */
-	address = read_cr2();
-
 	/*
 	 * Detect and handle instructions that would cause a page fault for
 	 * both a tracked kernel page and a userspace page.
@@ -1158,3 +1149,22 @@ good_area:
 
 	up_read(&mm->mmap_sem);
 }
+
+/*
+ * This routine handles page faults.  It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ */
+dotraplinkage void __kprobes
+do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+	unsigned long address;
+
+	/* Get the faulting address: */
+	address = read_cr2();
+
+	__do_page_fault(regs, address, error_code);
+
+	if (!user_mode(regs))
+		trace_mm_kernel_pagefault(current, address, regs);
+}
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index a9c87ad..f14535b 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -302,6 +302,28 @@ TRACE_EVENT(mm_page_alloc_extfrag,
 		__entry->alloc_migratetype == __entry->fallback_migratetype)
 );
 
+TRACE_EVENT(mm_kernel_pagefault,
+
+	TP_PROTO(struct task_struct *task, unsigned long address, struct pt_regs *regs),
+
+	TP_ARGS(task, address, regs),
+
+	TP_STRUCT__entry(
+		__field(struct task_struct *, task)
+		__field(unsigned long, address)
+		__field(struct pt_regs *, regs)
+	),
+
+	TP_fast_assign(
+		__entry->task = task;
+		__entry->address = address;
+		__entry->regs = regs;
+	),
+
+	TP_printk("task=%lx, address=%lx, regs=%lx",
+		(unsigned long)__entry->task, (unsigned long)__entry->address,
+			__entry->regs)
+	);
 #endif /* _TRACE_KMEM_H */
 
 /* This part must be outside protection */
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 2/2] tracing - fix recursive user stack trace
  2010-11-10 11:56 [PATCH 0/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64 Jiri Olsa
  2010-11-10 11:56 ` [PATCH 1/2] " Jiri Olsa
@ 2010-11-10 11:56 ` Jiri Olsa
  2010-11-11  0:13   ` Li Zefan
  2010-11-18 14:05   ` [tip:perf/core] tracing: Fix " tip-bot for Steven Rostedt
  1 sibling, 2 replies; 27+ messages in thread
From: Jiri Olsa @ 2010-11-10 11:56 UTC (permalink / raw)
  To: mingo, rostedt, andi, lwoodman; +Cc: linux-kernel, Jiri Olsa

The user stack trace can fault when examining the trace. Which
would call the do_page_fault handler, which would trace again,
which would do the user stack trace, which would fault and call
do_page_fault again ...

Thus this is causing a recursive bug. We need to have a recursion
detector here.


Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Jiri Olsa <jolsa@redhat.com>
---
 kernel/trace/trace.c |   19 +++++++++++++++++++
 1 files changed, 19 insertions(+), 0 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 82d9b81..0215e87 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1284,6 +1284,8 @@ void trace_dump_stack(void)
 	__ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
 }
 
+static DEFINE_PER_CPU(int, user_stack_count);
+
 void
 ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 {
@@ -1302,6 +1304,18 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 	if (unlikely(in_nmi()))
 		return;
 
+	/*
+	 * prevent recursion, since the user stack tracing may
+	 * trigger other kernel events.
+	 */
+	preempt_disable();
+	if (__get_cpu_var(user_stack_count))
+		goto out;
+
+	__get_cpu_var(user_stack_count)++;
+
+
+
 	event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
 					  sizeof(*entry), flags, pc);
 	if (!event)
@@ -1319,6 +1333,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 	save_stack_trace_user(&trace);
 	if (!filter_check_discard(call, entry, buffer, event))
 		ring_buffer_unlock_commit(buffer, event);
+
+	__get_cpu_var(user_stack_count)--;
+
+ out:
+	preempt_enable();
 }
 
 #ifdef UNUSED
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-10 11:56 ` [PATCH 1/2] " Jiri Olsa
@ 2010-11-10 13:29   ` Christoph Hellwig
  2010-11-10 13:44     ` Jiri Olsa
  0 siblings, 1 reply; 27+ messages in thread
From: Christoph Hellwig @ 2010-11-10 13:29 UTC (permalink / raw)
  To: Jiri Olsa; +Cc: mingo, rostedt, andi, lwoodman, linux-kernel

On Wed, Nov 10, 2010 at 12:56:11PM +0100, Jiri Olsa wrote:
> +	TP_printk("task=%lx, address=%lx, regs=%lx",
> +		(unsigned long)__entry->task, (unsigned long)__entry->address,
> +			__entry->regs)

How exactly do you use the information in this trace point?  Especially
the undecoded pt_regs doesn't seem very useful to me at all.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-10 13:29   ` Christoph Hellwig
@ 2010-11-10 13:44     ` Jiri Olsa
  2010-11-10 13:52       ` Ingo Molnar
  0 siblings, 1 reply; 27+ messages in thread
From: Jiri Olsa @ 2010-11-10 13:44 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: mingo, rostedt, andi, lwoodman, linux-kernel

On Wed, Nov 10, 2010 at 08:29:54AM -0500, Christoph Hellwig wrote:
> On Wed, Nov 10, 2010 at 12:56:11PM +0100, Jiri Olsa wrote:
> > +	TP_printk("task=%lx, address=%lx, regs=%lx",
> > +		(unsigned long)__entry->task, (unsigned long)__entry->address,
> > +			__entry->regs)
> 
> How exactly do you use the information in this trace point?  Especially
> the undecoded pt_regs doesn't seem very useful to me at all.

agreed, the registers pointer are not very useful in the trace file output,
and could be taken away.. just wanted to be complete I guess

but I believe they are useful when you register the mm_kernel_pagefault
tracepoint and process the information by yourself

wbr,
jirka

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-10 13:44     ` Jiri Olsa
@ 2010-11-10 13:52       ` Ingo Molnar
  2010-11-10 15:00         ` Frederic Weisbecker
  0 siblings, 1 reply; 27+ messages in thread
From: Ingo Molnar @ 2010-11-10 13:52 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Christoph Hellwig, rostedt, andi, lwoodman, linux-kernel,
	Peter Zijlstra, Frédéric Weisbecker


* Jiri Olsa <jolsa@redhat.com> wrote:

> On Wed, Nov 10, 2010 at 08:29:54AM -0500, Christoph Hellwig wrote:
> > On Wed, Nov 10, 2010 at 12:56:11PM +0100, Jiri Olsa wrote:
> > > +	TP_printk("task=%lx, address=%lx, regs=%lx",
> > > +		(unsigned long)__entry->task, (unsigned long)__entry->address,
> > > +			__entry->regs)
> > 
> > How exactly do you use the information in this trace point?  Especially
> > the undecoded pt_regs doesn't seem very useful to me at all.
> 
> agreed, the registers pointer are not very useful in the trace file output,
> and could be taken away.. just wanted to be complete I guess
> 
> but I believe they are useful when you register the mm_kernel_pagefault
> tracepoint and process the information by yourself

That would be expressed in a better and more generic fashion via adding 
PERF_SAMPLE_REGS to perf_event_sample_format, and add a ptregs dump in 
kernel/perf_event.c, perf_output_sample(). That way any tracepoint can request a 
(user-space)ptregs state snapshot, not just the pagefault ones.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-10 13:52       ` Ingo Molnar
@ 2010-11-10 15:00         ` Frederic Weisbecker
  2010-11-10 15:17           ` Jiri Olsa
  0 siblings, 1 reply; 27+ messages in thread
From: Frederic Weisbecker @ 2010-11-10 15:00 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Jiri Olsa, Christoph Hellwig, rostedt, andi, lwoodman,
	linux-kernel, Peter Zijlstra

On Wed, Nov 10, 2010 at 02:52:44PM +0100, Ingo Molnar wrote:
> 
> * Jiri Olsa <jolsa@redhat.com> wrote:
> 
> > On Wed, Nov 10, 2010 at 08:29:54AM -0500, Christoph Hellwig wrote:
> > > On Wed, Nov 10, 2010 at 12:56:11PM +0100, Jiri Olsa wrote:
> > > > +	TP_printk("task=%lx, address=%lx, regs=%lx",
> > > > +		(unsigned long)__entry->task, (unsigned long)__entry->address,
> > > > +			__entry->regs)
> > > 
> > > How exactly do you use the information in this trace point?  Especially
> > > the undecoded pt_regs doesn't seem very useful to me at all.
> > 
> > agreed, the registers pointer are not very useful in the trace file output,
> > and could be taken away.. just wanted to be complete I guess
> > 
> > but I believe they are useful when you register the mm_kernel_pagefault
> > tracepoint and process the information by yourself
> 
> That would be expressed in a better and more generic fashion via adding 
> PERF_SAMPLE_REGS to perf_event_sample_format, and add a ptregs dump in 
> kernel/perf_event.c, perf_output_sample(). That way any tracepoint can request a 
> (user-space)ptregs state snapshot, not just the pagefault ones.
> 
> Thanks,
> 
> 	Ingo


We are going to have that with the dwarf based callchain patchset. I'm cooking
this.

Thanks.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-10 15:00         ` Frederic Weisbecker
@ 2010-11-10 15:17           ` Jiri Olsa
  2010-11-10 15:20             ` Christoph Hellwig
  2010-11-10 16:44             ` Frederic Weisbecker
  0 siblings, 2 replies; 27+ messages in thread
From: Jiri Olsa @ 2010-11-10 15:17 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Ingo Molnar, Christoph Hellwig, rostedt, andi, lwoodman,
	linux-kernel, Peter Zijlstra

On Wed, Nov 10, 2010 at 04:00:25PM +0100, Frederic Weisbecker wrote:
> On Wed, Nov 10, 2010 at 02:52:44PM +0100, Ingo Molnar wrote:
> > 
> > * Jiri Olsa <jolsa@redhat.com> wrote:
> > 
> > > On Wed, Nov 10, 2010 at 08:29:54AM -0500, Christoph Hellwig wrote:
> > > > On Wed, Nov 10, 2010 at 12:56:11PM +0100, Jiri Olsa wrote:
> > > > > +	TP_printk("task=%lx, address=%lx, regs=%lx",
> > > > > +		(unsigned long)__entry->task, (unsigned long)__entry->address,
> > > > > +			__entry->regs)
> > > > 
> > > > How exactly do you use the information in this trace point?  Especially
> > > > the undecoded pt_regs doesn't seem very useful to me at all.
> > > 
> > > agreed, the registers pointer are not very useful in the trace file output,
> > > and could be taken away.. just wanted to be complete I guess
> > > 
> > > but I believe they are useful when you register the mm_kernel_pagefault
> > > tracepoint and process the information by yourself
> > 
> > That would be expressed in a better and more generic fashion via adding 
> > PERF_SAMPLE_REGS to perf_event_sample_format, and add a ptregs dump in 
> > kernel/perf_event.c, perf_output_sample(). That way any tracepoint can request a 
> > (user-space)ptregs state snapshot, not just the pagefault ones.
> > 
> > Thanks,
> > 
> > 	Ingo
> 
> 
> We are going to have that with the dwarf based callchain patchset. I'm cooking
> this.
> 
> Thanks.
> 

I guess I can take the regs out then.. would that patch be acceptable
afterwards..?

thanks,
jirka

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-10 15:17           ` Jiri Olsa
@ 2010-11-10 15:20             ` Christoph Hellwig
  2010-11-10 16:28               ` Andi Kleen
  2010-11-10 16:44             ` Frederic Weisbecker
  1 sibling, 1 reply; 27+ messages in thread
From: Christoph Hellwig @ 2010-11-10 15:20 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Frederic Weisbecker, Ingo Molnar, Christoph Hellwig, rostedt,
	andi, lwoodman, linux-kernel, Peter Zijlstra

On Wed, Nov 10, 2010 at 04:17:50PM +0100, Jiri Olsa wrote:
> I guess I can take the regs out then.. would that patch be acceptable
> afterwards..?

I really can't make much sense out of this whole trace point.  What
use it the task_struct address for example?

What would be much more interesting is telling us if we had a
major/minor fault, if ->page_mkwrite was called, etc.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-10 15:20             ` Christoph Hellwig
@ 2010-11-10 16:28               ` Andi Kleen
  0 siblings, 0 replies; 27+ messages in thread
From: Andi Kleen @ 2010-11-10 16:28 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jiri Olsa, Frederic Weisbecker, Ingo Molnar, rostedt, andi,
	lwoodman, linux-kernel, Peter Zijlstra

On Wed, Nov 10, 2010 at 10:20:02AM -0500, Christoph Hellwig wrote:
> On Wed, Nov 10, 2010 at 04:17:50PM +0100, Jiri Olsa wrote:
> > I guess I can take the regs out then.. would that patch be acceptable
> > afterwards..?
> 
> I really can't make much sense out of this whole trace point.  What
> use it the task_struct address for example?
> 
> What would be much more interesting is telling us if we had a
> major/minor fault, if ->page_mkwrite was called, etc.

Also on x86 I would log the error code.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-10 15:17           ` Jiri Olsa
  2010-11-10 15:20             ` Christoph Hellwig
@ 2010-11-10 16:44             ` Frederic Weisbecker
  2010-11-11  9:09               ` [PATCHv2 0/2] " Jiri Olsa
                                 ` (2 more replies)
  1 sibling, 3 replies; 27+ messages in thread
From: Frederic Weisbecker @ 2010-11-10 16:44 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Ingo Molnar, Christoph Hellwig, rostedt, andi, lwoodman,
	linux-kernel, Peter Zijlstra

On Wed, Nov 10, 2010 at 04:17:50PM +0100, Jiri Olsa wrote:
> On Wed, Nov 10, 2010 at 04:00:25PM +0100, Frederic Weisbecker wrote:
> > On Wed, Nov 10, 2010 at 02:52:44PM +0100, Ingo Molnar wrote:
> > > 
> > > * Jiri Olsa <jolsa@redhat.com> wrote:
> > > 
> > > > On Wed, Nov 10, 2010 at 08:29:54AM -0500, Christoph Hellwig wrote:
> > > > > On Wed, Nov 10, 2010 at 12:56:11PM +0100, Jiri Olsa wrote:
> > > > > > +	TP_printk("task=%lx, address=%lx, regs=%lx",
> > > > > > +		(unsigned long)__entry->task, (unsigned long)__entry->address,
> > > > > > +			__entry->regs)
> > > > > 
> > > > > How exactly do you use the information in this trace point?  Especially
> > > > > the undecoded pt_regs doesn't seem very useful to me at all.
> > > > 
> > > > agreed, the registers pointer are not very useful in the trace file output,
> > > > and could be taken away.. just wanted to be complete I guess
> > > > 
> > > > but I believe they are useful when you register the mm_kernel_pagefault
> > > > tracepoint and process the information by yourself
> > > 
> > > That would be expressed in a better and more generic fashion via adding 
> > > PERF_SAMPLE_REGS to perf_event_sample_format, and add a ptregs dump in 
> > > kernel/perf_event.c, perf_output_sample(). That way any tracepoint can request a 
> > > (user-space)ptregs state snapshot, not just the pagefault ones.
> > > 
> > > Thanks,
> > > 
> > > 	Ingo
> > 
> > 
> > We are going to have that with the dwarf based callchain patchset. I'm cooking
> > this.
> > 
> > Thanks.
> > 
> 
> I guess I can take the regs out then.. would that patch be acceptable
> afterwards..?


Please take out the regs yeah.

Thanks.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] tracing - fix recursive user stack trace
  2010-11-10 11:56 ` [PATCH 2/2] tracing - fix recursive user stack trace Jiri Olsa
@ 2010-11-11  0:13   ` Li Zefan
  2010-11-11 21:57     ` Steven Rostedt
  2010-11-18 14:05   ` [tip:perf/core] tracing: Fix " tip-bot for Steven Rostedt
  1 sibling, 1 reply; 27+ messages in thread
From: Li Zefan @ 2010-11-11  0:13 UTC (permalink / raw)
  To: Jiri Olsa; +Cc: mingo, rostedt, andi, lwoodman, linux-kernel

Jiri Olsa wrote:
> The user stack trace can fault when examining the trace. Which
> would call the do_page_fault handler, which would trace again,
> which would do the user stack trace, which would fault and call
> do_page_fault again ...
> 
> Thus this is causing a recursive bug. We need to have a recursion
> detector here.
> 

I guess this is from what I reported to Redhat, triggered by
the ftrace stress test. ;)

This patch should be the first patch, otherwise you introduce
a regression. Though it merely a problem in this case, better
avoid it.

A nitpick below:

> 
> Signed-off-by: Steven Rostedt <srostedt@redhat.com>
> Signed-off-by: Jiri Olsa <jolsa@redhat.com>
> ---
>  kernel/trace/trace.c |   19 +++++++++++++++++++
>  1 files changed, 19 insertions(+), 0 deletions(-)
> 
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 82d9b81..0215e87 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -1284,6 +1284,8 @@ void trace_dump_stack(void)
>  	__ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
>  }
>  
> +static DEFINE_PER_CPU(int, user_stack_count);
> +
>  void
>  ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
>  {
> @@ -1302,6 +1304,18 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
>  	if (unlikely(in_nmi()))
>  		return;
>  
> +	/*
> +	 * prevent recursion, since the user stack tracing may
> +	 * trigger other kernel events.
> +	 */
> +	preempt_disable();
> +	if (__get_cpu_var(user_stack_count))
> +		goto out;
> +
> +	__get_cpu_var(user_stack_count)++;
> +
> +
> +

redundant blank lines.

>  	event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
>  					  sizeof(*entry), flags, pc);
>  	if (!event)
> @@ -1319,6 +1333,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
>  	save_stack_trace_user(&trace);
>  	if (!filter_check_discard(call, entry, buffer, event))
>  		ring_buffer_unlock_commit(buffer, event);
> +
> +	__get_cpu_var(user_stack_count)--;
> +
> + out:
> +	preempt_enable();
>  }
>  
>  #ifdef UNUSED

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCHv2 0/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-10 16:44             ` Frederic Weisbecker
@ 2010-11-11  9:09               ` Jiri Olsa
  2010-11-11  9:09               ` [PATCHv2 1/2] tracing - fix recursive user stack trace Jiri Olsa
  2010-11-11  9:09               ` [PATCHv2 2/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64 Jiri Olsa
  2 siblings, 0 replies; 27+ messages in thread
From: Jiri Olsa @ 2010-11-11  9:09 UTC (permalink / raw)
  To: mingo, rostedt, andi, lwoodman, hch; +Cc: linux-kernel

This provides a tracepoint to trace kernel pagefault event
for x86 and x86_64 architectures.

There's a problem with this tracepoint when having the
userstacktrace option enabled, since it might generated
the page fault itself. The 1/2 patch address this issue.

v2 changes:
- replaced regs for error_code in page fault trace
- swapped the patch order not to introduce the regression
- minor code style change

attached patches:
1/2 tracing - fix recursive user stack trace
2/2 tracing,mm - add kernel pagefault tracepoint for x86 & x86_64

wbr,
jirka
---
 arch/x86/mm/fault.c         |   32 +++++++++++++++++++++-----------
 include/trace/events/kmem.h |   22 ++++++++++++++++++++++
 kernel/trace/trace.c        |   17 +++++++++++++++++
 3 files changed, 60 insertions(+), 11 deletions(-)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCHv2 1/2] tracing - fix recursive user stack trace
  2010-11-10 16:44             ` Frederic Weisbecker
  2010-11-11  9:09               ` [PATCHv2 0/2] " Jiri Olsa
@ 2010-11-11  9:09               ` Jiri Olsa
  2010-11-11 10:34                 ` Andi Kleen
  2010-11-11  9:09               ` [PATCHv2 2/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64 Jiri Olsa
  2 siblings, 1 reply; 27+ messages in thread
From: Jiri Olsa @ 2010-11-11  9:09 UTC (permalink / raw)
  To: mingo, rostedt, andi, lwoodman, hch; +Cc: linux-kernel, Jiri Olsa

The user stack trace can fault when examining the trace. Which
would call the do_page_fault handler, which would trace again,
which would do the user stack trace, which would fault and call
do_page_fault again ...

Thus this is causing a recursive bug. We need to have a recursion
detector here.


Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Jiri Olsa <jolsa@redhat.com>
---
 kernel/trace/trace.c |   17 +++++++++++++++++
 1 files changed, 17 insertions(+), 0 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 82d9b81..1905a72 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1284,6 +1284,8 @@ void trace_dump_stack(void)
 	__ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
 }
 
+static DEFINE_PER_CPU(int, user_stack_count);
+
 void
 ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 {
@@ -1302,6 +1304,16 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 	if (unlikely(in_nmi()))
 		return;
 
+	/*
+	 * Prevent recursion, since the user stack tracing may
+	 * trigger other kernel events.
+	 */
+	preempt_disable();
+	if (__get_cpu_var(user_stack_count))
+		goto out;
+
+	__get_cpu_var(user_stack_count)++;
+
 	event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
 					  sizeof(*entry), flags, pc);
 	if (!event)
@@ -1319,6 +1331,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 	save_stack_trace_user(&trace);
 	if (!filter_check_discard(call, entry, buffer, event))
 		ring_buffer_unlock_commit(buffer, event);
+
+	__get_cpu_var(user_stack_count)--;
+
+ out:
+	preempt_enable();
 }
 
 #ifdef UNUSED
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCHv2 2/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-10 16:44             ` Frederic Weisbecker
  2010-11-11  9:09               ` [PATCHv2 0/2] " Jiri Olsa
  2010-11-11  9:09               ` [PATCHv2 1/2] tracing - fix recursive user stack trace Jiri Olsa
@ 2010-11-11  9:09               ` Jiri Olsa
  2010-11-11 12:51                 ` Christoph Hellwig
  2010-11-15 13:43                 ` Frederic Weisbecker
  2 siblings, 2 replies; 27+ messages in thread
From: Jiri Olsa @ 2010-11-11  9:09 UTC (permalink / raw)
  To: mingo, rostedt, andi, lwoodman, hch; +Cc: linux-kernel, Jiri Olsa

This provides a tracepoint to trace kernel pagefault event.

When analyzing a vmcore resulting from a kernel failure, we
often hypothesize that "there should have a pagefault event
just before this instruction" or similar.  Sometimes it means
that there should have a small delay between instructions that
extends a critical session and exposed a missing lock.  Since
there have been no evidence of kernel pagefault, it is quite
difficult to adopt the hypothesis.

If we can trace the kernel pagefault event, it will help narrow
the possible cause of failure and will accelerate the
investigation a lot.


Signed-off-by: Larry Woodman <lwoodman@redhat.com>
Signed-off-by: Jiri Olsa <jolsa@redhat.com>
---
 arch/x86/mm/fault.c         |   33 ++++++++++++++++++++++-----------
 include/trace/events/kmem.h |   23 +++++++++++++++++++++++
 2 files changed, 45 insertions(+), 11 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7d90ceb..171dcc9 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -12,6 +12,7 @@
 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
 #include <linux/perf_event.h>		/* perf_sw_event		*/
 #include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
+#include <trace/events/kmem.h>
 
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
@@ -944,17 +945,11 @@ static int fault_in_kernel_space(unsigned long address)
 	return address >= TASK_SIZE_MAX;
 }
 
-/*
- * This routine handles page faults.  It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
- */
-dotraplinkage void __kprobes
-do_page_fault(struct pt_regs *regs, unsigned long error_code)
+static inline void __do_page_fault(struct pt_regs *regs, unsigned long address,
+				   unsigned long error_code)
 {
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
-	unsigned long address;
 	struct mm_struct *mm;
 	int fault;
 	int write = error_code & PF_WRITE;
@@ -964,9 +959,6 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	tsk = current;
 	mm = tsk->mm;
 
-	/* Get the faulting address: */
-	address = read_cr2();
-
 	/*
 	 * Detect and handle instructions that would cause a page fault for
 	 * both a tracked kernel page and a userspace page.
@@ -1158,3 +1150,22 @@ good_area:
 
 	up_read(&mm->mmap_sem);
 }
+
+/*
+ * This routine handles page faults.  It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ */
+dotraplinkage void __kprobes
+do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+	unsigned long address;
+
+	/* Get the faulting address: */
+	address = read_cr2();
+
+	__do_page_fault(regs, address, error_code);
+
+	if (!user_mode(regs))
+		trace_mm_kernel_pagefault(current, address, error_code);
+}
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index a9c87ad..b17cdf3 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -302,6 +302,29 @@ TRACE_EVENT(mm_page_alloc_extfrag,
 		__entry->alloc_migratetype == __entry->fallback_migratetype)
 );
 
+TRACE_EVENT(mm_kernel_pagefault,
+
+	TP_PROTO(struct task_struct *task, unsigned long address,
+		 int error_code),
+
+	TP_ARGS(task, address, error_code),
+
+	TP_STRUCT__entry(
+		__field(struct task_struct *, task)
+		__field(unsigned long, address)
+		__field(unsigned long, error_code)
+	),
+
+	TP_fast_assign(
+		__entry->task = task;
+		__entry->address = address;
+		__entry->error_code = error_code;
+	),
+
+	TP_printk("task=%lx, address=%lx, error code=%lx",
+		(unsigned long)__entry->task, (unsigned long)__entry->address,
+			__entry->error_code)
+	);
 #endif /* _TRACE_KMEM_H */
 
 /* This part must be outside protection */
-- 
1.7.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCHv2 1/2] tracing - fix recursive user stack trace
  2010-11-11  9:09               ` [PATCHv2 1/2] tracing - fix recursive user stack trace Jiri Olsa
@ 2010-11-11 10:34                 ` Andi Kleen
  0 siblings, 0 replies; 27+ messages in thread
From: Andi Kleen @ 2010-11-11 10:34 UTC (permalink / raw)
  To: Jiri Olsa; +Cc: mingo, rostedt, andi, lwoodman, hch, linux-kernel

On Thu, Nov 11, 2010 at 10:09:08AM +0100, Jiri Olsa wrote:
> The user stack trace can fault when examining the trace. Which
> would call the do_page_fault handler, which would trace again,
> which would do the user stack trace, which would fault and call
> do_page_fault again ...
> 
> Thus this is causing a recursive bug. We need to have a recursion
> detector here.

I suspect this problem is in more trace points: any NMI or MCE 
trace point or any other trace point that can be triggered
from a page. so it seems to be a general bug fix independent
of the other patch.

-Andi

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCHv2 2/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-11  9:09               ` [PATCHv2 2/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64 Jiri Olsa
@ 2010-11-11 12:51                 ` Christoph Hellwig
  2010-11-11 13:15                   ` Jiri Olsa
  2010-11-15 13:43                 ` Frederic Weisbecker
  1 sibling, 1 reply; 27+ messages in thread
From: Christoph Hellwig @ 2010-11-11 12:51 UTC (permalink / raw)
  To: Jiri Olsa; +Cc: mingo, rostedt, andi, lwoodman, hch, linux-kernel

Can you explain again who this is supposed to be used?  It's missing
a lot of essential information about the page fault, but on the other
hand traces totally useless information like the task struct pointer.

What's your motivation for posting this given that you obviously
haven't actually tried to use it in practice?

Btw, this doesn't mean I'm against a page faul trace point - quite
contrary.  It just seems you're not actually pushing something that's
been tested in real life or is generally useful.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCHv2 2/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-11 12:51                 ` Christoph Hellwig
@ 2010-11-11 13:15                   ` Jiri Olsa
  0 siblings, 0 replies; 27+ messages in thread
From: Jiri Olsa @ 2010-11-11 13:15 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: mingo, rostedt, andi, lwoodman, linux-kernel

On Thu, Nov 11, 2010 at 07:51:57AM -0500, Christoph Hellwig wrote:
> Can you explain again who this is supposed to be used?  It's missing
> a lot of essential information about the page fault, but on the other
> hand traces totally useless information like the task struct pointer.
> 
> What's your motivation for posting this given that you obviously
> haven't actually tried to use it in practice?
yep, kind of doing this for Larry... hoping he'd fill in to the
discussion ;)

> 
> Btw, this doesn't mean I'm against a page faul trace point - quite
> contrary.  It just seems you're not actually pushing something that's
> been tested in real life or is generally useful.
What would be your idea of the page fault tracepoint?

thanks,
jirka

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] tracing - fix recursive user stack trace
  2010-11-11  0:13   ` Li Zefan
@ 2010-11-11 21:57     ` Steven Rostedt
  0 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2010-11-11 21:57 UTC (permalink / raw)
  To: Li Zefan; +Cc: Jiri Olsa, mingo, andi, lwoodman, linux-kernel

On Thu, 2010-11-11 at 08:13 +0800, Li Zefan wrote:
> Jiri Olsa wrote:
> > The user stack trace can fault when examining the trace. Which
> > would call the do_page_fault handler, which would trace again,
> > which would do the user stack trace, which would fault and call
> > do_page_fault again ...
> > 
> > Thus this is causing a recursive bug. We need to have a recursion
> > detector here.
> > 
> 
> I guess this is from what I reported to Redhat, triggered by
> the ftrace stress test. ;)
> 
> This patch should be the first patch, otherwise you introduce
> a regression. Though it merely a problem in this case, better
> avoid it.

Yeah, this should go into urgent, and the other patch can be queued for
38.


> 
> A nitpick below:
> 
> > 
> > Signed-off-by: Steven Rostedt <srostedt@redhat.com>
> > Signed-off-by: Jiri Olsa <jolsa@redhat.com>
> > ---
> >  kernel/trace/trace.c |   19 +++++++++++++++++++
> >  1 files changed, 19 insertions(+), 0 deletions(-)
> > 
> > diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> > index 82d9b81..0215e87 100644
> > --- a/kernel/trace/trace.c
> > +++ b/kernel/trace/trace.c
> > @@ -1284,6 +1284,8 @@ void trace_dump_stack(void)
> >  	__ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
> >  }
> >  
> > +static DEFINE_PER_CPU(int, user_stack_count);
> > +
> >  void
> >  ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
> >  {
> > @@ -1302,6 +1304,18 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
> >  	if (unlikely(in_nmi()))
> >  		return;
> >  
> > +	/*
> > +	 * prevent recursion, since the user stack tracing may
> > +	 * trigger other kernel events.
> > +	 */
> > +	preempt_disable();
> > +	if (__get_cpu_var(user_stack_count))
> > +		goto out;
> > +
> > +	__get_cpu_var(user_stack_count)++;
> > +
> > +
> > +
> 
> redundant blank lines.

I can pull this patch with the fix.

Thanks!

-- Steve

> 
> >  	event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
> >  					  sizeof(*entry), flags, pc);
> >  	if (!event)
> > @@ -1319,6 +1333,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
> >  	save_stack_trace_user(&trace);
> >  	if (!filter_check_discard(call, entry, buffer, event))
> >  		ring_buffer_unlock_commit(buffer, event);
> > +
> > +	__get_cpu_var(user_stack_count)--;
> > +
> > + out:
> > +	preempt_enable();
> >  }
> >  
> >  #ifdef UNUSED



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCHv2 2/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-11  9:09               ` [PATCHv2 2/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64 Jiri Olsa
  2010-11-11 12:51                 ` Christoph Hellwig
@ 2010-11-15 13:43                 ` Frederic Weisbecker
  2010-11-15 14:06                   ` Andi Kleen
  2010-11-15 14:19                   ` Steven Rostedt
  1 sibling, 2 replies; 27+ messages in thread
From: Frederic Weisbecker @ 2010-11-15 13:43 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: mingo, rostedt, andi, lwoodman, hch, linux-kernel, Thomas Gleixner

On Thu, Nov 11, 2010 at 10:09:09AM +0100, Jiri Olsa wrote:
> This provides a tracepoint to trace kernel pagefault event.
> 
> When analyzing a vmcore resulting from a kernel failure, we
> often hypothesize that "there should have a pagefault event
> just before this instruction" or similar.  Sometimes it means
> that there should have a small delay between instructions that
> extends a critical session and exposed a missing lock.  Since
> there have been no evidence of kernel pagefault, it is quite
> difficult to adopt the hypothesis.
> 
> If we can trace the kernel pagefault event, it will help narrow
> the possible cause of failure and will accelerate the
> investigation a lot.
> 
> 
> Signed-off-by: Larry Woodman <lwoodman@redhat.com>
> Signed-off-by: Jiri Olsa <jolsa@redhat.com>
> ---
>  arch/x86/mm/fault.c         |   33 ++++++++++++++++++++++-----------
>  include/trace/events/kmem.h |   23 +++++++++++++++++++++++
>  2 files changed, 45 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 7d90ceb..171dcc9 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -12,6 +12,7 @@
>  #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
>  #include <linux/perf_event.h>		/* perf_sw_event		*/
>  #include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
> +#include <trace/events/kmem.h>
>  
>  #include <asm/traps.h>			/* dotraplinkage, ...		*/
>  #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
> @@ -944,17 +945,11 @@ static int fault_in_kernel_space(unsigned long address)
>  	return address >= TASK_SIZE_MAX;
>  }
>  
> -/*
> - * This routine handles page faults.  It determines the address,
> - * and the problem, and then passes it off to one of the appropriate
> - * routines.
> - */
> -dotraplinkage void __kprobes
> -do_page_fault(struct pt_regs *regs, unsigned long error_code)
> +static inline void __do_page_fault(struct pt_regs *regs, unsigned long address,
> +				   unsigned long error_code)
>  {
>  	struct vm_area_struct *vma;
>  	struct task_struct *tsk;
> -	unsigned long address;
>  	struct mm_struct *mm;
>  	int fault;
>  	int write = error_code & PF_WRITE;
> @@ -964,9 +959,6 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
>  	tsk = current;
>  	mm = tsk->mm;
>  
> -	/* Get the faulting address: */
> -	address = read_cr2();
> -
>  	/*
>  	 * Detect and handle instructions that would cause a page fault for
>  	 * both a tracked kernel page and a userspace page.
> @@ -1158,3 +1150,22 @@ good_area:
>  
>  	up_read(&mm->mmap_sem);
>  }
> +
> +/*
> + * This routine handles page faults.  It determines the address,
> + * and the problem, and then passes it off to one of the appropriate
> + * routines.
> + */
> +dotraplinkage void __kprobes
> +do_page_fault(struct pt_regs *regs, unsigned long error_code)
> +{
> +	unsigned long address;
> +
> +	/* Get the faulting address: */
> +	address = read_cr2();
> +
> +	__do_page_fault(regs, address, error_code);
> +
> +	if (!user_mode(regs))
> +		trace_mm_kernel_pagefault(current, address, error_code);
> +}



I (and others) have been testing your patch to measure the latencies of page
faults.

So I have several comments about it.

First, we don't want a pointer to current, we can already retrieve the pid
from a trace.

Second, it would be definetly interesting to also have the instruction address
that faulted (regs->ip).

Three, I wonder why this tracepoint only traces kernel faults. And in fact
kernel faults is a confusing name. Users will be confused whether this is
about tracing only faults happening in kernel or also faults happening in
kernel data.
Actually I don't see any reason right now to trace only kernel faults. Do you?
If that's needed, one can still check on post-processing that the address
was in the kernel.

And four, measuring page fault handling duration can be desired, it would be
nice to have a page_fault_start, page_fault_end.


So in the end we can get:


dotraplinkage void __kprobes
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	unsigned long address;

	/* Get the faulting address: */
	address = read_cr2();

	trace_mm_pagefault_start(address, error_code);
	__do_page_fault(regs, address, error_code);
	trace_mm_pagefault_end(address);
}


Would you be ok with that?

Last thing I worry about is that error_code that is very arch dependent.
If someone writes a script that depends on the x86 code, it won't work
elsewhere while it's fairly possible to have a generic tracepoint there.

So perhaps we rather need a generic enum field instead of the error_code,
to express the most interesting specific fault attributes. Than can
probably be added later though, once someone really needs it.

Hm?


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCHv2 2/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-15 13:43                 ` Frederic Weisbecker
@ 2010-11-15 14:06                   ` Andi Kleen
  2010-11-15 14:54                     ` Frederic Weisbecker
  2010-11-15 14:19                   ` Steven Rostedt
  1 sibling, 1 reply; 27+ messages in thread
From: Andi Kleen @ 2010-11-15 14:06 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Jiri Olsa, mingo, rostedt, andi, lwoodman, hch, linux-kernel,
	Thomas Gleixner

> Actually I don't see any reason right now to trace only kernel faults. Do you?
> If that's needed, one can still check on post-processing that the address
> was in the kernel.

I think the idea is to get more context on oopses. If the event only covers
that the overhead in the common case (minus *_user) is much less,
versus the more generalized points you use.

For tracing the whole page fault me think it's better to have 
a generalized exception tracer with a filter on page fault.

-Andi

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCHv2 2/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-15 13:43                 ` Frederic Weisbecker
  2010-11-15 14:06                   ` Andi Kleen
@ 2010-11-15 14:19                   ` Steven Rostedt
  2010-11-16  9:23                     ` Jiri Olsa
  1 sibling, 1 reply; 27+ messages in thread
From: Steven Rostedt @ 2010-11-15 14:19 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Jiri Olsa, mingo, andi, lwoodman, hch, linux-kernel, Thomas Gleixner

On Mon, 2010-11-15 at 14:43 +0100, Frederic Weisbecker wrote:

> dotraplinkage void __kprobes
> do_page_fault(struct pt_regs *regs, unsigned long error_code)
> {
> 	unsigned long address;
> 
> 	/* Get the faulting address: */
> 	address = read_cr2();
> 
> 	trace_mm_pagefault_start(address, error_code);
> 	__do_page_fault(regs, address, error_code);
> 	trace_mm_pagefault_end(address);
> }
> 
> 
> Would you be ok with that?
> 
> Last thing I worry about is that error_code that is very arch dependent.
> If someone writes a script that depends on the x86 code, it won't work
> elsewhere while it's fairly possible to have a generic tracepoint there.
> 
> So perhaps we rather need a generic enum field instead of the error_code,
> to express the most interesting specific fault attributes. Than can
> probably be added later though, once someone really needs it.
> 
> Hm?

Perhaps we should have:

	trace_arch_mm_pagefault_start(address, error_code);
	__do_page_fault(regs, address, error_code);
	trace_mm_pagefault_end(address);

Then we have a arch/x86/kernel/trace.c that can map trace_arch_...
events to generic events. This file will hold the
trace_mm_pagefault_start(), which is called by
trace_arch_mm_pagefault_start(). Have a hook that when the
trace_mm_pagefault_start() is enabled, we also enable
trace_arch_mm_pagefault_start() that calls this tracepoint. The
trace_mm_pagefault_start() will then translate the
trace_arch_mm_pagfault_start() into the generic trace_mm_pagefault_start
event with the generic error_code that all archs will use.

Reason being, we don't need to do any extra processing in the fast path
when tracing is not enabled.

Also, I'm going to start working on the stable ABI today.

-- Steve

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCHv2 2/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-15 14:06                   ` Andi Kleen
@ 2010-11-15 14:54                     ` Frederic Weisbecker
  2010-11-15 15:04                       ` Steven Rostedt
  0 siblings, 1 reply; 27+ messages in thread
From: Frederic Weisbecker @ 2010-11-15 14:54 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Jiri Olsa, mingo, rostedt, lwoodman, hch, linux-kernel, Thomas Gleixner

On Mon, Nov 15, 2010 at 03:06:33PM +0100, Andi Kleen wrote:
> > Actually I don't see any reason right now to trace only kernel faults. Do you?
> > If that's needed, one can still check on post-processing that the address
> > was in the kernel.
> 
> I think the idea is to get more context on oopses. If the event only covers
> that the overhead in the common case (minus *_user) is much less,
> versus the more generalized points you use.

I see. OTOH, page faults should be pretty low freq events most of time,
probably not something that would add much tracing overhead.

The pity is that we have something like exclude_user/exclude_kernel
properties for events when used by perf, but we consider trace event
as always firing in the kernel, this one is an exception but it would
too tricky and too much an overkill to handle these attributes just
for this trace events.

> For tracing the whole page fault me think it's better to have 
> a generalized exception tracer with a filter on page fault.

You're right. A tracepoint in handle_mm_fault() would be perhaps
better. It should catch most tracepoints the users are interested
in. On the other hand we may miss part of the page fault
latency, like the mmap_sem contention. This can be measured using
lock events though.

But I'm probably missing other important things.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCHv2 2/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-15 14:54                     ` Frederic Weisbecker
@ 2010-11-15 15:04                       ` Steven Rostedt
  0 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2010-11-15 15:04 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Andi Kleen, Jiri Olsa, mingo, lwoodman, hch, linux-kernel,
	Thomas Gleixner

On Mon, 2010-11-15 at 15:54 +0100, Frederic Weisbecker wrote:
> On Mon, Nov 15, 2010 at 03:06:33PM +0100, Andi Kleen wrote:

> > For tracing the whole page fault me think it's better to have 
> > a generalized exception tracer with a filter on page fault.
> 
> 
> You're right. A tracepoint in handle_mm_fault() would be perhaps
> better. It should catch most tracepoints the users are interested
> in. On the other hand we may miss part of the page fault
> latency, like the mmap_sem contention. This can be measured using
> lock events though.
> 
> But I'm probably missing other important things.

I would have the general exception handler as a tracepoint that would
have a "stable tracepoint" hook to it. That is, when enabling it as a
infield debugging tracepoint, we would get the "raw tracepoint", but the
stable tracepoint would massage it into different general types of
exceptions, and the code there would do the filtering.

-- Steve



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCHv2 2/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-15 14:19                   ` Steven Rostedt
@ 2010-11-16  9:23                     ` Jiri Olsa
  2010-11-16 13:13                       ` Steven Rostedt
  0 siblings, 1 reply; 27+ messages in thread
From: Jiri Olsa @ 2010-11-16  9:23 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Frederic Weisbecker, mingo, andi, lwoodman, hch, linux-kernel,
	Thomas Gleixner

On Mon, Nov 15, 2010 at 09:19:20AM -0500, Steven Rostedt wrote:
> On Mon, 2010-11-15 at 14:43 +0100, Frederic Weisbecker wrote:
> 
> 
> > dotraplinkage void __kprobes
> > do_page_fault(struct pt_regs *regs, unsigned long error_code)
> > {
> > 	unsigned long address;
> > 
> > 	/* Get the faulting address: */
> > 	address = read_cr2();
> > 
> > 	trace_mm_pagefault_start(address, error_code);
> > 	__do_page_fault(regs, address, error_code);
> > 	trace_mm_pagefault_end(address);
> > }
> > 
> > 
> > Would you be ok with that?
> > 
> > Last thing I worry about is that error_code that is very arch dependent.
> > If someone writes a script that depends on the x86 code, it won't work
> > elsewhere while it's fairly possible to have a generic tracepoint there.
> > 
> > So perhaps we rather need a generic enum field instead of the error_code,
> > to express the most interesting specific fault attributes. Than can
> > probably be added later though, once someone really needs it.
> > 
> > Hm?
> 
> Perhaps we should have:
> 
> 	trace_arch_mm_pagefault_start(address, error_code);
> 	__do_page_fault(regs, address, error_code);
> 	trace_mm_pagefault_end(address);
> 
> 
> Then we have a arch/x86/kernel/trace.c that can map trace_arch_...
> events to generic events. This file will hold the
> trace_mm_pagefault_start(), which is called by
> trace_arch_mm_pagefault_start(). Have a hook that when the
> trace_mm_pagefault_start() is enabled, we also enable
> trace_arch_mm_pagefault_start() that calls this tracepoint. The
> trace_mm_pagefault_start() will then translate the
> trace_arch_mm_pagfault_start() into the generic trace_mm_pagefault_start
> event with the generic error_code that all archs will use.
> 
> Reason being, we don't need to do any extra processing in the fast path
> when tracing is not enabled.
> 
> Also, I'm going to start working on the stable ABI today.
do you mean stable api for the hook you described above?
or should I come up with smth..

thanks,
jirka

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCHv2 2/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64
  2010-11-16  9:23                     ` Jiri Olsa
@ 2010-11-16 13:13                       ` Steven Rostedt
  0 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2010-11-16 13:13 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Frederic Weisbecker, mingo, andi, lwoodman, hch, linux-kernel,
	Thomas Gleixner

On Tue, 2010-11-16 at 10:23 +0100, Jiri Olsa wrote:

> > Also, I'm going to start working on the stable ABI today.
> do you mean stable api for the hook you described above?
> or should I come up with smth..

The stable api will hook into the raw tracepoints (I'm calling the
tracepoints we have now "raw"). The tracepoints will not need anything
new so please continue with what you are doing.

-- Steve



^ permalink raw reply	[flat|nested] 27+ messages in thread

* [tip:perf/core] tracing: Fix recursive user stack trace
  2010-11-10 11:56 ` [PATCH 2/2] tracing - fix recursive user stack trace Jiri Olsa
  2010-11-11  0:13   ` Li Zefan
@ 2010-11-18 14:05   ` tip-bot for Steven Rostedt
  1 sibling, 0 replies; 27+ messages in thread
From: tip-bot for Steven Rostedt @ 2010-11-18 14:05 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, eric.dumazet, rostedt, srostedt, tglx, jolsa

Commit-ID:  91e86e560d0b3ce4c5fc64fd2bbb99f856a30a4e
Gitweb:     http://git.kernel.org/tip/91e86e560d0b3ce4c5fc64fd2bbb99f856a30a4e
Author:     Steven Rostedt <srostedt@redhat.com>
AuthorDate: Wed, 10 Nov 2010 12:56:12 +0100
Committer:  Steven Rostedt <rostedt@goodmis.org>
CommitDate: Fri, 12 Nov 2010 21:20:08 -0500

tracing: Fix recursive user stack trace

The user stack trace can fault when examining the trace. Which
would call the do_page_fault handler, which would trace again,
which would do the user stack trace, which would fault and call
do_page_fault again ...

Thus this is causing a recursive bug. We need to have a recursion
detector here.

[ Resubmitted by Jiri Olsa ]

[ Eric Dumazet recommended using __this_cpu_* instead of __get_cpu_* ]

Cc: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Jiri Olsa <jolsa@redhat.com>
LKML-Reference: <1289390172-9730-3-git-send-email-jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c |   19 +++++++++++++++++++
 1 files changed, 19 insertions(+), 0 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 82d9b81..ee6a733 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1284,6 +1284,8 @@ void trace_dump_stack(void)
 	__ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
 }
 
+static DEFINE_PER_CPU(int, user_stack_count);
+
 void
 ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 {
@@ -1302,6 +1304,18 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 	if (unlikely(in_nmi()))
 		return;
 
+	/*
+	 * prevent recursion, since the user stack tracing may
+	 * trigger other kernel events.
+	 */
+	preempt_disable();
+	if (__this_cpu_read(user_stack_count))
+		goto out;
+
+	__this_cpu_inc(user_stack_count);
+
+
+
 	event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
 					  sizeof(*entry), flags, pc);
 	if (!event)
@@ -1319,6 +1333,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 	save_stack_trace_user(&trace);
 	if (!filter_check_discard(call, entry, buffer, event))
 		ring_buffer_unlock_commit(buffer, event);
+
+	__this_cpu_dec(user_stack_count);
+
+ out:
+	preempt_enable();
 }
 
 #ifdef UNUSED

^ permalink raw reply related	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2010-11-18 14:05 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-11-10 11:56 [PATCH 0/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64 Jiri Olsa
2010-11-10 11:56 ` [PATCH 1/2] " Jiri Olsa
2010-11-10 13:29   ` Christoph Hellwig
2010-11-10 13:44     ` Jiri Olsa
2010-11-10 13:52       ` Ingo Molnar
2010-11-10 15:00         ` Frederic Weisbecker
2010-11-10 15:17           ` Jiri Olsa
2010-11-10 15:20             ` Christoph Hellwig
2010-11-10 16:28               ` Andi Kleen
2010-11-10 16:44             ` Frederic Weisbecker
2010-11-11  9:09               ` [PATCHv2 0/2] " Jiri Olsa
2010-11-11  9:09               ` [PATCHv2 1/2] tracing - fix recursive user stack trace Jiri Olsa
2010-11-11 10:34                 ` Andi Kleen
2010-11-11  9:09               ` [PATCHv2 2/2] tracing,mm - add kernel pagefault tracepoint for x86 & x86_64 Jiri Olsa
2010-11-11 12:51                 ` Christoph Hellwig
2010-11-11 13:15                   ` Jiri Olsa
2010-11-15 13:43                 ` Frederic Weisbecker
2010-11-15 14:06                   ` Andi Kleen
2010-11-15 14:54                     ` Frederic Weisbecker
2010-11-15 15:04                       ` Steven Rostedt
2010-11-15 14:19                   ` Steven Rostedt
2010-11-16  9:23                     ` Jiri Olsa
2010-11-16 13:13                       ` Steven Rostedt
2010-11-10 11:56 ` [PATCH 2/2] tracing - fix recursive user stack trace Jiri Olsa
2010-11-11  0:13   ` Li Zefan
2010-11-11 21:57     ` Steven Rostedt
2010-11-18 14:05   ` [tip:perf/core] tracing: Fix " tip-bot for Steven Rostedt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).