* [PATCH] trace: add ability to set a target task for events (v2)
@ 2012-07-11 14:14 Andrew Vagin
  2012-07-11 14:31 ` Frederic Weisbecker
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Andrew Vagin @ 2012-07-11 14:14 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Ingo Molnar, Steven Rostedt, Paul Mackerras,
	Arnaldo Carvalho de Melo, Arun Sharma, Andrew Vagin

A few events are interesting not only for the current task.
For example, sched_stat_* events are also interesting for the
task which wakes up. For this reason, it would be good if such
events were delivered to a target task too.

Now a target task can be set by using __perf_task().
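
For a tracepoint, opting in is a one-line addition in TP_perf_assign().  A
condensed sketch (not part of the patch itself, it just mirrors the
sched_stat_template hunk in the diff below):

/*
 * Sketch only -- see the include/trace/events/sched.h hunk below for the
 * real change.  __perf_count() sets the sample period and __perf_task()
 * names the task whose perf context should also receive the sample; both
 * expand to nothing outside the perf probe.
 */
DECLARE_EVENT_CLASS(sched_stat_template,

	TP_PROTO(struct task_struct *tsk, u64 delay),

	TP_ARGS(tsk, delay),

	TP_STRUCT__entry(
		__array( char,	comm,	TASK_COMM_LEN	)
		__field( pid_t,	pid			)
		__field( u64,	delay			)
	),

	TP_fast_assign(
		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
		__entry->pid	= tsk->pid;
		__entry->delay	= delay;
	)
	TP_perf_assign(
		__perf_count(delay);
		__perf_task(tsk);		/* also deliver to tsk's context */
	),

	TP_printk("comm=%s pid=%d delay=%Lu [ns]",
		  __entry->comm, __entry->pid, (unsigned long long)__entry->delay)
);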

The original idea and a draft patch belong to Peter Zijlstra.

I need these events for profiling sleep times.  sched_switch is used for
getting callchains and sched_stat_* is used for getting time periods.
These events are combined in user space, where they can be analyzed by
perf tools.

v2: Disallow cross-task user callchains

Inspired-by: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arun Sharma <asharma@fb.com>
Signed-off-by: Andrew Vagin <avagin@openvz.org>
---
 include/linux/ftrace_event.h    |    5 +++--
 include/linux/perf_event.h      |    3 ++-
 include/trace/events/sched.h    |    4 ++++
 include/trace/ftrace.h          |    6 +++++-
 kernel/events/callchain.c       |    9 ++++++++-
 kernel/events/core.c            |   28 ++++++++++++++++++++++++++--
 kernel/events/internal.h        |    3 ++-
 kernel/trace/trace_event_perf.c |    2 +-
 kernel/trace/trace_kprobe.c     |    6 ++++--
 kernel/trace/trace_syscalls.c   |    4 ++--
 10 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 176a939..55b96e3 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -303,9 +303,10 @@ extern void *perf_trace_buf_prepare(int size, unsigned short type,
 
 static inline void
 perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
-		       u64 count, struct pt_regs *regs, void *head)
+		       u64 count, struct pt_regs *regs, void *head,
+		       struct task_struct *task)
 {
-	perf_tp_event(addr, count, raw_data, size, regs, head, rctx);
+	perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task);
 }
 #endif
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 45db49f..14acc5d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1269,7 +1269,8 @@ static inline bool perf_paranoid_kernel(void)
 extern void perf_event_init(void);
 extern void perf_tp_event(u64 addr, u64 count, void *record,
 			  int entry_size, struct pt_regs *regs,
-			  struct hlist_head *head, int rctx);
+			  struct hlist_head *head, int rctx,
+			  struct task_struct *task);
 extern void perf_bp_event(struct perf_event *event, void *data);
 
 #ifndef perf_misc_flags
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index ea7a203..5a8671e 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -73,6 +73,9 @@ DECLARE_EVENT_CLASS(sched_wakeup_template,
 		__entry->prio		= p->prio;
 		__entry->success	= success;
 		__entry->target_cpu	= task_cpu(p);
+	)
+	TP_perf_assign(
+		__perf_task(p);
 	),
 
 	TP_printk("comm=%s pid=%d prio=%d success=%d target_cpu=%03d",
@@ -325,6 +328,7 @@ DECLARE_EVENT_CLASS(sched_stat_template,
 	)
 	TP_perf_assign(
 		__perf_count(delay);
+		__perf_task(tsk);
 	),
 
 	TP_printk("comm=%s pid=%d delay=%Lu [ns]",
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 7697249..db14daf 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -711,6 +711,9 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
 #undef __perf_count
 #define __perf_count(c) __count = (c)
 
+#undef __perf_task
+#define __perf_task(t) __task = (t)
+
 #undef TP_perf_assign
 #define TP_perf_assign(args...) args
 
@@ -724,6 +727,7 @@ perf_trace_##call(void *__data, proto)					\
 	struct ftrace_raw_##call *entry;				\
 	struct pt_regs __regs;						\
 	u64 __addr = 0, __count = 1;					\
+	struct task_struct *__task = NULL;				\
 	struct hlist_head *head;					\
 	int __entry_size;						\
 	int __data_size;						\
@@ -751,7 +755,7 @@ perf_trace_##call(void *__data, proto)					\
 									\
 	head = this_cpu_ptr(event_call->perf_events);			\
 	perf_trace_buf_submit(entry, __entry_size, rctx, __addr,	\
-		__count, &__regs, head);				\
+		__count, &__regs, head, __task);			\
 }
 
 /*
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6581a04..98d4597 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -153,7 +153,8 @@ put_callchain_entry(int rctx)
 	put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
 }
 
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
 	int rctx;
 	struct perf_callchain_entry *entry;
@@ -178,6 +179,12 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	}
 
 	if (regs) {
+		/*
+		 * Disallow cross-task user callchains.
+		 */
+		if (event->ctx->task && event->ctx->task != current)
+			goto exit_put;
+
 		perf_callchain_store(entry, PERF_CONTEXT_USER);
 		perf_callchain_user(entry, regs);
 	}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d7d71d6..1ebee0a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4037,7 +4037,7 @@ void perf_prepare_sample(struct perf_event_header *header,
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		int size = 1;
 
-		data->callchain = perf_callchain(regs);
+		data->callchain = perf_callchain(event, regs);
 
 		if (data->callchain)
 			size += data->callchain->nr;
@@ -5207,7 +5207,8 @@ static int perf_tp_event_match(struct perf_event *event,
 }
 
 void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
-		   struct pt_regs *regs, struct hlist_head *head, int rctx)
+		   struct pt_regs *regs, struct hlist_head *head, int rctx,
+		   struct task_struct *task)
 {
 	struct perf_sample_data data;
 	struct perf_event *event;
@@ -5226,6 +5227,29 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
 			perf_swevent_event(event, count, &data, regs);
 	}
 
+	/*
+	 * If we got specified a target task, also iterate its context and
+	 * deliver this event there too.
+	 */
+	if (task && task != current) {
+		struct perf_event_context *ctx;
+		struct trace_entry *entry = record;
+
+		rcu_read_lock();
+		ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+		if (!ctx)
+			goto unlock;
+
+		list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+			if (entry->type != event->attr.config)
+				continue;
+			if (perf_tp_event_match(event, &data, regs))
+				perf_swevent_event(event, count, &data, regs);
+		}
+unlock:
+		rcu_read_unlock();
+	}
+
 	perf_swevent_put_recursion_context(rctx);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index b0b107f..a096c19 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -101,7 +101,8 @@ __output_copy(struct perf_output_handle *handle,
 }
 
 /* Callchain handling */
-extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+extern struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs);
 extern int get_callchain_buffers(void);
 extern void put_callchain_buffers(void);
 
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index fee3752..8a6d2ee 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -281,7 +281,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip)
 
 	head = this_cpu_ptr(event_function.perf_events);
 	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
-			      1, &regs, head);
+			      1, &regs, head, NULL);
 
 #undef ENTRY_SIZE
 }
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b31d3d5..1a21170 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1002,7 +1002,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
 	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
 
 	head = this_cpu_ptr(call->perf_events);
-	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
+	perf_trace_buf_submit(entry, size, rctx,
+					entry->ip, 1, regs, head, NULL);
 }
 
 /* Kretprobe profile handler */
@@ -1033,7 +1034,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
 	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
 
 	head = this_cpu_ptr(call->perf_events);
-	perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
+	perf_trace_buf_submit(entry, size, rctx,
+					entry->ret_ip, 1, regs, head, NULL);
 }
 #endif	/* CONFIG_PERF_EVENTS */
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 96fc733..60e4d78 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -532,7 +532,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 			       (unsigned long *)&rec->args);
 
 	head = this_cpu_ptr(sys_data->enter_event->perf_events);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
 }
 
 int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -608,7 +608,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	rec->ret = syscall_get_return_value(current, regs);
 
 	head = this_cpu_ptr(sys_data->exit_event->perf_events);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
 }
 
 int perf_sysexit_enable(struct ftrace_event_call *call)
-- 
1.7.1



* Re: [PATCH] trace: add ability to set a target task for events (v2)
  2012-07-11 14:14 [PATCH] trace: add ability to set a target task for events (v2) Andrew Vagin
@ 2012-07-11 14:31 ` Frederic Weisbecker
  2012-07-11 14:33   ` Peter Zijlstra
  2012-07-11 15:09 ` Peter Zijlstra
  2012-07-31 17:58 ` [tip:perf/urgent] perf/trace: Add ability to set a target task for events tip-bot for Andrew Vagin
  2 siblings, 1 reply; 12+ messages in thread
From: Frederic Weisbecker @ 2012-07-11 14:31 UTC (permalink / raw)
  To: Andrew Vagin
  Cc: Peter Zijlstra, linux-kernel, Ingo Molnar, Steven Rostedt,
	Paul Mackerras, Arnaldo Carvalho de Melo, Arun Sharma

On Wed, Jul 11, 2012 at 06:14:58PM +0400, Andrew Vagin wrote:
> A few events are interesting not only for the current task.
> For example, sched_stat_* events are also interesting for the
> task which wakes up. For this reason, it would be good if such
> events were delivered to a target task too.
> 
> Now a target task can be set by using __perf_task().
> 
> The original idea and a draft patch belong to Peter Zijlstra.
> 
> I need these events for profiling sleep times.  sched_switch is used for
> getting callchains and sched_stat_* is used for getting time periods.
> These events are combined in user space, where they can be analyzed by
> perf tools.

We've talked about that numerous times. But I still don't really
understand why you're not using sched_switch events and computing
the difference between schedule-in and schedule-out.

I think you said that's because you got too many events with
sched_switch. Are you losing events? Otherwise I don't see why it's
a problem.

Also, the sched_stat_sleep tracepoint produces an event whose period
equals the time slept. Internally, perf splits this into as many events
as that period, because the requested period for trace events is 1 by
default. We should probably allow sending events with a higher count
than the one requested. This sometimes produces a huge pile of events,
and that often results in tons of lost events. We definitely need to
fix that.

In the meantime you'll certainly get saner results by just recording
sched_switch events.
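
To put rough numbers on the splitting described above, here is a tiny
standalone model (an assumption for illustration, not kernel code): with the
default sample_period of 1, a trace event whose count is the time slept in
nanoseconds fans out into roughly one sample per nanosecond.

#include <inttypes.h>
#include <stdio.h>

/*
 * Crude model of the fan-out: every time the accumulated count crosses the
 * requested sample_period, one PERF_RECORD_SAMPLE is emitted, so a count of
 * N ns with sample_period == 1 means roughly N samples.
 */
static uint64_t samples_for(uint64_t count_ns, uint64_t sample_period)
{
	return count_ns / sample_period;
}

int main(void)
{
	/* delays taken from the sched_stat_sleep examples quoted later in
	 * this thread (commit 5d81e5cfb37a) and from "sleep 1" */
	printf("1.4 ms sleep -> %" PRIu64 " samples\n", samples_for(1386750, 1));
	printf("4.5 ms sleep -> %" PRIu64 " samples\n", samples_for(4499585, 1));
	printf("1 s sleep    -> %" PRIu64 " samples\n",
	       samples_for(1000000000ULL, 1));
	return 0;
}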


* Re: [PATCH] trace: add ability to set a target task for events (v2)
  2012-07-11 14:31 ` Frederic Weisbecker
@ 2012-07-11 14:33   ` Peter Zijlstra
  2012-07-11 14:36     ` Frederic Weisbecker
  0 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2012-07-11 14:33 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Andrew Vagin, linux-kernel, Ingo Molnar, Steven Rostedt,
	Paul Mackerras, Arnaldo Carvalho de Melo, Arun Sharma

On Wed, 2012-07-11 at 16:31 +0200, Frederic Weisbecker wrote:
> On Wed, Jul 11, 2012 at 06:14:58PM +0400, Andrew Vagin wrote:
> > A few events are interesting not only for the current task.
> > For example, sched_stat_* events are also interesting for the
> > task which wakes up. For this reason, it would be good if such
> > events were delivered to a target task too.
> > 
> > Now a target task can be set by using __perf_task().
> > 
> > The original idea and a draft patch belong to Peter Zijlstra.
> > 
> > I need these events for profiling sleep times.  sched_switch is used for
> > getting callchains and sched_stat_* is used for getting time periods.
> > These events are combined in user space, where they can be analyzed by
> > perf tools.
> 
> We've talked about that numerous times. But I still don't really
> understand why you're not using sched_switch events and computing
> the difference between schedule-in and schedule-out.
> 
> I think you said that's because you got too many events with
> sched_switch. Are you losing events? Otherwise I don't see why it's
> a problem.
> 
> Also, the sched_stat_sleep tracepoint produces an event whose period
> equals the time slept. Internally, perf splits this into as many events
> as that period, because the requested period for trace events is 1 by
> default. We should probably allow sending events with a higher count
> than the one requested. This sometimes produces a huge pile of events,
> and that often results in tons of lost events. We definitely need to
> fix that.
> 
> In the meantime you'll certainly get saner results by just recording
> sched_switch events.

Not really, there's an arbitrarily large delay between wakeup and getting
scheduled back in, which is unrelated to why you went to
sleep.

He wants the time between going to sleep and getting woken up;
sched_switch simply doesn't give you that.


* Re: [PATCH] trace: add ability to set a target task for events (v2)
  2012-07-11 14:33   ` Peter Zijlstra
@ 2012-07-11 14:36     ` Frederic Weisbecker
  2012-07-11 14:38       ` Peter Zijlstra
  0 siblings, 1 reply; 12+ messages in thread
From: Frederic Weisbecker @ 2012-07-11 14:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andrew Vagin, linux-kernel, Ingo Molnar, Steven Rostedt,
	Paul Mackerras, Arnaldo Carvalho de Melo, Arun Sharma

On Wed, Jul 11, 2012 at 04:33:41PM +0200, Peter Zijlstra wrote:
> On Wed, 2012-07-11 at 16:31 +0200, Frederic Weisbecker wrote:
> > On Wed, Jul 11, 2012 at 06:14:58PM +0400, Andrew Vagin wrote:
> > > A few events are interesting not only for the current task.
> > > For example, sched_stat_* events are also interesting for the
> > > task which wakes up. For this reason, it would be good if such
> > > events were delivered to a target task too.
> > > 
> > > Now a target task can be set by using __perf_task().
> > > 
> > > The original idea and a draft patch belong to Peter Zijlstra.
> > > 
> > > I need these events for profiling sleep times.  sched_switch is used for
> > > getting callchains and sched_stat_* is used for getting time periods.
> > > These events are combined in user space, where they can be analyzed by
> > > perf tools.
> > 
> > We've talked about that numerous times. But I still don't really
> > understand why you're not using sched_switch events and computing
> > the difference between schedule-in and schedule-out.
> > 
> > I think you said that's because you got too many events with
> > sched_switch. Are you losing events? Otherwise I don't see why it's
> > a problem.
> > 
> > Also, the sched_stat_sleep tracepoint produces an event whose period
> > equals the time slept. Internally, perf splits this into as many events
> > as that period, because the requested period for trace events is 1 by
> > default. We should probably allow sending events with a higher count
> > than the one requested. This sometimes produces a huge pile of events,
> > and that often results in tons of lost events. We definitely need to
> > fix that.
> > 
> > In the meantime you'll certainly get saner results by just recording
> > sched_switch events.
> 
> Not really, there's an arbitrarily large delay between wakeup and getting
> scheduled back in, which is unrelated to why you went to
> sleep.
> 
> He wants the time between going to sleep and getting woken up;
> sched_switch simply doesn't give you that.

In this case he can just record sched_wakeup as well. With sched_switch
+ sched_wakeup, he's unlikely to lose events.

With sched_stat_sleep he will lose events, unless we fix this period
demux thing.


* Re: [PATCH] trace: add ability to set a target task for events (v2)
  2012-07-11 14:36     ` Frederic Weisbecker
@ 2012-07-11 14:38       ` Peter Zijlstra
  2012-07-11 14:48         ` Frederic Weisbecker
  0 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2012-07-11 14:38 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Andrew Vagin, linux-kernel, Ingo Molnar, Steven Rostedt,
	Paul Mackerras, Arnaldo Carvalho de Melo, Arun Sharma

On Wed, 2012-07-11 at 16:36 +0200, Frederic Weisbecker wrote:
> 
> In this case he can just record sched_wakeup as well. With sched_switch
> + sched_wakeup, he's unlikely to lose events.
> 
> With sched_stat_sleep he will lose events, unless we fix this period
> demux thing.

But without this patch, the sched_wakeup will belong to another task, so
if you trace task A, and B wakes you, you'll never see the wakeup.


* Re: [PATCH] trace: add ability to set a target task for events (v2)
  2012-07-11 14:38       ` Peter Zijlstra
@ 2012-07-11 14:48         ` Frederic Weisbecker
  2012-07-11 14:55           ` Peter Zijlstra
  0 siblings, 1 reply; 12+ messages in thread
From: Frederic Weisbecker @ 2012-07-11 14:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andrew Vagin, linux-kernel, Ingo Molnar, Steven Rostedt,
	Paul Mackerras, Arnaldo Carvalho de Melo, Arun Sharma

On Wed, Jul 11, 2012 at 04:38:19PM +0200, Peter Zijlstra wrote:
> On Wed, 2012-07-11 at 16:36 +0200, Frederic Weisbecker wrote:
> > 
> > In this case he can just record sched_wakeup as well. With sched_switch
> > + sched_wakeup, he's unlikely to lose events.
> > 
> > With sched_stat_sleep he will lose events, unless we fix this period
> > demux thing.
> 
> But without this patch, the sched_wakeup will belong to another task, so
> if you trace task A, and B wakes you, you'll never see the wakeup.

Ah, so the goal is to minimize the number of events by only tracing task A?
Ok then. Still, we need to fix the events that use __perf_count(), because
system-wide tracing of sched_switch/sched_wakeup still generates fewer
events than sched_stat_sleep does.

I believe:

	perf record -e sched:sched_stat_sleep sleep 1

produces 1 billion events because we sleep for 1 billion nanoseconds. Or
something like that.


* Re: [PATCH] trace: add ability to set a target task for events (v2)
  2012-07-11 14:48         ` Frederic Weisbecker
@ 2012-07-11 14:55           ` Peter Zijlstra
  2012-07-11 15:07             ` Frederic Weisbecker
  2012-07-11 15:12             ` Peter Zijlstra
  0 siblings, 2 replies; 12+ messages in thread
From: Peter Zijlstra @ 2012-07-11 14:55 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Andrew Vagin, linux-kernel, Ingo Molnar, Steven Rostedt,
	Paul Mackerras, Arnaldo Carvalho de Melo, Arun Sharma

On Wed, 2012-07-11 at 16:48 +0200, Frederic Weisbecker wrote:
> On Wed, Jul 11, 2012 at 04:38:19PM +0200, Peter Zijlstra wrote:
> > On Wed, 2012-07-11 at 16:36 +0200, Frederic Weisbecker wrote:
> > > 
> > > In this case he can just record sched_wakeup as well. With sched_switch
> > > + sched_wakeup, he's unlikely to lose events.
> > > 
> > > With sched_stat_sleep he will lose events, unless we fix this period
> > > demux thing.
> > 
> > But without this patch, the sched_wakeup will belong to another task, so
> > if you trace task A, and B wakes you, you'll never see the wakeup.
> 
> Ah, so the goal is to minimize the number of events by only tracing task A?

Right, or just not having sufficient privs to trace the world. And a
wakeup of A is very much also part of A, not only the task doing the
wakeup.

Hence the proposed mechanism.

> Ok then. Still, we need to fix the events that use __perf_count(), because
> system-wide tracing of sched_switch/sched_wakeup still generates fewer
> events than sched_stat_sleep does.
> 
> I believe:
> 
> 	perf record -e sched:sched_stat_sleep sleep 1
> 
> produces 1 billion events because we sleep for 1 billion nanoseconds. Or
> something like that.

Right.. back when I did that the plan was to make PERF_SAMPLE_PERIOD fix
that, of course that never seemed to have happened.

With PERF_SAMPLE_PERIOD you can simply write the 1b into the period of 1
event and be done with it.
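
For illustration, a hypothetical sketch of the userspace side of that (the
helper name and tracepoint_id are made up; the id would normally be read from
debugfs, events/sched/sched_stat_sleep/id): request PERF_SAMPLE_PERIOD in
sample_type and leave attr.freq at 0, so the slept time can be reported as
the period of a single sample.

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

/* Open sched:sched_stat_sleep for 'pid' so that each tracepoint hit is
 * reported as one sample whose period carries the delay. */
static int open_sleep_event(pid_t pid, unsigned long long tracepoint_id)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size          = sizeof(attr);
	attr.type          = PERF_TYPE_TRACEPOINT;
	attr.config        = tracepoint_id;
	attr.sample_period = 1;			/* attr.freq stays 0 */
	attr.sample_type   = PERF_SAMPLE_PERIOD | PERF_SAMPLE_TIME |
			     PERF_SAMPLE_CALLCHAIN | PERF_SAMPLE_RAW;

	return syscall(__NR_perf_event_open, &attr, pid,
		       -1 /* any cpu */, -1 /* no group */, 0);
}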


* Re: [PATCH] trace: add ability to set a target task for events (v2)
  2012-07-11 14:55           ` Peter Zijlstra
@ 2012-07-11 15:07             ` Frederic Weisbecker
  2012-07-11 15:12             ` Peter Zijlstra
  1 sibling, 0 replies; 12+ messages in thread
From: Frederic Weisbecker @ 2012-07-11 15:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andrew Vagin, linux-kernel, Ingo Molnar, Steven Rostedt,
	Paul Mackerras, Arnaldo Carvalho de Melo, Arun Sharma

On Wed, Jul 11, 2012 at 04:55:08PM +0200, Peter Zijlstra wrote:
> On Wed, 2012-07-11 at 16:48 +0200, Frederic Weisbecker wrote:
> > On Wed, Jul 11, 2012 at 04:38:19PM +0200, Peter Zijlstra wrote:
> > > On Wed, 2012-07-11 at 16:36 +0200, Frederic Weisbecker wrote:
> > > > 
> > > > In this case he can just record sched_wakeup as well. With sched_switch
> > > > + sched_wakeup, he's unlikely to lose events.
> > > > 
> > > > With sched_stat_sleep he will lose events, unless we fix this period
> > > > demux thing.
> > > 
> > > But without this patch, the sched_wakeup will belong to another task, so
> > > if you trace task A, and B wakes you, you'll never see the wakeup.
> > 
> > Ah, so the goal is to minimize the number of events by only tracing task A?
> 
> Right, or just not having sufficient privs to trace the world. And a
> wakeup of A is very much also part of A, not only the task doing the
> wakeup.
> 
> Hence the proposed mechanism.

Yeah that's fair.

> 
> > Ok then. Still, we need to fix the events that use __perf_count(), because
> > system-wide tracing of sched_switch/sched_wakeup still generates fewer
> > events than sched_stat_sleep does.
> > 
> > I believe:
> > 
> > 	perf record -e sched:sched_stat_sleep sleep 1
> > 
> > produces 1 billion events because we sleep for 1 billion nanoseconds. Or
> > something like that.
> 
> Right.. back when I did that the plan was to make PERF_SAMPLE_PERIOD fix
> that, of course that never seemed to have happened.
> 
> With PERF_SAMPLE_PERIOD you can simply write the 1b into the period of 1
> event and be done with it.

I believe the perf tools handle variable event periods pretty well on top
of PERF_SAMPLE_PERIOD. We just need to tweak the maths in
perf_swevent_overflow(), I think...


* Re: [PATCH] trace: add ability to set a target task for events (v2)
  2012-07-11 14:14 [PATCH] trace: add ability to set a target task for events (v2) Andrew Vagin
  2012-07-11 14:31 ` Frederic Weisbecker
@ 2012-07-11 15:09 ` Peter Zijlstra
  2012-07-31 17:58 ` [tip:perf/urgent] perf/trace: Add ability to set a target task for events tip-bot for Andrew Vagin
  2 siblings, 0 replies; 12+ messages in thread
From: Peter Zijlstra @ 2012-07-11 15:09 UTC (permalink / raw)
  To: Andrew Vagin
  Cc: linux-kernel, Ingo Molnar, Steven Rostedt, Paul Mackerras,
	Arnaldo Carvalho de Melo, Arun Sharma

On Wed, 2012-07-11 at 18:14 +0400, Andrew Vagin wrote:
> @@ -5226,6 +5227,29 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
>                         perf_swevent_event(event, count, &data, regs);
>         }
>  
> +       /*
> +        * If we got specified a target task, also iterate its context and
> +        * deliver this event there too.
> +        */
> +       if (task && task != current) {
> +               struct perf_event_context *ctx;
> +               struct trace_entry *entry = record;
> +
> +               rcu_read_lock();
> +               ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
> +               if (!ctx)
> +                       goto unlock;
> +
> +               list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {

I added:
			if (event->attr.type != PERF_TYPE_TRACEPOINT)
				continue;

> +                       if (entry->type != event->attr.config)
> +                               continue;
> +                       if (perf_tp_event_match(event, &data, regs))
> +                               perf_swevent_event(event, count, &data, regs);
> +               }
> +unlock:
> +               rcu_read_unlock();
> +       }
> +
>         perf_swevent_put_recursion_context(rctx);
>  } 

Otherwise I've applied the patch.. Thanks!


* Re: [PATCH] trace: add ability to set a target task for events (v2)
  2012-07-11 14:55           ` Peter Zijlstra
  2012-07-11 15:07             ` Frederic Weisbecker
@ 2012-07-11 15:12             ` Peter Zijlstra
  2012-07-11 15:17               ` Frederic Weisbecker
  1 sibling, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2012-07-11 15:12 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Andrew Vagin, linux-kernel, Ingo Molnar, Steven Rostedt,
	Paul Mackerras, Arnaldo Carvalho de Melo, Arun Sharma

On Wed, 2012-07-11 at 16:55 +0200, Peter Zijlstra wrote:
> Right.. back when I did that the plan was to make PERF_SAMPLE_PERIOD fix
> that, of course that never seemed to have happened.
> 
> With PERF_SAMPLE_PERIOD you can simply write the 1b into the period of 1
> event and be done with it. 

It did! Andrew fixed it..

---
commit 5d81e5cfb37a174e8ddc0413e2e70cdf05807ace
Author: Andrew Vagin <avagin@openvz.org>
Date:   Mon Nov 7 15:54:12 2011 +0300

    events: Don't divide events if it has field period
    
    This patch solves the following problem:
    
    Currently, some samples may be lost due to throttling. The number of
    samples is restricted by sysctl_perf_event_sample_rate/HZ.  A trace event
    is divided into several samples according to the event's period.  I'm not
    sure that we should generate more than one sample for each trace event; I
    think the better way is to use SAMPLE_PERIOD.
    
    E.g.: I want to trace when a process sleeps. I created a process, which
    sleeps for 1ms and for 4ms.  perf got 100 events in both cases.
    
    swapper     0 [000]  1141.371830: sched_stat_sleep: comm=foo pid=1801 delay=1386750 [ns]
    swapper     0 [000]  1141.369444: sched_stat_sleep: comm=foo pid=1801 delay=4499585 [ns]
    
    In the first case the kernel wants to send 4499585 events and
    in the second case it wants to send 1386750 events.
    perf report shows that the process sleeps for an equal time in both
    places. It's a bug.
    
    With this patch the kernel generates one event for each "sleep" and the
    time slice is saved in the "period" field. Perf knows how to handle it.
    
    Signed-off-by: Andrew Vagin <avagin@openvz.org>
    Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Link: http://lkml.kernel.org/r/1320670457-2633428-3-git-send-email-avagin@openvz.org
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/events/core.c b/kernel/events/core.c
index eadac69..8d9dea5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4528,7 +4528,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
 	struct hw_perf_event *hwc = &event->hw;
 	int throttle = 0;
 
-	data->period = event->hw.last_period;
 	if (!overflow)
 		overflow = perf_swevent_set_period(event);
 
@@ -4562,6 +4561,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
 	if (!is_sampling_event(event))
 		return;
 
+	if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
+		data->period = nr;
+		return perf_swevent_overflow(event, 1, data, regs);
+	} else
+		data->period = event->hw.last_period;
+
 	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
 		return perf_swevent_overflow(event, 1, data, regs);
 



* Re: [PATCH] trace: add ability to set a target task for events (v2)
  2012-07-11 15:12             ` Peter Zijlstra
@ 2012-07-11 15:17               ` Frederic Weisbecker
  0 siblings, 0 replies; 12+ messages in thread
From: Frederic Weisbecker @ 2012-07-11 15:17 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andrew Vagin, linux-kernel, Ingo Molnar, Steven Rostedt,
	Paul Mackerras, Arnaldo Carvalho de Melo, Arun Sharma

On Wed, Jul 11, 2012 at 05:12:04PM +0200, Peter Zijlstra wrote:
> On Wed, 2012-07-11 at 16:55 +0200, Peter Zijlstra wrote:
> > Right.. back when I did that the plan was to make PERF_SAMPLE_PERIOD fix
> > that, of course that never seemed to have happened.
> > 
> > With PERF_SAMPLE_PERIOD you can simply write the 1b into the period of 1
> > event and be done with it. 
> 
> It did! Andrew fixed it..

Ah! Then maybe we need to force PERF_SAMPLE_PERIOD on tracepoints from the
perf tools. I need to check that.


* [tip:perf/urgent] perf/trace: Add ability to set a target task for events
  2012-07-11 14:14 [PATCH] trace: add ability to set a target task for events (v2) Andrew Vagin
  2012-07-11 14:31 ` Frederic Weisbecker
  2012-07-11 15:09 ` Peter Zijlstra
@ 2012-07-31 17:58 ` tip-bot for Andrew Vagin
  2 siblings, 0 replies; 12+ messages in thread
From: tip-bot for Andrew Vagin @ 2012-07-31 17:58 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, paulus, hpa, mingo, a.p.zijlstra, acme, peterz,
	avagin, rostedt, tglx, asharma

Commit-ID:  e6dab5ffab59e910ec0e3355f4a6f29f7a7be474
Gitweb:     http://git.kernel.org/tip/e6dab5ffab59e910ec0e3355f4a6f29f7a7be474
Author:     Andrew Vagin <avagin@openvz.org>
AuthorDate: Wed, 11 Jul 2012 18:14:58 +0400
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Tue, 31 Jul 2012 17:02:05 +0200

perf/trace: Add ability to set a target task for events

A few events are interesting not only for the current task.
For example, sched_stat_* events are also interesting for the
task which wakes up. For this reason, it would be good if such
events were delivered to a target task too.

Now a target task can be set by using __perf_task().

The original idea and a draft patch belong to Peter Zijlstra.

I need these events for profiling sleep times. sched_switch is used for
getting callchains and sched_stat_* is used for getting time periods.
These events are combined in user space, where they can be analyzed by
perf tools.

Inspired-by: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arun Sharma <asharma@fb.com>
Signed-off-by: Andrew Vagin <avagin@openvz.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1342016098-213063-1-git-send-email-avagin@openvz.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/ftrace_event.h    |    5 +++--
 include/linux/perf_event.h      |    3 ++-
 include/trace/events/sched.h    |    4 ++++
 include/trace/ftrace.h          |    6 +++++-
 kernel/events/callchain.c       |    9 ++++++++-
 kernel/events/core.c            |   30 ++++++++++++++++++++++++++++--
 kernel/events/internal.h        |    3 ++-
 kernel/trace/trace_event_perf.c |    2 +-
 kernel/trace/trace_kprobe.c     |    6 ++++--
 kernel/trace/trace_syscalls.c   |    4 ++--
 kernel/trace/trace_uprobe.c     |    2 +-
 11 files changed, 60 insertions(+), 14 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index af961d6..642928c 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -306,9 +306,10 @@ extern void *perf_trace_buf_prepare(int size, unsigned short type,
 
 static inline void
 perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
-		       u64 count, struct pt_regs *regs, void *head)
+		       u64 count, struct pt_regs *regs, void *head,
+		       struct task_struct *task)
 {
-	perf_tp_event(addr, count, raw_data, size, regs, head, rctx);
+	perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task);
 }
 #endif
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 76c5c8b..7602ccb 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1272,7 +1272,8 @@ static inline bool perf_paranoid_kernel(void)
 extern void perf_event_init(void);
 extern void perf_tp_event(u64 addr, u64 count, void *record,
 			  int entry_size, struct pt_regs *regs,
-			  struct hlist_head *head, int rctx);
+			  struct hlist_head *head, int rctx,
+			  struct task_struct *task);
 extern void perf_bp_event(struct perf_event *event, void *data);
 
 #ifndef perf_misc_flags
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index ea7a203..5a8671e 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -73,6 +73,9 @@ DECLARE_EVENT_CLASS(sched_wakeup_template,
 		__entry->prio		= p->prio;
 		__entry->success	= success;
 		__entry->target_cpu	= task_cpu(p);
+	)
+	TP_perf_assign(
+		__perf_task(p);
 	),
 
 	TP_printk("comm=%s pid=%d prio=%d success=%d target_cpu=%03d",
@@ -325,6 +328,7 @@ DECLARE_EVENT_CLASS(sched_stat_template,
 	)
 	TP_perf_assign(
 		__perf_count(delay);
+		__perf_task(tsk);
 	),
 
 	TP_printk("comm=%s pid=%d delay=%Lu [ns]",
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index c6bc2fa..a763888 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -712,6 +712,9 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
 #undef __perf_count
 #define __perf_count(c) __count = (c)
 
+#undef __perf_task
+#define __perf_task(t) __task = (t)
+
 #undef TP_perf_assign
 #define TP_perf_assign(args...) args
 
@@ -725,6 +728,7 @@ perf_trace_##call(void *__data, proto)					\
 	struct ftrace_raw_##call *entry;				\
 	struct pt_regs __regs;						\
 	u64 __addr = 0, __count = 1;					\
+	struct task_struct *__task = NULL;				\
 	struct hlist_head *head;					\
 	int __entry_size;						\
 	int __data_size;						\
@@ -752,7 +756,7 @@ perf_trace_##call(void *__data, proto)					\
 									\
 	head = this_cpu_ptr(event_call->perf_events);			\
 	perf_trace_buf_submit(entry, __entry_size, rctx, __addr,	\
-		__count, &__regs, head);				\
+		__count, &__regs, head, __task);			\
 }
 
 /*
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6581a04..98d4597 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -153,7 +153,8 @@ put_callchain_entry(int rctx)
 	put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
 }
 
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
 	int rctx;
 	struct perf_callchain_entry *entry;
@@ -178,6 +179,12 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	}
 
 	if (regs) {
+		/*
+		 * Disallow cross-task user callchains.
+		 */
+		if (event->ctx->task && event->ctx->task != current)
+			goto exit_put;
+
 		perf_callchain_store(entry, PERF_CONTEXT_USER);
 		perf_callchain_user(entry, regs);
 	}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f1cf0ed..b7935fc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4039,7 +4039,7 @@ void perf_prepare_sample(struct perf_event_header *header,
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		int size = 1;
 
-		data->callchain = perf_callchain(regs);
+		data->callchain = perf_callchain(event, regs);
 
 		if (data->callchain)
 			size += data->callchain->nr;
@@ -5209,7 +5209,8 @@ static int perf_tp_event_match(struct perf_event *event,
 }
 
 void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
-		   struct pt_regs *regs, struct hlist_head *head, int rctx)
+		   struct pt_regs *regs, struct hlist_head *head, int rctx,
+		   struct task_struct *task)
 {
 	struct perf_sample_data data;
 	struct perf_event *event;
@@ -5228,6 +5229,31 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
 			perf_swevent_event(event, count, &data, regs);
 	}
 
+	/*
+	 * If we got specified a target task, also iterate its context and
+	 * deliver this event there too.
+	 */
+	if (task && task != current) {
+		struct perf_event_context *ctx;
+		struct trace_entry *entry = record;
+
+		rcu_read_lock();
+		ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+		if (!ctx)
+			goto unlock;
+
+		list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+			if (event->attr.type != PERF_TYPE_TRACEPOINT)
+				continue;
+			if (event->attr.config != entry->type)
+				continue;
+			if (perf_tp_event_match(event, &data, regs))
+				perf_swevent_event(event, count, &data, regs);
+		}
+unlock:
+		rcu_read_unlock();
+	}
+
 	perf_swevent_put_recursion_context(rctx);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index b0b107f..a096c19 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -101,7 +101,8 @@ __output_copy(struct perf_output_handle *handle,
 }
 
 /* Callchain handling */
-extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+extern struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs);
 extern int get_callchain_buffers(void);
 extern void put_callchain_buffers(void);
 
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index fee3752..8a6d2ee 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -281,7 +281,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip)
 
 	head = this_cpu_ptr(event_function.perf_events);
 	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
-			      1, &regs, head);
+			      1, &regs, head, NULL);
 
 #undef ENTRY_SIZE
 }
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b31d3d5..1a21170 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1002,7 +1002,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
 	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
 
 	head = this_cpu_ptr(call->perf_events);
-	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
+	perf_trace_buf_submit(entry, size, rctx,
+					entry->ip, 1, regs, head, NULL);
 }
 
 /* Kretprobe profile handler */
@@ -1033,7 +1034,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
 	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
 
 	head = this_cpu_ptr(call->perf_events);
-	perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
+	perf_trace_buf_submit(entry, size, rctx,
+					entry->ret_ip, 1, regs, head, NULL);
 }
 #endif	/* CONFIG_PERF_EVENTS */
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 96fc733..60e4d78 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -532,7 +532,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 			       (unsigned long *)&rec->args);
 
 	head = this_cpu_ptr(sys_data->enter_event->perf_events);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
 }
 
 int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -608,7 +608,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	rec->ret = syscall_get_return_value(current, regs);
 
 	head = this_cpu_ptr(sys_data->exit_event->perf_events);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
 }
 
 int perf_sysexit_enable(struct ftrace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 2b36ac6..03003cd 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -670,7 +670,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
 		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
 
 	head = this_cpu_ptr(call->perf_events);
-	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
+	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL);
 
  out:
 	preempt_enable();

