* [PATCH] perf/ftrace : Fix repetitious traces when specify a target  task
@ 2017-10-10  1:19 Cheng Jian
  2017-10-10 11:33 ` Peter Zijlstra
  0 siblings, 1 reply; 7+ messages in thread
From: Cheng Jian @ 2017-10-10  1:19 UTC (permalink / raw)
  To: peterz, mingo, acme, alexander.shishkin
  Cc: linux-kernel, xiexiuqi, huawei.libin, cj.chengjian

When using perf to trace the sched_wakeup and sched_wakeup_new
tracepoints, the same event is output repeatedly. This can be
reproduced with:

	perf record -e sched:sched_wakeup_new ./bug_fork

bug_fork is a demo program that generates a wakeup_new event:

	the parent process does nothing but fork a child process,
	and then they both exit.

perf script:

	bug_fork  1078 [002]   184.669341: sched:sched_wakeup_new: comm=bug_fork pid=1079 prio=120 target_cpu=000
	bug_fork  1078 [002]   184.670128: sched:sched_wakeup_new: comm=bug_fork pid=1079 prio=120 target_cpu=000
	bug_fork  1078 [002]   184.670128: sched:sched_wakeup_new: comm=bug_fork pid=1079 prio=120 target_cpu=000
	bug_fork  1078 [002]   184.670128: sched:sched_wakeup_new: comm=bug_fork pid=1079 prio=120 target_cpu=000
	bug_fork  1078 [002]   184.670128: sched:sched_wakeup_new: comm=bug_fork pid=1079 prio=120 target_cpu=000

but ftrace shows only one event:

	bug_fork-1078  [002] d...   184.889159: sched_wakeup_new: comm=bug_fork pid=1079 prio=120 target_cpu=000

perf script prints the wakeup_new event multiple times.

The events that trigger this issue not only monitor the current task
but also specify a target task. For example, the sched_wakeup and
sched_wakeup_new tracepoints fire when the current task wakes up the
target task that we are tracing.

Commit e6dab5ffab59 ("perf/trace: Add ability to set a target task
for events") introduced a way to trace events that specify a target
task, but there are two issues when monitoring multithreaded or
multiprocess applications.

First, an event (such as wakeup/wakeup_new/stat_*) is matched at the
beginning of perf_tp_event(), but the function does not return there,
so the same event is matched again because of task != current.

Second, because these events are registered/mmapped per CPU or per
thread (--per-thread), they are matched multiple times in the
task != current branch; the number of repetitions equals the number
of CPUs or threads.
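
For reference, the delivery path in perf_tp_event() currently looks
roughly like this (simplified sketch; the task != current branch is
the one introduced by e6dab5ffab59):

	/* deliver via the given event or the per-cpu hlist */
	if (event) {
		if (perf_tp_event_match(event, &data, regs))
			perf_swevent_event(event, count, &data, regs);
	} else {
		hlist_for_each_entry_rcu(event, head, hlist_entry) {
			if (perf_tp_event_match(event, &data, regs))
				perf_swevent_event(event, count, &data, regs);
		}
	}

	/* ... then additionally walk the target task's context */
	if (task && task != current) {
		/* ctx: the target task's perf event context */
		list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
			...
			if (perf_tp_event_match(event, &data, regs))
				perf_swevent_event(event, count, &data, regs);
		}
	}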

perf_tp_event() should only match one event at a time, so return as
soon as an event has been matched.

Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 kernel/events/core.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index baa134c..5682ead 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7988,12 +7988,16 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 
 	/* Use the given event instead of the hlist */
 	if (event) {
-		if (perf_tp_event_match(event, &data, regs))
+		if (perf_tp_event_match(event, &data, regs)) {
 			perf_swevent_event(event, count, &data, regs);
+			goto out;
+		}
 	} else {
 		hlist_for_each_entry_rcu(event, head, hlist_entry) {
-			if (perf_tp_event_match(event, &data, regs))
+			if (perf_tp_event_match(event, &data, regs)) {
 				perf_swevent_event(event, count, &data, regs);
+				goto out;
+			}
 		}
 	}
 
@@ -8015,13 +8019,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 				continue;
 			if (event->attr.config != entry->type)
 				continue;
-			if (perf_tp_event_match(event, &data, regs))
+			if (perf_tp_event_match(event, &data, regs)) {
 				perf_swevent_event(event, count, &data, regs);
+				break;
+			}
 		}
 unlock:
 		rcu_read_unlock();
 	}
-
+out:
 	perf_swevent_put_recursion_context(rctx);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
-- 
1.8.3.1


* Re: [PATCH] perf/ftrace : Fix repetitious traces when specify a target  task
  2017-10-10  1:19 [PATCH] perf/ftrace : Fix repetitious traces when specify a target task Cheng Jian
@ 2017-10-10 11:33 ` Peter Zijlstra
  2017-10-10 12:18   ` chengjian (D)
  2017-10-10 13:04   ` Peter Zijlstra
  0 siblings, 2 replies; 7+ messages in thread
From: Peter Zijlstra @ 2017-10-10 11:33 UTC (permalink / raw)
  To: Cheng Jian
  Cc: mingo, acme, alexander.shishkin, linux-kernel, xiexiuqi,
	huawei.libin, Steven Rostedt, Jiri Olsa

On Tue, Oct 10, 2017 at 09:19:39AM +0800, Cheng Jian wrote:
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index baa134c..5682ead 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -7988,12 +7988,16 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
>  
>  	/* Use the given event instead of the hlist */
>  	if (event) {
> -		if (perf_tp_event_match(event, &data, regs))
> +		if (perf_tp_event_match(event, &data, regs)) {
>  			perf_swevent_event(event, count, &data, regs);
> +			goto out;
> +		}
>  	} else {
>  		hlist_for_each_entry_rcu(event, head, hlist_entry) {
> -			if (perf_tp_event_match(event, &data, regs))
> +			if (perf_tp_event_match(event, &data, regs)) {
>  				perf_swevent_event(event, count, &data, regs);
> +				goto out;
> +			}
>  		}
>  	}
>  
> @@ -8015,13 +8019,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
>  				continue;
>  			if (event->attr.config != entry->type)
>  				continue;
> -			if (perf_tp_event_match(event, &data, regs))
> +			if (perf_tp_event_match(event, &data, regs)) {
>  				perf_swevent_event(event, count, &data, regs);
> +				break;
> +			}
>  		}
>  unlock:
>  		rcu_read_unlock();
>  	}
> -
> +out:
>  	perf_swevent_put_recursion_context(rctx);
>  }
>  EXPORT_SYMBOL_GPL(perf_tp_event);

No, this _cannot_ be right. The whole point of the @task argument was to
deliver the event multiple times -- maybe not to the same event, but it
needs to be delivered multiple times in some cases. Therefore this is
broken.

But now you've got me looking at 75e8387685f6, which also looks
completely insane.


* Re: [PATCH] perf/ftrace : Fix repetitious traces when specify a target task
  2017-10-10 11:33 ` Peter Zijlstra
@ 2017-10-10 12:18   ` chengjian (D)
  2017-10-10 13:42     ` Peter Zijlstra
  2017-10-10 13:04   ` Peter Zijlstra
  1 sibling, 1 reply; 7+ messages in thread
From: chengjian (D) @ 2017-10-10 12:18 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, acme, alexander.shishkin, linux-kernel, xiexiuqi,
	huawei.libin, Steven Rostedt, Jiri Olsa

On 2017/10/10 19:33, Peter Zijlstra wrote:
>
> No, this _cannot_ be right. The whole point of the @task argument was to
> deliver the event multiple times -- maybe not to the same event, but it
> needs to be delivered multiple times in some cases. Therefore this is
> broken.
>
> But now you've got me looking at 75e8387685f6, which also looks
> completely insane.
> .
The demo program looks like this:

```c
// bug_fork.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	pid_t pid;

	printf("parent pid = %d\n", getpid());

	pid = fork();

	if (pid < 0) {
		perror("fork error");
		exit(-1);
	} else if (pid == 0) {
		/* fork() returns 0 in the child process */
		printf("child pid = %d\n", getpid());
		exit(0);
	} else {
		/* fork() returns the child's pid in the parent process */
		//sleep(1);
		//count++;
	}

	return EXIT_SUCCESS;
}
```


The parent wakes up the new child process only once, but perf matches
the event many times when per-CPU mmaps are used.

In my opinion, since perf stat uses a per-thread map, it matches twice:

    sudo perf stat -e sched:sched_wakeup_new ./bug_fork

     parent pid = 86155
     child pid = 86156

     Performance counter stats for './bug_fork':

                  2      sched:sched_wakeup_new

        0.001112926 seconds time elapsed


Does that mean one match for the event tracing the parent (86155) and
one for the event tracing the child (86156)?

And in what cases should the events be delivered multiple times?


Thanks...


* Re: [PATCH] perf/ftrace : Fix repetitious traces when specify a target  task
  2017-10-10 11:33 ` Peter Zijlstra
  2017-10-10 12:18   ` chengjian (D)
@ 2017-10-10 13:04   ` Peter Zijlstra
  2017-10-10 15:21     ` Peter Zijlstra
  1 sibling, 1 reply; 7+ messages in thread
From: Peter Zijlstra @ 2017-10-10 13:04 UTC (permalink / raw)
  To: Cheng Jian
  Cc: mingo, acme, alexander.shishkin, linux-kernel, xiexiuqi,
	huawei.libin, Steven Rostedt, Jiri Olsa

On Tue, Oct 10, 2017 at 01:33:21PM +0200, Peter Zijlstra wrote:
> But now you've got me looking at 75e8387685f6, which also looks
> completely insane.

The reason I insta stumbled on that patch is that it only addresses the
ftrace situation and doesn't mention the other _5_ places that use this
interface. It doesn't explain why those don't have the problem and if
not, why their solution doesn't work for ftrace.

So all (well syscall and regular tracepoints, didn't check the others)
avoid that problem by simply not registering multiple times at the
tracepoint. Tracepoints use tp_event->perf_refcount and the syscall
things use sys_perf_refcount_{enter,exit} for that.

Doing the same for function trace looks a little something like the
below (after reverting 75e8387685f6)

Except the below doesn't compile because of
kernel/trace/trace_event_filter.c, which is where I lost the plot.

Steve, can we make the below work?

---
 include/linux/ftrace.h          |   51 --------------------------------------
 include/linux/perf_event.h      |    3 --
 include/linux/trace_events.h    |    2 -
 kernel/trace/ftrace.c           |   53 +++-------------------------------------
 kernel/trace/trace_event_perf.c |   48 +++++++++++++++++-------------------
 kernel/trace/trace_events.c     |    2 -
 kernel/trace/trace_kprobe.c     |    2 -
 kernel/trace/trace_syscalls.c   |    4 ---
 8 files changed, 29 insertions(+), 136 deletions(-)

--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -125,7 +125,7 @@ ftrace_func_t ftrace_ops_get_func(struct
 enum {
 	FTRACE_OPS_FL_ENABLED			= 1 << 0,
 	FTRACE_OPS_FL_DYNAMIC			= 1 << 1,
-	FTRACE_OPS_FL_PER_CPU			= 1 << 2,
+
 	FTRACE_OPS_FL_SAVE_REGS			= 1 << 3,
 	FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED	= 1 << 4,
 	FTRACE_OPS_FL_RECURSION_SAFE		= 1 << 5,
@@ -204,55 +204,6 @@ int register_ftrace_function(struct ftra
 int unregister_ftrace_function(struct ftrace_ops *ops);
 void clear_ftrace_function(void);
 
-/**
- * ftrace_function_local_enable - enable ftrace_ops on current cpu
- *
- * This function enables tracing on current cpu by decreasing
- * the per cpu control variable.
- * It must be called with preemption disabled and only on ftrace_ops
- * registered with FTRACE_OPS_FL_PER_CPU. If called without preemption
- * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
- */
-static inline void ftrace_function_local_enable(struct ftrace_ops *ops)
-{
-	if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
-		return;
-
-	(*this_cpu_ptr(ops->disabled))--;
-}
-
-/**
- * ftrace_function_local_disable - disable ftrace_ops on current cpu
- *
- * This function disables tracing on current cpu by increasing
- * the per cpu control variable.
- * It must be called with preemption disabled and only on ftrace_ops
- * registered with FTRACE_OPS_FL_PER_CPU. If called without preemption
- * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
- */
-static inline void ftrace_function_local_disable(struct ftrace_ops *ops)
-{
-	if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
-		return;
-
-	(*this_cpu_ptr(ops->disabled))++;
-}
-
-/**
- * ftrace_function_local_disabled - returns ftrace_ops disabled value
- *                                  on current cpu
- *
- * This function returns value of ftrace_ops::disabled on current cpu.
- * It must be called with preemption disabled and only on ftrace_ops
- * registered with FTRACE_OPS_FL_PER_CPU. If called without preemption
- * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
- */
-static inline int ftrace_function_local_disabled(struct ftrace_ops *ops)
-{
-	WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU));
-	return *this_cpu_ptr(ops->disabled);
-}
-
 extern void ftrace_stub(unsigned long a0, unsigned long a1,
 			struct ftrace_ops *op, struct pt_regs *regs);
 
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -676,9 +676,6 @@ struct perf_event {
 #ifdef CONFIG_EVENT_TRACING
 	struct trace_event_call		*tp_event;
 	struct event_filter		*filter;
-#ifdef CONFIG_FUNCTION_TRACER
-	struct ftrace_ops               ftrace_ops;
-#endif
 #endif
 
 #ifdef CONFIG_CGROUP_PERF
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -173,8 +173,6 @@ enum trace_reg {
 	TRACE_REG_PERF_UNREGISTER,
 	TRACE_REG_PERF_OPEN,
 	TRACE_REG_PERF_CLOSE,
-	TRACE_REG_PERF_ADD,
-	TRACE_REG_PERF_DEL,
 #endif
 };
 
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -203,30 +203,6 @@ void clear_ftrace_function(void)
 	ftrace_trace_function = ftrace_stub;
 }
 
-static void per_cpu_ops_disable_all(struct ftrace_ops *ops)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		*per_cpu_ptr(ops->disabled, cpu) = 1;
-}
-
-static int per_cpu_ops_alloc(struct ftrace_ops *ops)
-{
-	int __percpu *disabled;
-
-	if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
-		return -EINVAL;
-
-	disabled = alloc_percpu(int);
-	if (!disabled)
-		return -ENOMEM;
-
-	ops->disabled = disabled;
-	per_cpu_ops_disable_all(ops);
-	return 0;
-}
-
 static void ftrace_sync(struct work_struct *work)
 {
 	/*
@@ -262,7 +238,7 @@ static ftrace_func_t ftrace_ops_get_list
 	 * If this is a dynamic, RCU, or per CPU ops, or we force list func,
 	 * then it needs to call the list anyway.
 	 */
-	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU |
+	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC |
 			  FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC)
 		return ftrace_ops_list_func;
 
@@ -422,11 +398,6 @@ static int __register_ftrace_function(st
 	if (!core_kernel_data((unsigned long)ops))
 		ops->flags |= FTRACE_OPS_FL_DYNAMIC;
 
-	if (ops->flags & FTRACE_OPS_FL_PER_CPU) {
-		if (per_cpu_ops_alloc(ops))
-			return -ENOMEM;
-	}
-
 	add_ftrace_ops(&ftrace_ops_list, ops);
 
 	/* Always save the function, and reset at unregistering */
@@ -2727,11 +2698,6 @@ void __weak arch_ftrace_trampoline_free(
 {
 }
 
-static void per_cpu_ops_free(struct ftrace_ops *ops)
-{
-	free_percpu(ops->disabled);
-}
-
 static void ftrace_startup_enable(int command)
 {
 	if (saved_ftrace_func != ftrace_trace_function) {
@@ -2833,7 +2799,7 @@ static int ftrace_shutdown(struct ftrace
 		 * not currently active, we can just free them
 		 * without synchronizing all CPUs.
 		 */
-		if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU))
+		if (ops->flags & (FTRACE_OPS_FL_DYNAMIC))
 			goto free_ops;
 
 		return 0;
@@ -2880,7 +2846,7 @@ static int ftrace_shutdown(struct ftrace
 	 * The same goes for freeing the per_cpu data of the per_cpu
 	 * ops.
 	 */
-	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
+	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC)) {
 		/*
 		 * We need to do a hard force of sched synchronization.
 		 * This is because we use preempt_disable() to do RCU, but
@@ -2903,9 +2869,6 @@ static int ftrace_shutdown(struct ftrace
 
  free_ops:
 		arch_ftrace_trampoline_free(ops);
-
-		if (ops->flags & FTRACE_OPS_FL_PER_CPU)
-			per_cpu_ops_free(ops);
 	}
 
 	return 0;
@@ -6066,10 +6029,7 @@ __ftrace_ops_list_func(unsigned long ip,
 		 * If any of the above fails then the op->func() is not executed.
 		 */
 		if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) &&
-		    (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
-		     !ftrace_function_local_disabled(op)) &&
 		    ftrace_ops_test(op, ip, regs)) {
-		    
 			if (FTRACE_WARN_ON(!op->func)) {
 				pr_warn("op=%p %pS\n", op, op);
 				goto out;
@@ -6127,10 +6087,7 @@ static void ftrace_ops_assist_func(unsig
 
 	preempt_disable_notrace();
 
-	if (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
-	    !ftrace_function_local_disabled(op)) {
-		op->func(ip, parent_ip, op, regs);
-	}
+	op->func(ip, parent_ip, op, regs);
 
 	preempt_enable_notrace();
 	trace_clear_recursion(bit);
@@ -6154,7 +6111,7 @@ ftrace_func_t ftrace_ops_get_func(struct
 	 * or does per cpu logic, then we need to call the assist handler.
 	 */
 	if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) ||
-	    ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU))
+	    ops->flags & (FTRACE_OPS_FL_RCU))
 		return ftrace_ops_assist_func;
 
 	return ops->func;
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -253,14 +253,12 @@ int perf_trace_add(struct perf_event *p_
 	list = this_cpu_ptr(pcpu_list);
 	hlist_add_head_rcu(&p_event->hlist_entry, list);
 
-	return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
+	return 0;
 }
 
 void perf_trace_del(struct perf_event *p_event, int flags)
 {
-	struct trace_event_call *tp_event = p_event->tp_event;
 	hlist_del_rcu(&p_event->hlist_entry);
-	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
 }
 
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
@@ -302,6 +300,10 @@ void perf_trace_buf_update(void *record,
 NOKPROBE_SYMBOL(perf_trace_buf_update);
 
 #ifdef CONFIG_FUNCTION_TRACER
+static DEFINE_MUTEX(ftrace_lock);
+static int ftrace_perf_refcount;
+static struct ftrace_ops ftrace_perf_ops;
+
 static void
 perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 			  struct ftrace_ops *ops, struct pt_regs *pt_regs)
@@ -337,29 +339,31 @@ perf_ftrace_function_call(unsigned long
 
 static int perf_ftrace_function_register(struct perf_event *event)
 {
-	struct ftrace_ops *ops = &event->ftrace_ops;
+	int ret = 0;
 
-	ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
-	ops->func = perf_ftrace_function_call;
-	return register_ftrace_function(ops);
-}
+	mutex_lock(&ftrace_lock);
+	if (!ftrace_perf_refcount++) {
+		ftrace_perf_ops.flags = FTRACE_OPS_FL_RCU;
+		ftrace_perf_ops.func  = perf_ftrace_function_call;
+		ret = register_ftrace_function(&ftrace_perf_ops);
+	}
+	mutex_unlock(&ftrace_lock);
 
-static int perf_ftrace_function_unregister(struct perf_event *event)
-{
-	struct ftrace_ops *ops = &event->ftrace_ops;
-	int ret = unregister_ftrace_function(ops);
-	ftrace_free_filter(ops);
 	return ret;
 }
 
-static void perf_ftrace_function_enable(struct perf_event *event)
+static int perf_ftrace_function_unregister(struct perf_event *event)
 {
-	ftrace_function_local_enable(&event->ftrace_ops);
-}
+	int ret = 0;
 
-static void perf_ftrace_function_disable(struct perf_event *event)
-{
-	ftrace_function_local_disable(&event->ftrace_ops);
+	mutex_lock(&ftrace_lock);
+	if (!--ftrace_perf_refcount) {
+		ret = unregister_ftrace_function(&ftrace_perf_ops);
+		ftrace_free_filter(&ftrace_perf_ops);
+	}
+	mutex_unlock(&ftrace_lock);
+
+	return ret;
 }
 
 int perf_ftrace_event_register(struct trace_event_call *call,
@@ -376,12 +380,6 @@ int perf_ftrace_event_register(struct tr
 		return perf_ftrace_function_register(data);
 	case TRACE_REG_PERF_CLOSE:
 		return perf_ftrace_function_unregister(data);
-	case TRACE_REG_PERF_ADD:
-		perf_ftrace_function_enable(data);
-		return 0;
-	case TRACE_REG_PERF_DEL:
-		perf_ftrace_function_disable(data);
-		return 0;
 	}
 
 	return -EINVAL;
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -312,8 +312,6 @@ int trace_event_reg(struct trace_event_c
 		return 0;
 	case TRACE_REG_PERF_OPEN:
 	case TRACE_REG_PERF_CLOSE:
-	case TRACE_REG_PERF_ADD:
-	case TRACE_REG_PERF_DEL:
 		return 0;
 #endif
 	}
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1266,8 +1266,6 @@ static int kprobe_register(struct trace_
 		return disable_trace_kprobe(tk, NULL);
 	case TRACE_REG_PERF_OPEN:
 	case TRACE_REG_PERF_CLOSE:
-	case TRACE_REG_PERF_ADD:
-	case TRACE_REG_PERF_DEL:
 		return 0;
 #endif
 	}
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -775,8 +775,6 @@ static int syscall_enter_register(struct
 		return 0;
 	case TRACE_REG_PERF_OPEN:
 	case TRACE_REG_PERF_CLOSE:
-	case TRACE_REG_PERF_ADD:
-	case TRACE_REG_PERF_DEL:
 		return 0;
 #endif
 	}
@@ -803,8 +801,6 @@ static int syscall_exit_register(struct
 		return 0;
 	case TRACE_REG_PERF_OPEN:
 	case TRACE_REG_PERF_CLOSE:
-	case TRACE_REG_PERF_ADD:
-	case TRACE_REG_PERF_DEL:
 		return 0;
 #endif
 	}


* Re: [PATCH] perf/ftrace : Fix repetitious traces when specify a target task
  2017-10-10 12:18   ` chengjian (D)
@ 2017-10-10 13:42     ` Peter Zijlstra
  0 siblings, 0 replies; 7+ messages in thread
From: Peter Zijlstra @ 2017-10-10 13:42 UTC (permalink / raw)
  To: chengjian (D)
  Cc: mingo, acme, alexander.shishkin, linux-kernel, xiexiuqi,
	huawei.libin, Steven Rostedt, Jiri Olsa

On Tue, Oct 10, 2017 at 08:18:13PM +0800, chengjian (D) wrote:

> and what cases should the events be delivered multiple times?


Task-A		Task-B		Task-C		Task-D

fd = sys_perf_event_open(B)			fd = sys_perf_event_open(C)

		wake_up_process(C)

Here A traces B, and D traces C. When B wakes C, we want both A and D to
get the event.


* Re: [PATCH] perf/ftrace : Fix repetitious traces when specify a target  task
  2017-10-10 13:04   ` Peter Zijlstra
@ 2017-10-10 15:21     ` Peter Zijlstra
  2017-10-10 16:35       ` Steven Rostedt
  0 siblings, 1 reply; 7+ messages in thread
From: Peter Zijlstra @ 2017-10-10 15:21 UTC (permalink / raw)
  To: Cheng Jian
  Cc: mingo, acme, alexander.shishkin, linux-kernel, xiexiuqi,
	huawei.libin, Steven Rostedt, Jiri Olsa

On Tue, Oct 10, 2017 at 03:04:48PM +0200, Peter Zijlstra wrote:
> On Tue, Oct 10, 2017 at 01:33:21PM +0200, Peter Zijlstra wrote:
> > But now you've got me looking at 75e8387685f6, which also looks
> > completely insane.
> 
> The reason I insta stumbled on that patch is that it only addresses the
> ftrace situation and doesn't mention the other _5_ places that use this
> interface. It doesn't explain why those don't have the problem and if
> not, why their solution doesn't work for ftrace.
> 
> So all (well syscall and regular tracepoints, didn't check the others)
> avoid that problem by simply not registering multiple times at the
> tracepoint. Tracepoints use tp_event->perf_refcount and the syscall
> things use sys_perf_refcount_{enter,exit} for that.
> 
> Doing the same for function trace looks a little something like the
> below (after reverting 75e8387685f6)
> 
> Except the below doesn't compile because of
> kernel/trace/trace_event_filter.c, which is where I lost the plot.

OK, so that filter stuff was the entire reason for this trainwreck :/

Using ftrace_ops filters allows ftrace to patch less functions etc.. So
that requires an ftrace_ops per event. Still that then instantly
suggests we fix the whole hlist situation instead of making it worse.

See below; I now have 3 patches: revert, the below, kill
FTRACE_OPS_FL_PER_CPU.

How's this?

---
 kernel/trace/trace_event_perf.c |   68 +++++++++++++++++++++-------------------
 1 file changed, 36 insertions(+), 32 deletions(-)

--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -240,27 +240,31 @@ void perf_trace_destroy(struct perf_even
 int perf_trace_add(struct perf_event *p_event, int flags)
 {
 	struct trace_event_call *tp_event = p_event->tp_event;
-	struct hlist_head __percpu *pcpu_list;
-	struct hlist_head *list;
 
-	pcpu_list = tp_event->perf_events;
-	if (WARN_ON_ONCE(!pcpu_list))
-		return -EINVAL;
+	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
+		struct hlist_head __percpu *pcpu_list;
+		struct hlist_head *list;
+
+		pcpu_list = tp_event->perf_events;
+		if (WARN_ON_ONCE(!pcpu_list))
+			return -EINVAL;
 
-	if (!(flags & PERF_EF_START))
-		p_event->hw.state = PERF_HES_STOPPED;
+		if (!(flags & PERF_EF_START))
+			p_event->hw.state = PERF_HES_STOPPED;
 
-	list = this_cpu_ptr(pcpu_list);
-	hlist_add_head_rcu(&p_event->hlist_entry, list);
+		list = this_cpu_ptr(pcpu_list);
+		hlist_add_head_rcu(&p_event->hlist_entry, list);
+	}
 
-	return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
+	return 0;
 }
 
 void perf_trace_del(struct perf_event *p_event, int flags)
 {
 	struct trace_event_call *tp_event = p_event->tp_event;
-	hlist_del_rcu(&p_event->hlist_entry);
-	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
+
+	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
+		hlist_del_rcu(&p_event->hlist_entry);
 }
 
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
@@ -306,15 +310,19 @@ static void
 perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 			  struct ftrace_ops *ops, struct pt_regs *pt_regs)
 {
+	struct hlist_head head = HLIST_HEAD_INIT;
 	struct ftrace_entry *entry;
-	struct hlist_head *head;
+	struct perf_event *event;
 	struct pt_regs regs;
 	int rctx;
 
-	head = this_cpu_ptr(event_function.perf_events);
-	if (hlist_empty(head))
+	event = container_of(ops, struct perf_event, ftrace_ops);
+
+	if (!event->ftrace_ops.private)
 		return;
 
+	hlist_add_head(&event->hlist_entry, &head);
+
 #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
 		    sizeof(u64)) - sizeof(u32))
 
@@ -330,17 +338,21 @@ perf_ftrace_function_call(unsigned long
 	entry->ip = ip;
 	entry->parent_ip = parent_ip;
 	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
-			      1, &regs, head, NULL);
+			      1, &regs, &head, NULL);
 
 #undef ENTRY_SIZE
+
+	hlist_del_init(&event->hlist_entry);
 }
 
 static int perf_ftrace_function_register(struct perf_event *event)
 {
 	struct ftrace_ops *ops = &event->ftrace_ops;
 
-	ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
-	ops->func = perf_ftrace_function_call;
+	ops->flags   = FTRACE_OPS_FL_RCU;
+	ops->func    = perf_ftrace_function_call;
+	ops->private = NULL;
+
 	return register_ftrace_function(ops);
 }
 
@@ -352,19 +364,11 @@ static int perf_ftrace_function_unregist
 	return ret;
 }
 
-static void perf_ftrace_function_enable(struct perf_event *event)
-{
-	ftrace_function_local_enable(&event->ftrace_ops);
-}
-
-static void perf_ftrace_function_disable(struct perf_event *event)
-{
-	ftrace_function_local_disable(&event->ftrace_ops);
-}
-
 int perf_ftrace_event_register(struct trace_event_call *call,
 			       enum trace_reg type, void *data)
 {
+	struct perf_event *event = data;
+
 	switch (type) {
 	case TRACE_REG_REGISTER:
 	case TRACE_REG_UNREGISTER:
@@ -377,11 +381,11 @@ int perf_ftrace_event_register(struct tr
 	case TRACE_REG_PERF_CLOSE:
 		return perf_ftrace_function_unregister(data);
 	case TRACE_REG_PERF_ADD:
-		perf_ftrace_function_enable(data);
-		return 0;
+		event->ftrace_ops.private = (void *)1UL;
+		return 1;
 	case TRACE_REG_PERF_DEL:
-		perf_ftrace_function_disable(data);
-		return 0;
+		event->ftrace_ops.private = (void *)0UL;
+		return 1;
 	}
 
 	return -EINVAL;


* Re: [PATCH] perf/ftrace : Fix repetitious traces when specify a target  task
  2017-10-10 15:21     ` Peter Zijlstra
@ 2017-10-10 16:35       ` Steven Rostedt
  0 siblings, 0 replies; 7+ messages in thread
From: Steven Rostedt @ 2017-10-10 16:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Cheng Jian, mingo, acme, alexander.shishkin, linux-kernel,
	xiexiuqi, huawei.libin, Jiri Olsa


I believe that Jiri understands this code better than I do.

On Tue, 10 Oct 2017 17:21:46 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, Oct 10, 2017 at 03:04:48PM +0200, Peter Zijlstra wrote:
> > On Tue, Oct 10, 2017 at 01:33:21PM +0200, Peter Zijlstra wrote:  
> > > But now you've got me looking at 75e8387685f6, which also looks
> > > completely insane.  
> > 
> > The reason I insta stumbled on that patch is that it only addresses the
> > ftrace situation and doesn't mention the other _5_ places that use this
> > interface. It doesn't explain why those don't have the problem and if
> > not, why their solution doesn't work for ftrace.
> > 
> > So all (well syscall and regular tracepoints, didn't check the others)
> > avoid that problem by simply not registering multiple times at the
> > tracepoint. Tracepoints use tp_event->perf_refcount and the syscall
> > things use sys_perf_refcount_{enter,exit} for that.
> > 
> > Doing the same for function trace looks a little something like the
> > below (after reverting 75e8387685f6)
> > 
> > Except the below doesn't compile because of
> > kernel/trace/trace_event_filter.c, which is where I lost the plot.  
> 
> OK, so that filter stuff was the entire reason for this trainwreck :/
> 
> Using ftrace_ops filters allows ftrace to patch less functions etc.. So
> that requires an ftrace_ops per event. Still that then instantly
> suggests we fix the whole hlist situation instead of making it worse.
> 
> See below; I now have 3 patches: revert, the below, kill
> FTRACE_OPS_FL_PER_CPU.
> 
> How's this?
> 
> ---
>  kernel/trace/trace_event_perf.c |   68 +++++++++++++++++++++-------------------
>  1 file changed, 36 insertions(+), 32 deletions(-)
> 
> --- a/kernel/trace/trace_event_perf.c
> +++ b/kernel/trace/trace_event_perf.c
> @@ -240,27 +240,31 @@ void perf_trace_destroy(struct perf_even
>  int perf_trace_add(struct perf_event *p_event, int flags)
>  {
>  	struct trace_event_call *tp_event = p_event->tp_event;
> -	struct hlist_head __percpu *pcpu_list;
> -	struct hlist_head *list;
>  
> -	pcpu_list = tp_event->perf_events;
> -	if (WARN_ON_ONCE(!pcpu_list))
> -		return -EINVAL;
> +	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {

We probably still want to check for reg returning less than zero.

And there really should be a comment here. If I understand this
correctly, ftrace reg functions return 1 and trace_event reg functions
return zero (entering into this path). But the trace_event register
functions can fail, and will return negative values.
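Something like this, perhaps (a rough sketch of the check, assuming
the convention above: negative means error, 1 means the ftrace side
manages its own list, 0 means fall through to the per-cpu hlist):

	ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
	if (ret < 0)
		return ret;
	if (!ret) {
		/* generic trace_event: add to the per-cpu hlist as before */
		...
	}
	return 0;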


> +		struct hlist_head __percpu *pcpu_list;
> +		struct hlist_head *list;
> +
> +		pcpu_list = tp_event->perf_events;
> +		if (WARN_ON_ONCE(!pcpu_list))
> +			return -EINVAL;
>  
> -	if (!(flags & PERF_EF_START))
> -		p_event->hw.state = PERF_HES_STOPPED;
> +		if (!(flags & PERF_EF_START))
> +			p_event->hw.state = PERF_HES_STOPPED;
>  
> -	list = this_cpu_ptr(pcpu_list);
> -	hlist_add_head_rcu(&p_event->hlist_entry, list);
> +		list = this_cpu_ptr(pcpu_list);
> +		hlist_add_head_rcu(&p_event->hlist_entry, list);
> +	}
>  
> -	return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
> +	return 0;
>  }
>  
>  void perf_trace_del(struct perf_event *p_event, int flags)
>  {
>  	struct trace_event_call *tp_event = p_event->tp_event;
> -	hlist_del_rcu(&p_event->hlist_entry);
> -	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
> +
> +	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))

This shouldn't ever fail. I believe all unregister (DEL) functions
return 0 (for trace_events). But probably comment anyway.

-- Steve

> +		hlist_del_rcu(&p_event->hlist_entry);
>  }
>  
>  void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
> @@ -306,15 +310,19 @@ static void
>  perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
>  			  struct ftrace_ops *ops, struct pt_regs *pt_regs)
>  {
> +	struct hlist_head head = HLIST_HEAD_INIT;
>  	struct ftrace_entry *entry;
> -	struct hlist_head *head;
> +	struct perf_event *event;
>  	struct pt_regs regs;
>  	int rctx;
>  
> -	head = this_cpu_ptr(event_function.perf_events);
> -	if (hlist_empty(head))
> +	event = container_of(ops, struct perf_event, ftrace_ops);
> +
> +	if (!event->ftrace_ops.private)
>  		return;
>  
> +	hlist_add_head(&event->hlist_entry, &head);
> +
>  #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
>  		    sizeof(u64)) - sizeof(u32))
>  
> @@ -330,17 +338,21 @@ perf_ftrace_function_call(unsigned long
>  	entry->ip = ip;
>  	entry->parent_ip = parent_ip;
>  	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
> -			      1, &regs, head, NULL);
> +			      1, &regs, &head, NULL);
>  
>  #undef ENTRY_SIZE
> +
> +	hlist_del_init(&event->hlist_entry);
>  }
>  
>  static int perf_ftrace_function_register(struct perf_event *event)
>  {
>  	struct ftrace_ops *ops = &event->ftrace_ops;
>  
> -	ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
> -	ops->func = perf_ftrace_function_call;
> +	ops->flags   = FTRACE_OPS_FL_RCU;
> +	ops->func    = perf_ftrace_function_call;
> +	ops->private = NULL;
> +
>  	return register_ftrace_function(ops);
>  }
>  
> @@ -352,19 +364,11 @@ static int perf_ftrace_function_unregist
>  	return ret;
>  }
>  
> -static void perf_ftrace_function_enable(struct perf_event *event)
> -{
> -	ftrace_function_local_enable(&event->ftrace_ops);
> -}
> -
> -static void perf_ftrace_function_disable(struct perf_event *event)
> -{
> -	ftrace_function_local_disable(&event->ftrace_ops);
> -}
> -
>  int perf_ftrace_event_register(struct trace_event_call *call,
>  			       enum trace_reg type, void *data)
>  {
> +	struct perf_event *event = data;
> +
>  	switch (type) {
>  	case TRACE_REG_REGISTER:
>  	case TRACE_REG_UNREGISTER:
> @@ -377,11 +381,11 @@ int perf_ftrace_event_register(struct tr
>  	case TRACE_REG_PERF_CLOSE:
>  		return perf_ftrace_function_unregister(data);
>  	case TRACE_REG_PERF_ADD:
> -		perf_ftrace_function_enable(data);
> -		return 0;
> +		event->ftrace_ops.private = (void *)1UL;
> +		return 1;
>  	case TRACE_REG_PERF_DEL:
> -		perf_ftrace_function_disable(data);
> -		return 0;
> +		event->ftrace_ops.private = (void *)0UL;
> +		return 1;
>  	}
>  
>  	return -EINVAL;
