* [PATCH 0/4] perf/ftrace: Sanitize perf function-trace interaction
@ 2017-10-11  7:45 Peter Zijlstra
  2017-10-11  7:45 ` [PATCH 1/4] perf/ftrace: Revert ("perf/ftrace: Fix double traces of perf on ftrace:function") Peter Zijlstra
                   ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Peter Zijlstra @ 2017-10-11  7:45 UTC
  To: rostedt, mingo; +Cc: linux-kernel, jolsa, zhouchengming1, peterz

less is more..

---
 include/linux/ftrace.h          | 83 +++++++---------------------------------
 include/linux/perf_event.h      |  2 +-
 include/linux/trace_events.h    |  9 ++++-
 kernel/events/core.c            | 13 ++-----
 kernel/trace/ftrace.c           | 55 +++------------------------
 kernel/trace/trace_event_perf.c | 84 ++++++++++++++++++++++++-----------------
 kernel/trace/trace_kprobe.c     |  4 +-
 kernel/trace/trace_syscalls.c   |  4 +-
 kernel/trace/trace_uprobe.c     |  2 +-
 9 files changed, 87 insertions(+), 169 deletions(-)


* [PATCH 1/4] perf/ftrace: Revert ("perf/ftrace: Fix double traces of perf on ftrace:function")
  2017-10-11  7:45 [PATCH 0/4] perf/ftrace: Sanitize perf function-trace interaction Peter Zijlstra
@ 2017-10-11  7:45 ` Peter Zijlstra
  2017-10-11 12:57   ` Steven Rostedt
  2017-10-11  7:45 ` [PATCH 2/4] perf/ftrace: Fix function trace events Peter Zijlstra
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2017-10-11  7:45 UTC
  To: rostedt, mingo; +Cc: linux-kernel, jolsa, zhouchengming1, peterz

[-- Attachment #1: peterz-perf-revert-75e8387685f6 --]
[-- Type: text/plain, Size: 5547 bytes --]

Revert commit:

  75e8387685f6 ("perf/ftrace: Fix double traces of perf on ftrace:function")

The reason I instantly stumbled over that patch is that it only addresses the
ftrace situation and doesn't mention the other _5_ places that use this
interface. It doesn't explain why those don't have the problem and, if they do,
why their solution doesn't work for ftrace.

It doesn't, and the reverted patch just puts more duct tape on.

Cc: Zhou Chengming <zhouchengming1@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/perf_event.h      |    2 +-
 include/linux/trace_events.h    |    4 ++--
 kernel/events/core.c            |   13 ++++---------
 kernel/trace/trace_event_perf.c |    4 +---
 kernel/trace/trace_kprobe.c     |    4 ++--
 kernel/trace/trace_syscalls.c   |    4 ++--
 kernel/trace/trace_uprobe.c     |    2 +-
 7 files changed, 13 insertions(+), 20 deletions(-)

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1169,7 +1169,7 @@ extern void perf_event_init(void);
 extern void perf_tp_event(u16 event_type, u64 count, void *record,
 			  int entry_size, struct pt_regs *regs,
 			  struct hlist_head *head, int rctx,
-			  struct task_struct *task, struct perf_event *event);
+			  struct task_struct *task);
 extern void perf_bp_event(struct perf_event *event, void *data);
 
 #ifndef perf_misc_flags
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -507,9 +507,9 @@ void perf_trace_run_bpf_submit(void *raw
 static inline void
 perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
 		       u64 count, struct pt_regs *regs, void *head,
-		       struct task_struct *task, struct perf_event *event)
+		       struct task_struct *task)
 {
-	perf_tp_event(type, count, raw_data, size, regs, head, rctx, task, event);
+	perf_tp_event(type, count, raw_data, size, regs, head, rctx, task);
 }
 #endif
 
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7873,15 +7873,16 @@ void perf_trace_run_bpf_submit(void *raw
 		}
 	}
 	perf_tp_event(call->event.type, count, raw_data, size, regs, head,
-		      rctx, task, NULL);
+		      rctx, task);
 }
 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
 
 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 		   struct pt_regs *regs, struct hlist_head *head, int rctx,
-		   struct task_struct *task, struct perf_event *event)
+		   struct task_struct *task)
 {
 	struct perf_sample_data data;
+	struct perf_event *event;
 
 	struct perf_raw_record raw = {
 		.frag = {
@@ -7895,15 +7896,9 @@ void perf_tp_event(u16 event_type, u64 c
 
 	perf_trace_buf_update(record, event_type);
 
-	/* Use the given event instead of the hlist */
-	if (event) {
+	hlist_for_each_entry_rcu(event, head, hlist_entry) {
 		if (perf_tp_event_match(event, &data, regs))
 			perf_swevent_event(event, count, &data, regs);
-	} else {
-		hlist_for_each_entry_rcu(event, head, hlist_entry) {
-			if (perf_tp_event_match(event, &data, regs))
-				perf_swevent_event(event, count, &data, regs);
-		}
 	}
 
 	/*
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -306,7 +306,6 @@ static void
 perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 			  struct ftrace_ops *ops, struct pt_regs *pt_regs)
 {
-	struct perf_event *event;
 	struct ftrace_entry *entry;
 	struct hlist_head *head;
 	struct pt_regs regs;
@@ -330,9 +329,8 @@ perf_ftrace_function_call(unsigned long
 
 	entry->ip = ip;
 	entry->parent_ip = parent_ip;
-	event = container_of(ops, struct perf_event, ftrace_ops);
 	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
-			      1, &regs, head, NULL, event);
+			      1, &regs, head, NULL);
 
 #undef ENTRY_SIZE
 }
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1200,7 +1200,7 @@ kprobe_perf_func(struct trace_kprobe *tk
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
-			      head, NULL, NULL);
+			      head, NULL);
 }
 NOKPROBE_SYMBOL(kprobe_perf_func);
 
@@ -1236,7 +1236,7 @@ kretprobe_perf_func(struct trace_kprobe
 	entry->ret_ip = (unsigned long)ri->ret_addr;
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
-			      head, NULL, NULL);
+			      head, NULL);
 }
 NOKPROBE_SYMBOL(kretprobe_perf_func);
 #endif	/* CONFIG_PERF_EVENTS */
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -622,7 +622,7 @@ static void perf_syscall_enter(void *ign
 
 	perf_trace_buf_submit(rec, size, rctx,
 			      sys_data->enter_event->event.type, 1, regs,
-			      head, NULL, NULL);
+			      head, NULL);
 }
 
 static int perf_sysenter_enable(struct trace_event_call *call)
@@ -716,7 +716,7 @@ static void perf_syscall_exit(void *igno
 	}
 
 	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
-			      1, regs, head, NULL, NULL);
+			      1, regs, head, NULL);
 }
 
 static int perf_sysexit_enable(struct trace_event_call *call)
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1156,7 +1156,7 @@ static void __uprobe_perf_func(struct tr
 	}
 
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
-			      head, NULL, NULL);
+			      head, NULL);
  out:
 	preempt_enable();
 }

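For reference, all the call sites touched above share one shape after the
revert; schematically (a condensed, illustrative sketch of the common pattern,
not verbatim from any single file):

	entry = perf_trace_buf_alloc(size, &pt_regs, &rctx);
	if (!entry)
		return;

	/* fill in the trace-type specific record */
	entry->ip = ip;

	/* submit to perf, which walks the per-cpu hlist of attached events */
	head = this_cpu_ptr(tp_event->perf_events);
	perf_trace_buf_submit(entry, size, rctx, call->event.type,
			      1, regs, head, NULL /* task */);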

* [PATCH 2/4] perf/ftrace: Fix function trace events
  2017-10-11  7:45 [PATCH 0/4] perf/ftrace: Sanitize perf function-trace interaction Peter Zijlstra
  2017-10-11  7:45 ` [PATCH 1/4] perf/ftrace: Revert ("perf/ftrace: Fix double traces of perf on ftrace:function") Peter Zijlstra
@ 2017-10-11  7:45 ` Peter Zijlstra
  2017-10-11  9:02   ` zhouchengming
                     ` (2 more replies)
  2017-10-11  7:45 ` [PATCH 3/4] perf/ftrace: Small cleanup Peter Zijlstra
  2017-10-11  7:45 ` [PATCH 4/4] ftrace: Kill FTRACE_OPS_FL_PER_CPU Peter Zijlstra
  3 siblings, 3 replies; 13+ messages in thread
From: Peter Zijlstra @ 2017-10-11  7:45 UTC
  To: rostedt, mingo; +Cc: linux-kernel, jolsa, zhouchengming1, peterz

[-- Attachment #1: peterz-perf-fix-ftrace.patch --]
[-- Type: text/plain, Size: 5620 bytes --]

The function-trace <-> perf interface is a tad messed up. Whereas all
the other trace <-> perf interfaces use a single trace hook
registration and a per-cpu RCU-based hlist to iterate the events,
function-trace actually needs multiple hook registrations in order to
minimize function entry patching when filters are present.

The end result is that we iterate events both on the trace hook and on
the hlist, which results in reporting events multiple times.

Since function-trace cannot use the regular scheme, fix it the other
way around: use singleton hlists.
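To make the singleton trick concrete, a minimal sketch (illustrative only;
struct my_event and count_singleton() are invented names, the real code is in
perf_ftrace_function_call() below). An on-stack hlist_head is pointed at a
single, unlinked node, and a plain forward walk then visits exactly one entry:

#include <linux/list.h>

struct my_event {
	struct hlist_node	node;	/* INIT_HLIST_NODE(): next == pprev == NULL */
};

static int count_singleton(struct my_event *ev)
{
	struct hlist_head head;
	struct my_event *pos;
	int n = 0;

	/*
	 * Only head.first is set up; ev->node.pprev stays NULL, so this
	 * list supports forward iteration only -- no hlist_del() etc.
	 */
	head.first = &ev->node;

	hlist_for_each_entry(pos, &head, node)
		n++;	/* runs exactly once, with pos == ev */

	return n;	/* always 1 */
}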

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/trace_events.h    |    5 ++
 kernel/trace/trace_event_perf.c |   82 ++++++++++++++++++++++++----------------
 2 files changed, 55 insertions(+), 32 deletions(-)

--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -173,6 +173,11 @@ enum trace_reg {
 	TRACE_REG_PERF_UNREGISTER,
 	TRACE_REG_PERF_OPEN,
 	TRACE_REG_PERF_CLOSE,
+	/*
+	 * These (ADD/DEL) use a 'boolean' return value, where 1 (true) means a
+	 * custom action was taken and the default action is not to be
+	 * performed.
+	 */
 	TRACE_REG_PERF_ADD,
 	TRACE_REG_PERF_DEL,
 #endif
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -240,27 +240,41 @@ void perf_trace_destroy(struct perf_even
 int perf_trace_add(struct perf_event *p_event, int flags)
 {
 	struct trace_event_call *tp_event = p_event->tp_event;
-	struct hlist_head __percpu *pcpu_list;
-	struct hlist_head *list;
 
-	pcpu_list = tp_event->perf_events;
-	if (WARN_ON_ONCE(!pcpu_list))
-		return -EINVAL;
+	/*
+	 * If TRACE_REG_PERF_ADD returns false; no custom action was performed
+	 * and we need to take the default action of enqueueing our event on
+	 * the right per-cpu hlist.
+	 */
+	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
+		struct hlist_head __percpu *pcpu_list;
+		struct hlist_head *list;
+
+		pcpu_list = tp_event->perf_events;
+		if (WARN_ON_ONCE(!pcpu_list))
+			return -EINVAL;
 
-	if (!(flags & PERF_EF_START))
-		p_event->hw.state = PERF_HES_STOPPED;
+		if (!(flags & PERF_EF_START))
+			p_event->hw.state = PERF_HES_STOPPED;
 
-	list = this_cpu_ptr(pcpu_list);
-	hlist_add_head_rcu(&p_event->hlist_entry, list);
+		list = this_cpu_ptr(pcpu_list);
+		hlist_add_head_rcu(&p_event->hlist_entry, list);
+	}
 
-	return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
+	return 0;
 }
 
 void perf_trace_del(struct perf_event *p_event, int flags)
 {
 	struct trace_event_call *tp_event = p_event->tp_event;
-	hlist_del_rcu(&p_event->hlist_entry);
-	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
+
+	/*
+	 * If TRACE_REG_PERF_DEL returns false; no custom action was performed
+	 * and we need to take the default action of dequeueing our event from
+	 * the right per-cpu hlist.
+	 */
+	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
+		hlist_del_rcu(&p_event->hlist_entry);
 }
 
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
@@ -307,14 +321,24 @@ perf_ftrace_function_call(unsigned long
 			  struct ftrace_ops *ops, struct pt_regs *pt_regs)
 {
 	struct ftrace_entry *entry;
-	struct hlist_head *head;
+	struct perf_event *event;
+	struct hlist_head head;
 	struct pt_regs regs;
 	int rctx;
 
-	head = this_cpu_ptr(event_function.perf_events);
-	if (hlist_empty(head))
+	if ((unsigned long)ops->private != smp_processor_id())
 		return;
 
+	event = container_of(ops, struct perf_event, ftrace_ops);
+
+	/*
+	 * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
+	 * the perf code does is hlist_for_each_entry_rcu(), so we can
+	 * get away with simply setting the @head.first pointer in order
+	 * to create a singular list.
+	 */
+	head.first = &event->hlist_entry;
+
 #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
 		    sizeof(u64)) - sizeof(u32))
 
@@ -330,7 +354,7 @@ perf_ftrace_function_call(unsigned long
 	entry->ip = ip;
 	entry->parent_ip = parent_ip;
 	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
-			      1, &regs, head, NULL);
+			      1, &regs, &head, NULL);
 
 #undef ENTRY_SIZE
 }
@@ -339,8 +363,10 @@ static int perf_ftrace_function_register
 {
 	struct ftrace_ops *ops = &event->ftrace_ops;
 
-	ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
-	ops->func = perf_ftrace_function_call;
+	ops->flags   |= FTRACE_OPS_FL_RCU;
+	ops->func    = perf_ftrace_function_call;
+	ops->private = (void *)(unsigned long)nr_cpu_ids;
+
 	return register_ftrace_function(ops);
 }
 
@@ -352,19 +378,11 @@ static int perf_ftrace_function_unregist
 	return ret;
 }
 
-static void perf_ftrace_function_enable(struct perf_event *event)
-{
-	ftrace_function_local_enable(&event->ftrace_ops);
-}
-
-static void perf_ftrace_function_disable(struct perf_event *event)
-{
-	ftrace_function_local_disable(&event->ftrace_ops);
-}
-
 int perf_ftrace_event_register(struct trace_event_call *call,
 			       enum trace_reg type, void *data)
 {
+	struct perf_event *event = data;
+
 	switch (type) {
 	case TRACE_REG_REGISTER:
 	case TRACE_REG_UNREGISTER:
@@ -377,11 +395,11 @@ int perf_ftrace_event_register(struct tr
 	case TRACE_REG_PERF_CLOSE:
 		return perf_ftrace_function_unregister(data);
 	case TRACE_REG_PERF_ADD:
-		perf_ftrace_function_enable(data);
-		return 0;
+		event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
+		return 1;
 	case TRACE_REG_PERF_DEL:
-		perf_ftrace_function_disable(data);
-		return 0;
+		event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
+		return 1;
 	}
 
 	return -EINVAL;

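The CPU gating via ops->private used above follows a simple arm/park pattern;
a condensed, hypothetical sketch (struct my_ops and the my_*() names are
invented for illustration):

#include <linux/smp.h>
#include <linux/cpumask.h>

struct my_ops {
	void	*private;	/* armed CPU id, or nr_cpu_ids when parked */
};

/* pmu ->add() runs on the target CPU: arm the callback for this CPU */
static void my_add(struct my_ops *ops)
{
	ops->private = (void *)(unsigned long)smp_processor_id();
}

/* pmu ->del(): park the callback on an impossible CPU id */
static void my_del(struct my_ops *ops)
{
	/* smp_processor_id() is always < nr_cpu_ids, so this never matches */
	ops->private = (void *)(unsigned long)nr_cpu_ids;
}

/* the traced-function callback: bail unless running on the armed CPU */
static void my_call(struct my_ops *ops)
{
	if ((unsigned long)ops->private != smp_processor_id())
		return;
	/* ... emit the event ... */
}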

* [PATCH 3/4] perf/ftrace: Small cleanup
  2017-10-11  7:45 [PATCH 0/4] perf/ftrace: Sanitize perf function-trace interaction Peter Zijlstra
  2017-10-11  7:45 ` [PATCH 1/4] perf/ftrace: Revert ("perf/ftrace: Fix double traces of perf on ftrace:function") Peter Zijlstra
  2017-10-11  7:45 ` [PATCH 2/4] perf/ftrace: Fix function trace events Peter Zijlstra
@ 2017-10-11  7:45 ` Peter Zijlstra
  2017-10-11  7:45 ` [PATCH 4/4] ftrace: Kill FTRACE_OPS_FL_PER_CPU Peter Zijlstra
  3 siblings, 0 replies; 13+ messages in thread
From: Peter Zijlstra @ 2017-10-11  7:45 UTC
  To: rostedt, mingo; +Cc: linux-kernel, jolsa, zhouchengming1, peterz

[-- Attachment #1: peterz-perf-trace-cleanup.patch --]
[-- Type: text/plain, Size: 660 bytes --]

ops->flags _should_ be 0 at this point, so setting the flag using a
bitwise OR is a bit daft.
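In other words, on a freshly zeroed ops the two forms are equivalent; the
plain assignment merely documents that expectation (hypothetical fragment):

	struct ftrace_ops ops = { };	/* ops.flags == 0 here */

	ops.flags |= FTRACE_OPS_FL_RCU;	/* suggests prior flags might matter */
	ops.flags  = FTRACE_OPS_FL_RCU;	/* states that they don't */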

Requested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/trace/trace_event_perf.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -362,7 +362,7 @@ static int perf_ftrace_function_register
 {
 	struct ftrace_ops *ops = &event->ftrace_ops;
 
-	ops->flags   |= FTRACE_OPS_FL_RCU;
+	ops->flags   = FTRACE_OPS_FL_RCU;
 	ops->func    = perf_ftrace_function_call;
 	ops->private = (void *)(unsigned long)nr_cpu_ids;
 


* [PATCH 4/4] ftrace: Kill FTRACE_OPS_FL_PER_CPU
  2017-10-11  7:45 [PATCH 0/4] perf/ftrace: Sanitize perf function-trace interaction Peter Zijlstra
                   ` (2 preceding siblings ...)
  2017-10-11  7:45 ` [PATCH 3/4] perf/ftrace: Small cleanup Peter Zijlstra
@ 2017-10-11  7:45 ` Peter Zijlstra
  3 siblings, 0 replies; 13+ messages in thread
From: Peter Zijlstra @ 2017-10-11  7:45 UTC
  To: rostedt, mingo; +Cc: linux-kernel, jolsa, zhouchengming1, peterz

[-- Attachment #1: peterz-trace-percpu.patch --]
[-- Type: text/plain, Size: 8426 bytes --]

The one and only user of FTRACE_OPS_FL_PER_CPU is gone; remove the
lot.
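For context, the removed mechanism boiled down to a per-cpu disable counter
consulted on every call; condensed from the code deleted below (a sketch of
the pattern, not a drop-in):

	int __percpu *disabled = alloc_percpu(int);	/* one counter per CPU */
	int cpu;

	/* registration: start out disabled everywhere */
	for_each_possible_cpu(cpu)
		*per_cpu_ptr(disabled, cpu) = 1;

	/* TRACE_REG_PERF_ADD/DEL on the local CPU, preemption disabled */
	(*this_cpu_ptr(disabled))--;	/* enable  */
	(*this_cpu_ptr(disabled))++;	/* disable */

	/* callback path: only call out when this CPU is enabled */
	if (!*this_cpu_ptr(disabled))
		op->func(ip, parent_ip, op, regs);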

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/ftrace.h |   83 ++++++++-----------------------------------------
 kernel/trace/ftrace.c  |   55 +++-----------------------------
 2 files changed, 20 insertions(+), 118 deletions(-)

--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -78,10 +78,6 @@ ftrace_func_t ftrace_ops_get_func(struct
  * ENABLED - set/unset when ftrace_ops is registered/unregistered
  * DYNAMIC - set when ftrace_ops is registered to denote dynamically
  *           allocated ftrace_ops which need special care
- * PER_CPU - set manualy by ftrace_ops user to denote the ftrace_ops
- *           could be controlled by following calls:
- *             ftrace_function_local_enable
- *             ftrace_function_local_disable
  * SAVE_REGS - The ftrace_ops wants regs saved at each function called
  *            and passed to the callback. If this flag is set, but the
  *            architecture does not support passing regs
@@ -125,21 +121,20 @@ ftrace_func_t ftrace_ops_get_func(struct
 enum {
 	FTRACE_OPS_FL_ENABLED			= 1 << 0,
 	FTRACE_OPS_FL_DYNAMIC			= 1 << 1,
-	FTRACE_OPS_FL_PER_CPU			= 1 << 2,
-	FTRACE_OPS_FL_SAVE_REGS			= 1 << 3,
-	FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED	= 1 << 4,
-	FTRACE_OPS_FL_RECURSION_SAFE		= 1 << 5,
-	FTRACE_OPS_FL_STUB			= 1 << 6,
-	FTRACE_OPS_FL_INITIALIZED		= 1 << 7,
-	FTRACE_OPS_FL_DELETED			= 1 << 8,
-	FTRACE_OPS_FL_ADDING			= 1 << 9,
-	FTRACE_OPS_FL_REMOVING			= 1 << 10,
-	FTRACE_OPS_FL_MODIFYING			= 1 << 11,
-	FTRACE_OPS_FL_ALLOC_TRAMP		= 1 << 12,
-	FTRACE_OPS_FL_IPMODIFY			= 1 << 13,
-	FTRACE_OPS_FL_PID			= 1 << 14,
-	FTRACE_OPS_FL_RCU			= 1 << 15,
-	FTRACE_OPS_FL_TRACE_ARRAY		= 1 << 16,
+	FTRACE_OPS_FL_SAVE_REGS			= 1 << 2,
+	FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED	= 1 << 3,
+	FTRACE_OPS_FL_RECURSION_SAFE		= 1 << 4,
+	FTRACE_OPS_FL_STUB			= 1 << 5,
+	FTRACE_OPS_FL_INITIALIZED		= 1 << 6,
+	FTRACE_OPS_FL_DELETED			= 1 << 7,
+	FTRACE_OPS_FL_ADDING			= 1 << 8,
+	FTRACE_OPS_FL_REMOVING			= 1 << 9,
+	FTRACE_OPS_FL_MODIFYING			= 1 << 10,
+	FTRACE_OPS_FL_ALLOC_TRAMP		= 1 << 11,
+	FTRACE_OPS_FL_IPMODIFY			= 1 << 12,
+	FTRACE_OPS_FL_PID			= 1 << 13,
+	FTRACE_OPS_FL_RCU			= 1 << 14,
+	FTRACE_OPS_FL_TRACE_ARRAY		= 1 << 15,
 };
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -172,7 +167,6 @@ struct ftrace_ops {
 	unsigned long			flags;
 	void				*private;
 	ftrace_func_t			saved_func;
-	int __percpu			*disabled;
 #ifdef CONFIG_DYNAMIC_FTRACE
 	struct ftrace_ops_hash		local_hash;
 	struct ftrace_ops_hash		*func_hash;
@@ -204,55 +198,6 @@ int register_ftrace_function(struct ftra
 int unregister_ftrace_function(struct ftrace_ops *ops);
 void clear_ftrace_function(void);
 
-/**
- * ftrace_function_local_enable - enable ftrace_ops on current cpu
- *
- * This function enables tracing on current cpu by decreasing
- * the per cpu control variable.
- * It must be called with preemption disabled and only on ftrace_ops
- * registered with FTRACE_OPS_FL_PER_CPU. If called without preemption
- * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
- */
-static inline void ftrace_function_local_enable(struct ftrace_ops *ops)
-{
-	if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
-		return;
-
-	(*this_cpu_ptr(ops->disabled))--;
-}
-
-/**
- * ftrace_function_local_disable - disable ftrace_ops on current cpu
- *
- * This function disables tracing on current cpu by increasing
- * the per cpu control variable.
- * It must be called with preemption disabled and only on ftrace_ops
- * registered with FTRACE_OPS_FL_PER_CPU. If called without preemption
- * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
- */
-static inline void ftrace_function_local_disable(struct ftrace_ops *ops)
-{
-	if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
-		return;
-
-	(*this_cpu_ptr(ops->disabled))++;
-}
-
-/**
- * ftrace_function_local_disabled - returns ftrace_ops disabled value
- *                                  on current cpu
- *
- * This function returns value of ftrace_ops::disabled on current cpu.
- * It must be called with preemption disabled and only on ftrace_ops
- * registered with FTRACE_OPS_FL_PER_CPU. If called without preemption
- * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
- */
-static inline int ftrace_function_local_disabled(struct ftrace_ops *ops)
-{
-	WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU));
-	return *this_cpu_ptr(ops->disabled);
-}
-
 extern void ftrace_stub(unsigned long a0, unsigned long a1,
 			struct ftrace_ops *op, struct pt_regs *regs);
 
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -203,30 +203,6 @@ void clear_ftrace_function(void)
 	ftrace_trace_function = ftrace_stub;
 }
 
-static void per_cpu_ops_disable_all(struct ftrace_ops *ops)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		*per_cpu_ptr(ops->disabled, cpu) = 1;
-}
-
-static int per_cpu_ops_alloc(struct ftrace_ops *ops)
-{
-	int __percpu *disabled;
-
-	if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
-		return -EINVAL;
-
-	disabled = alloc_percpu(int);
-	if (!disabled)
-		return -ENOMEM;
-
-	ops->disabled = disabled;
-	per_cpu_ops_disable_all(ops);
-	return 0;
-}
-
 static void ftrace_sync(struct work_struct *work)
 {
 	/*
@@ -262,8 +238,8 @@ static ftrace_func_t ftrace_ops_get_list
 	 * If this is a dynamic, RCU, or per CPU ops, or we force list func,
 	 * then it needs to call the list anyway.
 	 */
-	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU |
-			  FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC)
+	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_RCU) ||
+	    FTRACE_FORCE_LIST_FUNC)
 		return ftrace_ops_list_func;
 
 	return ftrace_ops_get_func(ops);
@@ -422,11 +398,6 @@ static int __register_ftrace_function(st
 	if (!core_kernel_data((unsigned long)ops))
 		ops->flags |= FTRACE_OPS_FL_DYNAMIC;
 
-	if (ops->flags & FTRACE_OPS_FL_PER_CPU) {
-		if (per_cpu_ops_alloc(ops))
-			return -ENOMEM;
-	}
-
 	add_ftrace_ops(&ftrace_ops_list, ops);
 
 	/* Always save the function, and reset at unregistering */
@@ -2727,11 +2698,6 @@ void __weak arch_ftrace_trampoline_free(
 {
 }
 
-static void per_cpu_ops_free(struct ftrace_ops *ops)
-{
-	free_percpu(ops->disabled);
-}
-
 static void ftrace_startup_enable(int command)
 {
 	if (saved_ftrace_func != ftrace_trace_function) {
@@ -2833,7 +2799,7 @@ static int ftrace_shutdown(struct ftrace
 		 * not currently active, we can just free them
 		 * without synchronizing all CPUs.
 		 */
-		if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU))
+		if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
 			goto free_ops;
 
 		return 0;
@@ -2880,7 +2846,7 @@ static int ftrace_shutdown(struct ftrace
 	 * The same goes for freeing the per_cpu data of the per_cpu
 	 * ops.
 	 */
-	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
+	if (ops->flags & FTRACE_OPS_FL_DYNAMIC) {
 		/*
 		 * We need to do a hard force of sched synchronization.
 		 * This is because we use preempt_disable() to do RCU, but
@@ -2903,9 +2869,6 @@ static int ftrace_shutdown(struct ftrace
 
  free_ops:
 		arch_ftrace_trampoline_free(ops);
-
-		if (ops->flags & FTRACE_OPS_FL_PER_CPU)
-			per_cpu_ops_free(ops);
 	}
 
 	return 0;
@@ -6066,10 +6029,7 @@ __ftrace_ops_list_func(unsigned long ip,
 		 * If any of the above fails then the op->func() is not executed.
 		 */
 		if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) &&
-		    (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
-		     !ftrace_function_local_disabled(op)) &&
 		    ftrace_ops_test(op, ip, regs)) {
-		    
 			if (FTRACE_WARN_ON(!op->func)) {
 				pr_warn("op=%p %pS\n", op, op);
 				goto out;
@@ -6127,10 +6087,7 @@ static void ftrace_ops_assist_func(unsig
 
 	preempt_disable_notrace();
 
-	if (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
-	    !ftrace_function_local_disabled(op)) {
-		op->func(ip, parent_ip, op, regs);
-	}
+	op->func(ip, parent_ip, op, regs);
 
 	preempt_enable_notrace();
 	trace_clear_recursion(bit);
@@ -6154,7 +6111,7 @@ ftrace_func_t ftrace_ops_get_func(struct
 	 * or does per cpu logic, then we need to call the assist handler.
 	 */
 	if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) ||
-	    ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU))
+	    ops->flags & FTRACE_OPS_FL_RCU)
 		return ftrace_ops_assist_func;
 
 	return ops->func;


* Re: [PATCH 2/4] perf/ftrace: Fix function trace events
  2017-10-11  7:45 ` [PATCH 2/4] perf/ftrace: Fix function trace events Peter Zijlstra
@ 2017-10-11  9:02   ` zhouchengming
  2017-10-11 16:40     ` Peter Zijlstra
  2017-10-11 11:59   ` Jiri Olsa
  2017-10-13  7:17   ` Peter Zijlstra
  2 siblings, 1 reply; 13+ messages in thread
From: zhouchengming @ 2017-10-11  9:02 UTC
  To: Peter Zijlstra; +Cc: rostedt, mingo, linux-kernel, jolsa

On 2017/10/11 15:45, Peter Zijlstra wrote:
> The function-trace <-> perf interface is a tad messed up. Whereas all
> the other trace <-> perf interfaces use a single trace hook
> registration and a per-cpu RCU-based hlist to iterate the events,
> function-trace actually needs multiple hook registrations in order to
> minimize function entry patching when filters are present.
>
> The end result is that we iterate events both on the trace hook and on
> the hlist, which results in reporting events multiple times.
>
> Since function-trace cannot use the regular scheme, fix it the other
> way around: use singleton hlists.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>   include/linux/trace_events.h    |    5 ++
>   kernel/trace/trace_event_perf.c |   82 ++++++++++++++++++++++++----------------
>   2 files changed, 55 insertions(+), 32 deletions(-)
>
> --- a/include/linux/trace_events.h
> +++ b/include/linux/trace_events.h
> @@ -173,6 +173,11 @@ enum trace_reg {
>   	TRACE_REG_PERF_UNREGISTER,
>   	TRACE_REG_PERF_OPEN,
>   	TRACE_REG_PERF_CLOSE,
> +	/*
> +	 * These (ADD/DEL) use a 'boolean' return value, where 1 (true) means a
> +	 * custom action was taken and the default action is not to be
> +	 * performed.
> +	 */
>   	TRACE_REG_PERF_ADD,
>   	TRACE_REG_PERF_DEL,
>   #endif
> --- a/kernel/trace/trace_event_perf.c
> +++ b/kernel/trace/trace_event_perf.c
> @@ -240,27 +240,41 @@ void perf_trace_destroy(struct perf_even
>   int perf_trace_add(struct perf_event *p_event, int flags)
>   {
>   	struct trace_event_call *tp_event = p_event->tp_event;
> -	struct hlist_head __percpu *pcpu_list;
> -	struct hlist_head *list;
>
> -	pcpu_list = tp_event->perf_events;
> -	if (WARN_ON_ONCE(!pcpu_list))
> -		return -EINVAL;
> +	/*
> +	 * If TRACE_REG_PERF_ADD returns false; no custom action was performed
> +	 * and we need to take the default action of enqueueing our event on
> +	 * the right per-cpu hlist.
> +	 */
> +	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
> +		struct hlist_head __percpu *pcpu_list;
> +		struct hlist_head *list;
> +
> +		pcpu_list = tp_event->perf_events;
> +		if (WARN_ON_ONCE(!pcpu_list))
> +			return -EINVAL;
>
> -	if (!(flags & PERF_EF_START))
> -		p_event->hw.state = PERF_HES_STOPPED;
> +		if (!(flags & PERF_EF_START))
> +			p_event->hw.state = PERF_HES_STOPPED;

Don't we need to check the flags for the ftrace perf_event too?
So should we put this outside the if (!tp_event->class->reg()) block?

>
> -	list = this_cpu_ptr(pcpu_list);
> -	hlist_add_head_rcu(&p_event->hlist_entry, list);
> +		list = this_cpu_ptr(pcpu_list);
> +		hlist_add_head_rcu(&p_event->hlist_entry, list);
> +	}

Now that we don't add the perf_event to the pcpu_list, we can also avoid
allocating the pcpu_list for the function tp_event in perf_trace_event_reg().

Thanks.

>
> -	return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
> +	return 0;
>   }
>
>   void perf_trace_del(struct perf_event *p_event, int flags)
>   {
>   	struct trace_event_call *tp_event = p_event->tp_event;
> -	hlist_del_rcu(&p_event->hlist_entry);
> -	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
> +
> +	/*
> +	 * If TRACE_REG_PERF_DEL returns false; no custom action was performed
> +	 * and we need to take the default action of dequeueing our event from
> +	 * the right per-cpu hlist.
> +	 */
> +	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
> +		hlist_del_rcu(&p_event->hlist_entry);
>   }
>
>   void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
> @@ -307,14 +321,24 @@ perf_ftrace_function_call(unsigned long
>   			  struct ftrace_ops *ops, struct pt_regs *pt_regs)
>   {
>   	struct ftrace_entry *entry;
> -	struct hlist_head *head;
> +	struct perf_event *event;
> +	struct hlist_head head;
>   	struct pt_regs regs;
>   	int rctx;
>
> -	head = this_cpu_ptr(event_function.perf_events);
> -	if (hlist_empty(head))
> +	if ((unsigned long)ops->private != smp_processor_id())
>   		return;
>
> +	event = container_of(ops, struct perf_event, ftrace_ops);
> +
> +	/*
> +	 * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
> +	 * the perf code does is hlist_for_each_entry_rcu(), so we can
> +	 * get away with simply setting the @head.first pointer in order
> +	 * to create a singular list.
> +	 */
> +	head.first = &event->hlist_entry;
> +
>   #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
>   		    sizeof(u64)) - sizeof(u32))
>
> @@ -330,7 +354,7 @@ perf_ftrace_function_call(unsigned long
>   	entry->ip = ip;
>   	entry->parent_ip = parent_ip;
>   	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
> -			      1, &regs, head, NULL);
> +			      1, &regs, &head, NULL);
>
>   #undef ENTRY_SIZE
>   }
> @@ -339,8 +363,10 @@ static int perf_ftrace_function_register
>   {
>   	struct ftrace_ops *ops = &event->ftrace_ops;
>
> -	ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
> -	ops->func = perf_ftrace_function_call;
> +	ops->flags   |= FTRACE_OPS_FL_RCU;
> +	ops->func    = perf_ftrace_function_call;
> +	ops->private = (void *)(unsigned long)nr_cpu_ids;
> +
>   	return register_ftrace_function(ops);
>   }
>
> @@ -352,19 +378,11 @@ static int perf_ftrace_function_unregist
>   	return ret;
>   }
>
> -static void perf_ftrace_function_enable(struct perf_event *event)
> -{
> -	ftrace_function_local_enable(&event->ftrace_ops);
> -}
> -
> -static void perf_ftrace_function_disable(struct perf_event *event)
> -{
> -	ftrace_function_local_disable(&event->ftrace_ops);
> -}
> -
>   int perf_ftrace_event_register(struct trace_event_call *call,
>   			       enum trace_reg type, void *data)
>   {
> +	struct perf_event *event = data;
> +
>   	switch (type) {
>   	case TRACE_REG_REGISTER:
>   	case TRACE_REG_UNREGISTER:
> @@ -377,11 +395,11 @@ int perf_ftrace_event_register(struct tr
>   	case TRACE_REG_PERF_CLOSE:
>   		return perf_ftrace_function_unregister(data);
>   	case TRACE_REG_PERF_ADD:
> -		perf_ftrace_function_enable(data);
> -		return 0;
> +		event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
> +		return 1;
>   	case TRACE_REG_PERF_DEL:
> -		perf_ftrace_function_disable(data);
> -		return 0;
> +		event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
> +		return 1;
>   	}
>
>   	return -EINVAL;
>


* Re: [PATCH 2/4] perf/ftrace: Fix function trace events
  2017-10-11  7:45 ` [PATCH 2/4] perf/ftrace: Fix function trace events Peter Zijlstra
  2017-10-11  9:02   ` zhouchengming
@ 2017-10-11 11:59   ` Jiri Olsa
  2017-10-11 13:04     ` Steven Rostedt
  2017-10-13  7:17   ` Peter Zijlstra
  2 siblings, 1 reply; 13+ messages in thread
From: Jiri Olsa @ 2017-10-11 11:59 UTC
  To: Peter Zijlstra; +Cc: rostedt, mingo, linux-kernel, jolsa, zhouchengming1

On Wed, Oct 11, 2017 at 09:45:30AM +0200, Peter Zijlstra wrote:

SNIP

>  void perf_trace_del(struct perf_event *p_event, int flags)
>  {
>  	struct trace_event_call *tp_event = p_event->tp_event;
> -	hlist_del_rcu(&p_event->hlist_entry);
> -	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
> +
> +	/*
> +	 * If TRACE_REG_PERF_DEL returns false; no custom action was performed
> +	 * and we need to take the default action of dequeueing our event from
> +	 * the right per-cpu hlist.
> +	 */
> +	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
> +		hlist_del_rcu(&p_event->hlist_entry);
>  }
>  
>  void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
> @@ -307,14 +321,24 @@ perf_ftrace_function_call(unsigned long
>  			  struct ftrace_ops *ops, struct pt_regs *pt_regs)
>  {
>  	struct ftrace_entry *entry;
> -	struct hlist_head *head;
> +	struct perf_event *event;
> +	struct hlist_head head;
>  	struct pt_regs regs;
>  	int rctx;
>  
> -	head = this_cpu_ptr(event_function.perf_events);
> -	if (hlist_empty(head))
> +	if ((unsigned long)ops->private != smp_processor_id())
>  		return;
>  
> +	event = container_of(ops, struct perf_event, ftrace_ops);
> +
> +	/*
> +	 * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
> +	 * the perf code does is hlist_for_each_entry_rcu(), so we can
> +	 * get away with simply setting the @head.first pointer in order
> +	 * to create a singular list.
> +	 */
> +	head.first = &event->hlist_entry;
> +
>  #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
>  		    sizeof(u64)) - sizeof(u32))
>  
> @@ -330,7 +354,7 @@ perf_ftrace_function_call(unsigned long
>  	entry->ip = ip;
>  	entry->parent_ip = parent_ip;
>  	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
> -			      1, &regs, head, NULL);
> +			      1, &regs, &head, NULL);

nice idea.. I'll test it

jirka


* Re: [PATCH 1/4] perf/ftrace: Revert ("perf/ftrace: Fix double traces of perf on ftrace:function")
  2017-10-11  7:45 ` [PATCH 1/4] perf/ftrace: Revert ("perf/ftrace: Fix double traces of perf on ftrace:function") Peter Zijlstra
@ 2017-10-11 12:57   ` Steven Rostedt
  0 siblings, 0 replies; 13+ messages in thread
From: Steven Rostedt @ 2017-10-11 12:57 UTC
  To: Peter Zijlstra; +Cc: mingo, linux-kernel, jolsa, zhouchengming1

On Wed, 11 Oct 2017 09:45:29 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> Revert commit:
> 
>   75e8387685f6 ("perf/ftrace: Fix double traces of perf on ftrace:function")
> 
> The reason I instantly stumbled over that patch is that it only addresses the
> ftrace situation and doesn't mention the other _5_ places that use this
> interface. It doesn't explain why those don't have the problem and, if they do,
> why their solution doesn't work for ftrace.
> 
> It doesn't, and the reverted patch just puts more duct tape on.
> 
> Cc: Zhou Chengming <zhouchengming1@huawei.com>
> Cc: Jiri Olsa <jolsa@kernel.org>
> Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
> Cc: Ingo Molnar <mingo@kernel.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>

-- Steve


* Re: [PATCH 2/4] perf/ftrace: Fix function trace events
  2017-10-11 11:59   ` Jiri Olsa
@ 2017-10-11 13:04     ` Steven Rostedt
  2017-10-11 14:23       ` Jiri Olsa
  0 siblings, 1 reply; 13+ messages in thread
From: Steven Rostedt @ 2017-10-11 13:04 UTC
  To: Jiri Olsa; +Cc: Peter Zijlstra, mingo, linux-kernel, jolsa, zhouchengming1

On Wed, 11 Oct 2017 13:59:54 +0200
Jiri Olsa <jolsa@redhat.com> wrote:

> > @@ -330,7 +354,7 @@ perf_ftrace_function_call(unsigned long
> >  	entry->ip = ip;
> >  	entry->parent_ip = parent_ip;
> >  	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
> > -			      1, &regs, head, NULL);
> > +			      1, &regs, &head, NULL);  
> 
> nice idea.. I'll test it

After you test it, want me to take this through my tree?

Also, we should address zhouchengming's concerns.

-- Steve


* Re: [PATCH 2/4] perf/ftrace: Fix function trace events
  2017-10-11 13:04     ` Steven Rostedt
@ 2017-10-11 14:23       ` Jiri Olsa
  0 siblings, 0 replies; 13+ messages in thread
From: Jiri Olsa @ 2017-10-11 14:23 UTC
  To: Steven Rostedt; +Cc: Peter Zijlstra, mingo, linux-kernel, jolsa, zhouchengming1

On Wed, Oct 11, 2017 at 09:04:20AM -0400, Steven Rostedt wrote:
> On Wed, 11 Oct 2017 13:59:54 +0200
> Jiri Olsa <jolsa@redhat.com> wrote:
> 
> > > @@ -330,7 +354,7 @@ perf_ftrace_function_call(unsigned long
> > >  	entry->ip = ip;
> > >  	entry->parent_ip = parent_ip;
> > >  	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
> > > -			      1, &regs, head, NULL);
> > > +			      1, &regs, &head, NULL);  
> > 
> > nice idea.. I'll test it
> 
> After you test it, want me to take this through my tree?

Peter's call ;-)

> Also, we should address zhouchengming's concerns.

agreed.. I tested it and it looks ok to me

jirka


* Re: [PATCH 2/4] perf/ftrace: Fix function trace events
  2017-10-11  9:02   ` zhouchengming
@ 2017-10-11 16:40     ` Peter Zijlstra
  0 siblings, 0 replies; 13+ messages in thread
From: Peter Zijlstra @ 2017-10-11 16:40 UTC
  To: zhouchengming; +Cc: rostedt, mingo, linux-kernel, jolsa

On Wed, Oct 11, 2017 at 05:02:28PM +0800, zhouchengming wrote:
> On 2017/10/11 15:45, Peter Zijlstra wrote:

> > +		if (!(flags & PERF_EF_START))
> > +			p_event->hw.state = PERF_HES_STOPPED;
> 
> Don't we need to check the flags for the ftrace perf_event too?
> So should we put this outside the if (!tp_event->class->reg()) block?

Oh, right you are, that's independent of the hlist nonsense.

> > +		list = this_cpu_ptr(pcpu_list);
> > +		hlist_add_head_rcu(&p_event->hlist_entry, list);
> > +	}
> 
> Now that we don't add the perf_event to the pcpu_list, we can also avoid
> allocating the pcpu_list for the function tp_event in perf_trace_event_reg().

Yeah, that allocation is superfluous. Not entirely sure how to cleanly
get rid of it though; that's another web to untangle.


* Re: [PATCH 2/4] perf/ftrace: Fix function trace events
  2017-10-11  7:45 ` [PATCH 2/4] perf/ftrace: Fix function trace events Peter Zijlstra
  2017-10-11  9:02   ` zhouchengming
  2017-10-11 11:59   ` Jiri Olsa
@ 2017-10-13  7:17   ` Peter Zijlstra
  2017-10-16 22:15     ` Steven Rostedt
  2 siblings, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2017-10-13  7:17 UTC
  To: rostedt, mingo; +Cc: linux-kernel, jolsa, zhouchengming1


Updated version; Steve, could you route these 4 patches? They're mostly
kernel/trace/ related.

---
Subject: perf/ftrace: Fix function trace events
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue Oct 10 17:15:47 CEST 2017

The function-trace <-> perf interface is a tad messed up. Whereas all
the other trace <-> perf interfaces use a single trace hook
registration and a per-cpu RCU-based hlist to iterate the events,
function-trace actually needs multiple hook registrations in order to
minimize function entry patching when filters are present.

The end result is that we iterate events both on the trace hook and on
the hlist, which results in reporting events multiple times.

Since function-trace cannot use the regular scheme, fix it the other
way around: use singleton hlists.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/trace_events.h    |    5 ++
 kernel/trace/trace_event_perf.c |   80 ++++++++++++++++++++++++----------------
 2 files changed, 54 insertions(+), 31 deletions(-)

--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -173,6 +173,11 @@ enum trace_reg {
 	TRACE_REG_PERF_UNREGISTER,
 	TRACE_REG_PERF_OPEN,
 	TRACE_REG_PERF_CLOSE,
+	/*
+	 * These (ADD/DEL) use a 'boolean' return value, where 1 (true) means a
+	 * custom action was taken and the default action is not to be
+	 * performed.
+	 */
 	TRACE_REG_PERF_ADD,
 	TRACE_REG_PERF_DEL,
 #endif
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -240,27 +240,41 @@ void perf_trace_destroy(struct perf_even
 int perf_trace_add(struct perf_event *p_event, int flags)
 {
 	struct trace_event_call *tp_event = p_event->tp_event;
-	struct hlist_head __percpu *pcpu_list;
-	struct hlist_head *list;
-
-	pcpu_list = tp_event->perf_events;
-	if (WARN_ON_ONCE(!pcpu_list))
-		return -EINVAL;
 
 	if (!(flags & PERF_EF_START))
 		p_event->hw.state = PERF_HES_STOPPED;
 
-	list = this_cpu_ptr(pcpu_list);
-	hlist_add_head_rcu(&p_event->hlist_entry, list);
+	/*
+	 * If TRACE_REG_PERF_ADD returns false; no custom action was performed
+	 * and we need to take the default action of enqueueing our event on
+	 * the right per-cpu hlist.
+	 */
+	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
+		struct hlist_head __percpu *pcpu_list;
+		struct hlist_head *list;
+
+		pcpu_list = tp_event->perf_events;
+		if (WARN_ON_ONCE(!pcpu_list))
+			return -EINVAL;
+
+		list = this_cpu_ptr(pcpu_list);
+		hlist_add_head_rcu(&p_event->hlist_entry, list);
+	}
 
-	return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
+	return 0;
 }
 
 void perf_trace_del(struct perf_event *p_event, int flags)
 {
 	struct trace_event_call *tp_event = p_event->tp_event;
-	hlist_del_rcu(&p_event->hlist_entry);
-	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
+
+	/*
+	 * If TRACE_REG_PERF_DEL returns false; no custom action was performed
+	 * and we need to take the default action of dequeueing our event from
+	 * the right per-cpu hlist.
+	 */
+	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
+		hlist_del_rcu(&p_event->hlist_entry);
 }
 
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
@@ -307,14 +321,24 @@ perf_ftrace_function_call(unsigned long
 			  struct ftrace_ops *ops, struct pt_regs *pt_regs)
 {
 	struct ftrace_entry *entry;
-	struct hlist_head *head;
+	struct perf_event *event;
+	struct hlist_head head;
 	struct pt_regs regs;
 	int rctx;
 
-	head = this_cpu_ptr(event_function.perf_events);
-	if (hlist_empty(head))
+	if ((unsigned long)ops->private != smp_processor_id())
 		return;
 
+	event = container_of(ops, struct perf_event, ftrace_ops);
+
+	/*
+	 * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
+	 * the perf code does is hlist_for_each_entry_rcu(), so we can
+	 * get away with simply setting the @head.first pointer in order
+	 * to create a singular list.
+	 */
+	head.first = &event->hlist_entry;
+
 #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
 		    sizeof(u64)) - sizeof(u32))
 
@@ -330,7 +354,7 @@ perf_ftrace_function_call(unsigned long
 	entry->ip = ip;
 	entry->parent_ip = parent_ip;
 	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
-			      1, &regs, head, NULL);
+			      1, &regs, &head, NULL);
 
 #undef ENTRY_SIZE
 }
@@ -339,8 +363,10 @@ static int perf_ftrace_function_register
 {
 	struct ftrace_ops *ops = &event->ftrace_ops;
 
-	ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
-	ops->func = perf_ftrace_function_call;
+	ops->flags   |= FTRACE_OPS_FL_RCU;
+	ops->func    = perf_ftrace_function_call;
+	ops->private = (void *)(unsigned long)nr_cpu_ids;
+
 	return register_ftrace_function(ops);
 }
 
@@ -352,19 +378,11 @@ static int perf_ftrace_function_unregist
 	return ret;
 }
 
-static void perf_ftrace_function_enable(struct perf_event *event)
-{
-	ftrace_function_local_enable(&event->ftrace_ops);
-}
-
-static void perf_ftrace_function_disable(struct perf_event *event)
-{
-	ftrace_function_local_disable(&event->ftrace_ops);
-}
-
 int perf_ftrace_event_register(struct trace_event_call *call,
 			       enum trace_reg type, void *data)
 {
+	struct perf_event *event = data;
+
 	switch (type) {
 	case TRACE_REG_REGISTER:
 	case TRACE_REG_UNREGISTER:
@@ -377,11 +395,11 @@ int perf_ftrace_event_register(struct tr
 	case TRACE_REG_PERF_CLOSE:
 		return perf_ftrace_function_unregister(data);
 	case TRACE_REG_PERF_ADD:
-		perf_ftrace_function_enable(data);
-		return 0;
+		event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
+		return 1;
 	case TRACE_REG_PERF_DEL:
-		perf_ftrace_function_disable(data);
-		return 0;
+		event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
+		return 1;
 	}
 
 	return -EINVAL;


* Re: [PATCH 2/4] perf/ftrace: Fix function trace events
  2017-10-13  7:17   ` Peter Zijlstra
@ 2017-10-16 22:15     ` Steven Rostedt
  0 siblings, 0 replies; 13+ messages in thread
From: Steven Rostedt @ 2017-10-16 22:15 UTC
  To: Peter Zijlstra; +Cc: mingo, linux-kernel, jolsa, zhouchengming1

On Fri, 13 Oct 2017 09:17:00 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> Updated version; Steve could you route these 4 patches, they're mostly
> kernel/trace/ related.

I'll just pull these in. They are fine for 4.15, right? They don't need to
go to stable, do they?

-- Steve

