* [PATCH 1/2] perf: Drop the obsolete profile naming for trace events
@ 2010-03-05  7:00 Frederic Weisbecker
  2010-03-05  7:00 ` [PATCH 2/2] perf: Walk through the relevant events only Frederic Weisbecker
  0 siblings, 1 reply; 13+ messages in thread
From: Frederic Weisbecker @ 2010-03-05  7:00 UTC (permalink / raw)
  To: LKML
  Cc: LKML, Frederic Weisbecker, Ingo Molnar, Peter Zijlstra,
	Paul Mackerras, Steven Rostedt, Masami Hiramatsu, Jason Baron

Drop the obsolete "profile" naming used by perf for trace events.
Perf can now do more than simple event counting, so generalize
the API naming.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jason Baron <jbaron@redhat.com>
---
 include/linux/ftrace_event.h                       |   16 ++--
 include/linux/syscalls.h                           |   24 +++---
 include/trace/ftrace.h                             |   38 +++++-----
 include/trace/syscall.h                            |    8 +-
 kernel/perf_event.c                                |    4 +-
 kernel/trace/Makefile                              |    2 +-
 .../{trace_event_profile.c => trace_event_perf.c}  |   44 ++++++------
 kernel/trace/trace_events.c                        |    2 +-
 kernel/trace/trace_kprobe.c                        |   28 ++++----
 kernel/trace/trace_syscalls.c                      |   72 ++++++++++----------
 10 files changed, 119 insertions(+), 119 deletions(-)
 rename kernel/trace/{trace_event_profile.c => trace_event_perf.c} (73%)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 48b367e..7ee157d 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -132,12 +132,12 @@ struct ftrace_event_call {
 	void			*mod;
 	void			*data;
 
-	int			profile_count;
-	int			(*profile_enable)(struct ftrace_event_call *);
-	void			(*profile_disable)(struct ftrace_event_call *);
+	int			perf_refcount;
+	int			(*perf_event_enable)(struct ftrace_event_call *);
+	void			(*perf_event_disable)(struct ftrace_event_call *);
 };
 
-#define FTRACE_MAX_PROFILE_SIZE	2048
+#define PERF_MAX_TRACE_SIZE	2048
 
 #define MAX_FILTER_PRED		32
 #define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
@@ -191,17 +191,17 @@ struct perf_event;
 
 DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
 
-extern int ftrace_profile_enable(int event_id);
-extern void ftrace_profile_disable(int event_id);
+extern int perf_trace_enable(int event_id);
+extern void perf_trace_disable(int event_id);
 extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 				     char *filter_str);
 extern void ftrace_profile_free_filter(struct perf_event *event);
 extern void *
-ftrace_perf_buf_prepare(int size, unsigned short type, int *rctxp,
+perf_trace_buf_prepare(int size, unsigned short type, int *rctxp,
 			 unsigned long *irq_flags);
 
 static inline void
-ftrace_perf_buf_submit(void *raw_data, int size, int rctx, u64 addr,
+perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
 		       u64 count, unsigned long irq_flags, struct pt_regs *regs)
 {
 	struct trace_entry *entry = raw_data;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index feee739..eb46f5b 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -101,18 +101,18 @@ struct perf_event_attr;
 
 #ifdef CONFIG_PERF_EVENTS
 
-#define TRACE_SYS_ENTER_PROFILE_INIT(sname)				       \
-	.profile_enable = prof_sysenter_enable,				       \
-	.profile_disable = prof_sysenter_disable,
+#define TRACE_SYS_ENTER_PERF_INIT(sname)				       \
+	.perf_event_enable = perf_sysenter_enable,			       \
+	.perf_event_disable = perf_sysenter_disable,
 
-#define TRACE_SYS_EXIT_PROFILE_INIT(sname)				       \
-	.profile_enable = prof_sysexit_enable,				       \
-	.profile_disable = prof_sysexit_disable,
+#define TRACE_SYS_EXIT_PERF_INIT(sname)					       \
+	.perf_event_enable = perf_sysexit_enable,			       \
+	.perf_event_disable = perf_sysexit_disable,
 #else
-#define TRACE_SYS_ENTER_PROFILE(sname)
-#define TRACE_SYS_ENTER_PROFILE_INIT(sname)
-#define TRACE_SYS_EXIT_PROFILE(sname)
-#define TRACE_SYS_EXIT_PROFILE_INIT(sname)
+#define TRACE_SYS_ENTER_PERF(sname)
+#define TRACE_SYS_ENTER_PERF_INIT(sname)
+#define TRACE_SYS_EXIT_PERF(sname)
+#define TRACE_SYS_EXIT_PERF_INIT(sname)
 #endif /* CONFIG_PERF_EVENTS */
 
 #ifdef CONFIG_FTRACE_SYSCALLS
@@ -149,7 +149,7 @@ struct perf_event_attr;
 		.regfunc		= reg_event_syscall_enter,	\
 		.unregfunc		= unreg_event_syscall_enter,	\
 		.data			= (void *)&__syscall_meta_##sname,\
-		TRACE_SYS_ENTER_PROFILE_INIT(sname)			\
+		TRACE_SYS_ENTER_PERF_INIT(sname)			\
 	}
 
 #define SYSCALL_TRACE_EXIT_EVENT(sname)					\
@@ -171,7 +171,7 @@ struct perf_event_attr;
 		.regfunc		= reg_event_syscall_exit,	\
 		.unregfunc		= unreg_event_syscall_exit,	\
 		.data			= (void *)&__syscall_meta_##sname,\
-		TRACE_SYS_EXIT_PROFILE_INIT(sname)			\
+		TRACE_SYS_EXIT_PERF_INIT(sname)			\
 	}
 
 #define SYSCALL_METADATA(sname, nb)				\
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index a4fa51c..e92f037 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -524,16 +524,16 @@ static inline int ftrace_get_offsets_##call(				\
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, name, proto, args)			\
 									\
-static void ftrace_profile_##name(proto);				\
+static void perf_trace_##name(proto);					\
 									\
-static int ftrace_profile_enable_##name(struct ftrace_event_call *unused)\
+static int perf_trace_enable_##name(struct ftrace_event_call *unused)	\
 {									\
-	return register_trace_##name(ftrace_profile_##name);		\
+	return register_trace_##name(perf_trace_##name);		\
 }									\
 									\
-static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\
+static void perf_trace_disable_##name(struct ftrace_event_call *unused)\
 {									\
-	unregister_trace_##name(ftrace_profile_##name);			\
+	unregister_trace_##name(perf_trace_##name);			\
 }
 
 #undef DEFINE_EVENT_PRINT
@@ -629,12 +629,12 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\
 
 #ifdef CONFIG_PERF_EVENTS
 
-#define _TRACE_PROFILE_INIT(call)					\
-	.profile_enable = ftrace_profile_enable_##call,			\
-	.profile_disable = ftrace_profile_disable_##call,
+#define _TRACE_PERF_INIT(call)						\
+	.perf_event_enable = perf_trace_enable_##call,			\
+	.perf_event_disable = perf_trace_disable_##call,
 
 #else
-#define _TRACE_PROFILE_INIT(call)
+#define _TRACE_PERF_INIT(call)
 #endif /* CONFIG_PERF_EVENTS */
 
 #undef __entry
@@ -739,7 +739,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.unregfunc		= ftrace_raw_unreg_event_##call,	\
 	.show_format		= ftrace_format_##template,		\
 	.define_fields		= ftrace_define_fields_##template,	\
-	_TRACE_PROFILE_INIT(call)					\
+	_TRACE_PERF_INIT(call)					\
 }
 
 #undef DEFINE_EVENT_PRINT
@@ -756,18 +756,18 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.unregfunc		= ftrace_raw_unreg_event_##call,	\
 	.show_format		= ftrace_format_##call,			\
 	.define_fields		= ftrace_define_fields_##template,	\
-	_TRACE_PROFILE_INIT(call)					\
+	_TRACE_PERF_INIT(call)					\
 }
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 /*
- * Define the insertion callback to profile events
+ * Define the insertion callback to perf events
  *
  * The job is very similar to ftrace_raw_event_<call> except that we don't
  * insert in the ring buffer but in a perf counter.
  *
- * static void ftrace_profile_<call>(proto)
+ * static void ftrace_perf_<call>(proto)
  * {
  *	struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
  *	struct ftrace_event_call *event_call = &event_<call>;
@@ -846,7 +846,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 static void								\
-ftrace_profile_templ_##call(struct ftrace_event_call *event_call,	\
+perf_trace_templ_##call(struct ftrace_event_call *event_call,		\
 			    proto)					\
 {									\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
@@ -863,10 +863,10 @@ ftrace_profile_templ_##call(struct ftrace_event_call *event_call,	\
 			     sizeof(u64));				\
 	__entry_size -= sizeof(u32);					\
 									\
-	if (WARN_ONCE(__entry_size > FTRACE_MAX_PROFILE_SIZE,		\
+	if (WARN_ONCE(__entry_size > PERF_MAX_TRACE_SIZE,		\
 		      "profile buffer not large enough"))		\
 		return;							\
-	entry = (struct ftrace_raw_##call *)ftrace_perf_buf_prepare(	\
+	entry = (struct ftrace_raw_##call *)perf_trace_buf_prepare(	\
 		__entry_size, event_call->id, &rctx, &irq_flags);	\
 	if (!entry)							\
 		return;							\
@@ -877,17 +877,17 @@ ftrace_profile_templ_##call(struct ftrace_event_call *event_call,	\
 	__regs = &__get_cpu_var(perf_trace_regs);			\
 	perf_save_regs(__regs, 2);					\
 									\
-	ftrace_perf_buf_submit(entry, __entry_size, rctx, __addr,	\
+	perf_trace_buf_submit(entry, __entry_size, rctx, __addr,	\
 			       __count, irq_flags, __regs);		\
 }
 
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, call, proto, args)		\
-static void ftrace_profile_##call(proto)			\
+static void perf_trace_##call(proto)				\
 {								\
 	struct ftrace_event_call *event_call = &event_##call;	\
 								\
-	ftrace_profile_templ_##template(event_call, args);	\
+	perf_trace_templ_##template(event_call, args);		\
 }
 
 #undef DEFINE_EVENT_PRINT
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 3d463dc..ad193fa 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -51,10 +51,10 @@ enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags);
 #endif
 
 #ifdef CONFIG_PERF_EVENTS
-int prof_sysenter_enable(struct ftrace_event_call *call);
-void prof_sysenter_disable(struct ftrace_event_call *call);
-int prof_sysexit_enable(struct ftrace_event_call *call);
-void prof_sysexit_disable(struct ftrace_event_call *call);
+int perf_sysenter_enable(struct ftrace_event_call *call);
+void perf_sysenter_disable(struct ftrace_event_call *call);
+int perf_sysexit_enable(struct ftrace_event_call *call);
+void perf_sysexit_disable(struct ftrace_event_call *call);
 #endif
 
 #endif /* _TRACE_SYSCALL_H */
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 763a482..4fa24a5 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4370,7 +4370,7 @@ static int perf_tp_event_match(struct perf_event *event,
 
 static void tp_perf_event_destroy(struct perf_event *event)
 {
-	ftrace_profile_disable(event->attr.config);
+	perf_trace_disable(event->attr.config);
 }
 
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
@@ -4384,7 +4384,7 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
 			!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	if (ftrace_profile_enable(event->attr.config))
+	if (perf_trace_enable(event->attr.config))
 		return NULL;
 
 	event->destroy = tp_perf_event_destroy;
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d00c6fe..78edc64 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events.o
 obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 ifeq ($(CONFIG_PERF_EVENTS),y)
-obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o
+obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_perf.c
similarity index 73%
rename from kernel/trace/trace_event_profile.c
rename to kernel/trace/trace_event_perf.c
index e66d21e..f315b12 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_perf.c
@@ -1,5 +1,5 @@
 /*
- * trace event based perf counter profiling
+ * trace event based perf event profiling/tracing
  *
  * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
  * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
@@ -14,20 +14,20 @@ DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
 static char *perf_trace_buf;
 static char *perf_trace_buf_nmi;
 
-typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
+typedef typeof(char [PERF_MAX_TRACE_SIZE]) perf_trace_t ;
 
 /* Count the events in use (per event id, not per instance) */
-static int	total_profile_count;
+static int	total_ref_count;
 
-static int ftrace_profile_enable_event(struct ftrace_event_call *event)
+static int perf_trace_event_enable(struct ftrace_event_call *event)
 {
 	char *buf;
 	int ret = -ENOMEM;
 
-	if (event->profile_count++ > 0)
+	if (event->perf_refcount++ > 0)
 		return 0;
 
-	if (!total_profile_count) {
+	if (!total_ref_count) {
 		buf = (char *)alloc_percpu(perf_trace_t);
 		if (!buf)
 			goto fail_buf;
@@ -41,35 +41,35 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
 		rcu_assign_pointer(perf_trace_buf_nmi, buf);
 	}
 
-	ret = event->profile_enable(event);
+	ret = event->perf_event_enable(event);
 	if (!ret) {
-		total_profile_count++;
+		total_ref_count++;
 		return 0;
 	}
 
 fail_buf_nmi:
-	if (!total_profile_count) {
+	if (!total_ref_count) {
 		free_percpu(perf_trace_buf_nmi);
 		free_percpu(perf_trace_buf);
 		perf_trace_buf_nmi = NULL;
 		perf_trace_buf = NULL;
 	}
 fail_buf:
-	event->profile_count--;
+	event->perf_refcount--;
 
 	return ret;
 }
 
-int ftrace_profile_enable(int event_id)
+int perf_trace_enable(int event_id)
 {
 	struct ftrace_event_call *event;
 	int ret = -EINVAL;
 
 	mutex_lock(&event_mutex);
 	list_for_each_entry(event, &ftrace_events, list) {
-		if (event->id == event_id && event->profile_enable &&
+		if (event->id == event_id && event->perf_event_enable &&
 		    try_module_get(event->mod)) {
-			ret = ftrace_profile_enable_event(event);
+			ret = perf_trace_event_enable(event);
 			break;
 		}
 	}
@@ -78,16 +78,16 @@ int ftrace_profile_enable(int event_id)
 	return ret;
 }
 
-static void ftrace_profile_disable_event(struct ftrace_event_call *event)
+static void perf_trace_event_disable(struct ftrace_event_call *event)
 {
 	char *buf, *nmi_buf;
 
-	if (--event->profile_count > 0)
+	if (--event->perf_refcount > 0)
 		return;
 
-	event->profile_disable(event);
+	event->perf_event_disable(event);
 
-	if (!--total_profile_count) {
+	if (!--total_ref_count) {
 		buf = perf_trace_buf;
 		rcu_assign_pointer(perf_trace_buf, NULL);
 
@@ -105,14 +105,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
 	}
 }
 
-void ftrace_profile_disable(int event_id)
+void perf_trace_disable(int event_id)
 {
 	struct ftrace_event_call *event;
 
 	mutex_lock(&event_mutex);
 	list_for_each_entry(event, &ftrace_events, list) {
 		if (event->id == event_id) {
-			ftrace_profile_disable_event(event);
+			perf_trace_event_disable(event);
 			module_put(event->mod);
 			break;
 		}
@@ -120,8 +120,8 @@ void ftrace_profile_disable(int event_id)
 	mutex_unlock(&event_mutex);
 }
 
-__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
-					int *rctxp, unsigned long *irq_flags)
+__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
+				       int *rctxp, unsigned long *irq_flags)
 {
 	struct trace_entry *entry;
 	char *trace_buf, *raw_data;
@@ -162,4 +162,4 @@ err_recursion:
 	local_irq_restore(*irq_flags);
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare);
+EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 189b09b..27296c7 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -931,7 +931,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
 		trace_create_file("enable", 0644, call->dir, call,
 				  enable);
 
-	if (call->id && call->profile_enable)
+	if (call->id && call->perf_event_enable)
 		trace_create_file("id", 0444, call->dir, call,
 		 		  id);
 
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 73e0526..41ef5fa 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1234,7 +1234,7 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call,
 #ifdef CONFIG_PERF_EVENTS
 
 /* Kprobe profile handler */
-static __kprobes void kprobe_profile_func(struct kprobe *kp,
+static __kprobes void kprobe_perf_func(struct kprobe *kp,
 					 struct pt_regs *regs)
 {
 	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
@@ -1247,11 +1247,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp,
 	__size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
-	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
 		     "profile buffer not large enough"))
 		return;
 
-	entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
+	entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
 	if (!entry)
 		return;
 
@@ -1260,11 +1260,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp,
 	for (i = 0; i < tp->nr_args; i++)
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 
-	ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
+	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
 }
 
 /* Kretprobe profile handler */
-static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
+static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
 					    struct pt_regs *regs)
 {
 	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -1277,11 +1277,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
 	__size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
-	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
 		     "profile buffer not large enough"))
 		return;
 
-	entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
+	entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
 	if (!entry)
 		return;
 
@@ -1291,11 +1291,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
 	for (i = 0; i < tp->nr_args; i++)
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 
-	ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1,
+	perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
 			       irq_flags, regs);
 }
 
-static int probe_profile_enable(struct ftrace_event_call *call)
+static int probe_perf_enable(struct ftrace_event_call *call)
 {
 	struct trace_probe *tp = (struct trace_probe *)call->data;
 
@@ -1307,7 +1307,7 @@ static int probe_profile_enable(struct ftrace_event_call *call)
 		return enable_kprobe(&tp->rp.kp);
 }
 
-static void probe_profile_disable(struct ftrace_event_call *call)
+static void probe_perf_disable(struct ftrace_event_call *call)
 {
 	struct trace_probe *tp = (struct trace_probe *)call->data;
 
@@ -1332,7 +1332,7 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 		kprobe_trace_func(kp, regs);
 #ifdef CONFIG_PERF_EVENTS
 	if (tp->flags & TP_FLAG_PROFILE)
-		kprobe_profile_func(kp, regs);
+		kprobe_perf_func(kp, regs);
 #endif
 	return 0;	/* We don't tweek kernel, so just return 0 */
 }
@@ -1346,7 +1346,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
 		kretprobe_trace_func(ri, regs);
 #ifdef CONFIG_PERF_EVENTS
 	if (tp->flags & TP_FLAG_PROFILE)
-		kretprobe_profile_func(ri, regs);
+		kretprobe_perf_func(ri, regs);
 #endif
 	return 0;	/* We don't tweek kernel, so just return 0 */
 }
@@ -1377,8 +1377,8 @@ static int register_probe_event(struct trace_probe *tp)
 	call->unregfunc = probe_event_disable;
 
 #ifdef CONFIG_PERF_EVENTS
-	call->profile_enable = probe_profile_enable;
-	call->profile_disable = probe_profile_disable;
+	call->perf_event_enable = probe_perf_enable;
+	call->perf_event_disable = probe_perf_disable;
 #endif
 	call->data = tp;
 	ret = trace_add_event_call(call);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index d0f34ab..99cc45f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -423,12 +423,12 @@ core_initcall(init_ftrace_syscalls);
 
 #ifdef CONFIG_PERF_EVENTS
 
-static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
-static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
-static int sys_prof_refcount_enter;
-static int sys_prof_refcount_exit;
+static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
+static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
+static int sys_perf_refcount_enter;
+static int sys_perf_refcount_exit;
 
-static void prof_syscall_enter(struct pt_regs *regs, long id)
+static void perf_syscall_enter(struct pt_regs *regs, long id)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_enter *rec;
@@ -438,7 +438,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	int size;
 
 	syscall_nr = syscall_get_nr(current, regs);
-	if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
+	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
 		return;
 
 	sys_data = syscall_nr_to_meta(syscall_nr);
@@ -450,11 +450,11 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	size = ALIGN(size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
-		      "profile buffer not large enough"))
+	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
+		      "perf buffer not large enough"))
 		return;
 
-	rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size,
+	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
 				sys_data->enter_event->id, &rctx, &flags);
 	if (!rec)
 		return;
@@ -462,10 +462,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	rec->nr = syscall_nr;
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
-	ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags, regs);
+	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
 }
 
-int prof_sysenter_enable(struct ftrace_event_call *call)
+int perf_sysenter_enable(struct ftrace_event_call *call)
 {
 	int ret = 0;
 	int num;
@@ -473,34 +473,34 @@ int prof_sysenter_enable(struct ftrace_event_call *call)
 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
 	mutex_lock(&syscall_trace_lock);
-	if (!sys_prof_refcount_enter)
-		ret = register_trace_sys_enter(prof_syscall_enter);
+	if (!sys_perf_refcount_enter)
+		ret = register_trace_sys_enter(perf_syscall_enter);
 	if (ret) {
 		pr_info("event trace: Could not activate"
 				"syscall entry trace point");
 	} else {
-		set_bit(num, enabled_prof_enter_syscalls);
-		sys_prof_refcount_enter++;
+		set_bit(num, enabled_perf_enter_syscalls);
+		sys_perf_refcount_enter++;
 	}
 	mutex_unlock(&syscall_trace_lock);
 	return ret;
 }
 
-void prof_sysenter_disable(struct ftrace_event_call *call)
+void perf_sysenter_disable(struct ftrace_event_call *call)
 {
 	int num;
 
 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
 	mutex_lock(&syscall_trace_lock);
-	sys_prof_refcount_enter--;
-	clear_bit(num, enabled_prof_enter_syscalls);
-	if (!sys_prof_refcount_enter)
-		unregister_trace_sys_enter(prof_syscall_enter);
+	sys_perf_refcount_enter--;
+	clear_bit(num, enabled_perf_enter_syscalls);
+	if (!sys_perf_refcount_enter)
+		unregister_trace_sys_enter(perf_syscall_enter);
 	mutex_unlock(&syscall_trace_lock);
 }
 
-static void prof_syscall_exit(struct pt_regs *regs, long ret)
+static void perf_syscall_exit(struct pt_regs *regs, long ret)
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
@@ -510,7 +510,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	int size;
 
 	syscall_nr = syscall_get_nr(current, regs);
-	if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
+	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
 		return;
 
 	sys_data = syscall_nr_to_meta(syscall_nr);
@@ -525,11 +525,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	 * Impossible, but be paranoid with the future
 	 * How to put this check outside runtime?
 	 */
-	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
-		"exit event has grown above profile buffer size"))
+	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
+		"exit event has grown above perf buffer size"))
 		return;
 
-	rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size,
+	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
 				sys_data->exit_event->id, &rctx, &flags);
 	if (!rec)
 		return;
@@ -537,10 +537,10 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
 
-	ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags, regs);
+	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
 }
 
-int prof_sysexit_enable(struct ftrace_event_call *call)
+int perf_sysexit_enable(struct ftrace_event_call *call)
 {
 	int ret = 0;
 	int num;
@@ -548,30 +548,30 @@ int prof_sysexit_enable(struct ftrace_event_call *call)
 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
 	mutex_lock(&syscall_trace_lock);
-	if (!sys_prof_refcount_exit)
-		ret = register_trace_sys_exit(prof_syscall_exit);
+	if (!sys_perf_refcount_exit)
+		ret = register_trace_sys_exit(perf_syscall_exit);
 	if (ret) {
 		pr_info("event trace: Could not activate"
 				"syscall entry trace point");
 	} else {
-		set_bit(num, enabled_prof_exit_syscalls);
-		sys_prof_refcount_exit++;
+		set_bit(num, enabled_perf_exit_syscalls);
+		sys_perf_refcount_exit++;
 	}
 	mutex_unlock(&syscall_trace_lock);
 	return ret;
 }
 
-void prof_sysexit_disable(struct ftrace_event_call *call)
+void perf_sysexit_disable(struct ftrace_event_call *call)
 {
 	int num;
 
 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
 	mutex_lock(&syscall_trace_lock);
-	sys_prof_refcount_exit--;
-	clear_bit(num, enabled_prof_exit_syscalls);
-	if (!sys_prof_refcount_exit)
-		unregister_trace_sys_exit(prof_syscall_exit);
+	sys_perf_refcount_exit--;
+	clear_bit(num, enabled_perf_exit_syscalls);
+	if (!sys_perf_refcount_exit)
+		unregister_trace_sys_exit(perf_syscall_exit);
 	mutex_unlock(&syscall_trace_lock);
 }
 
-- 
1.6.2.3



* [PATCH 2/2] perf: Walk through the relevant events only
  2010-03-05  7:00 [PATCH 1/2] perf: Drop the obsolete profile naming for trace events Frederic Weisbecker
@ 2010-03-05  7:00 ` Frederic Weisbecker
  2010-03-05  9:39   ` Peter Zijlstra
  0 siblings, 1 reply; 13+ messages in thread
From: Frederic Weisbecker @ 2010-03-05  7:00 UTC (permalink / raw)
  To: LKML
  Cc: LKML, Frederic Weisbecker, Ingo Molnar, Peter Zijlstra,
	Paul Mackerras, Steven Rostedt, Masami Hiramatsu, Jason Baron,
	Arnaldo Carvalho de Melo

Each time a trace event triggers, we walk through the entire
list of events from the active contexts to find the perf events
that match the current one.

This is wasteful. To solve this, we maintain a per-cpu list of
the active perf events for each running trace event and we
commit directly to these.
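
A minimal standalone sketch of the idea (plain C, not kernel code; the
names trace_event, perf_sub, subscribe and dispatch are invented for
illustration): each trace event keeps one list of subscribed perf events
per CPU, so the dispatch path only walks the events that actually care
about this tracepoint on this CPU.

#include <stdio.h>

#define NR_CPUS 4

struct perf_sub {
	const char	*name;
	struct perf_sub	*next;			/* next subscriber on this CPU */
};

struct trace_event {
	const char	*name;
	struct perf_sub	*per_cpu[NR_CPUS];	/* active subscribers, per CPU */
};

/* Register a perf event on one CPU's list for this trace event. */
static void subscribe(struct trace_event *ev, struct perf_sub *sub, int cpu)
{
	sub->next = ev->per_cpu[cpu];
	ev->per_cpu[cpu] = sub;
}

/* The trace event fired on @cpu: walk only the relevant subscribers. */
static void dispatch(struct trace_event *ev, int cpu)
{
	struct perf_sub *sub;

	for (sub = ev->per_cpu[cpu]; sub; sub = sub->next)
		printf("%s -> %s (cpu %d)\n", ev->name, sub->name, cpu);
}

int main(void)
{
	struct trace_event sched_switch = { .name = "sched_switch" };
	struct perf_sub a = { .name = "perf event A" };
	struct perf_sub b = { .name = "perf event B" };

	subscribe(&sched_switch, &a, 0);
	subscribe(&sched_switch, &b, 1);

	dispatch(&sched_switch, 0);	/* only A is walked, B lives on cpu 1 */
	return 0;
}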

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 include/linux/ftrace_event.h    |   15 ++++++----
 include/linux/perf_event.h      |    9 +++++-
 include/trace/ftrace.h          |    4 +-
 kernel/perf_event.c             |   58 ++++++++++++++++++++++-----------------
 kernel/trace/trace_event_perf.c |   52 +++++++++++++++++++++++++++++++---
 kernel/trace/trace_kprobe.c     |    7 +++--
 kernel/trace/trace_syscalls.c   |    9 +++++-
 7 files changed, 109 insertions(+), 45 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 7ee157d..b39e8d5 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -132,6 +132,7 @@ struct ftrace_event_call {
 	void			*mod;
 	void			*data;
 
+	struct list_head	__percpu *perf_list;
 	int			perf_refcount;
 	int			(*perf_event_enable)(struct ftrace_event_call *);
 	void			(*perf_event_disable)(struct ftrace_event_call *);
@@ -191,7 +192,7 @@ struct perf_event;
 
 DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
 
-extern int perf_trace_enable(int event_id);
+extern int perf_trace_enable(int event_id, struct perf_event *perf_event);
 extern void perf_trace_disable(int event_id);
 extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 				     char *filter_str);
@@ -201,15 +202,17 @@ perf_trace_buf_prepare(int size, unsigned short type, int *rctxp,
 			 unsigned long *irq_flags);
 
 static inline void
-perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
-		       u64 count, unsigned long irq_flags, struct pt_regs *regs)
+perf_trace_buf_submit(struct ftrace_event_call *event, void *raw_data, int size,
+		      int rctx, u64 addr, u64 count, unsigned long irq_flags,
+		      struct pt_regs *regs)
 {
-	struct trace_entry *entry = raw_data;
-
-	perf_tp_event(entry->type, addr, count, raw_data, size, regs);
+	perf_tp_event(event, addr, count, raw_data, size, regs);
 	perf_swevent_put_recursion_context(rctx);
 	local_irq_restore(irq_flags);
 }
+
+extern int perf_trace_sched_in(struct perf_event *event);
+extern void perf_trace_sched_out(struct perf_event *event);
 #endif
 
 #endif /* _LINUX_FTRACE_EVENT_H */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e35ad6f..8a9e38f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -490,6 +490,8 @@ struct hw_perf_event {
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		/* breakpoint */
 		struct arch_hw_breakpoint	info;
+		/* tracepoint */
+		struct ftrace_event_call	*trace_event;
 #endif
 	};
 	atomic64_t			prev_count;
@@ -578,6 +580,7 @@ struct perf_event {
 	struct list_head		group_entry;
 	struct list_head		event_entry;
 	struct list_head		sibling_list;
+	struct list_head		trace_list;
 	int				nr_siblings;
 	int				group_flags;
 	struct perf_event		*group_leader;
@@ -897,8 +900,8 @@ extern int sysctl_perf_event_mlock;
 extern int sysctl_perf_event_sample_rate;
 
 extern void perf_event_init(void);
-extern void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
-			  int entry_size, struct pt_regs *regs);
+extern void perf_tp_event(struct ftrace_event_call *fevent, u64 addr, u64 count,
+			  void *record, int entry_size, struct pt_regs *regs);
 extern void perf_bp_event(struct perf_event *event, void *data);
 
 #ifndef perf_misc_flags
@@ -917,6 +920,8 @@ extern int perf_swevent_get_recursion_context(void);
 extern void perf_swevent_put_recursion_context(int rctx);
 extern void perf_event_enable(struct perf_event *event);
 extern void perf_event_disable(struct perf_event *event);
+extern int perf_swevent_enable(struct perf_event *event);
+extern void perf_swevent_disable(struct perf_event *event);
 #else
 static inline void
 perf_event_task_sched_in(struct task_struct *task)			{ }
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index e92f037..dbd5e1f 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -877,8 +877,8 @@ perf_trace_templ_##call(struct ftrace_event_call *event_call,		\
 	__regs = &__get_cpu_var(perf_trace_regs);			\
 	perf_save_regs(__regs, 2);					\
 									\
-	perf_trace_buf_submit(entry, __entry_size, rctx, __addr,	\
-			       __count, irq_flags, __regs);		\
+	perf_trace_buf_submit(event_call, entry, __entry_size, rctx,	\
+			      __addr, __count, irq_flags, __regs);	\
 }
 
 #undef DEFINE_EVENT
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 4fa24a5..87a7048 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4002,9 +4002,6 @@ static int perf_swevent_is_counting(struct perf_event *event)
 	return 1;
 }
 
-static int perf_tp_event_match(struct perf_event *event,
-				struct perf_sample_data *data);
-
 static int perf_exclude_event(struct perf_event *event,
 			      struct pt_regs *regs)
 {
@@ -4040,10 +4037,6 @@ static int perf_swevent_match(struct perf_event *event,
 	if (perf_exclude_event(event, regs))
 		return 0;
 
-	if (event->attr.type == PERF_TYPE_TRACEPOINT &&
-	    !perf_tp_event_match(event, data))
-		return 0;
-
 	return 1;
 }
 
@@ -4140,7 +4133,7 @@ static void perf_swevent_read(struct perf_event *event)
 {
 }
 
-static int perf_swevent_enable(struct perf_event *event)
+int perf_swevent_enable(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 
@@ -4151,7 +4144,7 @@ static int perf_swevent_enable(struct perf_event *event)
 	return 0;
 }
 
-static void perf_swevent_disable(struct perf_event *event)
+void perf_swevent_disable(struct perf_event *event)
 {
 }
 
@@ -4339,9 +4332,21 @@ static const struct pmu perf_ops_task_clock = {
 
 #ifdef CONFIG_EVENT_TRACING
 
-void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
-		   int entry_size, struct pt_regs *regs)
+static int perf_tp_event_match(struct perf_event *event,
+				struct perf_sample_data *data)
+{
+	void *record = data->raw->data;
+
+	if (likely(!event->filter) || filter_match_preds(event->filter, record))
+		return 1;
+	return 0;
+}
+
+void perf_tp_event(struct ftrace_event_call *fevent, u64 addr, u64 count,
+		   void *record, int entry_size, struct pt_regs *regs)
 {
+	struct list_head *list;
+	struct perf_event *event;
 	struct perf_raw_record raw = {
 		.size = entry_size,
 		.data = record,
@@ -4352,27 +4357,30 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 		.raw = &raw,
 	};
 
-	/* Trace events already protected against recursion */
-	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
-			 &data, regs);
-}
-EXPORT_SYMBOL_GPL(perf_tp_event);
+	list = per_cpu_ptr(fevent->perf_list, smp_processor_id());
 
-static int perf_tp_event_match(struct perf_event *event,
-				struct perf_sample_data *data)
-{
-	void *record = data->raw->data;
+	list_for_each_entry(event, list, trace_list) {
+		if (perf_exclude_event(event, regs))
+			continue;
 
-	if (likely(!event->filter) || filter_match_preds(event->filter, record))
-		return 1;
-	return 0;
+		if (perf_tp_event_match(event, &data))
+			perf_swevent_add(event, count, 1, &data, regs);
+	}
 }
+EXPORT_SYMBOL_GPL(perf_tp_event);
 
 static void tp_perf_event_destroy(struct perf_event *event)
 {
 	perf_trace_disable(event->attr.config);
 }
 
+static const struct pmu perf_ops_tp = {
+	.enable		= perf_trace_sched_in,
+	.disable	= perf_trace_sched_out,
+	.read		= perf_swevent_read,
+	.unthrottle	= perf_swevent_unthrottle,
+};
+
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 {
 	/*
@@ -4384,12 +4392,12 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
 			!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	if (perf_trace_enable(event->attr.config))
+	if (perf_trace_enable(event->attr.config, event))
 		return NULL;
 
 	event->destroy = tp_perf_event_destroy;
 
-	return &perf_ops_generic;
+	return &perf_ops_tp;
 }
 
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index f315b12..2bf254c 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -22,6 +22,7 @@ static int	total_ref_count;
 static int perf_trace_event_enable(struct ftrace_event_call *event)
 {
 	char *buf;
+	int cpu;
 	int ret = -ENOMEM;
 
 	if (event->perf_refcount++ > 0)
@@ -41,12 +42,26 @@ static int perf_trace_event_enable(struct ftrace_event_call *event)
 		rcu_assign_pointer(perf_trace_buf_nmi, buf);
 	}
 
+	event->perf_list = alloc_percpu(struct list_head);
+	if (!event->perf_list)
+		goto fail_buf_nmi;
+
+	for_each_online_cpu(cpu)
+		INIT_LIST_HEAD(per_cpu_ptr(event->perf_list, cpu));
+
+	/* Ensure the lists are visible before starting the events */
+	smp_mb();
+
 	ret = event->perf_event_enable(event);
-	if (!ret) {
-		total_ref_count++;
-		return 0;
-	}
+	if (ret)
+		goto fail_enable;
 
+	total_ref_count++;
+
+	return 0;
+
+fail_enable:
+	free_percpu(event->perf_list);
 fail_buf_nmi:
 	if (!total_ref_count) {
 		free_percpu(perf_trace_buf_nmi);
@@ -60,7 +75,7 @@ fail_buf:
 	return ret;
 }
 
-int perf_trace_enable(int event_id)
+int perf_trace_enable(int event_id, struct perf_event *perf_event)
 {
 	struct ftrace_event_call *event;
 	int ret = -EINVAL;
@@ -70,6 +85,8 @@ int perf_trace_enable(int event_id)
 		if (event->id == event_id && event->perf_event_enable &&
 		    try_module_get(event->mod)) {
 			ret = perf_trace_event_enable(event);
+			if (!ret)
+				perf_event->hw.trace_event = event;
 			break;
 		}
 	}
@@ -102,6 +119,7 @@ static void perf_trace_event_disable(struct ftrace_event_call *event)
 
 		free_percpu(buf);
 		free_percpu(nmi_buf);
+		free_percpu(event->perf_list);
 	}
 }
 
@@ -120,6 +138,30 @@ void perf_trace_disable(int event_id)
 	mutex_unlock(&event_mutex);
 }
 
+/*
+ * No need to protect the per cpu list of events.
+ * This is only changed locally without interrupts to
+ * race against.
+ */
+int perf_trace_sched_in(struct perf_event *event)
+{
+	struct ftrace_event_call *call = event->hw.trace_event;
+	struct list_head __percpu *head;
+
+	head = per_cpu_ptr(call->perf_list, smp_processor_id());
+
+	list_add_tail(&event->trace_list, head);
+	perf_swevent_enable(event);
+
+	return 0;
+}
+
+void perf_trace_sched_out(struct perf_event *event)
+{
+	list_del(&event->trace_list);
+	perf_swevent_disable(event);
+}
+
 __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
 				       int *rctxp, unsigned long *irq_flags)
 {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 41ef5fa..1184f48 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1260,7 +1260,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
 	for (i = 0; i < tp->nr_args; i++)
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 
-	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
+	perf_trace_buf_submit(call, entry, size, rctx, entry->ip, 1,
+			      irq_flags, regs);
 }
 
 /* Kretprobe profile handler */
@@ -1291,8 +1292,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
 	for (i = 0; i < tp->nr_args; i++)
 		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
 
-	perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
-			       irq_flags, regs);
+	perf_trace_buf_submit(call, entry, size, rctx, entry->ret_ip, 1,
+			      irq_flags, regs);
 }
 
 static int probe_perf_enable(struct ftrace_event_call *call)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 99cc45f..ed7d175 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -431,6 +431,7 @@ static int sys_perf_refcount_exit;
 static void perf_syscall_enter(struct pt_regs *regs, long id)
 {
 	struct syscall_metadata *sys_data;
+	struct ftrace_event_call *event;
 	struct syscall_trace_enter *rec;
 	unsigned long flags;
 	int syscall_nr;
@@ -462,7 +463,8 @@ static void perf_syscall_enter(struct pt_regs *regs, long id)
 	rec->nr = syscall_nr;
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
+	event = sys_data->enter_event;
+	perf_trace_buf_submit(event, rec, size, rctx, 0, 1, flags, regs);
 }
 
 int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -503,6 +505,7 @@ void perf_sysenter_disable(struct ftrace_event_call *call)
 static void perf_syscall_exit(struct pt_regs *regs, long ret)
 {
 	struct syscall_metadata *sys_data;
+	struct ftrace_event_call *event;
 	struct syscall_trace_exit *rec;
 	unsigned long flags;
 	int syscall_nr;
@@ -537,7 +540,9 @@ static void perf_syscall_exit(struct pt_regs *regs, long ret)
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
 
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
+	event = sys_data->exit_event;
+
+	perf_trace_buf_submit(event, rec, size, rctx, 0, 1, flags, regs);
 }
 
 int perf_sysexit_enable(struct ftrace_event_call *call)
-- 
1.6.2.3



* Re: [PATCH 2/2] perf: Walk through the relevant events only
  2010-03-05  7:00 ` [PATCH 2/2] perf: Walk through the relevant events only Frederic Weisbecker
@ 2010-03-05  9:39   ` Peter Zijlstra
  2010-03-05 17:03     ` Frederic Weisbecker
  2010-03-08 18:35     ` [RFC PATCH] perf: Store relevant events in a hlist Frederic Weisbecker
  0 siblings, 2 replies; 13+ messages in thread
From: Peter Zijlstra @ 2010-03-05  9:39 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: LKML, Ingo Molnar, Paul Mackerras, Steven Rostedt,
	Masami Hiramatsu, Jason Baron, Arnaldo Carvalho de Melo

On Fri, 2010-03-05 at 08:00 +0100, Frederic Weisbecker wrote:
> Each time a trace event triggers, we walk through the entire
> list of events from the active contexts to find the perf events
> that match the current one.
> 
> This is wasteful. To solve this, we maintain a per cpu list of
> the active perf events for each running trace events and we
> directly commit to these.

Right, so this seems a little trace specific. I once thought about using
a hash table to do this for all software events. It also keeps it all
nicely inside perf_event.[ch].
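
A minimal standalone sketch of that suggestion (plain C, not kernel code;
the names, the toy hash and the table size are invented for illustration):
key every software event by (type, id), hash that pair into a small table,
and let the generic swevent path walk only the matching bucket.

#include <stdint.h>
#include <stdio.h>

#define HASH_BITS	6
#define HASH_SIZE	(1 << HASH_BITS)

struct sw_event {
	uint32_t	type;
	uint64_t	id;
	const char	*name;
	struct sw_event	*next;		/* next event in the same bucket */
};

static struct sw_event *buckets[HASH_SIZE];

static unsigned int hash_type_id(uint32_t type, uint64_t id)
{
	/* toy multiplicative hash; the kernel would use hash_64() or similar */
	uint64_t h = (id ^ type) * 0x9E3779B97F4A7C15ULL;

	return (unsigned int)(h >> (64 - HASH_BITS));
}

static void add_event(struct sw_event *ev)
{
	unsigned int b = hash_type_id(ev->type, ev->id);

	ev->next = buckets[b];
	buckets[b] = ev;
}

/* A software event of (type, id) fired: walk only the matching bucket. */
static void swevent_hit(uint32_t type, uint64_t id)
{
	struct sw_event *ev;

	for (ev = buckets[hash_type_id(type, id)]; ev; ev = ev->next)
		if (ev->type == type && ev->id == id)
			printf("deliver to %s\n", ev->name);
}

int main(void)
{
	struct sw_event faults = { .type = 1, .id = 2, .name = "page-faults counter" };

	add_event(&faults);
	swevent_hit(1, 2);
	return 0;
}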





* Re: [PATCH 2/2] perf: Walk through the relevant events only
  2010-03-05  9:39   ` Peter Zijlstra
@ 2010-03-05 17:03     ` Frederic Weisbecker
  2010-03-05 17:20       ` Peter Zijlstra
  2010-03-08 18:35     ` [RFC PATCH] perf: Store relevant events in a hlist Frederic Weisbecker
  1 sibling, 1 reply; 13+ messages in thread
From: Frederic Weisbecker @ 2010-03-05 17:03 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, Ingo Molnar, Paul Mackerras, Steven Rostedt,
	Masami Hiramatsu, Jason Baron, Arnaldo Carvalho de Melo

On Fri, Mar 05, 2010 at 10:39:29AM +0100, Peter Zijlstra wrote:
> On Fri, 2010-03-05 at 08:00 +0100, Frederic Weisbecker wrote:
> > Each time a trace event triggers, we walk through the entire
> > list of events from the active contexts to find the perf events
> > that match the current one.
> > 
> > This is wasteful. To solve this, we maintain a per cpu list of
> > the active perf events for each running trace events and we
> > directly commit to these.
> 
> Right, so this seems a little trace specific. I once thought about using
> a hash table to do this for all software events. It also keeps it all
> nicely inside perf_event.[ch].


Right. We could have a per-cpu type:event_id based hlist that would
cover trace events and other software events.

That would do the trick more generically wrt perf.

Now, isn't the problem rather that most of the swevents
should be tracepoints?

This is the case for most of them. Only PERF_COUNT_SW_CPU_CLOCK
and PERF_COUNT_SW_TASK_CLOCK seem to be the exception, and they
manage their own path by calling perf_event_overflow() directly.

And as you can guess, turning them into tracepoints would benefit
everyone. We'll have interesting trace events in fault paths,
and we won't have zillions of hooks in the same place (in the context
switch, we have the usual tracepoint plus the perf call).
And eventually the off-case is better optimized, and further
optimizations there (jmp/nop patching, whatever) will propagate
to all tracepoint users.

Finally, we would have only one path to maintain for the swevents.



* Re: [PATCH 2/2] perf: Walk through the relevant events only
  2010-03-05 17:03     ` Frederic Weisbecker
@ 2010-03-05 17:20       ` Peter Zijlstra
  2010-03-05 17:33         ` Frederic Weisbecker
  0 siblings, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2010-03-05 17:20 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: LKML, Ingo Molnar, Paul Mackerras, Steven Rostedt,
	Masami Hiramatsu, Jason Baron, Arnaldo Carvalho de Melo

On Fri, 2010-03-05 at 18:03 +0100, Frederic Weisbecker wrote:
> 
> Now isn't the problem more in the fact that most of the swevents
> should be tracepoints? 

No, different interface, and I don't want to require TRACE=y, I already
utterly hate that x86 requires PERF=y.



* Re: [PATCH 2/2] perf: Walk through the relevant events only
  2010-03-05 17:20       ` Peter Zijlstra
@ 2010-03-05 17:33         ` Frederic Weisbecker
  2010-03-05 17:39           ` Peter Zijlstra
  0 siblings, 1 reply; 13+ messages in thread
From: Frederic Weisbecker @ 2010-03-05 17:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, Ingo Molnar, Paul Mackerras, Steven Rostedt,
	Masami Hiramatsu, Jason Baron, Arnaldo Carvalho de Melo

On Fri, Mar 05, 2010 at 06:20:29PM +0100, Peter Zijlstra wrote:
> On Fri, 2010-03-05 at 18:03 +0100, Frederic Weisbecker wrote:
> > 
> > Now isn't the problem more in the fact that most of the swevents
> > should be tracepoints? 
> 
> No, different interface, and I don't want to require TRACE=y, I already
> utterly hate that x86 requires PERF=y.
> 

This could be reduced to the strict minimum, say CONFIG_TRACEPOINT
and some code around it just to support the event ids.

Software events could be made optional too.

I just don't like this multiplication of probe points of different
natures at a single place. That's wasteful.


> I already
> utterly hate that x86 requires PERF=y.


Me too, and it's my bad, so me double too. Sometimes I think
we should make BREAKPOINTs optional, default y. I just don't know
if something like this that has always been builtin can be made
optional.



* Re: [PATCH 2/2] perf: Walk through the relevant events only
  2010-03-05 17:33         ` Frederic Weisbecker
@ 2010-03-05 17:39           ` Peter Zijlstra
  2010-03-05 17:46             ` Frederic Weisbecker
  0 siblings, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2010-03-05 17:39 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: LKML, Ingo Molnar, Paul Mackerras, Steven Rostedt,
	Masami Hiramatsu, Jason Baron, Arnaldo Carvalho de Melo

On Fri, 2010-03-05 at 18:33 +0100, Frederic Weisbecker wrote:
> On Fri, Mar 05, 2010 at 06:20:29PM +0100, Peter Zijlstra wrote:
> > On Fri, 2010-03-05 at 18:03 +0100, Frederic Weisbecker wrote:
> > > 
> > > Now isn't the problem more in the fact that most of the swevents
> > > should be tracepoints? 
> > 
> > No, different interface, and I don't want to require TRACE=y, I already
> > utterly hate that x86 requires PERF=y.
> > 
> 
> This could be reduced to the strict minimum, say CONFIG_TRACEPOINT
> and some code around just to support the event ids.

Can't, software events already are an ABI so we'll have to support that
forever, but sure you can make something that reduces to the current
software event callback on TRACE=n and maps to the right software event
id when TRACE=y.
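
A rough standalone sketch of that compatibility idea (plain C, not
kernel code; every name here is invented, and the real software event
and tracepoint interfaces are not reproduced): the ABI-visible call
stays the same, but it maps onto a tracepoint when tracing support is
built in and falls back to the plain software event callback otherwise.

#include <stdio.h>

/* #define HAVE_TRACEPOINTS 1 */	/* stands in for TRACE=y */

static void sw_event_fallback(int id)
{
	printf("software event %d: direct callback path\n", id);
}

#ifdef HAVE_TRACEPOINTS
static void trace_sw_event(int id)
{
	printf("software event %d: routed through a tracepoint\n", id);
}
#define count_sw_event(id)	trace_sw_event(id)
#else
#define count_sw_event(id)	sw_event_fallback(id)
#endif

int main(void)
{
	count_sw_event(3);	/* same call either way; the ABI stays stable */
	return 0;
}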

> Software events could be made optionals too.

Sure, but they're nowhere near as much code as tracepoints.

> > I already
> > utterly hate that x86 requires PERF=y.
> 
> 
> Me too, and it's my bad, so me double too. Sometimes I think
> we should make BREAKPOINTs optional, default y. I just don't know
> if something like this that has always been builtin can be made
> optional.

Simply for build testing that would be useful, we could make it an
embedded switch.



* Re: [PATCH 2/2] perf: Walk through the relevant events only
  2010-03-05 17:39           ` Peter Zijlstra
@ 2010-03-05 17:46             ` Frederic Weisbecker
  0 siblings, 0 replies; 13+ messages in thread
From: Frederic Weisbecker @ 2010-03-05 17:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, Ingo Molnar, Paul Mackerras, Steven Rostedt,
	Masami Hiramatsu, Jason Baron, Arnaldo Carvalho de Melo

On Fri, Mar 05, 2010 at 06:39:12PM +0100, Peter Zijlstra wrote:
> On Fri, 2010-03-05 at 18:33 +0100, Frederic Weisbecker wrote:
> > On Fri, Mar 05, 2010 at 06:20:29PM +0100, Peter Zijlstra wrote:
> > > On Fri, 2010-03-05 at 18:03 +0100, Frederic Weisbecker wrote:
> > > > 
> > > > Now isn't the problem more in the fact that most of the swevents
> > > > should be tracepoints? 
> > > 
> > > No, different interface, and I don't want to require TRACE=y, I already
> > > utterly hate that x86 requires PERF=y.
> > > 
> > 
> > This could be reduced to the strict minimum, say CONFIG_TRACEPOINT
> > and some code around just to support the event ids.
> 
> Can't, software events already are an ABI so we'll have to support that
> forever, but sure you can make something that reduces to the current
> software event callback on TRACE=n and maps to the right software event
> id when TRACE=y.


That looks feasible. I may try something like that.


> 
> > Software events could be made optionals too.
> 
> Sure, but they're nowhere near as much code as tracepoints.


Yeah but they have the overhead of an off case to handle, something
that can be turned off if we have only perf for breakpoints.


> 
> > > I already
> > > utterly hate that x86 requires PERF=y.
> > 
> > 
> > Me too, and it's my bad, so me double too. Sometimes I think
> > we should make BREAKPOINTs optional, default y. I just don't know
> > if something like this that has always been builtin can be made
> > optional.
> 
> Simply for build testing that would be useful, we could make it an
> embedded switch.


Agreed.

Ok, I'll work toward an hlist version for the initial topic.
Thanks.



* [RFC PATCH] perf: Store relevant events in a hlist
  2010-03-05  9:39   ` Peter Zijlstra
  2010-03-05 17:03     ` Frederic Weisbecker
@ 2010-03-08 18:35     ` Frederic Weisbecker
  2010-03-10 19:34       ` Peter Zijlstra
  1 sibling, 1 reply; 13+ messages in thread
From: Frederic Weisbecker @ 2010-03-08 18:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, Ingo Molnar, Paul Mackerras, Steven Rostedt,
	Masami Hiramatsu, Jason Baron, Arnaldo Carvalho de Melo

On Fri, Mar 05, 2010 at 10:39:29AM +0100, Peter Zijlstra wrote:
> On Fri, 2010-03-05 at 08:00 +0100, Frederic Weisbecker wrote:
> > Each time a trace event triggers, we walk through the entire
> > list of events from the active contexts to find the perf events
> > that match the current one.
> > 
> > This is wasteful. To solve this, we maintain a per cpu list of
> > the active perf events for each running trace events and we
> > directly commit to these.
> 
> Right, so this seems a little trace specific. I once thought about using
> a hash table to do this for all software events. It also keeps it all
> nicely inside perf_event.[ch].


What do you think about this version?
It builds but crashes at runtime and doesn't handle
cpu hotplug yet. Before spending more time on debugging/fixing,
I'd like to know your opinion about the general architecture.

Thanks.

---
From c389b296a87bd38cfd28da3124508eb1e5a5d553 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 8 Mar 2010 19:04:02 +0100
Subject: [PATCH] perf: Store relevant events in a hlist

When a software/tracepoint event triggers, we walk through the
entire event lists of the current cpu and task contexts to
retrieve the events that are concerned.

This is wasteful. This patch proposes a hashlist to walk through
the relevant events only. The hash is calculated from the id of
the event (it could be further optimized using the type of the
event too). Each hash bucket maps to a list of distinct type:id
nodes that match the hash. Each of these type:id nodes carries
the list of the active events matching that type:id.

-----------------------
Hash 1 | Hash 2 | ....
----------------------
  |
 swevent type:id node
  |       |
  |       ------- event 1 ---- event 2 ---- ....
  |
 swevent type:id node
          |
          --------[...]

The hashlist is per cpu (attached to perf_cpu_context) and the
lists gather the events that are active in the cpu and task
contexts. No more per-event checks are needed to guess if
the events are "counting" or "matching".

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/linux/perf_event.h |   12 ++
 kernel/perf_event.c        |  294 ++++++++++++++++++++++++++++++++++---------
 2 files changed, 244 insertions(+), 62 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index c859bee..bdd9794 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -578,6 +578,7 @@ struct perf_event {
 	struct list_head		group_entry;
 	struct list_head		event_entry;
 	struct list_head		sibling_list;
+	struct list_head		swevent_list;
 	int				nr_siblings;
 	int				group_flags;
 	struct perf_event		*group_leader;
@@ -716,6 +717,14 @@ struct perf_event_context {
 	struct rcu_head			rcu_head;
 };
 
+#define PERF_SW_HLIST_BITS	10		/* 1024 entries */
+#define PERF_SW_HLIST_SIZE	1 << PERF_SW_HLIST_BITS
+
+struct swevents_hlist {
+	struct rcu_head			rcu_head;
+	struct hlist_head		hlist[PERF_SW_HLIST_SIZE];
+};
+
 /**
  * struct perf_event_cpu_context - per cpu event context structure
  */
@@ -732,6 +741,9 @@ struct perf_cpu_context {
 	 * task, softirq, irq, nmi context
 	 */
 	int				recursion[4];
+	struct swevents_hlist		*swevents_hlist;
+	struct mutex			hlist_mutex;
+	struct kref			hlist_refcount;
 };
 
 struct perf_output_handle {
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a97c1b1..50dabea 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -15,6 +15,7 @@
 #include <linux/smp.h>
 #include <linux/file.h>
 #include <linux/poll.h>
+#include <linux/hash.h>
 #include <linux/sysfs.h>
 #include <linux/dcache.h>
 #include <linux/percpu.h>
@@ -3972,34 +3973,210 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 	perf_swevent_overflow(event, 0, nmi, data, regs);
 }
 
-static int perf_swevent_is_counting(struct perf_event *event)
+struct swevent_node {
+	enum perf_type_id	type;
+	__u64			id;
+	struct hlist_node	node;
+	struct list_head	events;
+	struct kref		refcount;
+	struct rcu_head		rcu_head;
+};
+
+static __u64 sw_event_hash(enum perf_type_id type, __u64 id)
 {
-	/*
-	 * The event is active, we're good!
-	 */
-	if (event->state == PERF_EVENT_STATE_ACTIVE)
-		return 1;
+	return hash_64(id, PERF_SW_HLIST_BITS);
+}
 
-	/*
-	 * The event is off/error, not counting.
-	 */
-	if (event->state != PERF_EVENT_STATE_INACTIVE)
-		return 0;
+static struct swevent_node *
+__search_swevent_node(__u64 hash, enum perf_type_id type, __u64 id,
+		      struct perf_cpu_context *cpuctx)
+{
+	struct swevent_node *sw_node;
+	struct hlist_node *node;
+	struct hlist_head *head;
 
-	/*
-	 * The event is inactive, if the context is active
-	 * we're part of a group that didn't make it on the 'pmu',
-	 * not counting.
-	 */
-	if (event->ctx->is_active)
-		return 0;
+	head = &cpuctx->swevents_hlist->hlist[hash];
 
-	/*
-	 * We're inactive and the context is too, this means the
-	 * task is scheduled out, we're counting events that happen
-	 * to us, like migration events.
-	 */
-	return 1;
+	hlist_for_each_entry_rcu(sw_node, node, head, node) {
+		if (type == sw_node->type && id == sw_node->id)
+			return sw_node;
+	}
+
+	return NULL;
+}
+
+static struct swevent_node *
+search_swevent_node(enum perf_type_id type, __u64 id,
+		    struct perf_cpu_context *cpuctx)
+{
+	__u64 hash = sw_event_hash(type, id);
+
+	return __search_swevent_node(hash, type, id, cpuctx);
+}
+
+static int search_add_swevent_node(enum perf_type_id type, __u64 id,
+				   struct perf_cpu_context *cpuctx)
+{
+	__u64 hash = sw_event_hash(type, id);
+	struct swevent_node *sw_node;
+
+	sw_node = __search_swevent_node(hash, type, id, cpuctx);
+	if (sw_node)
+		goto done;
+
+	sw_node = kmalloc(sizeof(*sw_node), GFP_KERNEL);
+	if (!sw_node)
+		return -ENOMEM;
+
+	sw_node->type = type;
+	sw_node->id = id;
+	kref_init(&sw_node->refcount);
+	hlist_add_head_rcu(&sw_node->node, &cpuctx->swevents_hlist->hlist[hash]);
+
+ done:
+	kref_get(&sw_node->refcount);
+
+	return 0;
+}
+
+static void release_swevent_hlist_rcu(struct rcu_head *rcu_head)
+{
+	struct swevents_hlist *hlist;
+
+	hlist = container_of(rcu_head, struct swevents_hlist, rcu_head);
+	kfree(hlist);
+}
+
+static void release_swevent_hlist(struct kref *kref)
+{
+	struct perf_cpu_context *cpuctx;
+	struct swevents_hlist *hlist;
+
+	cpuctx = container_of(kref, struct perf_cpu_context, hlist_refcount);
+	hlist = cpuctx->swevents_hlist;
+	rcu_assign_pointer(cpuctx->swevents_hlist, NULL);
+	call_rcu(&hlist->rcu_head, release_swevent_hlist_rcu);
+}
+
+static int add_swevent_hlist_cpu(struct perf_event *event,
+				 struct perf_cpu_context *cpuctx)
+{
+	int err = 0;
+	struct swevents_hlist *hlist;
+
+	mutex_lock(&cpuctx->hlist_mutex);
+
+	if (!cpuctx->swevents_hlist) {
+		hlist = kzalloc(sizeof(*cpuctx->swevents_hlist), GFP_KERNEL);
+		if (!hlist) {
+			err = -ENOMEM;
+			goto end;
+		}
+		rcu_assign_pointer(cpuctx->swevents_hlist, hlist);
+	}
+
+	kref_get(&cpuctx->hlist_refcount);
+
+	err = search_add_swevent_node(event->attr.type, event->attr.config,
+				      cpuctx);
+	if (err)
+		kref_put(&cpuctx->hlist_refcount, release_swevent_hlist);
+end:
+	mutex_unlock(&cpuctx->hlist_mutex);
+
+	return err;
+}
+
+static void release_swevent_node_rcu(struct rcu_head *rcu_head)
+{
+	struct swevent_node *sw_node;
+
+	sw_node = container_of(rcu_head, struct swevent_node, rcu_head);
+	kfree(sw_node);
+}
+
+static void release_swevent_node(struct kref *kref)
+{
+	struct swevent_node *sw_node;
+
+	sw_node = container_of(kref, struct swevent_node, refcount);
+	hlist_del_rcu(&sw_node->node);
+	call_rcu(&sw_node->rcu_head, release_swevent_node_rcu);
+}
+
+static void remove_swevent_hlist_cpu(struct perf_event *event,
+				    struct perf_cpu_context *cpuctx)
+{
+	struct swevent_node *sw_node;
+
+	mutex_lock(&cpuctx->hlist_mutex);
+
+	if (WARN_ON_ONCE(!cpuctx->swevents_hlist))
+		goto end;
+
+	sw_node = search_swevent_node(event->attr.type, event->attr.config,
+				      cpuctx);
+	if (WARN_ON_ONCE(!sw_node))
+		goto end;
+
+	kref_put(&sw_node->refcount, release_swevent_node);
+	kref_put(&cpuctx->hlist_refcount, release_swevent_hlist);
+end:
+	mutex_unlock(&cpuctx->hlist_mutex);
+}
+
+static int add_swevent_hlist(struct perf_event *event)
+{
+	int cpu, fail_cpu;
+	int err;
+	struct perf_cpu_context *cpuctx;
+
+	get_online_cpus();
+
+	if (event->cpu == -1) {
+		for_each_online_cpu(cpu) {
+			cpuctx = &per_cpu(perf_cpu_context, cpu);
+			err = add_swevent_hlist_cpu(event, cpuctx);
+			if (err) {
+				fail_cpu = cpu;
+				goto fail;
+			}
+		}
+	} else {
+		cpu = event->cpu;
+		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		err = add_swevent_hlist_cpu(event, cpuctx);
+		goto end;
+	}
+
+	goto end;
+
+fail:
+	for_each_online_cpu(cpu) {
+		if (cpu == fail_cpu)
+			break;
+		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		remove_swevent_hlist_cpu(event, cpuctx);
+	}
+end:
+	put_online_cpus();
+
+	return err;
+}
+
+static void remove_swevent_hlist(struct perf_event *event)
+{
+	int cpu;
+	struct perf_cpu_context *cpuctx;
+
+	if (event->cpu == -1) {
+		for_each_online_cpu(cpu) {
+			cpuctx = &per_cpu(perf_cpu_context, cpu);
+			remove_swevent_hlist_cpu(event, cpuctx);
+		}
+	} else {
+		cpu = event->cpu;
+		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		remove_swevent_hlist_cpu(event, cpuctx);
+	}
 }
 
 static int perf_tp_event_match(struct perf_event *event,
@@ -4020,23 +4197,9 @@ static int perf_exclude_event(struct perf_event *event,
 }
 
 static int perf_swevent_match(struct perf_event *event,
-				enum perf_type_id type,
-				u32 event_id,
 				struct perf_sample_data *data,
 				struct pt_regs *regs)
 {
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
-		return 0;
-
-	if (!perf_swevent_is_counting(event))
-		return 0;
-
-	if (event->attr.type != type)
-		return 0;
-
-	if (event->attr.config != event_id)
-		return 0;
-
 	if (perf_exclude_event(event, regs))
 		return 0;
 
@@ -4047,20 +4210,6 @@ static int perf_swevent_match(struct perf_event *event,
 	return 1;
 }
 
-static void perf_swevent_ctx_event(struct perf_event_context *ctx,
-				     enum perf_type_id type,
-				     u32 event_id, u64 nr, int nmi,
-				     struct perf_sample_data *data,
-				     struct pt_regs *regs)
-{
-	struct perf_event *event;
-
-	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-		if (perf_swevent_match(event, type, event_id, data, regs))
-			perf_swevent_add(event, nr, nmi, data, regs);
-	}
-}
-
 int perf_swevent_get_recursion_context(void)
 {
 	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
@@ -4102,19 +4251,20 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
 				    struct pt_regs *regs)
 {
 	struct perf_cpu_context *cpuctx;
-	struct perf_event_context *ctx;
+	struct swevent_node *node;
+	struct perf_event *event;
 
 	cpuctx = &__get_cpu_var(perf_cpu_context);
+
 	rcu_read_lock();
-	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
-				 nr, nmi, data, regs);
-	/*
-	 * doesn't really matter which of the child contexts the
-	 * events ends up in.
-	 */
-	ctx = rcu_dereference(current->perf_event_ctxp);
-	if (ctx)
-		perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
+	node = search_swevent_node(type, event_id, cpuctx);
+	if (!node)
+		goto end;
+	list_for_each_entry(event, &node->events, swevent_list) {
+		if (perf_swevent_match(event, data, regs))
+			perf_swevent_add(event, nr, nmi, data, regs);
+	}
+end:
 	rcu_read_unlock();
 }
 
@@ -4143,16 +4293,28 @@ static void perf_swevent_read(struct perf_event *event)
 static int perf_swevent_enable(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
+	struct perf_cpu_context *cpuctx;
+	struct swevent_node *node;
 
 	if (hwc->sample_period) {
 		hwc->last_period = hwc->sample_period;
 		perf_swevent_set_period(event);
 	}
+
+	cpuctx = &__get_cpu_var(perf_cpu_context);
+
+	node = search_swevent_node(event->attr.type, event->attr.config, cpuctx);
+	if (WARN_ON_ONCE(!node))
+		return -EINVAL;
+
+	list_add_tail(&event->swevent_list, &node->events);
+
 	return 0;
 }
 
 static void perf_swevent_disable(struct perf_event *event)
 {
+	list_del(&event->swevent_list);
 }
 
 static const struct pmu perf_ops_generic = {
@@ -4489,10 +4651,13 @@ static void sw_perf_event_destroy(struct perf_event *event)
 	WARN_ON(event->parent);
 
 	atomic_dec(&perf_swevent_enabled[event_id]);
+
+	remove_swevent_hlist(event);
 }
 
 static const struct pmu *sw_perf_event_init(struct perf_event *event)
 {
+	int err;
 	const struct pmu *pmu = NULL;
 	u64 event_id = event->attr.config;
 
@@ -4531,6 +4696,9 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
 			event->destroy = sw_perf_event_destroy;
 		}
 		pmu = &perf_ops_generic;
+		err = add_swevent_hlist(event);
+		if (err)
+			return ERR_PTR(err);
 		break;
 	}
 
@@ -5397,6 +5565,8 @@ static void __cpuinit perf_event_init_cpu(int cpu)
 	struct perf_cpu_context *cpuctx;
 
 	cpuctx = &per_cpu(perf_cpu_context, cpu);
+	mutex_init(&cpuctx->hlist_mutex);
+	kref_init(&cpuctx->hlist_refcount);
 	__perf_event_init_context(&cpuctx->ctx, NULL);
 
 	spin_lock(&perf_resource_lock);
-- 
1.6.2.3





* Re: [RFC PATCH] perf: Store relevant events in a hlist
  2010-03-08 18:35     ` [RFC PATCH] perf: Store relevant events in a hlist Frederic Weisbecker
@ 2010-03-10 19:34       ` Peter Zijlstra
  2010-03-10 20:33         ` Frederic Weisbecker
  0 siblings, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2010-03-10 19:34 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: LKML, Ingo Molnar, Paul Mackerras, Steven Rostedt,
	Masami Hiramatsu, Jason Baron, Arnaldo Carvalho de Melo

On Mon, 2010-03-08 at 19:35 +0100, Frederic Weisbecker wrote:
> On Fri, Mar 05, 2010 at 10:39:29AM +0100, Peter Zijlstra wrote:
> > On Fri, 2010-03-05 at 08:00 +0100, Frederic Weisbecker wrote:
> > > Each time a trace event triggers, we walk through the entire
> > > list of events from the active contexts to find the perf events
> > > that match the current one.
> > > 
> > > This is wasteful. To solve this, we maintain a per cpu list of
> > > the active perf events for each running trace event, and we
> > > directly commit to these.
> > 
> > Right, so this seems a little trace specific. I once thought about using
> > a hash table to do this for all software events. It also keeps it all
> > nicely inside perf_event.[ch].
> 
> 
> What do you think about this version?
> It builds but crashes at runtime and doesn't handle
> cpu hotplug yet. Before spending more time on debugging and fixing,
> I'd like to know your opinion about the general architecture.
> 
> Thanks.
> 
> ---
> From c389b296a87bd38cfd28da3124508eb1e5a5d553 Mon Sep 17 00:00:00 2001
> From: Frederic Weisbecker <fweisbec@gmail.com>
> Date: Mon, 8 Mar 2010 19:04:02 +0100
> Subject: [PATCH] perf: Store relevant events in a hlist
> 
> When a software/tracepoint event triggers, we walk through the
> entire current cpu and task context's events lists to retrieve
> those that are concerned.
> 
> This is wasteful. This patch proposes a hashlist so that we walk
> through the relevant events only. The hash is calculated from the id
> of the event (it could be further optimized by using the type of the
> event too). Each hash bucket holds a list of distinct type:id nodes
> that match the hash, and each type:id node carries the list of the
> active events matching that type:id.
> 
> -----------------------
> Hash 1 | Hash 2 | ....
> ----------------------
>   |
>  swevent type:id node
>   |       |
>   |       ------- event 1 ---- event 2 ---- ....
>   |
>  swevent type:id node
>           |
>           --------[...]
> 
> The hashlist is per cpu (attached to perf_cpu_context), and the
> lists gather the events that are active in the cpu and task
> contexts. No per-event checks are needed anymore to guess whether
> an event is "counting" or "matching".

I'm not quite sure why you need the node thing, you already have a
hash-bucket to iterate, simply stick all events into the one bucket and
walk through it with a filter and process all events that match.

As to all those for_each_online_cpu() thingies, it might make sense to
also have a global hash-table for events active on all cpus,... hmm was
that the reason for the node thing, one event cannot be in multiple
buckets?


* Re: [RFC PATCH] perf: Store relevant events in a hlist
  2010-03-10 19:34       ` Peter Zijlstra
@ 2010-03-10 20:33         ` Frederic Weisbecker
  2010-03-10 20:46           ` Peter Zijlstra
  0 siblings, 1 reply; 13+ messages in thread
From: Frederic Weisbecker @ 2010-03-10 20:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, Ingo Molnar, Paul Mackerras, Steven Rostedt,
	Masami Hiramatsu, Jason Baron, Arnaldo Carvalho de Melo

On Wed, Mar 10, 2010 at 08:34:52PM +0100, Peter Zijlstra wrote:
> I'm not quite sure why you need the node thing, you already have a
> hash-bucket to iterate, simply stick all events into the one bucket and
> walk through it with a filter and process all events that match.


This extra level of indirection was one of my biggest hesitations.
In case of a hash collision, I wanted to make sure the walk stays
amortized O(n) in the number of matching events, at the cost of this
level of indirection. It also removes the type:id check on every
event, since the check is made only once per node.

That said, I guess we can indeed remove the node and put the events
directly in the hash bucket. As long as we handle collisions well,
it should be fine.
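
A minimal sketch of what folding the type into the hash could look like if
the node level goes away, along the lines the changelog already hints at;
the helper name and the mixing scheme are illustrative only:

static u64 swevent_hash(enum perf_type_id type, u64 event_id)
{
	/*
	 * Mix the event type into the key so that, say, a tracepoint id and
	 * a software event id with the same numeric value usually land in
	 * different buckets.
	 */
	return hash_64(((u64)type << 32) ^ event_id, PERF_SW_HLIST_BITS);
}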

 
> As to all those for_each_online_cpu() thingies, it might make sense to
> also have a global hash-table for events active on all cpus,... hmm was
> that the reason for the node thing, one event cannot be in multiple
> buckets?


There are several reasons why I made it per cpu.
With a global hash table for wide events, we would get some cache
line bouncing each time an event is enabled/disabled (which happens
quite often, since wide events are per task; it is even worse if the
initial task has numerous threads that each duplicate the event).
Also, since wide events are per task, an event is only ever active
on one cpu at a time, so it would be wasteful to check it on the
other cpus.
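
The locality argument in code form, condensed from the enable path of the
patch above into a sketch (the function name here is illustrative): when an
event is scheduled in, only the hash table of the local cpu is touched, so
no cache line is shared with the other cpus.

static int swevent_sched_in_local(struct perf_event *event)
{
	/* Table of the local cpu only, never written from another cpu. */
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct swevent_node *node;

	node = search_swevent_node(event->attr.type, event->attr.config,
				   cpuctx);
	if (WARN_ON_ONCE(!node))
		return -EINVAL;

	/* Purely local list: a global table would bounce this line around. */
	list_add_tail(&event->swevent_list, &node->events);

	return 0;
}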



* Re: [RFC PATCH] perf: Store relevant events in a hlist
  2010-03-10 20:33         ` Frederic Weisbecker
@ 2010-03-10 20:46           ` Peter Zijlstra
  2010-03-10 21:04             ` Frederic Weisbecker
  0 siblings, 1 reply; 13+ messages in thread
From: Peter Zijlstra @ 2010-03-10 20:46 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: LKML, Ingo Molnar, Paul Mackerras, Steven Rostedt,
	Masami Hiramatsu, Jason Baron, Arnaldo Carvalho de Melo

On Wed, 2010-03-10 at 21:33 +0100, Frederic Weisbecker wrote:
> On Wed, Mar 10, 2010 at 08:34:52PM +0100, Peter Zijlstra wrote:
> > I'm not quite sure why you need the node thing, you already have a
> > hash-bucket to iterate, simply stick all events into the one bucket and
> > walk through it with a filter and process all events that match.
> 
> 
> This extra level of indirection was one of my biggest hesitations.
> In case of a hash collision, I wanted to make sure the walk stays
> amortized O(n) in the number of matching events, at the cost of this
> level of indirection. It also removes the type:id check on every
> event, since the check is made only once per node.
> 
> That said, I guess we can indeed remove the node and put the events
> directly in the hash bucket. As long as we handle collisions well,
> it should be fine.

Right, let's start simple and go from there.

> > As to all those for_each_online_cpu() thingies, it might make sense to
> > also have a global hash-table for events active on all cpus,... hmm was
> > that the reason for the node thing, one event cannot be in multiple
> > buckets?
> 
> 
> There are several reasons why I made it per cpu.
> With a global hash table for wide events, we would get some cache
> line bouncing each time an event is enabled/disabled (which happens
> quite often, since wide events are per task; it is even worse if the
> initial task has numerous threads that each duplicate the event).
> Also, since wide events are per task, an event is only ever active
> on one cpu at a time, so it would be wasteful to check it on the
> other cpus.

Thing is, most events generated by perf are per cpu, even the per-task
ones; if they are machine wide, the hash table bounces aren't the
biggest problem.

But yeah..


* Re: [RFC PATCH] perf: Store relevant events in a hlist
  2010-03-10 20:46           ` Peter Zijlstra
@ 2010-03-10 21:04             ` Frederic Weisbecker
  0 siblings, 0 replies; 13+ messages in thread
From: Frederic Weisbecker @ 2010-03-10 21:04 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, Ingo Molnar, Paul Mackerras, Steven Rostedt,
	Masami Hiramatsu, Jason Baron, Arnaldo Carvalho de Melo

On Wed, Mar 10, 2010 at 09:46:40PM +0100, Peter Zijlstra wrote:
> On Wed, 2010-03-10 at 21:33 +0100, Frederic Weisbecker wrote:
> > On Wed, Mar 10, 2010 at 08:34:52PM +0100, Peter Zijlstra wrote:
> > > I'm not quite sure why you need the node thing, you already have a
> > > hash-bucket to iterate, simply stick all events into the one bucket and
> > > walk through it with a filter and process all events that match.
> > 
> > 
> > This extra level of indirection was one of my biggest hesitations.
> > In case of a hash collision, I wanted to make sure the walk stays
> > amortized O(n) in the number of matching events, at the cost of this
> > level of indirection. It also removes the type:id check on every
> > event, since the check is made only once per node.
> > 
> > That said, I guess we can indeed remove the node and put the events
> > directly in the hash bucket. As long as we handle collisions well,
> > it should be fine.
> 
> Right, let's start simple and go from there.


Agreed.


 
> > > As to all those for_each_online_cpu() thingies, it might make sense to
> > > also have a global hash-table for events active on all cpus,... hmm was
> > > that the reason for the node thing, one event cannot be in multiple
> > > buckets?
> > 
> > 
> > There are several reasons why I made it per cpu.
> > With a global hash table for wide events, we would get some cache
> > line bouncing each time an event is enabled/disabled (which happens
> > quite often, since wide events are per task; it is even worse if the
> > initial task has numerous threads that each duplicate the event).
> > Also, since wide events are per task, an event is only ever active
> > on one cpu at a time, so it would be wasteful to check it on the
> > other cpus.
> 
> Thing is, most events generated by perf are per cpu, even the per-task
> ones; if they are machine wide, the hash table bounces aren't the
> biggest problem.


Indeed. But it also means a global hlist is going to be wasteful, as
we would walk it twice for every per-task event.

It is also unattractive if people use wide per-task events, because
of the bouncing problems we talked about.

Plus I guess we don't want to maintain both a per-cpu hlist and a
global one.

I understand your worries about complexity, but this is a true win
in any case.

Thanks.




Thread overview: 13+ messages
2010-03-05  7:00 [PATCH 1/2] perf: Drop the obsolete profile naming for trace events Frederic Weisbecker
2010-03-05  7:00 ` [PATCH 2/2] perf: Walk through the relevant events only Frederic Weisbecker
2010-03-05  9:39   ` Peter Zijlstra
2010-03-05 17:03     ` Frederic Weisbecker
2010-03-05 17:20       ` Peter Zijlstra
2010-03-05 17:33         ` Frederic Weisbecker
2010-03-05 17:39           ` Peter Zijlstra
2010-03-05 17:46             ` Frederic Weisbecker
2010-03-08 18:35     ` [RFC PATCH] perf: Store relevant events in a hlist Frederic Weisbecker
2010-03-10 19:34       ` Peter Zijlstra
2010-03-10 20:33         ` Frederic Weisbecker
2010-03-10 20:46           ` Peter Zijlstra
2010-03-10 21:04             ` Frederic Weisbecker
