linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH 0/2] perf: Add persistent event facilities
@ 2012-03-21 14:34 Borislav Petkov
  2012-03-21 14:34 ` [PATCH 1/2] " Borislav Petkov
  2012-03-21 14:34 ` [PATCH 2/2] x86, mce: Add persistent MCE event Borislav Petkov
  0 siblings, 2 replies; 32+ messages in thread
From: Borislav Petkov @ 2012-03-21 14:34 UTC (permalink / raw)
  To: Frederic Weisbecker, Ingo Molnar, Peter Zijlstra, Steven Rostedt
  Cc: LKML, Borislav Petkov

From: Borislav Petkov <borislav.petkov@amd.com>

Hi guys,

this is yet another version of the persistent events patch incorporating
all comments from last time. Highlights in this version are the switch
to the other ring buffer, making the facility generic for usage by other
subsystems not only RAS (patch 2/2 shows an exemplary usage) and a
couple of minor simplifications.

Any comments and suggestions are welcome,
thanks.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [PATCH 1/2] perf: Add persistent event facilities
  2012-03-21 14:34 [RFC PATCH 0/2] perf: Add persistent event facilities Borislav Petkov
@ 2012-03-21 14:34 ` Borislav Petkov
  2012-05-18  9:58   ` Peter Zijlstra
                     ` (4 more replies)
  2012-03-21 14:34 ` [PATCH 2/2] x86, mce: Add persistent MCE event Borislav Petkov
  1 sibling, 5 replies; 32+ messages in thread
From: Borislav Petkov @ 2012-03-21 14:34 UTC (permalink / raw)
  To: Frederic Weisbecker, Ingo Molnar, Peter Zijlstra, Steven Rostedt
  Cc: LKML, Borislav Petkov

From: Borislav Petkov <borislav.petkov@amd.com>

Add a barebones implementation for registering persistent events with
perf. For that, we don't destroy the buffers when they're unmapped;
also, we map them read-only so that multiple agents can access them.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 include/linux/perf_event.h |   24 ++++++++++-
 kernel/events/Makefile     |    2 +-
 kernel/events/core.c       |   25 +++++++++--
 kernel/events/internal.h   |    2 +
 kernel/events/persistent.c |   98 ++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 145 insertions(+), 6 deletions(-)
 create mode 100644 kernel/events/persistent.c

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index abb2776be1ba..9743ba46e12d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -223,8 +223,9 @@ struct perf_event_attr {
 
 				exclude_host   :  1, /* don't count in host   */
 				exclude_guest  :  1, /* don't count in guest  */
+				persistent     :  1, /* always-on event */
 
-				__reserved_1   : 43;
+				__reserved_1   : 42;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -944,6 +945,14 @@ struct perf_output_handle {
 	int				page;
 };
 
+/* Persistent events descriptor */
+struct pers_event_desc {
+	struct perf_event_attr *pattr;
+	struct perf_event *event;
+	const char *dfs_name;
+	struct dentry *dfs_entry;
+};
+
 #ifdef CONFIG_PERF_EVENTS
 
 extern int perf_pmu_register(struct pmu *pmu, char *name, int type);
@@ -1149,6 +1158,11 @@ extern void perf_swevent_put_recursion_context(int rctx);
 extern void perf_event_enable(struct perf_event *event);
 extern void perf_event_disable(struct perf_event *event);
 extern void perf_event_task_tick(void);
+extern int perf_add_persistent_on_cpu(int cpu, struct pers_event_desc *desc,
+				      struct dentry *dfs_dr, unsigned nr_pages);
+extern void perf_rm_persistent_on_cpu(int cpu, struct pers_event_desc *desc);
+extern int perf_persistent_open(struct inode *inode, struct file *file);
+extern const struct file_operations perf_pers_fops;
 #else
 static inline void
 perf_event_task_sched_in(struct task_struct *prev,
@@ -1187,6 +1201,14 @@ static inline void perf_swevent_put_recursion_context(int rctx)		{ }
 static inline void perf_event_enable(struct perf_event *event)		{ }
 static inline void perf_event_disable(struct perf_event *event)		{ }
 static inline void perf_event_task_tick(void)				{ }
+extern int perf_add_persistent_on_cpu(int cpu, struct pers_event_desc *desc,
+					 struct perf_event_attr *pattr,
+					 struct dentry *dfs_dir,
+					 unsigned nr_pages);		{ return -EINVAL; }
+extern void perf_rm_persistent_on_cpu(int cpu,
+				      struct pers_event_desc *desc)	{ }
+static inline int
+perf_persistent_open(struct inode *inode, struct file *file)		{ return -1; }
 #endif
 
 #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 22d901f9caf4..0d2267fb4336 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_core.o = -pg
 endif
 
-obj-y := core.o ring_buffer.o callchain.o
+obj-y := core.o ring_buffer.o callchain.o persistent.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1b5c081d8b9f..9a39dc6755f4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2770,8 +2770,6 @@ static void free_event_rcu(struct rcu_head *head)
 	kfree(event);
 }
 
-static void ring_buffer_put(struct ring_buffer *rb);
-
 static void free_event(struct perf_event *event)
 {
 	irq_work_sync(&event->pending);
@@ -3423,7 +3421,7 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 	return rb;
 }
 
-static void ring_buffer_put(struct ring_buffer *rb)
+void ring_buffer_put(struct ring_buffer *rb)
 {
 	struct perf_event *event, *n;
 	unsigned long flags;
@@ -3452,6 +3450,11 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 {
 	struct perf_event *event = vma->vm_file->private_data;
 
+	if (event->attr.persistent) {
+		atomic_dec(&event->mmap_count);
+		return;
+	}
+
 	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
 		unsigned long size = perf_data_size(event->rb);
 		struct user_struct *user = event->mmap_user;
@@ -3495,7 +3498,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	if (event->cpu == -1 && event->attr.inherit)
 		return -EINVAL;
 
-	if (!(vma->vm_flags & VM_SHARED))
+	if (!(vma->vm_flags & VM_SHARED) && !event->attr.persistent)
 		return -EINVAL;
 
 	vma_size = vma->vm_end - vma->vm_start;
@@ -3606,6 +3609,16 @@ static const struct file_operations perf_fops = {
 	.fasync			= perf_fasync,
 };
 
+const struct file_operations perf_pers_fops = {
+	.llseek		= no_llseek,
+	.open		= perf_persistent_open,
+	.poll		= perf_poll,
+	.unlocked_ioctl	= perf_ioctl,
+	.compat_ioctl	= perf_ioctl,
+	.mmap		= perf_mmap,
+	.fasync		= perf_fasync,
+};
+
 /*
  * Perf event wakeup
  *
@@ -6319,6 +6332,10 @@ __perf_event_exit_task(struct perf_event *child_event,
 		raw_spin_unlock_irq(&child_ctx->lock);
 	}
 
+	/* do not remove persistent events on task exit */
+	if (child_event->attr.persistent)
+		return;
+
 	perf_remove_from_context(child_event);
 
 	/*
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index b0b107f90afc..d4e6181d5faf 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -37,6 +37,8 @@ struct ring_buffer {
 extern void rb_free(struct ring_buffer *rb);
 extern struct ring_buffer *
 rb_alloc(int nr_pages, long watermark, int cpu, int flags);
+extern void ring_buffer_put(struct ring_buffer *rb);
+
 extern void perf_event_wakeup(struct perf_event *event);
 
 extern void
diff --git a/kernel/events/persistent.c b/kernel/events/persistent.c
new file mode 100644
index 000000000000..e6f146532e34
--- /dev/null
+++ b/kernel/events/persistent.c
@@ -0,0 +1,98 @@
+#include <linux/debugfs.h>
+#include <linux/perf_event.h>
+
+#include "internal.h"
+
+/*
+ * Create and enable the perf event described by @attr.
+ *
+ * Returns the @event pointer which receives the allocated event from
+ * perf on success. Make sure to check return code before touching @event
+ * any further.
+ *
+ * @attr: perf attr template
+ * @cpu: on which cpu
+ * @nr_pages: perf buffer size in pages
+ *
+ */
+static struct perf_event *perf_add_persistent(struct perf_event_attr *attr,
+					      int cpu, unsigned nr_pages)
+{
+	struct ring_buffer *buffer;
+	struct perf_event *event;
+
+	event = perf_event_create_kernel_counter(attr, cpu, NULL, NULL, NULL);
+	if (IS_ERR(event))
+		return event;
+
+	buffer = rb_alloc(nr_pages, 0, cpu, RING_BUFFER_WRITABLE);
+	if (!buffer)
+		goto err;
+
+	rcu_assign_pointer(event->rb, buffer);
+	perf_event_enable(event);
+
+	return event;
+
+err:
+	perf_event_release_kernel(event);
+	return ERR_PTR(-EINVAL);
+}
+
+static void perf_rm_persistent(struct perf_event *event, int cpu)
+{
+	if (!event)
+		return;
+
+	perf_event_disable(event);
+
+	if (event->rb) {
+		ring_buffer_put(event->rb);
+		rcu_assign_pointer(event->rb, NULL);
+	}
+
+	perf_event_release_kernel(event);
+}
+
+static struct dentry *perf_add_debugfs_entry(struct pers_event_desc *d,
+					     struct dentry *dir)
+{
+	return debugfs_create_file(d->dfs_name, S_IRUGO | S_IWUSR,
+				   dir, d->event, &perf_pers_fops);
+}
+
+int perf_add_persistent_on_cpu(int cpu, struct pers_event_desc *desc,
+			       struct dentry *dir, unsigned nr_pages)
+{
+	int err = -EINVAL;
+
+	desc->event = perf_add_persistent(desc->pattr, cpu, nr_pages);
+	if (IS_ERR(desc->event)) {
+		printk(KERN_ERR "Error enabling persistent event on cpu %d\n", cpu);
+		return err;
+	}
+
+	desc->dfs_entry = perf_add_debugfs_entry(desc, dir);
+	if (!desc->dfs_entry) {
+		printk(KERN_ERR "Error adding debugfs entry on cpu %d\n", cpu);
+		goto disable;
+	}
+	return 0;
+
+disable:
+	perf_rm_persistent(desc->event, cpu);
+	return err;
+}
+
+void perf_rm_persistent_on_cpu(int cpu, struct pers_event_desc *desc)
+{
+	debugfs_remove(desc->dfs_entry);
+	perf_rm_persistent(desc->event, cpu);
+}
+
+int perf_persistent_open(struct inode *inode, struct file *file)
+{
+	file->private_data = inode->i_private;
+
+	return 0;
+}
-- 
1.7.9.3.362.g71319


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 2/2] x86, mce: Add persistent MCE event
  2012-03-21 14:34 [RFC PATCH 0/2] perf: Add persistent event facilities Borislav Petkov
  2012-03-21 14:34 ` [PATCH 1/2] " Borislav Petkov
@ 2012-03-21 14:34 ` Borislav Petkov
  2012-03-22  8:36   ` Srivatsa S. Bhat
  2012-03-23 12:31   ` Ingo Molnar
  1 sibling, 2 replies; 32+ messages in thread
From: Borislav Petkov @ 2012-03-21 14:34 UTC (permalink / raw)
  To: Frederic Weisbecker, Ingo Molnar, Peter Zijlstra, Steven Rostedt
  Cc: LKML, Borislav Petkov

From: Borislav Petkov <borislav.petkov@amd.com>

Add the necessary glue to enable the mce_record tracepoint on boot,
turning it into a persistent event. This exports the MCE buffer through
a debugfs per-CPU file which a userspace daemon can read and then
process the received error data further.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c |   53 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5a11ae2e9e91..791c4633d771 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -95,6 +95,13 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
 static DEFINE_PER_CPU(struct mce, mces_seen);
 static int			cpu_missing;
 
+static struct perf_event_attr pattr = {
+	.type           = PERF_TYPE_TRACEPOINT,
+	.size           = sizeof(pattr),
+	.sample_type    = PERF_SAMPLE_RAW,
+	.persistent     = 1,
+};
+
 /* MCA banks polled by the period polling timer for corrected events */
 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
@@ -102,6 +109,8 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 
 static DEFINE_PER_CPU(struct work_struct, mce_work);
 
+static DEFINE_PER_CPU(struct pers_event_desc, mce_ev);
+
 /*
  * CPU/chipset specific EDAC code can register a notifier call here to print
  * MCE errors in a human-readable form.
@@ -2109,6 +2118,50 @@ static void __cpuinit mce_reenable_cpu(void *h)
 	}
 }
 
+static __init int mcheck_init_persistent_event(void)
+{
+
+#define MCE_RECORD_FNAME_SZ 14
+#define MCE_BUF_PAGES 4
+
+	int cpu, err = 0;
+	char buf[MCE_RECORD_FNAME_SZ];
+
+	pattr.config = event_mce_record.event.type;
+	pattr.sample_period = 1;
+	pattr.wakeup_events = 1;
+
+	get_online_cpus();
+
+	for_each_online_cpu(cpu) {
+		struct pers_event_desc *d = &per_cpu(mce_ev, cpu);
+
+		snprintf(buf, MCE_RECORD_FNAME_SZ, "mce_record%d", cpu);
+		d->dfs_name = buf;
+		d->pattr = &pattr;
+
+		if (perf_add_persistent_on_cpu(cpu, d, mce_get_debugfs_dir(),
+					       MCE_BUF_PAGES))
+			goto err_unwind;
+	}
+	goto unlock;
+
+err_unwind:
+	err = -EINVAL;
+	for (--cpu; cpu >= 0; cpu--)
+		perf_rm_persistent_on_cpu(cpu, &per_cpu(mce_ev, cpu));
+
+unlock:
+	put_online_cpus();
+
+	return err;
+}
+
+/*
+ * This has to run after event_trace_init()
+ */
+device_initcall(mcheck_init_persistent_event);
+
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
 static int __cpuinit
 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
-- 
1.7.9.3.362.g71319


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] x86, mce: Add persistent MCE event
  2012-03-21 14:34 ` [PATCH 2/2] x86, mce: Add persistent MCE event Borislav Petkov
@ 2012-03-22  8:36   ` Srivatsa S. Bhat
  2012-03-22 11:40     ` Borislav Petkov
  2012-03-23 12:31   ` Ingo Molnar
  1 sibling, 1 reply; 32+ messages in thread
From: Srivatsa S. Bhat @ 2012-03-22  8:36 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Frederic Weisbecker, Ingo Molnar, Peter Zijlstra, Steven Rostedt,
	LKML, Borislav Petkov, Srivatsa S. Bhat

On 03/21/2012 08:04 PM, Borislav Petkov wrote:

> From: Borislav Petkov <borislav.petkov@amd.com>
> 
> Add the necessary glue to enable the mce_record tracepoint on boot,
> turning it into a persistent event. This exports the MCE buffer through
> a debugfs per-CPU file which a userspace daemon can read and then
> process the received error data further.
> 
> Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
> ---
>  arch/x86/kernel/cpu/mcheck/mce.c |   53 ++++++++++++++++++++++++++++++++++++++
>  1 file changed, 53 insertions(+)
> 
> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index 5a11ae2e9e91..791c4633d771 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -95,6 +95,13 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
>  static DEFINE_PER_CPU(struct mce, mces_seen);
>  static int			cpu_missing;
> 
> +static struct perf_event_attr pattr = {
> +	.type           = PERF_TYPE_TRACEPOINT,
> +	.size           = sizeof(pattr),
> +	.sample_type    = PERF_SAMPLE_RAW,
> +	.persistent     = 1,
> +};
> +
>  /* MCA banks polled by the period polling timer for corrected events */
>  DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
>  	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
> @@ -102,6 +109,8 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
> 
>  static DEFINE_PER_CPU(struct work_struct, mce_work);
> 
> +static DEFINE_PER_CPU(struct pers_event_desc, mce_ev);
> +
>  /*
>   * CPU/chipset specific EDAC code can register a notifier call here to print
>   * MCE errors in a human-readable form.
> @@ -2109,6 +2118,50 @@ static void __cpuinit mce_reenable_cpu(void *h)
>  	}
>  }
> 
> +static __init int mcheck_init_persistent_event(void)
> +{
> +
> +#define MCE_RECORD_FNAME_SZ 14
> +#define MCE_BUF_PAGES 4
> +
> +	int cpu, err = 0;
> +	char buf[MCE_RECORD_FNAME_SZ];
> +
> +	pattr.config = event_mce_record.event.type;
> +	pattr.sample_period = 1;
> +	pattr.wakeup_events = 1;
> +
> +	get_online_cpus();
> +
> +	for_each_online_cpu(cpu) {
> +		struct pers_event_desc *d = &per_cpu(mce_ev, cpu);
> +
> +		snprintf(buf, MCE_RECORD_FNAME_SZ, "mce_record%d", cpu);
> +		d->dfs_name = buf;
> +		d->pattr = &pattr;
> +
> +		if (perf_add_persistent_on_cpu(cpu, d, mce_get_debugfs_dir(),
> +					       MCE_BUF_PAGES))
> +			goto err_unwind;
> +	}
> +	goto unlock;
> +
> +err_unwind:
> +	err = -EINVAL;
> +	for (--cpu; cpu >= 0; cpu--)
> +		perf_rm_persistent_on_cpu(cpu, &per_cpu(mce_ev, cpu));
> +


*Totally* theoretical question: How do you know that the cpu_online_mask isn't
sparse? In other words, what if some CPUs weren't booted? Then this for-loop
wouldn't be very good..

Oh, now I see that perf_rm_persistent_on_cpu() probably handles that case well..
So no issues I guess.. ?

(Moreover, we will probably have bigger issues at hand if some CPU didn't
boot..)

(The code looked funny, so I thought of pointing it out, whether or not it
actually is worrisome. Sorry for the noise, if any).

> +unlock:
> +	put_online_cpus();
> +
> +	return err;
> +}
> +
> +/*
> + * This has to run after event_trace_init()
> + */
> +device_initcall(mcheck_init_persistent_event);
> +
>  /* Get notified when a cpu comes on/off. Be hotplug friendly. */
>  static int __cpuinit
>  mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)


Regards,
Srivatsa S. Bhat


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] x86, mce: Add persistent MCE event
  2012-03-22  8:36   ` Srivatsa S. Bhat
@ 2012-03-22 11:40     ` Borislav Petkov
  2012-03-22 11:57       ` Srivatsa S. Bhat
  0 siblings, 1 reply; 32+ messages in thread
From: Borislav Petkov @ 2012-03-22 11:40 UTC (permalink / raw)
  To: Srivatsa S. Bhat
  Cc: Borislav Petkov, Frederic Weisbecker, Ingo Molnar,
	Peter Zijlstra, Steven Rostedt, LKML

On Thu, Mar 22, 2012 at 02:06:29PM +0530, Srivatsa S. Bhat wrote:
> > +err_unwind:
> > +	err = -EINVAL;
> > +	for (--cpu; cpu >= 0; cpu--)
> > +		perf_rm_persistent_on_cpu(cpu, &per_cpu(mce_ev, cpu));
> > +
> 
> 
> *Totally* theoretical question: How do you know that the cpu_online_mask isn't
> sparse? In other words, what if some CPUs weren't booted? Then this for-loop
> wouldn't be very good..
> 
> Oh, now I see that perf_rm_persistent_on_cpu() probably handles that case well..
> So no issues I guess.. ?

Right, this could theoretically come around to bite us in some obscure
cases, so we probably fix it from the get-go.

> (Moreover, we will probably have bigger issues at hand if some CPU didn't
> boot..)
> 
> (The code looked funny, so I thought of pointing it out, whether or not it
> actually is worrisome. Sorry for the noise, if any).

Right, no, thanks for pointing it out.

I'll probably do something like the following:

	for (--cpu; cpu >= 0; cpu--)
		if (cpu_online(cpu))
			perf_rm_persistent_on_cpu(cpu, &per_cpu(mce_ev, cpu));

to be on the safe side from that perspective.

Thanks.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] x86, mce: Add persistent MCE event
  2012-03-22 11:40     ` Borislav Petkov
@ 2012-03-22 11:57       ` Srivatsa S. Bhat
  0 siblings, 0 replies; 32+ messages in thread
From: Srivatsa S. Bhat @ 2012-03-22 11:57 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Frederic Weisbecker, Ingo Molnar, Peter Zijlstra, Steven Rostedt, LKML

On 03/22/2012 05:10 PM, Borislav Petkov wrote:

> On Thu, Mar 22, 2012 at 02:06:29PM +0530, Srivatsa S. Bhat wrote:
>>> +err_unwind:
>>> +	err = -EINVAL;
>>> +	for (--cpu; cpu >= 0; cpu--)
>>> +		perf_rm_persistent_on_cpu(cpu, &per_cpu(mce_ev, cpu));
>>> +
>>
>>
>> *Totally* theoretical question: How do you know that the cpu_online_mask isn't
>> sparse? In other words, what if some CPUs weren't booted? Then this for-loop
>> wouldn't be very good..
>>
>> Oh, now I see that perf_rm_persistent_on_cpu() probably handles that case well..
>> So no issues I guess.. ?
> 
> Right, this could theoretically come around to bite us in some obscure
> cases, so we probably fix it from the get-go.
> 
>> (Moreover, we will probably have bigger issues at hand if some CPU didn't
>> boot..)
>>
>> (The code looked funny, so I thought of pointing it out, whether or not it
>> actually is worrisome. Sorry for the noise, if any).
> 
> Right, no, thanks for pointing it out.
> 
> I'll probably do something like the following:
> 
> 	for (--cpu; cpu >= 0; cpu--)
> 		if (cpu_online(cpu))
> 			perf_rm_persistent_on_cpu(cpu, &per_cpu(mce_ev, cpu));
> 
> to be on the safe side from that perspective.
> 


You can do that or something like the following, to make it more readable:

int cpunum;

for_each_online_cpu(cpunum) {
	if (cpunum == cpu)
		break;
	perf_rm_persistent_on_cpu(cpunum, &per_cpu(mce_ev, cpunum));
}

It is of course, up to you.. whichever form you prefer..

Regards,
Srivatsa S. Bhat


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] x86, mce: Add persistent MCE event
  2012-03-21 14:34 ` [PATCH 2/2] x86, mce: Add persistent MCE event Borislav Petkov
  2012-03-22  8:36   ` Srivatsa S. Bhat
@ 2012-03-23 12:31   ` Ingo Molnar
  2012-03-23 13:30     ` Borislav Petkov
  1 sibling, 1 reply; 32+ messages in thread
From: Ingo Molnar @ 2012-03-23 12:31 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Frederic Weisbecker, Ingo Molnar, Peter Zijlstra, Steven Rostedt,
	LKML, Borislav Petkov


* Borislav Petkov <bp@amd64.org> wrote:

> From: Borislav Petkov <borislav.petkov@amd.com>
> 
> Add the necessary glue to enable the mce_record tracepoint on boot,
> turning it into a persistent event. This exports the MCE buffer through
> a debugfs per-CPU file which a userspace daemon can read and then
> process the received error data further.
> 
> Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
> ---
>  arch/x86/kernel/cpu/mcheck/mce.c |   53 ++++++++++++++++++++++++++++++++++++++
>  1 file changed, 53 insertions(+)
> 
> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index 5a11ae2e9e91..791c4633d771 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -95,6 +95,13 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
>  static DEFINE_PER_CPU(struct mce, mces_seen);
>  static int			cpu_missing;
>  
> +static struct perf_event_attr pattr = {
> +	.type           = PERF_TYPE_TRACEPOINT,
> +	.size           = sizeof(pattr),
> +	.sample_type    = PERF_SAMPLE_RAW,
> +	.persistent     = 1,
> +};
> +
>  /* MCA banks polled by the period polling timer for corrected events */
>  DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
>  	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
> @@ -102,6 +109,8 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
>  
>  static DEFINE_PER_CPU(struct work_struct, mce_work);
>  
> +static DEFINE_PER_CPU(struct pers_event_desc, mce_ev);
> +
>  /*
>   * CPU/chipset specific EDAC code can register a notifier call here to print
>   * MCE errors in a human-readable form.
> @@ -2109,6 +2118,50 @@ static void __cpuinit mce_reenable_cpu(void *h)
>  	}
>  }
>  
> +static __init int mcheck_init_persistent_event(void)
> +{
> +
> +#define MCE_RECORD_FNAME_SZ 14
> +#define MCE_BUF_PAGES 4
> +
> +	int cpu, err = 0;
> +	char buf[MCE_RECORD_FNAME_SZ];
> +
> +	pattr.config = event_mce_record.event.type;
> +	pattr.sample_period = 1;
> +	pattr.wakeup_events = 1;
> +
> +	get_online_cpus();
> +
> +	for_each_online_cpu(cpu) {
> +		struct pers_event_desc *d = &per_cpu(mce_ev, cpu);
> +
> +		snprintf(buf, MCE_RECORD_FNAME_SZ, "mce_record%d", cpu);
> +		d->dfs_name = buf;
> +		d->pattr = &pattr;
> +
> +		if (perf_add_persistent_on_cpu(cpu, d, mce_get_debugfs_dir(),
> +					       MCE_BUF_PAGES))
> +			goto err_unwind;
> +	}
> +	goto unlock;
> +
> +err_unwind:
> +	err = -EINVAL;
> +	for (--cpu; cpu >= 0; cpu--)
> +		perf_rm_persistent_on_cpu(cpu, &per_cpu(mce_ev, cpu));
> +
> +unlock:
> +	put_online_cpus();
> +
> +	return err;

I like it.

Have you considered making the addition of persistent events 
straightforward and robust, in terms of adding a TRACE_EVENT() 
variant for them? It could replace the above code.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] x86, mce: Add persistent MCE event
  2012-03-23 12:31   ` Ingo Molnar
@ 2012-03-23 13:30     ` Borislav Petkov
  2012-03-24  7:37       ` Ingo Molnar
  0 siblings, 1 reply; 32+ messages in thread
From: Borislav Petkov @ 2012-03-23 13:30 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Frederic Weisbecker, Ingo Molnar,
	Peter Zijlstra, Steven Rostedt, LKML

On Fri, Mar 23, 2012 at 01:31:56PM +0100, Ingo Molnar wrote:
> Have you considered making the addition of persistent events
> straightforward and robust, in terms of adding a TRACE_EVENT() variant
> for them? It could replace the above code.

Err,

are you thinking of something in TRACE_EVENT() that sets
perf_event_attr.persistent to 1?

Btw, the more important question is are we going to need persistent
events that much so that a generic approach is warranted? I guess maybe
the black box events recording deal would be another user..

IOW, please elaborate :)

Thanks.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] x86, mce: Add persistent MCE event
  2012-03-23 13:30     ` Borislav Petkov
@ 2012-03-24  7:37       ` Ingo Molnar
  2012-03-24  9:00         ` Borislav Petkov
  0 siblings, 1 reply; 32+ messages in thread
From: Ingo Molnar @ 2012-03-24  7:37 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Frederic Weisbecker, Ingo Molnar, Peter Zijlstra, Steven Rostedt, LKML


* Borislav Petkov <bp@amd64.org> wrote:

> On Fri, Mar 23, 2012 at 01:31:56PM +0100, Ingo Molnar wrote:
> > Have you considered making the addition of persistent events
> > straightforward and robust, in terms of adding a TRACE_EVENT() variant
> > for them? It could replace the above code.
> 
> Err,
> 
> are you thinking of something in TRACE_EVENT() that sets
> perf_event_attr.persistent to 1?

I was mainly thinking of reducing this:

 arch/x86/kernel/cpu/mcheck/mce.c |   53 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

to almost nothing. There doesn't seem to be much MCE specific in 
that code, right?

> Btw, the more important question is are we going to need 
> persistent events that much so that a generic approach is 
> warranted? I guess maybe the black box events recording deal 
> would be another user..

So, here's the big picture as I see it:

I think tracing could use persistent events: mark all the events 
we want to trace as persistent from bootup, and recover the 
bootup trace after the system has been booted up.

But other, runtime models of tracing could use it as well: 
basically the main difference that ftrace has to perf based 
tracing today is a system-wide persistent buffer with no 
particular owning process. (The rest is mostly UI and analysis 
features and scope of tracing differences, and of course a lot 
more love and detail went into ftrace so far.)

So MCE will in the end be just a minor user of such a facility - 
I think you should aim for enabling *any* set of events to have 
persistent recording properties, and add the APIs to recover 
that information sanely. It should also be possible for them to 
record into a shared mmap page in essence - instead of having 
per event persistent buffers.

That way testing would be a lot easier as well: you could enable 
persistent tracing via something like:

    perf trace on -e <events>

and turn it off via:

    perf trace off

and just recover the recorded events via something like:

    perf trace

... or so. That you can then also enable this during bootup for 
MCE events is just a very nice side effect of a much more 
generally useful feature.

Would you be interested in creating a clean incarnation of this 
without making peterz sad? The 'perf trace' perf subcommand is 
still unused and up for grabs for sufficiently good patches, 
coincidentally ;-)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] x86, mce: Add persistent MCE event
  2012-03-24  7:37       ` Ingo Molnar
@ 2012-03-24  9:00         ` Borislav Petkov
  2012-03-24  9:15           ` Ingo Molnar
  0 siblings, 1 reply; 32+ messages in thread
From: Borislav Petkov @ 2012-03-24  9:00 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Frederic Weisbecker, Ingo Molnar,
	Peter Zijlstra, Steven Rostedt, LKML

On Sat, Mar 24, 2012 at 08:37:31AM +0100, Ingo Molnar wrote:
> I was mainly thinking of reducing this:
> 
>  arch/x86/kernel/cpu/mcheck/mce.c |   53 ++++++++++++++++++++++++++++++++++++++
>  1 file changed, 53 insertions(+)
> 
> to almost nothing. There doesn't seem to be much MCE specific in 
> that code, right?

Yeah, this could be generalized even more, AFAICT.

> 
> > Btw, the more important question is are we going to need 
> > persistent events that much so that a generic approach is 
> > warranted? I guess maybe the black box events recording deal 
> > would be another user..
> 
> So, here's the big picture as I see it:
> 
> I think tracing could use persistent events: mark all the events 
> we want to trace as persistent from bootup, and recover the 
> bootup trace after the system has been booted up.

Right, but (more nasty questions):

Why would I do this, am I tracing the boot process? If so, then I need
another syntax which enables those events from the kernel command line
which gets parsed the moment ftrace and ring buffer get initialized.

IOW, I'd need userspace for perf otherwise but I don't have that before
booting...

Then, after having booted, do I stop the trace? If no, then I can see
the persistency in there so are you saying we want a low overhead, low
ressource utilization machinery which runs all the time and traces
the system? What are possible real life use cases for that? Scheduler
analysis probably, long-term tracing of some stuff people are interested
in how it behaves over long periods of time... MCE is one use case,
definitely...

> But other, runtime models of tracing could use it as well: 
> basically the main difference that ftrace has to perf based 
> tracing today is a system-wide persistent buffer with no 
> particular owning process. (The rest is mostly UI and analysis 
> features and scope of tracing differences, and of course a lot 
> more love and detail went into ftrace so far.)
> 
> So MCE will in the end be just a minor user of such a facility - 
> I think you should aim for enabling *any* set of events to have 
> persistent recording properties, and add the APIs to recover 
> that information sanely. It should also be possible for them to 
> record into a shared mmap page in essence - instead of having 
> per event persistent buffers.

Sounds like ftrace. But we have that already, we only need to get to
using it perf-side, no...? Especially if we could get the love and
detail that went in there for free :-) ...without stepping on Steve's
toes, of course :-).

..

Fun.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] x86, mce: Add persistent MCE event
  2012-03-24  9:00         ` Borislav Petkov
@ 2012-03-24  9:15           ` Ingo Molnar
  2012-05-15 15:32             ` Borislav Petkov
  0 siblings, 1 reply; 32+ messages in thread
From: Ingo Molnar @ 2012-03-24  9:15 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Frederic Weisbecker, Ingo Molnar, Peter Zijlstra, Steven Rostedt, LKML


* Borislav Petkov <bp@amd64.org> wrote:

> On Sat, Mar 24, 2012 at 08:37:31AM +0100, Ingo Molnar wrote:
> > I was mainly thinking of reducing this:
> > 
> >  arch/x86/kernel/cpu/mcheck/mce.c |   53 ++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 53 insertions(+)
> > 
> > to almost nothing. There doesn't seem to be much MCE specific in 
> > that code, right?
> 
> Yeah, this could be generalized even more, AFAICT.
> 
> > 
> > > Btw, the more important question is are we going to need 
> > > persistent events that much so that a generic approach is 
> > > warranted? I guess maybe the black box events recording deal 
> > > would be another user..
> > 
> > So, here's the big picture as I see it:
> > 
> > I think tracing could use persistent events: mark all the events 
> > we want to trace as persistent from bootup, and recover the 
> > bootup trace after the system has been booted up.
> 
> Right, but (more nasty questions):
> 
> Why would I do this, am I tracing the boot process? [...]

Correct, in essence the MCE persistent event is partially about 
that: we are starting to collect events well before there's any 
user-space available.

> [...] If so, then I need another syntax which enables those 
> events from the kernel command line which gets parsed the 
> moment ftrace and ring buffer get initialized.

Correct. Something really simple like:

  boot_trace=<event1>,<event2>...

... which could be all implicit within MCE too. (So I'm not 
suggesting some boot command trigger to provide the MCE case - 
but for more general boot tracing it would be the right 
solution.)

> IOW, I'd need userspace for perf otherwise but I don't have 
> that before booting...

Correct. In the case of MCE there's no "userspace" really needed 
- we just want to trace early enough. This model carries over to 
later as well: there's no *specific* process we want to attach 
the trace buffer to - we just want a persistent trace buffer 
that essentially never loses MCE events.

> Then, after having booted, do I stop the trace? If no, then I 
> can see the persistency in there so are you saying we want a 
> low overhead, low ressource utilization machinery which runs 
> all the time and traces the system? What are possible real 
> life use cases for that? Scheduler analysis probably, 
> long-term tracing of some stuff people are interested in how 
> it behaves over long periods of time... MCE is one use case, 
> definitely...

Boot tracing is a very real usecase, people use it to reduce 
boot times. Today printk timestamps are used as a substitute. 
(There's also a boot tracer plugin within ftrace, see the 
bootup_tracer.)

> > But other, runtime models of tracing could use it as well: 
> > basically the main difference that ftrace has to perf based 
> > tracing today is a system-wide persistent buffer with no 
> > particular owning process. (The rest is mostly UI and 
> > analysis features and scope of tracing differences, and of 
> > course a lot more love and detail went into ftrace so far.)
> > 
> > So MCE will in the end be just a minor user of such a 
> > facility - I think you should aim for enabling *any* set of 
> > events to have persistent recording properties, and add the 
> > APIs to recover that information sanely. It should also be 
> > possible for them to record into a shared mmap page in 
> > essence - instead of having per event persistent buffers.
> 
> Sounds like ftrace. But we have that already, we only need to 
> get to using it perf-side, no...? [...]

What we want is to extend the perf ring-buffer to be persistent 
*as well*. It's an evidently useful model of collecting events.

All the remaining perf tooling can be used after that point - if 
it's a bog-standard perf ring-buffer then it can be saved into a 
perf.data and can be analyzed in a rich fashion, etc.

Think about it: for example we could do not just boot tracing 
but also boot *profiling*, by using the PMU to sample into a 
persistent buffer which after bootup can be put into a perf.data 
and 'perf report' will do the right thing, etc...

Does it overlap with ftrace? Perf overlapped with ftrace from 
day one on and it's starting to become a maintenance problem: we 
want to remove that overlap not by keeping two separate entities 
(both of which suck and rule in their own ways) but having a 
unified facility.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] x86, mce: Add persistent MCE event
  2012-03-24  9:15           ` Ingo Molnar
@ 2012-05-15 15:32             ` Borislav Petkov
  2012-05-18  8:18               ` Ingo Molnar
  0 siblings, 1 reply; 32+ messages in thread
From: Borislav Petkov @ 2012-05-15 15:32 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Frederic Weisbecker, Ingo Molnar, Peter Zijlstra, Steven Rostedt, LKML

On Sat, Mar 24, 2012 at 10:15:01AM +0100, Ingo Molnar wrote:
> * Borislav Petkov <bp@amd64.org> wrote:
> 
> > On Sat, Mar 24, 2012 at 08:37:31AM +0100, Ingo Molnar wrote:
> > > I was mainly thinking of reducing this:
> > > 
> > >  arch/x86/kernel/cpu/mcheck/mce.c |   53 ++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 53 insertions(+)
> > > 
> > > to almost nothing. There doesn't seem to be much MCE specific in 
> > > that code, right?
> > 
> > Yeah, this could be generalized even more, AFAICT.
> > 
> > > 
> > > > Btw, the more important question is are we going to need 
> > > > persistent events that much so that a generic approach is 
> > > > warranted? I guess maybe the black box events recording deal 
> > > > would be another user..
> > > 
> > > So, here's the big picture as I see it:
> > > 
> > > I think tracing could use persistent events: mark all the events 
> > > we want to trace as persistent from bootup, and recover the 
> > > bootup trace after the system has been booted up.
> > 
> > Right, but (more nasty questions):
> > 
> > Why would I do this, am I tracing the boot process? [...]
> 
> Correct, in essence the MCE persistent event is partially about 
> that: we are starting to collect events well before there's any 
> user-space available.
> 
> > [...] If so, then I need another syntax which enables those 
> > events from the kernel command line which gets parsed the 
> > moment ftrace and ring buffer get initialized.
> 
> Correct. Something really simple like:
> 
>   boot_trace=<event1>,<event2>...
> 
> ... which could be all implicit within MCE too. (So I'm not 
> suggesting some boot command trigger to provide the MCE case - 
> but for more general boot tracing it would be the right 
> solution.)
> 
> > IOW, I'd need userspace for perf otherwise but I don't have 
> > that before booting...
> 
> Correct. In the case of MCE there's no "userspace" really needed 
> - we just want to trace early enough. This model carries over to 
> later as well: there's no *specific* process we want to attach 
> the trace buffer to - we just want a persistent trace buffer 
> that essentially never loses MCE events.
> 
> > Then, after having booted, do I stop the trace? If no, then I 
> > can see the persistency in there so are you saying we want a 
> > low overhead, low ressource utilization machinery which runs 
> > all the time and traces the system? What are possible real 
> > life use cases for that? Scheduler analysis probably, 
> > long-term tracing of some stuff people are interested in how 
> > it behaves over long periods of time... MCE is one use case, 
> > definitely...
> 
> Boot tracing is a very real usecase, people use it to reduce 
> boot times. Today printk timestamps are used as a substitute. 
> (There's also a boot tracer plugin within ftrace, see the 
> bootup_tracer.)
> 
> > > But other, runtime models of tracing could use it as well: 
> > > basically the main difference that ftrace has to perf based 
> > > tracing today is a system-wide persistent buffer with no 
> > > particular owning process. (The rest is mostly UI and 
> > > analysis features and scope of tracing differences, and of 
> > > course a lot more love and detail went into ftrace so far.)
> > > 
> > > So MCE will in the end be just a minor user of such a 
> > > facility - I think you should aim for enabling *any* set of 
> > > events to have persistent recording properties, and add the 
> > > APIs to recover that information sanely. It should also be 
> > > possible for them to record into a shared mmap page in 
> > > essence - instead of having per event persistent buffers.
> > 
> > Sounds like ftrace. But we have that already, we only need to 
> > get to using it perf-side, no...? [...]
> 
> What we want is to extend the perf ring-buffer to be persistent 
> *as well*. It's an evidently useful model of collecting events.
> 
> All the remaining perf tooling can be used after that point - if 
> it's a bog-standard perf ring-buffer then it can be saved into a 
> perf.data and can be analyzed in a rich fashion, etc.
> 
> Think about it: for example we could do not just boot tracing 
> but also boot *profiling*, by using the PMU to sample into a 
> persistent buffer which after bootup can be put into a perf.data 
> and 'perf report' will do the right thing, etc...
> 
> Does it overlap with ftrace? Perf overlapped with ftrace from 
> day one on and it's starting to become a maintenance problem: we 
> want to remove that overlap not by keeping two separate entities 
> (both of which suck and rule in their own ways) but having a 
> unified facility.

Leaving all of the above for reference.

So, I spent some more nights sleeping on it :-)

Here's what I dreamt of:

* The last thing perf_event_init() does is init the persistent, per-cpu
buffers.

* there's no need for changing TRACE_EVENT: "boot_trace" parameter
parsing code enables those events the moment perf is initialized. We're
doing this anyway because we're enabling the trace_mce_record TP.

It sounds pretty simple to me but the devil is in the details,
especially making the persistent buffers, task-agnostic and generic
enough.

Ingo, Peter, thoughts?

Thanks.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] x86, mce: Add persistent MCE event
  2012-05-15 15:32             ` Borislav Petkov
@ 2012-05-18  8:18               ` Ingo Molnar
  2012-05-18 10:03                 ` Borislav Petkov
  0 siblings, 1 reply; 32+ messages in thread
From: Ingo Molnar @ 2012-05-18  8:18 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Frederic Weisbecker, Ingo Molnar, Peter Zijlstra, Steven Rostedt, LKML


* Borislav Petkov <bp@amd64.org> wrote:

> On Sat, Mar 24, 2012 at 10:15:01AM +0100, Ingo Molnar wrote:
> > * Borislav Petkov <bp@amd64.org> wrote:
> > 
> > > On Sat, Mar 24, 2012 at 08:37:31AM +0100, Ingo Molnar wrote:
> > > > I was mainly thinking of reducing this:
> > > > 
> > > >  arch/x86/kernel/cpu/mcheck/mce.c |   53 ++++++++++++++++++++++++++++++++++++++
> > > >  1 file changed, 53 insertions(+)
> > > > 
> > > > to almost nothing. There doesn't seem to be much MCE specific in 
> > > > that code, right?
> > > 
> > > Yeah, this could be generalized even more, AFAICT.
> > > 
> > > > 
> > > > > Btw, the more important question is are we going to need 
> > > > > persistent events that much so that a generic approach is 
> > > > > warranted? I guess maybe the black box events recording deal 
> > > > > would be another user..
> > > > 
> > > > So, here's the big picture as I see it:
> > > > 
> > > > I think tracing could use persistent events: mark all the events 
> > > > we want to trace as persistent from bootup, and recover the 
> > > > bootup trace after the system has been booted up.
> > > 
> > > Right, but (more nasty questions):
> > > 
> > > Why would I do this, am I tracing the boot process? [...]
> > 
> > Correct, in essence the MCE persistent event is partially about 
> > that: we are starting to collect events well before there's any 
> > user-space available.
> > 
> > > [...] If so, then I need another syntax which enables those 
> > > events from the kernel command line which gets parsed the 
> > > moment ftrace and ring buffer get initialized.
> > 
> > Correct. Something really simple like:
> > 
> >   boot_trace=<event1>,<event2>...
> > 
> > ... which could be all implicit within MCE too. (So I'm not 
> > suggesting some boot command trigger to provide the MCE case - 
> > but for more general boot tracing it would be the right 
> > solution.)
> > 
> > > IOW, I'd need userspace for perf otherwise but I don't have 
> > > that before booting...
> > 
> > Correct. In the case of MCE there's no "userspace" really needed 
> > - we just want to trace early enough. This model carries over to 
> > later as well: there's no *specific* process we want to attach 
> > the trace buffer to - we just want a persistent trace buffer 
> > that essentially never loses MCE events.
> > 
> > > Then, after having booted, do I stop the trace? If no, then I 
> > > can see the persistency in there so are you saying we want a 
> > > low overhead, low ressource utilization machinery which runs 
> > > all the time and traces the system? What are possible real 
> > > life use cases for that? Scheduler analysis probably, 
> > > long-term tracing of some stuff people are interested in how 
> > > it behaves over long periods of time... MCE is one use case, 
> > > definitely...
> > 
> > Boot tracing is a very real usecase, people use it to reduce 
> > boot times. Today printk timestamps are used as a substitute. 
> > (There's also a boot tracer plugin within ftrace, see the 
> > bootup_tracer.)
> > 
> > > > But other, runtime models of tracing could use it as well: 
> > > > basically the main difference that ftrace has to perf based 
> > > > tracing today is a system-wide persistent buffer with no 
> > > > particular owning process. (The rest is mostly UI and 
> > > > analysis features and scope of tracing differences, and of 
> > > > course a lot more love and detail went into ftrace so far.)
> > > > 
> > > > So MCE will in the end be just a minor user of such a 
> > > > facility - I think you should aim for enabling *any* set of 
> > > > events to have persistent recording properties, and add the 
> > > > APIs to recover that information sanely. It should also be 
> > > > possible for them to record into a shared mmap page in 
> > > > essence - instead of having per event persistent buffers.
> > > 
> > > Sounds like ftrace. But we have that already, we only need to 
> > > get to using it perf-side, no...? [...]
> > 
> > What we want is to extend the perf ring-buffer to be persistent 
> > *as well*. It's an evidently useful model of collecting events.
> > 
> > All the remaining perf tooling can be used after that point - if 
> > it's a bog-standard perf ring-buffer then it can be saved into a 
> > perf.data and can be analyzed in a rich fashion, etc.
> > 
> > Think about it: for example we could do not just boot tracing 
> > but also boot *profiling*, by using the PMU to sample into a 
> > persistent buffer which after bootup can be put into a perf.data 
> > and 'perf report' will do the right thing, etc...
> > 
> > Does it overlap with ftrace? Perf overlapped with ftrace from 
> > day one on and it's starting to become a maintenance problem: we 
> > want to remove that overlap not by keeping two separate entities 
> > (both of which suck and rule in their own ways) but having a 
> > unified facility.
> 
> Leaving all of the above for reference.
> 
> So, I spent some more nights sleeping on it :-)
> 
> Here's what I dreamt of:
> 
> * The last thing perf_event_init() does is init the persistent, per-cpu
> buffers.
> 
> * there's no need for changing TRACE_EVENT: "boot_trace" parameter
> parsing code enables those events the moment perf is initialized. We're
> doing this anyway because we're enabling the trace_mce_record TP.
> 
> It sounds pretty simple to me but the devil is in the details,
> especially making the persistent buffers, task-agnostic and generic
> enough.
> 
> Ingo, Peter, thoughts?

Sounds good to me in principle - I guess if you send something 
that is tested, works, and also enables boot tracing we can see 
the details?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] perf: Add persistent event facilities
  2012-03-21 14:34 ` [PATCH 1/2] " Borislav Petkov
@ 2012-05-18  9:58   ` Peter Zijlstra
  2012-05-18 10:01     ` Borislav Petkov
  2012-05-18 10:00   ` Peter Zijlstra
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2012-05-18  9:58 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Frederic Weisbecker, Ingo Molnar, Steven Rostedt, LKML, Borislav Petkov

On Wed, 2012-03-21 at 15:34 +0100, Borislav Petkov wrote:
> Add a barebones implementation for registering persistent events with
> perf. For that, we don't destroy the buffers when they're unmapped;
> also, we map them read-only so that multiple agents can access them.

Yet you don't fail mmap() for VM_WRITE && ->persistent.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] perf: Add persistent event facilities
  2012-03-21 14:34 ` [PATCH 1/2] " Borislav Petkov
  2012-05-18  9:58   ` Peter Zijlstra
@ 2012-05-18 10:00   ` Peter Zijlstra
  2012-05-18 10:02   ` Peter Zijlstra
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 32+ messages in thread
From: Peter Zijlstra @ 2012-05-18 10:00 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Frederic Weisbecker, Ingo Molnar, Steven Rostedt, LKML, Borislav Petkov

On Wed, 2012-03-21 at 15:34 +0100, Borislav Petkov wrote:
> @@ -6319,6 +6332,10 @@ __perf_event_exit_task(struct perf_event *child_event,
>                 raw_spin_unlock_irq(&child_ctx->lock);
>         }
>  
> +       /* do not remove persistent events on task exit */
> +       if (child_event->attr.persistent)
> +               return;
> +
>         perf_remove_from_context(child_event);
>  

Huh, the task it dying, why would you let the event stay in the task
context?

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] perf: Add persistent event facilities
  2012-05-18  9:58   ` Peter Zijlstra
@ 2012-05-18 10:01     ` Borislav Petkov
  0 siblings, 0 replies; 32+ messages in thread
From: Borislav Petkov @ 2012-05-18 10:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Borislav Petkov, Frederic Weisbecker, Ingo Molnar, Steven Rostedt, LKML

On Fri, May 18, 2012 at 11:58:58AM +0200, Peter Zijlstra wrote:
> On Wed, 2012-03-21 at 15:34 +0100, Borislav Petkov wrote:
> > Add a barebones implementation for registering persistent events with
> > perf. For that, we don't destroy the buffers when they're unmapped;
> > also, we map them read-only so that multiple agents can access them.
> 
> Yet you don't fail mmap() for VM_WRITE && ->persistent.

I better go fix that after we've agreed on the overall design.

Thanks Peter.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] perf: Add persistent event facilities
  2012-03-21 14:34 ` [PATCH 1/2] " Borislav Petkov
  2012-05-18  9:58   ` Peter Zijlstra
  2012-05-18 10:00   ` Peter Zijlstra
@ 2012-05-18 10:02   ` Peter Zijlstra
  2012-05-18 10:09   ` Peter Zijlstra
  2012-05-18 10:14   ` Peter Zijlstra
  4 siblings, 0 replies; 32+ messages in thread
From: Peter Zijlstra @ 2012-05-18 10:02 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Frederic Weisbecker, Ingo Molnar, Steven Rostedt, LKML, Borislav Petkov

On Wed, 2012-03-21 at 15:34 +0100, Borislav Petkov wrote:
> From: Borislav Petkov <borislav.petkov@amd.com>
> 
> Add a barebones implementation for registering persistent events with
> perf. For that, we don't destroy the buffers when they're unmapped;
> also, we map them read-only so that multiple agents can access them.

> +static struct perf_event *perf_add_persistent(struct perf_event_attr *attr,
> +					      int cpu, unsigned nr_pages)
> +{
> +	struct ring_buffer *buffer;
> +	struct perf_event *event;
> +
> +	event = perf_event_create_kernel_counter(attr, cpu, NULL, NULL, NULL);
> +	if (IS_ERR(event))
> +		return event;
> +
> +	buffer = rb_alloc(nr_pages, 0, cpu, RING_BUFFER_WRITABLE);

Uh what?

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] x86, mce: Add persistent MCE event
  2012-05-18  8:18               ` Ingo Molnar
@ 2012-05-18 10:03                 ` Borislav Petkov
  0 siblings, 0 replies; 32+ messages in thread
From: Borislav Petkov @ 2012-05-18 10:03 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Frederic Weisbecker, Ingo Molnar, Peter Zijlstra, Steven Rostedt, LKML

On Fri, May 18, 2012 at 10:18:31AM +0200, Ingo Molnar wrote:
> Sounds good to me in principle - I guess if you send something that is
> tested, works, and also enables boot tracing we can see the details?

Yeah, let me see what I can do.

Thanks.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] perf: Add persistent event facilities
  2012-03-21 14:34 ` [PATCH 1/2] " Borislav Petkov
                     ` (2 preceding siblings ...)
  2012-05-18 10:02   ` Peter Zijlstra
@ 2012-05-18 10:09   ` Peter Zijlstra
  2012-05-18 10:49     ` Borislav Petkov
  2012-05-18 10:14   ` Peter Zijlstra
  4 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2012-05-18 10:09 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Frederic Weisbecker, Ingo Molnar, Steven Rostedt, LKML, Borislav Petkov

On Wed, 2012-03-21 at 15:34 +0100, Borislav Petkov wrote:
> +                               persistent     :  1, /* always-on event */

Should creating something like that be privileged?

And it looks like you allow creating persistent per-task events; is that
useful, if so why? And like already pointed out, your cleanup seems
broken in that case.



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] perf: Add persistent event facilities
  2012-03-21 14:34 ` [PATCH 1/2] " Borislav Petkov
                     ` (3 preceding siblings ...)
  2012-05-18 10:09   ` Peter Zijlstra
@ 2012-05-18 10:14   ` Peter Zijlstra
  2012-05-18 11:03     ` Borislav Petkov
  2012-05-31 17:33     ` Borislav Petkov
  4 siblings, 2 replies; 32+ messages in thread
From: Peter Zijlstra @ 2012-05-18 10:14 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Frederic Weisbecker, Ingo Molnar, Steven Rostedt, LKML, Borislav Petkov

On Wed, 2012-03-21 at 15:34 +0100, Borislav Petkov wrote:
> +int perf_add_persistent_on_cpu(int cpu, struct pers_event_desc *desc,
> +                              struct dentry *dir, unsigned nr_pages) 

OK, so this creates an even and registers it somewhere in debugfs.

 - you allow an arbitrary place in debugfs; this might make finding them
'interesting'. Should we put them all in the same place?

 - persistent events created from userspace don't seem to get a debugfs
entry and will be lost forever?


In general I think a little more exploring of the semantics and
ramifications might be in order.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] perf: Add persistent event facilities
  2012-05-18 10:09   ` Peter Zijlstra
@ 2012-05-18 10:49     ` Borislav Petkov
  0 siblings, 0 replies; 32+ messages in thread
From: Borislav Petkov @ 2012-05-18 10:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Borislav Petkov, Frederic Weisbecker, Ingo Molnar, Steven Rostedt, LKML

On Fri, May 18, 2012 at 12:09:13PM +0200, Peter Zijlstra wrote:
> On Wed, 2012-03-21 at 15:34 +0100, Borislav Petkov wrote:
> > +                               persistent     :  1, /* always-on event */
> 
> Should creating something like that be privileged?

Yeah, I dunno, do we want that? Well, what it is when we enable an event
during boot, i.e. say "boot_trace=<event>..." on the kernel command line
and this event is enabled until we finish booting. Then we disable it
with perf or over debugfs. Is that privileged or can any user boot-trace
the system?

All good questions.

> And it looks like you allow creating persistent per-task events; is
> that useful, if so why? And like already pointed out, your cleanup
> seems broken in that case.

This was dumb - I wanted to have task-agnostic events which get enabled
for the whole system and there's a buffer on each CPU which collects
their output. It was a first attempt and it is a good thing you gave it
a look so that I can address the issues properly.

Thanks.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] perf: Add persistent event facilities
  2012-05-18 10:14   ` Peter Zijlstra
@ 2012-05-18 11:03     ` Borislav Petkov
  2012-05-18 11:24       ` Peter Zijlstra
  2012-05-31 17:33     ` Borislav Petkov
  1 sibling, 1 reply; 32+ messages in thread
From: Borislav Petkov @ 2012-05-18 11:03 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Frederic Weisbecker, Ingo Molnar, Steven Rostedt, LKML, Borislav Petkov

On Fri, May 18, 2012 at 12:14:00PM +0200, Peter Zijlstra wrote:
> On Wed, 2012-03-21 at 15:34 +0100, Borislav Petkov wrote:
> > +int perf_add_persistent_on_cpu(int cpu, struct pers_event_desc *desc,
> > +                              struct dentry *dir, unsigned nr_pages) 
> 
> OK, so this creates an even and registers it somewhere in debugfs.

(debugfs)/mce/

>  - you allow an arbitrary place in debugfs; this might make finding
> them 'interesting'. Should we put them all in the same place?

My take on this is that we want to be able to make the same events we
have now, persistent. Basically not trace for the duration of a child
process but in a process-agnostic way, system-wide.

In that case, we probably want to be able to mark events as persistent,
maybe add another node in debugfs:

(debugfs)/tracing/events/mce/mce_record/attr

which can be used for flags or whatever, or something to that effect...

>  - persistent events created from userspace don't seem to get a debugfs
> entry and will be lost forever?

Yeah, see above.

> In general I think a little more exploring of the semantics and
> ramifications might be in order.

Absolutely!

Thanks.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] perf: Add persistent event facilities
  2012-05-18 11:03     ` Borislav Petkov
@ 2012-05-18 11:24       ` Peter Zijlstra
  2012-05-18 11:59         ` Ingo Molnar
  0 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2012-05-18 11:24 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Frederic Weisbecker, Ingo Molnar, Steven Rostedt, LKML, Borislav Petkov

On Fri, 2012-05-18 at 13:03 +0200, Borislav Petkov wrote:
> >  - you allow an arbitrary place in debugfs; this might make finding
> > them 'interesting'. Should we put them all in the same place?
> 
> My take on this is that we want to be able to make the same events we
> have now, persistent. Basically not trace for the duration of a child
> process but in a process-agnostic way, system-wide.

This would argue against per-task persistent events.. 

> In that case, we probably want to be able to mark events as persistent,
> maybe add another node in debugfs:
> 
> (debugfs)/tracing/events/mce/mce_record/attr
> 
> which can be used for flags or whatever, or something to that effect... 

( while mce is a user of persistent events, it seems to me in general
persistent events should not be related to mce )

However you raise a valid point about keeping track of what events are
spooled into that buffer.

Note the plural there, it might be very desirable to allow multiple
events into a single persistent buffer.



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] perf: Add persistent event facilities
  2012-05-18 11:24       ` Peter Zijlstra
@ 2012-05-18 11:59         ` Ingo Molnar
  2012-05-18 12:55           ` Borislav Petkov
  0 siblings, 1 reply; 32+ messages in thread
From: Ingo Molnar @ 2012-05-18 11:59 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Borislav Petkov, Frederic Weisbecker, Ingo Molnar,
	Steven Rostedt, LKML, Borislav Petkov


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2012-05-18 at 13:03 +0200, Borislav Petkov wrote:
> > >  - you allow an arbitrary place in debugfs; this might make finding
> > > them 'interesting'. Should we put them all in the same place?
> > 
> > My take on this is that we want to be able to make the same events we
> > have now, persistent. Basically not trace for the duration of a child
> > process but in a process-agnostic way, system-wide.
> 
> This would argue against per-task persistent events.. 

Yeah, 'persistency' is something that's per CPU at minimum.

> Note the plural there, it might be very desirable to allow 
> multiple events into a single persistent buffer.

very, very, very much so. One (per CPU) buffer, with many events 
multiplexed into it.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] perf: Add persistent event facilities
  2012-05-18 11:59         ` Ingo Molnar
@ 2012-05-18 12:55           ` Borislav Petkov
  2012-05-18 13:37             ` Peter Zijlstra
  0 siblings, 1 reply; 32+ messages in thread
From: Borislav Petkov @ 2012-05-18 12:55 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Frederic Weisbecker, Ingo Molnar, Steven Rostedt,
	LKML, Borislav Petkov

On Fri, May 18, 2012 at 01:59:57PM +0200, Ingo Molnar wrote:
> 
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Fri, 2012-05-18 at 13:03 +0200, Borislav Petkov wrote:
> > > >  - you allow an arbitrary place in debugfs; this might make finding
> > > > them 'interesting'. Should we put them all in the same place?
> > > 
> > > My take on this is that we want to be able to make the same events we
> > > have now, persistent. Basically not trace for the duration of a child
> > > process but in a process-agnostic way, system-wide.
> > 
> > This would argue against per-task persistent events.. 
> 
> Yeah, 'persistency' is something that's per CPU at minimum.

Yeah, forget the per-task thing - that's me not understanding the code
fully. They need to be per CPU and "taskless."

> > Note the plural there, it might be very desirable to allow 
> > multiple events into a single persistent buffer.
> 
> very, very, very much so. One (per CPU) buffer, with many events 
> multiplexed into it.

Which begs the other question: mce could use buffers which are RO - at
least, I don't see a usecase where we want to delete entries from them -
but other persistent events users might want to delete those events to
free up the buffers.

It is an interesting question how we handle that RO/RW thing? Maybe per
event and do something of a kernel-side filtering which Peter didn't
like in Robert's IBS stuff.

Hmmm...

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] perf: Add persistent event facilities
  2012-05-18 12:55           ` Borislav Petkov
@ 2012-05-18 13:37             ` Peter Zijlstra
  2012-05-18 14:09               ` Borislav Petkov
  0 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2012-05-18 13:37 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Ingo Molnar, Frederic Weisbecker, Ingo Molnar, Steven Rostedt,
	LKML, Borislav Petkov

On Fri, 2012-05-18 at 14:55 +0200, Borislav Petkov wrote:
> It is an interesting question how we handle that RO/RW thing?

RO buffers overwrite, RW buffers loose events.

We could allow both for persistent under the constraints:

 VM_SHARED - must imply !VM_WRITE since multiple people writing to the
             control page will get 'interesting' very fast.

 !VM_SHARED - must fail mmap() of the debugfs file if there's already
              an existing user.

Note that even RW buffers only allows writes to the control page, a
write to the actual data itself will generate a fault.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] perf: Add persistent event facilities
  2012-05-18 13:37             ` Peter Zijlstra
@ 2012-05-18 14:09               ` Borislav Petkov
  2012-05-18 14:14                 ` Peter Zijlstra
  0 siblings, 1 reply; 32+ messages in thread
From: Borislav Petkov @ 2012-05-18 14:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Frederic Weisbecker, Ingo Molnar, Steven Rostedt,
	LKML, Borislav Petkov

On Fri, May 18, 2012 at 03:37:47PM +0200, Peter Zijlstra wrote:
> On Fri, 2012-05-18 at 14:55 +0200, Borislav Petkov wrote:
> > It is an interesting question how we handle that RO/RW thing?
> 
> RO buffers overwrite, RW buffers loose events.
> 
> We could allow both for persistent under the constraints:
> 
>  VM_SHARED - must imply !VM_WRITE since multiple people writing to the
>              control page will get 'interesting' very fast.

Ok, I get that one.

>  !VM_SHARED - must fail mmap() of the debugfs file if there's already
>               an existing user.

What happens here if a second user wants to enable a second set of
persistent events, allocate a second set of per-CPU buffers?

> Note that even RW buffers only allows writes to the control page, a
> write to the actual data itself will generate a fault.

Ok.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] perf: Add persistent event facilities
  2012-05-18 14:09               ` Borislav Petkov
@ 2012-05-18 14:14                 ` Peter Zijlstra
  2012-05-18 14:21                   ` Borislav Petkov
  0 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2012-05-18 14:14 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Ingo Molnar, Frederic Weisbecker, Ingo Molnar, Steven Rostedt,
	LKML, Borislav Petkov

On Fri, 2012-05-18 at 16:09 +0200, Borislav Petkov wrote:
> What happens here if a second user wants to enable a second set of
> persistent events, allocate a second set of per-CPU buffers? 

I'm not seeing the relation to VM_SHARED here. But yeah, if you want a
second set of buffers with either the same or other events, you create
another set..



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] perf: Add persistent event facilities
  2012-05-18 14:14                 ` Peter Zijlstra
@ 2012-05-18 14:21                   ` Borislav Petkov
  2012-05-18 14:37                     ` Peter Zijlstra
  0 siblings, 1 reply; 32+ messages in thread
From: Borislav Petkov @ 2012-05-18 14:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Frederic Weisbecker, Ingo Molnar, Steven Rostedt,
	LKML, Borislav Petkov

On Fri, May 18, 2012 at 04:14:22PM +0200, Peter Zijlstra wrote:
> On Fri, 2012-05-18 at 16:09 +0200, Borislav Petkov wrote:
> > What happens here if a second user wants to enable a second set of
> > persistent events, allocate a second set of per-CPU buffers? 
> 
> I'm not seeing the relation to VM_SHARED here.

Well, in the sense that if mmap fails in the !VM_SHARED case, I can't
enable any persistent events except the ones which are enabled and so
I need to allocate another set of resources for that second persistent
events user.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] perf: Add persistent event facilities
  2012-05-18 14:21                   ` Borislav Petkov
@ 2012-05-18 14:37                     ` Peter Zijlstra
  2012-05-18 15:24                       ` Borislav Petkov
  0 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2012-05-18 14:37 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Ingo Molnar, Frederic Weisbecker, Ingo Molnar, Steven Rostedt,
	LKML, Borislav Petkov

On Fri, 2012-05-18 at 16:21 +0200, Borislav Petkov wrote:
> On Fri, May 18, 2012 at 04:14:22PM +0200, Peter Zijlstra wrote:
> > On Fri, 2012-05-18 at 16:09 +0200, Borislav Petkov wrote:
> > > What happens here if a second user wants to enable a second set of
> > > persistent events, allocate a second set of per-CPU buffers? 
> > 
> > I'm not seeing the relation to VM_SHARED here.
> 
> Well, in the sense that if mmap fails in the !VM_SHARED case, I can't
> enable any persistent events except the ones which are enabled and so
> I need to allocate another set of resources for that second persistent
> events user.

Right, well, you could actually allow operations on the fd, just not a
second mapping :-)

Anyway.. I'd push that error back to the user. If they need a second
set, let them create it.

Do you really need multiple consumers for your MCE stuff? If so, what
would be the problem of using VM_SHARED?

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] perf: Add persistent event facilities
  2012-05-18 14:37                     ` Peter Zijlstra
@ 2012-05-18 15:24                       ` Borislav Petkov
  0 siblings, 0 replies; 32+ messages in thread
From: Borislav Petkov @ 2012-05-18 15:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Borislav Petkov, Ingo Molnar, Frederic Weisbecker, Ingo Molnar,
	Steven Rostedt, LKML

On Fri, May 18, 2012 at 04:37:37PM +0200, Peter Zijlstra wrote:
> > > I'm not seeing the relation to VM_SHARED here.
> > 
> > Well, in the sense that if mmap fails in the !VM_SHARED case, I can't
> > enable any persistent events except the ones which are enabled and so
> > I need to allocate another set of resources for that second persistent
> > events user.
> 
> Right, well, you could actually allow operations on the fd, just not a
> second mapping :-)
> 
> Anyway.. I'd push that error back to the user. If they need a second
> set, let them create it.
> 
> Do you really need multiple consumers for your MCE stuff? If so, what
> would be the problem of using VM_SHARED?

Nah, for MCE I'm fine with RO buffers, simple :-).

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/2] perf: Add persistent event facilities
  2012-05-18 10:14   ` Peter Zijlstra
  2012-05-18 11:03     ` Borislav Petkov
@ 2012-05-31 17:33     ` Borislav Petkov
  1 sibling, 0 replies; 32+ messages in thread
From: Borislav Petkov @ 2012-05-31 17:33 UTC (permalink / raw)
  To: Peter Zijlstra, Steven Rostedt
  Cc: Frederic Weisbecker, Ingo Molnar, LKML, Borislav Petkov

On Fri, May 18, 2012 at 12:14:00PM +0200, Peter Zijlstra wrote:
> On Wed, 2012-03-21 at 15:34 +0100, Borislav Petkov wrote:
> > +int perf_add_persistent_on_cpu(int cpu, struct pers_event_desc *desc,
> > +                              struct dentry *dir, unsigned nr_pages) 
> 
> OK, so this creates an even and registers it somewhere in debugfs.
> 
>  - you allow an arbitrary place in debugfs; this might make finding them
> 'interesting'. Should we put them all in the same place?
> 
>  - persistent events created from userspace don't seem to get a debugfs
> entry and will be lost forever?
> 
> In general I think a little more exploring of the semantics and
> ramifications might be in order.

Ok, how about the following. This is rough and not in any way ready - it
is supposed to feel out how you guys feel about it. Basically, it adds
another file in (debugfs)/tracing/events/<subsys>/<event_name>/ which
can be used for whatever.

In the persistent event case, I get this:

/mnt/dbg/tracing/events/mce/mce_record/
|-- enable
|-- filter
|-- format
|-- id
`-- pers

and 'pers' contains the file descriptor of the perf per-cpu buffer which
I can mmap in userspace. I can read that out later.

Other events can put other files in there containing other relevant
stuff for them.

The interface trace_add_file() is pretty generic (it can be done
even more so). It is not design-finished - it is only meant to show
the intent. Oh, and don't look at bob_add_trace_file() - it is a
quick'n'dirty hack for testing :-).

So, what do you guys think, Peter, Steven?

Thanks.

--
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 176a939d1547..1c13fb5e9a50 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -222,6 +222,8 @@ struct ftrace_event_call {
 #ifdef CONFIG_PERF_EVENTS
 	int				perf_refcount;
 	struct hlist_head __percpu	*perf_events;
+	/* persistent event filedes */
+	int				pers_fd;
 #endif
 };
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 29111da1d100..57ad0fbe0aef 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -980,6 +980,29 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
 	return r;
 }
 
+static ssize_t
+event_pers_fd_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	struct trace_seq *s;
+	int r;
+
+	if (*ppos)
+		return 0;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	trace_seq_init(s);
+	trace_seq_printf(s, "%d\n", call->pers_fd);
+
+	r = simple_read_from_buffer(ubuf, cnt, ppos,
+				    s->buffer, s->len);
+	kfree(s);
+	return r;
+}
+
 static const struct seq_operations show_event_seq_ops = {
 	.start = t_start,
 	.next = t_next,
@@ -1058,6 +1081,12 @@ static const struct file_operations ftrace_show_header_fops = {
 	.llseek = default_llseek,
 };
 
+static const struct file_operations ftrace_persistent_fops = {
+	.open = tracing_open_generic,
+	.read = event_pers_fd_read,
+	.llseek = default_llseek,
+};
+
 static struct dentry *event_trace_events_dir(void)
 {
 	static struct dentry *d_tracer;
@@ -1199,6 +1228,56 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
 	return 0;
 }
 
+static int __trace_add_file(struct ftrace_event_call *call, const char *fname,
+			    umode_t mode, const struct file_operations *fops)
+{
+	int ret = 0;
+	mutex_lock(&event_mutex);
+
+	if (!trace_create_file(fname, mode, call->dir, call, fops))
+		ret = -EINVAL;
+
+	mutex_unlock(&event_mutex);
+	return ret;
+}
+
+extern struct ftrace_event_call *__start_ftrace_events[];
+extern struct ftrace_event_call *__stop_ftrace_events[];
+
+#define for_each_event(event, start, end)			\
+	for (event = start;					\
+	     (unsigned long)event < (unsigned long)end;		\
+	     event++)
+
+/*
+ * Assumptions:
+ * - event debugfs dir is already initialized
+ * - trace event is not declared in a module
+ */
+int trace_add_file(const char *dir_name, const char *fname, umode_t mode,
+		   const struct file_operations *fops)
+{
+	struct ftrace_event_call *call;
+
+	if (!strlen(fname))
+		return -EINVAL;
+
+	list_for_each_entry(call, &ftrace_events, list) {
+		if (!strncmp(call->name, dir_name, strlen(dir_name)))
+			return __trace_add_file(call, fname, mode, fops);
+	}
+	return -EINVAL;
+}
+
+static __init int bob_add_trace_file(void)
+{
+	int ret = 0;
+
+	ret = trace_add_file("mce_record", "pers", 0444, &ftrace_persistent_fops);
+
+	return ret;
+}
+
 static int
 __trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
 		       const struct file_operations *id,
@@ -1292,11 +1371,6 @@ void trace_remove_event_call(struct ftrace_event_call *call)
 	mutex_unlock(&event_mutex);
 }
 
-#define for_each_event(event, start, end)			\
-	for (event = start;					\
-	     (unsigned long)event < (unsigned long)end;		\
-	     event++)
-
 #ifdef CONFIG_MODULES
 
 static LIST_HEAD(ftrace_module_file_list);
@@ -1435,9 +1509,6 @@ static struct notifier_block trace_module_nb = {
 	.priority = 0,
 };
 
-extern struct ftrace_event_call *__start_ftrace_events[];
-extern struct ftrace_event_call *__stop_ftrace_events[];
-
 static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
 
 static __init int setup_trace_event(char *str)
@@ -1753,3 +1824,4 @@ static __init int event_trace_self_tests_init(void)
 late_initcall(event_trace_self_tests_init);
 
 #endif
+late_initcall(bob_add_trace_file);


-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551

^ permalink raw reply related	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2012-05-31 17:33 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-03-21 14:34 [RFC PATCH 0/2] perf: Add persistent event facilities Borislav Petkov
2012-03-21 14:34 ` [PATCH 1/2] " Borislav Petkov
2012-05-18  9:58   ` Peter Zijlstra
2012-05-18 10:01     ` Borislav Petkov
2012-05-18 10:00   ` Peter Zijlstra
2012-05-18 10:02   ` Peter Zijlstra
2012-05-18 10:09   ` Peter Zijlstra
2012-05-18 10:49     ` Borislav Petkov
2012-05-18 10:14   ` Peter Zijlstra
2012-05-18 11:03     ` Borislav Petkov
2012-05-18 11:24       ` Peter Zijlstra
2012-05-18 11:59         ` Ingo Molnar
2012-05-18 12:55           ` Borislav Petkov
2012-05-18 13:37             ` Peter Zijlstra
2012-05-18 14:09               ` Borislav Petkov
2012-05-18 14:14                 ` Peter Zijlstra
2012-05-18 14:21                   ` Borislav Petkov
2012-05-18 14:37                     ` Peter Zijlstra
2012-05-18 15:24                       ` Borislav Petkov
2012-05-31 17:33     ` Borislav Petkov
2012-03-21 14:34 ` [PATCH 2/2] x86, mce: Add persistent MCE event Borislav Petkov
2012-03-22  8:36   ` Srivatsa S. Bhat
2012-03-22 11:40     ` Borislav Petkov
2012-03-22 11:57       ` Srivatsa S. Bhat
2012-03-23 12:31   ` Ingo Molnar
2012-03-23 13:30     ` Borislav Petkov
2012-03-24  7:37       ` Ingo Molnar
2012-03-24  9:00         ` Borislav Petkov
2012-03-24  9:15           ` Ingo Molnar
2012-05-15 15:32             ` Borislav Petkov
2012-05-18  8:18               ` Ingo Molnar
2012-05-18 10:03                 ` Borislav Petkov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).