From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
To: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@redhat.com>,
	linux-kernel@vger.kernel.org, vince@deater.net,
	eranian@google.com, Arnaldo Carvalho de Melo <acme@infradead.org>,
	tglx@linutronix.de, ak@linux.intel.com,
	Alexander Shishkin <alexander.shishkin@linux.intel.com>
Subject: [RFC PATCH 4/6] perf: Add infrastructure for using AUX data in perf samples
Date: Fri, 23 Sep 2016 14:27:24 +0300
Message-ID: <20160923112726.5890-5-alexander.shishkin@linux.intel.com>
In-Reply-To: <20160923112726.5890-1-alexander.shishkin@linux.intel.com>

AUX data can be used to annotate perf events such as performance counters
or tracepoints/breakpoints by including it in sample records when the
PERF_SAMPLE_AUX flag is set. Such samples would be instrumental in debugging
and profiling by providing, for example, a history of the instruction flow
leading up to the event's overflow.

To facilitate this, the patch adds code to create a kernel counter with a
ring buffer that tracks and collects AUX data, which is then copied out into
the sampled events' perf data stream as samples.

The user interface is extended to allow for this; the following new attribute
fields are added (a usage sketch follows the list):

  * aux_sample_type: specifies the PMU on which the AUX data generating
                     event is created;
  * aux_sample_config: event config (maps to the attribute's config field);
  * aux_sample_size: size of the sample to be written.
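
For illustration, here is a minimal userspace sketch against the ABI proposed
in this patch (these fields are not in mainline headers); it assumes Intel PT
as the AUX source and reads its pmu::type from sysfs. Error handling is
mostly elided:

/* sketch: annotate a cycles counter with 4kB of Intel PT data per sample */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	unsigned int pt_type;
	FILE *f;
	int fd;

	/* pmu::type of the AUX data generating PMU (aux_sample_type) */
	f = fopen("/sys/bus/event_source/devices/intel_pt/type", "r");
	if (!f || fscanf(f, "%u", &pt_type) != 1)
		return 1;
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);		/* PERF_ATTR_SIZE_VER6 */
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_AUX;
	attr.aux_sample_type = pt_type;
	attr.aux_sample_config = 0;		/* PT: default config */
	attr.aux_sample_size = 4096;		/* bytes of AUX data per sample */

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	return fd < 0;
}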

The kernel counter is configured similarly to the event being annotated
with regard to filtering (exclude_{hv,idle,user,kernel}) and enabled state
(disabled, enable_on_exec), to make sure that the sampler does not track
any out-of-context activity. One sampler can be used for multiple events.
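
On the consumer side, the new data lands at the tail of each
PERF_RECORD_SAMPLE as a u64 size followed by the payload (see the
perf_event.h comment in the diff below); the size is u64-aligned, so it may
include zero padding. A sketch of pulling it out, assuming the reader has
already advanced past the preceding sample fields and that handle_aux_data()
is a hypothetical consumer:

#include <linux/types.h>
#include <string.h>

void handle_aux_data(const unsigned char *buf, __u64 size); /* hypothetical */

/* 'p' points at the PERF_SAMPLE_AUX portion of a sample record */
static void read_aux_sample(const unsigned char *p)
{
	__u64 size;

	memcpy(&size, p, sizeof(size));	/* u64 size */
	p += sizeof(size);

	if (size)			/* char data[size] */
		handle_aux_data(p, size);
}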

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
---
 include/linux/perf_event.h      |  12 ++
 include/uapi/linux/perf_event.h |  16 +-
 kernel/events/core.c            | 315 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 341 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5c5362584a..7121cf7b5c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -101,6 +101,12 @@ struct perf_branch_stack {
 	struct perf_branch_entry	entries[0];
 };
 
+struct perf_aux_record {
+	u64		size;
+	unsigned long	from;
+	unsigned long	to;
+};
+
 struct task_struct;
 
 /*
@@ -532,6 +538,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_GROUP	0x02
 #define PERF_ATTACH_TASK	0x04
 #define PERF_ATTACH_TASK_DATA	0x08
+#define PERF_ATTACH_SAMPLING	0x10
 
 struct perf_cgroup;
 struct ring_buffer;
@@ -691,6 +698,9 @@ struct perf_event {
 	perf_overflow_handler_t		overflow_handler;
 	void				*overflow_handler_context;
 
+	struct perf_event		*aux_sampler;
+	atomic_long_t			aux_samplees_count;
+
 #ifdef CONFIG_EVENT_TRACING
 	struct trace_event_call		*tp_event;
 	struct event_filter		*filter;
@@ -888,6 +898,7 @@ struct perf_sample_data {
 	 */
 	u64				addr;
 	struct perf_raw_record		*raw;
+	struct perf_aux_record		aux;
 	struct perf_branch_stack	*br_stack;
 	u64				period;
 	u64				weight;
@@ -937,6 +948,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
 	/* remaining struct members initialized in perf_prepare_sample() */
 	data->addr = addr;
 	data->raw  = NULL;
+	data->aux.from = data->aux.to = data->aux.size = 0;
 	data->br_stack = NULL;
 	data->period = period;
 	data->weight = 0;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index c66a485a24..1bf3f2c358 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_IDENTIFIER			= 1U << 16,
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
+	PERF_SAMPLE_AUX				= 1U << 19,
 
-	PERF_SAMPLE_MAX = 1U << 19,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
 };
 
 /*
@@ -273,6 +274,9 @@ enum perf_event_read_format {
 					/* add: sample_stack_user */
 #define PERF_ATTR_SIZE_VER4	104	/* add: sample_regs_intr */
 #define PERF_ATTR_SIZE_VER5	112	/* add: aux_watermark */
+#define PERF_ATTR_SIZE_VER6	136	/* add: aux_sample_type */
+					/* add: aux_sample_config */
+					/* add: aux_sample_size */
 
 /*
  * Hardware event_id to monitor via a performance monitoring event:
@@ -390,6 +394,14 @@ struct perf_event_attr {
 	__u32	aux_watermark;
 	__u16	sample_max_stack;
 	__u16	__reserved_2;	/* align to __u64 */
+
+	/*
+	 * AUX area sampling configuration
+	 */
+	__u64	aux_sample_config;	/* event config for AUX sampling */
+	__u64	aux_sample_size;	/* desired sample size */
+	__u32	aux_sample_type;	/* pmu::type of an AUX PMU */
+	__u32	__reserved_3;		/* align to __u64 */
 };
 
 #define perf_flags(attr)	(*(&(attr)->read_format + 1))
@@ -773,6 +785,8 @@ enum perf_event_type {
 	 *	{ u64			transaction; } && PERF_SAMPLE_TRANSACTION
 	 *	{ u64			abi; # enum perf_sample_regs_abi
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+	 *	{ u64			size;
+	 *	  char			data[size]; } && PERF_SAMPLE_AUX
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b64a5c611f..fdb20fdeb1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2422,6 +2422,25 @@ static void _perf_event_enable(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
 
+	if (event->aux_sampler) {
+		struct perf_event_context *sctx = event->aux_sampler->ctx;
+
+		lockdep_assert_held(&ctx->mutex);
+
+		if (sctx != ctx) {
+			sctx = perf_event_ctx_lock_nested(event->aux_sampler,
+							  SINGLE_DEPTH_NESTING);
+			if (WARN_ON_ONCE(!sctx))
+				goto done;
+		}
+
+		_perf_event_enable(event->aux_sampler);
+
+		if (sctx != ctx)
+			perf_event_ctx_unlock(event->aux_sampler, sctx);
+	}
+
+done:
 	raw_spin_lock_irq(&ctx->lock);
 	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
 	    event->state <  PERF_EVENT_STATE_ERROR) {
@@ -3855,6 +3874,8 @@ static void unaccount_freq_event(void)
 		atomic_dec(&nr_freq_events);
 }
 
+static void perf_aux_sampler_fini(struct perf_event *event);
+
 static void unaccount_event(struct perf_event *event)
 {
 	bool dec = false;
@@ -3886,6 +3907,9 @@ static void unaccount_event(struct perf_event *event)
 			schedule_delayed_work(&perf_sched_work, HZ);
 	}
 
+	if (event->attr.sample_type & PERF_SAMPLE_AUX)
+		perf_aux_sampler_fini(event);
+
 	unaccount_event_cpu(event, event->cpu);
 
 	unaccount_pmu_sb_event(event);
@@ -3993,6 +4017,23 @@ static void _free_event(struct perf_event *event)
 
 	unaccount_event(event);
 
+	if (kernel_rb_event(event)) {
+		struct perf_event_context *ctx = event->ctx;
+		unsigned long flags;
+
+		/*
+		 * This event may not be explicitly freed by
+		 * perf_event_release_kernel(); we still need to remove it
+		 * from its context.
+		 */
+		raw_spin_lock_irqsave(&ctx->lock, flags);
+		list_del_event(event, ctx);
+		raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+		ring_buffer_unaccount(event->rb, false);
+		rb_free_kernel(event->rb, event);
+	}
+
 	if (event->rb) {
 		/*
 		 * Can happen when we close an event with re-directed output.
@@ -5455,6 +5496,232 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
 	}
 }
 
+struct perf_event *__find_sampling_counter(struct perf_event_context *ctx,
+					   struct perf_event *event,
+					   struct task_struct *task)
+{
+	struct perf_event *sampler = NULL;
+
+	list_for_each_entry(sampler, &ctx->event_list, event_entry) {
+		if (kernel_rb_event(sampler) &&
+		    sampler->cpu                  == event->cpu &&
+		    sampler->attr.type            == event->attr.aux_sample_type &&
+		    sampler->attr.config          == event->attr.aux_sample_config &&
+		    sampler->attr.exclude_hv      == event->attr.exclude_hv &&
+		    sampler->attr.exclude_idle    == event->attr.exclude_idle &&
+		    sampler->attr.exclude_user    == event->attr.exclude_user &&
+		    sampler->attr.exclude_kernel  == event->attr.exclude_kernel &&
+		    sampler->attr.aux_sample_size >= event->attr.aux_sample_size &&
+		    atomic_long_inc_not_zero(&sampler->refcount))
+			return sampler;
+	}
+
+	return NULL;
+}
+
+struct perf_event *find_sampling_counter(struct pmu *pmu,
+					 struct perf_event *event,
+					 struct task_struct *task)
+{
+	struct perf_event *sampler = NULL;
+	struct perf_cpu_context *cpuctx;
+	struct perf_event_context *ctx;
+	unsigned long flags;
+
+	if (!task) {
+		if (!cpu_online(event->cpu))
+			return NULL;
+
+		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, event->cpu);
+		ctx = &cpuctx->ctx;
+		raw_spin_lock_irqsave(&ctx->lock, flags);
+	} else {
+		ctx = perf_lock_task_context(task, pmu->task_ctx_nr, &flags);
+
+		if (!ctx)
+			return NULL;
+	}
+
+	sampler = __find_sampling_counter(ctx, event, task);
+	raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+	return sampler;
+}
+
+/*
+ * Sampling AUX data in perf events is done by means of a kernel event that
+ * collects data into its own ring_buffer. This data gets copied out into the
+ * sampled event's SAMPLE_AUX records every time the sampled event overflows.
+ * One such kernel event (sampler) can be used to provide samples for multiple
+ * events (samplees) in the same context if their attributes match. Each samplee
+ * holds a reference to the sampler event; the last one out frees the sampler;
+ * perf_event_exit_task() is instructed not to free samplers directly.
+ */
+static int perf_aux_sampler_init(struct perf_event *event,
+				 struct task_struct *task,
+				 struct pmu *pmu)
+{
+	struct perf_event_attr attr;
+	struct perf_event *sampler;
+	unsigned long nr_pages;
+	int ret;
+
+	if (!pmu || !pmu->setup_aux)
+		return -ENOTSUPP;
+
+	sampler = find_sampling_counter(pmu, event, task);
+	if (!sampler) {
+		memset(&attr, 0, sizeof(attr));
+		attr.type            = pmu->type;
+		attr.config          = event->attr.aux_sample_config;
+		attr.disabled        = 1; /* see below */
+		attr.enable_on_exec  = event->attr.enable_on_exec;
+		attr.exclude_hv      = event->attr.exclude_hv;
+		attr.exclude_idle    = event->attr.exclude_idle;
+		attr.exclude_user    = event->attr.exclude_user;
+		attr.exclude_kernel  = event->attr.exclude_kernel;
+		attr.aux_sample_size = event->attr.aux_sample_size;
+
+		sampler = perf_event_create_kernel_counter(&attr, event->cpu,
+							   task, NULL, NULL);
+		if (IS_ERR(sampler))
+			return PTR_ERR(sampler);
+
+		nr_pages = 1ul << __get_order(event->attr.aux_sample_size);
+
+		ret = rb_alloc_kernel(sampler, 0, nr_pages);
+		if (ret) {
+			perf_event_release_kernel(sampler);
+			return ret;
+		}
+
+		/*
+		 * This event will be freed by the last exiting samplee;
+		 * perf_event_exit_task() should skip it over.
+		 */
+		sampler->attach_state |= PERF_ATTACH_SAMPLING;
+	}
+
+	event->aux_sampler = sampler;
+
+	if (atomic_long_inc_return(&sampler->aux_samplees_count) == 1) {
+		/*
+		 * enable the sampler here unless the original event wants
+		 * to stay disabled
+		 */
+		if (!event->attr.disabled)
+			perf_event_enable(sampler);
+	}
+
+	return 0;
+}
+
+static void perf_aux_sampler_fini(struct perf_event *event)
+{
+	struct perf_event *sampler = event->aux_sampler;
+
+	if (!sampler)
+		return;
+
+	/*
+	 * We're holding a reference to the sampler, so it's always
+	 * valid here.
+	 */
+	if (atomic_long_dec_and_test(&sampler->aux_samplees_count))
+		perf_event_disable(sampler);
+
+	/* can be last */
+	put_event(sampler);
+
+	event->aux_sampler = NULL;
+}
+
+static unsigned long perf_aux_sampler_trace(struct perf_event *event,
+					    struct perf_sample_data *data)
+{
+	struct perf_event *sampler = event->aux_sampler;
+	struct ring_buffer *rb;
+	int *disable_count;
+
+	data->aux.size = 0;
+
+	if (!sampler || READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)
+		goto out;
+
+	if (READ_ONCE(sampler->oncpu) != smp_processor_id())
+		goto out;
+
+	/*
+	 * Non-zero disable count here means that we, being the NMI
+	 * context, are racing with pmu::add or pmu::del, both of which
+	 * may lead to a dangling hardware event and all manner of mayhem.
+	 */
+	disable_count = this_cpu_ptr(sampler->pmu->pmu_disable_count);
+	if (*disable_count)
+		goto out;
+
+	perf_pmu_disable(sampler->pmu);
+
+	rb = ring_buffer_get(sampler);
+	if (!rb) {
+		perf_pmu_enable(sampler->pmu);
+		goto out;
+	}
+
+	sampler->pmu->stop(sampler, PERF_EF_UPDATE);
+
+	data->aux.to = local_read(&rb->aux_head);
+
+	if (data->aux.to < sampler->attr.aux_sample_size)
+		data->aux.from = rb->aux_nr_pages * PAGE_SIZE +
+			data->aux.to - sampler->attr.aux_sample_size;
+	else
+		data->aux.from = data->aux.to -
+			sampler->attr.aux_sample_size;
+	data->aux.size = ALIGN(sampler->attr.aux_sample_size, sizeof(u64));
+	ring_buffer_put(rb);
+
+out:
+	return data->aux.size;
+}
+
+static void perf_aux_sampler_output(struct perf_event *event,
+				    struct perf_output_handle *handle,
+				    struct perf_sample_data *data)
+{
+	struct perf_event *sampler = event->aux_sampler;
+	struct ring_buffer *rb;
+	unsigned long pad;
+	int ret;
+
+	if (WARN_ON_ONCE(!sampler || !data->aux.size))
+		goto out_enable;
+
+	rb = ring_buffer_get(sampler);
+	if (WARN_ON_ONCE(!rb))
+		goto out_enable;
+
+	ret = rb_output_aux(rb, data->aux.from, data->aux.to,
+			    (aux_copyfn)perf_output_copy, handle);
+	if (ret < 0) {
+		pr_warn_ratelimited("failed to copy trace data\n");
+		goto out;
+	}
+
+	pad = data->aux.size - ret;
+	if (pad) {
+		u64 p = 0;
+
+		perf_output_copy(handle, &p, pad);
+	}
+out:
+	ring_buffer_put(rb);
+	sampler->pmu->start(sampler, 0);
+
+out_enable:
+	perf_pmu_enable(sampler->pmu);
+}
+
 static void __perf_event_header__init_id(struct perf_event_header *header,
 					 struct perf_sample_data *data,
 					 struct perf_event *event)
@@ -5774,6 +6041,13 @@ void perf_output_sample(struct perf_output_handle *handle,
 		}
 	}
 
+	if (sample_type & PERF_SAMPLE_AUX) {
+		perf_output_put(handle, data->aux.size);
+
+		if (data->aux.size)
+			perf_aux_sampler_output(event, handle, data);
+	}
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -5907,6 +6181,14 @@ void perf_prepare_sample(struct perf_event_header *header,
 
 		header->size += size;
 	}
+
+	if (sample_type & PERF_SAMPLE_AUX) {
+		u64 size = sizeof(u64);
+
+		size += perf_aux_sampler_trace(event, data);
+
+		header->size += size;
+	}
 }
 
 static void __always_inline
@@ -6109,6 +6391,8 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
 		event->addr_filters_gen++;
 	raw_spin_unlock_irqrestore(&ifh->lock, flags);
 
+	perf_pmu_enable(event->pmu);
+
 	if (restart)
 		perf_event_stop(event, 1);
 }
@@ -6673,6 +6957,8 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
 		event->addr_filters_gen++;
 	raw_spin_unlock_irqrestore(&ifh->lock, flags);
 
+	perf_pmu_enable(event->pmu);
+
 	if (restart)
 		perf_event_stop(event, 1);
 }
@@ -9076,10 +9362,27 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	}
 
 	if (!event->parent) {
+		if (event->attr.sample_type & PERF_SAMPLE_AUX) {
+			struct pmu *aux_pmu;
+			int idx;
+
+			err = -EINVAL;
+
+			idx = srcu_read_lock(&pmus_srcu);
+			aux_pmu = __perf_find_pmu(event->attr.aux_sample_type);
+			if (aux_pmu)
+				err = perf_aux_sampler_init(event, task,
+							    aux_pmu);
+			srcu_read_unlock(&pmus_srcu, idx);
+
+			if (err)
+				goto err_addr_filters;
+		}
+
 		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
 			err = get_callchain_buffers(attr->sample_max_stack);
 			if (err)
-				goto err_addr_filters;
+				goto err_aux_sampler;
 		}
 	}
 
@@ -9088,6 +9391,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	return event;
 
+err_aux_sampler:
+	perf_aux_sampler_fini(event);
+
 err_addr_filters:
 	kfree(event->addr_filters_offs);
 
@@ -9917,6 +10223,13 @@ perf_event_exit_event(struct perf_event *child_event,
 	struct perf_event *parent_event = child_event->parent;
 
 	/*
+	 * Skip over samplers; they are released by the last holder
+	 * of their reference.
+	 */
+	if (child_event->attach_state & PERF_ATTACH_SAMPLING)
+		return;
+
+	/*
 	 * Do not destroy the 'original' grouping; because of the context
 	 * switch optimization the original events could've ended up in a
 	 * random child task.
-- 
2.9.3

Thread overview: 23+ messages
2016-09-23 11:27 [RFC PATCH 0/6] perf: Add AUX data sampling Alexander Shishkin
2016-09-23 11:27 ` [RFC PATCH 1/6] perf: Move mlock accounting to ring buffer allocation Alexander Shishkin
2016-09-23 12:14   ` Peter Zijlstra
2016-09-23 14:27     ` Alexander Shishkin
2016-09-23 15:27       ` Peter Zijlstra
2016-09-23 15:58         ` Alexander Shishkin
2016-09-23 17:26     ` Andi Kleen
2016-09-23 20:28       ` Peter Zijlstra
2016-09-26  8:27         ` Alexander Shishkin
2016-09-26  9:03           ` Peter Zijlstra
2016-09-26 12:39             ` Alexander Shishkin
2016-09-26 16:13             ` Alexander Shishkin
2016-09-23 11:27 ` [RFC PATCH 2/6] perf: Add api to (de-)allocate AUX buffers for kernel counters Alexander Shishkin
2016-09-23 11:27 ` [RFC PATCH 3/6] perf: Add a helper for looking up pmus by type Alexander Shishkin
2016-09-23 11:27 ` Alexander Shishkin [this message]
2016-09-23 11:27 ` [RFC PATCH 5/6] perf: Disable PMU around address filter adjustment Alexander Shishkin
2016-09-23 11:27 ` [RFC PATCH 6/6] perf: Disable IRQs in address filter sync path Alexander Shishkin
2016-09-26 16:18   ` Alexander Shishkin
2016-10-04 16:49     ` Mathieu Poirier
2016-09-23 11:49 ` [RFC PATCH 0/6] perf: Add AUX data sampling Peter Zijlstra
2016-09-23 17:19   ` Andi Kleen
2016-09-23 20:35     ` Peter Zijlstra
2016-09-23 22:34       ` Andi Kleen
