* [PATCH v2 1/4] perf: Allow using AUX data in perf samples
2019-10-22 9:58 [PATCH v2 0/4] perf: Add AUX data sampling Alexander Shishkin
@ 2019-10-22 9:58 ` Alexander Shishkin
2019-10-24 13:52 ` Peter Zijlstra
` (4 more replies)
2019-10-22 9:58 ` [PATCH v2 2/4] perf/x86/intel/pt: Factor out starting the trace Alexander Shishkin
` (2 subsequent siblings)
3 siblings, 5 replies; 15+ messages in thread
From: Alexander Shishkin @ 2019-10-22 9:58 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, linux-kernel, jolsa,
adrian.hunter, mathieu.poirier, mark.rutland, Alexander Shishkin
AUX data can be used to annotate perf events such as performance counters
or tracepoints/breakpoints by including it in sample records when
PERF_SAMPLE_AUX flag is set. Such samples would be instrumental in debugging
and profiling by providing, for example, a history of instruction flow
leading up to the event's overflow.
The implementation makes use of grouping an AUX event with all the events
that wish to take samples of the AUX data, such that the former is the
group leader. The samplees should also specify the desired size of the AUX
sample via attr.aux_sample_size.
AUX capable PMUs need to explicitly add support for sampling, because it
relies on a new callback to take a snapshot of the buffer without touching
the event states.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
---
include/linux/perf_event.h | 20 ++++
include/uapi/linux/perf_event.h | 7 +-
kernel/events/core.c | 159 +++++++++++++++++++++++++++++++-
kernel/events/internal.h | 1 +
kernel/events/ring_buffer.c | 36 ++++++++
5 files changed, 220 insertions(+), 3 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 587ae4d002f5..24c5093f1229 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -249,6 +249,8 @@ struct perf_event;
#define PERF_PMU_CAP_NO_EXCLUDE 0x80
#define PERF_PMU_CAP_AUX_OUTPUT 0x100
+struct perf_output_handle;
+
/**
* struct pmu - generic performance monitoring unit
*/
@@ -423,6 +425,19 @@ struct pmu {
*/
void (*free_aux) (void *aux); /* optional */
+ /*
+ * Take a snapshot of the AUX buffer without touching the event
+ * state, so that preempting ->start()/->stop() callbacks does
+ * not interfere with their logic. Called in PMI context.
+ *
+ * Returns the size of AUX data copied to the output handle.
+ *
+ * Optional.
+ */
+ long (*snapshot_aux) (struct perf_event *event,
+ struct perf_output_handle *handle,
+ unsigned long size);
+
/*
* Validate address range filters: make sure the HW supports the
* requested configuration and number of filters; return 0 if the
@@ -964,6 +979,7 @@ struct perf_sample_data {
u32 reserved;
} cpu_entry;
struct perf_callchain_entry *callchain;
+ u64 aux_size;
/*
* regs_user may point to task_pt_regs or to regs_user_copy, depending
@@ -996,6 +1012,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
data->weight = 0;
data->data_src.val = PERF_MEM_NA;
data->txn = 0;
+ data->aux_size = 0;
}
extern void perf_output_sample(struct perf_output_handle *handle,
@@ -1353,6 +1370,9 @@ extern unsigned int perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len);
extern unsigned int perf_output_skip(struct perf_output_handle *handle,
unsigned int len);
+extern long perf_output_copy_aux(struct perf_output_handle *aux_handle,
+ struct perf_output_handle *handle,
+ unsigned long from, unsigned long to);
extern int perf_swevent_get_recursion_context(void);
extern void perf_swevent_put_recursion_context(int rctx);
extern u64 perf_swevent_set_period(struct perf_event *event);
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index bb7b271397a6..84dbd1499a24 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -141,8 +141,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_TRANSACTION = 1U << 17,
PERF_SAMPLE_REGS_INTR = 1U << 18,
PERF_SAMPLE_PHYS_ADDR = 1U << 19,
+ PERF_SAMPLE_AUX = 1U << 20,
- PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 21, /* non-ABI */
__PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */
};
@@ -424,7 +425,7 @@ struct perf_event_attr {
*/
__u32 aux_watermark;
__u16 sample_max_stack;
- __u16 __reserved_2; /* align to __u64 */
+ __u16 aux_sample_size;
};
/*
@@ -864,6 +865,8 @@ enum perf_event_type {
* { u64 abi; # enum perf_sample_regs_abi
* u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
* { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR
+ * { u64 size;
+ * char data[size]; } && PERF_SAMPLE_AUX
* };
*/
PERF_RECORD_SAMPLE = 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 77793ef0d8bc..5e2c7a715434 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1953,7 +1953,10 @@ static int perf_get_aux_event(struct perf_event *event,
if (!group_leader)
return 0;
- if (!perf_aux_output_match(event, group_leader))
+ if (event->attr.aux_output && !perf_aux_output_match(event, group_leader))
+ return 0;
+
+ if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
return 0;
if (!atomic_long_inc_not_zero(&group_leader->refcount))
@@ -6192,6 +6195,121 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
}
}
+static unsigned long perf_aux_sample_size(struct perf_event *event,
+ struct perf_sample_data *data,
+ size_t size)
+{
+ struct perf_event *sampler = event->aux_event;
+ struct ring_buffer *rb;
+
+ data->aux_size = 0;
+
+ if (!sampler)
+ goto out;
+
+ if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
+ goto out;
+
+ if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
+ goto out;
+
+ rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ if (!rb)
+ goto out;
+
+ /*
+ * If this is an NMI hit inside sampling code, don't take
+ * the sample. See also perf_aux_sample_output().
+ */
+ if (READ_ONCE(rb->aux_in_sampling)) {
+ data->aux_size = 0;
+ } else {
+ size = min_t(size_t, size, perf_aux_size(rb));
+ data->aux_size = ALIGN(size, sizeof(u64));
+ }
+ ring_buffer_put(rb);
+
+out:
+ return data->aux_size;
+}
+
+long perf_pmu_aux_sample_output(struct perf_event *event,
+ struct perf_output_handle *handle,
+ unsigned long size)
+{
+ unsigned long flags;
+ long ret;
+
+ /*
+ * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
+ * paths. If we start calling them in NMI context, they may race with
+ * the IRQ ones, that is, for example, re-starting an event that's just
+ * been stopped, which is why we're using a separate callback that
+ * doesn't change the event state.
+ *
+ * IRQs need to be disabled to prevent IPIs from racing with us.
+ */
+ local_irq_save(flags);
+
+ ret = event->pmu->snapshot_aux(event, handle, size);
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static void perf_aux_sample_output(struct perf_event *event,
+ struct perf_output_handle *handle,
+ struct perf_sample_data *data)
+{
+ struct perf_event *sampler = event->aux_event;
+ unsigned long pad;
+ struct ring_buffer *rb;
+ long size;
+
+ if (WARN_ON_ONCE(!sampler || !data->aux_size))
+ return;
+
+ rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ if (!rb)
+ return;
+
+ /*
+ * Guard against NMI hits inside the critical section;
+ * see also perf_aux_sample_size().
+ */
+ WRITE_ONCE(rb->aux_in_sampling, 1);
+
+ size = perf_pmu_aux_sample_output(sampler, handle, data->aux_size);
+
+ /*
+ * An error here means that perf_output_copy() failed (returned a
+ * non-zero surplus that it didn't copy), which in its current
+ * enlightened implementation is not possible. If that changes, we'd
+ * like to know.
+ */
+ if (WARN_ON_ONCE(size < 0))
+ goto out_clear;
+
+ /*
+ * The pad comes from ALIGN()ing data->aux_size up to u64 in
+ * perf_aux_sample_size(), so should not be more than that.
+ */
+ pad = data->aux_size - size;
+ if (WARN_ON_ONCE(pad >= sizeof(u64)))
+ pad = 8;
+
+ if (pad) {
+ u64 zero = 0;
+ perf_output_copy(handle, &zero, pad);
+ }
+
+out_clear:
+ WRITE_ONCE(rb->aux_in_sampling, 0);
+
+ ring_buffer_put(rb);
+}
+
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
@@ -6511,6 +6629,13 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
perf_output_put(handle, data->phys_addr);
+ if (sample_type & PERF_SAMPLE_AUX) {
+ perf_output_put(handle, data->aux_size);
+
+ if (data->aux_size)
+ perf_aux_sample_output(event, handle, data);
+ }
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -6699,6 +6824,35 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
data->phys_addr = perf_virt_to_phys(data->addr);
+
+ if (sample_type & PERF_SAMPLE_AUX) {
+ u64 size;
+
+ header->size += sizeof(u64); /* size */
+
+ /*
+ * Given the 16bit nature of header::size, an AUX sample can
+ * easily overflow it, what with all the preceding sample bits.
+ * Make sure this doesn't happen by using up to U16_MAX bytes
+ * per sample in total (rounded down to 8 byte boundary).
+ */
+ size = min_t(size_t, U16_MAX - header->size,
+ event->attr.aux_sample_size);
+ size = rounddown(size, 8);
+ size = perf_aux_sample_size(event, data, size);
+
+ WARN_ON_ONCE(size + header->size > U16_MAX);
+ header->size += size;
+ }
+ /*
+ * If you're adding more sample types here, you likely need to do
+ * something about the overflowing header::size, like repurpose the
+ * lowest 3 bits of size, which should be always zero at the moment.
+ * This raises a more important question, do we really need 512k sized
+ * samples and why, so good argumentation is in order for whatever you
+ * do here next.
+ */
+ WARN_ON_ONCE(header->size & 7);
}
static __always_inline int
@@ -11213,6 +11367,9 @@ SYSCALL_DEFINE5(perf_event_open,
if (event->attr.aux_output && !perf_get_aux_event(event, group_leader))
goto err_locked;
+ if (event->attr.aux_sample_size && !perf_get_aux_event(event, group_leader))
+ goto err_locked;
+
/*
* Must be under the same ctx::mutex as perf_install_in_context(),
* because we need to serialize with concurrent event creation.
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 3aef4191798c..747d67f130cb 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -50,6 +50,7 @@ struct ring_buffer {
unsigned long aux_mmap_locked;
void (*free_aux)(void *);
refcount_t aux_refcount;
+ int aux_in_sampling;
void **aux_pages;
void *aux_priv;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 246c83ac5643..7ffd5c763f93 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -562,6 +562,42 @@ void *perf_get_aux(struct perf_output_handle *handle)
}
EXPORT_SYMBOL_GPL(perf_get_aux);
+/*
+ * Copy out AUX data from an AUX handle.
+ */
+long perf_output_copy_aux(struct perf_output_handle *aux_handle,
+ struct perf_output_handle *handle,
+ unsigned long from, unsigned long to)
+{
+ unsigned long tocopy, remainder, len = 0;
+ struct ring_buffer *rb = aux_handle->rb;
+ void *addr;
+
+ from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+ to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+
+ do {
+ tocopy = PAGE_SIZE - offset_in_page(from);
+ if (to > from)
+ tocopy = min(tocopy, to - from);
+ if (!tocopy)
+ break;
+
+ addr = rb->aux_pages[from >> PAGE_SHIFT];
+ addr += offset_in_page(from);
+
+ remainder = perf_output_copy(handle, addr, tocopy);
+ if (remainder)
+ return -EFAULT;
+
+ len += tocopy;
+ from += tocopy;
+ from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+ } while (to != from);
+
+ return len;
+}
+
#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
static struct page *rb_alloc_aux_page(int node, int order)
--
2.23.0
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/4] perf: Allow using AUX data in perf samples
2019-10-22 9:58 ` [PATCH v2 1/4] perf: Allow using AUX data in perf samples Alexander Shishkin
@ 2019-10-24 13:52 ` Peter Zijlstra
2019-10-24 14:01 ` Peter Zijlstra
` (3 subsequent siblings)
4 siblings, 0 replies; 15+ messages in thread
From: Peter Zijlstra @ 2019-10-24 13:52 UTC (permalink / raw)
To: Alexander Shishkin
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, linux-kernel, jolsa,
adrian.hunter, mathieu.poirier, mark.rutland
On Tue, Oct 22, 2019 at 12:58:09PM +0300, Alexander Shishkin wrote:
> @@ -964,6 +979,7 @@ struct perf_sample_data {
> u32 reserved;
> } cpu_entry;
> struct perf_callchain_entry *callchain;
> + u64 aux_size;
>
> /*
> * regs_user may point to task_pt_regs or to regs_user_copy, depending
> @@ -996,6 +1012,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
> data->weight = 0;
> data->data_src.val = PERF_MEM_NA;
> data->txn = 0;
> + data->aux_size = 0;
> }
>
That is really sad; so far we've managed to only touch a single
cacheline there, you just wrecked that.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/4] perf: Allow using AUX data in perf samples
2019-10-22 9:58 ` [PATCH v2 1/4] perf: Allow using AUX data in perf samples Alexander Shishkin
2019-10-24 13:52 ` Peter Zijlstra
@ 2019-10-24 14:01 ` Peter Zijlstra
2019-10-25 12:52 ` Alexander Shishkin
2019-10-24 14:06 ` Peter Zijlstra
` (2 subsequent siblings)
4 siblings, 1 reply; 15+ messages in thread
From: Peter Zijlstra @ 2019-10-24 14:01 UTC (permalink / raw)
To: Alexander Shishkin
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, linux-kernel, jolsa,
adrian.hunter, mathieu.poirier, mark.rutland
On Tue, Oct 22, 2019 at 12:58:09PM +0300, Alexander Shishkin wrote:
> @@ -11213,6 +11367,9 @@ SYSCALL_DEFINE5(perf_event_open,
> if (event->attr.aux_output && !perf_get_aux_event(event, group_leader))
> goto err_locked;
>
> + if (event->attr.aux_sample_size && !perf_get_aux_event(event, group_leader))
> + goto err_locked;
> +
Either aux_sample_size and aux_output are mutually exclusive, or you're
leaking a refcount on group_leader. The first wants a check, the second
wants error path fixes.
> /*
> * Must be under the same ctx::mutex as perf_install_in_context(),
> * because we need to serialize with concurrent event creation.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/4] perf: Allow using AUX data in perf samples
2019-10-24 14:01 ` Peter Zijlstra
@ 2019-10-25 12:52 ` Alexander Shishkin
0 siblings, 0 replies; 15+ messages in thread
From: Alexander Shishkin @ 2019-10-25 12:52 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, linux-kernel, jolsa,
adrian.hunter, mathieu.poirier, mark.rutland, alexander.shishkin
Peter Zijlstra <peterz@infradead.org> writes:
> On Tue, Oct 22, 2019 at 12:58:09PM +0300, Alexander Shishkin wrote:
>> @@ -11213,6 +11367,9 @@ SYSCALL_DEFINE5(perf_event_open,
>> if (event->attr.aux_output && !perf_get_aux_event(event, group_leader))
>> goto err_locked;
>>
>> + if (event->attr.aux_sample_size && !perf_get_aux_event(event, group_leader))
>> + goto err_locked;
>> +
>
> Either aux_sample_size and aux_output are mutually exclusive, or you're
> leaking a refcount on group_leader. The first wants a check, the second
> wants error path fixes.
Now you mention it, it should be possible to have both. Whether that's
useful is another question. But we only need one reference either way.
Thanks,
--
Alex
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/4] perf: Allow using AUX data in perf samples
2019-10-22 9:58 ` [PATCH v2 1/4] perf: Allow using AUX data in perf samples Alexander Shishkin
2019-10-24 13:52 ` Peter Zijlstra
2019-10-24 14:01 ` Peter Zijlstra
@ 2019-10-24 14:06 ` Peter Zijlstra
2019-10-25 12:21 ` Alexander Shishkin
2019-10-24 14:09 ` Peter Zijlstra
2019-10-24 14:12 ` Peter Zijlstra
4 siblings, 1 reply; 15+ messages in thread
From: Peter Zijlstra @ 2019-10-24 14:06 UTC (permalink / raw)
To: Alexander Shishkin
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, linux-kernel, jolsa,
adrian.hunter, mathieu.poirier, mark.rutland
On Tue, Oct 22, 2019 at 12:58:09PM +0300, Alexander Shishkin wrote:
> @@ -964,6 +979,7 @@ struct perf_sample_data {
> u32 reserved;
> } cpu_entry;
> struct perf_callchain_entry *callchain;
> + u64 aux_size;
>
> /*
> * regs_user may point to task_pt_regs or to regs_user_copy, depending
> @@ -996,6 +1012,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
> data->weight = 0;
> data->data_src.val = PERF_MEM_NA;
> data->txn = 0;
> + data->aux_size = 0;
> }
>
I don't see the need to initialize in perf_sample_data_init(), because
prepare sets it unconditionally:
> +static unsigned long perf_aux_sample_size(struct perf_event *event,
> + struct perf_sample_data *data,
> + size_t size)
> +{
> + struct perf_event *sampler = event->aux_event;
> + struct ring_buffer *rb;
> +
> + data->aux_size = 0;
> +
> + if (!sampler)
> + goto out;
> +
> + if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
> + goto out;
> +
> + if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
> + goto out;
> +
> + rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
> + if (!rb)
> + goto out;
> +
> + /*
> + * If this is an NMI hit inside sampling code, don't take
> + * the sample. See also perf_aux_sample_output().
> + */
> + if (READ_ONCE(rb->aux_in_sampling)) {
> + data->aux_size = 0;
> + } else {
> + size = min_t(size_t, size, perf_aux_size(rb));
> + data->aux_size = ALIGN(size, sizeof(u64));
> + }
> + ring_buffer_put(rb);
> +
> +out:
> + return data->aux_size;
> +}
When PERF_SAMPLE_AUX
> @@ -6699,6 +6824,35 @@ void perf_prepare_sample(struct perf_event_header *header,
>
> if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> data->phys_addr = perf_virt_to_phys(data->addr);
> +
> + if (sample_type & PERF_SAMPLE_AUX) {
> + u64 size;
> +
> + header->size += sizeof(u64); /* size */
> +
> + /*
> + * Given the 16bit nature of header::size, an AUX sample can
> + * easily overflow it, what with all the preceding sample bits.
> + * Make sure this doesn't happen by using up to U16_MAX bytes
> + * per sample in total (rounded down to 8 byte boundary).
> + */
> + size = min_t(size_t, U16_MAX - header->size,
> + event->attr.aux_sample_size);
> + size = rounddown(size, 8);
> + size = perf_aux_sample_size(event, data, size);
> +
> + WARN_ON_ONCE(size + header->size > U16_MAX);
> + header->size += size;
> + }
> + /*
> + * If you're adding more sample types here, you likely need to do
> + * something about the overflowing header::size, like repurpose the
> + * lowest 3 bits of size, which should be always zero at the moment.
> + * This raises a more important question, do we really need 512k sized
> + * samples and why, so good argumentation is in order for whatever you
> + * do here next.
> + */
> + WARN_ON_ONCE(header->size & 7);
> }
And output only looks at it when PERF_SAMPLE_AUX.
> +static void perf_aux_sample_output(struct perf_event *event,
> + struct perf_output_handle *handle,
> + struct perf_sample_data *data)
> +{
> + struct perf_event *sampler = event->aux_event;
> + unsigned long pad;
> + struct ring_buffer *rb;
> + long size;
> +
> + if (WARN_ON_ONCE(!sampler || !data->aux_size))
> + return;
> +
> + rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
> + if (!rb)
> + return;
> +
> + /*
> + * Guard against NMI hits inside the critical section;
> + * see also perf_aux_sample_size().
> + */
> + WRITE_ONCE(rb->aux_in_sampling, 1);
> +
> + size = perf_pmu_aux_sample_output(sampler, handle, data->aux_size);
> +
> + /*
> + * An error here means that perf_output_copy() failed (returned a
> + * non-zero surplus that it didn't copy), which in its current
> + * enlightened implementation is not possible. If that changes, we'd
> + * like to know.
> + */
> + if (WARN_ON_ONCE(size < 0))
> + goto out_clear;
> +
> + /*
> + * The pad comes from ALIGN()ing data->aux_size up to u64 in
> + * perf_aux_sample_size(), so should not be more than that.
> + */
> + pad = data->aux_size - size;
> + if (WARN_ON_ONCE(pad >= sizeof(u64)))
> + pad = 8;
> +
> + if (pad) {
> + u64 zero = 0;
> + perf_output_copy(handle, &zero, pad);
> + }
> +
> +out_clear:
> + WRITE_ONCE(rb->aux_in_sampling, 0);
> +
> + ring_buffer_put(rb);
> +}
> +
> @@ -6511,6 +6629,13 @@ void perf_output_sample(struct perf_output_handle *handle,
> if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> perf_output_put(handle, data->phys_addr);
>
> + if (sample_type & PERF_SAMPLE_AUX) {
> + perf_output_put(handle, data->aux_size);
> +
> + if (data->aux_size)
> + perf_aux_sample_output(event, handle, data);
> + }
> +
> if (!event->attr.watermark) {
> int wakeup_events = event->attr.wakeup_events;
So, afaict, you can simply remove the line in perf_sample_data_init().
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/4] perf: Allow using AUX data in perf samples
2019-10-24 14:06 ` Peter Zijlstra
@ 2019-10-25 12:21 ` Alexander Shishkin
0 siblings, 0 replies; 15+ messages in thread
From: Alexander Shishkin @ 2019-10-25 12:21 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, linux-kernel, jolsa,
adrian.hunter, mathieu.poirier, mark.rutland, alexander.shishkin
Peter Zijlstra <peterz@infradead.org> writes:
>> @@ -6511,6 +6629,13 @@ void perf_output_sample(struct perf_output_handle *handle,
>> if (sample_type & PERF_SAMPLE_PHYS_ADDR)
>> perf_output_put(handle, data->phys_addr);
>>
>> + if (sample_type & PERF_SAMPLE_AUX) {
>> + perf_output_put(handle, data->aux_size);
>> +
>> + if (data->aux_size)
>> + perf_aux_sample_output(event, handle, data);
>> + }
>> +
>> if (!event->attr.watermark) {
>> int wakeup_events = event->attr.wakeup_events;
>
>
> So, afaict, you can simply remove the line in perf_sample_data_init().
That's right, thanks for catching this.
Regards,
--
Alex
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/4] perf: Allow using AUX data in perf samples
2019-10-22 9:58 ` [PATCH v2 1/4] perf: Allow using AUX data in perf samples Alexander Shishkin
` (2 preceding siblings ...)
2019-10-24 14:06 ` Peter Zijlstra
@ 2019-10-24 14:09 ` Peter Zijlstra
2019-10-24 14:12 ` Peter Zijlstra
4 siblings, 0 replies; 15+ messages in thread
From: Peter Zijlstra @ 2019-10-24 14:09 UTC (permalink / raw)
To: Alexander Shishkin
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, linux-kernel, jolsa,
adrian.hunter, mathieu.poirier, mark.rutland
On Tue, Oct 22, 2019 at 12:58:09PM +0300, Alexander Shishkin wrote:
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index bb7b271397a6..84dbd1499a24 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -141,8 +141,9 @@ enum perf_event_sample_format {
> PERF_SAMPLE_TRANSACTION = 1U << 17,
> PERF_SAMPLE_REGS_INTR = 1U << 18,
> PERF_SAMPLE_PHYS_ADDR = 1U << 19,
> + PERF_SAMPLE_AUX = 1U << 20,
>
> - PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
> + PERF_SAMPLE_MAX = 1U << 21, /* non-ABI */
>
> __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */
> };
> @@ -424,7 +425,7 @@ struct perf_event_attr {
> */
> __u32 aux_watermark;
> __u16 sample_max_stack;
> - __u16 __reserved_2; /* align to __u64 */
> + __u16 aux_sample_size;
> };
While I understand; would it not be better to make that a u32 (like
sample_stack_user) ? That way we only have to ref the output format and
not also the config format to get more output.
> /*
> @@ -864,6 +865,8 @@ enum perf_event_type {
> * { u64 abi; # enum perf_sample_regs_abi
> * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
> * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR
> + * { u64 size;
> + * char data[size]; } && PERF_SAMPLE_AUX
> * };
> */
> PERF_RECORD_SAMPLE = 9,
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/4] perf: Allow using AUX data in perf samples
2019-10-22 9:58 ` [PATCH v2 1/4] perf: Allow using AUX data in perf samples Alexander Shishkin
` (3 preceding siblings ...)
2019-10-24 14:09 ` Peter Zijlstra
@ 2019-10-24 14:12 ` Peter Zijlstra
4 siblings, 0 replies; 15+ messages in thread
From: Peter Zijlstra @ 2019-10-24 14:12 UTC (permalink / raw)
To: Alexander Shishkin
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, linux-kernel, jolsa,
adrian.hunter, mathieu.poirier, mark.rutland
On Tue, Oct 22, 2019 at 12:58:09PM +0300, Alexander Shishkin wrote:
> AUX data can be used to annotate perf events such as performance counters
> or tracepoints/breakpoints by including it in sample records when
> PERF_SAMPLE_AUX flag is set. Such samples would be instrumental in debugging
> and profiling by providing, for example, a history of instruction flow
> leading up to the event's overflow.
>
> The implementation makes use of grouping an AUX event with all the events
> that wish to take samples of the AUX data, such that the former is the
> group leader. The samplees should also specify the desired size of the AUX
> sample via attr.aux_sample_size.
>
> AUX capable PMUs need to explicitly add support for sampling, because it
> relies on a new callback to take a snapshot of the buffer without touching
> the event states.
Mathieu (and other ARM-CS folks) is this something that can work for
you? It would be really bad to discover this interface can't work for
you after it's been merged.
> Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
> ---
> include/linux/perf_event.h | 20 ++++
> include/uapi/linux/perf_event.h | 7 +-
> kernel/events/core.c | 159 +++++++++++++++++++++++++++++++-
> kernel/events/internal.h | 1 +
> kernel/events/ring_buffer.c | 36 ++++++++
> 5 files changed, 220 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 587ae4d002f5..24c5093f1229 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -249,6 +249,8 @@ struct perf_event;
> #define PERF_PMU_CAP_NO_EXCLUDE 0x80
> #define PERF_PMU_CAP_AUX_OUTPUT 0x100
>
> +struct perf_output_handle;
> +
> /**
> * struct pmu - generic performance monitoring unit
> */
> @@ -423,6 +425,19 @@ struct pmu {
> */
> void (*free_aux) (void *aux); /* optional */
>
> + /*
> + * Take a snapshot of the AUX buffer without touching the event
> + * state, so that preempting ->start()/->stop() callbacks does
> + * not interfere with their logic. Called in PMI context.
> + *
> + * Returns the size of AUX data copied to the output handle.
> + *
> + * Optional.
> + */
> + long (*snapshot_aux) (struct perf_event *event,
> + struct perf_output_handle *handle,
> + unsigned long size);
> +
> /*
> * Validate address range filters: make sure the HW supports the
> * requested configuration and number of filters; return 0 if the
> @@ -964,6 +979,7 @@ struct perf_sample_data {
> u32 reserved;
> } cpu_entry;
> struct perf_callchain_entry *callchain;
> + u64 aux_size;
>
> /*
> * regs_user may point to task_pt_regs or to regs_user_copy, depending
> @@ -996,6 +1012,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
> data->weight = 0;
> data->data_src.val = PERF_MEM_NA;
> data->txn = 0;
> + data->aux_size = 0;
> }
>
> extern void perf_output_sample(struct perf_output_handle *handle,
> @@ -1353,6 +1370,9 @@ extern unsigned int perf_output_copy(struct perf_output_handle *handle,
> const void *buf, unsigned int len);
> extern unsigned int perf_output_skip(struct perf_output_handle *handle,
> unsigned int len);
> +extern long perf_output_copy_aux(struct perf_output_handle *aux_handle,
> + struct perf_output_handle *handle,
> + unsigned long from, unsigned long to);
> extern int perf_swevent_get_recursion_context(void);
> extern void perf_swevent_put_recursion_context(int rctx);
> extern u64 perf_swevent_set_period(struct perf_event *event);
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index bb7b271397a6..84dbd1499a24 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -141,8 +141,9 @@ enum perf_event_sample_format {
> PERF_SAMPLE_TRANSACTION = 1U << 17,
> PERF_SAMPLE_REGS_INTR = 1U << 18,
> PERF_SAMPLE_PHYS_ADDR = 1U << 19,
> + PERF_SAMPLE_AUX = 1U << 20,
>
> - PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
> + PERF_SAMPLE_MAX = 1U << 21, /* non-ABI */
>
> __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */
> };
> @@ -424,7 +425,7 @@ struct perf_event_attr {
> */
> __u32 aux_watermark;
> __u16 sample_max_stack;
> - __u16 __reserved_2; /* align to __u64 */
> + __u16 aux_sample_size;
> };
>
> /*
> @@ -864,6 +865,8 @@ enum perf_event_type {
> * { u64 abi; # enum perf_sample_regs_abi
> * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
> * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR
> + * { u64 size;
> + * char data[size]; } && PERF_SAMPLE_AUX
> * };
> */
> PERF_RECORD_SAMPLE = 9,
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 77793ef0d8bc..5e2c7a715434 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1953,7 +1953,10 @@ static int perf_get_aux_event(struct perf_event *event,
> if (!group_leader)
> return 0;
>
> - if (!perf_aux_output_match(event, group_leader))
> + if (event->attr.aux_output && !perf_aux_output_match(event, group_leader))
> + return 0;
> +
> + if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
> return 0;
>
> if (!atomic_long_inc_not_zero(&group_leader->refcount))
> @@ -6192,6 +6195,121 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
> }
> }
>
> +static unsigned long perf_aux_sample_size(struct perf_event *event,
> + struct perf_sample_data *data,
> + size_t size)
> +{
> + struct perf_event *sampler = event->aux_event;
> + struct ring_buffer *rb;
> +
> + data->aux_size = 0;
> +
> + if (!sampler)
> + goto out;
> +
> + if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
> + goto out;
> +
> + if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
> + goto out;
> +
> + rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
> + if (!rb)
> + goto out;
> +
> + /*
> + * If this is an NMI hit inside sampling code, don't take
> + * the sample. See also perf_aux_sample_output().
> + */
> + if (READ_ONCE(rb->aux_in_sampling)) {
> + data->aux_size = 0;
> + } else {
> + size = min_t(size_t, size, perf_aux_size(rb));
> + data->aux_size = ALIGN(size, sizeof(u64));
> + }
> + ring_buffer_put(rb);
> +
> +out:
> + return data->aux_size;
> +}
> +
> +long perf_pmu_aux_sample_output(struct perf_event *event,
> + struct perf_output_handle *handle,
> + unsigned long size)
> +{
> + unsigned long flags;
> + long ret;
> +
> + /*
> + * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
> + * paths. If we start calling them in NMI context, they may race with
> + * the IRQ ones, that is, for example, re-starting an event that's just
> + * been stopped, which is why we're using a separate callback that
> + * doesn't change the event state.
> + *
> + * IRQs need to be disabled to prevent IPIs from racing with us.
> + */
> + local_irq_save(flags);
> +
> + ret = event->pmu->snapshot_aux(event, handle, size);
> +
> + local_irq_restore(flags);
> +
> + return ret;
> +}
> +
> +static void perf_aux_sample_output(struct perf_event *event,
> + struct perf_output_handle *handle,
> + struct perf_sample_data *data)
> +{
> + struct perf_event *sampler = event->aux_event;
> + unsigned long pad;
> + struct ring_buffer *rb;
> + long size;
> +
> + if (WARN_ON_ONCE(!sampler || !data->aux_size))
> + return;
> +
> + rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
> + if (!rb)
> + return;
> +
> + /*
> + * Guard against NMI hits inside the critical section;
> + * see also perf_aux_sample_size().
> + */
> + WRITE_ONCE(rb->aux_in_sampling, 1);
> +
> + size = perf_pmu_aux_sample_output(sampler, handle, data->aux_size);
> +
> + /*
> + * An error here means that perf_output_copy() failed (returned a
> + * non-zero surplus that it didn't copy), which in its current
> + * enlightened implementation is not possible. If that changes, we'd
> + * like to know.
> + */
> + if (WARN_ON_ONCE(size < 0))
> + goto out_clear;
> +
> + /*
> + * The pad comes from ALIGN()ing data->aux_size up to u64 in
> + * perf_aux_sample_size(), so should not be more than that.
> + */
> + pad = data->aux_size - size;
> + if (WARN_ON_ONCE(pad >= sizeof(u64)))
> + pad = 8;
> +
> + if (pad) {
> + u64 zero = 0;
> + perf_output_copy(handle, &zero, pad);
> + }
> +
> +out_clear:
> + WRITE_ONCE(rb->aux_in_sampling, 0);
> +
> + ring_buffer_put(rb);
> +}
> +
> static void __perf_event_header__init_id(struct perf_event_header *header,
> struct perf_sample_data *data,
> struct perf_event *event)
> @@ -6511,6 +6629,13 @@ void perf_output_sample(struct perf_output_handle *handle,
> if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> perf_output_put(handle, data->phys_addr);
>
> + if (sample_type & PERF_SAMPLE_AUX) {
> + perf_output_put(handle, data->aux_size);
> +
> + if (data->aux_size)
> + perf_aux_sample_output(event, handle, data);
> + }
> +
> if (!event->attr.watermark) {
> int wakeup_events = event->attr.wakeup_events;
>
> @@ -6699,6 +6824,35 @@ void perf_prepare_sample(struct perf_event_header *header,
>
> if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> data->phys_addr = perf_virt_to_phys(data->addr);
> +
> + if (sample_type & PERF_SAMPLE_AUX) {
> + u64 size;
> +
> + header->size += sizeof(u64); /* size */
> +
> + /*
> + * Given the 16bit nature of header::size, an AUX sample can
> + * easily overflow it, what with all the preceding sample bits.
> + * Make sure this doesn't happen by using up to U16_MAX bytes
> + * per sample in total (rounded down to 8 byte boundary).
> + */
> + size = min_t(size_t, U16_MAX - header->size,
> + event->attr.aux_sample_size);
> + size = rounddown(size, 8);
> + size = perf_aux_sample_size(event, data, size);
> +
> + WARN_ON_ONCE(size + header->size > U16_MAX);
> + header->size += size;
> + }
> + /*
> + * If you're adding more sample types here, you likely need to do
> + * something about the overflowing header::size, like repurpose the
> + * lowest 3 bits of size, which should be always zero at the moment.
> + * This raises a more important question, do we really need 512k sized
> + * samples and why, so good argumentation is in order for whatever you
> + * do here next.
> + */
> + WARN_ON_ONCE(header->size & 7);
> }
>
> static __always_inline int
> @@ -11213,6 +11367,9 @@ SYSCALL_DEFINE5(perf_event_open,
> if (event->attr.aux_output && !perf_get_aux_event(event, group_leader))
> goto err_locked;
>
> + if (event->attr.aux_sample_size && !perf_get_aux_event(event, group_leader))
> + goto err_locked;
> +
> /*
> * Must be under the same ctx::mutex as perf_install_in_context(),
> * because we need to serialize with concurrent event creation.
> diff --git a/kernel/events/internal.h b/kernel/events/internal.h
> index 3aef4191798c..747d67f130cb 100644
> --- a/kernel/events/internal.h
> +++ b/kernel/events/internal.h
> @@ -50,6 +50,7 @@ struct ring_buffer {
> unsigned long aux_mmap_locked;
> void (*free_aux)(void *);
> refcount_t aux_refcount;
> + int aux_in_sampling;
> void **aux_pages;
> void *aux_priv;
>
> diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
> index 246c83ac5643..7ffd5c763f93 100644
> --- a/kernel/events/ring_buffer.c
> +++ b/kernel/events/ring_buffer.c
> @@ -562,6 +562,42 @@ void *perf_get_aux(struct perf_output_handle *handle)
> }
> EXPORT_SYMBOL_GPL(perf_get_aux);
>
> +/*
> + * Copy out AUX data from an AUX handle.
> + */
> +long perf_output_copy_aux(struct perf_output_handle *aux_handle,
> + struct perf_output_handle *handle,
> + unsigned long from, unsigned long to)
> +{
> + unsigned long tocopy, remainder, len = 0;
> + struct ring_buffer *rb = aux_handle->rb;
> + void *addr;
> +
> + from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
> + to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
> +
> + do {
> + tocopy = PAGE_SIZE - offset_in_page(from);
> + if (to > from)
> + tocopy = min(tocopy, to - from);
> + if (!tocopy)
> + break;
> +
> + addr = rb->aux_pages[from >> PAGE_SHIFT];
> + addr += offset_in_page(from);
> +
> + remainder = perf_output_copy(handle, addr, tocopy);
> + if (remainder)
> + return -EFAULT;
> +
> + len += tocopy;
> + from += tocopy;
> + from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
> + } while (to != from);
> +
> + return len;
> +}
> +
> #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
>
> static struct page *rb_alloc_aux_page(int node, int order)
> --
> 2.23.0
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v2 2/4] perf/x86/intel/pt: Factor out starting the trace
2019-10-22 9:58 [PATCH v2 0/4] perf: Add AUX data sampling Alexander Shishkin
2019-10-22 9:58 ` [PATCH v2 1/4] perf: Allow using AUX data in perf samples Alexander Shishkin
@ 2019-10-22 9:58 ` Alexander Shishkin
2019-10-22 9:58 ` [PATCH v2 3/4] perf/x86/intel/pt: Add sampling support Alexander Shishkin
2019-10-22 9:58 ` [PATCH v2 4/4] perf/x86/intel/pt: Opportunistically use single range output mode Alexander Shishkin
3 siblings, 0 replies; 15+ messages in thread
From: Alexander Shishkin @ 2019-10-22 9:58 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, linux-kernel, jolsa,
adrian.hunter, mathieu.poirier, mark.rutland, Alexander Shishkin
PT trace is now enabled at the bottom of the event configuration
function that takes care of all configuration bits related to a given
event, including the address filter update. This is only needed where
the event configuration changes, that is, in ->add()/->start().
In the interrupt path we can use a lighter version that keeps the
configuration intact, since it hasn't changed, and only flips the
enable bit.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
---
arch/x86/events/intel/pt.c | 22 ++++++++++++++++------
1 file changed, 16 insertions(+), 6 deletions(-)
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 05e43d0f430b..170f3b402274 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -397,6 +397,20 @@ static bool pt_event_valid(struct perf_event *event)
* These all are cpu affine and operate on a local PT
*/
+static void pt_config_start(struct perf_event *event)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ u64 ctl = event->hw.config;
+
+ ctl |= RTIT_CTL_TRACEEN;
+ if (READ_ONCE(pt->vmx_on))
+ perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL);
+ else
+ wrmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+ WRITE_ONCE(event->hw.config, ctl);
+}
+
/* Address ranges and their corresponding msr configuration registers */
static const struct pt_address_range {
unsigned long msr_a;
@@ -468,7 +482,6 @@ static u64 pt_config_filters(struct perf_event *event)
static void pt_config(struct perf_event *event)
{
- struct pt *pt = this_cpu_ptr(&pt_ctx);
u64 reg;
/* First round: clear STATUS, in particular the PSB byte counter. */
@@ -501,10 +514,7 @@ static void pt_config(struct perf_event *event)
reg |= (event->attr.config & PT_CONFIG_MASK);
event->hw.config = reg;
- if (READ_ONCE(pt->vmx_on))
- perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL);
- else
- wrmsrl(MSR_IA32_RTIT_CTL, reg);
+ pt_config_start(event);
}
static void pt_config_stop(struct perf_event *event)
@@ -1381,7 +1391,7 @@ void intel_pt_interrupt(void)
pt_config_buffer(topa_to_page(buf->cur)->table, buf->cur_idx,
buf->output_off);
- pt_config(event);
+ pt_config_start(event);
}
}
--
2.23.0
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v2 3/4] perf/x86/intel/pt: Add sampling support
2019-10-22 9:58 [PATCH v2 0/4] perf: Add AUX data sampling Alexander Shishkin
2019-10-22 9:58 ` [PATCH v2 1/4] perf: Allow using AUX data in perf samples Alexander Shishkin
2019-10-22 9:58 ` [PATCH v2 2/4] perf/x86/intel/pt: Factor out starting the trace Alexander Shishkin
@ 2019-10-22 9:58 ` Alexander Shishkin
2019-10-22 9:58 ` [PATCH v2 4/4] perf/x86/intel/pt: Opportunistically use single range output mode Alexander Shishkin
3 siblings, 0 replies; 15+ messages in thread
From: Alexander Shishkin @ 2019-10-22 9:58 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, linux-kernel, jolsa,
adrian.hunter, mathieu.poirier, mark.rutland, Alexander Shishkin
Add AUX sampling support to the PT PMU: implement an NMI-safe callback
that takes a snapshot of the buffer without touching the event states.
This is done for PT events that don't use PMIs, that is, snapshot mode
(RO mapping of the AUX area).
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
---
arch/x86/events/intel/pt.c | 54 ++++++++++++++++++++++++++++++++++++++
1 file changed, 54 insertions(+)
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 170f3b402274..2f20d5a333c1 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -1208,6 +1208,13 @@ pt_buffer_setup_aux(struct perf_event *event, void **pages,
if (!nr_pages)
return NULL;
+ /*
+ * Only support AUX sampling in snapshot mode, where we don't
+ * generate NMIs.
+ */
+ if (event->attr.aux_sample_size && !snapshot)
+ return NULL;
+
if (cpu == -1)
cpu = raw_smp_processor_id();
node = cpu_to_node(cpu);
@@ -1506,6 +1513,52 @@ static void pt_event_stop(struct perf_event *event, int mode)
}
}
+static long pt_event_snapshot_aux(struct perf_event *event,
+ struct perf_output_handle *handle,
+ unsigned long size)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+ unsigned long from = 0, to;
+ long ret;
+
+ if (WARN_ON_ONCE(!buf))
+ return 0;
+
+ /*
+ * Sampling is only allowed on snapshot events;
+ * see pt_buffer_setup_aux().
+ */
+ if (WARN_ON_ONCE(!buf->snapshot))
+ return 0;
+
+ /*
+ * Here, handle_nmi tells us if the tracing is on
+ */
+ if (READ_ONCE(pt->handle_nmi))
+ pt_config_stop(event);
+
+ pt_read_offset(buf);
+ pt_update_head(pt);
+
+ to = local_read(&buf->data_size);
+ if (to < size)
+ from = buf->nr_pages << PAGE_SHIFT;
+ from += to - size;
+
+ ret = perf_output_copy_aux(&pt->handle, handle, from, to);
+
+ /*
+ * If the tracing was on when we turned up, restart it.
+ * Compiler barrier not needed as we couldn't have been
+ * preempted by anything that touches pt->handle_nmi.
+ */
+ if (pt->handle_nmi)
+ pt_config_start(event);
+
+ return ret;
+}
+
static void pt_event_del(struct perf_event *event, int mode)
{
pt_event_stop(event, PERF_EF_UPDATE);
@@ -1625,6 +1678,7 @@ static __init int pt_init(void)
pt_pmu.pmu.del = pt_event_del;
pt_pmu.pmu.start = pt_event_start;
pt_pmu.pmu.stop = pt_event_stop;
+ pt_pmu.pmu.snapshot_aux = pt_event_snapshot_aux;
pt_pmu.pmu.read = pt_event_read;
pt_pmu.pmu.setup_aux = pt_buffer_setup_aux;
pt_pmu.pmu.free_aux = pt_buffer_free_aux;
--
2.23.0
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v2 4/4] perf/x86/intel/pt: Opportunistically use single range output mode
2019-10-22 9:58 [PATCH v2 0/4] perf: Add AUX data sampling Alexander Shishkin
` (2 preceding siblings ...)
2019-10-22 9:58 ` [PATCH v2 3/4] perf/x86/intel/pt: Add sampling support Alexander Shishkin
@ 2019-10-22 9:58 ` Alexander Shishkin
2019-10-23 15:09 ` Alexander Shishkin
2019-10-24 13:56 ` Peter Zijlstra
3 siblings, 2 replies; 15+ messages in thread
From: Alexander Shishkin @ 2019-10-22 9:58 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, linux-kernel, jolsa,
adrian.hunter, mathieu.poirier, mark.rutland, Alexander Shishkin
Most of PT implementations support Single Range Output mode, which is
an alternative to ToPA that can be used for a single contiguous buffer
and if we don't require an interrupt, that is, in AUX snapshot mode.
Now that perf core will use high order allocations for the AUX buffer,
in many cases the first condition will also be satisfied.
The two most obvious benefits of the Single Range Output mode over the
ToPA are:
* not having to allocate the ToPA table(s),
* not using the ToPA walk hardware.
Make use of this functionality where available and appropriate.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
---
arch/x86/events/intel/pt.c | 116 ++++++++++++++++++++++++++++---------
arch/x86/events/intel/pt.h | 2 +
2 files changed, 90 insertions(+), 28 deletions(-)
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 2f20d5a333c1..6edd7b785861 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -491,7 +491,9 @@ static void pt_config(struct perf_event *event)
}
reg = pt_config_filters(event);
- reg |= RTIT_CTL_TOPA | RTIT_CTL_TRACEEN;
+ reg |= RTIT_CTL_TRACEEN;
+ if (!buf->single)
+ reg |= RTIT_CTL_TOPA;
/*
* Previously, we had BRANCH_EN on by default, but now that PT has
@@ -543,18 +545,6 @@ static void pt_config_stop(struct perf_event *event)
wmb();
}
-static void pt_config_buffer(void *buf, unsigned int topa_idx,
- unsigned int output_off)
-{
- u64 reg;
-
- wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
-
- reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
-
- wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
-}
-
/**
* struct topa - ToPA metadata
* @list: linkage to struct pt_buffer's list of tables
@@ -612,6 +602,26 @@ static inline phys_addr_t topa_pfn(struct topa *topa)
#define TOPA_ENTRY_SIZE(t, i) (sizes(TOPA_ENTRY((t), (i))->size))
#define TOPA_ENTRY_PAGES(t, i) (1 << TOPA_ENTRY((t), (i))->size)
+static void pt_config_buffer(struct pt_buffer *buf)
+{
+ u64 reg, mask;
+ void *base;
+
+ if (buf->single) {
+ base = buf->data_pages[0];
+ mask = (buf->nr_pages * PAGE_SIZE - 1) >> 7;
+ } else {
+ base = topa_to_page(buf->cur)->table;
+ mask = (u64)buf->cur_idx;
+ }
+
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(base));
+
+ reg = 0x7f | (mask << 7) | ((u64)buf->output_off << 32);
+
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+}
+
/**
* topa_alloc() - allocate page-sized ToPA table
* @cpu: CPU on which to allocate.
@@ -812,6 +822,11 @@ static void pt_update_head(struct pt *pt)
struct pt_buffer *buf = perf_get_aux(&pt->handle);
u64 topa_idx, base, old;
+ if (buf->single) {
+ local_set(&buf->data_size, buf->output_off);
+ return;
+ }
+
/* offset of the first region in this table from the beginning of buf */
base = buf->cur->offset + buf->output_off;
@@ -913,18 +928,21 @@ static void pt_handle_status(struct pt *pt)
*/
static void pt_read_offset(struct pt_buffer *buf)
{
- u64 offset, base_topa;
+ u64 offset, base;
struct topa_page *tp;
- rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
- tp = phys_to_virt(base_topa);
- buf->cur = &tp->topa;
+ rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base);
+ if (!buf->single) {
+ tp = phys_to_virt(base);
+ buf->cur = &tp->topa;
+ }
rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
/* offset within current output region */
buf->output_off = offset >> 32;
/* index of current output region within this table */
- buf->cur_idx = (offset & 0xffffff80) >> 7;
+ if (!buf->single)
+ buf->cur_idx = (offset & 0xffffff80) >> 7;
}
static struct topa_entry *
@@ -1040,6 +1058,9 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
unsigned long head = local64_read(&buf->head);
unsigned long idx, npages, wakeup;
+ if (buf->single)
+ return 0;
+
/* can't stop in the middle of an output region */
if (buf->output_off + handle->size + 1 < pt_buffer_region_size(buf)) {
perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
@@ -1121,13 +1142,17 @@ static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
if (buf->snapshot)
head &= (buf->nr_pages << PAGE_SHIFT) - 1;
- pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
- te = pt_topa_entry_for_page(buf, pg);
+ if (!buf->single) {
+ pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
+ te = pt_topa_entry_for_page(buf, pg);
- cur_tp = topa_entry_to_page(te);
- buf->cur = &cur_tp->topa;
- buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0);
- buf->output_off = head & (pt_buffer_region_size(buf) - 1);
+ cur_tp = topa_entry_to_page(te);
+ buf->cur = &cur_tp->topa;
+ buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0);
+ buf->output_off = head & (pt_buffer_region_size(buf) - 1);
+ } else {
+ buf->output_off = head;
+ }
local64_set(&buf->head, head);
local_set(&buf->data_size, 0);
@@ -1141,6 +1166,9 @@ static void pt_buffer_fini_topa(struct pt_buffer *buf)
{
struct topa *topa, *iter;
+ if (buf->single)
+ return;
+
list_for_each_entry_safe(topa, iter, &buf->tables, list) {
/*
* right now, this is in free_aux() path only, so
@@ -1186,6 +1214,36 @@ static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu,
return 0;
}
+static int pt_buffer_try_single(struct pt_buffer *buf, int nr_pages)
+{
+ struct page *p = virt_to_page(buf->data_pages[0]);
+ int ret = -ENOTSUPP, order = 0;
+
+ /*
+ * We can use single range output mode
+ * + in snapshot mode, where we don't need interrupts;
+ * + if the hardware supports it;
+ * + if the entire buffer is one contiguous allocation.
+ */
+ if (!buf->snapshot)
+ goto out;
+
+ if (!intel_pt_validate_hw_cap(PT_CAP_single_range_output))
+ goto out;
+
+ if (PagePrivate(p))
+ order = page_private(p);
+
+ if (1 << order != nr_pages)
+ goto out;
+
+ buf->single = true;
+ buf->nr_pages = nr_pages;
+ ret = 0;
+out:
+ return ret;
+}
+
/**
* pt_buffer_setup_aux() - set up topa tables for a PT buffer
* @cpu: Cpu on which to allocate, -1 means current.
@@ -1230,6 +1288,10 @@ pt_buffer_setup_aux(struct perf_event *event, void **pages,
INIT_LIST_HEAD(&buf->tables);
+ ret = pt_buffer_try_single(buf, nr_pages);
+ if (!ret)
+ return buf;
+
ret = pt_buffer_init_topa(buf, cpu, nr_pages, GFP_KERNEL);
if (ret) {
kfree(buf);
@@ -1396,8 +1458,7 @@ void intel_pt_interrupt(void)
return;
}
- pt_config_buffer(topa_to_page(buf->cur)->table, buf->cur_idx,
- buf->output_off);
+ pt_config_buffer(buf);
pt_config_start(event);
}
}
@@ -1461,8 +1522,7 @@ static void pt_event_start(struct perf_event *event, int mode)
WRITE_ONCE(pt->handle_nmi, 1);
hwc->state = 0;
- pt_config_buffer(topa_to_page(buf->cur)->table, buf->cur_idx,
- buf->output_off);
+ pt_config_buffer(buf);
pt_config(event);
return;
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 1d2bb7572374..3f7818221b95 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -64,6 +64,7 @@ struct pt_pmu {
* @lost: if data was lost/truncated
* @head: logical write offset inside the buffer
* @snapshot: if this is for a snapshot/overwrite counter
+ * @single: use Single Range Output instead of ToPA
* @stop_pos: STOP topa entry index
* @intr_pos: INT topa entry index
* @stop_te: STOP topa entry pointer
@@ -80,6 +81,7 @@ struct pt_buffer {
local_t data_size;
local64_t head;
bool snapshot;
+ bool single;
long stop_pos, intr_pos;
struct topa_entry *stop_te, *intr_te;
void **data_pages;
--
2.23.0
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v2 4/4] perf/x86/intel/pt: Opportunistically use single range output mode
2019-10-22 9:58 ` [PATCH v2 4/4] perf/x86/intel/pt: Opportunistically use single range output mode Alexander Shishkin
@ 2019-10-23 15:09 ` Alexander Shishkin
2019-10-24 13:56 ` Peter Zijlstra
1 sibling, 0 replies; 15+ messages in thread
From: Alexander Shishkin @ 2019-10-23 15:09 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, linux-kernel, jolsa,
adrian.hunter, mathieu.poirier, mark.rutland, alexander.shishkin
Alexander Shishkin <alexander.shishkin@linux.intel.com> writes:
> diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
> index 2f20d5a333c1..6edd7b785861 100644
> --- a/arch/x86/events/intel/pt.c
> +++ b/arch/x86/events/intel/pt.c
> @@ -491,7 +491,9 @@ static void pt_config(struct perf_event *event)
> }
>
> reg = pt_config_filters(event);
> - reg |= RTIT_CTL_TOPA | RTIT_CTL_TRACEEN;
> + reg |= RTIT_CTL_TRACEEN;
> + if (!buf->single)
> + reg |= RTIT_CTL_TOPA;
This one is broken. The below is better.
From a18feffdc15957f6db0a686e51cddf69eef205c3 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Thu, 17 Oct 2019 15:42:15 +0300
Subject: [PATCH] perf/x86/intel/pt: Opportunistically use single range output
mode
Most of PT implementations support Single Range Output mode, which is
an alternative to ToPA that can be used for a single contiguous buffer
and if we don't require an interrupt, that is, in AUX snapshot mode.
Now that perf core will use high order allocations for the AUX buffer,
in many cases the first condition will also be satisfied.
The two most obvious benefits of the Single Range Output mode over the
ToPA are:
* not having to allocate the ToPA table(s),
* not using the ToPA walk hardware.
Make use of this functionality where available and appropriate.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
---
arch/x86/events/intel/pt.c | 118 ++++++++++++++++++++++++++++---------
arch/x86/events/intel/pt.h | 2 +
2 files changed, 92 insertions(+), 28 deletions(-)
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 2f20d5a333c1..e6a10a5dba85 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -482,6 +482,8 @@ static u64 pt_config_filters(struct perf_event *event)
static void pt_config(struct perf_event *event)
{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
u64 reg;
/* First round: clear STATUS, in particular the PSB byte counter. */
@@ -491,7 +493,9 @@ static void pt_config(struct perf_event *event)
}
reg = pt_config_filters(event);
- reg |= RTIT_CTL_TOPA | RTIT_CTL_TRACEEN;
+ reg |= RTIT_CTL_TRACEEN;
+ if (!buf->single)
+ reg |= RTIT_CTL_TOPA;
/*
* Previously, we had BRANCH_EN on by default, but now that PT has
@@ -543,18 +547,6 @@ static void pt_config_stop(struct perf_event *event)
wmb();
}
-static void pt_config_buffer(void *buf, unsigned int topa_idx,
- unsigned int output_off)
-{
- u64 reg;
-
- wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
-
- reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
-
- wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
-}
-
/**
* struct topa - ToPA metadata
* @list: linkage to struct pt_buffer's list of tables
@@ -612,6 +604,26 @@ static inline phys_addr_t topa_pfn(struct topa *topa)
#define TOPA_ENTRY_SIZE(t, i) (sizes(TOPA_ENTRY((t), (i))->size))
#define TOPA_ENTRY_PAGES(t, i) (1 << TOPA_ENTRY((t), (i))->size)
+static void pt_config_buffer(struct pt_buffer *buf)
+{
+ u64 reg, mask;
+ void *base;
+
+ if (buf->single) {
+ base = buf->data_pages[0];
+ mask = (buf->nr_pages * PAGE_SIZE - 1) >> 7;
+ } else {
+ base = topa_to_page(buf->cur)->table;
+ mask = (u64)buf->cur_idx;
+ }
+
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(base));
+
+ reg = 0x7f | (mask << 7) | ((u64)buf->output_off << 32);
+
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+}
+
/**
* topa_alloc() - allocate page-sized ToPA table
* @cpu: CPU on which to allocate.
@@ -812,6 +824,11 @@ static void pt_update_head(struct pt *pt)
struct pt_buffer *buf = perf_get_aux(&pt->handle);
u64 topa_idx, base, old;
+ if (buf->single) {
+ local_set(&buf->data_size, buf->output_off);
+ return;
+ }
+
/* offset of the first region in this table from the beginning of buf */
base = buf->cur->offset + buf->output_off;
@@ -913,18 +930,21 @@ static void pt_handle_status(struct pt *pt)
*/
static void pt_read_offset(struct pt_buffer *buf)
{
- u64 offset, base_topa;
+ u64 offset, base;
struct topa_page *tp;
- rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
- tp = phys_to_virt(base_topa);
- buf->cur = &tp->topa;
+ rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base);
+ if (!buf->single) {
+ tp = phys_to_virt(base);
+ buf->cur = &tp->topa;
+ }
rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
/* offset within current output region */
buf->output_off = offset >> 32;
/* index of current output region within this table */
- buf->cur_idx = (offset & 0xffffff80) >> 7;
+ if (!buf->single)
+ buf->cur_idx = (offset & 0xffffff80) >> 7;
}
static struct topa_entry *
@@ -1040,6 +1060,9 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
unsigned long head = local64_read(&buf->head);
unsigned long idx, npages, wakeup;
+ if (buf->single)
+ return 0;
+
/* can't stop in the middle of an output region */
if (buf->output_off + handle->size + 1 < pt_buffer_region_size(buf)) {
perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
@@ -1121,13 +1144,17 @@ static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
if (buf->snapshot)
head &= (buf->nr_pages << PAGE_SHIFT) - 1;
- pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
- te = pt_topa_entry_for_page(buf, pg);
+ if (!buf->single) {
+ pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
+ te = pt_topa_entry_for_page(buf, pg);
- cur_tp = topa_entry_to_page(te);
- buf->cur = &cur_tp->topa;
- buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0);
- buf->output_off = head & (pt_buffer_region_size(buf) - 1);
+ cur_tp = topa_entry_to_page(te);
+ buf->cur = &cur_tp->topa;
+ buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0);
+ buf->output_off = head & (pt_buffer_region_size(buf) - 1);
+ } else {
+ buf->output_off = head;
+ }
local64_set(&buf->head, head);
local_set(&buf->data_size, 0);
@@ -1141,6 +1168,9 @@ static void pt_buffer_fini_topa(struct pt_buffer *buf)
{
struct topa *topa, *iter;
+ if (buf->single)
+ return;
+
list_for_each_entry_safe(topa, iter, &buf->tables, list) {
/*
* right now, this is in free_aux() path only, so
@@ -1186,6 +1216,36 @@ static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu,
return 0;
}
+static int pt_buffer_try_single(struct pt_buffer *buf, int nr_pages)
+{
+ struct page *p = virt_to_page(buf->data_pages[0]);
+ int ret = -ENOTSUPP, order = 0;
+
+ /*
+ * We can use single range output mode
+ * + in snapshot mode, where we don't need interrupts;
+ * + if the hardware supports it;
+ * + if the entire buffer is one contiguous allocation.
+ */
+ if (!buf->snapshot)
+ goto out;
+
+ if (!intel_pt_validate_hw_cap(PT_CAP_single_range_output))
+ goto out;
+
+ if (PagePrivate(p))
+ order = page_private(p);
+
+ if (1 << order != nr_pages)
+ goto out;
+
+ buf->single = true;
+ buf->nr_pages = nr_pages;
+ ret = 0;
+out:
+ return ret;
+}
+
/**
* pt_buffer_setup_aux() - set up topa tables for a PT buffer
* @cpu: Cpu on which to allocate, -1 means current.
@@ -1230,6 +1290,10 @@ pt_buffer_setup_aux(struct perf_event *event, void **pages,
INIT_LIST_HEAD(&buf->tables);
+ ret = pt_buffer_try_single(buf, nr_pages);
+ if (!ret)
+ return buf;
+
ret = pt_buffer_init_topa(buf, cpu, nr_pages, GFP_KERNEL);
if (ret) {
kfree(buf);
@@ -1396,8 +1460,7 @@ void intel_pt_interrupt(void)
return;
}
- pt_config_buffer(topa_to_page(buf->cur)->table, buf->cur_idx,
- buf->output_off);
+ pt_config_buffer(buf);
pt_config_start(event);
}
}
@@ -1461,8 +1524,7 @@ static void pt_event_start(struct perf_event *event, int mode)
WRITE_ONCE(pt->handle_nmi, 1);
hwc->state = 0;
- pt_config_buffer(topa_to_page(buf->cur)->table, buf->cur_idx,
- buf->output_off);
+ pt_config_buffer(buf);
pt_config(event);
return;
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 1d2bb7572374..3f7818221b95 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -64,6 +64,7 @@ struct pt_pmu {
* @lost: if data was lost/truncated
* @head: logical write offset inside the buffer
* @snapshot: if this is for a snapshot/overwrite counter
+ * @single: use Single Range Output instead of ToPA
* @stop_pos: STOP topa entry index
* @intr_pos: INT topa entry index
* @stop_te: STOP topa entry pointer
@@ -80,6 +81,7 @@ struct pt_buffer {
local_t data_size;
local64_t head;
bool snapshot;
+ bool single;
long stop_pos, intr_pos;
struct topa_entry *stop_te, *intr_te;
void **data_pages;
--
2.23.0
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v2 4/4] perf/x86/intel/pt: Opportunistically use single range output mode
2019-10-22 9:58 ` [PATCH v2 4/4] perf/x86/intel/pt: Opportunistically use single range output mode Alexander Shishkin
2019-10-23 15:09 ` Alexander Shishkin
@ 2019-10-24 13:56 ` Peter Zijlstra
2019-10-25 12:19 ` Alexander Shishkin
1 sibling, 1 reply; 15+ messages in thread
From: Peter Zijlstra @ 2019-10-24 13:56 UTC (permalink / raw)
To: Alexander Shishkin
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, linux-kernel, jolsa,
adrian.hunter, mathieu.poirier, mark.rutland
On Tue, Oct 22, 2019 at 12:58:12PM +0300, Alexander Shishkin wrote:
> Most of PT implementations support Single Range Output mode, which is
> an alternative to ToPA that can be used for a single contiguous buffer
> and if we don't require an interrupt, that is, in AUX snapshot mode.
>
> Now that perf core will use high order allocations for the AUX buffer,
> in many cases the first condition will also be satisfied.
>
> The two most obvious benefits of the Single Range Output mode over the
> ToPA are:
> * not having to allocate the ToPA table(s),
> * not using the ToPA walk hardware.
>
> Make use of this functionality where available and appropriate.
This seems independent of the snapshot stuff, right?
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 4/4] perf/x86/intel/pt: Opportunistically use single range output mode
2019-10-24 13:56 ` Peter Zijlstra
@ 2019-10-25 12:19 ` Alexander Shishkin
0 siblings, 0 replies; 15+ messages in thread
From: Alexander Shishkin @ 2019-10-25 12:19 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo, Ingo Molnar, linux-kernel, jolsa,
adrian.hunter, mathieu.poirier, mark.rutland, alexander.shishkin
Peter Zijlstra <peterz@infradead.org> writes:
> On Tue, Oct 22, 2019 at 12:58:12PM +0300, Alexander Shishkin wrote:
>> Most of PT implementations support Single Range Output mode, which is
>> an alternative to ToPA that can be used for a single contiguous buffer
>> and if we don't require an interrupt, that is, in AUX snapshot mode.
>>
>> Now that perf core will use high order allocations for the AUX buffer,
>> in many cases the first condition will also be satisfied.
>>
>> The two most obvious benefits of the Single Range Output mode over the
>> ToPA are:
>> * not having to allocate the ToPA table(s),
>> * not using the ToPA walk hardware.
>>
>> Make use of this functionality where available and appropriate.
>
> This seems independent of the snapshot stuff, right?
Yes, it depends on the previous PT driver patch for context,
though. I'll drop it from the set.
Regards,
--
Alex
^ permalink raw reply [flat|nested] 15+ messages in thread