linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/2] perf/x86/intel/pt: Optimizations
@ 2019-11-05  8:26 Alexander Shishkin
  2019-11-05  8:27 ` [PATCH 1/2] perf/x86/intel/pt: Opportunistically use single range output mode Alexander Shishkin
  2019-11-05  8:27 ` [PATCH 2/2] perf/x86/intel/pt: Prevent redundant WRMSRs Alexander Shishkin
  0 siblings, 2 replies; 5+ messages in thread
From: Alexander Shishkin @ 2019-11-05  8:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Arnaldo Carvalho de Melo, Ingo Molnar, linux-kernel, Jiri Olsa,
	Alexander Shishkin

Hi Peter,

The first one was earlier part of the AUX sampling patchset, but wasn't
strictly sampling related. The second one is based on an observation that
between high-order AUX allocations and the Single Range output, we don't
have to reprogram buffer-related MSRs all of the time.

Alexander Shishkin (2):
  perf/x86/intel/pt: Opportunistically use single range output mode
  perf/x86/intel/pt: Prevent redundant WRMSRs

 arch/x86/events/intel/pt.c | 129 ++++++++++++++++++++++++++++---------
 arch/x86/events/intel/pt.h |  14 ++--
 2 files changed, 109 insertions(+), 34 deletions(-)

-- 
2.24.0.rc1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 1/2] perf/x86/intel/pt: Opportunistically use single range output mode
  2019-11-05  8:26 [PATCH 0/2] perf/x86/intel/pt: Optimizations Alexander Shishkin
@ 2019-11-05  8:27 ` Alexander Shishkin
  2019-11-13 10:56   ` [tip: perf/core] " tip-bot2 for Alexander Shishkin
  2019-11-05  8:27 ` [PATCH 2/2] perf/x86/intel/pt: Prevent redundant WRMSRs Alexander Shishkin
  1 sibling, 1 reply; 5+ messages in thread
From: Alexander Shishkin @ 2019-11-05  8:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Arnaldo Carvalho de Melo, Ingo Molnar, linux-kernel, Jiri Olsa,
	Alexander Shishkin

Most of PT implementations support Single Range Output mode, which is
an alternative to ToPA that can be used for a single contiguous buffer
and if we don't require an interrupt, that is, in AUX snapshot mode.

Now that perf core will use high order allocations for the AUX buffer,
in many cases the first condition will also be satisfied.

The two most obvious benefits of the Single Range Output mode over the
ToPA are:
 * not having to allocate the ToPA table(s),
 * not using the ToPA walk hardware.

Make use of this functionality where available and appropriate.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
---
 arch/x86/events/intel/pt.c | 118 ++++++++++++++++++++++++++++---------
 arch/x86/events/intel/pt.h |   2 +
 2 files changed, 92 insertions(+), 28 deletions(-)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 35c7ca1ddab5..49426faf2688 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -482,6 +482,8 @@ static u64 pt_config_filters(struct perf_event *event)
 
 static void pt_config(struct perf_event *event)
 {
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+	struct pt_buffer *buf = perf_get_aux(&pt->handle);
 	u64 reg;
 
 	/* First round: clear STATUS, in particular the PSB byte counter. */
@@ -491,7 +493,9 @@ static void pt_config(struct perf_event *event)
 	}
 
 	reg = pt_config_filters(event);
-	reg |= RTIT_CTL_TOPA | RTIT_CTL_TRACEEN;
+	reg |= RTIT_CTL_TRACEEN;
+	if (!buf->single)
+		reg |= RTIT_CTL_TOPA;
 
 	/*
 	 * Previously, we had BRANCH_EN on by default, but now that PT has
@@ -543,18 +547,6 @@ static void pt_config_stop(struct perf_event *event)
 	wmb();
 }
 
-static void pt_config_buffer(void *buf, unsigned int topa_idx,
-			     unsigned int output_off)
-{
-	u64 reg;
-
-	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
-
-	reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
-
-	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
-}
-
 /**
  * struct topa - ToPA metadata
  * @list:	linkage to struct pt_buffer's list of tables
@@ -612,6 +604,26 @@ static inline phys_addr_t topa_pfn(struct topa *topa)
 #define TOPA_ENTRY_SIZE(t, i) (sizes(TOPA_ENTRY((t), (i))->size))
 #define TOPA_ENTRY_PAGES(t, i) (1 << TOPA_ENTRY((t), (i))->size)
 
+static void pt_config_buffer(struct pt_buffer *buf)
+{
+	u64 reg, mask;
+	void *base;
+
+	if (buf->single) {
+		base = buf->data_pages[0];
+		mask = (buf->nr_pages * PAGE_SIZE - 1) >> 7;
+	} else {
+		base = topa_to_page(buf->cur)->table;
+		mask = (u64)buf->cur_idx;
+	}
+
+	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(base));
+
+	reg = 0x7f | (mask << 7) | ((u64)buf->output_off << 32);
+
+	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+}
+
 /**
  * topa_alloc() - allocate page-sized ToPA table
  * @cpu:	CPU on which to allocate.
@@ -812,6 +824,11 @@ static void pt_update_head(struct pt *pt)
 	struct pt_buffer *buf = perf_get_aux(&pt->handle);
 	u64 topa_idx, base, old;
 
+	if (buf->single) {
+		local_set(&buf->data_size, buf->output_off);
+		return;
+	}
+
 	/* offset of the first region in this table from the beginning of buf */
 	base = buf->cur->offset + buf->output_off;
 
@@ -913,18 +930,21 @@ static void pt_handle_status(struct pt *pt)
  */
 static void pt_read_offset(struct pt_buffer *buf)
 {
-	u64 offset, base_topa;
+	u64 offset, base;
 	struct topa_page *tp;
 
-	rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
-	tp = phys_to_virt(base_topa);
-	buf->cur = &tp->topa;
+	if (!buf->single) {
+		rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base);
+		tp = phys_to_virt(base);
+		buf->cur = &tp->topa;
+	}
 
 	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
 	/* offset within current output region */
 	buf->output_off = offset >> 32;
 	/* index of current output region within this table */
-	buf->cur_idx = (offset & 0xffffff80) >> 7;
+	if (!buf->single)
+		buf->cur_idx = (offset & 0xffffff80) >> 7;
 }
 
 static struct topa_entry *
@@ -1040,6 +1060,9 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
 	unsigned long head = local64_read(&buf->head);
 	unsigned long idx, npages, wakeup;
 
+	if (buf->single)
+		return 0;
+
 	/* can't stop in the middle of an output region */
 	if (buf->output_off + handle->size + 1 < pt_buffer_region_size(buf)) {
 		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
@@ -1121,13 +1144,17 @@ static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
 	if (buf->snapshot)
 		head &= (buf->nr_pages << PAGE_SHIFT) - 1;
 
-	pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
-	te = pt_topa_entry_for_page(buf, pg);
+	if (!buf->single) {
+		pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
+		te = pt_topa_entry_for_page(buf, pg);
 
-	cur_tp = topa_entry_to_page(te);
-	buf->cur = &cur_tp->topa;
-	buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0);
-	buf->output_off = head & (pt_buffer_region_size(buf) - 1);
+		cur_tp = topa_entry_to_page(te);
+		buf->cur = &cur_tp->topa;
+		buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0);
+		buf->output_off = head & (pt_buffer_region_size(buf) - 1);
+	} else {
+		buf->output_off = head;
+	}
 
 	local64_set(&buf->head, head);
 	local_set(&buf->data_size, 0);
@@ -1141,6 +1168,9 @@ static void pt_buffer_fini_topa(struct pt_buffer *buf)
 {
 	struct topa *topa, *iter;
 
+	if (buf->single)
+		return;
+
 	list_for_each_entry_safe(topa, iter, &buf->tables, list) {
 		/*
 		 * right now, this is in free_aux() path only, so
@@ -1186,6 +1216,36 @@ static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu,
 	return 0;
 }
 
+static int pt_buffer_try_single(struct pt_buffer *buf, int nr_pages)
+{
+	struct page *p = virt_to_page(buf->data_pages[0]);
+	int ret = -ENOTSUPP, order = 0;
+
+	/*
+	 * We can use single range output mode
+	 * + in snapshot mode, where we don't need interrupts;
+	 * + if the hardware supports it;
+	 * + if the entire buffer is one contiguous allocation.
+	 */
+	if (!buf->snapshot)
+		goto out;
+
+	if (!intel_pt_validate_hw_cap(PT_CAP_single_range_output))
+		goto out;
+
+	if (PagePrivate(p))
+		order = page_private(p);
+
+	if (1 << order != nr_pages)
+		goto out;
+
+	buf->single = true;
+	buf->nr_pages = nr_pages;
+	ret = 0;
+out:
+	return ret;
+}
+
 /**
  * pt_buffer_setup_aux() - set up topa tables for a PT buffer
  * @cpu:	Cpu on which to allocate, -1 means current.
@@ -1230,6 +1290,10 @@ pt_buffer_setup_aux(struct perf_event *event, void **pages,
 
 	INIT_LIST_HEAD(&buf->tables);
 
+	ret = pt_buffer_try_single(buf, nr_pages);
+	if (!ret)
+		return buf;
+
 	ret = pt_buffer_init_topa(buf, cpu, nr_pages, GFP_KERNEL);
 	if (ret) {
 		kfree(buf);
@@ -1396,8 +1460,7 @@ void intel_pt_interrupt(void)
 			return;
 		}
 
-		pt_config_buffer(topa_to_page(buf->cur)->table, buf->cur_idx,
-				 buf->output_off);
+		pt_config_buffer(buf);
 		pt_config_start(event);
 	}
 }
@@ -1461,8 +1524,7 @@ static void pt_event_start(struct perf_event *event, int mode)
 	WRITE_ONCE(pt->handle_nmi, 1);
 	hwc->state = 0;
 
-	pt_config_buffer(topa_to_page(buf->cur)->table, buf->cur_idx,
-			 buf->output_off);
+	pt_config_buffer(buf);
 	pt_config(event);
 
 	return;
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 62913adf2aa9..3f26150886c2 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -64,6 +64,7 @@ struct pt_pmu {
  * @lost:	if data was lost/truncated
  * @head:	logical write offset inside the buffer
  * @snapshot:	if this is for a snapshot/overwrite counter
+ * @single:	use Single Range Output instead of ToPA
  * @stop_pos:	STOP topa entry index
  * @intr_pos:	INT topa entry index
  * @stop_te:	STOP topa entry pointer
@@ -80,6 +81,7 @@ struct pt_buffer {
 	local_t			data_size;
 	local64_t		head;
 	bool			snapshot;
+	bool			single;
 	long			stop_pos, intr_pos;
 	struct topa_entry	*stop_te, *intr_te;
 	void			**data_pages;
-- 
2.24.0.rc1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/2] perf/x86/intel/pt: Prevent redundant WRMSRs
  2019-11-05  8:26 [PATCH 0/2] perf/x86/intel/pt: Optimizations Alexander Shishkin
  2019-11-05  8:27 ` [PATCH 1/2] perf/x86/intel/pt: Opportunistically use single range output mode Alexander Shishkin
@ 2019-11-05  8:27 ` Alexander Shishkin
  2019-11-13 10:56   ` [tip: perf/core] " tip-bot2 for Alexander Shishkin
  1 sibling, 1 reply; 5+ messages in thread
From: Alexander Shishkin @ 2019-11-05  8:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Arnaldo Carvalho de Melo, Ingo Molnar, linux-kernel, Jiri Olsa,
	Alexander Shishkin

With recent optimizations to AUX and PT buffer management code (high order
AUX allocations, opportunistic Single Range Output), it is far more likely
now that the output MSRs won't need reprogramming on every sched-in.

To avoid needless WRMSRs of those registers, cache their values and only
write them when needed.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
---
 arch/x86/events/intel/pt.c | 25 ++++++++++++++++---------
 arch/x86/events/intel/pt.h | 12 ++++++++----
 2 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 49426faf2688..41c18126161c 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -606,6 +606,7 @@ static inline phys_addr_t topa_pfn(struct topa *topa)
 
 static void pt_config_buffer(struct pt_buffer *buf)
 {
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
 	u64 reg, mask;
 	void *base;
 
@@ -617,11 +618,17 @@ static void pt_config_buffer(struct pt_buffer *buf)
 		mask = (u64)buf->cur_idx;
 	}
 
-	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(base));
+	reg = virt_to_phys(base);
+	if (pt->output_base != reg) {
+		pt->output_base = reg;
+		wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, reg);
+	}
 
 	reg = 0x7f | (mask << 7) | ((u64)buf->output_off << 32);
-
-	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+	if (pt->output_mask != reg) {
+		pt->output_mask = reg;
+		wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+	}
 }
 
 /**
@@ -930,21 +937,21 @@ static void pt_handle_status(struct pt *pt)
  */
 static void pt_read_offset(struct pt_buffer *buf)
 {
-	u64 offset, base;
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
 	struct topa_page *tp;
 
 	if (!buf->single) {
-		rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base);
-		tp = phys_to_virt(base);
+		rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, pt->output_base);
+		tp = phys_to_virt(pt->output_base);
 		buf->cur = &tp->topa;
 	}
 
-	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
+	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, pt->output_mask);
 	/* offset within current output region */
-	buf->output_off = offset >> 32;
+	buf->output_off = pt->output_mask >> 32;
 	/* index of current output region within this table */
 	if (!buf->single)
-		buf->cur_idx = (offset & 0xffffff80) >> 7;
+		buf->cur_idx = (pt->output_mask & 0xffffff80) >> 7;
 }
 
 static struct topa_entry *
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 3f26150886c2..96906a62aacd 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -113,16 +113,20 @@ struct pt_filters {
 
 /**
  * struct pt - per-cpu pt context
- * @handle:	perf output handle
- * @filters:	last configured filters
- * @handle_nmi:	do handle PT PMI on this cpu, there's an active event
- * @vmx_on:	1 if VMX is ON on this cpu
+ * @handle:		perf output handle
+ * @filters:		last configured filters
+ * @handle_nmi:		do handle PT PMI on this cpu, there's an active event
+ * @vmx_on:		1 if VMX is ON on this cpu
+ * @output_base:	cached RTIT_OUTPUT_BASE MSR value
+ * @output_mask:	cached RTIT_OUTPUT_MASK MSR value
  */
 struct pt {
 	struct perf_output_handle handle;
 	struct pt_filters	filters;
 	int			handle_nmi;
 	int			vmx_on;
+	u64			output_base;
+	u64			output_mask;
 };
 
 #endif /* __INTEL_PT_H__ */
-- 
2.24.0.rc1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [tip: perf/core] perf/x86/intel/pt: Prevent redundant WRMSRs
  2019-11-05  8:27 ` [PATCH 2/2] perf/x86/intel/pt: Prevent redundant WRMSRs Alexander Shishkin
@ 2019-11-13 10:56   ` tip-bot2 for Alexander Shishkin
  0 siblings, 0 replies; 5+ messages in thread
From: tip-bot2 for Alexander Shishkin @ 2019-11-13 10:56 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Alexander Shishkin, Peter Zijlstra (Intel),
	Arnaldo Carvalho de Melo, David Ahern, Jiri Olsa, Jiri Olsa,
	Linus Torvalds, Mark Rutland, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Ingo Molnar, Borislav Petkov,
	linux-kernel

The following commit has been merged into the perf/core branch of tip:

Commit-ID:     295c52ee1485e4dee660fc1a0e6ceed6c803c9d3
Gitweb:        https://git.kernel.org/tip/295c52ee1485e4dee660fc1a0e6ceed6c803c9d3
Author:        Alexander Shishkin <alexander.shishkin@linux.intel.com>
AuthorDate:    Tue, 05 Nov 2019 10:27:01 +02:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Wed, 13 Nov 2019 11:06:18 +01:00

perf/x86/intel/pt: Prevent redundant WRMSRs

With recent optimizations to AUX and PT buffer management code (high order
AUX allocations, opportunistic Single Range Output), it is far more likely
now that the output MSRs won't need reprogramming on every sched-in.

To avoid needless WRMSRs of those registers, cache their values and only
write them when needed.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: https://lkml.kernel.org/r/20191105082701.78442-3-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/pt.c | 25 ++++++++++++++++---------
 arch/x86/events/intel/pt.h | 10 +++++++---
 2 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index c87d163..1db7a51 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -606,6 +606,7 @@ static inline phys_addr_t topa_pfn(struct topa *topa)
 
 static void pt_config_buffer(struct pt_buffer *buf)
 {
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
 	u64 reg, mask;
 	void *base;
 
@@ -617,11 +618,17 @@ static void pt_config_buffer(struct pt_buffer *buf)
 		mask = (u64)buf->cur_idx;
 	}
 
-	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(base));
+	reg = virt_to_phys(base);
+	if (pt->output_base != reg) {
+		pt->output_base = reg;
+		wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, reg);
+	}
 
 	reg = 0x7f | (mask << 7) | ((u64)buf->output_off << 32);
-
-	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+	if (pt->output_mask != reg) {
+		pt->output_mask = reg;
+		wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+	}
 }
 
 /**
@@ -930,21 +937,21 @@ static void pt_handle_status(struct pt *pt)
  */
 static void pt_read_offset(struct pt_buffer *buf)
 {
-	u64 offset, base;
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
 	struct topa_page *tp;
 
 	if (!buf->single) {
-		rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base);
-		tp = phys_to_virt(base);
+		rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, pt->output_base);
+		tp = phys_to_virt(pt->output_base);
 		buf->cur = &tp->topa;
 	}
 
-	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
+	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, pt->output_mask);
 	/* offset within current output region */
-	buf->output_off = offset >> 32;
+	buf->output_off = pt->output_mask >> 32;
 	/* index of current output region within this table */
 	if (!buf->single)
-		buf->cur_idx = (offset & 0xffffff80) >> 7;
+		buf->cur_idx = (pt->output_mask & 0xffffff80) >> 7;
 }
 
 static struct topa_entry *
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 3f78182..96906a6 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -113,16 +113,20 @@ struct pt_filters {
 
 /**
  * struct pt - per-cpu pt context
- * @handle:	perf output handle
+ * @handle:		perf output handle
  * @filters:		last configured filters
- * @handle_nmi:	do handle PT PMI on this cpu, there's an active event
- * @vmx_on:	1 if VMX is ON on this cpu
+ * @handle_nmi:		do handle PT PMI on this cpu, there's an active event
+ * @vmx_on:		1 if VMX is ON on this cpu
+ * @output_base:	cached RTIT_OUTPUT_BASE MSR value
+ * @output_mask:	cached RTIT_OUTPUT_MASK MSR value
  */
 struct pt {
 	struct perf_output_handle handle;
 	struct pt_filters	filters;
 	int			handle_nmi;
 	int			vmx_on;
+	u64			output_base;
+	u64			output_mask;
 };
 
 #endif /* __INTEL_PT_H__ */

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [tip: perf/core] perf/x86/intel/pt: Opportunistically use single range output mode
  2019-11-05  8:27 ` [PATCH 1/2] perf/x86/intel/pt: Opportunistically use single range output mode Alexander Shishkin
@ 2019-11-13 10:56   ` tip-bot2 for Alexander Shishkin
  0 siblings, 0 replies; 5+ messages in thread
From: tip-bot2 for Alexander Shishkin @ 2019-11-13 10:56 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Alexander Shishkin, Peter Zijlstra (Intel),
	Arnaldo Carvalho de Melo, David Ahern, Jiri Olsa, Jiri Olsa,
	Linus Torvalds, Mark Rutland, Namhyung Kim, Stephane Eranian,
	Thomas Gleixner, Vince Weaver, Ingo Molnar, Borislav Petkov,
	linux-kernel

The following commit has been merged into the perf/core branch of tip:

Commit-ID:     670638477aede0d7a355ced04b569214aa3feacd
Gitweb:        https://git.kernel.org/tip/670638477aede0d7a355ced04b569214aa3feacd
Author:        Alexander Shishkin <alexander.shishkin@linux.intel.com>
AuthorDate:    Tue, 05 Nov 2019 10:27:00 +02:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Wed, 13 Nov 2019 11:06:17 +01:00

perf/x86/intel/pt: Opportunistically use single range output mode

Most of PT implementations support Single Range Output mode, which is
an alternative to ToPA that can be used for a single contiguous buffer
and if we don't require an interrupt, that is, in AUX snapshot mode.

Now that perf core will use high order allocations for the AUX buffer,
in many cases the first condition will also be satisfied.

The two most obvious benefits of the Single Range Output mode over the
ToPA are:

 * not having to allocate the ToPA table(s),
 * not using the ToPA walk hardware.

Make use of this functionality where available and appropriate.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: https://lkml.kernel.org/r/20191105082701.78442-2-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/pt.c | 118 +++++++++++++++++++++++++++---------
 arch/x86/events/intel/pt.h |   2 +-
 2 files changed, 92 insertions(+), 28 deletions(-)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 2f20d5a..c87d163 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -482,6 +482,8 @@ static u64 pt_config_filters(struct perf_event *event)
 
 static void pt_config(struct perf_event *event)
 {
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+	struct pt_buffer *buf = perf_get_aux(&pt->handle);
 	u64 reg;
 
 	/* First round: clear STATUS, in particular the PSB byte counter. */
@@ -491,7 +493,9 @@ static void pt_config(struct perf_event *event)
 	}
 
 	reg = pt_config_filters(event);
-	reg |= RTIT_CTL_TOPA | RTIT_CTL_TRACEEN;
+	reg |= RTIT_CTL_TRACEEN;
+	if (!buf->single)
+		reg |= RTIT_CTL_TOPA;
 
 	/*
 	 * Previously, we had BRANCH_EN on by default, but now that PT has
@@ -543,18 +547,6 @@ static void pt_config_stop(struct perf_event *event)
 	wmb();
 }
 
-static void pt_config_buffer(void *buf, unsigned int topa_idx,
-			     unsigned int output_off)
-{
-	u64 reg;
-
-	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
-
-	reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
-
-	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
-}
-
 /**
  * struct topa - ToPA metadata
  * @list:	linkage to struct pt_buffer's list of tables
@@ -612,6 +604,26 @@ static inline phys_addr_t topa_pfn(struct topa *topa)
 #define TOPA_ENTRY_SIZE(t, i) (sizes(TOPA_ENTRY((t), (i))->size))
 #define TOPA_ENTRY_PAGES(t, i) (1 << TOPA_ENTRY((t), (i))->size)
 
+static void pt_config_buffer(struct pt_buffer *buf)
+{
+	u64 reg, mask;
+	void *base;
+
+	if (buf->single) {
+		base = buf->data_pages[0];
+		mask = (buf->nr_pages * PAGE_SIZE - 1) >> 7;
+	} else {
+		base = topa_to_page(buf->cur)->table;
+		mask = (u64)buf->cur_idx;
+	}
+
+	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(base));
+
+	reg = 0x7f | (mask << 7) | ((u64)buf->output_off << 32);
+
+	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+}
+
 /**
  * topa_alloc() - allocate page-sized ToPA table
  * @cpu:	CPU on which to allocate.
@@ -812,6 +824,11 @@ static void pt_update_head(struct pt *pt)
 	struct pt_buffer *buf = perf_get_aux(&pt->handle);
 	u64 topa_idx, base, old;
 
+	if (buf->single) {
+		local_set(&buf->data_size, buf->output_off);
+		return;
+	}
+
 	/* offset of the first region in this table from the beginning of buf */
 	base = buf->cur->offset + buf->output_off;
 
@@ -913,18 +930,21 @@ static void pt_handle_status(struct pt *pt)
  */
 static void pt_read_offset(struct pt_buffer *buf)
 {
-	u64 offset, base_topa;
+	u64 offset, base;
 	struct topa_page *tp;
 
-	rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
-	tp = phys_to_virt(base_topa);
-	buf->cur = &tp->topa;
+	if (!buf->single) {
+		rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base);
+		tp = phys_to_virt(base);
+		buf->cur = &tp->topa;
+	}
 
 	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
 	/* offset within current output region */
 	buf->output_off = offset >> 32;
 	/* index of current output region within this table */
-	buf->cur_idx = (offset & 0xffffff80) >> 7;
+	if (!buf->single)
+		buf->cur_idx = (offset & 0xffffff80) >> 7;
 }
 
 static struct topa_entry *
@@ -1040,6 +1060,9 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
 	unsigned long head = local64_read(&buf->head);
 	unsigned long idx, npages, wakeup;
 
+	if (buf->single)
+		return 0;
+
 	/* can't stop in the middle of an output region */
 	if (buf->output_off + handle->size + 1 < pt_buffer_region_size(buf)) {
 		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
@@ -1121,13 +1144,17 @@ static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
 	if (buf->snapshot)
 		head &= (buf->nr_pages << PAGE_SHIFT) - 1;
 
-	pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
-	te = pt_topa_entry_for_page(buf, pg);
+	if (!buf->single) {
+		pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
+		te = pt_topa_entry_for_page(buf, pg);
 
-	cur_tp = topa_entry_to_page(te);
-	buf->cur = &cur_tp->topa;
-	buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0);
-	buf->output_off = head & (pt_buffer_region_size(buf) - 1);
+		cur_tp = topa_entry_to_page(te);
+		buf->cur = &cur_tp->topa;
+		buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0);
+		buf->output_off = head & (pt_buffer_region_size(buf) - 1);
+	} else {
+		buf->output_off = head;
+	}
 
 	local64_set(&buf->head, head);
 	local_set(&buf->data_size, 0);
@@ -1141,6 +1168,9 @@ static void pt_buffer_fini_topa(struct pt_buffer *buf)
 {
 	struct topa *topa, *iter;
 
+	if (buf->single)
+		return;
+
 	list_for_each_entry_safe(topa, iter, &buf->tables, list) {
 		/*
 		 * right now, this is in free_aux() path only, so
@@ -1186,6 +1216,36 @@ static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu,
 	return 0;
 }
 
+static int pt_buffer_try_single(struct pt_buffer *buf, int nr_pages)
+{
+	struct page *p = virt_to_page(buf->data_pages[0]);
+	int ret = -ENOTSUPP, order = 0;
+
+	/*
+	 * We can use single range output mode
+	 * + in snapshot mode, where we don't need interrupts;
+	 * + if the hardware supports it;
+	 * + if the entire buffer is one contiguous allocation.
+	 */
+	if (!buf->snapshot)
+		goto out;
+
+	if (!intel_pt_validate_hw_cap(PT_CAP_single_range_output))
+		goto out;
+
+	if (PagePrivate(p))
+		order = page_private(p);
+
+	if (1 << order != nr_pages)
+		goto out;
+
+	buf->single = true;
+	buf->nr_pages = nr_pages;
+	ret = 0;
+out:
+	return ret;
+}
+
 /**
  * pt_buffer_setup_aux() - set up topa tables for a PT buffer
  * @cpu:	Cpu on which to allocate, -1 means current.
@@ -1230,6 +1290,10 @@ pt_buffer_setup_aux(struct perf_event *event, void **pages,
 
 	INIT_LIST_HEAD(&buf->tables);
 
+	ret = pt_buffer_try_single(buf, nr_pages);
+	if (!ret)
+		return buf;
+
 	ret = pt_buffer_init_topa(buf, cpu, nr_pages, GFP_KERNEL);
 	if (ret) {
 		kfree(buf);
@@ -1396,8 +1460,7 @@ void intel_pt_interrupt(void)
 			return;
 		}
 
-		pt_config_buffer(topa_to_page(buf->cur)->table, buf->cur_idx,
-				 buf->output_off);
+		pt_config_buffer(buf);
 		pt_config_start(event);
 	}
 }
@@ -1461,8 +1524,7 @@ static void pt_event_start(struct perf_event *event, int mode)
 	WRITE_ONCE(pt->handle_nmi, 1);
 	hwc->state = 0;
 
-	pt_config_buffer(topa_to_page(buf->cur)->table, buf->cur_idx,
-			 buf->output_off);
+	pt_config_buffer(buf);
 	pt_config(event);
 
 	return;
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 1d2bb75..3f78182 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -64,6 +64,7 @@ struct pt_pmu {
  * @lost:	if data was lost/truncated
  * @head:	logical write offset inside the buffer
  * @snapshot:	if this is for a snapshot/overwrite counter
+ * @single:	use Single Range Output instead of ToPA
  * @stop_pos:	STOP topa entry index
  * @intr_pos:	INT topa entry index
  * @stop_te:	STOP topa entry pointer
@@ -80,6 +81,7 @@ struct pt_buffer {
 	local_t			data_size;
 	local64_t		head;
 	bool			snapshot;
+	bool			single;
 	long			stop_pos, intr_pos;
 	struct topa_entry	*stop_te, *intr_te;
 	void			**data_pages;

^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2019-11-13 10:57 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-05  8:26 [PATCH 0/2] perf/x86/intel/pt: Optimizations Alexander Shishkin
2019-11-05  8:27 ` [PATCH 1/2] perf/x86/intel/pt: Opportunistically use single range output mode Alexander Shishkin
2019-11-13 10:56   ` [tip: perf/core] " tip-bot2 for Alexander Shishkin
2019-11-05  8:27 ` [PATCH 2/2] perf/x86/intel/pt: Prevent redundant WRMSRs Alexander Shishkin
2019-11-13 10:56   ` [tip: perf/core] " tip-bot2 for Alexander Shishkin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).