From: Andi Kleen <ak@linux.intel.com>
To: speck@linutronix.de
Subject: [MODERATED] Re: [PATCH v2 4/8] PERFv2
Date: Tue, 5 Feb 2019 19:55:25 -0800	[thread overview]
Message-ID: <20190206035525.GM31598@tassilo.jf.intel.com> (raw)
In-Reply-To: <bb3b849d7271aa49ac362919a999e16c7c41638b.1549416008.git.ak@linux.intel.com>

[-- Attachment #1: Type: text/plain, Size: 32 bytes --]

mbox of the patches attached.



[-- Attachment #2: mbox --]
[-- Type: text/plain, Size: 34052 bytes --]

From 3c5756531a45ca9791a9c4bca295e92e063957ac Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 18 Sep 2018 13:18:07 -0700
Subject: [PATCH 1/8] x86/pmu/intel: Export number of counters in caps
Status: O
Content-Length: 1263
Lines: 45

Export the number of generic and fixed counters of the core PMU in caps/.
This helps users and tools with constructing groups. It will also become
more important with upcoming patches that can change the number of counters.
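
As an illustration (not part of the patch), a minimal user-space sketch
that reads the new caps files; the path assumes the core PMU is
registered as "cpu" and error handling is kept to a minimum:

#include <stdio.h>

static int read_cpu_pmu_cap(const char *name)
{
	char path[256];
	int val = -1;
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/bus/event_source/devices/cpu/caps/%s", name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%d", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	printf("generic counters: %d\n", read_cpu_pmu_cap("num_counter"));
	printf("fixed counters:   %d\n", read_cpu_pmu_cap("num_counter_fixed"));
	return 0;
}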

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/events/core.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 374a19712e20..58e659bfc2d9 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2248,8 +2248,28 @@ static ssize_t max_precise_show(struct device *cdev,
 
 static DEVICE_ATTR_RO(max_precise);
 
+static ssize_t num_counter_show(struct device *cdev,
+				  struct device_attribute *attr,
+				  char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.num_counters);
+}
+
+static DEVICE_ATTR_RO(num_counter);
+
+static ssize_t num_counter_fixed_show(struct device *cdev,
+				  struct device_attribute *attr,
+				  char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.num_counters_fixed);
+}
+
+static DEVICE_ATTR_RO(num_counter_fixed);
+
 static struct attribute *x86_pmu_caps_attrs[] = {
 	&dev_attr_max_precise.attr,
+	&dev_attr_num_counter.attr,
+	&dev_attr_num_counter_fixed.attr,
 	NULL
 };
 
-- 
2.17.2


From 6837c3d8994cbbc086e2a209fe00fb2f3d166b32 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Mon, 4 Feb 2019 16:32:06 -0800
Subject: [PATCH 2/8] x86/pmu/intel: Support excluding GP counters in scheduler
Status: O
Content-Length: 5829
Lines: 165

Add support for excluding GP counters per event in the x86 event
scheduler. For now this is used for a bug workaround in a follow-on
patch, but it will later also be useful for more efficient PMU
virtualization in KVM.
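
Conceptually the scheduler just masks the excluded counters out of an
event's constraint before searching for a free counter. A simplified
sketch of that idea (the actual change below threads an exclude mask
through struct perf_sched and the find_counter loops):

/*
 * Sketch only: counters whose bits are set in 'exclude' are never
 * considered for this event.
 */
static u64 effective_idxmsk(const struct event_constraint *c, u64 exclude)
{
	return c->idxmsk64 & ~exclude;
}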

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/events/core.c         | 24 +++++++++++++++++-------
 arch/x86/events/intel/uncore.c |  2 +-
 arch/x86/events/perf_event.h   |  3 ++-
 include/linux/perf_event.h     |  1 +
 4 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 58e659bfc2d9..4bad36dfcc8e 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -685,6 +685,7 @@ struct perf_sched {
 	int			max_events;
 	int			max_gp;
 	int			saved_states;
+	u64			exclude;
 	struct event_constraint	**constraints;
 	struct sched_state	state;
 	struct sched_state	saved[SCHED_STATES_MAX];
@@ -694,7 +695,7 @@ struct perf_sched {
  * Initialize interator that runs through all events and counters.
  */
 static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
-			    int num, int wmin, int wmax, int gpmax)
+			    int num, int wmin, int wmax, int gpmax, u64 exclude)
 {
 	int idx;
 
@@ -703,6 +704,7 @@ static void perf_sched_init(struct perf_sched *sched, struct event_constraint **
 	sched->max_weight	= wmax;
 	sched->max_gp		= gpmax;
 	sched->constraints	= constraints;
+	sched->exclude		= exclude;
 
 	for (idx = 0; idx < num; idx++) {
 		if (constraints[idx]->weight == wmin)
@@ -745,6 +747,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
 {
 	struct event_constraint *c;
 	int idx;
+	u64 idxmsk;
 
 	if (!sched->state.unassigned)
 		return false;
@@ -753,10 +756,12 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
 		return false;
 
 	c = sched->constraints[sched->state.event];
+	idxmsk = c->idxmsk64 & ~sched->exclude;
 	/* Prefer fixed purpose counters */
-	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
+	if (idxmsk & (~0ULL << INTEL_PMC_IDX_FIXED)) {
 		idx = INTEL_PMC_IDX_FIXED;
-		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
+		for_each_set_bit_from(idx, (unsigned long *)&idxmsk,
+				      X86_PMC_IDX_MAX) {
 			if (!__test_and_set_bit(idx, sched->state.used))
 				goto done;
 		}
@@ -764,7 +769,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
 
 	/* Grab the first unused counter starting with idx */
 	idx = sched->state.counter;
-	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
+	for_each_set_bit_from(idx, (unsigned long *)&idxmsk, INTEL_PMC_IDX_FIXED) {
 		if (!__test_and_set_bit(idx, sched->state.used)) {
 			if (sched->state.nr_gp++ >= sched->max_gp)
 				return false;
@@ -827,11 +832,12 @@ static bool perf_sched_next_event(struct perf_sched *sched)
  * Assign a counter for each event.
  */
 int perf_assign_events(struct event_constraint **constraints, int n,
-			int wmin, int wmax, int gpmax, int *assign)
+			int wmin, int wmax, int gpmax, int *assign,
+			u64 exclude)
 {
 	struct perf_sched sched;
 
-	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
+	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax, exclude);
 
 	do {
 		if (!perf_sched_find_counter(&sched))
@@ -851,6 +857,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	struct perf_event *e;
 	int i, wmin, wmax, unsched = 0;
 	struct hw_perf_event *hwc;
+	u64 exclude = 0;
 
 	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
 
@@ -873,6 +880,8 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 		hwc = &cpuc->event_list[i]->hw;
 		c = cpuc->event_constraint[i];
 
+		exclude |= hwc->exclude;
+
 		/* never assigned */
 		if (hwc->idx == -1)
 			break;
@@ -894,6 +903,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	if (i != n) {
 		int gpmax = x86_pmu.num_counters;
 
+		gpmax = max_t(int, 0, gpmax - hweight32(exclude));
 		/*
 		 * Do not allow scheduling of more than half the available
 		 * generic counters.
@@ -909,7 +919,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 			gpmax /= 2;
 
 		unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
-					     wmax, gpmax, assign);
+					     wmax, gpmax, assign, exclude);
 	}
 
 	/*
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index d516161c00c4..cfaae41ffbb4 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -442,7 +442,7 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
 	/* slow path */
 	if (i != n)
 		ret = perf_assign_events(box->event_constraint, n,
-					 wmin, wmax, n, assign);
+					 wmin, wmax, n, assign, 0);
 
 	if (!assign || ret) {
 		for (i = 0; i < n; i++)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 78d7b7031bfc..5b08ae3edafd 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -783,7 +783,8 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
 void x86_pmu_enable_all(int added);
 
 int perf_assign_events(struct event_constraint **constraints, int n,
-			int wmin, int wmax, int gpmax, int *assign);
+		       int wmin, int wmax, int gpmax, int *assign,
+		       u64 exclude);
 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
 
 void x86_pmu_stop(struct perf_event *event, int flags);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6cb5d483ab34..e7caf100d2bb 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -125,6 +125,7 @@ struct hw_perf_event {
 			u64		last_tag;
 			unsigned long	config_base;
 			unsigned long	event_base;
+			u64		exclude;
 			int		event_base_rdpmc;
 			int		idx;
 			int		last_cpu;
-- 
2.17.2


From 583e51cd2a2246421610871ee7413907345f7828 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Mon, 4 Feb 2019 16:36:40 -0800
Subject: [PATCH 3/8] x86/pmu/intel: Handle TSX with counter 3 on Skylake
Status: O
Content-Length: 5404
Lines: 158

On Skylake with recent microcode updates, due to erratum XXX,
perfmon general purpose counter 3 can be corrupted when RTM
transactions are executed.

The microcode provides a new MSR to force disable RTM
(make all RTM transactions abort).

This patch adds the low level code to manage this MSR. Depending on
a per-event flag, TSX is disabled while a perf event that uses
counter 3 is running. When the event flag is not set, counter 3 is
avoided instead. When the event is scheduled out again, TSX is
re-enabled.

This patch avoids reprogramming the MSR when counter 3
is reused for a different event.

Future patches will add the user interfaces to enable this
functionality.
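
For reference, the workaround boils down to toggling a single bit in
the new MSR around the use of counter 3. A sketch of that usage,
mirroring the definitions added to msr-index.h below (the real code
additionally tracks the MSR state per CPU to avoid redundant writes):

static void set_tsx_force_abort(bool abort_rtm)
{
	/* No read-modify-write: only bit 0 is defined so far. */
	wrmsrl(MSR_TSX_FORCE_ABORT,
	       abort_rtm ? MSR_TFA_RTM_FORCE_ABORT : 0);
}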

Signed-off-by: Andi Kleen <ak@linux.intel.com>

---
v2:
Use u8 instead of bool
Rename force_rtm_abort_active.
---
 arch/x86/events/intel/core.c       | 40 ++++++++++++++++++++++++++++++
 arch/x86/events/perf_event.h       |  7 +++++-
 arch/x86/include/asm/cpufeatures.h |  1 +
 arch/x86/include/asm/msr-index.h   |  3 +++
 4 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index daafb893449b..a24f73462a98 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2042,6 +2042,16 @@ static inline bool event_is_checkpointed(struct perf_event *event)
 	return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
 }
 
+static void intel_pmu_disable_fra(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	if (event->hw.idx == 3 && cpuc->rtm_abort_set) {
+		wrmsrl(MSR_TSX_FORCE_ABORT, 0);
+		cpuc->rtm_abort_set = false;
+	}
+}
+
 static void intel_pmu_disable_event(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
@@ -2060,6 +2070,9 @@ static void intel_pmu_disable_event(struct perf_event *event)
 	if (unlikely(event->attr.precise_ip))
 		intel_pmu_pebs_disable(event);
 
+	if (static_cpu_has(X86_FEATURE_TSX_FORCE_ABORT))
+		intel_pmu_disable_fra(event);
+
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
 		intel_pmu_disable_fixed(hwc);
 		return;
@@ -2084,6 +2097,30 @@ static void intel_pmu_read_event(struct perf_event *event)
 		x86_perf_event_update(event);
 }
 
+static bool __use_counter3(struct perf_event *event)
+{
+	return event->hw.flags & PERF_X86_EVENT_ABORT_TSX;
+}
+
+static bool use_counter3(struct perf_event *event)
+{
+	return __use_counter3(event) ||
+		(event->group_leader && __use_counter3(event->group_leader));
+}
+
+static void intel_pmu_enable_fra(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	if (use_counter3(event) &&
+	    event->hw.idx == 3 &&
+	    !cpuc->rtm_abort_set) {
+		/* No RMW, so other bits get clobbered */
+		wrmsrl(MSR_TSX_FORCE_ABORT, MSR_TFA_RTM_FORCE_ABORT);
+		cpuc->rtm_abort_set = true;
+	}
+}
+
 static void intel_pmu_enable_fixed(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
@@ -2141,6 +2178,9 @@ static void intel_pmu_enable_event(struct perf_event *event)
 	if (unlikely(event->attr.precise_ip))
 		intel_pmu_pebs_enable(event);
 
+	if (static_cpu_has(X86_FEATURE_TSX_FORCE_ABORT))
+		intel_pmu_enable_fra(event);
+
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
 		intel_pmu_enable_fixed(event);
 		return;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 5b08ae3edafd..f203d2e33710 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -70,7 +70,7 @@ struct event_constraint {
 #define PERF_X86_EVENT_EXCL_ACCT	0x0200 /* accounted EXCL event */
 #define PERF_X86_EVENT_AUTO_RELOAD	0x0400 /* use PEBS auto-reload */
 #define PERF_X86_EVENT_LARGE_PEBS	0x0800 /* use large PEBS */
-
+#define PERF_X86_EVENT_ABORT_TSX	0x1000 /* force abort TSX */
 
 struct amd_nb {
 	int nb_id;  /* NorthBridge id */
@@ -242,6 +242,11 @@ struct cpu_hw_events {
 	struct intel_excl_cntrs		*excl_cntrs;
 	int excl_thread_id; /* 0 or 1 */
 
+	/*
+	 * Manage using counter 3 on Skylake with TSX.
+	 */
+	u8		rtm_abort_set; /* We set the MSR */
+
 	/*
 	 * AMD specific bits
 	 */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 6d6122524711..981ff9479648 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -344,6 +344,7 @@
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
 #define X86_FEATURE_AVX512_4VNNIW	(18*32+ 2) /* AVX-512 Neural Network Instructions */
 #define X86_FEATURE_AVX512_4FMAPS	(18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
+#define X86_FEATURE_TSX_FORCE_ABORT	(18*32+13) /* "" TSX_FORCE_ABORT */
 #define X86_FEATURE_PCONFIG		(18*32+18) /* Intel PCONFIG */
 #define X86_FEATURE_SPEC_CTRL		(18*32+26) /* "" Speculation Control (IBRS + IBPB) */
 #define X86_FEATURE_INTEL_STIBP		(18*32+27) /* "" Single Thread Indirect Branch Predictors */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 8e40c2446fd1..d7f29ca27ca1 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -666,6 +666,9 @@
 
 #define MSR_IA32_TSC_DEADLINE		0x000006E0
 
+#define MSR_TSX_FORCE_ABORT 		0x0000010F
+#define MSR_TFA_RTM_FORCE_ABORT 			BIT_ULL(0)
+
 /* P4/Xeon+ specific */
 #define MSR_IA32_MCG_EAX		0x00000180
 #define MSR_IA32_MCG_EBX		0x00000181
-- 
2.17.2


From bb3b849d7271aa49ac362919a999e16c7c41638b Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Mon, 4 Feb 2019 16:47:07 -0800
Subject: [PATCH 4/8] x86/pmu/intel: Add perf event attribute to enable counter
 3
Status: O
Content-Length: 2976
Lines: 100

Add a per-event "force_rtm_abort" attribute that allows user
programs to opt in to using counter 3 and disabling TSX while the
perf event is active.

This allows non-root programs to use groups with 4 counters,
even if TSX is fully enabled.

The actual defaults depend on further patches.

We use config2 bit 0 in the core PMU for this interface.
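
For illustration (not part of the patch), a user-space consumer could
opt in through perf_event_open() roughly like this; the raw encoding
0x3c (unhalted core cycles) is only an example event:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int open_cycles_force_rtm_abort(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_RAW;
	attr.config = 0x3c;		/* cpu cycles (example) */
	attr.config2 = 1ULL << 0;	/* force_rtm_abort, config2:0 */

	return syscall(__NR_perf_event_open, &attr, 0 /* this task */,
		       -1 /* any cpu */, -1 /* no group */, 0);
}

With the format attribute below, the perf tool equivalent should be
something like "perf stat -e cpu/event=0x3c,force_rtm_abort=1/".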

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/events/intel/core.c | 21 +++++++++++++++++++++
 arch/x86/events/perf_event.h |  2 ++
 2 files changed, 23 insertions(+)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index a24f73462a98..c197b2f1cfcc 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3181,6 +3181,13 @@ static int intel_pmu_hw_config(struct perf_event *event)
 	if (ret)
 		return ret;
 
+	if (static_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
+		if (event->attr.config2 & FORCE_RTM_ABORT)
+			event->hw.flags |= PERF_X86_EVENT_ABORT_TSX;
+		else
+			event->hw.exclude = BIT(3);
+	}
+
 	if (event->attr.precise_ip) {
 		if (!event->attr.freq) {
 			event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
@@ -3633,6 +3640,8 @@ PMU_FORMAT_ATTR(ldlat, "config1:0-15");
 
 PMU_FORMAT_ATTR(frontend, "config1:0-23");
 
+PMU_FORMAT_ATTR(force_rtm_abort, "config2:0");
+
 static struct attribute *intel_arch3_formats_attr[] = {
 	&format_attr_event.attr,
 	&format_attr_umask.attr,
@@ -3668,6 +3677,11 @@ static struct attribute *skl_format_attr[] = {
 	NULL,
 };
 
+static struct attribute *skl_extra_format_attr[] = {
+	&format_attr_force_rtm_abort.attr,
+	NULL,
+};
+
 static __initconst const struct x86_pmu core_pmu = {
 	.name			= "core",
 	.handle_irq		= x86_pmu_handle_irq,
@@ -4133,6 +4147,7 @@ __init int intel_pmu_init(void)
 	struct attribute **mem_attr = NULL;
 	struct attribute **tsx_attr = NULL;
 	struct attribute **to_free = NULL;
+	struct attribute **to_free2 = NULL;
 	union cpuid10_edx edx;
 	union cpuid10_eax eax;
 	union cpuid10_ebx ebx;
@@ -4597,6 +4612,11 @@ __init int intel_pmu_init(void)
 			boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
 		pr_cont("Skylake events, ");
 		name = "skylake";
+		if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
+			extra_attr = merge_attr(extra_attr, skl_extra_format_attr);
+			to_free2 = extra_attr;
+			x86_pmu.attrs = merge_attr(x86_pmu.attrs, skl_extra_attr);
+		}
 		break;
 
 	default:
@@ -4710,6 +4730,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.handle_irq = intel_pmu_handle_irq_v4;
 
 	kfree(to_free);
+	kfree(to_free2);
 	return 0;
 }
 
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index f203d2e33710..ed353f9970c8 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -775,6 +775,8 @@ int x86_pmu_hw_config(struct perf_event *event);
 
 void x86_pmu_disable_all(void);
 
+#define FORCE_RTM_ABORT (1U << 0)
+
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
 					  u64 enable_mask)
 {
-- 
2.17.2


From 9d56319fee38739b876946e171dd7be749e1a97b Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Wed, 21 Nov 2018 17:12:32 -0800
Subject: [PATCH 5/8] x86/pmu/intel: Add global option for enable all counters
Status: O
Content-Length: 4261
Lines: 131

Add a global setting that allows disabling RTM when counter 3
is needed for some perf event group. Otherwise RTM is
not impacted.

This allows existing programs that want to use groups
with all four counters to run without changes.

This patch sets the default to TSX enabled, but
that could be easily changed.

It also adds a new allow_rtm attribute that allows perf
event users to override the default.
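
The resulting per-event policy can be summarized as follows (sketch
only, mirroring the intel_pmu_hw_config() change below; FORCE_RTM_ABORT
and ALLOW_RTM are config2 bits 0 and 1):

/* Sketch: may this event be scheduled on counter 3 (disabling TSX)? */
static bool event_may_use_counter3(u64 config2, bool enable_all_counters)
{
	if (config2 & FORCE_RTM_ABORT)		/* explicit per-event opt-in */
		return true;
	if (enable_all_counters && !(config2 & ALLOW_RTM))
		return true;			/* global opt-in, not overridden */
	return false;				/* keep TSX, stay off counter 3 */
}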

The trade-off in choosing the option's default is:

using 4 (or 8 with HT off) events in perf versus
allowing RTM usage while perf is active.

- Existing programs that use perf groups with 4 counters
may not retrieve perfmon data anymore. Perf usages with
at most three (or seven with HT off) counters are not
impacted. Perf usages that don't use groups will still
work, but will see increased multiplexing.

- TSX programs should not functionally break from
forcing RTM to abort, because they always need a valid
fallback path. However, they will see significantly
lower performance if they rely on TSX for performance
(all RTM transactions will run and only abort at the end),
potentially slowing them down so much that it is
equivalent to functional breakage.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/events/core.c       |  8 +++++++-
 arch/x86/events/intel/core.c | 13 ++++++++++++-
 arch/x86/events/perf_event.h |  3 +++
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 4bad36dfcc8e..fe77a6f7b57c 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2258,11 +2258,17 @@ static ssize_t max_precise_show(struct device *cdev,
 
 static DEVICE_ATTR_RO(max_precise);
 
+bool perf_enable_all_counters __read_mostly;
+
 static ssize_t num_counter_show(struct device *cdev,
 				  struct device_attribute *attr,
 				  char *buf)
 {
-	return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.num_counters);
+	int num = x86_pmu.num_counters;
+	if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) &&
+		!perf_enable_all_counters && num > 0)
+		num--;
+	return snprintf(buf, PAGE_SIZE, "%d\n", num);
 }
 
 static DEVICE_ATTR_RO(num_counter);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index c197b2f1cfcc..5d2d5851d13d 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3182,7 +3182,9 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		return ret;
 
 	if (static_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
-		if (event->attr.config2 & FORCE_RTM_ABORT)
+		if ((event->attr.config2 & FORCE_RTM_ABORT) ||
+		    (perf_enable_all_counters &&
+		     !(event->attr.config2 & ALLOW_RTM)))
 			event->hw.flags |= PERF_X86_EVENT_ABORT_TSX;
 		else
 			event->hw.exclude = BIT(3);
@@ -3641,6 +3643,7 @@ PMU_FORMAT_ATTR(ldlat, "config1:0-15");
 PMU_FORMAT_ATTR(frontend, "config1:0-23");
 
 PMU_FORMAT_ATTR(force_rtm_abort, "config2:0");
+PMU_FORMAT_ATTR(allow_rtm, "config2:1");
 
 static struct attribute *intel_arch3_formats_attr[] = {
 	&format_attr_event.attr,
@@ -3679,6 +3682,7 @@ static struct attribute *skl_format_attr[] = {
 
 static struct attribute *skl_extra_format_attr[] = {
 	&format_attr_force_rtm_abort.attr,
+	&format_attr_allow_rtm.attr,
 	NULL,
 };
 
@@ -4141,6 +4145,13 @@ get_events_attrs(struct attribute **base,
 	return attrs;
 }
 
+static DEVICE_BOOL_ATTR(enable_all_counters, 0644, perf_enable_all_counters);
+
+static struct attribute *skl_extra_attr[] = {
+	&dev_attr_enable_all_counters.attr.attr,
+	NULL,
+};
+
 __init int intel_pmu_init(void)
 {
 	struct attribute **extra_attr = NULL;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index ed353f9970c8..67e1581df96f 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -776,6 +776,7 @@ int x86_pmu_hw_config(struct perf_event *event);
 void x86_pmu_disable_all(void);
 
 #define FORCE_RTM_ABORT (1U << 0)
+#define ALLOW_RTM	(1U << 1)
 
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
 					  u64 enable_mask)
@@ -863,6 +864,8 @@ static inline int amd_pmu_init(void)
 
 #endif /* CONFIG_CPU_SUP_AMD */
 
+extern bool perf_enable_all_counters;
+
 #ifdef CONFIG_CPU_SUP_INTEL
 
 static inline bool intel_pmu_has_bts(struct perf_event *event)
-- 
2.17.2


From affcea3deaab10a79ebc9539655a8620f055137e Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 28 Sep 2018 16:03:06 -0700
Subject: [PATCH 6/8] perf stat: Make all existing groups weak
Status: O
Content-Length: 1268
Lines: 51

Now that we may have only three counters, make the --topdown and
--transaction groups weak so that they still work. If a weak group
fails to be scheduled as a whole, perf falls back to opening its
events without the group instead of erroring out.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 tools/perf/builtin-stat.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index e587808591e8..c94f5ed135f1 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -101,7 +101,7 @@ static const char *transaction_attrs = {
 	"cpu/tx-start/,"
 	"cpu/el-start/,"
 	"cpu/cycles-ct/"
-	"}"
+	"}:W"
 };
 
 /* More limited version when the CPU does not have all events. */
@@ -112,7 +112,7 @@ static const char * transaction_limited_attrs = {
 	"cycles,"
 	"cpu/cycles-t/,"
 	"cpu/tx-start/"
-	"}"
+	"}:W"
 };
 
 static const char * topdown_attrs[] = {
@@ -999,7 +999,7 @@ static int topdown_filter_events(const char **attr, char **str, bool use_group)
 	}
 	attr[i - off] = NULL;
 
-	*str = malloc(len + 1 + 2);
+	*str = malloc(len + 1 + 2 + 2);
 	if (!*str)
 		return -1;
 	s = *str;
@@ -1016,6 +1016,8 @@ static int topdown_filter_events(const char **attr, char **str, bool use_group)
 	}
 	if (use_group) {
 		s[-1] = '}';
+		*s++ = ':';
+		*s++ = 'W';
 		*s = 0;
 	} else
 		s[-1] = 0;
-- 
2.17.2


From 62292403cca4e428755d8debd4a4bb214e1d1b47 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 28 Sep 2018 16:04:08 -0700
Subject: [PATCH 7/8] perf stat: Don't count EL for --transaction with three
 counters
Status: O
Content-Length: 3235
Lines: 103

When the system has only three counters, HLE (EL) is also not
available. Detect that there are only three counters and
then automatically drop the el-start event for --transaction.
This avoids event multiplexing in this situation with no loss
of information.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 tools/perf/builtin-stat.c | 30 ++++++++++++++++++++++--------
 tools/perf/util/pmu.c     | 10 ++++++++++
 tools/perf/util/pmu.h     |  1 +
 3 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index c94f5ed135f1..59a5bf0389b7 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -104,6 +104,18 @@ static const char *transaction_attrs = {
 	"}:W"
 };
 
+static const char *transaction_noel_attrs = {
+	"task-clock,"
+	"{"
+	"instructions,"
+	"cycles,"
+	"cpu/cycles-t/,"
+	"cpu/tx-start/,"
+	"cpu/cycles-ct/"
+	"}:W"
+};
+
+
 /* More limited version when the CPU does not have all events. */
 static const char * transaction_limited_attrs = {
 	"task-clock,"
@@ -1160,6 +1172,8 @@ static int add_default_attributes(void)
 		return 0;
 
 	if (transaction_run) {
+		const char *attrs = transaction_limited_attrs;
+
 		/* Handle -T as -M transaction. Once platform specific metrics
 		 * support has been added to the json files, all archictures
 		 * will use this approach. To determine transaction support
@@ -1173,16 +1187,16 @@ static int add_default_attributes(void)
 		}
 
 		if (pmu_have_event("cpu", "cycles-ct") &&
-		    pmu_have_event("cpu", "el-start"))
-			err = parse_events(evsel_list, transaction_attrs,
-					   &errinfo);
-		else
-			err = parse_events(evsel_list,
-					   transaction_limited_attrs,
-					   &errinfo);
+		    pmu_have_event("cpu", "el-start")) {
+			if (pmu_num_counters("cpu") == 3)
+				attrs = transaction_noel_attrs;
+			else
+				attrs = transaction_attrs;
+		}
+		err = parse_events(evsel_list, attrs, &errinfo);
 		if (err) {
 			fprintf(stderr, "Cannot set up transaction events\n");
-			parse_events_print_error(&errinfo, transaction_attrs);
+			parse_events_print_error(&errinfo, attrs);
 			return -1;
 		}
 		return 0;
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 11a234740632..5e17409b7ff6 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -1495,3 +1495,13 @@ int perf_pmu__scan_file(struct perf_pmu *pmu, const char *name, const char *fmt,
 	va_end(args);
 	return ret;
 }
+
+int pmu_num_counters(const char *pname)
+{
+	unsigned long num;
+	struct perf_pmu *pmu = perf_pmu__find(pname);
+
+	if (pmu && perf_pmu__scan_file(pmu, "caps/num_counter", "%ld", &num) == 1)
+		return num;
+	return -1;
+}
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 76fecec7b3f9..6e772243fd1d 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -85,6 +85,7 @@ struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu);
 void print_pmu_events(const char *event_glob, bool name_only, bool quiet,
 		      bool long_desc, bool details_flag);
 bool pmu_have_event(const char *pname, const char *name);
+int pmu_num_counters(const char *);
 
 int perf_pmu__scan_file(struct perf_pmu *pmu, const char *name, const char *fmt, ...) __scanf(3, 4);
 
-- 
2.17.2


From c8716ab9ee8706ca85d6d53c3e03494cf9006e05 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 1 Feb 2019 18:17:19 -0800
Subject: [PATCH 8/8] kvm: vmx: Support TSX_FORCE_ABORT in KVM guests
Status: O
Content-Length: 7695
Lines: 213

Recent microcode for Skylake added a new CPUID bit and MSR to control
TSX aborting and enabling PMU counter 3. This patch adds support
for controlling counter 3 from KVM guests.

Intercept the MSR and set the correct attribute on the perf events
used by the virtualized PMU. TSX is only disabled when counter
3 is actually used by the host PMU. The guest can use all four
counters without multiplexing.

Also export the CPUID bit.
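
From the guest's point of view the interface then behaves as on bare
metal. A sketch of guest-side use, assuming a guest kernel with the
same cpufeature and MSR definitions as in the earlier patches:

static void guest_set_tsx_force_abort(bool abort_rtm)
{
	/* Only touch the MSR if the CPUID bit is exposed to the guest. */
	if (!boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT))
		return;
	wrmsrl(MSR_TSX_FORCE_ABORT,
	       abort_rtm ? MSR_TFA_RTM_FORCE_ABORT : 0);
}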

Signed-off-by: Andi Kleen <ak@linux.intel.com>

---
v2:
Redo implementation. We now intercept the MSR and pass the
correct attribute to the host perf event. The MSR only takes
effect when counter 3 is actually used, but the guest can
use all counters without multiplexing.
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/cpuid.c            |  3 ++-
 arch/x86/kvm/pmu.c              | 19 ++++++++++++-------
 arch/x86/kvm/pmu.h              |  6 ++++--
 arch/x86/kvm/pmu_amd.c          |  2 +-
 arch/x86/kvm/vmx/pmu_intel.c    | 16 ++++++++++++++--
 6 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4660ce90de7f..75f098142672 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -470,6 +470,7 @@ struct kvm_pmu {
 	u64 global_ctrl_mask;
 	u64 reserved_bits;
 	u8 version;
+	u8 force_tsx_abort;
 	struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
 	struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
 	struct irq_work irq_work;
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index bbffa6c54697..2570b17ac372 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -409,7 +409,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	/* cpuid 7.0.edx*/
 	const u32 kvm_cpuid_7_0_edx_x86_features =
 		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
-		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP);
+		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
+		F(TSX_FORCE_ABORT);
 
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 58ead7db71a3..c7b31fdd8d61 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -99,7 +99,8 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event,
 static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
 				  unsigned config, bool exclude_user,
 				  bool exclude_kernel, bool intr,
-				  bool in_tx, bool in_tx_cp)
+				  bool in_tx, bool in_tx_cp,
+				  bool tsx_force_abort)
 {
 	struct perf_event *event;
 	struct perf_event_attr attr = {
@@ -111,6 +112,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
 		.exclude_user = exclude_user,
 		.exclude_kernel = exclude_kernel,
 		.config = config,
+		.config2 = tsx_force_abort,
 	};
 
 	attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
@@ -140,7 +142,8 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
 	clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi);
 }
 
-void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
+void reprogram_gp_counter(struct kvm_pmu *pmu, struct kvm_pmc *pmc,
+			  u64 eventsel)
 {
 	unsigned config, type = PERF_TYPE_RAW;
 	u8 event_select, unit_mask;
@@ -178,11 +181,13 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 			      !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
 			      eventsel & ARCH_PERFMON_EVENTSEL_INT,
 			      (eventsel & HSW_IN_TX),
-			      (eventsel & HSW_IN_TX_CHECKPOINTED));
+			      (eventsel & HSW_IN_TX_CHECKPOINTED),
+			      pmu->force_tsx_abort);
 }
 EXPORT_SYMBOL_GPL(reprogram_gp_counter);
 
-void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
+void reprogram_fixed_counter(struct kvm_pmu *pmu, struct kvm_pmc *pmc,
+			     u8 ctrl, int idx)
 {
 	unsigned en_field = ctrl & 0x3;
 	bool pmi = ctrl & 0x8;
@@ -196,7 +201,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
 			      kvm_x86_ops->pmu_ops->find_fixed_event(idx),
 			      !(en_field & 0x2), /* exclude user */
 			      !(en_field & 0x1), /* exclude kernel */
-			      pmi, false, false);
+			      pmi, false, false, pmu->force_tsx_abort);
 }
 EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
 
@@ -208,12 +213,12 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
 		return;
 
 	if (pmc_is_gp(pmc))
-		reprogram_gp_counter(pmc, pmc->eventsel);
+		reprogram_gp_counter(pmu, pmc, pmc->eventsel);
 	else {
 		int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
 		u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);
 
-		reprogram_fixed_counter(pmc, ctrl, idx);
+		reprogram_fixed_counter(pmu, pmc, ctrl, idx);
 	}
 }
 EXPORT_SYMBOL_GPL(reprogram_counter);
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index ba8898e1a854..7c31f38be1ee 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -102,8 +102,10 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
 	return NULL;
 }
 
-void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel);
-void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx);
+void reprogram_gp_counter(struct kvm_pmu *pmu, struct kvm_pmc *pmc,
+			  u64 eventsel);
+void reprogram_fixed_counter(struct kvm_pmu *pmu, struct kvm_pmc *pmc, u8 ctrl,
+			     int fixed_idx);
 void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
 
 void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
index 1495a735b38e..1de7fc73a634 100644
--- a/arch/x86/kvm/pmu_amd.c
+++ b/arch/x86/kvm/pmu_amd.c
@@ -250,7 +250,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (data == pmc->eventsel)
 			return 0;
 		if (!(data & pmu->reserved_bits)) {
-			reprogram_gp_counter(pmc, data);
+			reprogram_gp_counter(pmu, pmc, data);
 			return 0;
 		}
 	}
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 5ab4a364348e..bf0681bed109 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -49,7 +49,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
 		if (old_ctrl == new_ctrl)
 			continue;
 
-		reprogram_fixed_counter(pmc, new_ctrl, i);
+		reprogram_fixed_counter(pmu, pmc, new_ctrl, i);
 	}
 
 	pmu->fixed_ctr_ctrl = data;
@@ -148,6 +148,9 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
 	int ret;
 
 	switch (msr) {
+	case MSR_TSX_FORCE_ABORT:
+		return guest_cpuid_has(vcpu, X86_FEATURE_TSX_FORCE_ABORT);
+
 	case MSR_CORE_PERF_FIXED_CTR_CTRL:
 	case MSR_CORE_PERF_GLOBAL_STATUS:
 	case MSR_CORE_PERF_GLOBAL_CTRL:
@@ -182,6 +185,9 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
 	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
 		*data = pmu->global_ovf_ctrl;
 		return 0;
+	case MSR_TSX_FORCE_ABORT:
+		*data = pmu->force_tsx_abort;
+		return 0;
 	default:
 		if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
 		    (pmc = get_fixed_pmc(pmu, msr))) {
@@ -234,6 +240,12 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 0;
 		}
 		break;
+	case MSR_TSX_FORCE_ABORT:
+		if (data & ~1ULL)
+			break;
+		/* Will take effect at next enable */
+		pmu->force_tsx_abort = data;
+		return 0;
 	default:
 		if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
 		    (pmc = get_fixed_pmc(pmu, msr))) {
@@ -245,7 +257,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			if (data == pmc->eventsel)
 				return 0;
 			if (!(data & pmu->reserved_bits)) {
-				reprogram_gp_counter(pmc, data);
+				reprogram_gp_counter(pmu, pmc, data);
 				return 0;
 			}
 		}
-- 
2.17.2


Thread overview: 31+ messages
2019-02-06  1:23 [MODERATED] [PATCH v2 0/8] PERFv2 Andi Kleen
2019-02-06  1:23 ` [MODERATED] [PATCH v2 1/8] PERFv2 Andi Kleen
2019-02-07 23:11   ` [MODERATED] " Konrad Rzeszutek Wilk
2019-02-06  1:23 ` [MODERATED] [PATCH v2 2/8] PERFv2 Andi Kleen
2019-02-06  1:23 ` [MODERATED] [PATCH v2 3/8] PERFv2 Andi Kleen
2019-02-06 19:07   ` [MODERATED] " Andi Kleen
2019-02-06  1:23 ` [MODERATED] [PATCH v2 4/8] PERFv2 Andi Kleen
2019-02-06  3:55   ` Andi Kleen [this message]
2019-02-06  1:23 ` [MODERATED] [PATCH v2 5/8] PERFv2 Andi Kleen
2019-02-06  1:24 ` [MODERATED] [PATCH v2 6/8] PERFv2 Andi Kleen
2019-02-06  1:24 ` [MODERATED] [PATCH v2 7/8] PERFv2 Andi Kleen
2019-02-06  1:24 ` [MODERATED] [PATCH v2 8/8] PERFv2 Andi Kleen
2019-02-06 19:10 ` [MODERATED] Re: [PATCH v2 0/8] PERFv2 Andi Kleen
2019-02-07 13:27 ` Peter Zijlstra
2019-02-07 13:33   ` Peter Zijlstra
2019-02-07 13:48     ` Peter Zijlstra
2019-02-07 14:39     ` Andi Kleen
2019-02-07 14:37   ` Andi Kleen
2019-02-07 15:33     ` Peter Zijlstra
2019-02-07 15:51       ` Andrew Cooper
2019-02-07 16:35       ` Andi Kleen
2019-02-07 15:37     ` Peter Zijlstra
2019-02-07 16:20       ` Andi Kleen
2019-02-07 16:49         ` Peter Zijlstra
2019-02-07 21:55           ` Andi Kleen
2019-02-07 16:35       ` Andrew Cooper
2019-02-07 17:00         ` Peter Zijlstra
2019-02-07 17:55           ` Andrew Cooper
2019-02-07 22:04           ` Andi Kleen
2019-02-07 22:45             ` Andrew Cooper
2019-02-07 16:01     ` Peter Zijlstra
