linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC,PATCH] VMWARE faults on accessing disabled counters
@ 2016-08-31 12:03 Jiri Olsa
  2016-08-31 13:11 ` Peter Zijlstra
  0 siblings, 1 reply; 4+ messages in thread
From: Jiri Olsa @ 2016-08-31 12:03 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Andi Kleen; +Cc: lkml, Alexander Shishkin

hi,
when booting under VMWARE we've got following dmesg lines:

[    0.051567] perf_event_intel: CPUID marked event: 'cpu cycles' unavailable
[    0.051567] perf_event_intel: CPUID marked event: 'instructions' unavailable
[    0.051568] perf_event_intel: CPUID marked event: 'bus cycles' unavailable
[    0.051568] perf_event_intel: CPUID marked event: 'cache references' unavailable
[    0.051569] perf_event_intel: CPUID marked event: 'cache misses' unavailable
[    0.051570] perf_event_intel: CPUID marked event: 'branch instructions' unavailable
[    0.051570] perf_event_intel: CPUID marked event: 'branch misses' unavailable

that means all the architectural events are disabled by CPUID(0xa)

The kernel code sets intel_perfmon_event_map to prevent
those events from being configured via the PERF_TYPE_HARDWARE
pmu type. However they can still be configured via the
PERF_TYPE_RAW type.

We're getting a GP fault on VMWARE when reading a cycles PMC
configured through the PERF_TYPE_RAW interface:

 #4 [ffff88007c603e10] do_general_protection at ffffffff8163da9e
 #5 [ffff88007c603e40] general_protection at ffffffff8163d3a8
    [exception RIP: native_read_pmc+6]
    RIP: ffffffff81058d66  RSP: ffff88007c603ef0  RFLAGS: 00010083
    RAX: ffffffff81957ee0  RBX: 0000000000000000  RCX: 0000000040000002
    RDX: 000000000ff8f719  RSI: ffff88007c617fa8  RDI: 0000000040000002
    RBP: ffff88007c603ef0   R8: 00007ffde5053150   R9: 0000000000000000
    R10: 00007ffde5052530  R11: 00007fbb22aedc70  R12: ffffffff80000001
    R13: ffff880079b74400  R14: ffff880079b74578  R15: 0000000000000010
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0000
 #6 [ffff88007c603ef8] x86_perf_event_update at ffffffff81029e03
 #7 [ffff88007c603f30] x86_pmu_read at ffffffff8102a079
 #8 [ffff88007c603f40] __perf_event_read at ffffffff811590de

I couldn't find what real HW rdpmc does in this situation,
so I'm not sure if we actually want to prevent this.. the patch
below tries to catch this case.

thanks,
jirka


---
 arch/x86/events/core.c       |  8 ++++-
 arch/x86/events/intel/core.c | 72 ++++++++++++++++++++++++++++++++------------
 arch/x86/events/perf_event.h |  6 ++++
 3 files changed, 65 insertions(+), 21 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 473519100b11..d836c5922b12 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -534,8 +534,14 @@ int x86_pmu_hw_config(struct perf_event *event)
 	if (!event->attr.exclude_kernel)
 		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
 
-	if (event->attr.type == PERF_TYPE_RAW)
+	if (event->attr.type == PERF_TYPE_RAW) {
+		u64 arch_config = event->attr.config & INTEL_ARCH_EVENT_MASK;
+
+		if (x86_pmu_event_disabled(arch_config))
+			return -ENOENT;
+
 		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
+	}
 
 	if (event->attr.sample_period && x86_pmu.limit_period) {
 		if (x86_pmu.limit_period(event, event->attr.sample_period) >
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 9049d62f34ae..99a83529c7ff 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -23,16 +23,22 @@
 /*
  * Intel PerfMon, used on Core and later.
  */
-static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
-{
-	[PERF_COUNT_HW_CPU_CYCLES]		= 0x003c,
-	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
-	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4f2e,
-	[PERF_COUNT_HW_CACHE_MISSES]		= 0x412e,
-	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
-	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
-	[PERF_COUNT_HW_BUS_CYCLES]		= 0x013c,
-	[PERF_COUNT_HW_REF_CPU_CYCLES]		= 0x0300, /* pseudo-encoding */
+struct intel_perfmon_event {
+	u64	config;		/* architectural event encoding (event | umask) */
+	bool	disabled;	/* set by intel_arch_events_quirk() when CPUID(0xa) marks it unavailable */
+	u64	replacement;	/* model-specific override set by quirks, 0 if none */
+};
+
+static struct intel_perfmon_event intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
+{
+	[PERF_COUNT_HW_CPU_CYCLES]		= { .config = 0x003c },
+	[PERF_COUNT_HW_INSTRUCTIONS]		= { .config = 0x00c0 },
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= { .config = 0x4f2e },
+	[PERF_COUNT_HW_CACHE_MISSES]		= { .config = 0x412e },
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= { .config = 0x00c4 },
+	[PERF_COUNT_HW_BRANCH_MISSES]		= { .config = 0x00c5 },
+	[PERF_COUNT_HW_BUS_CYCLES]		= { .config = 0x013c },
+	[PERF_COUNT_HW_REF_CPU_CYCLES]		= { .config = 0x0300 }, /* pseudo-encoding */
 };
 
 static struct event_constraint intel_core_event_constraints[] __read_mostly =
@@ -268,7 +274,31 @@ struct event_constraint intel_bdw_event_constraints[] = {
 
 static u64 intel_pmu_event_map(int hw_event)
 {
-	return intel_perfmon_event_map[hw_event];
+	struct intel_perfmon_event *event = &intel_perfmon_event_map[hw_event];
+
+	if (event->disabled)
+		return 0; /* CPUID marked it unavailable; keep rejecting it as before */
+	if (event->replacement)
+		return event->replacement;
+
+	return event->config;
+}
+
+static bool intel_pmu_event_disabled(int hw_event)
+{
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(intel_perfmon_event_map); i++) {
+		struct intel_perfmon_event *event = &intel_perfmon_event_map[i]; /* scan each entry */
+
+		if (event->config != hw_event)
+			continue;
+
+		if (event->disabled)
+			return true;
+	}
+
+	return false;
 }
 
 /*
@@ -3165,6 +3195,7 @@ static __initconst const struct x86_pmu core_pmu = {
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
+	.event_disabled		= intel_pmu_event_disabled,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
 	.free_running_flags	= PEBS_FREERUNNING_FLAGS,
@@ -3205,6 +3236,7 @@ static __initconst const struct x86_pmu intel_pmu = {
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
+	.event_disabled		= intel_pmu_event_disabled,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
 	.free_running_flags	= PEBS_FREERUNNING_FLAGS,
@@ -3357,7 +3389,7 @@ static __init void intel_arch_events_quirk(void)
 
 	/* disable event that reported as not presend by cpuid */
 	for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
-		intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
+		intel_perfmon_event_map[intel_arch_events_map[bit].id].disabled = true;
 		pr_warn("CPUID marked event: \'%s\' unavailable\n",
 			intel_arch_events_map[bit].name);
 	}
@@ -3375,7 +3407,7 @@ static __init void intel_nehalem_quirk(void)
 		 * branch-misses, but it's still much better than the
 		 * architectural event which is often completely bogus:
 		 */
-		intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
+		intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES].replacement = 0x7f89;
 		ebx.split.no_branch_misses_retired = 0;
 		x86_pmu.events_maskl = ebx.full;
 		pr_info("CPU erratum AAJ80 worked around\n");
@@ -3543,10 +3575,10 @@ __init int intel_pmu_init(void)
 		x86_pmu.cpu_events = nhm_events_attrs;
 
 		/* UOPS_ISSUED.STALLED_CYCLES */
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND].replacement =
 			X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
 		/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND].replacement =
 			X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
 
 		intel_pmu_pebs_data_source_nhm();
@@ -3630,10 +3662,10 @@ __init int intel_pmu_init(void)
 		x86_pmu.cpu_events = nhm_events_attrs;
 
 		/* UOPS_ISSUED.STALLED_CYCLES */
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND].replacement =
 			X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
 		/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND].replacement =
 			X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
 
 		intel_pmu_pebs_data_source_nhm();
@@ -3667,10 +3699,10 @@ __init int intel_pmu_init(void)
 		x86_pmu.cpu_events = snb_events_attrs;
 
 		/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND].replacement =
 			X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
 		/* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND].replacement =
 			X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
 
 		pr_cont("SandyBridge events, ");
@@ -3704,7 +3736,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.cpu_events = snb_events_attrs;
 
 		/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
-		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND].replacement =
 			X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
 
 		pr_cont("IvyBridge events, ");
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 01ddfeadaee6..69cca7dc8de4 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -514,6 +514,7 @@ struct x86_pmu {
 	int		(*addr_offset)(int index, bool eventsel);
 	int		(*rdpmc_index)(int index);
 	u64		(*event_map)(int);
+	bool		(*event_disabled)(int);
 	int		max_events;
 	int		num_counters;
 	int		num_counters_fixed;
@@ -715,6 +716,11 @@ static inline int x86_pmu_rdpmc_index(int index)
 	return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
 }
 
+static inline bool x86_pmu_event_disabled(u64 config) /* true if driver marked this raw config unavailable */
+{
+	return x86_pmu.event_disabled ? x86_pmu.event_disabled(config) : false;
+}
+
 int x86_add_exclusive(unsigned int what);
 
 void x86_del_exclusive(unsigned int what);
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [RFC,PATCH] VMWARE faults on accessing disabled counters
  2016-08-31 12:03 [RFC,PATCH] VMWARE faults on accessing disabled counters Jiri Olsa
@ 2016-08-31 13:11 ` Peter Zijlstra
  2016-08-31 13:19   ` Jiri Olsa
  0 siblings, 1 reply; 4+ messages in thread
From: Peter Zijlstra @ 2016-08-31 13:11 UTC (permalink / raw)
  To: Jiri Olsa; +Cc: Ingo Molnar, Andi Kleen, lkml, Alexander Shishkin

On Wed, Aug 31, 2016 at 02:03:58PM +0200, Jiri Olsa wrote:
> hi,
> when booting under VMWARE we've got following dmesg lines:
> 
> [    0.051567] perf_event_intel: CPUID marked event: 'cpu cycles' unavailable
> [    0.051567] perf_event_intel: CPUID marked event: 'instructions' unavailable
> [    0.051568] perf_event_intel: CPUID marked event: 'bus cycles' unavailable
> [    0.051568] perf_event_intel: CPUID marked event: 'cache references' unavailable
> [    0.051569] perf_event_intel: CPUID marked event: 'cache misses' unavailable
> [    0.051570] perf_event_intel: CPUID marked event: 'branch instructions' unavailable
> [    0.051570] perf_event_intel: CPUID marked event: 'branch misses' unavailable
> 
> that means all the architectural events are disabled by CPUID(0xa)
> 
> The kernel code sets intel_perfmon_event_map to prevent
> those event to be configured by PERF_TYPE_HARDWARE pmu
> type. However they can still be configured by via
> PERF_TYPE_RAW type.
> 
> We're getting GP fault on VMWARE when reading cycles PMC
> configured throgh the PERF_TYPE_RAW interface:
> 
>  #4 [ffff88007c603e10] do_general_protection at ffffffff8163da9e
>  #5 [ffff88007c603e40] general_protection at ffffffff8163d3a8
>     [exception RIP: native_read_pmc+6]
>     RIP: ffffffff81058d66  RSP: ffff88007c603ef0  RFLAGS: 00010083
>     RAX: ffffffff81957ee0  RBX: 0000000000000000  RCX: 0000000040000002
>     RDX: 000000000ff8f719  RSI: ffff88007c617fa8  RDI: 0000000040000002
>     RBP: ffff88007c603ef0   R8: 00007ffde5053150   R9: 0000000000000000
>     R10: 00007ffde5052530  R11: 00007fbb22aedc70  R12: ffffffff80000001
>     R13: ffff880079b74400  R14: ffff880079b74578  R15: 0000000000000010
>     ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0000
>  #6 [ffff88007c603ef8] x86_perf_event_update at ffffffff81029e03
>  #7 [ffff88007c603f30] x86_pmu_read at ffffffff8102a079
>  #8 [ffff88007c603f40] __perf_event_read at ffffffff811590de
> 
> I couldn't find what real HW rdpmc does on this situation,
> so I'm not sure if we actually want to prevent this.. patch
> below tries to catch this case.

Typically real hardware allows you to program any old crap. The results,
as in what the counter does, is undefined. Some actually count, some do
not.

I'm not exactly thrilled by this patch, it adds a lot of code for a
weird case. What happens when you stuff another non-existent event in?
GP again?

/me mutters vile things about virt

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC,PATCH] VMWARE faults on accessing disabled counters
  2016-08-31 13:11 ` Peter Zijlstra
@ 2016-08-31 13:19   ` Jiri Olsa
  2016-08-31 13:41     ` Peter Zijlstra
  0 siblings, 1 reply; 4+ messages in thread
From: Jiri Olsa @ 2016-08-31 13:19 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Ingo Molnar, Andi Kleen, lkml, Alexander Shishkin

On Wed, Aug 31, 2016 at 03:11:04PM +0200, Peter Zijlstra wrote:
> On Wed, Aug 31, 2016 at 02:03:58PM +0200, Jiri Olsa wrote:
> > hi,
> > when booting under VMWARE we've got following dmesg lines:
> > 
> > [    0.051567] perf_event_intel: CPUID marked event: 'cpu cycles' unavailable
> > [    0.051567] perf_event_intel: CPUID marked event: 'instructions' unavailable
> > [    0.051568] perf_event_intel: CPUID marked event: 'bus cycles' unavailable
> > [    0.051568] perf_event_intel: CPUID marked event: 'cache references' unavailable
> > [    0.051569] perf_event_intel: CPUID marked event: 'cache misses' unavailable
> > [    0.051570] perf_event_intel: CPUID marked event: 'branch instructions' unavailable
> > [    0.051570] perf_event_intel: CPUID marked event: 'branch misses' unavailable
> > 
> > that means all the architectural events are disabled by CPUID(0xa)
> > 
> > The kernel code sets intel_perfmon_event_map to prevent
> > those event to be configured by PERF_TYPE_HARDWARE pmu
> > type. However they can still be configured by via
> > PERF_TYPE_RAW type.
> > 
> > We're getting GP fault on VMWARE when reading cycles PMC
> > configured throgh the PERF_TYPE_RAW interface:
> > 
> >  #4 [ffff88007c603e10] do_general_protection at ffffffff8163da9e
> >  #5 [ffff88007c603e40] general_protection at ffffffff8163d3a8
> >     [exception RIP: native_read_pmc+6]
> >     RIP: ffffffff81058d66  RSP: ffff88007c603ef0  RFLAGS: 00010083
> >     RAX: ffffffff81957ee0  RBX: 0000000000000000  RCX: 0000000040000002
> >     RDX: 000000000ff8f719  RSI: ffff88007c617fa8  RDI: 0000000040000002
> >     RBP: ffff88007c603ef0   R8: 00007ffde5053150   R9: 0000000000000000
> >     R10: 00007ffde5052530  R11: 00007fbb22aedc70  R12: ffffffff80000001
> >     R13: ffff880079b74400  R14: ffff880079b74578  R15: 0000000000000010
> >     ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0000
> >  #6 [ffff88007c603ef8] x86_perf_event_update at ffffffff81029e03
> >  #7 [ffff88007c603f30] x86_pmu_read at ffffffff8102a079
> >  #8 [ffff88007c603f40] __perf_event_read at ffffffff811590de
> > 
> > I couldn't find what real HW rdpmc does on this situation,
> > so I'm not sure if we actually want to prevent this.. patch
> > below tries to catch this case.
> 
> Typically real hardware allows you to program any old crap. The results,
> as in what the counter does, is undefined. Some actually count, some do
> not.
> 
> I'm not exactly thrilled by this patch, it adds a lot of code for a
> weird case. What happens when you stuff another non existing even in? GP
> again?

I guess if real HW does not fault on this we don't need to bother,
and treat it as a VMWARE issue.. but I couldn't find this info

jirka

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC,PATCH] VMWARE faults on accessing disabled counters
  2016-08-31 13:19   ` Jiri Olsa
@ 2016-08-31 13:41     ` Peter Zijlstra
  0 siblings, 0 replies; 4+ messages in thread
From: Peter Zijlstra @ 2016-08-31 13:41 UTC (permalink / raw)
  To: Jiri Olsa; +Cc: Ingo Molnar, Andi Kleen, lkml, Alexander Shishkin

On Wed, Aug 31, 2016 at 03:19:24PM +0200, Jiri Olsa wrote:
> 
> I guess if real HW does not fault on this we dont need to bother,
> and treat it as the VMWARE issue.. but I couldn't find this info
> 

So I've once had a machine that blacklisted one entry, but I've forgotten
what machine that was. But IIRC, you could program the thing just fine,
the results were crap though, which is why CPUID blacklisted the
counter.

/me pokes around some code...

Ah, see the AAJ80 code. That was some Nehalem part and the comment very
much implies programming the value 'works', you just get crap results.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2016-08-31 13:41 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-31 12:03 [RFC,PATCH] VMWARE faults on accessing disabled counters Jiri Olsa
2016-08-31 13:11 ` Peter Zijlstra
2016-08-31 13:19   ` Jiri Olsa
2016-08-31 13:41     ` Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).