* [PATCH 1/3] perf, x86: Add new cache events table for Haswell
@ 2015-02-18  2:18 Andi Kleen
  2015-02-18  2:18 ` [PATCH 2/3] perf, x86: Add Broadwell core support Andi Kleen
                   ` (3 more replies)
  0 siblings, 4 replies; 21+ messages in thread
From: Andi Kleen @ 2015-02-18  2:18 UTC (permalink / raw)
  To: peterz; +Cc: linux-kernel, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Haswell offcore events are quite different from Sandy Bridge.
Add a new table to handle Haswell properly.

Note that the offcore bits listed in the SDM are not quite correct
(this is currently being fixed). An up-to-date list of bits is
in the patch.

The basic setup is similar to Sandy Bridge. The prefetch columns
have been removed, as prefetch counting is not very reliable
on Haswell. One L1 event that is no longer in the event list
has also been removed.

- data reads do not include code reads (comparable to earlier Sandy
Bridge tables)
- data counts include speculative execution (except L1 write, dtlb, bpu)
- remote node access includes remote memory, remote cache, and remote mmio.
- prefetches are not included in the counts for consistency
(different from Sandy Bridge, which includes prefetches in the remote node)

The events with additional caveats have references to the specification update.
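
(For illustration only, not part of the patch: a standalone user-space
sketch that recomputes two of the masks the new hsw_hw_cache_extra_regs
table programs into the OFFCORE_RESPONSE extra register. The HSW_* bit
values are copied from the defines added below; the rest is just
scaffolding.)

/* Standalone sketch: recompute two Haswell offcore response masks. */
#include <stdio.h>
#include <stdint.h>

#define BIT_ULL(n)			(1ULL << (n))

#define HSW_DEMAND_DATA_RD		BIT_ULL(0)	/* request: demand data read */
#define HSW_ANY_RESPONSE		BIT_ULL(16)	/* any response (L3 hit or miss) */
#define HSW_L3_MISS_LOCAL_DRAM		BIT_ULL(22)
#define HSW_L3_MISS_REMOTE_HOP0		BIT_ULL(27)
#define HSW_L3_MISS_REMOTE_HOP1		BIT_ULL(28)
#define HSW_L3_MISS_REMOTE_HOP2P	BIT_ULL(29)
#define HSW_L3_MISS			(HSW_L3_MISS_LOCAL_DRAM| \
					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
					 HSW_L3_MISS_REMOTE_HOP2P)
#define HSW_SNOOP_NONE			BIT_ULL(31)
#define HSW_SNOOP_NOT_NEEDED		BIT_ULL(32)
#define HSW_SNOOP_MISS			BIT_ULL(33)
#define HSW_SNOOP_HIT_NO_FWD		BIT_ULL(34)
#define HSW_SNOOP_HIT_WITH_FWD		BIT_ULL(35)
#define HSW_SNOOP_HITM			BIT_ULL(36)
#define HSW_SNOOP_NON_DRAM		BIT_ULL(37)
#define HSW_ANY_SNOOP			(HSW_SNOOP_NONE| \
					 HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
					 HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
					 HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)

int main(void)
{
	/* LL read access: demand data read, any response */
	uint64_t ll_read_access = HSW_DEMAND_DATA_RD | HSW_ANY_RESPONSE;
	/* LL read miss: demand data read that missed L3, any snoop outcome */
	uint64_t ll_read_miss   = HSW_DEMAND_DATA_RD | HSW_L3_MISS | HSW_ANY_SNOOP;

	printf("LL read access mask: %#llx\n", (unsigned long long)ll_read_access);
	printf("LL read miss   mask: %#llx\n", (unsigned long long)ll_read_miss);
	return 0;
}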

v2: Change name of variable.
v3: Based on older Broadwell patch, but now just for Haswell.
    Various fixes with events being more similar to the older
    Sandy Bridge tables.
v4: Add aliases. Drop two PF bits.
v5: Don't use SUPPLIER_NONE. Remote node is DRAM only. More aliases.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 195 ++++++++++++++++++++++++++++++++-
 1 file changed, 193 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 498b6d9..5cd7e2b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -415,6 +415,197 @@ static __initconst const u64 snb_hw_cache_event_ids
 
 };
 
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ *   reliably counted.
+ * The events with additional caveats have references to the specification update.
+ */
+
+#define HSW_DEMAND_DATA_RD		BIT_ULL(0)
+#define HSW_DEMAND_RFO			BIT_ULL(1)
+#define HSW_ANY_RESPONSE		BIT_ULL(16)
+#define HSW_SUPPLIER_NONE		BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM		BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0		BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1		BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P	BIT_ULL(29)
+#define HSW_L3_MISS			(HSW_L3_MISS_LOCAL_DRAM| \
+					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+					 HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE			BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED		BIT_ULL(32)
+#define HSW_SNOOP_MISS			BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD		BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD		BIT_ULL(35)
+#define HSW_SNOOP_HITM			BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM		BIT_ULL(37)
+#define HSW_ANY_SNOOP			(HSW_SNOOP_NONE| \
+					 HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+					 HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+					 HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_SNOOP_DRAM			(HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ			HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE		HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE		(HSW_L3_MISS_REMOTE_HOP0|\
+					 HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_LLC_ACCESS			HSW_ANY_RESPONSE
+
+static __initconst const u64 hsw_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
+		[ C(RESULT_MISS)   ] = 0x151,	/* L1D.REPLACEMENT */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x280,	/* ICACHE.MISSES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
+		[ C(RESULT_MISS)   ] = 0x108,	/* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
+		[ C(RESULT_MISS)   ] = 0x149,	/* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x6085,	/* ITLB_MISSES.STLB_HIT */
+		[ C(RESULT_MISS)   ] = 0x185,	/* ITLB_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0xc4,	/* BR_INST_RETIRED.ALL_BRANCHES */
+		[ C(RESULT_MISS)   ] = 0xc5,	/* BR_MISP_RETIRED.ALL_BRANCHES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+				       HSW_LLC_ACCESS,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS|HSW_ANY_SNOOP,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+				       HSW_LLC_ACCESS,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS|HSW_ANY_SNOOP,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS_LOCAL_DRAM|
+				       HSW_SNOOP_DRAM,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS_REMOTE|
+				       HSW_SNOOP_DRAM,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS_LOCAL_DRAM|
+				       HSW_SNOOP_DRAM,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS_REMOTE|
+				       HSW_SNOOP_DRAM,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
 static __initconst const u64 westmere_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2546,8 +2737,8 @@ __init int intel_pmu_init(void)
 	case 69: /* 22nm Haswell ULT */
 	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
 		x86_pmu.late_ack = true;
-		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
 
 		intel_pmu_lbr_init_snb();
 
-- 
1.9.3


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 2/3] perf, x86: Add Broadwell core support
  2015-02-18  2:18 [PATCH 1/3] perf, x86: Add new cache events table for Haswell Andi Kleen
@ 2015-02-18  2:18 ` Andi Kleen
  2015-03-27 11:39   ` [tip:perf/core] perf/x86/intel: " tip-bot for Andi Kleen
  2015-02-18  2:18 ` [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds Andi Kleen
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 21+ messages in thread
From: Andi Kleen @ 2015-02-18  2:18 UTC (permalink / raw)
  To: peterz; +Cc: linux-kernel, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Add Broadwell core support to perf.

The basic support is very similar to Haswell. We use the new cache
event list added for Haswell earlier. The only differences
are a few bits related to remote nodes. To avoid an extra,
mostly identical table, these are patched up in the initialization code.

The constraint list has one new event that needs to be handled compared to Haswell.

Includes code and testing from Kan Liang.
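
(Again for illustration only, not part of the patch: a tiny standalone
sketch of the one response bit that differs, the local-DRAM L3 miss,
which sits at bit 22 on Haswell and bit 26 on Broadwell; that is why the
NODE/LL miss masks are patched up at init time below. The snoop bits are
omitted here for brevity.)

/* Standalone sketch: the local-DRAM L3-miss bit moves on Broadwell. */
#include <stdio.h>
#include <stdint.h>

#define BIT_ULL(n)			(1ULL << (n))
#define HSW_DEMAND_DATA_RD		BIT_ULL(0)
#define HSW_L3_MISS_LOCAL_DRAM		BIT_ULL(22)	/* Haswell encoding */
#define BDW_L3_MISS_LOCAL		BIT_ULL(26)	/* Broadwell encoding */

int main(void)
{
	/* NODE read access: demand read served from local DRAM */
	uint64_t hsw = HSW_DEMAND_DATA_RD | HSW_L3_MISS_LOCAL_DRAM;
	uint64_t bdw = HSW_DEMAND_DATA_RD | BDW_L3_MISS_LOCAL;

	printf("Haswell   node-read mask (no snoop bits): %#llx\n", (unsigned long long)hsw);
	printf("Broadwell node-read mask (no snoop bits): %#llx\n", (unsigned long long)bdw);
	return 0;
}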

v2: Remove unnamed model numbers.
v3: Rename cache event list to hsw_*. Change names.
v4: Use symbolic names for cache events. Improve comments and description.
    Fix sparse warnings (Fengguang Wu)
    Add Xeon D model number.
    Remove cache event table (in separate patch)
    Patch up remote node differences (Kan Liang)
v5: Update offcore bits
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 47 ++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 5cd7e2b..259e06b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -220,6 +220,15 @@ static struct event_constraint intel_hsw_event_constraints[] = {
 	EVENT_CONSTRAINT_END
 };
 
+struct event_constraint intel_bdw_event_constraints[] = {
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
+	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */
+	INTEL_UEVENT_CONSTRAINT(0x148, 0x4),	/* L1D_PEND_MISS.PENDING */
+	INTEL_EVENT_CONSTRAINT(0xa3, 0x4),	/* CYCLE_ACTIVITY.* */
+	EVENT_CONSTRAINT_END
+};
+
 static u64 intel_pmu_event_map(int hw_event)
 {
 	return intel_perfmon_event_map[hw_event];
@@ -454,6 +463,12 @@ static __initconst const u64 snb_hw_cache_event_ids
 					 HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
 #define HSW_LLC_ACCESS			HSW_ANY_RESPONSE
 
+#define BDW_L3_MISS_LOCAL		BIT(26)
+#define BDW_L3_MISS			(BDW_L3_MISS_LOCAL| \
+					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+					 HSW_L3_MISS_REMOTE_HOP2P)
+
+
 static __initconst const u64 hsw_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2757,6 +2772,38 @@ __init int intel_pmu_init(void)
 		pr_cont("Haswell events, ");
 		break;
 
+	case 61: /* 14nm Broadwell Core-M */
+	case 86: /* 14nm Broadwell Xeon D */
+		x86_pmu.late_ack = true;
+		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+		/* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+		hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
+									 BDW_L3_MISS|HSW_SNOOP_DRAM;
+		hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
+									  HSW_SNOOP_DRAM;
+		hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
+									     BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+		hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
+									      BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+
+		intel_pmu_lbr_init_snb();
+
+		x86_pmu.event_constraints = intel_bdw_event_constraints;
+		x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+		x86_pmu.extra_regs = intel_snbep_extra_regs;
+		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+		/* all extra regs are per-cpu when HT is on */
+		x86_pmu.er_flags |= ERF_HAS_RSP_1;
+		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+
+		x86_pmu.hw_config = hsw_hw_config;
+		x86_pmu.get_event_constraints = hsw_get_event_constraints;
+		x86_pmu.cpu_events = hsw_events_attrs;
+		pr_cont("Broadwell events, ");
+		break;
+
 	default:
 		switch (x86_pmu.version) {
 		case 1:
-- 
1.9.3


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds
  2015-02-18  2:18 [PATCH 1/3] perf, x86: Add new cache events table for Haswell Andi Kleen
  2015-02-18  2:18 ` [PATCH 2/3] perf, x86: Add Broadwell core support Andi Kleen
@ 2015-02-18  2:18 ` Andi Kleen
  2015-03-23  9:38   ` Ingo Molnar
  2015-03-27 11:39   ` [tip:perf/core] perf/x86/intel: " tip-bot for Andi Kleen
  2015-03-23  9:45 ` [PATCH 1/3] perf, x86: Add new cache events table for Haswell Ingo Molnar
  2015-03-27 11:39 ` [tip:perf/core] perf/x86/intel: " tip-bot for Andi Kleen
  3 siblings, 2 replies; 21+ messages in thread
From: Andi Kleen @ 2015-02-18  2:18 UTC (permalink / raw)
  To: peterz; +Cc: linux-kernel, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.

Add a new callback to enforce this, and set it for Broadwell.

This is erratum BDM57 and BDM11.

How does this handle the case when an app requests a specific
period with some of the bottom bits set?

The app thinks it is sampling at X occurrences per sample, when it is
in fact at X - 63 (worst case).

Short answer:

Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as a PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.

Long answer:

<write up by Peter:>

IFF we guarantee perf_event_attr::sample_period >= 128.

Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.

So while the individual interrupts are 'wrong' we get them with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.

So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.

So no need to 'fix' the tools, all we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.
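
(For illustration only, not part of the patch: a standalone user-space
sketch that replays the arithmetic above. It mimics the period_left
bookkeeping of x86_perf_event_update()/x86_perf_event_set_period() with
the Broadwell limit applied, and shows the sample intervals alternating
256,128,... so they average out at the requested 192.)

/* Standalone sketch of the period adjustment described above. */
#include <stdio.h>

static long bdw_limit(long left)		/* same rule as bdw_limit_period() */
{
	if (left < 128)
		left = 128;
	return left & ~0x3fL;
}

int main(void)
{
	const long sample_period = 192;
	long period_left = sample_period;	/* set when the event is created */
	long hw = bdw_limit(period_left);	/* period actually programmed */
	long n = 0, last_sample = 0;

	for (int pmi = 0; pmi < 8; pmi++) {
		n += hw;			/* counter overflows after hw events */
		period_left -= hw;		/* x86_perf_event_update() */
		if (period_left <= 0) {		/* set_period(): a full period elapsed */
			period_left += sample_period;
			printf("sample at n=%ld, interval %ld\n", n, n - last_sample);
			last_sample = n;
		}
		hw = bdw_limit(period_left);	/* reprogram the counter */
	}
	return 0;
}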

v2: Use correct event name in description. Use EVENT() macro.
v3: Use INTEL_ARCH_EVENT_MASK. Error out for events with too small period.
v4: Expand description.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/cpu/perf_event.c       |  9 +++++++++
 arch/x86/kernel/cpu/perf_event.h       |  1 +
 arch/x86/kernel/cpu/perf_event_intel.c | 19 +++++++++++++++++++
 3 files changed, 29 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 143e5f5..66451a6 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -445,6 +445,12 @@ int x86_pmu_hw_config(struct perf_event *event)
 	if (event->attr.type == PERF_TYPE_RAW)
 		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
+	if (event->attr.sample_period && x86_pmu.limit_period) {
+		if (x86_pmu.limit_period(event, event->attr.sample_period) >
+				event->attr.sample_period)
+			return -EINVAL;
+	}
+
 	return x86_setup_perfctr(event);
 }
 
@@ -982,6 +988,9 @@ int x86_perf_event_set_period(struct perf_event *event)
 	if (left > x86_pmu.max_period)
 		left = x86_pmu.max_period;
 
+	if (x86_pmu.limit_period)
+		left = x86_pmu.limit_period(event, left);
+
 	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
 
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 4e6cdb0..6ce77bd 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -449,6 +449,7 @@ struct x86_pmu {
 	struct x86_pmu_quirk *quirks;
 	int		perfctr_second_write;
 	bool		late_ack;
+	unsigned	(*limit_period)(struct perf_event *event, unsigned l);
 
 	/*
 	 * sysfs attrs
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 259e06b..222274c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2111,6 +2111,24 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 	return c;
 }
 
+/*
+ * Broadwell:
+ * The INST_RETIRED.ALL period always needs to have lowest
+ * 6bits cleared (BDM57). It shall not use a period smaller
+ * than 100 (BDM11). We combine the two to enforce
+ * a min-period of 128.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+	if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+			X86_CONFIG(.event=0xc0, .umask=0x01)) {
+		if (left < 128)
+			left = 128;
+		left &= ~0x3fu;
+	}
+	return left;
+}
+
 PMU_FORMAT_ATTR(event,	"config:0-7"	);
 PMU_FORMAT_ATTR(umask,	"config:8-15"	);
 PMU_FORMAT_ATTR(edge,	"config:18"	);
@@ -2801,6 +2819,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.hw_config = hsw_hw_config;
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
 		x86_pmu.cpu_events = hsw_events_attrs;
+		x86_pmu.limit_period = bdw_limit_period;
 		pr_cont("Broadwell events, ");
 		break;
 
-- 
1.9.3


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds
  2015-02-18  2:18 ` [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds Andi Kleen
@ 2015-03-23  9:38   ` Ingo Molnar
  2015-03-23 10:19     ` Peter Zijlstra
  2015-03-27 11:39   ` [tip:perf/core] perf/x86/intel: " tip-bot for Andi Kleen
  1 sibling, 1 reply; 21+ messages in thread
From: Ingo Molnar @ 2015-03-23  9:38 UTC (permalink / raw)
  To: Andi Kleen; +Cc: peterz, linux-kernel, Andi Kleen


* Andi Kleen <andi@firstfloor.org> wrote:

> From: Andi Kleen <ak@linux.intel.com>
> 
> On Broadwell INST_RETIRED.ALL cannot be used with any period
> that doesn't have the lowest 6 bits cleared. And the period
> should not be smaller than 128.

Sloppy changelog: a most basic question is not answered by the 
changelog: what happens in practice when the period is set to a 
smaller value than 128?

> +/*
> + * Broadwell:
> + * The INST_RETIRED.ALL period always needs to have lowest
> + * 6bits cleared (BDM57). It shall not use a period smaller
> + * than 100 (BDM11). We combine the two to enforce
> + * a min-period of 128.
> + */

Sloppy comment: that's not what we do:

> +static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
> +{
> +	if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
> +			X86_CONFIG(.event=0xc0, .umask=0x01)) {
> +		if (left < 128)
> +			left = 128;
> +		left &= ~0x3fu;
> +	}
> +	return left;

We enforce a minimum period of 128 and round the requested period to 
64.

I think in this case it would be useful to tooling if we updated the 
syscall attribute with the real period value that was used, to not 
skew tooling output.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/3] perf, x86: Add new cache events table for Haswell
  2015-02-18  2:18 [PATCH 1/3] perf, x86: Add new cache events table for Haswell Andi Kleen
  2015-02-18  2:18 ` [PATCH 2/3] perf, x86: Add Broadwell core support Andi Kleen
  2015-02-18  2:18 ` [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds Andi Kleen
@ 2015-03-23  9:45 ` Ingo Molnar
  2015-03-23 13:53   ` Peter Zijlstra
  2015-03-27 11:39 ` [tip:perf/core] perf/x86/intel: " tip-bot for Andi Kleen
  3 siblings, 1 reply; 21+ messages in thread
From: Ingo Molnar @ 2015-03-23  9:45 UTC (permalink / raw)
  To: Andi Kleen; +Cc: peterz, linux-kernel, Andi Kleen


* Andi Kleen <andi@firstfloor.org> wrote:

> From: Andi Kleen <ak@linux.intel.com>
> 
> Haswell offcore events are quite different from Sandy Bridge.
> Add a new table to handle Haswell properly.
> 
> Note that the offcore bits listed in the SDM are not quite correct
> (this is currently being fixed). An uptodate list of bits is
> in the patch.
> 
> The basic setup is similar to Sandy Bridge. The prefetch columns
> have been removed, as prefetch counting is not very reliable
> on Haswell. One L1 event that is not in the event list anymore
> has been also removed.
> 
> - data reads do not include code reads (comparable to earlier Sandy
> Bridge tables)
> - data counts include speculative execution (except L1 write, dtlb, bpu)
> - remote node access includes both remote memory, remote cache, remote mmio.
> - prefetches are not included in the counts for consistency
> (different from Sandy Bridge, which includes prefetches in the remote node)
> 
> The events with additional caveats have references to the specification update.

> +		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
> +		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
> +		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
> +		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */

So that 'HSM30' is code for the specification update?

You'll need to properly describe HSM30 at least once instead of using 
obfuscation.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds
  2015-03-23  9:38   ` Ingo Molnar
@ 2015-03-23 10:19     ` Peter Zijlstra
  2015-03-23 10:39       ` Ingo Molnar
  0 siblings, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2015-03-23 10:19 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andi Kleen, linux-kernel, Andi Kleen

On Mon, Mar 23, 2015 at 10:38:54AM +0100, Ingo Molnar wrote:
> 
> * Andi Kleen <andi@firstfloor.org> wrote:
> 
> > From: Andi Kleen <ak@linux.intel.com>
> > 
> > On Broadwell INST_RETIRED.ALL cannot be used with any period
> > that doesn't have the lowest 6 bits cleared. And the period
> > should not be smaller than 128.
> 
> Sloppy changelog: a most basic question is not answered by the 
> changelog: what happens in practice when the period is set to a 
> smaller value than 128?

http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf

BDM11 and BDM55 (not 57) tell us that the PMU will generate crap output
if you don't do this. Non-fatal but gibberish.

> > +/*
> > + * Broadwell:
> > + * The INST_RETIRED.ALL period always needs to have lowest
> > + * 6bits cleared (BDM57). It shall not use a period smaller
> > + * than 100 (BDM11). We combine the two to enforce
> > + * a min-period of 128.
> > + */
> 
> Sloppy comment: that's not what we do:
> 
> > +static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
> > +{
> > +	if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
> > +			X86_CONFIG(.event=0xc0, .umask=0x01)) {
> > +		if (left < 128)
> > +			left = 128;
> > +		left &= ~0x3fu;
> > +	}
> > +	return left;
> 
> We enforce a minimum period of 128 and round the requested period to 
> 64.

Not quite, we enforce a min period of 128 but otherwise mask bit0-5, no
rounding up.

Which is pretty much what the comment says.

> I think in this case it would be useful to tooling if we updated the 
> syscall attribute with the real period value that was used, to not 
> skew tooling output.

Seeing how we already have a fuzz of up to sample_period events (we
don't know how far into the last period we are when we stop the event;
it might have been 1 event away from generating a PMI), this patch
doesn't actually add significantly to that.

Also, the effective period is the one specified; if the requested period
is < 128 we simply reject the event creation. If it's any larger we iterate
around the requested sample period with steps of 64, but such that we
average out on the requested period. There is no 'real' period to copy
back.

Another way to look at this is that we use a form of pulse density
modulation to create the desired period using the larger step size; or
perhaps compare it to Bresenham's line drawing algorithm.




^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds
  2015-03-23 10:19     ` Peter Zijlstra
@ 2015-03-23 10:39       ` Ingo Molnar
  2015-03-23 12:35         ` Peter Zijlstra
  0 siblings, 1 reply; 21+ messages in thread
From: Ingo Molnar @ 2015-03-23 10:39 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andi Kleen, linux-kernel, Andi Kleen


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Mar 23, 2015 at 10:38:54AM +0100, Ingo Molnar wrote:
> > 
> > * Andi Kleen <andi@firstfloor.org> wrote:
> > 
> > > From: Andi Kleen <ak@linux.intel.com>
> > > 
> > > On Broadwell INST_RETIRED.ALL cannot be used with any period
> > > that doesn't have the lowest 6 bits cleared. And the period
> > > should not be smaller than 128.
> > 
> > Sloppy changelog: a most basic question is not answered by the 
> > changelog: what happens in practice when the period is set to a 
> > smaller value than 128?
> 
> http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
> 
> BDM11 and BDM55 (not 57) tell us that the PMU will generate crap output
> if you don't do this. Non-fatal but gibberish.

Should be part of the changelog?

> > > +/*
> > > + * Broadwell:
> > > + * The INST_RETIRED.ALL period always needs to have lowest
> > > + * 6bits cleared (BDM57). It shall not use a period smaller
> > > + * than 100 (BDM11). We combine the two to enforce
> > > + * a min-period of 128.
> > > + */
> > 
> > Sloppy comment: that's not what we do:
> > 
> > > +static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
> > > +{
> > > +	if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
> > > +			X86_CONFIG(.event=0xc0, .umask=0x01)) {
> > > +		if (left < 128)
> > > +			left = 128;
> > > +		left &= ~0x3fu;
> > > +	}
> > > +	return left;
> > 
> > We enforce a minimum period of 128 and round the requested period to 
> > 64.
> 
> Not quite, we enforce a min period of 128 but otherwise mask bit0-5, no
> rounding up.

So I did not say rounding up, I meant this sentence:

> > > + *   [...] We combine the two to enforce
> > > + * a min-period of 128.

IMO ambiguously suggests that the result of the combination of the two 
is to enforce a min-period of 128. Would something like this:

	          We combine the two to enforce
          a min-period of 128, rounded (down) to multiples of 64.
          The original period is still kept by the core code and is 
          approximated in the long run via these slightly fuzzed 
          hardware-periods.

work with you?

> > I think in this case it would be useful to tooling if we updated 
> > the syscall attribute with the real period value that was used, to 
> > not skew tooling output.
> 
> Seeing how we already have a fuzz of up to sample_period events; we 
> don't know how far into the last period we are when we stop the 
> event, it might have been 1 event away from generating a PMI, this 
> patch doesn't actually add significantly to that.
> 
> Also, the effective period is the one specified, if the requested 
> period < 128 we simply reject the event creation. If its any larger 
> we iterate around the requested sample period with steps of 64 but 
> such that we average out on the requested period. There is no 'real' 
> period to copy back.

Yeah, fair enough.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds
  2015-03-23 10:39       ` Ingo Molnar
@ 2015-03-23 12:35         ` Peter Zijlstra
  2015-03-23 13:32           ` Ingo Molnar
  0 siblings, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2015-03-23 12:35 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andi Kleen, linux-kernel, Andi Kleen

On Mon, Mar 23, 2015 at 11:39:00AM +0100, Ingo Molnar wrote:
> > http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
> > 
> > BDM11 and BDM55 (not 57) tell us that the PMU will generate crap output
> > if you don't do this. Non-fatal but gibberish.
> 
> Should be part of the changelog?

Sure, lemme go make that happen.

> So I did not say rounding up, I meant this sentence:
> 
> > > > + *   [...] We combine the two to enforce
> > > > + * a min-period of 128.
> 
> IMO ambiguously suggests that the result of the combination of the two 
> is to enforce a min-period of 128. Would somethin like this:
> 
> 	          We combine the two to enforce
>           a min-period of 128, rounded (down) to multiples of 64.
>           The original period is still kept by the core code and is 
>           approximated in the long run via these slightly fuzzed 
>           hardware-periods.

Like so then?

---
Subject: perf, x86: Add INST_RETIRED.ALL workarounds
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 17 Feb 2015 18:18:06 -0800

On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.

This is erratum BDM11 and BDM55;
http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf

BDM11: When using a period < 100, we may get incorrect PEBS/PMI
interrupts and/or an invalid counter state.
BDM55: When bit0-5 of the period are !0 we may get redundant PEBS
records on overflow.

Add a new callback to enforce this, and set it for Broadwell.

How does this handle the case when an app requests a specific
period with some of the bottom bits set?

Short answer:

Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as a PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.

Long answer (by Peterz):

IFF we guarantee perf_event_attr::sample_period >= 128.

Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.

So while the individual interrupts are 'wrong' we get them with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.

So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.

So no need to 'fix' the tools, all we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/cpu/perf_event.c       |    9 +++++++++
 arch/x86/kernel/cpu/perf_event.h       |    1 +
 arch/x86/kernel/cpu/perf_event_intel.c |   27 +++++++++++++++++++++++++++
 3 files changed, 37 insertions(+)

--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -451,6 +451,12 @@ int x86_pmu_hw_config(struct perf_event
 	if (event->attr.type == PERF_TYPE_RAW)
 		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
+	if (event->attr.sample_period && x86_pmu.limit_period) {
+		if (x86_pmu.limit_period(event, event->attr.sample_period) >
+				event->attr.sample_period)
+			return -EINVAL;
+	}
+
 	return x86_setup_perfctr(event);
 }
 
@@ -988,6 +994,9 @@ int x86_perf_event_set_period(struct per
 	if (left > x86_pmu.max_period)
 		left = x86_pmu.max_period;
 
+	if (x86_pmu.limit_period)
+		left = x86_pmu.limit_period(event, left);
+
 	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
 
 	/*
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -451,6 +451,7 @@ struct x86_pmu {
 	struct x86_pmu_quirk *quirks;
 	int		perfctr_second_write;
 	bool		late_ack;
+	unsigned	(*limit_period)(struct perf_event *event, unsigned l);
 
 	/*
 	 * sysfs attrs
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2097,6 +2097,32 @@ hsw_get_event_constraints(struct cpu_hw_
 	return c;
 }
 
+/*
+ * Broadwell:
+ *
+ * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
+ * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
+ * the two to enforce a minimum period of 128 (the smallest value that has bits
+ * 0-5 cleared and >= 100).
+ *
+ * Because of how the code in x86_perf_event_set_period() works, the truncation
+ * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
+ * to make up for the 'lost' events due to carrying the 'error' in period_left.
+ *
+ * Therefore the effective (average) period matches the requested period,
+ * despite coarser hardware granularity.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+	if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+			X86_CONFIG(.event=0xc0, .umask=0x01)) {
+		if (left < 128)
+			left = 128;
+		left &= ~0x3fu;
+	}
+	return left;
+}
+
 PMU_FORMAT_ATTR(event,	"config:0-7"	);
 PMU_FORMAT_ATTR(umask,	"config:8-15"	);
 PMU_FORMAT_ATTR(edge,	"config:18"	);
@@ -2775,6 +2801,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.hw_config = hsw_hw_config;
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
 		x86_pmu.cpu_events = hsw_events_attrs;
+		x86_pmu.limit_period = bdw_limit_period;
 		pr_cont("Broadwell events, ");
 		break;
 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds
  2015-03-23 12:35         ` Peter Zijlstra
@ 2015-03-23 13:32           ` Ingo Molnar
  0 siblings, 0 replies; 21+ messages in thread
From: Ingo Molnar @ 2015-03-23 13:32 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andi Kleen, linux-kernel, Andi Kleen


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Mar 23, 2015 at 11:39:00AM +0100, Ingo Molnar wrote:
> > > http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
> > > 
> > > BDM11 and BDM55 (not 57) tell us that the PMU will generate crap output
> > > if you don't do this. Non-fatal but gibberish.
> > 
> > Should be part of the changelog?
> 
> Sure, lemme go make that happen.
> 
> > So I did not say rounding up, I meant this sentence:
> > 
> > > > > + *   [...] We combine the two to enforce
> > > > > + * a min-period of 128.
> > 
> > IMO ambiguously suggests that the result of the combination of the two 
> > is to enforce a min-period of 128. Would somethin like this:
> > 
> > 	          We combine the two to enforce
> >           a min-period of 128, rounded (down) to multiples of 64.
> >           The original period is still kept by the core code and is 
> >           approximated in the long run via these slightly fuzzed 
> >           hardware-periods.
> 
> Like so then?

Yeah, looks good to me!

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/3] perf, x86: Add new cache events table for Haswell
  2015-03-23  9:45 ` [PATCH 1/3] perf, x86: Add new cache events table for Haswell Ingo Molnar
@ 2015-03-23 13:53   ` Peter Zijlstra
  2015-03-23 13:55     ` Ingo Molnar
  0 siblings, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2015-03-23 13:53 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andi Kleen, linux-kernel, Andi Kleen

On Mon, Mar 23, 2015 at 10:45:07AM +0100, Ingo Molnar wrote:
> 
> * Andi Kleen <andi@firstfloor.org> wrote:
> 
> > From: Andi Kleen <ak@linux.intel.com>
> > 
> > Haswell offcore events are quite different from Sandy Bridge.
> > Add a new table to handle Haswell properly.
> > 
> > Note that the offcore bits listed in the SDM are not quite correct
> > (this is currently being fixed). An uptodate list of bits is
> > in the patch.
> > 
> > The basic setup is similar to Sandy Bridge. The prefetch columns
> > have been removed, as prefetch counting is not very reliable
> > on Haswell. One L1 event that is not in the event list anymore
> > has been also removed.
> > 
> > - data reads do not include code reads (comparable to earlier Sandy
> > Bridge tables)
> > - data counts include speculative execution (except L1 write, dtlb, bpu)
> > - remote node access includes both remote memory, remote cache, remote mmio.
> > - prefetches are not included in the counts for consistency
> > (different from Sandy Bridge, which includes prefetches in the remote node)
> > 
> > The events with additional caveats have references to the specification update.
> 
> > +		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
> > +		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
> > +		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
> > +		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
> 
> So that 'HSM30' is code for the specification update?

Yep; found it in:
http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/4th-gen-core-family-mobile-specification-update.pdf

> You'll need to properly describe HSM30 at least once instead of using 
> obfuscation.


HSM30.
Problem:
Performance Monitor Counters May Produce Incorrect Results
When operating with SMT enabled, a memory at-retirement performance monitoring
event (from the list below) may be dropped or may increment an enabled event on the
corresponding counter with the same number on the physical core’s other thread rather
than the thread experiencing the event. Processors with SMT disabled in BIOS are not
affected by this erratum.
The list of affected memory at-retirement events is as follows:
MEM_UOP_RETIRED.LOADS
MEM_UOP_RETIRED.STORES
MEM_UOP_RETIRED.LOCK
MEM_UOP_RETIRED.SPLIT
MEM_UOP_RETIRED.STLB_MISS
MEM_LOAD_UOPS_RETIRED.HIT_LFB
MEM_LOAD_UOPS_RETIRED.L1_HIT
MEM_LOAD_UOPS_RETIRED.L2_HIT
MEM_LOAD_UOPS_RETIRED.L3_HIT
MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT
MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM
MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS
MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_NONE
MEM_LOAD_UOPS_RETIRED.L3_MISS
MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM
MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM
MEM_LOAD_UOPS_RETIRED.L2_MISS
Implication:
Due to this erratum, certain performance monitoring event will produce unreliable
results during hyper-threaded operation.
Workaround:
None identified.
Status:
For the steppings affected, see the Summary Table of Changes.

Stephane is working on patches to address this. It affects multiple
generations.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/3] perf, x86: Add new cache events table for Haswell
  2015-03-23 13:53   ` Peter Zijlstra
@ 2015-03-23 13:55     ` Ingo Molnar
  0 siblings, 0 replies; 21+ messages in thread
From: Ingo Molnar @ 2015-03-23 13:55 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andi Kleen, linux-kernel, Andi Kleen


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Mar 23, 2015 at 10:45:07AM +0100, Ingo Molnar wrote:
> > 
> > * Andi Kleen <andi@firstfloor.org> wrote:
> > 
> > > From: Andi Kleen <ak@linux.intel.com>
> > > 
> > > Haswell offcore events are quite different from Sandy Bridge.
> > > Add a new table to handle Haswell properly.
> > > 
> > > Note that the offcore bits listed in the SDM are not quite correct
> > > (this is currently being fixed). An uptodate list of bits is
> > > in the patch.
> > > 
> > > The basic setup is similar to Sandy Bridge. The prefetch columns
> > > have been removed, as prefetch counting is not very reliable
> > > on Haswell. One L1 event that is not in the event list anymore
> > > has been also removed.
> > > 
> > > - data reads do not include code reads (comparable to earlier Sandy
> > > Bridge tables)
> > > - data counts include speculative execution (except L1 write, dtlb, bpu)
> > > - remote node access includes both remote memory, remote cache, remote mmio.
> > > - prefetches are not included in the counts for consistency
> > > (different from Sandy Bridge, which includes prefetches in the remote node)
> > > 
> > > The events with additional caveats have references to the specification update.
> > 
> > > +		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
> > > +		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
> > > +		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
> > > +		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
> > 
> > So that 'HSM30' is code for the specification update?
> 
> Yep; found it in:
> http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/4th-gen-core-family-mobile-specification-update.pdf
> 
> > You'll need to properly describe HSM30 at least once instead of using 
> > obfuscation.
> 
> 
> HSM30.
> Problem:
> Performance Monitor Counters May Produce Incorrect Results
> When operating with SMT enabled, a memory at-retirement performance monitoring
> event (from the list below) may be dropped or may increment an enabled event on the
> corresponding counter with the same number on the physical core’s other thread rather
> than the thread experiencing the event. Processors with SMT disabled in BIOS are not
> affected by this erratum.
> The list of affected memory at-retirement events is as follows:
> MEM_UOP_RETIRED.LOADS
> MEM_UOP_RETIRED.STORES
> MEM_UOP_RETIRED.LOCK
> MEM_UOP_RETIRED.SPLIT
> MEM_UOP_RETIRED.STLB_MISS
> MEM_LOAD_UOPS_RETIRED.HIT_LFB
> MEM_LOAD_UOPS_RETIRED.L1_HIT
> MEM_LOAD_UOPS_RETIRED.L2_HIT
> MEM_LOAD_UOPS_RETIRED.L3_HIT
> MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT
> MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM
> MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS
> MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_NONE
> MEM_LOAD_UOPS_RETIRED.L3_MISS
> MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM
> MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM
> MEM_LOAD_UOPS_RETIRED.L2_MISS
> Implication:
> Due to this erratum, certain performance monitoring event will produce unreliable
> results during hyper-threaded operation.
> Workaround:
> None identified.
> Status:
> For the steppings affected, see the Summary Table of Changes.
> 
> Stephane is working on patches to address this. It affects multiple
> generations.

Ok - then at minimum a short summary of 'HSM30' should be added to 
the code, so that people know what's happening, without having to dig 
out Intel documents (which might be gone years down the line).

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [tip:perf/core] perf/x86/intel: Add new cache events table for Haswell
  2015-02-18  2:18 [PATCH 1/3] perf, x86: Add new cache events table for Haswell Andi Kleen
                   ` (2 preceding siblings ...)
  2015-03-23  9:45 ` [PATCH 1/3] perf, x86: Add new cache events table for Haswell Ingo Molnar
@ 2015-03-27 11:39 ` tip-bot for Andi Kleen
  3 siblings, 0 replies; 21+ messages in thread
From: tip-bot for Andi Kleen @ 2015-03-27 11:39 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: hpa, mingo, linux-kernel, peterz, ak, tglx

Commit-ID:  0f1b5ca240c65ed9533f193720f337bf24fb2f2f
Gitweb:     http://git.kernel.org/tip/0f1b5ca240c65ed9533f193720f337bf24fb2f2f
Author:     Andi Kleen <ak@linux.intel.com>
AuthorDate: Tue, 17 Feb 2015 18:18:04 -0800
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Fri, 27 Mar 2015 09:14:01 +0100

perf/x86/intel: Add new cache events table for Haswell

Haswell offcore events are quite different from Sandy Bridge.
Add a new table to handle Haswell properly.

Note that the offcore bits listed in the SDM are not quite correct
(this is currently being fixed). An up-to-date list of bits is
in the patch.

The basic setup is similar to Sandy Bridge. The prefetch columns
have been removed, as prefetch counting is not very reliable
on Haswell. One L1 event that is no longer in the event list
has also been removed.

- data reads do not include code reads (comparable to earlier Sandy Bridge tables)
- data counts include speculative execution (except L1 write, dtlb, bpu)
- remote node access includes remote memory, remote cache, and remote mmio.
- prefetches are not included in the counts for consistency
  (different from Sandy Bridge, which includes prefetches in the remote node)

Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Removed the HSM30 comments; we don't have them for SNB/IVB either. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 194 ++++++++++++++++++++++++++++++++-
 1 file changed, 192 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9f1dd18..5ef64bf 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -415,6 +415,196 @@ static __initconst const u64 snb_hw_cache_event_ids
 
 };
 
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ *   reliably counted.
+ */
+
+#define HSW_DEMAND_DATA_RD		BIT_ULL(0)
+#define HSW_DEMAND_RFO			BIT_ULL(1)
+#define HSW_ANY_RESPONSE		BIT_ULL(16)
+#define HSW_SUPPLIER_NONE		BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM		BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0		BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1		BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P	BIT_ULL(29)
+#define HSW_L3_MISS			(HSW_L3_MISS_LOCAL_DRAM| \
+					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+					 HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE			BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED		BIT_ULL(32)
+#define HSW_SNOOP_MISS			BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD		BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD		BIT_ULL(35)
+#define HSW_SNOOP_HITM			BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM		BIT_ULL(37)
+#define HSW_ANY_SNOOP			(HSW_SNOOP_NONE| \
+					 HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+					 HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+					 HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_SNOOP_DRAM			(HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ			HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE		HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE		(HSW_L3_MISS_REMOTE_HOP0|\
+					 HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_LLC_ACCESS			HSW_ANY_RESPONSE
+
+static __initconst const u64 hsw_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS */
+		[ C(RESULT_MISS)   ] = 0x151,	/* L1D.REPLACEMENT */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES */
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x280,	/* ICACHE.MISSES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS */
+		[ C(RESULT_MISS)   ] = 0x108,	/* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES */
+		[ C(RESULT_MISS)   ] = 0x149,	/* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x6085,	/* ITLB_MISSES.STLB_HIT */
+		[ C(RESULT_MISS)   ] = 0x185,	/* ITLB_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0xc4,	/* BR_INST_RETIRED.ALL_BRANCHES */
+		[ C(RESULT_MISS)   ] = 0xc5,	/* BR_MISP_RETIRED.ALL_BRANCHES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+				       HSW_LLC_ACCESS,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS|HSW_ANY_SNOOP,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+				       HSW_LLC_ACCESS,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS|HSW_ANY_SNOOP,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS_LOCAL_DRAM|
+				       HSW_SNOOP_DRAM,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS_REMOTE|
+				       HSW_SNOOP_DRAM,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS_LOCAL_DRAM|
+				       HSW_SNOOP_DRAM,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS_REMOTE|
+				       HSW_SNOOP_DRAM,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
 static __initconst const u64 westmere_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2520,8 +2710,8 @@ __init int intel_pmu_init(void)
 	case 69: /* 22nm Haswell ULT */
 	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
 		x86_pmu.late_ack = true;
-		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
 
 		intel_pmu_lbr_init_hsw();
 

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [tip:perf/core] perf/x86/intel: Add Broadwell core support
  2015-02-18  2:18 ` [PATCH 2/3] perf, x86: Add Broadwell core support Andi Kleen
@ 2015-03-27 11:39   ` tip-bot for Andi Kleen
  0 siblings, 0 replies; 21+ messages in thread
From: tip-bot for Andi Kleen @ 2015-03-27 11:39 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: mingo, ak, linux-kernel, tglx, peterz, hpa

Commit-ID:  91f1b70582c62576f429cf78d53751c66677553d
Gitweb:     http://git.kernel.org/tip/91f1b70582c62576f429cf78d53751c66677553d
Author:     Andi Kleen <ak@linux.intel.com>
AuthorDate: Tue, 17 Feb 2015 18:18:05 -0800
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Fri, 27 Mar 2015 09:14:02 +0100

perf/x86/intel: Add Broadwell core support

Add Broadwell core support to perf.

The basic support is very similar to Haswell. We use the new cache
event list added for Haswell earlier. The only differences
are a few bits related to remote nodes. To avoid an extra,
mostly identical table, these are patched up in the initialization code.

The constraint list has one new event that needs to be handled compared to Haswell.

Includes code and testing from Kan Liang.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 47 ++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 5ef64bf..2883853 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -220,6 +220,15 @@ static struct event_constraint intel_hsw_event_constraints[] = {
 	EVENT_CONSTRAINT_END
 };
 
+struct event_constraint intel_bdw_event_constraints[] = {
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
+	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */
+	INTEL_UEVENT_CONSTRAINT(0x148, 0x4),	/* L1D_PEND_MISS.PENDING */
+	INTEL_EVENT_CONSTRAINT(0xa3, 0x4),	/* CYCLE_ACTIVITY.* */
+	EVENT_CONSTRAINT_END
+};
+
 static u64 intel_pmu_event_map(int hw_event)
 {
 	return intel_perfmon_event_map[hw_event];
@@ -453,6 +462,12 @@ static __initconst const u64 snb_hw_cache_event_ids
 					 HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
 #define HSW_LLC_ACCESS			HSW_ANY_RESPONSE
 
+#define BDW_L3_MISS_LOCAL		BIT(26)
+#define BDW_L3_MISS			(BDW_L3_MISS_LOCAL| \
+					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+					 HSW_L3_MISS_REMOTE_HOP2P)
+
+
 static __initconst const u64 hsw_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2730,6 +2745,38 @@ __init int intel_pmu_init(void)
 		pr_cont("Haswell events, ");
 		break;
 
+	case 61: /* 14nm Broadwell Core-M */
+	case 86: /* 14nm Broadwell Xeon D */
+		x86_pmu.late_ack = true;
+		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+		/* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+		hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
+									 BDW_L3_MISS|HSW_SNOOP_DRAM;
+		hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
+									  HSW_SNOOP_DRAM;
+		hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
+									     BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+		hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
+									      BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+
+		intel_pmu_lbr_init_snb();
+
+		x86_pmu.event_constraints = intel_bdw_event_constraints;
+		x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+		x86_pmu.extra_regs = intel_snbep_extra_regs;
+		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+		/* all extra regs are per-cpu when HT is on */
+		x86_pmu.er_flags |= ERF_HAS_RSP_1;
+		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+
+		x86_pmu.hw_config = hsw_hw_config;
+		x86_pmu.get_event_constraints = hsw_get_event_constraints;
+		x86_pmu.cpu_events = hsw_events_attrs;
+		pr_cont("Broadwell events, ");
+		break;
+
 	default:
 		switch (x86_pmu.version) {
 		case 1:


* [tip:perf/core] perf/x86/intel: Add INST_RETIRED.ALL workarounds
  2015-02-18  2:18 ` [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds Andi Kleen
  2015-03-23  9:38   ` Ingo Molnar
@ 2015-03-27 11:39   ` tip-bot for Andi Kleen
  1 sibling, 0 replies; 21+ messages in thread
From: tip-bot for Andi Kleen @ 2015-03-27 11:39 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: peterz, tglx, hpa, linux-kernel, mingo, ak

Commit-ID:  294fe0f52a44c6f207211de0686c369a961b5533
Gitweb:     http://git.kernel.org/tip/294fe0f52a44c6f207211de0686c369a961b5533
Author:     Andi Kleen <ak@linux.intel.com>
AuthorDate: Tue, 17 Feb 2015 18:18:06 -0800
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Fri, 27 Mar 2015 09:14:03 +0100

perf/x86/intel: Add INST_RETIRED.ALL workarounds

On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.

These are errata BDM11 and BDM55:

  http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf

BDM11: When using a period < 100, we may get incorrect PEBS/PMI
interrupts and/or an invalid counter state.
BDM55: When bits 0-5 of the period are not 0 we may get redundant PEBS
records on overflow.

Add a new callback to enforce this, and set it for Broadwell.

How does this handle the case when an app requests a specific
period with some of the bottom bits set?

Short answer:

Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as a PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.

Long answer (by Peterz):

IFF we guarantee perf_event_attr::sample_period >= 128.

Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.

So while the individual interrupts are 'wrong' we get them with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.

So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.

So no need to 'fix' the tools, all we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.
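
To illustrate the averaging (a minimal user-space sketch, not part of
the patch; it assumes the clear-bits-0-5/floor-at-128 rule of
bdw_limit_period() and the period_left bookkeeping described above):

#include <stdio.h>

/* Same rounding as bdw_limit_period(): floor at 128, clear bits 0-5. */
static long limit_period(long left)
{
	if (left < 128)
		left = 128;
	return left & ~0x3fL;
}

int main(void)
{
	long sample_period = 192;	/* requested period (>= 128) */
	long period_left = sample_period;
	long n = 0;			/* instructions retired so far */
	long last = 0, samples = 0, gap_sum = 0;
	int i;

	for (i = 0; i < 1000; i++) {	/* one iteration per PMI */
		long left = period_left;

		if (left <= 0) {	/* overflow: this PMI is a sample */
			left += sample_period;
			gap_sum += n - last;
			last = n;
			samples++;
		}
		period_left = left;
		left = limit_period(left);	/* value programmed into the counter */

		n += left;		/* counter fires after 'left' instructions */
		period_left -= left;
	}
	/* Prints an average that converges to 192, as argued above. */
	printf("average sample distance = %ld\n", gap_sum / samples);
	return 0;
}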

Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Updated comments and changelog a bit. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c       |  9 +++++++++
 arch/x86/kernel/cpu/perf_event.h       |  1 +
 arch/x86/kernel/cpu/perf_event_intel.c | 27 +++++++++++++++++++++++++++
 3 files changed, 37 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e0dab5c..ec6e982 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -451,6 +451,12 @@ int x86_pmu_hw_config(struct perf_event *event)
 	if (event->attr.type == PERF_TYPE_RAW)
 		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
+	if (event->attr.sample_period && x86_pmu.limit_period) {
+		if (x86_pmu.limit_period(event, event->attr.sample_period) >
+				event->attr.sample_period)
+			return -EINVAL;
+	}
+
 	return x86_setup_perfctr(event);
 }
 
@@ -988,6 +994,9 @@ int x86_perf_event_set_period(struct perf_event *event)
 	if (left > x86_pmu.max_period)
 		left = x86_pmu.max_period;
 
+	if (x86_pmu.limit_period)
+		left = x86_pmu.limit_period(event, left);
+
 	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
 
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index a371d27..87e5081 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -451,6 +451,7 @@ struct x86_pmu {
 	struct x86_pmu_quirk *quirks;
 	int		perfctr_second_write;
 	bool		late_ack;
+	unsigned	(*limit_period)(struct perf_event *event, unsigned l);
 
 	/*
 	 * sysfs attrs
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 2883853..fc6dbc4 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2096,6 +2096,32 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 	return c;
 }
 
+/*
+ * Broadwell:
+ *
+ * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
+ * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
+ * the two to enforce a minimum period of 128 (the smallest value that has bits
+ * 0-5 cleared and >= 100).
+ *
+ * Because of how the code in x86_perf_event_set_period() works, the truncation
+ * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
+ * to make up for the 'lost' events due to carrying the 'error' in period_left.
+ *
+ * Therefore the effective (average) period matches the requested period,
+ * despite coarser hardware granularity.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+	if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+			X86_CONFIG(.event=0xc0, .umask=0x01)) {
+		if (left < 128)
+			left = 128;
+		left &= ~0x3fu;
+	}
+	return left;
+}
+
 PMU_FORMAT_ATTR(event,	"config:0-7"	);
 PMU_FORMAT_ATTR(umask,	"config:8-15"	);
 PMU_FORMAT_ATTR(edge,	"config:18"	);
@@ -2774,6 +2800,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.hw_config = hsw_hw_config;
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
 		x86_pmu.cpu_events = hsw_events_attrs;
+		x86_pmu.limit_period = bdw_limit_period;
 		pr_cont("Broadwell events, ");
 		break;
 


* Re: [PATCH 1/3] perf, x86: Add new cache events table for Haswell
  2015-02-12 21:28 ` Peter Zijlstra
@ 2015-02-18  4:58   ` Andi Kleen
  0 siblings, 0 replies; 21+ messages in thread
From: Andi Kleen @ 2015-02-18  4:58 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andi Kleen, linux-kernel, kan.liang, Andi Kleen

On Thu, Feb 12, 2015 at 10:28:06PM +0100, Peter Zijlstra wrote:
> 1) SDM (2014-09) says in 18.9.5 (snb offcore):
> 
> "software must set at least one request type bit and a valid response
> type pattern"
> 
> and
> 
> "A valid response type must be a non-zero value of the following
> expression:
> 
>   ANY | [('OR' of Supplier Info Bits) & ('OR' of Snoop Info Bits)]"
> 
> Is this still valid for the HSW part?

Yes.

However they need to be set on Sandy Bridge to work around an issue.
On HSW they don't hurt at least.

> #define HSW_LLC_ACCESS	(HSW_ANY_RESPONSE)
> #define HSW_LLC_MISS	(HSW_L3_MISS|HSW_ANY_SNOOP)
> 
> Would be the suitable helpers to use here. No need to set Supplier and
> Snoop bits for ANY_RESPONSE.

Ok.

> 2) I know I included it on SNB, but would SNOOP_HITM really be a miss?
> From what I can tell it could be a local MtoS w/ WB or so. Do we count
> it as a miss because the WB part goes to DRAM so we still get to wait
> for it (just in the 'wrong' direction)?

SNOOP_HITM is a miss because M is only present in one cache.
So it's a miss for the requester.

> 
> 3) While we're there, will we get SNOOP_FWD only for Clean forwards or
> also for the HITM forwards; the SDM is vague -- it would be nice if
> SNOOP_FWD was a selector for all remote socket snoops.

It's only for clean forwards.

> 
> > + [ C(NODE) ] = {
> > +	[ C(OP_READ) ] = {
> > +		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
> > +				       HSW_L3_MISS_LOCAL_DRAM|HSW_SUPPLIER_NONE|
> > +				       HSW_ANY_SNOOP,
> 
> 4) SUPPLIER_NONE; I'll interpret it as specific event that lacks other
> supplier info (as opposed to _any_ supplier). What kind of events would
> this be?

Your interpretation is right.
Ok, we can drop it. It should rarely happen.

> 
> I didn't include SUPPLIER_NONE in any events on SNB, its implied by
> L3_ACCESS due to ANY, but other than that I'm not sure what to do with
> it. It seems out of place for DRAM_ANY.
> 
> 5) NODE-ACCESS is _any_ DRAM;
>    NODE-MISS is remote DRAM.
> 
> for SNB I didn't include NON_DRAM in ANY_SNOOP for this reason.
> 
> #define HSW_DRAM_ANY		(HSW_LLC_MISS & ~HSW_SNOOP_NON_DRAM)
> #define HSW_DRAM_REMOTE		(HSW_MISS_LOCAL_DRAM|HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)

This should be more like
(HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P|HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)

> 
> 6) Should we maybe use
> 
> 	(SNOOP_ANY & ~(SNOOP_HIT_NO_FWD|NON_DRAM))
> 
> instead for DRAM_REMOTE? SNOOP_HIT_NO_FWD seems as inappropriate as
> NON_DRAM for REMOTE.

SNOOP_HIT_NO_FWD indicates DRAM.

-Andi


* Re: [PATCH 1/3] perf, x86: Add new cache events table for Haswell
  2015-02-11  0:40 [PATCH 1/3] perf, x86: " Andi Kleen
@ 2015-02-12 21:28 ` Peter Zijlstra
  2015-02-18  4:58   ` Andi Kleen
  0 siblings, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2015-02-12 21:28 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, kan.liang, Andi Kleen

On Tue, Feb 10, 2015 at 04:40:22PM -0800, Andi Kleen wrote:

> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
> index 498b6d9..02ab31d 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -415,6 +415,199 @@ static __initconst const u64 snb_hw_cache_event_ids
>  
>  };
>  
> +/*
> + * Notes on the events:
> + * - data reads do not include code reads (comparable to earlier tables)
> + * - data counts include speculative execution (except L1 write, dtlb, bpu)
> + * - remote node access includes remote memory, remote cache, remote mmio.
> + * - prefetches are not included in the counts because they are not
> + *   reliably counted.
> + * The events with additional caveats have references to the specification update.
> + */
> +
> +#define HSW_DEMAND_DATA_RD		BIT_ULL(0)
> +#define HSW_DEMAND_RFO			BIT_ULL(1)
> +#define HSW_ANY_RESPONSE		BIT_ULL(16)
> +#define HSW_SUPPLIER_NONE		BIT_ULL(17)
> +#define HSW_L3_MISS_LOCAL_DRAM		BIT_ULL(22)
> +#define HSW_L3_MISS_REMOTE_HOP0		BIT_ULL(27)
> +#define HSW_L3_MISS_REMOTE_HOP1		BIT_ULL(28)
> +#define HSW_L3_MISS_REMOTE_HOP2P	BIT_ULL(29)
> +#define HSW_L3_MISS			(HSW_L3_MISS_LOCAL_DRAM| \
> +					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
> +					 HSW_L3_MISS_REMOTE_HOP2P)
> +#define HSW_SNOOP_NONE			BIT_ULL(31)
> +#define HSW_SNOOP_NOT_NEEDED		BIT_ULL(32)
> +#define HSW_SNOOP_MISS			BIT_ULL(33)
> +#define HSW_SNOOP_HIT_NO_FWD		BIT_ULL(34)
> +#define HSW_SNOOP_HIT_WITH_FWD		BIT_ULL(35)
> +#define HSW_SNOOP_HITM			BIT_ULL(36)
> +#define HSW_SNOOP_NON_DRAM		BIT_ULL(37)
> +#define HSW_ANY_SNOOP			(HSW_SNOOP_NONE| \
> +					 HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
> +					 HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
> +					 HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
> +#define HSW_DEMAND_READ			HSW_DEMAND_DATA_RD
> +#define HSW_DEMAND_WRITE		HSW_DEMAND_RFO
> +#define HSW_L3_MISS_REMOTE		(HSW_L3_MISS_REMOTE_HOP0|\
> +					 HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
> +

> +static __initconst const u64 hsw_hw_cache_extra_regs
> +				[PERF_COUNT_HW_CACHE_MAX]
> +				[PERF_COUNT_HW_CACHE_OP_MAX]
> +				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
> +{
> + [ C(LL  ) ] = {
> +	[ C(OP_READ) ] = {
> +		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
> +				       HSW_ANY_RESPONSE|HSW_ANY_SNOOP|
> +				       HSW_SUPPLIER_NONE,
> +		[ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
> +				       HSW_L3_MISS|HSW_ANY_SNOOP|
> +				       HSW_SUPPLIER_NONE,

OK, sorry for going on about this, I'm trying to understand a few things:

1) SDM (2014-09) says in 18.9.5 (snb offcore):

"software must set at least one request type bit and a valid response
type pattern"

and

"A valid response type must be a non-zero value of the following
expression:

  ANY | [('OR' of Supplier Info Bits) & ('OR' of Snoop Info Bits)]"

Is this still valid for the HSW part?

Assuming so, it appears to me that:

#define HSW_LLC_ACCESS	(HSW_ANY_RESPONSE)
#define HSW_LLC_MISS	(HSW_L3_MISS|HSW_ANY_SNOOP)

Would be the suitable helpers to use here. No need to set Supplier and
Snoop bits for ANY_RESPONSE.
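
(A hypothetical helper, only to restate the rule from 1) as code;
valid_response_type() and its mask arguments are made up for
illustration and are not complete lists of the HSW bits:)

#include <stdint.h>

typedef uint64_t u64;

#define HSW_ANY_RESPONSE	(1ULL << 16)

static int valid_response_type(u64 val, u64 request_mask,
			       u64 supplier_mask, u64 snoop_mask)
{
	if (!(val & request_mask))
		return 0;	/* need at least one request type bit */

	return (val & HSW_ANY_RESPONSE) ||
	       ((val & supplier_mask) && (val & snoop_mask));
}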

2) I know I included it on SNB, but would SNOOP_HITM really be a miss?
From what I can tell it could be a local MtoS w/ WB or so. Do we count
it as a miss because the WB part goes to DRAM so we still get to wait
for it (just in the 'wrong' direction)?

3) While we're there, will we get SNOOP_FWD only for Clean forwards or
also for the HITM forwards; the SDM is vague -- it would be nice if
SNOOP_FWD was a selector for all remote socket snoops.

> + [ C(NODE) ] = {
> +	[ C(OP_READ) ] = {
> +		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
> +				       HSW_L3_MISS_LOCAL_DRAM|HSW_SUPPLIER_NONE|
> +				       HSW_ANY_SNOOP,

4) SUPPLIER_NONE; I'll interpret it as specific event that lacks other
supplier info (as opposed to _any_ supplier). What kind of events would
this be?

I didn't include SUPPLIER_NONE in any events on SNB, its implied by
L3_ACCESS due to ANY, but other than that I'm not sure what to do with
it. It seems out of place for DRAM_ANY.

5) NODE-ACCESS is _any_ DRAM;
   NODE-MISS is remote DRAM.

for SNB I didn't include NON_DRAM in ANY_SNOOP for this reason.

#define HSW_DRAM_ANY		(HSW_LLC_MISS & ~HSW_SNOOP_NON_DRAM)
#define HSW_DRAM_REMOTE		(HSW_MISS_LOCAL_DRAM|HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)

6) Should we maybe use

	(SNOOP_ANY & ~(SNOOP_HIT_NO_FWD|NON_DRAM))

instead for DRAM_REMOTE? SNOOP_HIT_NO_FWD seems as inappropriate as
NON_DRAM for REMOTE.

The SNB patch would look something like so I suppose..

---
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 498b6d967138..d2030868444c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -225,6 +225,21 @@ static u64 intel_pmu_event_map(int hw_event)
 	return intel_perfmon_event_map[hw_event];
 }
 
+
+/*
+ * SDM (2014-09) says in 18.9.5 (snb offcore):
+ *
+ * "software must set at least one request type bit and a valid response type
+ * pattern"
+ *
+ * and
+ *
+ * "A valid response type must be a non-zero value of the following expression:
+ *
+ *    ANY | [('OR' of Supplier Info Bits) & ('OR' of Snoop Info Bits)]"
+ */
+
+/* Request */
 #define SNB_DMND_DATA_RD	(1ULL << 0)
 #define SNB_DMND_RFO		(1ULL << 1)
 #define SNB_DMND_IFETCH		(1ULL << 2)
@@ -238,7 +253,10 @@ static u64 intel_pmu_event_map(int hw_event)
 #define SNB_BUS_LOCKS		(1ULL << 10)
 #define SNB_STRM_ST		(1ULL << 11)
 #define SNB_OTHER		(1ULL << 15)
+
 #define SNB_RESP_ANY		(1ULL << 16)
+
+/* Supplier */
 #define SNB_NO_SUPP		(1ULL << 17)
 #define SNB_LLC_HITM		(1ULL << 18)
 #define SNB_LLC_HITE		(1ULL << 19)
@@ -246,6 +264,8 @@ static u64 intel_pmu_event_map(int hw_event)
 #define SNB_LLC_HITF		(1ULL << 21)
 #define SNB_LOCAL		(1ULL << 22)
 #define SNB_REMOTE		(0xffULL << 23)
+
+/* Snoop */
 #define SNB_SNP_NONE		(1ULL << 31)
 #define SNB_SNP_NOT_NEEDED	(1ULL << 32)
 #define SNB_SNP_MISS		(1ULL << 33)
@@ -258,12 +278,12 @@ static u64 intel_pmu_event_map(int hw_event)
 #define SNB_DMND_WRITE		(SNB_DMND_RFO|SNB_LLC_RFO)
 #define SNB_DMND_PREFETCH	(SNB_PF_DATA_RD|SNB_PF_RFO)
 
-#define SNB_SNP_ANY		(SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \
-				 SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \
-				 SNB_HITM)
+#define SNB_NO_SNP		(SNB_SNP_NONE|SNB_SNP_NOT_NEEDED|SNB_SNP_MISS)
+#define SNB_REMOTE_SNP		(SNB_NO_SNP|SNB_SNP_FWD|SNB_HITM)
+#define SNB_ANY_SNP		(SNB_REMOTE_SNP|SNB_NO_FWD)
 
-#define SNB_DRAM_ANY		(SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY)
-#define SNB_DRAM_REMOTE		(SNB_REMOTE|SNB_SNP_ANY)
+#define SNB_DRAM_ANY		(SNB_LOCAL|SNB_REMOTE|SNB_ANY_SNP)
+#define SNB_DRAM_REMOTE		(SNB_REMOTE|SNB_REMOTE_SNP)
 
 #define SNB_L3_ACCESS		SNB_RESP_ANY
 #define SNB_L3_MISS		(SNB_DRAM_ANY|SNB_NON_DRAM)


* [PATCH 1/3] perf, x86: Add new cache events table for Haswell
@ 2015-02-11  0:40 Andi Kleen
  2015-02-12 21:28 ` Peter Zijlstra
  0 siblings, 1 reply; 21+ messages in thread
From: Andi Kleen @ 2015-02-11  0:40 UTC (permalink / raw)
  To: peterz; +Cc: linux-kernel, kan.liang, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Haswell offcore events are quite different from Sandy Bridge.
Add a new table to handle Haswell properly.

Note that the offcore bits listed in the SDM are not quite correct
(this is currently being fixed). An up-to-date list of bits is
in the patch.

The basic setup is similar to Sandy Bridge. The prefetch columns
have been removed, as prefetch counting is not very reliable
on Haswell. One L1 event that is no longer in the event list
has also been removed.

- data reads do not include code reads (comparable to earlier Sandy
Bridge tables)
- data counts include speculative execution (except L1 write, dtlb, bpu)
- remote node access includes remote memory, remote cache, and remote mmio.
- prefetches are not included in the counts for consistency
(different from Sandy Bridge, which includes prefetches in the remote node)

The events with additional caveats have references to the specification update.

v2: Change name of variable.
v3: Based on older Broadwell patch, but now just for Haswell.
    Various fixes with events being more similar to the older
    Sandy Bridge tables.
v4: Add aliases. Drop two PF bits.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 197 ++++++++++++++++++++++++++++++++-
 1 file changed, 195 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 498b6d9..02ab31d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -415,6 +415,199 @@ static __initconst const u64 snb_hw_cache_event_ids
 
 };
 
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ *   reliably counted.
+ * The events with additional caveats have references to the specification update.
+ */
+
+#define HSW_DEMAND_DATA_RD		BIT_ULL(0)
+#define HSW_DEMAND_RFO			BIT_ULL(1)
+#define HSW_ANY_RESPONSE		BIT_ULL(16)
+#define HSW_SUPPLIER_NONE		BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM		BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0		BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1		BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P	BIT_ULL(29)
+#define HSW_L3_MISS			(HSW_L3_MISS_LOCAL_DRAM| \
+					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+					 HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE			BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED		BIT_ULL(32)
+#define HSW_SNOOP_MISS			BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD		BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD		BIT_ULL(35)
+#define HSW_SNOOP_HITM			BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM		BIT_ULL(37)
+#define HSW_ANY_SNOOP			(HSW_SNOOP_NONE| \
+					 HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+					 HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+					 HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ			HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE		HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE		(HSW_L3_MISS_REMOTE_HOP0|\
+					 HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+
+static __initconst const u64 hsw_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0, 	/* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
+		[ C(RESULT_MISS)   ] = 0x151, 	/* L1D.REPLACEMENT */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0, 	/* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x280, 	/* ICACHE.MISSES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0, 	/* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
+		[ C(RESULT_MISS)   ] = 0x108, 	/* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0, 	/* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
+		[ C(RESULT_MISS)   ] = 0x149, 	/* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x6085, 	/* ITLB_MISSES.STLB_HIT */
+		[ C(RESULT_MISS)   ] = 0x185, 	/* ITLB_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0xc4, 	/* BR_INST_RETIRED.ALL_BRANCHES */
+		[ C(RESULT_MISS)   ] = 0xc5, 	/* BR_MISP_RETIRED.ALL_BRANCHES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+				       HSW_ANY_RESPONSE|HSW_ANY_SNOOP|
+				       HSW_SUPPLIER_NONE,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS|HSW_ANY_SNOOP|
+				       HSW_SUPPLIER_NONE,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+				       HSW_ANY_RESPONSE|HSW_ANY_SNOOP|
+				       HSW_SUPPLIER_NONE,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS|HSW_ANY_SNOOP|
+				       HSW_SUPPLIER_NONE,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS_LOCAL_DRAM|HSW_SUPPLIER_NONE|
+				       HSW_ANY_SNOOP,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS_REMOTE|HSW_SUPPLIER_NONE|
+				       HSW_ANY_SNOOP,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS_LOCAL_DRAM|HSW_SUPPLIER_NONE|
+				       HSW_ANY_SNOOP,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS_REMOTE|HSW_SUPPLIER_NONE|
+				       HSW_ANY_SNOOP,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
 static __initconst const u64 westmere_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2546,8 +2739,8 @@ __init int intel_pmu_init(void)
 	case 69: /* 22nm Haswell ULT */
 	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
 		x86_pmu.late_ack = true;
-		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
 
 		intel_pmu_lbr_init_snb();
 
-- 
1.9.3



* Re: [PATCH 1/3] perf, x86: Add new cache events table for Haswell
  2015-02-10 17:11   ` Andi Kleen
@ 2015-02-10 17:16     ` Peter Zijlstra
  0 siblings, 0 replies; 21+ messages in thread
From: Peter Zijlstra @ 2015-02-10 17:16 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Andi Kleen, linux-kernel, kan.liang

On Tue, Feb 10, 2015 at 09:11:35AM -0800, Andi Kleen wrote:

> > Now the other tables create little helpers like:
> > 
> > #define HSW_DMND_READ	(HSW_DMND_DATA_RD)
> > #define HSW_DMND_WRITE	(HSW_DMND_RFO)
> > 
> > #define HSW_L3_ACCESS	(HSW_ANY_RESPONSE)
> > #define HSW_L3_MISS	(HSW_L3_MISS)
> > 
> > And compose the tables values using those:
> > 
> > 	HSW_DMND_READ|HSW_L3_ACCESS
> > 
> > Please do so here too.
> 
> I'm trying to stay with the official documented bit names. No such helper names exist in the documentation.
> If we make up our own names nobody else can read it anymore.

Well, its a simple matter of looking up the bit definitions; we define
those helpers in terms of the official names after all.

By having the 4 helpers {r,w}x{access,miss} you avoid some repetition
and decrease the room for mistakes.

> > Now; when comparing these values to the SNB for example I note that you
> > include ANY_SNOOP and SUPPLIER_NONE in L3_ACCESS, SNB and others do not,
> > please explain.
> 
> You're supposed to set a snoop and supplier qualifier.
> AFAIK SNB should set them too. It may work without them due to some
> quirk.

OK, then we should fix the others. Thanks.


* Re: [PATCH 1/3] perf, x86: Add new cache events table for Haswell
  2015-02-10 10:34 ` Peter Zijlstra
@ 2015-02-10 17:11   ` Andi Kleen
  2015-02-10 17:16     ` Peter Zijlstra
  0 siblings, 1 reply; 21+ messages in thread
From: Andi Kleen @ 2015-02-10 17:11 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andi Kleen, linux-kernel, kan.liang

> Just in case someone is crazy enough to run a 32bit kernel on HSW, this
> needs to be BIT_ULL() -- also for compile testing, gcc tends to complain
> about things like (1UL << 32) for ILP32 targets.

Ok.

> Now the other tables create little helpers like:
> 
> #define HSW_DMND_READ	(HSW_DMND_DATA_RD)
> #define HSW_DMND_WRITE	(HSW_DMND_RFO)
> 
> #define HSW_L3_ACCESS	(HSW_ANY_RESPONSE)
> #define HSW_L3_MISS	(HSW_L3_MISS)
> 
> And compose the tables values using those:
> 
> 	HSW_DMND_READ|HSW_L3_ACCESS
> 
> Please do so here too.

I'm trying to stay with the official documented bit names. No such helper names exist in the documentation.
If we make up our own names nobody else can read it anymore.

> 
> Now; when comparing these values to the SNB for example I note that you
> include ANY_SNOOP and SUPPLIER_NONE in L3_ACCESS, SNB and others do not,
> please explain.

You're supposed to set a snoop and supplier qualifier.
AFAIK SNB should set them too. It may work without them due to some
quirk.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only


* Re: [PATCH 1/3] perf, x86: Add new cache events table for Haswell
  2015-02-09 19:17 Andi Kleen
@ 2015-02-10 10:34 ` Peter Zijlstra
  2015-02-10 17:11   ` Andi Kleen
  0 siblings, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2015-02-10 10:34 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, kan.liang, Andi Kleen

On Mon, Feb 09, 2015 at 11:17:44AM -0800, Andi Kleen wrote:
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -415,6 +415,199 @@ static __initconst const u64 snb_hw_cache_event_ids
>  
>  };
>  
> +/*
> + * Notes on the events:
> + * - data reads do not include code reads (comparable to earlier tables)
> + * - data counts include speculative execution (except L1 write, dtlb, bpu)
> + * - remote node access includes both remote memory, remote cache, remote mmio.
> + * - prefetches are not included in the counts.
> + * The events with additional caveats have references to the specification update.
> + */

It would be good to have the prefetch note from the changelog in a
comment here or at the extra_regs table; i.e. prefetches are not
included because they're 'broken'.

> +
> +#define HSW_DEMAND_DATA_RD		BIT(0)
> +#define HSW_DEMAND_RFO			BIT(1)
> +#define HSW_PF_L2_RFO			BIT(5)
> +#define HSW_PF_L3_RFO			BIT(8)
> +#define HSW_ALL_RFO			(HSW_DEMAND_RFO| \
> +					 HSW_PF_L2_RFO|HSW_PF_L3_RFO)
> +#define HSW_ANY_RESPONSE		BIT(16)
> +#define HSW_SUPPLIER_NONE		BIT(17)
> +#define HSW_L3_MISS_LOCAL		BIT(22)
> +#define HSW_L3_MISS_REMOTE_HOP0		BIT(27)
> +#define HSW_L3_MISS_REMOTE_HOP1		BIT(28)
> +#define HSW_L3_MISS_REMOTE_HOP2P	BIT(29)
> +#define HSW_L3_MISS			(HSW_L3_MISS_LOCAL| \
> +					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
> +					 HSW_L3_MISS_REMOTE_HOP2P)
> +#define HSW_SNOOP_NONE			BIT(31)
> +#define HSW_SNOOP_NOT_NEEDED		BIT(32)
> +#define HSW_SNOOP_MISS			BIT(33)
> +#define HSW_SNOOP_HIT_NO_FWD		BIT(34)
> +#define HSW_SNOOP_HIT_WITH_FWD		BIT(35)
> +#define HSW_SNOOP_HITM			BIT(36)
> +#define HSW_SNOOP_NON_DRAM		BIT(37)
> +#define HSW_ANY_SNOOP			(HSW_SNOOP_NONE| \
> +					 HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
> +					 HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
> +					 HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)

Just in case someone is crazy enough to run a 32bit kernel on HSW, this
needs to be BIT_ULL() -- also for compile testing, gcc tends to complain
about things like (1UL << 32) for ILP32 targets.
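
For reference, the two macros (paraphrased from the kernel's
include/linux/bitops.h):

#define BIT(nr)		(1UL << (nr))	/* unsigned long: only 32 bits on ILP32 */
#define BIT_ULL(nr)	(1ULL << (nr))	/* always 64 bits wide */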

> +static __initconst const u64 hsw_hw_cache_extra_regs
> +				[PERF_COUNT_HW_CACHE_MAX]
> +				[PERF_COUNT_HW_CACHE_OP_MAX]
> +				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
> +{
> + [ C(LL  ) ] = {
> +	[ C(OP_READ) ] = {
> +		[ C(RESULT_ACCESS) ] = HSW_DEMAND_DATA_RD|
> +				       HSW_ANY_RESPONSE|HSW_ANY_SNOOP|
> +				       HSW_SUPPLIER_NONE,
> +		[ C(RESULT_MISS)   ] = HSW_DEMAND_DATA_RD|
> +				       HSW_L3_MISS|HSW_ANY_SNOOP|
> +				       HSW_SUPPLIER_NONE,
> +	},
> +	[ C(OP_WRITE) ] = {
> +		[ C(RESULT_ACCESS) ] = HSW_DEMAND_RFO|
> +				       HSW_ANY_RESPONSE|HSW_ANY_SNOOP|
> +				       HSW_SUPPLIER_NONE,
> +		[ C(RESULT_MISS)   ] = HSW_DEMAND_RFO|HSW_L3_MISS|
> +				       HSW_ANY_SNOOP|HSW_SUPPLIER_NONE,
> +	},
> +	[ C(OP_PREFETCH) ] = {
> +		[ C(RESULT_ACCESS) ] = 0x0,
> +		[ C(RESULT_MISS)   ] = 0x0,
> +	},
> + },
> + [ C(NODE) ] = {
> +	[ C(OP_READ) ] = {
> +		[ C(RESULT_ACCESS) ] = HSW_DEMAND_DATA_RD|
> +				       HSW_L3_MISS_LOCAL|HSW_SUPPLIER_NONE|
> +				       HSW_ANY_SNOOP,
> +		[ C(RESULT_MISS)   ] = HSW_DEMAND_DATA_RD|
> +				       HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1|
> +				       HSW_L3_MISS_REMOTE_HOP2P|HSW_SUPPLIER_NONE|
> +				       HSW_ANY_SNOOP,
> +	},
> +	[ C(OP_WRITE) ] = {
> +		[ C(RESULT_ACCESS) ] = HSW_DEMAND_RFO|
> +				       HSW_L3_MISS_LOCAL|HSW_SUPPLIER_NONE|
> +				       HSW_ANY_SNOOP,
> +		[ C(RESULT_MISS)   ] = HSW_DEMAND_RFO|
> +				       HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1|
> +				       HSW_L3_MISS_REMOTE_HOP2P|HSW_SUPPLIER_NONE|
> +				       HSW_ANY_SNOOP,
> +	},
> +	[ C(OP_PREFETCH) ] = {
> +		[ C(RESULT_ACCESS) ] = 0x0,
> +		[ C(RESULT_MISS)   ] = 0x0,
> +	},
> + },
> +};

Now the other tables create little helpers like:

#define HSW_DMND_READ	(HSW_DMND_DATA_RD)
#define HSW_DMND_WRITE	(HSW_DMND_RFO)

#define HSW_L3_ACCESS	(HSW_ANY_RESPONSE)
#define HSW_L3_MISS	(HSW_L3_MISS)

And compose the tables values using those:

	HSW_DMND_READ|HSW_L3_ACCESS

Please do so here too.

Now; when comparing these values to the SNB for example I note that you
include ANY_SNOOP and SUPPLIER_NONE in L3_ACCESS, SNB and others do not,
please explain.




* [PATCH 1/3] perf, x86: Add new cache events table for Haswell
@ 2015-02-09 19:17 Andi Kleen
  2015-02-10 10:34 ` Peter Zijlstra
  0 siblings, 1 reply; 21+ messages in thread
From: Andi Kleen @ 2015-02-09 19:17 UTC (permalink / raw)
  To: peterz; +Cc: linux-kernel, kan.liang, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Haswell offcore events are quite different from Sandy Bridge.
Add a new table to handle Haswell properly.

Note that the offcore bits listed in the SDM are not quite correct
(this is currently being fixed). An up-to-date list of bits is
in the patch.

The basic setup is similar to Sandy Bridge. The prefetch columns
have been removed, as prefetch counting is not very reliable
on Haswell. One L1 event that is no longer in the event list
has also been removed.

- data reads do not include code reads (comparable to earlier Sandy
Bridge tables)
- data counts include speculative execution (except L1 write, dtlb, bpu)
- remote node access includes remote memory, remote cache, and remote mmio.
- prefetches are not included in the counts for consistency
(different from Sandy Bridge, which includes prefetches in the remote node)

The events with additional caveats have references to the specification update.

v2: Change name of variable.
v3: Based on older Broadwell patch, but now just for Haswell.
    Various fixes with events being more similar to the older
    Sandy Bridge tables.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 197 ++++++++++++++++++++++++++++++++-
 1 file changed, 195 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 498b6d9..387f580 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -415,6 +415,199 @@ static __initconst const u64 snb_hw_cache_event_ids
 
 };
 
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes both remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts.
+ * The events with additional caveats have references to the specification update.
+ */
+
+#define HSW_DEMAND_DATA_RD		BIT(0)
+#define HSW_DEMAND_RFO			BIT(1)
+#define HSW_PF_L2_RFO			BIT(5)
+#define HSW_PF_L3_RFO			BIT(8)
+#define HSW_ALL_RFO			(HSW_DEMAND_RFO| \
+					 HSW_PF_L2_RFO|HSW_PF_L3_RFO)
+#define HSW_ANY_RESPONSE		BIT(16)
+#define HSW_SUPPLIER_NONE		BIT(17)
+#define HSW_L3_MISS_LOCAL		BIT(22)
+#define HSW_L3_MISS_REMOTE_HOP0		BIT(27)
+#define HSW_L3_MISS_REMOTE_HOP1		BIT(28)
+#define HSW_L3_MISS_REMOTE_HOP2P	BIT(29)
+#define HSW_L3_MISS			(HSW_L3_MISS_LOCAL| \
+					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+					 HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE			BIT(31)
+#define HSW_SNOOP_NOT_NEEDED		BIT(32)
+#define HSW_SNOOP_MISS			BIT(33)
+#define HSW_SNOOP_HIT_NO_FWD		BIT(34)
+#define HSW_SNOOP_HIT_WITH_FWD		BIT(35)
+#define HSW_SNOOP_HITM			BIT(36)
+#define HSW_SNOOP_NON_DRAM		BIT(37)
+#define HSW_ANY_SNOOP			(HSW_SNOOP_NONE| \
+					 HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+					 HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+					 HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+
+static __initconst const u64 hsw_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0, 	/* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
+		[ C(RESULT_MISS)   ] = 0x151, 	/* L1D.REPLACEMENT */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0, 	/* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x280, 	/* ICACHE.MISSES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0, 	/* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
+		[ C(RESULT_MISS)   ] = 0x108, 	/* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0, 	/* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
+		[ C(RESULT_MISS)   ] = 0x149, 	/* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x6085, 	/* ITLB_MISSES.STLB_HIT */
+		[ C(RESULT_MISS)   ] = 0x185, 	/* ITLB_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0xc4, 	/* BR_INST_RETIRED.ALL_BRANCHES */
+		[ C(RESULT_MISS)   ] = 0xc5, 	/* BR_MISP_RETIRED.ALL_BRANCHES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_DATA_RD|
+				       HSW_ANY_RESPONSE|HSW_ANY_SNOOP|
+				       HSW_SUPPLIER_NONE,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_DATA_RD|
+				       HSW_L3_MISS|HSW_ANY_SNOOP|
+				       HSW_SUPPLIER_NONE,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_RFO|
+				       HSW_ANY_RESPONSE|HSW_ANY_SNOOP|
+				       HSW_SUPPLIER_NONE,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_RFO|HSW_L3_MISS|
+				       HSW_ANY_SNOOP|HSW_SUPPLIER_NONE,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_DATA_RD|
+				       HSW_L3_MISS_LOCAL|HSW_SUPPLIER_NONE|
+				       HSW_ANY_SNOOP,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_DATA_RD|
+				       HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1|
+				       HSW_L3_MISS_REMOTE_HOP2P|HSW_SUPPLIER_NONE|
+				       HSW_ANY_SNOOP,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_RFO|
+				       HSW_L3_MISS_LOCAL|HSW_SUPPLIER_NONE|
+				       HSW_ANY_SNOOP,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_RFO|
+				       HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1|
+				       HSW_L3_MISS_REMOTE_HOP2P|HSW_SUPPLIER_NONE|
+				       HSW_ANY_SNOOP,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
 static __initconst const u64 westmere_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2546,8 +2739,8 @@ __init int intel_pmu_init(void)
 	case 69: /* 22nm Haswell ULT */
 	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
 		x86_pmu.late_ack = true;
-		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
 
 		intel_pmu_lbr_init_snb();
 
-- 
1.9.3


