* [PATCH 1/3] perf, x86: Add new cache events table for Haswell
@ 2015-02-11  0:40 Andi Kleen
  2015-02-11  0:40 ` [PATCH 2/3] perf, x86: Add Broadwell core support Andi Kleen
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Andi Kleen @ 2015-02-11  0:40 UTC (permalink / raw)
  To: peterz; +Cc: linux-kernel, kan.liang, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Haswell offcore events are quite different from Sandy Bridge.
Add a new table to handle Haswell properly.

Note that the offcore bits listed in the SDM are not quite correct
(this is currently being fixed). An up-to-date list of bits is
in the patch.

The basic setup is similar to Sandy Bridge. The prefetch columns
have been removed, as prefetch counting is not very reliable
on Haswell. One L1 event that is no longer in the event list
has also been removed.

- data reads do not include code reads (comparable to earlier Sandy
Bridge tables)
- data counts include speculative execution (except L1 write, dtlb, bpu)
- remote node access includes remote memory, remote cache, and remote mmio
- prefetches are not included in the counts for consistency
(different from Sandy Bridge, which includes prefetches in the remote node)

The events with additional caveats have references to the specification update.
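
For illustration only (this snippet is not part of the patch), the LL read-miss
entry in the new extra-regs table is simply the OR of the request, supplier and
snoop bits defined below; a stand-alone sketch that prints the resulting
OFFCORE_RESPONSE value:

#include <stdio.h>
#include <stdint.h>

#define BIT_ULL(n)			(1ULL << (n))
#define HSW_DEMAND_DATA_RD		BIT_ULL(0)
#define HSW_SUPPLIER_NONE		BIT_ULL(17)
#define HSW_L3_MISS_LOCAL_DRAM		BIT_ULL(22)
#define HSW_L3_MISS_REMOTE_HOP0		BIT_ULL(27)
#define HSW_L3_MISS_REMOTE_HOP1		BIT_ULL(28)
#define HSW_L3_MISS_REMOTE_HOP2P	BIT_ULL(29)
#define HSW_L3_MISS			(HSW_L3_MISS_LOCAL_DRAM | \
					 HSW_L3_MISS_REMOTE_HOP0 | \
					 HSW_L3_MISS_REMOTE_HOP1 | \
					 HSW_L3_MISS_REMOTE_HOP2P)
/* snoop info bits 31-37, OR'ed together as in the patch */
#define HSW_ANY_SNOOP			(0x7fULL << 31)

int main(void)
{
	/* LL / OP_READ / RESULT_MISS as set up in hsw_hw_cache_extra_regs */
	uint64_t miss = HSW_DEMAND_DATA_RD | HSW_L3_MISS |
			HSW_ANY_SNOOP | HSW_SUPPLIER_NONE;

	printf("offcore_rsp for LL read miss: 0x%llx\n",
	       (unsigned long long)miss);
	return 0;
}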

v2: Change name of variable.
v3: Based on older Broadwell patch, but now just for Haswell.
    Various fixes with events being more similar to the older
    Sandy Bridge tables.
v4: Add aliases. Drop two PF bits.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 197 ++++++++++++++++++++++++++++++++-
 1 file changed, 195 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 498b6d9..02ab31d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -415,6 +415,199 @@ static __initconst const u64 snb_hw_cache_event_ids
 
 };
 
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ *   reliably counted.
+ * The events with additional caveats have references to the specification update.
+ */
+
+#define HSW_DEMAND_DATA_RD		BIT_ULL(0)
+#define HSW_DEMAND_RFO			BIT_ULL(1)
+#define HSW_ANY_RESPONSE		BIT_ULL(16)
+#define HSW_SUPPLIER_NONE		BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM		BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0		BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1		BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P	BIT_ULL(29)
+#define HSW_L3_MISS			(HSW_L3_MISS_LOCAL_DRAM| \
+					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+					 HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE			BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED		BIT_ULL(32)
+#define HSW_SNOOP_MISS			BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD		BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD		BIT_ULL(35)
+#define HSW_SNOOP_HITM			BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM		BIT_ULL(37)
+#define HSW_ANY_SNOOP			(HSW_SNOOP_NONE| \
+					 HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+					 HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+					 HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ			HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE		HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE		(HSW_L3_MISS_REMOTE_HOP0|\
+					 HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+
+static __initconst const u64 hsw_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0, 	/* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
+		[ C(RESULT_MISS)   ] = 0x151, 	/* L1D.REPLACEMENT */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0, 	/* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x280, 	/* ICACHE.MISSES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0, 	/* MEM_UOPS_RETIRED.ALL_LOADS, HSM30 */
+		[ C(RESULT_MISS)   ] = 0x108, 	/* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0, 	/* MEM_UOPS_RETIRED.ALL_STORES, HSM30 */
+		[ C(RESULT_MISS)   ] = 0x149, 	/* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x6085, 	/* ITLB_MISSES.STLB_HIT */
+		[ C(RESULT_MISS)   ] = 0x185, 	/* ITLB_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0xc4, 	/* BR_INST_RETIRED.ALL_BRANCHES */
+		[ C(RESULT_MISS)   ] = 0xc5, 	/* BR_MISP_RETIRED.ALL_BRANCHES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7, 	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+				       HSW_ANY_RESPONSE|HSW_ANY_SNOOP|
+				       HSW_SUPPLIER_NONE,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS|HSW_ANY_SNOOP|
+				       HSW_SUPPLIER_NONE,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+				       HSW_ANY_RESPONSE|HSW_ANY_SNOOP|
+				       HSW_SUPPLIER_NONE,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS|HSW_ANY_SNOOP|
+				       HSW_SUPPLIER_NONE,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS_LOCAL_DRAM|HSW_SUPPLIER_NONE|
+				       HSW_ANY_SNOOP,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS_REMOTE|HSW_SUPPLIER_NONE|
+				       HSW_ANY_SNOOP,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS_LOCAL_DRAM|HSW_SUPPLIER_NONE|
+				       HSW_ANY_SNOOP,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS_REMOTE|HSW_SUPPLIER_NONE|
+				       HSW_ANY_SNOOP,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
 static __initconst const u64 westmere_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2546,8 +2739,8 @@ __init int intel_pmu_init(void)
 	case 69: /* 22nm Haswell ULT */
 	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
 		x86_pmu.late_ack = true;
-		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
 
 		intel_pmu_lbr_init_snb();
 
-- 
1.9.3



* [PATCH 2/3] perf, x86: Add Broadwell core support
  2015-02-11  0:40 [PATCH 1/3] perf, x86: Add new cache events table for Haswell Andi Kleen
@ 2015-02-11  0:40 ` Andi Kleen
  2015-02-11  0:40 ` [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds Andi Kleen
  2015-02-12 21:28 ` [PATCH 1/3] perf, x86: Add new cache events table for Haswell Peter Zijlstra
  2 siblings, 0 replies; 12+ messages in thread
From: Andi Kleen @ 2015-02-11  0:40 UTC (permalink / raw)
  To: peterz; +Cc: linux-kernel, kan.liang, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Add Broadwell core support to perf.

The basic support is very similar to Haswell. We use the new cache
event list added for Haswell earlier. The only differences
are a few bits related to remote nodes. To avoid an extra, mostly
identical table, these are patched up in the initialization code.

The constraint list has one new event that needs to be handled compared to Haswell.

Includes code and testing from Kan Liang.

v2: Remove unnamed model numbers.
v3: Rename cache event list to hsw_*. Change names.
v4: Use symbolic names for cache events. Improve comments and description.
    Fix sparse warnings (Fengguang Wu)
    Add Xeon D model number.
    Remove cache event table (in separate patch)
    Patch up remote node differences (Kan Liang)
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 50 ++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 02ab31d..035f8d1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -220,6 +220,15 @@ static struct event_constraint intel_hsw_event_constraints[] = {
 	EVENT_CONSTRAINT_END
 };
 
+struct event_constraint intel_bdw_event_constraints[] = {
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
+	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */
+	INTEL_UEVENT_CONSTRAINT(0x148, 0x4),	/* L1D_PEND_MISS.PENDING */
+	INTEL_EVENT_CONSTRAINT(0xa3, 0x4),	/* CYCLE_ACTIVITY.* */
+	EVENT_CONSTRAINT_END
+};
+
 static u64 intel_pmu_event_map(int hw_event)
 {
 	return intel_perfmon_event_map[hw_event];
@@ -452,6 +461,12 @@ static __initconst const u64 snb_hw_cache_event_ids
 #define HSW_L3_MISS_REMOTE		(HSW_L3_MISS_REMOTE_HOP0|\
 					 HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
 
+#define BDW_L3_MISS_LOCAL		BIT(26)
+#define BDW_L3_MISS			(BDW_L3_MISS_LOCAL| \
+					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+					 HSW_L3_MISS_REMOTE_HOP2P)
+
+
 static __initconst const u64 hsw_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2759,6 +2774,41 @@ __init int intel_pmu_init(void)
 		pr_cont("Haswell events, ");
 		break;
 
+	case 61: /* 14nm Broadwell Core-M */
+	case 86: /* 14nm Broadwell Xeon D */
+		x86_pmu.late_ack = true;
+		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+		/* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+		hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
+									 BDW_L3_MISS|HSW_ANY_SNOOP|
+									 HSW_SUPPLIER_NONE;
+		hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
+									  HSW_ANY_SNOOP|HSW_SUPPLIER_NONE;
+		hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
+									     BDW_L3_MISS_LOCAL|HSW_SUPPLIER_NONE|
+									     HSW_ANY_SNOOP;
+		hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
+									      BDW_L3_MISS_LOCAL|HSW_SUPPLIER_NONE|
+									      HSW_ANY_SNOOP;
+
+		intel_pmu_lbr_init_snb();
+
+		x86_pmu.event_constraints = intel_bdw_event_constraints;
+		x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+		x86_pmu.extra_regs = intel_snbep_extra_regs;
+		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+		/* all extra regs are per-cpu when HT is on */
+		x86_pmu.er_flags |= ERF_HAS_RSP_1;
+		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+
+		x86_pmu.hw_config = hsw_hw_config;
+		x86_pmu.get_event_constraints = hsw_get_event_constraints;
+		x86_pmu.cpu_events = hsw_events_attrs;
+		pr_cont("Broadwell events, ");
+		break;
+
 	default:
 		switch (x86_pmu.version) {
 		case 1:
-- 
1.9.3



* [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds
  2015-02-11  0:40 [PATCH 1/3] perf, x86: Add new cache events table for Haswell Andi Kleen
  2015-02-11  0:40 ` [PATCH 2/3] perf, x86: Add Broadwell core support Andi Kleen
@ 2015-02-11  0:40 ` Andi Kleen
  2015-02-12 21:28 ` [PATCH 1/3] perf, x86: Add new cache events table for Haswell Peter Zijlstra
  2 siblings, 0 replies; 12+ messages in thread
From: Andi Kleen @ 2015-02-11  0:40 UTC (permalink / raw)
  To: peterz; +Cc: linux-kernel, kan.liang, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.

Add a new callback to enforce this, and set it for Broadwell.

These are errata BDM57 and BDM11.

How does this handle the case when an app requests a specific
period with some of the bottom bits set?

The app thinks it is sampling at X occurrences per sample, when it is
in fact at X - 63 (worst case).

Short answer:

Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as a PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.

Long answer:

<write up by Peter:>

IFF we guarantee perf_event_attr::sample_period >= 128.

Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.

So while the individual interrupts are 'wrong', we get them with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.

So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.

So no need to 'fix' the tools, all we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.
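
A user-space sketch of that arithmetic (purely illustrative; the loop below
mimics the period_left bookkeeping plus the bdw_limit_period() truncation,
using a few made-up requested periods):

#include <stdio.h>

/* same clamping/truncation as bdw_limit_period() in the patch */
static long long limit_period(long long left)
{
	if (left < 128)
		left = 128;
	return left & ~0x3fLL;
}

int main(void)
{
	const long long periods[] = { 192, 200, 1000 };

	for (unsigned int p = 0; p < sizeof(periods) / sizeof(periods[0]); p++) {
		long long sample_period = periods[p];
		long long period_left = sample_period;
		long long n = 0, nr_samples = 0, last = 0;

		for (int pmi = 0; pmi < 100000; pmi++) {
			long long left = period_left;

			if (left <= 0) {	/* re-arm after an overflow */
				left += sample_period;
				period_left = left;
			}

			/* what the hardware counter actually runs */
			long long hw = limit_period(left);

			n += hw;		/* hw instructions retire, PMI fires */
			period_left -= hw;

			if (period_left <= 0) {	/* only now do we emit a sample */
				nr_samples++;
				last = n;
			}
		}
		printf("requested %lld, average sample interval %lld\n",
		       sample_period, last / nr_samples);
	}
	return 0;
}

Even for periods whose low bits are set, the printed average matches the
requested period; only the individual intervals wobble in steps of 64.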

v2: Use correct event name in description. Use EVENT() macro.
v3: Use INTEL_ARCH_EVENT_MASK. Error out for events with too small period.
v4: Expand description.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/cpu/perf_event.c       |  9 +++++++++
 arch/x86/kernel/cpu/perf_event.h       |  1 +
 arch/x86/kernel/cpu/perf_event_intel.c | 19 +++++++++++++++++++
 3 files changed, 29 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 143e5f5..66451a6 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -445,6 +445,12 @@ int x86_pmu_hw_config(struct perf_event *event)
 	if (event->attr.type == PERF_TYPE_RAW)
 		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
+	if (event->attr.sample_period && x86_pmu.limit_period) {
+		if (x86_pmu.limit_period(event, event->attr.sample_period) >
+				event->attr.sample_period)
+			return -EINVAL;
+	}
+
 	return x86_setup_perfctr(event);
 }
 
@@ -982,6 +988,9 @@ int x86_perf_event_set_period(struct perf_event *event)
 	if (left > x86_pmu.max_period)
 		left = x86_pmu.max_period;
 
+	if (x86_pmu.limit_period)
+		left = x86_pmu.limit_period(event, left);
+
 	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
 
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 4e6cdb0..6ce77bd 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -449,6 +449,7 @@ struct x86_pmu {
 	struct x86_pmu_quirk *quirks;
 	int		perfctr_second_write;
 	bool		late_ack;
+	unsigned	(*limit_period)(struct perf_event *event, unsigned l);
 
 	/*
 	 * sysfs attrs
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 035f8d1..7fec0f0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2113,6 +2113,24 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 	return c;
 }
 
+/*
+ * Broadwell:
+ * The INST_RETIRED.ALL period always needs to have lowest
+ * 6bits cleared (BDM57). It shall not use a period smaller
+ * than 100 (BDM11). We combine the two to enforce
+ * a min-period of 128.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+	if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+			X86_CONFIG(.event=0xc0, .umask=0x01)) {
+		if (left < 128)
+			left = 128;
+		left &= ~0x3fu;
+	}
+	return left;
+}
+
 PMU_FORMAT_ATTR(event,	"config:0-7"	);
 PMU_FORMAT_ATTR(umask,	"config:8-15"	);
 PMU_FORMAT_ATTR(edge,	"config:18"	);
@@ -2806,6 +2824,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.hw_config = hsw_hw_config;
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
 		x86_pmu.cpu_events = hsw_events_attrs;
+		x86_pmu.limit_period = bdw_limit_period;
 		pr_cont("Broadwell events, ");
 		break;
 
-- 
1.9.3



* Re: [PATCH 1/3] perf, x86: Add new cache events table for Haswell
  2015-02-11  0:40 [PATCH 1/3] perf, x86: Add new cache events table for Haswell Andi Kleen
  2015-02-11  0:40 ` [PATCH 2/3] perf, x86: Add Broadwell core support Andi Kleen
  2015-02-11  0:40 ` [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds Andi Kleen
@ 2015-02-12 21:28 ` Peter Zijlstra
  2015-02-18  4:58   ` Andi Kleen
  2 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2015-02-12 21:28 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, kan.liang, Andi Kleen

On Tue, Feb 10, 2015 at 04:40:22PM -0800, Andi Kleen wrote:

> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
> index 498b6d9..02ab31d 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -415,6 +415,199 @@ static __initconst const u64 snb_hw_cache_event_ids
>  
>  };
>  
> +/*
> + * Notes on the events:
> + * - data reads do not include code reads (comparable to earlier tables)
> + * - data counts include speculative execution (except L1 write, dtlb, bpu)
> + * - remote node access includes remote memory, remote cache, remote mmio.
> + * - prefetches are not included in the counts because they are not
> + *   reliably counted.
> + * The events with additional caveats have references to the specification update.
> + */
> +
> +#define HSW_DEMAND_DATA_RD		BIT_ULL(0)
> +#define HSW_DEMAND_RFO			BIT_ULL(1)
> +#define HSW_ANY_RESPONSE		BIT_ULL(16)
> +#define HSW_SUPPLIER_NONE		BIT_ULL(17)
> +#define HSW_L3_MISS_LOCAL_DRAM		BIT_ULL(22)
> +#define HSW_L3_MISS_REMOTE_HOP0		BIT_ULL(27)
> +#define HSW_L3_MISS_REMOTE_HOP1		BIT_ULL(28)
> +#define HSW_L3_MISS_REMOTE_HOP2P	BIT_ULL(29)
> +#define HSW_L3_MISS			(HSW_L3_MISS_LOCAL_DRAM| \
> +					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
> +					 HSW_L3_MISS_REMOTE_HOP2P)
> +#define HSW_SNOOP_NONE			BIT_ULL(31)
> +#define HSW_SNOOP_NOT_NEEDED		BIT_ULL(32)
> +#define HSW_SNOOP_MISS			BIT_ULL(33)
> +#define HSW_SNOOP_HIT_NO_FWD		BIT_ULL(34)
> +#define HSW_SNOOP_HIT_WITH_FWD		BIT_ULL(35)
> +#define HSW_SNOOP_HITM			BIT_ULL(36)
> +#define HSW_SNOOP_NON_DRAM		BIT_ULL(37)
> +#define HSW_ANY_SNOOP			(HSW_SNOOP_NONE| \
> +					 HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
> +					 HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
> +					 HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
> +#define HSW_DEMAND_READ			HSW_DEMAND_DATA_RD
> +#define HSW_DEMAND_WRITE		HSW_DEMAND_RFO
> +#define HSW_L3_MISS_REMOTE		(HSW_L3_MISS_REMOTE_HOP0|\
> +					 HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
> +

> +static __initconst const u64 hsw_hw_cache_extra_regs
> +				[PERF_COUNT_HW_CACHE_MAX]
> +				[PERF_COUNT_HW_CACHE_OP_MAX]
> +				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
> +{
> + [ C(LL  ) ] = {
> +	[ C(OP_READ) ] = {
> +		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
> +				       HSW_ANY_RESPONSE|HSW_ANY_SNOOP|
> +				       HSW_SUPPLIER_NONE,
> +		[ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
> +				       HSW_L3_MISS|HSW_ANY_SNOOP|
> +				       HSW_SUPPLIER_NONE,

OK, sorry for going on about this, I'm trying to understand a few things:

1) SDM (2014-09) says in 18.9.5 (snb offcore):

"software must set at least one request type bit and a valid response
type pattern"

and

"A valid response type must be a non-zero value of the following
expression:

  ANY | [('OR' of Supplier Info Bits) & ('OR' of Snoop Info Bits)]"

Is this still valid for the HSW part?

Assuming so, it appears to me that:

#define HSW_LLC_ACCESS	(HSW_ANY_RESPONSE)
#define HSW_LLC_MISS	(HSW_L3_MISS|HSW_ANY_SNOOP)

Would be the suitable helpers to use here. No need to set Supplier and
Snoop bits for ANY_RESPONSE.
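
As a sanity check of that rule (a hypothetical helper, not something proposed
for the kernel, and covering only the request/supplier/snoop bits this patch
defines):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(n)		(1ULL << (n))

/* request bits defined in the patch: DEMAND_DATA_RD, DEMAND_RFO */
#define HSW_REQUEST_BITS	(BIT_ULL(0) | BIT_ULL(1))
#define HSW_ANY_RESPONSE	BIT_ULL(16)
/* supplier info bits defined in the patch: SUPPLIER_NONE, L3_MISS_* */
#define HSW_SUPPLIER_BITS	(BIT_ULL(17) | BIT_ULL(22) | \
				 BIT_ULL(27) | BIT_ULL(28) | BIT_ULL(29))
/* snoop info bits 31-37 */
#define HSW_SNOOP_BITS		(0x7fULL << 31)

/*
 * "at least one request type bit and a valid response type pattern",
 * where a valid response type is
 *	ANY | [('OR' of Supplier Info Bits) & ('OR' of Snoop Info Bits)]
 */
static bool offcore_rsp_valid(uint64_t val)
{
	bool request  = val & HSW_REQUEST_BITS;
	bool response = (val & HSW_ANY_RESPONSE) ||
			((val & HSW_SUPPLIER_BITS) && (val & HSW_SNOOP_BITS));

	return request && response;
}

int main(void)
{
	/* LL read access as proposed above: demand data read + ANY_RESPONSE */
	uint64_t llc_access = BIT_ULL(0) | HSW_ANY_RESPONSE;
	/* LL read miss: demand data read + L3-miss supplier bits + all snoop bits */
	uint64_t llc_miss = BIT_ULL(0) | BIT_ULL(22) | BIT_ULL(27) |
			    BIT_ULL(28) | BIT_ULL(29) | HSW_SNOOP_BITS;

	printf("access valid: %d, miss valid: %d\n",
	       offcore_rsp_valid(llc_access), offcore_rsp_valid(llc_miss));
	return 0;
}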

2) I know I included it on SNB, but would SNOOP_HITM really be a miss?
From what I can tell it could be a local MtoS w/ WB or so. Do we count
it as a miss because the WB part goes to DRAM so we still get to wait
for it (just in the 'wrong' direction)?

3) While we're there, will we get SNOOP_FWD only for Clean forwards or
also for the HITM forwards; the SDM is vague -- it would be nice if
SNOOP_FWD was a selector for all remote socket snoops.

> + [ C(NODE) ] = {
> +	[ C(OP_READ) ] = {
> +		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
> +				       HSW_L3_MISS_LOCAL_DRAM|HSW_SUPPLIER_NONE|
> +				       HSW_ANY_SNOOP,

4) SUPPLIER_NONE; I'll interpret it as a specific event that lacks other
supplier info (as opposed to _any_ supplier). What kind of events would
this be?

I didn't include SUPPLIER_NONE in any events on SNB, its implied by
L3_ACCESS due to ANY, but other than that I'm not sure what to do with
it. It seems out of place for DRAM_ANY.

5) NODE-ACCESS is _any_ DRAM;
   NODE-MISS is remote DRAM.

for SNB I didn't include NON_DRAM in ANY_SNOOP for this reason.

#define HSW_DRAM_ANY		(HSW_LLC_MISS & ~HSW_SNOOP_NON_DRAM)
#define HSW_DRAM_REMOTE		(HSW_MISS_LOCAL_DRAM|HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)

6) Should we maybe use

	(SNOOP_ANY & ~(SNOOP_HIT_NO_FWD|NON_DRAM))

instead for DRAM_REMOTE? SNOOP_HIT_NO_FWD seem as inappropriate as
NON_DRAM for REMOTE.

The SNB patch would look something like so I suppose..

---
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 498b6d967138..d2030868444c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -225,6 +225,21 @@ static u64 intel_pmu_event_map(int hw_event)
 	return intel_perfmon_event_map[hw_event];
 }
 
+
+/*
+ * SDM (2014-09) says in 18.9.5 (snb offcore):
+ *
+ * "software must set at least one request type bit and a valid response type
+ * pattern"
+ *
+ * and
+ *
+ * "A valid response type must be a non-zero value of the following expression:
+ *
+ *    ANY | [('OR' of Supplier Info Bits) & ('OR' of Snoop Info Bits)]"
+ */
+
+/* Request */
 #define SNB_DMND_DATA_RD	(1ULL << 0)
 #define SNB_DMND_RFO		(1ULL << 1)
 #define SNB_DMND_IFETCH		(1ULL << 2)
@@ -238,7 +253,10 @@ static u64 intel_pmu_event_map(int hw_event)
 #define SNB_BUS_LOCKS		(1ULL << 10)
 #define SNB_STRM_ST		(1ULL << 11)
 #define SNB_OTHER		(1ULL << 15)
+
 #define SNB_RESP_ANY		(1ULL << 16)
+
+/* Supplier */
 #define SNB_NO_SUPP		(1ULL << 17)
 #define SNB_LLC_HITM		(1ULL << 18)
 #define SNB_LLC_HITE		(1ULL << 19)
@@ -246,6 +264,8 @@ static u64 intel_pmu_event_map(int hw_event)
 #define SNB_LLC_HITF		(1ULL << 21)
 #define SNB_LOCAL		(1ULL << 22)
 #define SNB_REMOTE		(0xffULL << 23)
+
+/* Snoop */
 #define SNB_SNP_NONE		(1ULL << 31)
 #define SNB_SNP_NOT_NEEDED	(1ULL << 32)
 #define SNB_SNP_MISS		(1ULL << 33)
@@ -258,12 +278,12 @@ static u64 intel_pmu_event_map(int hw_event)
 #define SNB_DMND_WRITE		(SNB_DMND_RFO|SNB_LLC_RFO)
 #define SNB_DMND_PREFETCH	(SNB_PF_DATA_RD|SNB_PF_RFO)
 
-#define SNB_SNP_ANY		(SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \
-				 SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \
-				 SNB_HITM)
+#define SNB_NO_SNP		(SNB_SNP_NONE|SNB_SNP_NOT_NEEDED|SNB_SNP_MISS)
+#define SNB_REMOTE_SNP		(SNB_NO_SNP|SNB_SNP_FWD|SNB_HITM)
+#define SNB_ANY_SNP		(SNB_REMOTE_SNP|SNB_NO_FWD)
 
-#define SNB_DRAM_ANY		(SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY)
-#define SNB_DRAM_REMOTE		(SNB_REMOTE|SNB_SNP_ANY)
+#define SNB_DRAM_ANY		(SNB_LOCAL|SNB_REMOTE|SNB_ANY_SNP)
+#define SNB_DRAM_REMOTE		(SNB_REMOTE|SNB_REMOTE_SNP)
 
 #define SNB_L3_ACCESS		SNB_RESP_ANY
 #define SNB_L3_MISS		(SNB_DRAM_ANY|SNB_NON_DRAM)


* Re: [PATCH 1/3] perf, x86: Add new cache events table for Haswell
  2015-02-12 21:28 ` [PATCH 1/3] perf, x86: Add new cache events table for Haswell Peter Zijlstra
@ 2015-02-18  4:58   ` Andi Kleen
  0 siblings, 0 replies; 12+ messages in thread
From: Andi Kleen @ 2015-02-18  4:58 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andi Kleen, linux-kernel, kan.liang, Andi Kleen

On Thu, Feb 12, 2015 at 10:28:06PM +0100, Peter Zijlstra wrote:
> 1) SDM (2014-09) says in 18.9.5 (snb offcore):
> 
> "software must set at least one request type bit and a valid response
> type pattern"
> 
> and
> 
> "A valid response type must be a non-zero value of the following
> expression:
> 
>   ANY | [('OR' of Supplier Info Bits) & ('OR' of Snoop Info Bits)]"
> 
> Is this still valid for the HSW part?

Yes.

However they need to be set on Sandy Bridge to work around an issue.
On HSW they don't hurt at least.

> #define HSW_LLC_ACCESS	(HSW_ANY_RESPONSE)
> #define HSW_LLC_MISS	(HSW_L3_MISS|HSW_ANY_SNOOP)
> 
> Would be the suitable helpers to use here. No need to set Supplier and
> Snoop bits for ANY_RESPONSE.

Ok.

> 2) I know I included it on SNB, but would SNOOP_HITM really be a miss?
> From what I can tell it could be a local MtoS w/ WB or so. Do we count
> it as a miss because the WB part goes to DRAM so we still get to wait
> for it (just in the 'wrong' direction)?

SNOOP_HITM is a miss because M is only present in one cache.
So it's a miss for the requester.

> 
> 3) While we're there, will we get SNOOP_FWD only for Clean forwards or
> also for the HITM forwards; the SDM is vague -- it would be nice if
> SNOOP_FWD was a selector for all remote socket snoops.

It's only for clean forwards.

> 
> > + [ C(NODE) ] = {
> > +	[ C(OP_READ) ] = {
> > +		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
> > +				       HSW_L3_MISS_LOCAL_DRAM|HSW_SUPPLIER_NONE|
> > +				       HSW_ANY_SNOOP,
> 
> 4) SUPPLIER_NONE; I'll interpret it as specific event that lacks other
> supplier info (as opposed to _any_ supplier). What kind of events would
> this be?

Your interpretation is right.
Ok, we can drop it. It should rarely happen.

> 
> I didn't include SUPPLIER_NONE in any events on SNB, its implied by
> L3_ACCESS due to ANY, but other than that I'm not sure what to do with
> it. It seems out of place for DRAM_ANY.
> 
> 5) NODE-ACCESS is _any_ DRAM;
>    NODE-MISS is remote DRAM.
> 
> for SNB I didn't include NON_DRAM in ANY_SNOOP for this reason.
> 
> #define HSW_DRAM_ANY		(HSW_LLC_MISS & ~HSW_SNOOP_NON_DRAM)
> #define HSW_DRAM_REMOTE		(HSW_MISS_LOCAL_DRAM|HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)

This should be more like
(HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P|HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)

> 
> 6) Should we maybe use
> 
> 	(SNOOP_ANY & ~(SNOOP_HIT_NO_FWD|NON_DRAM))
> 
> instead for DRAM_REMOTE? SNOOP_HIT_NO_FWD seem as inappropriate as
> NON_DRAM for REMOTE.

SNOOP_HIT_NO_FWD indicates DRAM.

-Andi


* Re: [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds
  2015-03-23 12:35         ` Peter Zijlstra
@ 2015-03-23 13:32           ` Ingo Molnar
  0 siblings, 0 replies; 12+ messages in thread
From: Ingo Molnar @ 2015-03-23 13:32 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andi Kleen, linux-kernel, Andi Kleen


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Mar 23, 2015 at 11:39:00AM +0100, Ingo Molnar wrote:
> > > http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
> > > 
> > > BDM11 and BDM55 (not 57) tell us that the PMU will generate crap output
> > > if you don't do this. Non-fatal but gibberish.
> > 
> > Should be part of the changelog?
> 
> Sure, lemme go make that happen.
> 
> > So I did not say rounding up, I meant this sentence:
> > 
> > > > > + *   [...] We combine the two to enforce
> > > > > + * a min-period of 128.
> > 
> > IMO ambiguously suggests that the result of the combination of the two 
> > is to enforce a min-period of 128. Would somethin like this:
> > 
> > 	          We combine the two to enforce
> >           a min-period of 128, rounded (down) to multiples of 64.
> >           The original period is still kept by the core code and is 
> >           approximated in the long run via these slightly fuzzed 
> >           hardware-periods.
> 
> Like so then?

Yeah, looks good to me!

Thanks,

	Ingo


* Re: [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds
  2015-03-23 10:39       ` Ingo Molnar
@ 2015-03-23 12:35         ` Peter Zijlstra
  2015-03-23 13:32           ` Ingo Molnar
  0 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2015-03-23 12:35 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andi Kleen, linux-kernel, Andi Kleen

On Mon, Mar 23, 2015 at 11:39:00AM +0100, Ingo Molnar wrote:
> > http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
> > 
> > BDM11 and BDM55 (not 57) tell us that the PMU will generate crap output
> > if you don't do this. Non-fatal but gibberish.
> 
> Should be part of the changelog?

Sure, lemme go make that happen.

> So I did not say rounding up, I meant this sentence:
> 
> > > > + *   [...] We combine the two to enforce
> > > > + * a min-period of 128.
> 
> IMO ambiguously suggests that the result of the combination of the two 
> is to enforce a min-period of 128. Would something like this:
> 
> 	          We combine the two to enforce
>           a min-period of 128, rounded (down) to multiples of 64.
>           The original period is still kept by the core code and is 
>           approximated in the long run via these slightly fuzzed 
>           hardware-periods.

Like so then?

---
Subject: perf, x86: Add INST_RETIRED.ALL workarounds
From: Andi Kleen <ak@linux.intel.com>
Date: Tue, 17 Feb 2015 18:18:06 -0800

On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.

These are errata BDM11 and BDM55;
http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf

BDM11: When using a period < 100, we may get incorrect PEBS/PMI
interrupts and/or an invalid counter state.
BDM55: When bits 0-5 of the period are !0 we may get redundant PEBS
records on overflow.

Add a new callback to enforce this, and set it for Broadwell.

How does this handle the case when an app requests a specific
period with some of the bottom bits set?

Short answer:

Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as a PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.

Long answer (by Peterz):

IFF we guarantee perf_event_attr::sample_period >= 128.

Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.

So while the individual interrupts are 'wrong', we get them with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.

So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.

So no need to 'fix' the tools, all we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/cpu/perf_event.c       |    9 +++++++++
 arch/x86/kernel/cpu/perf_event.h       |    1 +
 arch/x86/kernel/cpu/perf_event_intel.c |   27 +++++++++++++++++++++++++++
 3 files changed, 37 insertions(+)

--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -451,6 +451,12 @@ int x86_pmu_hw_config(struct perf_event
 	if (event->attr.type == PERF_TYPE_RAW)
 		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
+	if (event->attr.sample_period && x86_pmu.limit_period) {
+		if (x86_pmu.limit_period(event, event->attr.sample_period) >
+				event->attr.sample_period)
+			return -EINVAL;
+	}
+
 	return x86_setup_perfctr(event);
 }
 
@@ -988,6 +994,9 @@ int x86_perf_event_set_period(struct per
 	if (left > x86_pmu.max_period)
 		left = x86_pmu.max_period;
 
+	if (x86_pmu.limit_period)
+		left = x86_pmu.limit_period(event, left);
+
 	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
 
 	/*
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -451,6 +451,7 @@ struct x86_pmu {
 	struct x86_pmu_quirk *quirks;
 	int		perfctr_second_write;
 	bool		late_ack;
+	unsigned	(*limit_period)(struct perf_event *event, unsigned l);
 
 	/*
 	 * sysfs attrs
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2097,6 +2097,32 @@ hsw_get_event_constraints(struct cpu_hw_
 	return c;
 }
 
+/*
+ * Broadwell:
+ *
+ * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
+ * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
+ * the two to enforce a minimum period of 128 (the smallest value that has bits
+ * 0-5 cleared and >= 100).
+ *
+ * Because of how the code in x86_perf_event_set_period() works, the truncation
+ * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
+ * to make up for the 'lost' events due to carrying the 'error' in period_left.
+ *
+ * Therefore the effective (average) period matches the requested period,
+ * despite coarser hardware granularity.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+	if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+			X86_CONFIG(.event=0xc0, .umask=0x01)) {
+		if (left < 128)
+			left = 128;
+		left &= ~0x3fu;
+	}
+	return left;
+}
+
 PMU_FORMAT_ATTR(event,	"config:0-7"	);
 PMU_FORMAT_ATTR(umask,	"config:8-15"	);
 PMU_FORMAT_ATTR(edge,	"config:18"	);
@@ -2775,6 +2801,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.hw_config = hsw_hw_config;
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
 		x86_pmu.cpu_events = hsw_events_attrs;
+		x86_pmu.limit_period = bdw_limit_period;
 		pr_cont("Broadwell events, ");
 		break;
 


* Re: [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds
  2015-03-23 10:19     ` Peter Zijlstra
@ 2015-03-23 10:39       ` Ingo Molnar
  2015-03-23 12:35         ` Peter Zijlstra
  0 siblings, 1 reply; 12+ messages in thread
From: Ingo Molnar @ 2015-03-23 10:39 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andi Kleen, linux-kernel, Andi Kleen


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Mar 23, 2015 at 10:38:54AM +0100, Ingo Molnar wrote:
> > 
> > * Andi Kleen <andi@firstfloor.org> wrote:
> > 
> > > From: Andi Kleen <ak@linux.intel.com>
> > > 
> > > On Broadwell INST_RETIRED.ALL cannot be used with any period
> > > that doesn't have the lowest 6 bits cleared. And the period
> > > should not be smaller than 128.
> > 
> > Sloppy changelog: a most basic question is not answered by the 
> > changelog: what happens in practice when the period is set to a 
> > smaller value than 128?
> 
> http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
> 
> BDM11 and BDM55 (not 57) tell us that the PMU will generate crap output
> if you don't do this. Non-fatal but gibberish.

Should be part of the changelog?

> > > +/*
> > > + * Broadwell:
> > > + * The INST_RETIRED.ALL period always needs to have lowest
> > > + * 6bits cleared (BDM57). It shall not use a period smaller
> > > + * than 100 (BDM11). We combine the two to enforce
> > > + * a min-period of 128.
> > > + */
> > 
> > Sloppy comment: that's not what we do:
> > 
> > > +static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
> > > +{
> > > +	if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
> > > +			X86_CONFIG(.event=0xc0, .umask=0x01)) {
> > > +		if (left < 128)
> > > +			left = 128;
> > > +		left &= ~0x3fu;
> > > +	}
> > > +	return left;
> > 
> > We enforce a minimum period of 128 and round the requested period to 
> > 64.
> 
> Not quite, we enforce a min period of 128 but otherwise mask bit0-5, no
> rounding up.

So I did not say rounding up, I meant this sentence:

> > > + *   [...] We combine the two to enforce
> > > + * a min-period of 128.

IMO ambiguously suggests that the result of the combination of the two 
is to enforce a min-period of 128. Would something like this:

	          We combine the two to enforce
          a min-period of 128, rounded (down) to multiples of 64.
          The original period is still kept by the core code and is 
          approximated in the long run via these slightly fuzzed 
          hardware-periods.

work with you?

> > I think in this case it would be useful to tooling if we updated 
> > the syscall attribute with the real period value that was used, to 
> > not skew tooling output.
> 
> Seeing how we already have a fuzz of up to sample_period events; we 
> don't know how far into the last period we are when we stop the 
> event, it might have been 1 event away from generating a PMI, this 
> patch doesn't actually add significantly to that.
> 
> Also, the effective period is the one specified; if the requested
> period < 128 we simply reject the event creation. If it's any larger
> we iterate around the requested sample period with steps of 64 but 
> such that we average out on the requested period. There is no 'real' 
> period to copy back.

Yeah, fair enough.

Thanks,

	Ingo


* Re: [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds
  2015-03-23  9:38   ` Ingo Molnar
@ 2015-03-23 10:19     ` Peter Zijlstra
  2015-03-23 10:39       ` Ingo Molnar
  0 siblings, 1 reply; 12+ messages in thread
From: Peter Zijlstra @ 2015-03-23 10:19 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Andi Kleen, linux-kernel, Andi Kleen

On Mon, Mar 23, 2015 at 10:38:54AM +0100, Ingo Molnar wrote:
> 
> * Andi Kleen <andi@firstfloor.org> wrote:
> 
> > From: Andi Kleen <ak@linux.intel.com>
> > 
> > On Broadwell INST_RETIRED.ALL cannot be used with any period
> > that doesn't have the lowest 6 bits cleared. And the period
> > should not be smaller than 128.
> 
> Sloppy changelog: a most basic question is not answered by the 
> changelog: what happens in practice when the period is set to a 
> smaller value than 128?

http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf

BDM11 and BDM55 (not 57) tell us that the PMU will generate crap output
if you don't do this. Non-fatal but gibberish.

> > +/*
> > + * Broadwell:
> > + * The INST_RETIRED.ALL period always needs to have lowest
> > + * 6bits cleared (BDM57). It shall not use a period smaller
> > + * than 100 (BDM11). We combine the two to enforce
> > + * a min-period of 128.
> > + */
> 
> Sloppy comment: that's not what we do:
> 
> > +static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
> > +{
> > +	if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
> > +			X86_CONFIG(.event=0xc0, .umask=0x01)) {
> > +		if (left < 128)
> > +			left = 128;
> > +		left &= ~0x3fu;
> > +	}
> > +	return left;
> 
> We enforce a minimum period of 128 and round the requested period to 
> 64.

Not quite, we enforce a min period of 128 but otherwise mask bit0-5, no
rounding up.

Which is pretty much what the comment says.

> I think in this case it would be useful to tooling if we updated the 
> syscall attribute with the real period value that was used, to not 
> skew tooling output.

Seeing how we already have a fuzz of up to sample_period events; we
don't know how far into the last period we are when we stop the event,
it might have been 1 event away from generating a PMI, this patch
doesn't actually add significantly to that.

Also, the effective period is the one specified; if the requested period
< 128 we simply reject the event creation. If it's any larger we iterate
around the requested sample period with steps of 64 but such that we
average out on the requested period. There is no 'real' period to copy
back.

Another way to look at this is that we use a form of pulse density
modulation to create the desired period using the larger step size; or
perhaps compare it to Bresenham's line drawing algorithm.





* Re: [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds
  2015-02-18  2:18 ` [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds Andi Kleen
@ 2015-03-23  9:38   ` Ingo Molnar
  2015-03-23 10:19     ` Peter Zijlstra
  0 siblings, 1 reply; 12+ messages in thread
From: Ingo Molnar @ 2015-03-23  9:38 UTC (permalink / raw)
  To: Andi Kleen; +Cc: peterz, linux-kernel, Andi Kleen


* Andi Kleen <andi@firstfloor.org> wrote:

> From: Andi Kleen <ak@linux.intel.com>
> 
> On Broadwell INST_RETIRED.ALL cannot be used with any period
> that doesn't have the lowest 6 bits cleared. And the period
> should not be smaller than 128.

Sloppy changelog: a most basic question is not answered by the 
changelog: what happens in practice when the period is set to a 
smaller value than 128?

> +/*
> + * Broadwell:
> + * The INST_RETIRED.ALL period always needs to have lowest
> + * 6bits cleared (BDM57). It shall not use a period smaller
> + * than 100 (BDM11). We combine the two to enforce
> + * a min-period of 128.
> + */

Sloppy comment: that's not what we do:

> +static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
> +{
> +	if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
> +			X86_CONFIG(.event=0xc0, .umask=0x01)) {
> +		if (left < 128)
> +			left = 128;
> +		left &= ~0x3fu;
> +	}
> +	return left;

We enforce a minimum period of 128 and round the requested period to 
64.

I think in this case it would be useful to tooling if we updated the 
syscall attribute with the real period value that was used, to not 
skew tooling output.

Thanks,

	Ingo


* [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds
  2015-02-18  2:18 Andi Kleen
@ 2015-02-18  2:18 ` Andi Kleen
  2015-03-23  9:38   ` Ingo Molnar
  0 siblings, 1 reply; 12+ messages in thread
From: Andi Kleen @ 2015-02-18  2:18 UTC (permalink / raw)
  To: peterz; +Cc: linux-kernel, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.

Add a new callback to enforce this, and set it for Broadwell.

These are errata BDM57 and BDM11.

How does this handle the case when an app requests a specific
period with some of the bottom bits set?

The app thinks it is sampling at X occurrences per sample, when it is
in fact at X - 63 (worst case).

Short answer:

Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as a PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.

Long answer:

<write up by Peter:>

IFF we guarantee perf_event_attr::sample_period >= 128.

Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.

So while the individual interrupts are 'wrong', we get them with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.

So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.

So no need to 'fix' the tools, all we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.

v2: Use correct event name in description. Use EVENT() macro.
v3: Use INTEL_ARCH_EVENT_MASK. Error out for events with too small period.
v4: Expand description.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/cpu/perf_event.c       |  9 +++++++++
 arch/x86/kernel/cpu/perf_event.h       |  1 +
 arch/x86/kernel/cpu/perf_event_intel.c | 19 +++++++++++++++++++
 3 files changed, 29 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 143e5f5..66451a6 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -445,6 +445,12 @@ int x86_pmu_hw_config(struct perf_event *event)
 	if (event->attr.type == PERF_TYPE_RAW)
 		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
+	if (event->attr.sample_period && x86_pmu.limit_period) {
+		if (x86_pmu.limit_period(event, event->attr.sample_period) >
+				event->attr.sample_period)
+			return -EINVAL;
+	}
+
 	return x86_setup_perfctr(event);
 }
 
@@ -982,6 +988,9 @@ int x86_perf_event_set_period(struct perf_event *event)
 	if (left > x86_pmu.max_period)
 		left = x86_pmu.max_period;
 
+	if (x86_pmu.limit_period)
+		left = x86_pmu.limit_period(event, left);
+
 	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
 
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 4e6cdb0..6ce77bd 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -449,6 +449,7 @@ struct x86_pmu {
 	struct x86_pmu_quirk *quirks;
 	int		perfctr_second_write;
 	bool		late_ack;
+	unsigned	(*limit_period)(struct perf_event *event, unsigned l);
 
 	/*
 	 * sysfs attrs
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 259e06b..222274c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2111,6 +2111,24 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 	return c;
 }
 
+/*
+ * Broadwell:
+ * The INST_RETIRED.ALL period always needs to have lowest
+ * 6bits cleared (BDM57). It shall not use a period smaller
+ * than 100 (BDM11). We combine the two to enforce
+ * a min-period of 128.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+	if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+			X86_CONFIG(.event=0xc0, .umask=0x01)) {
+		if (left < 128)
+			left = 128;
+		left &= ~0x3fu;
+	}
+	return left;
+}
+
 PMU_FORMAT_ATTR(event,	"config:0-7"	);
 PMU_FORMAT_ATTR(umask,	"config:8-15"	);
 PMU_FORMAT_ATTR(edge,	"config:18"	);
@@ -2801,6 +2819,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.hw_config = hsw_hw_config;
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
 		x86_pmu.cpu_events = hsw_events_attrs;
+		x86_pmu.limit_period = bdw_limit_period;
 		pr_cont("Broadwell events, ");
 		break;
 
-- 
1.9.3



* [PATCH 3/3] perf, x86: Add INST_RETIRED.ALL workarounds
  2015-02-09 19:17 [PATCH 1/3] perf, x86: Add new cache events table for Haswell Andi Kleen
@ 2015-02-09 19:17 ` Andi Kleen
  0 siblings, 0 replies; 12+ messages in thread
From: Andi Kleen @ 2015-02-09 19:17 UTC (permalink / raw)
  To: peterz; +Cc: linux-kernel, kan.liang, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.

Add a new callback to enforce this, and set it for Broadwell.

These are errata BDM57 and BDM11.

How does this handle the case when an app requests a specific
period with some of the bottom bits set?

The app thinks it is sampling at X occurrences per sample, when it is
in fact at X - 63 (worst case).

Short answer:

Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as a PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.

Long answer:

<write up by Peter:>

IFF we guarantee perf_event_attr::sample_period >= 128.

Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.

So while the individual interrupts are 'wrong', we get them with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.

So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.

So no need to 'fix' the tools, all we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.

v2: Use correct event name in description. Use EVENT() macro.
v3: Use INTEL_ARCH_EVENT_MASK. Error out for events with too small period.
v4: Expand description.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/cpu/perf_event.c       |  9 +++++++++
 arch/x86/kernel/cpu/perf_event.h       |  1 +
 arch/x86/kernel/cpu/perf_event_intel.c | 19 +++++++++++++++++++
 3 files changed, 29 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 143e5f5..66451a6 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -445,6 +445,12 @@ int x86_pmu_hw_config(struct perf_event *event)
 	if (event->attr.type == PERF_TYPE_RAW)
 		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
+	if (event->attr.sample_period && x86_pmu.limit_period) {
+		if (x86_pmu.limit_period(event, event->attr.sample_period) >
+				event->attr.sample_period)
+			return -EINVAL;
+	}
+
 	return x86_setup_perfctr(event);
 }
 
@@ -982,6 +988,9 @@ int x86_perf_event_set_period(struct perf_event *event)
 	if (left > x86_pmu.max_period)
 		left = x86_pmu.max_period;
 
+	if (x86_pmu.limit_period)
+		left = x86_pmu.limit_period(event, left);
+
 	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
 
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 4e6cdb0..6ce77bd 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -449,6 +449,7 @@ struct x86_pmu {
 	struct x86_pmu_quirk *quirks;
 	int		perfctr_second_write;
 	bool		late_ack;
+	unsigned	(*limit_period)(struct perf_event *event, unsigned l);
 
 	/*
 	 * sysfs attrs
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 7c480e8..96c0898 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2111,6 +2111,24 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 	return c;
 }
 
+/*
+ * Broadwell:
+ * The INST_RETIRED.ALL period always needs to have lowest
+ * 6bits cleared (BDM57). It shall not use a period smaller
+ * than 100 (BDM11). We combine the two to enforce
+ * a min-period of 128.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+	if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+			X86_CONFIG(.event=0xc0, .umask=0x01)) {
+		if (left < 128)
+			left = 128;
+		left &= ~0x3fu;
+	}
+	return left;
+}
+
 PMU_FORMAT_ATTR(event,	"config:0-7"	);
 PMU_FORMAT_ATTR(umask,	"config:8-15"	);
 PMU_FORMAT_ATTR(edge,	"config:18"	);
@@ -2804,6 +2822,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.hw_config = hsw_hw_config;
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
 		x86_pmu.cpu_events = hsw_events_attrs;
+		x86_pmu.limit_period = bdw_limit_period;
 		pr_cont("Broadwell events, ");
 		break;
 
-- 
1.9.3


