linux-kernel.vger.kernel.org archive mirror
* Updated perf offcore patchkit
@ 2010-11-29 13:12 Andi Kleen
  2010-11-29 13:12 ` [PATCH 1/3] perf: Document enhanced event encoding for OFFCORE_MSR Andi Kleen
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Andi Kleen @ 2010-11-29 13:12 UTC (permalink / raw)
  To: peterz; +Cc: mingo, linux-kernel

Rebased on top of tip perf/perf-core with the new initcall sequence.
Now uses reference counts and kzalloc_node() to allocate the per-core
data on the local node.
I dropped the patch that was already in tip.

-Andi



* [PATCH 1/3] perf: Document enhanced event encoding for OFFCORE_MSR
  2010-11-29 13:12 Updated perf offcore patchkit Andi Kleen
@ 2010-11-29 13:12 ` Andi Kleen
  2010-11-29 13:12 ` [PATCH 2/3] perf-events: Add support for supplementary event registers v4 Andi Kleen
  2010-11-29 13:12 ` [PATCH 3/3] perf-events: Fix LLC-* events on Intel Nehalem/Westmere v2 Andi Kleen
  2 siblings, 0 replies; 5+ messages in thread
From: Andi Kleen @ 2010-11-29 13:12 UTC (permalink / raw)
  To: peterz; +Cc: mingo, linux-kernel, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 tools/perf/Documentation/perf-list.txt |    7 +++++++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
index 399751b..22678d2 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -58,6 +58,13 @@ raw encoding of 0x1A8 can be used:
  perf stat -e r1a8 -a sleep 1
  perf record -e r1a8 ...
 
+Some special events on x86 encode additional data in the normally unused
+[32,63] bits of the raw value. In particular, this is used for
+the OFFCORE_RESPONSE events on Intel Core i7.  The 16-bit
+mask for the OFFCORE_RESPONSE register is put into bits [32,47].
+For example, the OFFCORE_RESPONSE_0.ANY_DATA.ANY_CACHE_DRAM event
+is encoded as r00007f11004301b7.
+
 You should refer to the processor specific documentation for getting these
 details. Some of them are referenced in the SEE ALSO section below.
 
-- 
1.7.1
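
For illustration, here is a minimal user-space sketch (not part of the
patch) that rebuilds the example raw value from its pieces, assuming the
bit layout described in the text above; the 0x43 enable/privilege bits
are taken from the example encoding itself:

#include <stdio.h>
#include <inttypes.h>

int main(void)
{
	/* Lower 32 bits: the usual raw EVENTSEL encoding */
	uint64_t event  = 0xb7;          /* OFFCORE_RESPONSE_0 event select */
	uint64_t umask  = 0x01ULL << 8;  /* unit mask */
	uint64_t enable = 0x43ULL << 16; /* USR|OS|EN, as in the example */
	/* Bits [32,47]: the 16-bit OFFCORE_RESPONSE mask (ANY_DATA.ANY_CACHE_DRAM) */
	uint64_t mask   = 0x7f11ULL << 32;

	/* Prints r00007f11004301b7, matching the documented example */
	printf("r%016" PRIx64 "\n", mask | enable | umask | event);
	return 0;
}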



* [PATCH 2/3] perf-events: Add support for supplementary event registers v4
  2010-11-29 13:12 Updated perf offcore patchkit Andi Kleen
  2010-11-29 13:12 ` [PATCH 1/3] perf: Document enhanced event encoding for OFFCORE_MSR Andi Kleen
@ 2010-11-29 13:12 ` Andi Kleen
  2010-12-01 14:26   ` Peter Zijlstra
  2010-11-29 13:12 ` [PATCH 3/3] perf-events: Fix LLC-* events on Intel Nehalem/Westmere v2 Andi Kleen
  2 siblings, 1 reply; 5+ messages in thread
From: Andi Kleen @ 2010-11-29 13:12 UTC (permalink / raw)
  To: peterz; +Cc: mingo, linux-kernel, Andi Kleen, eranian

From: Andi Kleen <ak@linux.intel.com>

Intel Nehalem/Westmere have a special OFFCORE_RESPONSE event
that can be used to monitor any offcore accesses from a core.
This is a very useful event for various tunings, and it's
also needed to implement the generic LLC-* events correctly.

Unfortunately this event requires programming a mask in a separate
register. Worse, this separate register is shared per core, not per
CPU thread.

This patch:
- Teaches perf_events that OFFCORE_RESPONSE needs extra parameters.
  The extra parameters are passed by user space in the unused upper
  32 bits of the config word.
- Adds support to the Intel perf_event core for scheduling per-core
  resources. This is fairly generic infrastructure that can also be
  used for other per-core resources.
  The basic code is patterned after the similar AMD northbridge
  constraints code.

Thanks to Stephane Eranian who pointed out some problems
in the original version and suggested improvements.
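
As a rough illustration (not part of this patch), a user-space sketch of
how such an event could be requested: OFFCORE_RESPONSE_0 as a raw event,
with the 16-bit mask passed in the upper 32 bits of the config word.
The raw value is the example from the documentation patch; error
handling is kept minimal.

#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_RAW;
	/* event 0xb7, umask 0x01, USR|OS|EN, OFFCORE_RESPONSE mask 0x7f11 */
	attr.config = 0x00007f11004301b7ULL;

	fd = syscall(__NR_perf_event_open, &attr,
		     0 /* this task */, -1 /* any cpu */,
		     -1 /* no group */, 0 /* flags */);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	sleep(1);
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("offcore responses: %" PRIu64 "\n", count);
	close(fd);
	return 0;
}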

Cc: eranian@google.com
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/cpu/perf_event.c       |   70 +++++++++++
 arch/x86/kernel/cpu/perf_event_intel.c |  198 ++++++++++++++++++++++++++++++++
 include/linux/perf_event.h             |    2 +
 3 files changed, 270 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 817d2b1..a6754ea 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -93,6 +93,8 @@ struct amd_nb {
 	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
 };
 
+struct intel_percore;
+
 #define MAX_LBR_ENTRIES		16
 
 struct cpu_hw_events {
@@ -128,6 +130,13 @@ struct cpu_hw_events {
 	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];
 
 	/*
+	 * Intel percore register state.
+	 * Coordinate shared resources between HT threads.
+	 */
+	int				percore_used; /* Used by this CPU? */
+	struct intel_percore		*per_core;
+
+	/*
 	 * AMD specific bits
 	 */
 	struct amd_nb		*amd_nb;
@@ -175,6 +184,32 @@ struct cpu_hw_events {
 #define for_each_event_constraint(e, c)	\
 	for ((e) = (c); (e)->weight; (e)++)
 
+/*
+ * Extra registers for specific events.
+ * Some events need large masks and require external MSRs.
+ * Define a mapping to these extra registers.
+ * The actual contents are still encoded in unused parts of the
+ * original config u64.
+ */
+struct extra_reg {
+	unsigned int		event;
+	unsigned int		msr;
+	unsigned int		extra_shift;
+	u64			config_mask;
+	u64			valid_mask;
+};
+
+#define EVENT_EXTRA_REG(e, ms, m, vm, es) {	\
+	.event = (e),		\
+	.msr = (ms),		\
+	.config_mask = (m),	\
+	.valid_mask = (vm),	\
+	.extra_shift = (es),	\
+	}
+#define INTEL_EVENT_EXTRA_REG(event, msr, vm, es)	\
+	EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, es)
+#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, 0)
+
 union perf_capabilities {
 	struct {
 		u64	lbr_format    : 6;
@@ -219,6 +254,7 @@ struct x86_pmu {
 	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
 						 struct perf_event *event);
 	struct event_constraint *event_constraints;
+	struct event_constraint *percore_constraints;
 	void		(*quirks)(void);
 	int		perfctr_second_write;
 
@@ -247,6 +283,11 @@ struct x86_pmu {
 	 */
 	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
 	int		lbr_nr;			   /* hardware stack size */
+
+	/*
+	 * Extra registers for events
+	 */
+	struct extra_reg *extra_regs;
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -321,6 +362,33 @@ again:
 	return new_raw_count;
 }
 
+/*
+ * Find and validate any extra registers to set up.
+ */
+static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
+{
+	struct extra_reg *er;
+	u64 extra;
+
+	event->hw.extra_reg = 0;
+	event->hw.extra_config = 0;
+
+	if (!x86_pmu.extra_regs)
+		return 0;
+
+	for (er = x86_pmu.extra_regs; er->msr; er++) {
+		if (er->event != (config & er->config_mask))
+			continue;
+		event->hw.extra_reg = er->msr;
+		extra = config >> er->extra_shift;
+		if (extra & ~er->valid_mask)
+			return -EINVAL;
+		event->hw.extra_config = extra;
+		break;
+	}
+	return 0;
+}
+
 static atomic_t active_events;
 static DEFINE_MUTEX(pmc_reserve_mutex);
 
@@ -880,6 +948,8 @@ static void x86_pmu_enable(struct pmu *pmu)
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
 					  u64 enable_mask)
 {
+	if (hwc->extra_reg)
+		wrmsrl(hwc->extra_reg, hwc->extra_config);
 	wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index c8f5c08..04fdcf9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,5 +1,27 @@
 #ifdef CONFIG_CPU_SUP_INTEL
 
+#define MAX_EXTRA_REGS 2
+
+/* 
+ * Per register state.
+ */
+struct er_account {
+	int			ref;		/* reference count */
+	unsigned int		extra_reg;	/* extra MSR number */
+	u64			extra_config;	/* extra MSR config */
+};
+
+/*
+ * Per core state
+ * This is used to coordinate shared registers for HT threads.
+ */
+struct intel_percore {
+	raw_spinlock_t		lock;		/* protect structure */
+	struct er_account	regs[MAX_EXTRA_REGS];
+	int			refcnt;		/* number of threads */
+	unsigned		core_id;
+};
+
 /*
  * Intel PerfMon, used on Core and later.
  */
@@ -64,6 +86,18 @@ static struct event_constraint intel_nehalem_event_constraints[] =
 	EVENT_CONSTRAINT_END
 };
 
+static struct extra_reg intel_nehalem_extra_regs[] =
+{
+	INTEL_EVENT_EXTRA_REG(0xb7, 0x1a6, 0xffff, 32), /* OFFCORE_RESPONSE_0 */
+	EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_nehalem_percore_constraints[] =
+{
+	INTEL_EVENT_CONSTRAINT(0xb7, 0),
+	EVENT_CONSTRAINT_END
+};
+
 static struct event_constraint intel_westmere_event_constraints[] =
 {
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -76,6 +110,20 @@ static struct event_constraint intel_westmere_event_constraints[] =
 	EVENT_CONSTRAINT_END
 };
 
+static struct extra_reg intel_westmere_extra_regs[] =
+{
+	INTEL_EVENT_EXTRA_REG(0xb7, 0x1a6, 0xffff, 32), /* OFFCORE_RESPONSE_0 */
+	INTEL_EVENT_EXTRA_REG(0xbb, 0x1a7, 0xffff, 32), /* OFFCORE_RESPONSE_1 */
+	EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_westmere_percore_constraints[] =
+{
+	INTEL_EVENT_CONSTRAINT(0xb7, 0),
+	INTEL_EVENT_CONSTRAINT(0xbb, 0),
+	EVENT_CONSTRAINT_END
+};
+
 static struct event_constraint intel_gen_event_constraints[] =
 {
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -794,6 +842,66 @@ intel_bts_constraints(struct perf_event *event)
 }
 
 static struct event_constraint *
+intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT;
+	struct event_constraint *c;
+	struct intel_percore *pc;
+	struct er_account *era;
+	int i;
+	int free_slot;
+	int found;
+
+	if (!x86_pmu.percore_constraints)
+		return NULL;
+
+	for (c = x86_pmu.percore_constraints; c->cmask; c++) {
+		if (e != c->code)
+			continue;
+
+		/*
+		 * Allocate resource per core.
+		 */
+		c = NULL;
+		pc = cpuc->per_core;
+		if (!pc)
+			break;
+		c = &emptyconstraint;
+		raw_spin_lock(&pc->lock);
+		free_slot = -1;
+		found = 0;
+		for (i = 0; i < MAX_EXTRA_REGS; i++) {
+			era = &pc->regs[i];
+			if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
+				/* Allow sharing same config */
+				if (hwc->extra_config == era->extra_config) {
+					era->ref++;
+					cpuc->percore_used = 1;
+					c = NULL;
+				}
+				/* else conflict */
+				found = 1;
+				break;
+			} else if (era->ref == 0 && free_slot == -1)
+				free_slot = i;
+		}
+		if (!found && free_slot != -1) {
+			era = &pc->regs[free_slot];
+			era->ref = 1;
+			era->extra_reg = hwc->extra_reg;
+			era->extra_config = hwc->extra_config;
+			cpuc->percore_used = 1;
+			c = NULL;
+		}
+		raw_spin_unlock(&pc->lock);
+		return c;
+	}
+
+	return NULL;
+}
+
+static struct event_constraint *
 intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 {
 	struct event_constraint *c;
@@ -806,9 +914,50 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
 	if (c)
 		return c;
 
+	c = intel_percore_constraints(cpuc, event);
+	if (c)
+		return c;
+
 	return x86_get_event_constraints(cpuc, event);
 }
 
+static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+					struct perf_event *event)
+{
+	struct extra_reg *er;
+	struct intel_percore *pc;
+	struct er_account *era;
+	struct hw_perf_event *hwc = &event->hw;
+	int i, allref;
+
+	if (!cpuc->percore_used)
+		return;
+
+	for (er = x86_pmu.extra_regs; er->msr; er++) {
+		if (er->event != (hwc->config & er->config_mask))
+			continue;
+
+		pc = cpuc->per_core;
+		raw_spin_lock(&pc->lock);
+		for (i = 0; i < MAX_EXTRA_REGS; i++) {
+			era = &pc->regs[i];
+			if (era->ref > 0 &&
+			    era->extra_config == hwc->extra_config &&
+			    era->extra_reg == er->msr) {
+				era->ref--;
+				break;
+			}
+		}
+		allref = 0;
+		for (i = 0; i < MAX_EXTRA_REGS; i++)
+			allref += pc->regs[i].ref;
+		if (allref == 0)
+			cpuc->percore_used = 0;
+		raw_spin_unlock(&pc->lock);
+		break;
+	}
+}
+
 static int intel_pmu_hw_config(struct perf_event *event)
 {
 	int ret = x86_pmu_hw_config(event);
@@ -854,11 +1003,43 @@ static __initconst const struct x86_pmu core_pmu = {
 	 */
 	.max_period		= (1ULL << 31) - 1,
 	.get_event_constraints	= intel_get_event_constraints,
+	.put_event_constraints	= intel_put_event_constraints,
 	.event_constraints	= intel_core_event_constraints,
 };
 
+static int intel_pmu_cpu_prepare(int cpu)
+{
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+	
+	cpuc->per_core = kzalloc_node(sizeof(struct intel_percore), 
+				      GFP_KERNEL, cpu_to_node(cpu));
+	if (!cpuc->per_core)
+		return NOTIFY_BAD;
+
+	raw_spin_lock_init(&cpuc->per_core->lock);
+	cpuc->per_core->core_id = -1;
+	return NOTIFY_OK;
+}
+
 static void intel_pmu_cpu_starting(int cpu)
 {
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+	int core_id = topology_core_id(cpu);
+	int i;
+
+	for_each_online_cpu(i) {
+		struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core;
+
+		if (pc && pc->core_id == core_id) {
+			kfree(cpuc->per_core);
+			cpuc->per_core = pc;
+			break;
+		}
+	}
+
+	cpuc->per_core->core_id = core_id;
+	cpuc->per_core->refcnt++;
+
 	init_debug_store_on_cpu(cpu);
 	/*
 	 * Deal with CPUs that don't clear their LBRs on power-up.
@@ -868,6 +1049,15 @@ static void intel_pmu_cpu_starting(int cpu)
 
 static void intel_pmu_cpu_dying(int cpu)
 {
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+	struct intel_percore *pc = cpuc->per_core;
+	
+	if (pc) {
+		if (pc->core_id == -1 || --pc->refcnt == 0)
+			kfree(pc);
+		cpuc->per_core = NULL;
+	}
+
 	fini_debug_store_on_cpu(cpu);
 }
 
@@ -892,7 +1082,9 @@ static __initconst const struct x86_pmu intel_pmu = {
 	 */
 	.max_period		= (1ULL << 31) - 1,
 	.get_event_constraints	= intel_get_event_constraints,
+	.put_event_constraints	= intel_put_event_constraints,
 
+	.cpu_prepare		= intel_pmu_cpu_prepare,
 	.cpu_starting		= intel_pmu_cpu_starting,
 	.cpu_dying		= intel_pmu_cpu_dying,
 };
@@ -1010,7 +1202,10 @@ static __init int intel_pmu_init(void)
 		intel_pmu_lbr_init_nhm();
 
 		x86_pmu.event_constraints = intel_nehalem_event_constraints;
+		x86_pmu.percore_constraints =
+			intel_nehalem_percore_constraints;
 		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+		x86_pmu.extra_regs = intel_nehalem_extra_regs;
 		pr_cont("Nehalem events, ");
 		break;
 
@@ -1032,7 +1227,10 @@ static __init int intel_pmu_init(void)
 		intel_pmu_lbr_init_nhm();
 
 		x86_pmu.event_constraints = intel_westmere_event_constraints;
+		x86_pmu.percore_constraints =
+			intel_westmere_percore_constraints;
 		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+		x86_pmu.extra_regs = intel_westmere_extra_regs;
 		pr_cont("Westmere events, ");
 		break;
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index cbf04cc..7e3f231 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -529,6 +529,8 @@ struct hw_perf_event {
 			unsigned long	event_base;
 			int		idx;
 			int		last_cpu;
+			unsigned int	extra_reg;
+			u64		extra_config;
 		};
 		struct { /* software */
 			struct hrtimer	hrtimer;
-- 
1.7.1
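
To make the per-core sharing rule concrete, here is a small stand-alone
model (not kernel code; the helper name try_get_extra_reg is made up) of
the er_account bookkeeping above: HT siblings may share an extra MSR only
when they program the identical mask, otherwise the second event has to
be rejected.

#include <stdio.h>
#include <stdint.h>

#define MAX_EXTRA_REGS 2

struct er_account {
	int	 ref;		/* how many events use this slot */
	unsigned msr;		/* extra MSR number */
	uint64_t config;	/* value programmed into the MSR */
};

/* Simplified version of the allocation rule in intel_percore_constraints():
 * returns 0 if the event may be scheduled, -1 if it conflicts. */
static int try_get_extra_reg(struct er_account *regs, unsigned msr,
			     uint64_t config)
{
	int i, free_slot = -1;

	for (i = 0; i < MAX_EXTRA_REGS; i++) {
		if (regs[i].ref > 0 && regs[i].msr == msr) {
			if (regs[i].config == config) {
				regs[i].ref++;	/* same config: share */
				return 0;
			}
			return -1;		/* different config: conflict */
		}
		if (regs[i].ref == 0 && free_slot < 0)
			free_slot = i;
	}
	if (free_slot < 0)
		return -1;
	regs[free_slot].ref = 1;
	regs[free_slot].msr = msr;
	regs[free_slot].config = config;
	return 0;
}

int main(void)
{
	struct er_account core[MAX_EXTRA_REGS] = { { 0 } };

	printf("%d\n", try_get_extra_reg(core, 0x1a6, 0x7f11)); /*  0: new slot */
	printf("%d\n", try_get_extra_reg(core, 0x1a6, 0x7f11)); /*  0: shared   */
	printf("%d\n", try_get_extra_reg(core, 0x1a6, 0x0001)); /* -1: conflict */
	return 0;
}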



* [PATCH 3/3] perf-events: Fix LLC-* events on Intel Nehalem/Westmere v2
  2010-11-29 13:12 Updated perf offcore patchkit Andi Kleen
  2010-11-29 13:12 ` [PATCH 1/3] perf: Document enhanced event encoding for OFFCORE_MSR Andi Kleen
  2010-11-29 13:12 ` [PATCH 2/3] perf-events: Add support for supplementary event registers v4 Andi Kleen
@ 2010-11-29 13:12 ` Andi Kleen
  2 siblings, 0 replies; 5+ messages in thread
From: Andi Kleen @ 2010-11-29 13:12 UTC (permalink / raw)
  To: peterz; +Cc: mingo, linux-kernel, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

The generic perf LLC-* events count the L2 cache, not the real
L3 LLC, on Intel Nehalem and Westmere. This led to quite some confusion.

Fixing this properly requires use of the special OFFCORE_RESPONSE
events, which need a separate mask register. This was implemented
in an earlier patch.

Now use this infrastructure to set up the correct events for LLC-*
on Nehalem and Westmere.
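
As an illustration (not part of the patch), a minimal user-space sketch
that requests the generic LLC read-miss cache event; with this series
applied, the kernel maps it to the OFFCORE_RESPONSE encoding shown in
the diff below.

#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t misses;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HW_CACHE;
	/* LLC / read / miss, i.e. the generic LLC-load-misses event */
	attr.config = PERF_COUNT_HW_CACHE_LL |
		      (PERF_COUNT_HW_CACHE_OP_READ << 8) |
		      (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	sleep(1);
	if (read(fd, &misses, sizeof(misses)) == sizeof(misses))
		printf("LLC read misses: %" PRIu64 "\n", misses);
	close(fd);
	return 0;
}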

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/cpu/perf_event.c       |   12 ++++----
 arch/x86/kernel/cpu/perf_event_intel.c |   46 +++++++++++++++++++++++---------
 2 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index a6754ea..36549f3 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -472,8 +472,9 @@ static inline int x86_pmu_initialized(void)
 }
 
 static inline int
-set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
+set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
 {
+	struct perf_event_attr *attr = &event->attr;
 	unsigned int cache_type, cache_op, cache_result;
 	u64 config, val;
 
@@ -499,9 +500,8 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
 	if (val == -1)
 		return -EINVAL;
 
-	hwc->config |= val;
-
-	return 0;
+	hwc->config |= val & X86_RAW_EVENT_MASK;
+	return x86_pmu_extra_regs(val, event);
 }
 
 static int x86_setup_perfctr(struct perf_event *event)
@@ -526,10 +526,10 @@ static int x86_setup_perfctr(struct perf_event *event)
 	}
 
 	if (attr->type == PERF_TYPE_RAW)
-		return 0;
+		return x86_pmu_extra_regs(event->attr.config, event);
 
 	if (attr->type == PERF_TYPE_HW_CACHE)
-		return set_ext_hw_attr(hwc, attr);
+		return set_ext_hw_attr(hwc, event);
 
 	if (attr->config >= x86_pmu.max_events)
 		return -EINVAL;
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 04fdcf9..fdc5d01 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -88,7 +88,7 @@ static struct event_constraint intel_nehalem_event_constraints[] =
 
 static struct extra_reg intel_nehalem_extra_regs[] =
 {
-	INTEL_EVENT_EXTRA_REG(0xb7, 0x1a6, 0xffff, 32), /* OFFCORE_RESPONSE_0 */
+	INTEL_EVENT_EXTRA_REG(0xb7, 0x1a6, 0xffff, 32), /* OFFCORE_RESPONSE */
 	EVENT_EXTRA_END
 };
 
@@ -172,16 +172,26 @@ static __initconst const u64 westmere_hw_cache_event_ids
  },
  [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
-		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
+		/* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */
+		[ C(RESULT_ACCESS) ] = 0x00000711004301b7,
+		/* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x0000f811004301bb,
 	},
+	/*
+	 * Use RFO, not WRITEBACK, because a write miss would typically occur
+	 * on RFO.
+	 */
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */
-		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
+		/* OFFCORE_RESPONSE_1.ANY_RFO.LOCAL_CACHE */
+		[ C(RESULT_ACCESS) ] = 0x00000722004301bb,
+		/* OFFCORE_RESPONSE_0.ANY_RFO.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x0000f822004301b7,
 	},
 	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference                */
-		[ C(RESULT_MISS)   ] = 0x412e, /* LLC Misses                   */
+		/* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */
+		[ C(RESULT_ACCESS) ] = 0x00000770004301b7,
+		/* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x0000f870004301bb,
 	},
  },
  [ C(DTLB) ] = {
@@ -263,16 +273,26 @@ static __initconst const u64 nehalem_hw_cache_event_ids
  },
  [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
-		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
+		/* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+		[ C(RESULT_ACCESS) ] = 0x00000711004301b7,
+		/* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x0000f811004301b7,
 	},
+	/*
+	 * Use RFO, not WRITEBACK, because a write miss would typically occur
+	 * on RFO.
+	 */
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */
-		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
+		/* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+		[ C(RESULT_ACCESS) ] = 0x00000722004301b7,
+		/* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x0000f822004301b7,
 	},
 	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference                */
-		[ C(RESULT_MISS)   ] = 0x412e, /* LLC Misses                   */
+		/* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+		[ C(RESULT_ACCESS) ] = 0x00000770004301b7,
+		/* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x0000f870004301b7,
 	},
  },
  [ C(DTLB) ] = {
-- 
1.7.1



* Re: [PATCH 2/3] perf-events: Add support for supplementary event registers v4
  2010-11-29 13:12 ` [PATCH 2/3] perf-events: Add support for supplementary event registers v4 Andi Kleen
@ 2010-12-01 14:26   ` Peter Zijlstra
  0 siblings, 0 replies; 5+ messages in thread
From: Peter Zijlstra @ 2010-12-01 14:26 UTC (permalink / raw)
  To: Andi Kleen; +Cc: mingo, linux-kernel, Andi Kleen, eranian

On Mon, 2010-11-29 at 14:12 +0100, Andi Kleen wrote:

> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
> index 817d2b1..a6754ea 100644
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -93,6 +93,8 @@ struct amd_nb {
>  	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
>  };
>  
> +struct intel_percore;
> +
>  #define MAX_LBR_ENTRIES		16
>  
>  struct cpu_hw_events {
> @@ -128,6 +130,13 @@ struct cpu_hw_events {
>  	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];
>  
>  	/*
> +	 * Intel percore register state.
> +	 * Coordinate shared resources between HT threads.
> +	 */
> +	int				percore_used; /* Used by this CPU? */
> +	struct intel_percore		*per_core;
> +
> +	/*
>  	 * AMD specific bits
>  	 */
>  	struct amd_nb		*amd_nb;

> +/*
> + * Per core state
> + * This is used to coordinate shared registers for HT threads.
> + */
> +struct intel_percore {
> +	raw_spinlock_t		lock;		/* protect structure */
> +	struct er_account	regs[MAX_EXTRA_REGS];
> +	int			refcnt;		/* number of threads */
> +	unsigned		core_id;
> +};
> +
>  /*
>   * Intel PerfMon, used on Core and later.
>   */


> +static int intel_pmu_cpu_prepare(int cpu)
> +{
> +	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
> +	
> +	cpuc->per_core = kzalloc_node(sizeof(struct intel_percore), 
> +				      GFP_KERNEL, cpu_to_node(cpu));
> +	if (!cpuc->per_core)
> +		return NOTIFY_BAD;
> +
> +	raw_spin_lock_init(&cpuc->per_core->lock);
> +	cpuc->per_core->core_id = -1;
> +	return NOTIFY_OK;
> +}
> +
>  static void intel_pmu_cpu_starting(int cpu)
>  {
> +	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
> +	int core_id = topology_core_id(cpu);
> +	int i;
> +
> +	for_each_online_cpu(i) {
> +		struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core;
> +
> +		if (pc && pc->core_id == core_id) {
> +			kfree(cpuc->per_core);
> +			cpuc->per_core = pc;
> +			break;
> +		}
> +	}
> +
> +	cpuc->per_core->core_id = core_id;
> +	cpuc->per_core->refcnt++;
> +
>  	init_debug_store_on_cpu(cpu);
>  	/*
>  	 * Deal with CPUs that don't clear their LBRs on power-up.
> @@ -868,6 +1049,15 @@ static void intel_pmu_cpu_starting(int cpu)
>  
>  static void intel_pmu_cpu_dying(int cpu)
>  {
> +	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
> +	struct intel_percore *pc = cpuc->per_core;
> +	
> +	if (pc) {
> +		if (pc->core_id == -1 || --pc->refcnt == 0)
> +			kfree(pc);
> +		cpuc->per_core = NULL;
> +	}
> +
>  	fini_debug_store_on_cpu(cpu);
>  }
>  
> @@ -892,7 +1082,9 @@ static __initconst const struct x86_pmu intel_pmu = {
>  	 */
>  	.max_period		= (1ULL << 31) - 1,
>  	.get_event_constraints	= intel_get_event_constraints,
> +	.put_event_constraints	= intel_put_event_constraints,
>  
> +	.cpu_prepare		= intel_pmu_cpu_prepare,
>  	.cpu_starting		= intel_pmu_cpu_starting,
>  	.cpu_dying		= intel_pmu_cpu_dying,
>  };
> @@ -1010,7 +1202,10 @@ static __init int intel_pmu_init(void)
>  		intel_pmu_lbr_init_nhm();
>  
>  		x86_pmu.event_constraints = intel_nehalem_event_constraints;
> +		x86_pmu.percore_constraints =
> +			intel_nehalem_percore_constraints;
>  		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
> +		x86_pmu.extra_regs = intel_nehalem_extra_regs;
>  		pr_cont("Nehalem events, ");
>  		break;
>  
> @@ -1032,7 +1227,10 @@ static __init int intel_pmu_init(void)
>  		intel_pmu_lbr_init_nhm();
>  
>  		x86_pmu.event_constraints = intel_westmere_event_constraints;
> +		x86_pmu.percore_constraints =
> +			intel_westmere_percore_constraints;
>  		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
> +		x86_pmu.extra_regs = intel_westmere_extra_regs;
>  		pr_cont("Westmere events, ");
>  		break;
>  

You seem to have lost the needs_percore stuff.

