* [PATCH 0/4] perf: memory load/store events generalization
@ 2011-07-04  8:02 Lin Ming
  2011-07-04  8:02 ` [PATCH 1/4] perf: Add memory load/store events generic code Lin Ming
                   ` (4 more replies)
  0 siblings, 5 replies; 37+ messages in thread
From: Lin Ming @ 2011-07-04  8:02 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo
  Cc: linux-kernel

Hi all,

The Intel PMU provides two facilities to monitor memory operations: load latency and precise store.
This patchset generalizes memory load/store events so that other
architectures may also implement such features.

A new sub-command "mem" is added:

$ perf mem

 usage: perf mem [<options>] {record <command> | report}

    -t, --type <type>     memory operations (load/store)
    -L, --latency <n>     latency to sample (only for load op)

$ perf mem -t load record make -j8

<building kernel ..., monitoring memory load operation>

$ perf mem -t load report

Memory load operation statistics
================================
                      L1-local: total latency=   28027, count=    3355(avg=8)
                      L2-snoop: total latency=    1430, count=      29(avg=49)
                      L2-local: total latency=     124, count=       8(avg=15)
             L3-snoop, found M: total latency=     452, count=       4(avg=113)
          L3-snoop, found no M: total latency=       0, count=       0(avg=0)
L3-snoop, no coherency actions: total latency=     875, count=      18(avg=48)
        L3-miss, snoop, shared: total latency=       0, count=       0(avg=0)
     L3-miss, local, exclusive: total latency=       0, count=       0(avg=0)
        L3-miss, local, shared: total latency=       0, count=       0(avg=0)
    L3-miss, remote, exclusive: total latency=       0, count=       0(avg=0)
       L3-miss, remote, shared: total latency=       0, count=       0(avg=0)
                    Unknown L3: total latency=       0, count=       0(avg=0)
                            IO: total latency=       0, count=       0(avg=0)
                      Uncached: total latency=     464, count=      30(avg=15)

$ perf mem -t store record make -j8

<building kernel ..., monitoring memory store operation>

$ perf mem -t store report

Memory store operation statistics
=================================
                data-cache hit:     8138
               data-cache miss:        0
                      STLB hit:     8138
                     STLB miss:        0
                 Locked access:        0
               Unlocked access:     8138
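
For completeness, here is a minimal sketch of opening the generic load
event directly with perf_event_open(), bypassing the tool entirely; this
is illustrative only, mirrors the attr fields the patches below use, and
omits error handling:

#include <linux/perf_event.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int open_mem_load(pid_t pid, unsigned int latency)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_MEM_LOAD;	/* generic load event (patch 1) */
	attr.config1 = latency;			/* latency threshold, minimum 3 (patch 2) */
	attr.sample_period = 1000;		/* arbitrary period for this sketch */
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
			   PERF_SAMPLE_LATENCY | PERF_SAMPLE_EXTRA;
	attr.precise_ip = 2;			/* request PEBS */

	return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}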

Any comments are appreciated.

Thanks,
Lin Ming

* [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-04  8:02 [PATCH 0/4] perf: memory load/store events generalization Lin Ming
@ 2011-07-04  8:02 ` Lin Ming
  2011-07-04  8:33   ` Peter Zijlstra
                     ` (2 more replies)
  2011-07-04  8:02 ` [PATCH 2/4] perf, x86: Add Intel Nhm/Wsm/Snb load latency support Lin Ming
                   ` (3 subsequent siblings)
  4 siblings, 3 replies; 37+ messages in thread
From: Lin Ming @ 2011-07-04  8:02 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo
  Cc: linux-kernel

Add generic memory load/store events: PERF_COUNT_HW_MEM_{LOAD, STORE}
Add generic memory load/store operation encoding.
Add code to handle memory operation data.
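
For illustration only (not part of the patch), a consumer of the
PERF_SAMPLE_EXTRA value could decode the load encoding with the bits
defined below; mem_load_level() is a made-up helper name:

/* Map the bits(0-1)/bits(2-3) fields of 'extra' to a level name (sketch) */
static const char *mem_load_level(u64 extra)
{
	if ((extra & (0x03 << 2)) == MEM_LOAD_TOGGLE) {
		/* locality 0 toggles bits 0-1 to the alternate set */
		switch (extra & 0x03) {
		case MEM_LOAD_UNKNOWN:	return "unknown";
		case MEM_LOAD_IO:	return "IO";
		case MEM_LOAD_UNCACHED:	return "uncached";
		default:		return "reserved";
		}
	}
	switch (extra & 0x03) {
	case MEM_LOAD_L1:	return "L1";
	case MEM_LOAD_L2:	return "L2";
	case MEM_LOAD_L3:	return "L3";
	default:		return "RAM";
	}
}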

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
---
 include/linux/perf_event.h |   44 +++++++++++++++++++++++++++++++++++++++++++-
 kernel/events/core.c       |   12 ++++++++++++
 2 files changed, 55 insertions(+), 1 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e76a410..c410ae4 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -54,6 +54,8 @@ enum perf_hw_id {
 	PERF_COUNT_HW_BUS_CYCLES		= 6,
 	PERF_COUNT_HW_STALLED_CYCLES_FRONTEND	= 7,
 	PERF_COUNT_HW_STALLED_CYCLES_BACKEND	= 8,
+	PERF_COUNT_HW_MEM_LOAD			= 9,
+	PERF_COUNT_HW_MEM_STORE			= 10,
 
 	PERF_COUNT_HW_MAX,			/* non-ABI */
 };
@@ -127,8 +129,10 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_PERIOD			= 1U << 8,
 	PERF_SAMPLE_STREAM_ID			= 1U << 9,
 	PERF_SAMPLE_RAW				= 1U << 10,
+	PERF_SAMPLE_LATENCY			= 1U << 11,
+	PERF_SAMPLE_EXTRA			= 1U << 12,
 
-	PERF_SAMPLE_MAX = 1U << 11,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 13,		/* non-ABI */
 };
 
 /*
@@ -432,6 +436,8 @@ enum perf_event_type {
 	 *	{ u64			stream_id;} && PERF_SAMPLE_STREAM_ID
 	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
 	 *	{ u64			period;   } && PERF_SAMPLE_PERIOD
+	 *	{ u64			latency;  } && PERF_SAMPLE_LATENCY
+	 *	{ u64			extra;    } && PERF_SAMPLE_EXTRA
 	 *
 	 *	{ struct read_format	values;	  } && PERF_SAMPLE_READ
 	 *
@@ -474,6 +480,40 @@ enum perf_callchain_context {
 #define PERF_FLAG_FD_OUTPUT		(1U << 1)
 #define PERF_FLAG_PID_CGROUP		(1U << 2) /* pid=cgroup id, per-cpu mode only */
 
+/*
+ * Memory load operation info encoding
+ */
+
+/* Bits(0-1) {L1, L2, L3, RAM} or {unknown, IO, uncached, reserved} */
+#define MEM_LOAD_L1			0x00
+#define MEM_LOAD_L2			0x01
+#define MEM_LOAD_L3			0x02
+#define MEM_LOAD_RAM			0x03
+#define MEM_LOAD_UNKNOWN		0x00
+#define MEM_LOAD_IO			0x01
+#define MEM_LOAD_UNCACHED		0x02
+#define MEM_LOAD_RESERVED		0x03
+
+/* Bits(2-3) {toggle, snoop, local, remote} */
+#define MEM_LOAD_TOGGLE			(0x00 << 2)
+#define MEM_LOAD_SNOOP			(0x01 << 2)
+#define MEM_LOAD_LOCAL			(0x02 << 2)
+#define MEM_LOAD_REMOTE			(0x03 << 2)
+
+/* Bits(4-5) {modified, exclusive, shared, invalid} */
+#define MEM_LOAD_MODIFIED		(0x00 << 4)
+#define MEM_LOAD_EXCLUSIVE		(0x01 << 4)
+#define MEM_LOAD_SHARED			(0x02 << 4)
+#define MEM_LOAD_INVALID		(0x03 << 4)
+
+/*
+ * Memory store operation info encoding
+ */
+
+#define MEM_STORE_DCU_HIT		(1ULL << 0)
+#define MEM_STORE_STLB_HIT		(1ULL << 1)
+#define MEM_STORE_LOCKED_ACCESS		(1ULL << 2)
+
 #ifdef __KERNEL__
 /*
  * Kernel-internal data types and definitions:
@@ -974,6 +1014,8 @@ struct perf_sample_data {
 		u32	reserved;
 	}				cpu_entry;
 	u64				period;
+	u64				latency;
+	u64				extra;
 	struct perf_callchain_entry	*callchain;
 	struct perf_raw_record		*raw;
 };
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5e70f62..b835ef5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -887,6 +887,12 @@ static void perf_event__header_size(struct perf_event *event)
 	if (sample_type & PERF_SAMPLE_PERIOD)
 		size += sizeof(data->period);
 
+	if (sample_type & PERF_SAMPLE_LATENCY)
+		size += sizeof(data->latency);
+
+	if (sample_type & PERF_SAMPLE_EXTRA)
+		size += sizeof(data->extra);
+
 	if (sample_type & PERF_SAMPLE_READ)
 		size += event->read_size;
 
@@ -3871,6 +3877,12 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_PERIOD)
 		perf_output_put(handle, data->period);
 
+	if (sample_type & PERF_SAMPLE_LATENCY)
+		perf_output_put(handle, data->latency);
+
+	if (sample_type & PERF_SAMPLE_EXTRA)
+		perf_output_put(handle, data->extra);
+
 	if (sample_type & PERF_SAMPLE_READ)
 		perf_output_read(handle, event);
 
-- 
1.7.5.1


* [PATCH 2/4] perf, x86: Add Intel Nhm/Wsm/Snb load latency support
  2011-07-04  8:02 [PATCH 0/4] perf: memory load/store events generalization Lin Ming
  2011-07-04  8:02 ` [PATCH 1/4] perf: Add memory load/store events generic code Lin Ming
@ 2011-07-04  8:02 ` Lin Ming
  2011-07-05 13:17   ` Peter Zijlstra
  2011-07-22 18:58   ` Stephane Eranian
  2011-07-04  8:02 ` [PATCH 3/4] perf, x86: Add Intel SandyBridge precise store support Lin Ming
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 37+ messages in thread
From: Lin Ming @ 2011-07-04  8:02 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo
  Cc: linux-kernel

Implements the Intel memory load event for Nehalem/Westmere/SandyBridge.

$ perf mem -t load record make -j8

<building kernel ..., monitoring memory load operation>

$ perf mem -t load report

Memory load operation statistics
================================
                      L1-local: total latency=   28027, count=    3355(avg=8)
                      L2-snoop: total latency=    1430, count=      29(avg=49)
                      L2-local: total latency=     124, count=       8(avg=15)
             L3-snoop, found M: total latency=     452, count=       4(avg=113)
          L3-snoop, found no M: total latency=       0, count=       0(avg=0)
L3-snoop, no coherency actions: total latency=     875, count=      18(avg=48)
        L3-miss, snoop, shared: total latency=       0, count=       0(avg=0)
     L3-miss, local, exclusive: total latency=       0, count=       0(avg=0)
        L3-miss, local, shared: total latency=       0, count=       0(avg=0)
    L3-miss, remote, exclusive: total latency=       0, count=       0(avg=0)
       L3-miss, remote, shared: total latency=       0, count=       0(avg=0)
                    Unknown L3: total latency=       0, count=       0(avg=0)
                            IO: total latency=       0, count=       0(avg=0)
                      Uncached: total latency=     464, count=      30(avg=15)
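
For reference, the mem tool is a thin wrapper around perf record; an
equivalent hand-rolled session (a sketch, relying on the mem-load event
syntax and the -l record option added in patch 4, with the default
latency threshold of 3) would be:

$ perf record -l -d -e mem-load:0003:p make -j8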

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
---
 arch/x86/include/asm/msr-index.h          |    2 +
 arch/x86/kernel/cpu/perf_event.c          |   10 ++++++
 arch/x86/kernel/cpu/perf_event_intel.c    |   20 +++++++++++-
 arch/x86/kernel/cpu/perf_event_intel_ds.c |   49 ++++++++++++++++++++++++++--
 4 files changed, 76 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 485b4f1..da93a9d 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -60,6 +60,8 @@
 #define MSR_IA32_DS_AREA		0x00000600
 #define MSR_IA32_PERF_CAPABILITIES	0x00000345
 
+#define MSR_PEBS_LD_LAT_THRESHOLD	0x000003f6
+
 #define MSR_MTRRfix64K_00000		0x00000250
 #define MSR_MTRRfix16K_80000		0x00000258
 #define MSR_MTRRfix16K_A0000		0x00000259
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3a0338b..ce380a7 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -207,6 +207,9 @@ struct extra_reg {
 #define INTEL_EVENT_EXTRA_REG(event, msr, vm)	\
 	EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
 #define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
+#define INTEL_EVENT_EXTRA_REG2(event, msr, vm)    \
+	EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
+			ARCH_PERFMON_EVENTSEL_UMASK, vm)
 
 union perf_capabilities {
 	struct {
@@ -406,6 +409,11 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
 			continue;
 		if (event->attr.config1 & ~er->valid_mask)
 			return -EINVAL;
+
+		/* The minimum value that may be programmed into MSR_PEBS_LD_LAT_THRESHOLD is 3 */
+		if (er->msr == MSR_PEBS_LD_LAT_THRESHOLD && event->attr.config1 < 3)
+			return -EINVAL;
+
 		event->hw.extra_reg = er->msr;
 		event->hw.extra_config = event->attr.config1;
 		break;
@@ -617,6 +625,8 @@ static int x86_setup_perfctr(struct perf_event *event)
 	if (config == -1LL)
 		return -EINVAL;
 
+	x86_pmu_extra_regs(config, event);
+
 	/*
 	 * Branch tracing:
 	 */
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 41178c8..dde9041 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,6 +1,6 @@
 #ifdef CONFIG_CPU_SUP_INTEL
 
-#define MAX_EXTRA_REGS 2
+#define MAX_EXTRA_REGS 3
 
 /*
  * Per register state.
@@ -89,6 +89,7 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
 static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
 {
 	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
+	INTEL_EVENT_EXTRA_REG2(0x100b, MSR_PEBS_LD_LAT_THRESHOLD, 0xffff),
 	EVENT_EXTRA_END
 };
 
@@ -123,10 +124,17 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
 	EVENT_CONSTRAINT_END
 };
 
+static struct extra_reg intel_snb_extra_regs[] __read_mostly =
+{
+	INTEL_EVENT_EXTRA_REG2(0x01cd, MSR_PEBS_LD_LAT_THRESHOLD, 0xffff),
+	EVENT_EXTRA_END
+};
+
 static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
 {
 	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
 	INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff),
+	INTEL_EVENT_EXTRA_REG2(0x100b, MSR_PEBS_LD_LAT_THRESHOLD, 0xffff),
 	EVENT_EXTRA_END
 };
 
@@ -1445,6 +1453,9 @@ static __init int intel_pmu_init(void)
 		/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
 		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
 
+		/* Memory load latency */
+		intel_perfmon_event_map[PERF_COUNT_HW_MEM_LOAD] = 0x100b;
+
 		if (ebx & 0x40) {
 			/*
 			 * Erratum AAJ80 detected, we work it around by using
@@ -1491,6 +1502,9 @@ static __init int intel_pmu_init(void)
 		/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
 		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
 
+		/* Memory load latency */
+		intel_perfmon_event_map[PERF_COUNT_HW_MEM_LOAD] = 0x100b;
+
 		pr_cont("Westmere events, ");
 		break;
 
@@ -1502,12 +1516,16 @@ static __init int intel_pmu_init(void)
 
 		x86_pmu.event_constraints = intel_snb_event_constraints;
 		x86_pmu.pebs_constraints = intel_snb_pebs_events;
+		x86_pmu.extra_regs = intel_snb_extra_regs;
 
 		/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
 		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
 		/* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
 		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1;
 
+		/* Memory load latency */
+		intel_perfmon_event_map[PERF_COUNT_HW_MEM_LOAD] = 0x01cd;
+
 		pr_cont("SandyBridge events, ");
 		break;
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index bab491b..d2d3155 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -1,5 +1,28 @@
 #ifdef CONFIG_CPU_SUP_INTEL
 
+/* Indexed by Intel load latency data source encoding value */
+
+static u64 load_latency_data_source[] = {
+	MEM_LOAD_UNKNOWN | MEM_LOAD_TOGGLE,			/* 0x00: Unknown L3 */
+	MEM_LOAD_L1 | MEM_LOAD_LOCAL,				/* 0x01: L1-local */
+	MEM_LOAD_L2 | MEM_LOAD_SNOOP,				/* 0x02: L2-snoop */
+	MEM_LOAD_L2 | MEM_LOAD_LOCAL,				/* 0x03: L2-local */
+	MEM_LOAD_L3 | MEM_LOAD_SNOOP | MEM_LOAD_INVALID,	/* 0x04: L3-snoop, no coherency actions */
+	MEM_LOAD_L3 | MEM_LOAD_SNOOP | MEM_LOAD_SHARED,		/* 0x05: L3-snoop, found no M */
+	MEM_LOAD_L3 | MEM_LOAD_SNOOP | MEM_LOAD_MODIFIED,	/* 0x06: L3-snoop, found M */
+	MEM_LOAD_RESERVED,					/* 0x07: reserved */
+	MEM_LOAD_RAM | MEM_LOAD_SNOOP | MEM_LOAD_SHARED,	/* 0x08: L3-miss, snoop, shared */
+	MEM_LOAD_RESERVED,					/* 0x09: reserved */
+	MEM_LOAD_RAM | MEM_LOAD_LOCAL | MEM_LOAD_SHARED,	/* 0x0A: L3-miss, local, shared */
+	MEM_LOAD_RAM | MEM_LOAD_REMOTE | MEM_LOAD_SHARED,	/* 0x0B: L3-miss, remote, shared */
+	MEM_LOAD_RAM | MEM_LOAD_LOCAL | MEM_LOAD_EXCLUSIVE,	/* 0x0C: L3-miss, local, exclusive */
+	MEM_LOAD_RAM | MEM_LOAD_REMOTE | MEM_LOAD_EXCLUSIVE,	/* 0x0D: L3-miss, remote, exclusive */
+	MEM_LOAD_IO | MEM_LOAD_TOGGLE,				/* 0x0E: IO */
+	MEM_LOAD_UNCACHED | MEM_LOAD_TOGGLE,			/* 0x0F: Uncached */
+};
+
+#define LOAD_LATENCY_DATA_SOURCE_MASK	0x0FULL
+
 /* The maximal number of PEBS events: */
 #define MAX_PEBS_EVENTS		4
 
@@ -454,6 +477,8 @@ static void intel_pmu_pebs_enable(struct perf_event *event)
 	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
 	cpuc->pebs_enabled |= 1ULL << hwc->idx;
+	if (hwc->extra_reg == MSR_PEBS_LD_LAT_THRESHOLD)
+		cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
 	WARN_ON_ONCE(cpuc->enabled);
 
 	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
@@ -466,6 +491,8 @@ static void intel_pmu_pebs_disable(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 
 	cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
+	if (hwc->extra_reg == MSR_PEBS_LD_LAT_THRESHOLD)
+		cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
 	if (cpuc->enabled)
 		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
 
@@ -582,13 +609,13 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 				   struct pt_regs *iregs, void *__pebs)
 {
 	/*
-	 * We cast to pebs_record_core since that is a subset of
-	 * both formats and we don't use the other fields in this
-	 * routine.
+	 * We cast to pebs_record_nhm to get at the load latency data
+	 * when the MSR_PEBS_LD_LAT_THRESHOLD extra_reg is in use.
 	 */
-	struct pebs_record_core *pebs = __pebs;
+	struct pebs_record_nhm *pebs = __pebs;
 	struct perf_sample_data data;
 	struct pt_regs regs;
+	u64 sample_type;
 
 	if (!intel_pmu_save_and_restart(event))
 		return;
@@ -596,6 +623,20 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 	perf_sample_data_init(&data, 0);
 	data.period = event->hw.last_period;
 
+	if (event->attr.config == PERF_COUNT_HW_MEM_LOAD) {
+		sample_type = event->attr.sample_type;
+
+		if (sample_type & PERF_SAMPLE_ADDR)
+			data.addr = pebs->dla;
+
+		if (sample_type & PERF_SAMPLE_LATENCY)
+			data.latency = pebs->lat;
+
+		if (sample_type & PERF_SAMPLE_EXTRA)
+			data.extra = load_latency_data_source[pebs->dse &
+					LOAD_LATENCY_DATA_SOURCE_MASK];
+	}
+
 	/*
 	 * We use the interrupt regs as a base because the PEBS record
 	 * does not contain a full regs set, specifically it seems to
-- 
1.7.5.1


* [PATCH 3/4] perf, x86: Add Intel SandyBridge precise store support
  2011-07-04  8:02 [PATCH 0/4] perf: memory load/store events generalization Lin Ming
  2011-07-04  8:02 ` [PATCH 1/4] perf: Add memory load/store events generic code Lin Ming
  2011-07-04  8:02 ` [PATCH 2/4] perf, x86: Add Intel Nhm/Wsm/Snb load latency support Lin Ming
@ 2011-07-04  8:02 ` Lin Ming
  2011-07-11  8:32   ` Peter Zijlstra
  2011-07-04  8:02 ` [PATCH 4/4] perf, tool: Add new command "perf mem" Lin Ming
  2011-07-22 18:55 ` [PATCH 0/4] perf: memory load/store events generalization Stephane Eranian
  4 siblings, 1 reply; 37+ messages in thread
From: Lin Ming @ 2011-07-04  8:02 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo
  Cc: linux-kernel

Implements the Intel memory store event for SandyBridge.

$ perf mem -t store record make -j8

<building kernel ..., monitoring memory store operation>

$ perf mem -t store report

Memory store operation statistics
=================================
                data-cache hit:     8138
               data-cache miss:        0
                      STLB hit:     8138
                     STLB miss:        0
                 Locked access:        0
               Unlocked access:     8138
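
For illustration only (not part of the patch), the PERF_SAMPLE_EXTRA
value carried with each store sample can be decoded with the generic
bits from patch 1; count_store() is a made-up helper name:

/* Tally one store sample into hit/STLB/locked counters (sketch) */
static void count_store(u64 extra, u64 *dcu_hit, u64 *stlb_hit, u64 *locked)
{
	if (extra & MEM_STORE_DCU_HIT)		/* DCU == L1D */
		(*dcu_hit)++;
	if (extra & MEM_STORE_STLB_HIT)		/* second-level TLB */
		(*stlb_hit)++;
	if (extra & MEM_STORE_LOCKED_ACCESS)	/* LOCK'ed access */
		(*locked)++;
}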

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
---
 arch/x86/kernel/cpu/perf_event_intel.c    |    3 +-
 arch/x86/kernel/cpu/perf_event_intel_ds.c |   30 +++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index dde9041..eede1f3 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1523,8 +1523,9 @@ static __init int intel_pmu_init(void)
 		/* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
 		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1;
 
-		/* Memory load latency */
+		/* Memory load latency and precise store */
 		intel_perfmon_event_map[PERF_COUNT_HW_MEM_LOAD] = 0x01cd;
+		intel_perfmon_event_map[PERF_COUNT_HW_MEM_STORE] = 0x02cd;
 
 		pr_cont("SandyBridge events, ");
 		break;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index d2d3155..bd7289b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -23,6 +23,26 @@ static u64 load_latency_data_source[] = {
 
 #define LOAD_LATENCY_DATA_SOURCE_MASK	0x0FULL
 
+#define PRECISE_STORE_DCU_HIT		(1ULL << 0)
+#define PRECISE_STORE_STLB_MISS		(1ULL << 4)
+#define PRECISE_STORE_LOCKED_ACCESS	(1ULL << 5)
+
+static u64 precise_store_data(u64 status)
+{
+	u64 extra = 0;
+
+	if (status & PRECISE_STORE_DCU_HIT)
+		extra |= MEM_STORE_DCU_HIT;
+
+	if (!(status & PRECISE_STORE_STLB_MISS))
+		extra |= MEM_STORE_STLB_HIT;
+
+	if (status & PRECISE_STORE_LOCKED_ACCESS)
+		extra |= MEM_STORE_LOCKED_ACCESS;
+
+	return extra;
+}
+
 /* The maximal number of PEBS events: */
 #define MAX_PEBS_EVENTS		4
 
@@ -637,6 +657,16 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 					LOAD_LATENCY_DATA_SOURCE_MASK];
 	}
 
+	if (event->attr.config == PERF_COUNT_HW_MEM_STORE) {
+		sample_type = event->attr.sample_type;
+
+		if (sample_type & PERF_SAMPLE_ADDR)
+			data.addr = pebs->dla;
+
+		if (sample_type & PERF_SAMPLE_EXTRA)
+			data.extra = precise_store_data(pebs->dse);
+	}
+
 	/*
 	 * We use the interrupt regs as a base because the PEBS record
 	 * does not contain a full regs set, specifically it seems to
-- 
1.7.5.1


* [PATCH 4/4] perf, tool: Add new command "perf mem"
  2011-07-04  8:02 [PATCH 0/4] perf: memory load/store events generalization Lin Ming
                   ` (2 preceding siblings ...)
  2011-07-04  8:02 ` [PATCH 3/4] perf, x86: Add Intel SandyBridge precise store support Lin Ming
@ 2011-07-04  8:02 ` Lin Ming
  2011-07-04 22:00   ` Andi Kleen
  2011-07-22 18:55 ` [PATCH 0/4] perf: memory load/store events generalization Stephane Eranian
  4 siblings, 1 reply; 37+ messages in thread
From: Lin Ming @ 2011-07-04  8:02 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo
  Cc: linux-kernel

Adds a new command, "perf mem", to monitor memory load/store events.

$ perf mem

 usage: perf mem [<options>] {record <command> | report}

    -t, --type <type>     memory operations (load/store)
    -L, --latency <n>     latency to sample (only for load op)
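
A typical session might look like this (illustrative only; the latency
threshold and the ./workload command are placeholders):

$ perf mem -t load -L 10 record ./workload
$ perf mem -t load report
$ perf mem -t store record ./workload
$ perf mem -t store report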

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
---
 tools/perf/Makefile            |    1 +
 tools/perf/builtin-record.c    |    8 ++++++++
 tools/perf/builtin-script.c    |    6 +++---
 tools/perf/builtin.h           |    1 +
 tools/perf/perf.c              |    1 +
 tools/perf/util/event.h        |    2 ++
 tools/perf/util/evsel.c        |   10 ++++++++++
 tools/perf/util/parse-events.c |   40 ++++++++++++++++++++++++++++++++++------
 tools/perf/util/parse-events.h |    2 +-
 9 files changed, 61 insertions(+), 10 deletions(-)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 032ba63..221d1d8 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -372,6 +372,7 @@ BUILTIN_OBJS += $(OUTPUT)builtin-lock.o
 BUILTIN_OBJS += $(OUTPUT)builtin-kvm.o
 BUILTIN_OBJS += $(OUTPUT)builtin-test.o
 BUILTIN_OBJS += $(OUTPUT)builtin-inject.o
+BUILTIN_OBJS += $(OUTPUT)builtin-mem.o
 
 PERFLIBS = $(LIB_FILE)
 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 8e2c857..8ebdcdd 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -63,6 +63,7 @@ static bool			inherit_stat			=  false;
 static bool			no_samples			=  false;
 static bool			sample_address			=  false;
 static bool			sample_time			=  false;
+static bool			latency_data			=  false;
 static bool			no_buildid			=  false;
 static bool			no_buildid_cache		=  false;
 static struct perf_evlist	*evsel_list;
@@ -199,6 +200,11 @@ static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist)
 		attr->mmap_data = track;
 	}
 
+	if (latency_data) {
+		attr->sample_type	|= PERF_SAMPLE_LATENCY;
+		attr->sample_type	|= PERF_SAMPLE_EXTRA;
+	}
+
 	if (call_graph)
 		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;
 
@@ -780,6 +786,8 @@ const struct option record_options[] = {
 	OPT_BOOLEAN('T', "timestamp", &sample_time, "Sample timestamps"),
 	OPT_BOOLEAN('n', "no-samples", &no_samples,
 		    "don't sample"),
+	OPT_BOOLEAN('l', "latency", &latency_data,
+		    "Latency data"),
 	OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache,
 		    "do not update the buildid cache"),
 	OPT_BOOLEAN('B', "no-buildid", &no_buildid,
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 3056b45..c7489a6 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -140,7 +140,7 @@ static int perf_event_attr__check_stype(struct perf_event_attr *attr,
 		return 0;
 
 	if (output[type].user_set) {
-		evname = __event_name(attr->type, attr->config);
+		evname = __event_name(attr->type, attr->config, attr->config1);
 		pr_err("Samples for '%s' event do not have %s attribute set. "
 		       "Cannot print '%s' field.\n",
 		       evname, sample_msg, output_field2str(field));
@@ -149,7 +149,7 @@ static int perf_event_attr__check_stype(struct perf_event_attr *attr,
 
 	/* user did not ask for it explicitly so remove from the default list */
 	output[type].fields &= ~field;
-	evname = __event_name(attr->type, attr->config);
+	evname = __event_name(attr->type, attr->config, attr->config1);
 	pr_debug("Samples for '%s' event do not have %s attribute set. "
 		 "Skipping '%s' field.\n",
 		 evname, sample_msg, output_field2str(field));
@@ -292,7 +292,7 @@ static void print_sample_start(struct perf_sample *sample,
 			if (event)
 				evname = event->name;
 		} else
-			evname = __event_name(attr->type, attr->config);
+			evname = __event_name(attr->type, attr->config, 0);
 
 		printf("%s: ", evname ? evname : "(unknown)");
 	}
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 4702e24..419ba8f 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -36,5 +36,6 @@ extern int cmd_lock(int argc, const char **argv, const char *prefix);
 extern int cmd_kvm(int argc, const char **argv, const char *prefix);
 extern int cmd_test(int argc, const char **argv, const char *prefix);
 extern int cmd_inject(int argc, const char **argv, const char *prefix);
+extern int cmd_mem(int argc, const char **argv, const char *prefix);
 
 #endif
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index ec635b7..20c53f8 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -332,6 +332,7 @@ static void handle_internal_command(int argc, const char **argv)
 		{ "kvm",	cmd_kvm,	0 },
 		{ "test",	cmd_test,	0 },
 		{ "inject",	cmd_inject,	0 },
+		{ "mem",	cmd_mem,	0 },
 	};
 	unsigned int i;
 	static const char ext[] = STRIP_EXTENSION;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 1d7f664..1392867 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -76,6 +76,8 @@ struct perf_sample {
 	u64 id;
 	u64 stream_id;
 	u64 period;
+	u64 latency;
+	u64 extra;
 	u32 cpu;
 	u32 raw_size;
 	void *raw_data;
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index a03a36b..8eab351 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -405,6 +405,16 @@ int perf_event__parse_sample(const union perf_event *event, u64 type,
 		array++;
 	}
 
+	if (type & PERF_SAMPLE_LATENCY) {
+		data->latency = *array;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_EXTRA) {
+		data->extra = *array;
+		array++;
+	}
+
 	if (type & PERF_SAMPLE_READ) {
 		fprintf(stderr, "PERF_SAMPLE_READ is unsuported for now\n");
 		return -1;
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 41982c3..9f3bcb9 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -40,6 +40,8 @@ static struct event_symbol event_symbols[] = {
   { CHW(BRANCH_INSTRUCTIONS),		"branch-instructions",		"branches"		},
   { CHW(BRANCH_MISSES),			"branch-misses",		""			},
   { CHW(BUS_CYCLES),			"bus-cycles",			""			},
+  { CHW(MEM_LOAD),			"mem-load",			""			},
+  { CHW(MEM_STORE),			"mem-store",			""			},
 
   { CSW(CPU_CLOCK),			"cpu-clock",			""			},
   { CSW(TASK_CLOCK),			"task-clock",			""			},
@@ -297,15 +299,18 @@ const char *event_name(struct perf_evsel *evsel)
 	if (evsel->name)
 		return evsel->name;
 
-	return __event_name(type, config);
+	return __event_name(type, config, evsel->attr.config1);
 }
 
-const char *__event_name(int type, u64 config)
+const char *__event_name(int type, u64 config, u64 extra)
 {
 	static char buf[32];
+	int n;
 
 	if (type == PERF_TYPE_RAW) {
-		sprintf(buf, "raw 0x%" PRIx64, config);
+		n = sprintf(buf, "raw 0x%" PRIx64, config);
+		if (extra)
+			sprintf(buf + n, ":%#" PRIx64, extra);
 		return buf;
 	}
 
@@ -668,6 +673,7 @@ static enum event_result
 parse_symbolic_event(const char **strp, struct perf_event_attr *attr)
 {
 	const char *str = *strp;
+	u64 config;
 	unsigned int i;
 	int n;
 
@@ -676,7 +682,18 @@ parse_symbolic_event(const char **strp, struct perf_event_attr *attr)
 		if (n > 0) {
 			attr->type = event_symbols[i].type;
 			attr->config = event_symbols[i].config;
-			*strp = str + n;
+			str += n;
+			*strp = str;
+
+			if (*str++ == ':') {
+				n = hex2u64(str + 1, &config);
+				if (n > 0) {
+					attr->config1 = config;
+					str += n + 1;
+					*strp = str;
+				}
+			}
+
 			return EVT_HANDLED;
 		}
 	}
@@ -694,9 +711,20 @@ parse_raw_event(const char **strp, struct perf_event_attr *attr)
 		return EVT_FAILED;
 	n = hex2u64(str + 1, &config);
 	if (n > 0) {
-		*strp = str + n + 1;
+		str += n + 1;
+		*strp = str;
 		attr->type = PERF_TYPE_RAW;
 		attr->config = config;
+
+		if (*str++ == ':') {
+			n = hex2u64(str + 1, &config);
+			if (n > 0) {
+				attr->config1 = config;
+				str += n + 1;
+				*strp = str;
+			}
+		}
+
 		return EVT_HANDLED;
 	}
 	return EVT_FAILED;
@@ -1078,7 +1106,7 @@ void print_events(const char *event_glob)
 
 	printf("\n");
 	printf("  %-50s [%s]\n",
-		"rNNN (see 'perf list --help' on how to encode it)",
+		"rNNN[:EEE] (see 'perf list --help' on how to encode it)",
 	       event_type_descriptors[PERF_TYPE_RAW]);
 	printf("\n");
 
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 746d3fc..904c8c4 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -22,7 +22,7 @@ extern bool have_tracepoints(struct list_head *evlist);
 
 const char *event_type(int type);
 const char *event_name(struct perf_evsel *event);
-extern const char *__event_name(int type, u64 config);
+extern const char *__event_name(int type, u64 config, u64 extra);
 
 extern int parse_events(const struct option *opt, const char *str, int unset);
 extern int parse_filter(const struct option *opt, const char *str, int unset);
-- 
1.7.5.1


* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-04  8:02 ` [PATCH 1/4] perf: Add memory load/store events generic code Lin Ming
@ 2011-07-04  8:33   ` Peter Zijlstra
  2011-07-04  8:44     ` Peter Zijlstra
  2011-07-04 22:01     ` Andi Kleen
  2011-07-04 11:08   ` Peter Zijlstra
  2011-07-04 11:16   ` Peter Zijlstra
  2 siblings, 2 replies; 37+ messages in thread
From: Peter Zijlstra @ 2011-07-04  8:33 UTC (permalink / raw)
  To: Lin Ming
  Cc: Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel

On Mon, 2011-07-04 at 08:02 +0000, Lin Ming wrote:
> +/*
> + * Memory load operation info encoding
> + */
> +
> +/* Bits(0-1) {L1, L2, L3, RAM} or {unknown, IO, uncached, reserved}
> */
> +#define MEM_LOAD_L1                    0x00
> +#define MEM_LOAD_L2                    0x01
> +#define MEM_LOAD_L3                    0x02
> +#define MEM_LOAD_RAM                   0x03
> +#define MEM_LOAD_UNKNOWN               0x00
> +#define MEM_LOAD_IO                    0x01
> +#define MEM_LOAD_UNCACHED              0x02
> +#define MEM_LOAD_RESERVED              0x03
> +
> +/* Bits(2-3) {toggle, snoop, local, remote} */
> +#define MEM_LOAD_TOGGLE                        (0x00 << 2)
> +#define MEM_LOAD_SNOOP                 (0x01 << 2)
> +#define MEM_LOAD_LOCAL                 (0x02 << 2)
> +#define MEM_LOAD_REMOTE                        (0x03 << 2)
> +
> +/* Bits(4-5) {modified, exclusive, shared, invalid} */
> +#define MEM_LOAD_MODIFIED              (0x00 << 4)
> +#define MEM_LOAD_EXCLUSIVE             (0x01 << 4)
> +#define MEM_LOAD_SHARED                        (0x02 << 4)
> +#define MEM_LOAD_INVALID               (0x03 << 4) 

Did anybody check with the other PMUs that have similar features like
PowerPC and possibly IA64?

I keep mentioning this, nobody seems interested.

* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-04  8:33   ` Peter Zijlstra
@ 2011-07-04  8:44     ` Peter Zijlstra
  2011-07-05 12:03       ` Peter Zijlstra
  2011-07-04 22:01     ` Andi Kleen
  1 sibling, 1 reply; 37+ messages in thread
From: Peter Zijlstra @ 2011-07-04  8:44 UTC (permalink / raw)
  To: Lin Ming
  Cc: Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel, Anton Blanchard, paulus

On Mon, 2011-07-04 at 10:33 +0200, Peter Zijlstra wrote:
> On Mon, 2011-07-04 at 08:02 +0000, Lin Ming wrote:
> > +/*
> > + * Memory load operation info encoding
> > + */
> > +
> > +/* Bits(0-1) {L1, L2, L3, RAM} or {unknown, IO, uncached, reserved}
> > */
> > +#define MEM_LOAD_L1                    0x00
> > +#define MEM_LOAD_L2                    0x01
> > +#define MEM_LOAD_L3                    0x02
> > +#define MEM_LOAD_RAM                   0x03
> > +#define MEM_LOAD_UNKNOWN               0x00
> > +#define MEM_LOAD_IO                    0x01
> > +#define MEM_LOAD_UNCACHED              0x02
> > +#define MEM_LOAD_RESERVED              0x03
> > +
> > +/* Bits(2-3) {toggle, snoop, local, remote} */
> > +#define MEM_LOAD_TOGGLE                        (0x00 << 2)
> > +#define MEM_LOAD_SNOOP                 (0x01 << 2)
> > +#define MEM_LOAD_LOCAL                 (0x02 << 2)
> > +#define MEM_LOAD_REMOTE                        (0x03 << 2)
> > +
> > +/* Bits(4-5) {modified, exclusive, shared, invalid} */
> > +#define MEM_LOAD_MODIFIED              (0x00 << 4)
> > +#define MEM_LOAD_EXCLUSIVE             (0x01 << 4)
> > +#define MEM_LOAD_SHARED                        (0x02 << 4)
> > +#define MEM_LOAD_INVALID               (0x03 << 4) 
> 
> Did anybody check with the other PMUs that have similar features like
> PowerPC and possibly IA64?
> 
> I keep mentioning this, nobody seems interested.

Anton, Paulus, IIRC PowerPC had some sort of Data-Source indication,
would you have some docs available on the PowerPC PMU?

* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-04  8:02 ` [PATCH 1/4] perf: Add memory load/store events generic code Lin Ming
  2011-07-04  8:33   ` Peter Zijlstra
@ 2011-07-04 11:08   ` Peter Zijlstra
  2011-07-04 11:16   ` Peter Zijlstra
  2 siblings, 0 replies; 37+ messages in thread
From: Peter Zijlstra @ 2011-07-04 11:08 UTC (permalink / raw)
  To: Lin Ming
  Cc: Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel, Robert Richter

On Mon, 2011-07-04 at 08:02 +0000, Lin Ming wrote:
> +/*
> + * Memory load operation info encoding
> + */
> +
> +/* Bits(0-1) {L1, L2, L3, RAM} or {unknown, IO, uncached, reserved}
> */
> +#define MEM_LOAD_L1                    0x00
> +#define MEM_LOAD_L2                    0x01
> +#define MEM_LOAD_L3                    0x02
> +#define MEM_LOAD_RAM                   0x03
> +#define MEM_LOAD_UNKNOWN               0x00
> +#define MEM_LOAD_IO                    0x01
> +#define MEM_LOAD_UNCACHED              0x02
> +#define MEM_LOAD_RESERVED              0x03
> +
> +/* Bits(2-3) {toggle, snoop, local, remote} */
> +#define MEM_LOAD_TOGGLE                        (0x00 << 2)
> +#define MEM_LOAD_SNOOP                 (0x01 << 2)
> +#define MEM_LOAD_LOCAL                 (0x02 << 2)
> +#define MEM_LOAD_REMOTE                        (0x03 << 2)
> +
> +/* Bits(4-5) {modified, exclusive, shared, invalid} */
> +#define MEM_LOAD_MODIFIED              (0x00 << 4)
> +#define MEM_LOAD_EXCLUSIVE             (0x01 << 4)
> +#define MEM_LOAD_SHARED                        (0x02 << 4)
> +#define MEM_LOAD_INVALID               (0x03 << 4) 

AMD IBS also has load/store source information; judging from their data
format, a single op can even be both a load and a store (atomic RMW ops
might qualify).

The problem with mapping this to IBS is that they don't have the L1/L2
split but simply say data-cache miss (but imply L3 is excluded from that
by having separate L3/DRAM bits).

Also, I really don't like the EXTRA name you gave it, if we're going to
do something like this it should really be about data source.



* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-04  8:02 ` [PATCH 1/4] perf: Add memory load/store events generic code Lin Ming
  2011-07-04  8:33   ` Peter Zijlstra
  2011-07-04 11:08   ` Peter Zijlstra
@ 2011-07-04 11:16   ` Peter Zijlstra
  2011-07-04 21:52     ` Andi Kleen
  2011-07-05 11:54     ` Lin Ming
  2 siblings, 2 replies; 37+ messages in thread
From: Peter Zijlstra @ 2011-07-04 11:16 UTC (permalink / raw)
  To: Lin Ming
  Cc: Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel

On Mon, 2011-07-04 at 08:02 +0000, Lin Ming wrote:
> +#define MEM_STORE_DCU_HIT              (1ULL << 0)

I'm pretty sure that's not Dublin City University, but what is it?
Data-Cache-Unit? what does that mean, L1/L2 or also L3? 

> +#define MEM_STORE_STLB_HIT             (1ULL << 1)

What's an sTLB? I know iTLB and dTLB's but sTLBs I've not heard of yet.

> +#define MEM_STORE_LOCKED_ACCESS                (1ULL << 2) 

Presumably that's about LOCK'ed ops?

So now you're just tacking bits on the end without even attempting to
generalize/unify things, not charmed at all.


* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-04 11:16   ` Peter Zijlstra
@ 2011-07-04 21:52     ` Andi Kleen
  2011-07-05 11:54     ` Lin Ming
  1 sibling, 0 replies; 37+ messages in thread
From: Andi Kleen @ 2011-07-04 21:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Lin Ming, Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel

On Mon, Jul 04, 2011 at 01:16:32PM +0200, Peter Zijlstra wrote:
> On Mon, 2011-07-04 at 08:02 +0000, Lin Ming wrote:
> > +#define MEM_STORE_DCU_HIT              (1ULL << 0)
> 
> I'm pretty sure that's not Dublin City University, but what is it?
> Data-Cache-Unit? 

Yes

> what does that mean, L1/L2 or also L3? 

DCU is L1D

(L2 would be MLC, L3 is LLC in Intelnese)

> > +#define MEM_STORE_STLB_HIT             (1ULL << 1)
> 
> What's an sTLB? I know iTLB and dTLB's but sTLBs I've not heard of yet.

Second Level TLB (for both i and d)

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

* Re: [PATCH 4/4] perf, tool: Add new command "perf mem"
  2011-07-04  8:02 ` [PATCH 4/4] perf, tool: Add new command "perf mem" Lin Ming
@ 2011-07-04 22:00   ` Andi Kleen
  2011-07-05  1:35     ` Lin Ming
  0 siblings, 1 reply; 37+ messages in thread
From: Andi Kleen @ 2011-07-04 22:00 UTC (permalink / raw)
  To: Lin Ming
  Cc: Peter Zijlstra, Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel

> diff --git a/tools/perf/Makefile b/tools/perf/Makefile
> index 032ba63..221d1d8 100644
> --- a/tools/perf/Makefile
> +++ b/tools/perf/Makefile
> @@ -372,6 +372,7 @@ BUILTIN_OBJS += $(OUTPUT)builtin-lock.o
>  BUILTIN_OBJS += $(OUTPUT)builtin-kvm.o
>  BUILTIN_OBJS += $(OUTPUT)builtin-test.o
>  BUILTIN_OBJS += $(OUTPUT)builtin-inject.o
> +BUILTIN_OBJS += $(OUTPUT)builtin-mem.o

File seems to be missing in the patch. Forgot a git add?

Also need a manpage for it.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-04  8:33   ` Peter Zijlstra
  2011-07-04  8:44     ` Peter Zijlstra
@ 2011-07-04 22:01     ` Andi Kleen
  2011-07-05  8:43       ` Peter Zijlstra
  1 sibling, 1 reply; 37+ messages in thread
From: Andi Kleen @ 2011-07-04 22:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Lin Ming, Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel

> Did anybody check with the other PMUs that have similar features like
> PowerPC and possibly IA64?

IA64 has a similar feature I believe, but I'm not aware of anyone
working on that.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

* Re: [PATCH 4/4] perf, tool: Add new command "perf mem"
  2011-07-04 22:00   ` Andi Kleen
@ 2011-07-05  1:35     ` Lin Ming
  0 siblings, 0 replies; 37+ messages in thread
From: Lin Ming @ 2011-07-05  1:35 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Peter Zijlstra, Ingo Molnar, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel

On Tue, 2011-07-05 at 06:00 +0800, Andi Kleen wrote:
> > diff --git a/tools/perf/Makefile b/tools/perf/Makefile
> > index 032ba63..221d1d8 100644
> > --- a/tools/perf/Makefile
> > +++ b/tools/perf/Makefile
> > @@ -372,6 +372,7 @@ BUILTIN_OBJS += $(OUTPUT)builtin-lock.o
> >  BUILTIN_OBJS += $(OUTPUT)builtin-kvm.o
> >  BUILTIN_OBJS += $(OUTPUT)builtin-test.o
> >  BUILTIN_OBJS += $(OUTPUT)builtin-inject.o
> > +BUILTIN_OBJS += $(OUTPUT)builtin-mem.o
> 
> File seems to be missing in the patch. Forgot a git add?
> 
> Also need a manpage for it.

Ah, sorry for the missing file. Here it is.

From 6fb31b6fb63d73624c6bffbe81a013ca915da077 Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Mon, 4 Jul 2011 07:33:36 +0000
Subject: [PATCH] perf, tool: Add new command "perf mem"

Adds a new command, "perf mem", to monitor memory load/store events.

$ perf mem

 usage: perf mem [<options>] {record <command> | report}

    -t, --type <type>     memory operations (load/store)
    -L, --latency <n>     latency to sample (only for load op)

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
---
 tools/perf/Documentation/perf-mem.txt |   38 +++++
 tools/perf/Makefile                   |    1 +
 tools/perf/builtin-mem.c              |  269 +++++++++++++++++++++++++++++++++
 tools/perf/builtin-record.c           |    8 +
 tools/perf/builtin-script.c           |    6 +-
 tools/perf/builtin.h                  |    1 +
 tools/perf/perf.c                     |    1 +
 tools/perf/util/event.h               |    2 +
 tools/perf/util/evsel.c               |   10 ++
 tools/perf/util/parse-events.c        |   40 ++++-
 tools/perf/util/parse-events.h        |    2 +-
 11 files changed, 368 insertions(+), 10 deletions(-)
 create mode 100644 tools/perf/Documentation/perf-mem.txt
 create mode 100644 tools/perf/builtin-mem.c

diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt
new file mode 100644
index 0000000..8ee5794
--- /dev/null
+++ b/tools/perf/Documentation/perf-mem.txt
@@ -0,0 +1,38 @@
+perf-mem(1)
+===========
+
+NAME
+----
+perf-mem - Monitor memory load/store operation
+
+SYNOPSIS
+--------
+[verse]
+'perf mem' -t load [-L <n>] record <command>
+'perf mem' -t store record <command>
+'perf mem' -t load report
+'perf mem' -t store report
+
+DESCRIPTION
+-----------
+"perf mem -t <TYPE> record" runs a command and gathers memory operation data
+from it into perf.data.
+
+"perf mem -t <TYPE> report" displays the result.
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+-t::
+--type=::
+	Select the memory operation type: load or store
+
+-L::
+--latency=::
+	Select the memory load latency threshold to sample. Only used for memory load operations.
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 032ba63..221d1d8 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -372,6 +372,7 @@ BUILTIN_OBJS += $(OUTPUT)builtin-lock.o
 BUILTIN_OBJS += $(OUTPUT)builtin-kvm.o
 BUILTIN_OBJS += $(OUTPUT)builtin-test.o
 BUILTIN_OBJS += $(OUTPUT)builtin-inject.o
+BUILTIN_OBJS += $(OUTPUT)builtin-mem.o
 
 PERFLIBS = $(LIB_FILE)
 
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
new file mode 100644
index 0000000..d00fedf
--- /dev/null
+++ b/tools/perf/builtin-mem.c
@@ -0,0 +1,269 @@
+#include "builtin.h"
+#include "perf.h"
+
+#include "util/parse-options.h"
+#include "util/trace-event.h"
+
+static char			const *input_name		= "perf.data";
+static const char		*mem_operation;
+static int			latency_value			= 3;
+
+#define MEM_OPERATION_LOAD	"load"
+#define MEM_OPERATION_STORE	"store"
+
+static const char * const mem_usage[] = {
+	"perf mem [<options>] {record <command> |report}",
+	NULL
+};
+
+static const struct option mem_options[] = {
+	OPT_STRING('t', "type", &mem_operation, "type", "memory operations (load/store)"),
+	OPT_INTEGER('L', "latency", &latency_value, "latency to sample (only for load op)"),
+	OPT_END()
+};
+
+static int __cmd_record(int argc, const char **argv)
+{
+	int rec_argc, i = 0, j;
+	const char **rec_argv;
+	char event[20];
+
+	rec_argc = argc + 4;
+	rec_argv = calloc(rec_argc + 1, sizeof(char *));
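+
+	/*
+	 * Build: record -l -d -e mem-load:<latency>:p (or mem-store:p);
+	 * -l requests latency data (PERF_SAMPLE_LATENCY|PERF_SAMPLE_EXTRA),
+	 * -d samples the data address.
+	 */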
+	rec_argv[i++] = strdup("record");
+	rec_argv[i++] = strdup("-l");
+	rec_argv[i++] = strdup("-d");
+	rec_argv[i++] = strdup("-e");
+	if (!strcmp(mem_operation, MEM_OPERATION_LOAD))
+		sprintf(event, "mem-load:%04x:p", latency_value);
+	else
+		sprintf(event, "mem-store:p");
+	rec_argv[i++] = strdup(event);
+	for (j = 1; j < argc; j++, i++)
+		rec_argv[i] = argv[j];
+
+	BUG_ON(i != rec_argc);
+
+	return cmd_record(i, rec_argv, NULL);
+}
+
+#define LEN 56
+struct perf_mem_data {
+	char name[LEN];
+	u64 count;
+	u64 latency;
+};
+
+static struct perf_mem_data load_data[7][4][4] = {
+ [MEM_LOAD_L1] = {
+	[MEM_LOAD_LOCAL >> 2] = {
+		[MEM_LOAD_MODIFIED >> 4] = {
+			"L1-local", 0, 0
+		},
+	},
+ },
+ [MEM_LOAD_L2] = {
+	[MEM_LOAD_SNOOP >> 2] = {
+		[MEM_LOAD_MODIFIED >> 4] = {
+			"L2-snoop", 0, 0
+		},
+	},
+	[MEM_LOAD_LOCAL >> 2] = {
+		[MEM_LOAD_MODIFIED >> 4] = {
+			"L2-local", 0, 0
+		},
+	},
+ },
+ [MEM_LOAD_L3] = {
+	[MEM_LOAD_SNOOP >> 2] = {
+		[MEM_LOAD_MODIFIED >> 4] = {
+			"L3-snoop, found M", 0, 0
+		},
+		[MEM_LOAD_SHARED >> 4] = {
+			"L3-snoop, found no M", 0, 0
+		},
+		[MEM_LOAD_INVALID >> 4] = {
+			"L3-snoop, no coherency actions", 0, 0
+		},
+	},
+ },
+ [MEM_LOAD_RAM] = {
+	[MEM_LOAD_SNOOP >> 2] = {
+		[MEM_LOAD_SHARED >> 4] = {
+			"L3-miss, snoop, shared", 0, 0
+		},
+	},
+	[MEM_LOAD_LOCAL >> 2] = {
+		[MEM_LOAD_EXCLUSIVE >> 4] = {
+			"L3-miss, local, exclusive", 0, 0
+		},
+		[MEM_LOAD_SHARED >> 4] = {
+			"L3-miss, local, shared", 0, 0
+		},
+	},
+	[MEM_LOAD_REMOTE >> 2] = {
+		[MEM_LOAD_EXCLUSIVE >> 4] = {
+			"L3-miss, remote, exclusive", 0, 0
+		},
+		[MEM_LOAD_SHARED >> 4] = {
+			"L3-miss, remote, shared", 0, 0
+		},
+	},
+ },
+ [MEM_LOAD_UNKNOWN + 4] = {
+	[MEM_LOAD_TOGGLE] = {
+		[0] = {
+			"Unknown L3", 0, 0
+		},
+	},
+ },
+ [MEM_LOAD_IO + 4] = {
+	[MEM_LOAD_TOGGLE] = {
+		[0] = {
+			"IO", 0, 0
+		},
+	},
+ },
+ [MEM_LOAD_UNCACHED + 4] = {
+	[MEM_LOAD_TOGGLE] = {
+		[0] = {
+			"Uncached", 0, 0
+		},
+	},
+ },
+};
+
+static struct perf_mem_data store_data[6] = {
+	{"data-cache hit", 0, 0},
+	{"data-cache miss", 0, 0},
+	{"STLB hit", 0, 0},
+	{"STLB miss", 0, 0},
+	{"Locked access", 0, 0},
+	{"Unlocked access", 0, 0},
+};
+
+static void dump_load_data(void)
+{
+	int i, j, k;
+
+	printf("Memory load operation statistics\n");
+	printf("================================\n");
+	for (i = 0; i < 7; i++)
+		for (j = 0; j < 4; j++)
+			for (k = 0; k < 4; k++) {
+				if (!load_data[i][j][k].name[0])
+					continue;
+				printf("%30s: total latency=%8" PRId64 ", count=%8" PRId64 "(avg=%" PRId64 ")\n",
+					load_data[i][j][k].name,
+					load_data[i][j][k].latency,
+					load_data[i][j][k].count,
+					load_data[i][j][k].count ?
+					(load_data[i][j][k].latency /
+					load_data[i][j][k].count) : 0);
+			}
+}
+
+static void dump_store_data(void)
+{
+	int i;
+
+	printf("Memory store operation statistics\n");
+	printf("=================================\n");
+	for (i = 0; i < 6; i++)
+		printf("%30s: %8" PRId64 "\n", store_data[i].name,
+			store_data[i].count);
+}
+
+static void process_load_sample(u64 latency, u64 extra)
+{
+	int i, j, k;
+
+	i = extra & 0x3;
+	j = (extra >> 2) & 0x3;
+	k = (extra >> 4) & 0x3;
+
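+	/*
+	 * Locality 0 (MEM_LOAD_TOGGLE) flips bits 0-1 to the alternate
+	 * set {unknown, IO, uncached}; those rows live at load_data[4..6].
+	 */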
+	if (j == 0)
+		i += 4;
+
+	load_data[i][j][k].latency += latency;
+	load_data[i][j][k].count++;
+}
+
+static void process_store_sample(u64 extra)
+{
+	if (extra & MEM_STORE_DCU_HIT)
+		store_data[0].count++;
+	else
+		store_data[1].count++;
+
+	if (extra & MEM_STORE_STLB_HIT)
+		store_data[2].count++;
+	else
+		store_data[3].count++;
+
+	if (extra & MEM_STORE_LOCKED_ACCESS)
+		store_data[4].count++;
+	else
+		store_data[5].count++;
+}
+
+static int process_sample_event(union perf_event *event __unused, struct perf_sample *sample,
+                                struct perf_evsel *evsel __unused, struct perf_session *session __unused)
+{
+	if (!strcmp(mem_operation, MEM_OPERATION_LOAD))
+		process_load_sample(sample->latency, sample->extra);
+	else
+		process_store_sample(sample->extra);
+
+	return 0;
+}
+
+static struct perf_event_ops event_ops = {
+	.sample			= process_sample_event,
+	.mmap			= perf_event__process_mmap,
+	.comm			= perf_event__process_comm,
+	.lost			= perf_event__process_lost,
+	.fork			= perf_event__process_task,
+	.ordered_samples	= true,
+};
+
+static int report_events(void)
+{
+	int err = -EINVAL;
+	struct perf_session *session = perf_session__new(input_name, O_RDONLY,
+							 0, false, &event_ops);
+
+	if (symbol__init() < 0)
+		return -1;
+
+	if (session == NULL)
+		return -ENOMEM;
+
+	err = perf_session__process_events(session, &event_ops);
+
+	if (!strcmp(mem_operation, MEM_OPERATION_LOAD))
+		dump_load_data();
+	else
+		dump_store_data();
+
+	perf_session__delete(session);
+	return err;
+}
+
+int cmd_mem(int argc, const char **argv, const char *prefix __used)
+{
+	argc = parse_options(argc, argv, mem_options, mem_usage,
+			     PARSE_OPT_STOP_AT_NON_OPTION);
+
+	if (!argc || !mem_operation)
+		usage_with_options(mem_usage, mem_options);
+
+	if (!strncmp(argv[0], "rec", 3))
+		return __cmd_record(argc, argv);
+	else if (!strncmp(argv[0], "rep", 3))
+		return report_events();
+	else
+		usage_with_options(mem_usage, mem_options);
+
+	return 0;
+}
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 8e2c857..8ebdcdd 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -63,6 +63,7 @@ static bool			inherit_stat			=  false;
 static bool			no_samples			=  false;
 static bool			sample_address			=  false;
 static bool			sample_time			=  false;
+static bool			latency_data			=  false;
 static bool			no_buildid			=  false;
 static bool			no_buildid_cache		=  false;
 static struct perf_evlist	*evsel_list;
@@ -199,6 +200,11 @@ static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist)
 		attr->mmap_data = track;
 	}
 
+	if (latency_data) {
+		attr->sample_type	|= PERF_SAMPLE_LATENCY;
+		attr->sample_type	|= PERF_SAMPLE_EXTRA;
+	}
+
 	if (call_graph)
 		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;
 
@@ -780,6 +786,8 @@ const struct option record_options[] = {
 	OPT_BOOLEAN('T', "timestamp", &sample_time, "Sample timestamps"),
 	OPT_BOOLEAN('n', "no-samples", &no_samples,
 		    "don't sample"),
+	OPT_BOOLEAN('l', "latency", &latency_data,
+		    "Latency data"),
 	OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache,
 		    "do not update the buildid cache"),
 	OPT_BOOLEAN('B', "no-buildid", &no_buildid,
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 3056b45..c7489a6 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -140,7 +140,7 @@ static int perf_event_attr__check_stype(struct perf_event_attr *attr,
 		return 0;
 
 	if (output[type].user_set) {
-		evname = __event_name(attr->type, attr->config);
+		evname = __event_name(attr->type, attr->config, attr->config1);
 		pr_err("Samples for '%s' event do not have %s attribute set. "
 		       "Cannot print '%s' field.\n",
 		       evname, sample_msg, output_field2str(field));
@@ -149,7 +149,7 @@ static int perf_event_attr__check_stype(struct perf_event_attr *attr,
 
 	/* user did not ask for it explicitly so remove from the default list */
 	output[type].fields &= ~field;
-	evname = __event_name(attr->type, attr->config);
+	evname = __event_name(attr->type, attr->config, attr->config1);
 	pr_debug("Samples for '%s' event do not have %s attribute set. "
 		 "Skipping '%s' field.\n",
 		 evname, sample_msg, output_field2str(field));
@@ -292,7 +292,7 @@ static void print_sample_start(struct perf_sample *sample,
 			if (event)
 				evname = event->name;
 		} else
-			evname = __event_name(attr->type, attr->config);
+			evname = __event_name(attr->type, attr->config, 0);
 
 		printf("%s: ", evname ? evname : "(unknown)");
 	}
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 4702e24..419ba8f 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -36,5 +36,6 @@ extern int cmd_lock(int argc, const char **argv, const char *prefix);
 extern int cmd_kvm(int argc, const char **argv, const char *prefix);
 extern int cmd_test(int argc, const char **argv, const char *prefix);
 extern int cmd_inject(int argc, const char **argv, const char *prefix);
+extern int cmd_mem(int argc, const char **argv, const char *prefix);
 
 #endif
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index ec635b7..20c53f8 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -332,6 +332,7 @@ static void handle_internal_command(int argc, const char **argv)
 		{ "kvm",	cmd_kvm,	0 },
 		{ "test",	cmd_test,	0 },
 		{ "inject",	cmd_inject,	0 },
+		{ "mem",	cmd_mem,	0 },
 	};
 	unsigned int i;
 	static const char ext[] = STRIP_EXTENSION;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 1d7f664..1392867 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -76,6 +76,8 @@ struct perf_sample {
 	u64 id;
 	u64 stream_id;
 	u64 period;
+	u64 latency;
+	u64 extra;
 	u32 cpu;
 	u32 raw_size;
 	void *raw_data;
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index a03a36b..8eab351 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -405,6 +405,16 @@ int perf_event__parse_sample(const union perf_event *event, u64 type,
 		array++;
 	}
 
+	if (type & PERF_SAMPLE_LATENCY) {
+		data->latency = *array;
+		array++;
+	}
+
+	if (type & PERF_SAMPLE_EXTRA) {
+		data->extra = *array;
+		array++;
+	}
+
 	if (type & PERF_SAMPLE_READ) {
 		fprintf(stderr, "PERF_SAMPLE_READ is unsuported for now\n");
 		return -1;
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 41982c3..9f3bcb9 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -40,6 +40,8 @@ static struct event_symbol event_symbols[] = {
   { CHW(BRANCH_INSTRUCTIONS),		"branch-instructions",		"branches"		},
   { CHW(BRANCH_MISSES),			"branch-misses",		""			},
   { CHW(BUS_CYCLES),			"bus-cycles",			""			},
+  { CHW(MEM_LOAD),			"mem-load",			""			},
+  { CHW(MEM_STORE),			"mem-store",			""			},
 
   { CSW(CPU_CLOCK),			"cpu-clock",			""			},
   { CSW(TASK_CLOCK),			"task-clock",			""			},
@@ -297,15 +299,18 @@ const char *event_name(struct perf_evsel *evsel)
 	if (evsel->name)
 		return evsel->name;
 
-	return __event_name(type, config);
+	return __event_name(type, config, evsel->attr.config1);
 }
 
-const char *__event_name(int type, u64 config)
+const char *__event_name(int type, u64 config, u64 extra)
 {
 	static char buf[32];
+	int n;
 
 	if (type == PERF_TYPE_RAW) {
-		sprintf(buf, "raw 0x%" PRIx64, config);
+		n = sprintf(buf, "raw 0x%" PRIx64, config);
+		if (extra)
+			sprintf(buf + n, ":%#" PRIx64, extra);
 		return buf;
 	}
 
@@ -668,6 +673,7 @@ static enum event_result
 parse_symbolic_event(const char **strp, struct perf_event_attr *attr)
 {
 	const char *str = *strp;
+	u64 config;
 	unsigned int i;
 	int n;
 
@@ -676,7 +682,18 @@ parse_symbolic_event(const char **strp, struct perf_event_attr *attr)
 		if (n > 0) {
 			attr->type = event_symbols[i].type;
 			attr->config = event_symbols[i].config;
-			*strp = str + n;
+			str += n;
+			*strp = str;
+
+			if (*str++ == ':') {
+				n = hex2u64(str + 1, &config);
+				if (n > 0) {
+					attr->config1 = config;
+					str += n + 1;
+					*strp = str;
+				}
+			}
+
 			return EVT_HANDLED;
 		}
 	}
@@ -694,9 +711,20 @@ parse_raw_event(const char **strp, struct perf_event_attr *attr)
 		return EVT_FAILED;
 	n = hex2u64(str + 1, &config);
 	if (n > 0) {
-		*strp = str + n + 1;
+		str += n + 1;
+		*strp = str;
 		attr->type = PERF_TYPE_RAW;
 		attr->config = config;
+
+		if (*str++ == ':') {
+			n = hex2u64(str + 1, &config);
+			if (n > 0) {
+				attr->config1 = config;
+				str += n + 1;
+				*strp = str;
+			}
+		}
+
 		return EVT_HANDLED;
 	}
 	return EVT_FAILED;
@@ -1078,7 +1106,7 @@ void print_events(const char *event_glob)
 
 	printf("\n");
 	printf("  %-50s [%s]\n",
-		"rNNN (see 'perf list --help' on how to encode it)",
+		"rNNN[:EEE] (see 'perf list --help' on how to encode it)",
 	       event_type_descriptors[PERF_TYPE_RAW]);
 	printf("\n");
 
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 746d3fc..904c8c4 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -22,7 +22,7 @@ extern bool have_tracepoints(struct list_head *evlist);
 
 const char *event_type(int type);
 const char *event_name(struct perf_evsel *event);
-extern const char *__event_name(int type, u64 config);
+extern const char *__event_name(int type, u64 config, u64 extra);
 
 extern int parse_events(const struct option *opt, const char *str, int unset);
 extern int parse_filter(const struct option *opt, const char *str, int unset);
-- 
1.7.5.1
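
The extended raw event syntax above is rNNN[:EEE]: the hex value after
'r' goes into attr.config and the optional hex value after ':' goes
into attr.config1 (on Intel, the load-latency threshold). A minimal
standalone sketch of that parsing, written independently of the perf
sources (parse_raw_sketch() is a hypothetical stand-in, not the patched
function):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* rNNN[:EEE]: hex after 'r' -> config, optional hex after ':' -> config1 */
static int parse_raw_sketch(const char *str, uint64_t *config,
			    uint64_t *extra)
{
	char *end;

	if (*str++ != 'r')
		return -1;
	*config = strtoull(str, &end, 16);
	if (end == str)
		return -1;
	*extra = 0;
	if (*end == ':') {
		const char *p = end + 1;

		*extra = strtoull(p, &end, 16);
		if (end == p)
			return -1;
	}
	return 0;
}

int main(void)
{
	uint64_t config, extra;

	/* e.g. the Nehalem load-latency event 0x100b with threshold 3 */
	if (parse_raw_sketch("r100b:3", &config, &extra) == 0)
		printf("config=%#llx config1=%#llx\n",
		       (unsigned long long)config,
		       (unsigned long long)extra);
	return 0;
}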




^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-04 22:01     ` Andi Kleen
@ 2011-07-05  8:43       ` Peter Zijlstra
  0 siblings, 0 replies; 37+ messages in thread
From: Peter Zijlstra @ 2011-07-05  8:43 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Lin Ming, Ingo Molnar, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel

On Tue, 2011-07-05 at 00:01 +0200, Andi Kleen wrote:
> > Did anybody check with the other PMUs that have similar features like
> > PowerPC and possibly IA64?
> 
> IA64 has a similar feature I believe, but I'm not aware of anyone
> working on that.

Working on it isn't the important part; the important part is that we
don't make it impossible to implement should someone be willing to work
on it.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-04 11:16   ` Peter Zijlstra
  2011-07-04 21:52     ` Andi Kleen
@ 2011-07-05 11:54     ` Lin Ming
  2011-07-05 14:17       ` Peter Zijlstra
  1 sibling, 1 reply; 37+ messages in thread
From: Lin Ming @ 2011-07-05 11:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel

On Mon, 2011-07-04 at 19:16 +0800, Peter Zijlstra wrote:
> On Mon, 2011-07-04 at 08:02 +0000, Lin Ming wrote:
> > +#define MEM_STORE_DCU_HIT              (1ULL << 0)
> 
> I'm pretty sure that's not Dublin City University, but what is it?
> Data-Cache-Unit? what does that mean, L1/L2 or also L3? 
> 
> > +#define MEM_STORE_STLB_HIT             (1ULL << 1)
> 
> What's an sTLB? I know iTLB and dTLB's but sTLBs I've not heard of yet.
> 
> > +#define MEM_STORE_LOCKED_ACCESS                (1ULL << 2) 
> 
> Presumably that's about LOCK'ed ops?
> 
> So now you're just tacking bits on the end without even attempting to
> generalize/unify things, not charmed at all.

Any idea on a more useful store bits encoding?

Thanks,
Lin Ming



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-04  8:44     ` Peter Zijlstra
@ 2011-07-05 12:03       ` Peter Zijlstra
  2011-07-05 23:02         ` Paul Mackerras
  0 siblings, 1 reply; 37+ messages in thread
From: Peter Zijlstra @ 2011-07-05 12:03 UTC (permalink / raw)
  To: Lin Ming
  Cc: Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel, Anton Blanchard, paulus

On Mon, 2011-07-04 at 10:44 +0200, Peter Zijlstra wrote:
> On Mon, 2011-07-04 at 10:33 +0200, Peter Zijlstra wrote:
> > On Mon, 2011-07-04 at 08:02 +0000, Lin Ming wrote:
> > > +/*
> > > + * Memory load operation info encoding
> > > + */
> > > +
> > > +/* Bits(0-1) {L1, L2, L3, RAM} or {unknown, IO, uncached, reserved}
> > > */
> > > +#define MEM_LOAD_L1                    0x00
> > > +#define MEM_LOAD_L2                    0x01
> > > +#define MEM_LOAD_L3                    0x02
> > > +#define MEM_LOAD_RAM                   0x03
> > > +#define MEM_LOAD_UNKNOWN               0x00
> > > +#define MEM_LOAD_IO                    0x01
> > > +#define MEM_LOAD_UNCACHED              0x02
> > > +#define MEM_LOAD_RESERVED              0x03
> > > +
> > > +/* Bits(2-3) {toggle, snoop, local, remote} */
> > > +#define MEM_LOAD_TOGGLE                        (0x00 << 2)
> > > +#define MEM_LOAD_SNOOP                 (0x01 << 2)
> > > +#define MEM_LOAD_LOCAL                 (0x02 << 2)
> > > +#define MEM_LOAD_REMOTE                        (0x03 << 2)
> > > +
> > > +/* Bits(4-5) {modified, exclusive, shared, invalid} */
> > > +#define MEM_LOAD_MODIFIED              (0x00 << 4)
> > > +#define MEM_LOAD_EXCLUSIVE             (0x01 << 4)
> > > +#define MEM_LOAD_SHARED                        (0x02 << 4)
> > > +#define MEM_LOAD_INVALID               (0x03 << 4) 
> > 
> > Did anybody check with the other PMUs that have similar features like
> > PowerPC and possibly IA64?
> > 
> > I keep mentioning this, nobody seems interested.
> 
> Anton, Paulus, IIRC PowerPC had some sort of Data-Source indication,
> would you have some docs available on the PowerPC PMU?

Going through
http://www.power.org/resources/downloads/PowerISA_V2.06B_V2_PUBLIC.pdf

Book III-S, Appendix B

I can only find the SDAR thing (which I assume is what PERF_SAMPLE_DATA
uses) but no mention of extra bits describing where the data was sourced
from. For some reason I had the impression PPC64 had the capability to
tell if a load/store was from/to L1/2/3/DRAM etc.

Now since the above document is in fact not an exhaustive spec of a
particular chip but more an outline of what a regular ppc64 chip should
have, with lots of room for implementation specific extensions it
doesn't say much at all.

So do you know of such a feature for PPC64 and if so, where's the
docs? :-)

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 2/4] perf, x86: Add Intel Nhm/Wsm/Snb load latency support
  2011-07-04  8:02 ` [PATCH 2/4] perf, x86: Add Intel Nhm/Wsm/Snb load latency support Lin Ming
@ 2011-07-05 13:17   ` Peter Zijlstra
  2011-07-05 13:34     ` Lin Ming
  2011-07-22 18:58   ` Stephane Eranian
  1 sibling, 1 reply; 37+ messages in thread
From: Peter Zijlstra @ 2011-07-05 13:17 UTC (permalink / raw)
  To: Lin Ming
  Cc: Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel

On Mon, 2011-07-04 at 08:02 +0000, Lin Ming wrote:
> +#define INTEL_EVENT_EXTRA_REG2(event, msr, vm)    \
> +       EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
> +                       ARCH_PERFMON_EVENTSEL_UMASK, vm) 

That's inconsistent wrt the normal constraints, INTEL_UEVENT_EXTRA_REG
would be the consistent name.

#define INTEL_UEVENT_EXTRA_REG(event, msr, vm)	\
	EVENT_EXTRA_REG(event, msr, INTEL_ARCH_EVENT_MASK, vm)



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 2/4] perf, x86: Add Intel Nhm/Wsm/Snb load latency support
  2011-07-05 13:17   ` Peter Zijlstra
@ 2011-07-05 13:34     ` Lin Ming
  0 siblings, 0 replies; 37+ messages in thread
From: Lin Ming @ 2011-07-05 13:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel

On Tue, 2011-07-05 at 21:17 +0800, Peter Zijlstra wrote:
> On Mon, 2011-07-04 at 08:02 +0000, Lin Ming wrote:
> > +#define INTEL_EVENT_EXTRA_REG2(event, msr, vm)    \
> > +       EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
> > +                       ARCH_PERFMON_EVENTSEL_UMASK, vm) 
> 
> That's inconsistent wrt the normal constraints, INTEL_UEVENT_EXTRA_REG
> would be the consistent name.
> 
> #define INTEL_UEVENT_EXTRA_REG(event, msr, vm)	\
> 	EVENT_EXTRA_REG(event, msr, INTEL_ARCH_EVENT_MASK, vm)

Right, thanks.



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-05 11:54     ` Lin Ming
@ 2011-07-05 14:17       ` Peter Zijlstra
  2011-07-06  5:53         ` Lin Ming
  0 siblings, 1 reply; 37+ messages in thread
From: Peter Zijlstra @ 2011-07-05 14:17 UTC (permalink / raw)
  To: Lin Ming
  Cc: Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel, Robert Richter

On Tue, 2011-07-05 at 19:54 +0800, Lin Ming wrote:
> On Mon, 2011-07-04 at 19:16 +0800, Peter Zijlstra wrote:
> > On Mon, 2011-07-04 at 08:02 +0000, Lin Ming wrote:
> > > +#define MEM_STORE_DCU_HIT              (1ULL << 0)
> > 
> > I'm pretty sure that's not Dublin City University, but what is it?
> > Data-Cache-Unit? what does that mean, L1/L2 or also L3? 
> > 
> > > +#define MEM_STORE_STLB_HIT             (1ULL << 1)
> > 
> > What's an sTLB? I know iTLB and dTLB's but sTLBs I've not heard of yet.
> > 
> > > +#define MEM_STORE_LOCKED_ACCESS                (1ULL << 2) 
> > 
> > Presumably that's about LOCK'ed ops?
> > 
> > So now you're just tacking bits on the end without even attempting to
> > generalize/unify things, not charmed at all.
> 
> Any idea on a more useful store bits encoding?

For two of them, sure:

{load, store} x {atomic} x
	{hasSRC} x {l1, l2, l3, ram, unknown, io, uncached, reserved} x
	{hasLRS} x {local, remote, snoop} x 
	{hasMESI} x {MESI}

that would make MEM_STORE_DCU_HIT: store-l1 and MEM_STORE_LOCKED:
store-atomic.

Now this is needed for load-latency as well, since SNB extended the src
information with the same STLB/LOCK bits.

The SDM is somewhat inconsistent on what an STLB_MISS means:

Table 30-22 says: 0 - did not miss STLB (hit the DTLB/STLB), 1 - missed
the STLB. 

Table 30-23 says: "the store missed the STLB if set, otherwise the store
hit the STLB", which simply cannot be true. 

So I'm sticking with 30-22.

Now the above doesn't yet deal with TLBs nor can it map the IBS data
source bits because afaict that can report a u-op as both a store and a
load, but does not mention if a data-cache miss means L1 or L1/L2,
Robert?

One way to sort all that is not use enumerated spaces like above but
simply explode the whole thing like: load x store x atomic x l1 x l2
x ... that would of course give rise to a load of impossible
combinations but would do away with the hasFOO bits.

If the AMD data-cache means L1/L2 it can simply set both bits, same with
the Intel STLB miss, it can set TLB1/TLB2 bits (AMD does split those
nicely).

With all those bits exploded we can also express the inverse of
MEM_STORE_DCU_HIT as: store-l2-l3-dram, we simply set ~l1 for the
appropriate submask (which should arguably include IO/uncached/unknown
as well).

Now if anybody knows of another arch that has similar features (IA64,
ppc64?) can we get links to their PMU docs so that we can see if we can
map them as well?


Comments?


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-05 12:03       ` Peter Zijlstra
@ 2011-07-05 23:02         ` Paul Mackerras
  2011-07-06 13:58           ` Peter Zijlstra
  0 siblings, 1 reply; 37+ messages in thread
From: Paul Mackerras @ 2011-07-05 23:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Lin Ming, Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel, Anton Blanchard

[-- Attachment #1: Type: text/plain, Size: 1668 bytes --]

On Tue, Jul 05, 2011 at 02:03:38PM +0200, Peter Zijlstra wrote:
> On Mon, 2011-07-04 at 10:44 +0200, Peter Zijlstra wrote:
> > Anton, Paulus, IIRC PowerPC had some sort of Data-Source indication,
> > would you have some docs available on the PowerPC PMU?
> 
> Going through
> http://www.power.org/resources/downloads/PowerISA_V2.06B_V2_PUBLIC.pdf
> 
> Book III-S, Appendix B
> 
> I can only find the SDAR thing (which I assume is what PERF_SAMPLE_DATA
> uses) but no mention of extra bits describing where the data was sourced
> from. For some reason I had the impression PPC64 had the capability to
> tell if a load/store was from/to L1/2/3/DRAM etc.
> 
> Now since the above document is in fact not an exhaustive spec of a
> particular chip but more an outline of what a regular ppc64 chip should
> have, with lots of room for implementation specific extensions it
> doesn't say much at all.
> 
> So do you know of such a feature for PPC64 and if so, where's the
> docs? :-)

Unfortunately the P7 PMU documentation is not available publicly yet. :(

There are events that can be used to count how many times data or
instructions get loaded from different places in the memory
subsystem.  There are 15 separate DATA_FROM_xxx events, for instance,
that count things like "number of times data was loaded from L2 or L3
cache on another chip and the cache line was in shared state".
They're great if you want fine detail on memory traffic but perhaps
not so good if you want a broad overview (there are separate events
for L1 and L2 accesses and misses though).

I've attached a table of P7 PMU events.  Look for the PM_DATA_FROM_xxx
and PM_INST_FROM_xxx events.

Paul.

[-- Attachment #2: p7-events.gz --]
[-- Type: application/octet-stream, Size: 8758 bytes --]

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-05 14:17       ` Peter Zijlstra
@ 2011-07-06  5:53         ` Lin Ming
  2011-07-06 13:51           ` Peter Zijlstra
  0 siblings, 1 reply; 37+ messages in thread
From: Lin Ming @ 2011-07-06  5:53 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel, Robert Richter

On Tue, 2011-07-05 at 22:17 +0800, Peter Zijlstra wrote:
> On Tue, 2011-07-05 at 19:54 +0800, Lin Ming wrote:
> > On Mon, 2011-07-04 at 19:16 +0800, Peter Zijlstra wrote:
> > > On Mon, 2011-07-04 at 08:02 +0000, Lin Ming wrote:
> > > > +#define MEM_STORE_DCU_HIT              (1ULL << 0)
> > > 
> > > I'm pretty sure that's not Dublin City University, but what is it?
> > > Data-Cache-Unit? what does that mean, L1/L2 or also L3? 
> > > 
> > > > +#define MEM_STORE_STLB_HIT             (1ULL << 1)
> > > 
> > > What's an sTLB? I know iTLB and dTLB's but sTLBs I've not heard of yet.
> > > 
> > > > +#define MEM_STORE_LOCKED_ACCESS                (1ULL << 2) 
> > > 
> > > Presumably that's about LOCK'ed ops?
> > > 
> > > So now you're just tacking bits on the end without even attempting to
> > > generalize/unify things, not charmed at all.
> > 
> > Any idea on a more useful store bits encoding?
> 
> For two of them, sure:
> 
> {load, store} x {atomic} x
> 	{hasSRC} x {l1, l2, l3, ram, unknown, io, uncached, reserved} x
> 	{hasLRS} x {local, remote, snoop} x 
> 	{hasMESI} x {MESI}
> 
> that would make MEM_STORE_DCU_HIT: store-l1 and MEM_STORE_LOCKED:
> store-atomic.
> 
> Now this is needed for load-latency as well, since SNB extended the src
> information with the same STLB/LOCK bits.
> 
> The SDM is somewhat inconsistent on what an STLB_MISS means:
> 
> Table 30-22 says: 0 - did not miss STLB (hit the DTLB/STLB), 1 - missed
> the STLB. 
> 
> Table 30-23 says: "the store missed the STLB if set, otherwise the store
> hit the STLB", which simply cannot be true. 
> 
> So I'm sticking with 30-22.
> 
> Now the above doesn't yet deal with TLBs nor can it map the IBS data
> source bits because afaict that can report a u-op as both a store and a
> load, but does not mention if a data-cache miss means L1 or L1/L2,
> Robert?
> 
> One way to sort all that is not use enumerated spaces like above but
> simply explode the whole thing like: load x store x atomic x l1 x l2
> x ... that would of course give rise to a load of impossible
> combinations but would do away with the hasFOO bits.
> 
> If the AMD data-cache means L1/L2 it can simply set both bits, same with
> the Intel STLB miss, it can set TLB1/TLB2 bits (AMD does split those
> nicely).
> 
> With all those bits exploded we can also express the inverse of
> MEM_STORE_DCU_HIT as: store-l2-l3-dram, we simply set ~l1 for the
> appropriate submask (which should arguably include IO/uncached/unknown
> as well).

Do you mean to use the "impossible combinations" to express the inverse?
MEM_STORE_DCU_MISS as: store-l2-l3-dram
MEM_STORE_STLB_MISS as: store-itlb-dtlb

How about the code below?

#define PERF_MEM_LOAD                   (1ULL << 0)
#define PERF_MEM_STORE                  (1ULL << 1)
#define PERF_MEM_ATOMIC                 (1ULL << 2)
#define PERF_MEM_L1                     (1ULL << 3)
#define PERF_MEM_L2                     (1ULL << 4)
#define PERF_MEM_L3                     (1ULL << 5)
#define PERF_MEM_RAM                    (1ULL << 6)
#define PERF_MEM_UNKNOWN                (1ULL << 7)
#define PERF_MEM_IO                     (1ULL << 8)
#define PERF_MEM_UNCACHED               (1ULL << 9)
#define PERF_MEM_RESERVED               (1ULL << 10)
#define PERF_MEM_LOCAL                  (1ULL << 11)
#define PERF_MEM_REMOTE                 (1ULL << 12)
#define PERF_MEM_SNOOP                  (1ULL << 13)
#define PERF_MEM_MODIFIED               (1ULL << 14)
#define PERF_MEM_EXCLUSIVE              (1ULL << 15)
#define PERF_MEM_SHARED                 (1ULL << 16)
#define PERF_MEM_INVALID                (1ULL << 17)
#define PERF_MEM_ITLB                   (1ULL << 18)
#define PERF_MEM_DTLB                   (1ULL << 19)
#define PERF_MEM_STLB                   (1ULL << 20)

#define PERF_MEM_STORE_L1D_HIT  \
        (PERF_MEM_STORE | PERF_MEM_L1)

#define PERF_MEM_STORE_L1D_MISS \
        (PERF_MEM_STORE | PERF_MEM_L2 | PERF_MEM_L3 | PERF_MEM_RAM)

#define PERF_MEM_STORE_STLB_HIT \
        (PERF_MEM_STORE | PERF_MEM_STLB)
        
#define PERF_MEM_STORE_STLB_MISS \
        (PERF_MEM_STORE | PERF_MEM_ITLB | PERF_MEM_DTLB)

#define PERF_MEM_STORE_ATOMIC \
        (PERF_MEM_STORE | PERF_MEM_ATOMIC)

#define PERF_MEM_LOAD_STLB_HIT  \
        (PERF_MEM_LOAD | PERF_MEM_STLB)
   
#define PERF_MEM_LOAD_STLB_MISS \
        (PERF_MEM_LOAD | PERF_MEM_ITLB | PERF_MEM_DTLB)

#define PERF_MEM_LOAD_ATOMIC \
        (PERF_MEM_LOAD | PERF_MEM_ATOMIC)
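
To illustrate how a consumer could use such an exploded encoding, here
is a small self-contained sketch; the bit values simply mirror the
proposal above, and mem_flags_str() is a hypothetical helper, not part
of this patchset:

#include <stdio.h>
#include <stdint.h>

#define PERF_MEM_LOAD		(1ULL << 0)
#define PERF_MEM_STORE		(1ULL << 1)
#define PERF_MEM_ATOMIC		(1ULL << 2)
#define PERF_MEM_L1		(1ULL << 3)
#define PERF_MEM_L2		(1ULL << 4)
#define PERF_MEM_L3		(1ULL << 5)
#define PERF_MEM_RAM		(1ULL << 6)

/* render a flags word as a dash-separated string, e.g. "store-l2-l3-ram" */
static void mem_flags_str(uint64_t flags, char *buf, size_t len)
{
	static const struct {
		uint64_t bit;
		const char *name;
	} tbl[] = {
		{ PERF_MEM_LOAD,	"load"   },
		{ PERF_MEM_STORE,	"store"  },
		{ PERF_MEM_ATOMIC,	"atomic" },
		{ PERF_MEM_L1,		"l1"     },
		{ PERF_MEM_L2,		"l2"     },
		{ PERF_MEM_L3,		"l3"     },
		{ PERF_MEM_RAM,		"ram"    },
	};
	size_t i, n = 0;

	buf[0] = '\0';
	for (i = 0; i < sizeof(tbl) / sizeof(tbl[0]); i++) {
		if (!(flags & tbl[i].bit))
			continue;
		if (n >= len)
			break;
		n += snprintf(buf + n, len - n, "%s%s",
			      n ? "-" : "", tbl[i].name);
	}
}

int main(void)
{
	char buf[64];

	/* a store that missed L1, expressed with the exploded bits */
	mem_flags_str(PERF_MEM_STORE | PERF_MEM_L2 | PERF_MEM_L3 |
		      PERF_MEM_RAM, buf, sizeof(buf));
	printf("%s\n", buf);	/* prints: store-l2-l3-ram */
	return 0;
}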


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-06  5:53         ` Lin Ming
@ 2011-07-06 13:51           ` Peter Zijlstra
  2011-07-07  2:01             ` Lin Ming
  0 siblings, 1 reply; 37+ messages in thread
From: Peter Zijlstra @ 2011-07-06 13:51 UTC (permalink / raw)
  To: Lin Ming
  Cc: Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel, Robert Richter

On Wed, 2011-07-06 at 13:53 +0800, Lin Ming wrote:

> Do you mean to use the "impossible combinations" to express the inverse?

Nah, impossible would be things like having neither LOAD nor STORE set.

> MEM_STORE_DCU_MISS as: store-l2-l3-dram
> MEM_STORE_STLB_MISS as: store-itlb-dtlb
> 
> How about the code below?

Right, something like that. Robert can the IBS data source data be
mapped onto this as well?

> #define PERF_MEM_LOAD                   (1ULL << 0)
> #define PERF_MEM_STORE                  (1ULL << 1)
> #define PERF_MEM_ATOMIC                 (1ULL << 2)
> #define PERF_MEM_L1                     (1ULL << 3)
> #define PERF_MEM_L2                     (1ULL << 4)
> #define PERF_MEM_L3                     (1ULL << 5)
> #define PERF_MEM_RAM                    (1ULL << 6)
> #define PERF_MEM_UNKNOWN                (1ULL << 7)
> #define PERF_MEM_IO                     (1ULL << 8)
> #define PERF_MEM_UNCACHED               (1ULL << 9)
> #define PERF_MEM_RESERVED               (1ULL << 10)
> #define PERF_MEM_LOCAL                  (1ULL << 11)
> #define PERF_MEM_REMOTE                 (1ULL << 12)
> #define PERF_MEM_SNOOP                  (1ULL << 13)
> #define PERF_MEM_MODIFIED               (1ULL << 14)
> #define PERF_MEM_EXCLUSIVE              (1ULL << 15)
> #define PERF_MEM_SHARED                 (1ULL << 16)
> #define PERF_MEM_INVALID                (1ULL << 17)

> #define PERF_MEM_ITLB                   (1ULL << 18)
> #define PERF_MEM_DTLB                   (1ULL << 19)
> #define PERF_MEM_STLB                   (1ULL << 20)

Are these TLB hit or miss?

> #define PERF_MEM_STORE_L1D_HIT  \
>         (PERF_MEM_STORE | PERF_MEM_L1)
> 
> #define PERF_MEM_STORE_L1D_MISS \
>         (PERF_MEM_STORE | PERF_MEM_L2 | PERF_MEM_L3 | PERF_MEM_RAM)
> 
> #define PERF_MEM_STORE_STLB_HIT \
>         (PERF_MEM_STORE | PERF_MEM_STLB)
>       
> #define PERF_MEM_STORE_STLB_MISS \
>         (PERF_MEM_STORE | PERF_MEM_ITLB | PERF_MEM_DTLB)

Going by the definition in table 30-22 neither of these seem correct, a
STLB_HIT was defined as DTLB|STLB whereas a STLB_MISS was missing both
(resulting in a full page-table walk I presume).

> #define PERF_MEM_STORE_ATOMIC \
>         (PERF_MEM_STORE | PERF_MEM_ATOMIC)
> 
> #define PERF_MEM_LOAD_STLB_HIT  \
>         (PERF_MEM_LOAD | PERF_MEM_STLB)
>    
> #define PERF_MEM_LOAD_STLB_MISS \
>         (PERF_MEM_LOAD | PERF_MEM_ITLB | PERF_MEM_DTLB)

idem

> #define PERF_MEM_LOAD_ATOMIC \
>         (PERF_MEM_LOAD | PERF_MEM_ATOMIC)
> 


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-05 23:02         ` Paul Mackerras
@ 2011-07-06 13:58           ` Peter Zijlstra
  2011-07-08  7:18             ` Anton Blanchard
  0 siblings, 1 reply; 37+ messages in thread
From: Peter Zijlstra @ 2011-07-06 13:58 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Lin Ming, Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel, Anton Blanchard

On Wed, 2011-07-06 at 09:02 +1000, Paul Mackerras wrote:
> On Tue, Jul 05, 2011 at 02:03:38PM +0200, Peter Zijlstra wrote:
> > On Mon, 2011-07-04 at 10:44 +0200, Peter Zijlstra wrote:
> > > Anton, Paulus, IIRC PowerPC had some sort of Data-Source indication,
> > > would you have some docs available on the PowerPC PMU?
> > 
> > Going through
> > http://www.power.org/resources/downloads/PowerISA_V2.06B_V2_PUBLIC.pdf
> > 
> > Book III-S, Appendix B
> > 
> > I can only find the SDAR thing (which I assume is what PERF_SAMPLE_DATA
> > uses) but no mention of extra bits describing where the data was sourced
> > from. For some reason I had the impression PPC64 had the capability to
> > tell if a load/store was from/to L1/2/3/DRAM etc.
> > 
> > Now since the above document is in fact not an exhaustive spec of a
> > particular chip but more an outline of what a regular ppc64 chip should
> > have, with lots of room for implementation specific extensions it
> > doesn't say much at all.
> > 
> > So do you know of such a feature for PPC64 and if so, where's the
> > docs? :-)
> 
> Unfortunately the P7 PMU documentation is not available publicly yet. :(

Are the P6/P6+ PMU docs available? That at least would give me
something to look at.

> There are events that can be used to count how many times data or
> instructions get loaded from different places in the memory
> subsystem.  There are 15 separate DATA_FROM_xxx events, for instance,
> that count things like "number of times data was loaded from L2 or L3
> cache on another chip and the cache line was in shared state".
> They're great if you want fine detail on memory traffic but perhaps
> not so good if you want a broad overview (there are separate events
> for L1 and L2 accesses and misses though).
> 
> I've attached a table of P7 PMU events.  Look for the PM_DATA_FROM_xxx
> and PM_INST_FROM_xxx events.

Ok, so those are regular events and perf covers that capability.

The thing we're talking about is Intel PEBS Load Latency/Precise Store
and AMD IBS where together with a mem op retired event (mem loads
retired for Load-Latency, mem stores retired for Precise Store) provides
an additional field describing where the load/store was sourced from.

Such additional data would require the addition of a PERF_SAMPLE_SOURCE
field or similar, for some reason or other I was under the impression
some of the PPC chips had something similar. But if not, it saves us
having to worry about that.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-06 13:51           ` Peter Zijlstra
@ 2011-07-07  2:01             ` Lin Ming
  0 siblings, 0 replies; 37+ messages in thread
From: Lin Ming @ 2011-07-07  2:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel, Robert Richter

On Wed, 2011-07-06 at 21:51 +0800, Peter Zijlstra wrote:
> On Wed, 2011-07-06 at 13:53 +0800, Lin Ming wrote:
> 
> > Do you mean to use the "impossible combinations" to express the inverse?
> 
> Nah, impossible would be things like having neither LOAD nor STORE set.
> 
> > MEM_STORE_DCU_MISS as: store-l2-l3-dram
> > MEM_STORE_STLB_MISS as: store-itlb-dtlb
> > 
> > How about below code?
> 
> Right, something like that. Robert can the IBS data source data be
> mapped onto this as well?
> 
> > #define PERF_MEM_LOAD                   (1ULL << 0)
> > #define PERF_MEM_STORE                  (1ULL << 1)
> > #define PERF_MEM_ATOMIC                 (1ULL << 2)
> > #define PERF_MEM_L1                     (1ULL << 3)
> > #define PERF_MEM_L2                     (1ULL << 4)
> > #define PERF_MEM_L3                     (1ULL << 5)
> > #define PERF_MEM_RAM                    (1ULL << 6)
> > #define PERF_MEM_UNKNOWN                (1ULL << 7)
> > #define PERF_MEM_IO                     (1ULL << 8)
> > #define PERF_MEM_UNCACHED               (1ULL << 9)
> > #define PERF_MEM_RESERVED               (1ULL << 10)
> > #define PERF_MEM_LOCAL                  (1ULL << 11)
> > #define PERF_MEM_REMOTE                 (1ULL << 12)
> > #define PERF_MEM_SNOOP                  (1ULL << 13)
> > #define PERF_MEM_MODIFIED               (1ULL << 14)
> > #define PERF_MEM_EXCLUSIVE              (1ULL << 15)
> > #define PERF_MEM_SHARED                 (1ULL << 16)
> > #define PERF_MEM_INVALID                (1ULL << 17)
> 
> > #define PERF_MEM_ITLB                   (1ULL << 18)
> > #define PERF_MEM_DTLB                   (1ULL << 19)
> > #define PERF_MEM_STLB                   (1ULL << 20)
> 
> Are these TLB hit or miss?

I meant hit, but that's not correct.
How about a more complete definition, as below?

#define PERF_MEM_L1D_HIT           
#define PERF_MEM_L1D_MISS               
#define PERF_MEM_L1I_HIT                
#define PERF_MEM_L1I_MISS              
#define PERF_MEM_L2_HIT                
#define PERF_MEM_L2_MISS                
#define PERF_MEM_L3_HIT                 
#define PERF_MEM_L3_MISS                

#define PERF_MEM_ITLB_HIT               
#define PERF_MEM_ITLB_MISS              
#define PERF_MEM_DTLB_HIT               
#define PERF_MEM_DTLB_MISS              
#define PERF_MEM_STLB_HIT               
#define PERF_MEM_STLB_MISS              

#define PERF_MEM_STORE_L1D_HIT  \
        (PERF_MEM_STORE | PERF_MEM_L1D_HIT)

#define PERF_MEM_STORE_L1D_MISS \
        (PERF_MEM_STORE | PERF_MEM_L1D_MISS)

#define PERF_MEM_STORE_STLB_HIT \
        (PERF_MEM_STORE | PERF_MEM_STLB_HIT)

#define PERF_MEM_STORE_STLB_MISS \
        (PERF_MEM_STORE | PERF_MEM_STLB_MISS)

#define PERF_MEM_STORE_ATOMIC \
        (PERF_MEM_STORE | PERF_MEM_ATOMIC)

#define PERF_MEM_LOAD_STLB_HIT  \
        (PERF_MEM_LOAD | PERF_MEM_STLB_HIT)

#define PERF_MEM_LOAD_STLB_MISS \
        (PERF_MEM_LOAD | PERF_MEM_STLB_MISS)

#define PERF_MEM_LOAD_ATOMIC \
        (PERF_MEM_LOAD | PERF_MEM_ATOMIC)

Lin Ming

> 
> > #define PERF_MEM_STORE_L1D_HIT  \
> >         (PERF_MEM_STORE | PERF_MEM_L1)
> > 
> > #define PERF_MEM_STORE_L1D_MISS \
> >         (PERF_MEM_STORE | PERF_MEM_L2 | PERF_MEM_L3 | PERF_MEM_RAM)
> > 
> > #define PERF_MEM_STORE_STLB_HIT \
> >         (PERF_MEM_STORE | PERF_MEM_STLB)
> >       
> > #define PERF_MEM_STORE_STLB_MISS \
> >         (PERF_MEM_STORE | PERF_MEM_ITLB | PERF_MEM_DTLB)
> 
> Going by the definition in table 30-22 neither of these seem correct, a
> STLB_HIT was defined as DTLB|STLB whereas a STLB_MISS was missing both
> (resulting in a full page-table walk I presume).
> 
> > #define PERF_MEM_STORE_ATOMIC \
> >         (PERF_MEM_STORE | PERF_MEM_ATOMIC)
> > 
> > #define PERF_MEM_LOAD_STLB_HIT  \
> >         (PERF_MEM_LOAD | PERF_MEM_STLB)
> >    
> > #define PERF_MEM_LOAD_STLB_MISS \
> >         (PERF_MEM_LOAD | PERF_MEM_ITLB | PERF_MEM_DTLB)
> 
> idem
> 
> > #define PERF_MEM_LOAD_ATOMIC \
> >         (PERF_MEM_LOAD | PERF_MEM_ATOMIC)
> > 
> 



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-06 13:58           ` Peter Zijlstra
@ 2011-07-08  7:18             ` Anton Blanchard
  2011-07-08 15:18               ` Peter Zijlstra
  0 siblings, 1 reply; 37+ messages in thread
From: Anton Blanchard @ 2011-07-08  7:18 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul Mackerras, Lin Ming, Ingo Molnar, Andi Kleen,
	Stephane Eranian, Arnaldo Carvalho de Melo, linux-kernel


Hi Peter,

> The thing we're talking about is Intel PEBS Load Latency/Precise Store
> and AMD IBS where together with a mem op retired event (mem loads
> retired for Load-Latency, mem stores retired for Precise Store)
> provides an additional field describing where the load/store was
> sourced from.
> 
> Such additional data would require the addition of a
> PERF_SAMPLE_SOURCE field or similar, for some reason or other I was
> under the impression some of the PPC chips had something similar. But
> if not, it saves us having to worry about that.

It does sound a lot like our event vector, where we can have up to
64 bits of information that goes with a sample. A lot of the fields
relate to loads and stores, but there are other fields (eg pipeline
information at the point the sample was taken).

So we could definitely use a field to capture this.

Anton

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-08  7:18             ` Anton Blanchard
@ 2011-07-08 15:18               ` Peter Zijlstra
  2011-08-08 11:57                 ` Peter Zijlstra
  2011-08-08 11:59                 ` Peter Zijlstra
  0 siblings, 2 replies; 37+ messages in thread
From: Peter Zijlstra @ 2011-07-08 15:18 UTC (permalink / raw)
  To: Anton Blanchard
  Cc: Paul Mackerras, Lin Ming, Ingo Molnar, Andi Kleen,
	Stephane Eranian, Arnaldo Carvalho de Melo, linux-kernel

On Fri, 2011-07-08 at 17:18 +1000, Anton Blanchard wrote:
> Hi Peter,
> 
> > The thing we're talking about is Intel PEBS Load Latency/Precise Store
> > and AMD IBS where together with a mem op retired event (mem loads
> > retired for Load-Latency, mem stores retired for Precise Store)
> > provides an additional field describing where the load/store was
> > sourced from.
> > 
> > Such additional data would require the addition of a
> > PERF_SAMPLE_SOURCE field or similar, for some reason or other I was
> > under the impression some of the PPC chips had something similar. But
> > if not, it saves us having to worry about that.
> 
> It does sound a lot like our event vector, where we can have up to
> 64 bits of information that goes with a sample. A lot of the fields
> relate to loads and stores, but there are other fields (eg pipeline
> information at the point the sample was taken).
> 
> So we could definitely use a field to capture this.

Happen to have a ref to some docs about that? We'd want to make sure our
definition is wide enough to also work for ppc.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/4] perf, x86: Add Intel SandyBridge precise store support
  2011-07-04  8:02 ` [PATCH 3/4] perf, x86: Add Intel SandyBridge precise store support Lin Ming
@ 2011-07-11  8:32   ` Peter Zijlstra
  2011-07-11  8:57     ` Lin Ming
  0 siblings, 1 reply; 37+ messages in thread
From: Peter Zijlstra @ 2011-07-11  8:32 UTC (permalink / raw)
  To: Lin Ming
  Cc: Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel

On Mon, 2011-07-04 at 08:02 +0000, Lin Ming wrote:
> Implements Intel memory store event for SandyBridge.
> 
> $ perf mem -t store record make -j8


I was just looking through the Intel SDM, and stumbled upon:

C0H	01H	INST_RETIRED.PREC_DIST

Precise instruction retired event
with HW to reduce effect of PEBS
shadow in IP distribution PMC1 only; 
Must quiesce other PMCs.
^^^^^^^^^^^^^^^^^^^^^^^^

WTF!? Are they real? The implementation as provided by you doesn't do
that (quite understandably), but please check with the hardware folks.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/4] perf, x86: Add Intel SandyBridge precise store support
  2011-07-11  8:57     ` Lin Ming
@ 2011-07-11  8:52       ` Peter Zijlstra
  0 siblings, 0 replies; 37+ messages in thread
From: Peter Zijlstra @ 2011-07-11  8:52 UTC (permalink / raw)
  To: Lin Ming
  Cc: Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel

On Mon, 2011-07-11 at 16:57 +0800, Lin Ming wrote:
> On Mon, 2011-07-11 at 16:32 +0800, Peter Zijlstra wrote:
> > On Mon, 2011-07-04 at 08:02 +0000, Lin Ming wrote:
> > > Implements Intel memory store event for SandyBridge.
> > > 
> > > $ perf mem -t store record make -j8
> > 
> > 
> > I was just looking through the Intel SDM, and stumbled upon:
> > 
> > C0H	01H	INST_RETIRED.PREC_DIST
> > 
> > Precise instruction retired event
> > with HW to reduce effect of PEBS
> > shadow in IP distribution PMC1 only; 
> > Must quiesce other PMCs.
> > ^^^^^^^^^^^^^^^^^^^^^^^^
> > 
> > WTF!? Are they real? The implementation as provided by you doesn't do
> > that (quite understandably), but please check with the hardware folks.
> 
> This is Precise Distribution of Instructions Retired (PDIR), which is
> not related to Precise Store.

Gah right, still ridiculous constraint.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/4] perf, x86: Add Intel SandyBridge precise store support
  2011-07-11  8:32   ` Peter Zijlstra
@ 2011-07-11  8:57     ` Lin Ming
  2011-07-11  8:52       ` Peter Zijlstra
  0 siblings, 1 reply; 37+ messages in thread
From: Lin Ming @ 2011-07-11  8:57 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Andi Kleen, Stephane Eranian,
	Arnaldo Carvalho de Melo, linux-kernel

On Mon, 2011-07-11 at 16:32 +0800, Peter Zijlstra wrote:
> On Mon, 2011-07-04 at 08:02 +0000, Lin Ming wrote:
> > Implements Intel memory store event for SandyBridge.
> > 
> > $ perf mem -t store record make -j8
> 
> 
> I was just looking through the Intel SDM, and stumbled upon:
> 
> C0H	01H	INST_RETIRED.PREC_DIST
> 
> Precise instruction retired event
> with HW to reduce effect of PEBS
> shadow in IP distribution PMC1 only; 
> Must quiesce other PMCs.
> ^^^^^^^^^^^^^^^^^^^^^^^^
> 
> WTF!? Are they real? The implementation as provided by you doesn't do
> that (quite understandably), but please check with the hardware folks.

This is Precise Distribution of Instructions Retired (PDIR), which is
not related to Precise Store.



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/4] perf: memory load/store events generalization
  2011-07-04  8:02 [PATCH 0/4] perf: memory load/store events generalization Lin Ming
                   ` (3 preceding siblings ...)
  2011-07-04  8:02 ` [PATCH 4/4] perf, tool: Add new command "perf mem" Lin Ming
@ 2011-07-22 18:55 ` Stephane Eranian
  2011-07-22 21:01   ` Andi Kleen
  4 siblings, 1 reply; 37+ messages in thread
From: Stephane Eranian @ 2011-07-22 18:55 UTC (permalink / raw)
  To: Lin Ming
  Cc: Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Arnaldo Carvalho de Melo, linux-kernel

Lin,

On Mon, Jul 4, 2011 at 1:02 AM, Lin Ming <ming.m.lin@intel.com> wrote:
> Hi, all
>
> Intel PMU provides 2 facilities to monitor memory operation: load latency and precise store.
> This patchset tries to generalize memory load/store events.
> So other arches may also add such features.
>
> A new sub-command "mem" is added,
>
> $ perf mem
>
>  usage: perf mem [<options>] {record <command> |report}
>
>    -t, --type <type>     memory operations(load/store)
>    -L, --latency <n>     latency to sample(only for load op)
>
That looks okay as a first approach tool. But what people are most
often interested in is to see where the misses occur, i.e., you need
to display load/store addresses somehow, especially for the more
costly misses (the ones the compiler cannot really hide by hoisting
loads).

> $ perf mem -t load record make -j8
>
> <building kernel ..., monitoring memory load operation>
>
> $ perf mem -t load report
>
> Memory load operation statistics
> ================================
>                      L1-local: total latency=   28027, count=    3355(avg=8)

That's wrong. On Intel, you need to subtract 4 cycles from the latency
you get out of PEBS-LL. The kernel can do that.
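
A sketch of that adjustment, assuming the constant 4-cycle overhead
described here (pebs_ll_adjust() is a hypothetical helper, not part of
the posted patches):

#include <stdint.h>

/* strip the assumed 4-cycle fixed PEBS-LL overhead before reporting */
static inline uint64_t pebs_ll_adjust(uint64_t lat)
{
	return lat > 4 ? lat - 4 : 0;
}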

>                      L2-snoop: total latency=    1430, count=      29(avg=49)

I suspect L2-snoop is not correct. If this line item relates to bit 2 of
the data source, then it corresponds to a secondary miss. That means
you have a load to a cache-line that is already being requested.

>                      L2-local: total latency=     124, count=       8(avg=15)
>             L3-snoop, found M: total latency=     452, count=       4(avg=113)
>          L3-snoop, found no M: total latency=       0, count=       0(avg=0)
> L3-snoop, no coherency actions: total latency=     875, count=      18(avg=48)
>        L3-miss, snoop, shared: total latency=       0, count=       0(avg=0)
>     L3-miss, local, exclusive: total latency=       0, count=       0(avg=0)
>        L3-miss, local, shared: total latency=       0, count=       0(avg=0)
>    L3-miss, remote, exclusive: total latency=       0, count=       0(avg=0)
>       L3-miss, remote, shared: total latency=       0, count=       0(avg=0)
>                    Unknown L3: total latency=       0, count=       0(avg=0)
>                            IO: total latency=       0, count=       0(avg=0)
>                      Uncached: total latency=     464, count=      30(avg=15)
>
I think it would be more useful to print the % of loads captured for
each category.
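
A sketch of what that could look like in the report path, using the
L1-local and L2-snoop numbers above as sample input (the struct and
helper names are hypothetical; the actual report code is in patch 4/4,
which is not quoted here):

#include <stdio.h>
#include <stdint.h>

struct mem_cat {
	const char	*name;
	uint64_t	count;
	uint64_t	latency;
};

/* print one category with its share of all captured loads */
static void print_cat(const struct mem_cat *c, uint64_t total)
{
	printf("%30s: count=%8llu (%5.1f%%), avg latency=%llu\n",
	       c->name,
	       (unsigned long long)c->count,
	       total ? 100.0 * c->count / total : 0.0,
	       (unsigned long long)(c->count ? c->latency / c->count : 0));
}

int main(void)
{
	struct mem_cat cats[] = {
		{ "L1-local", 3355, 28027 },
		{ "L2-snoop",   29,  1430 },
	};
	uint64_t total = cats[0].count + cats[1].count;
	unsigned int i;

	for (i = 0; i < sizeof(cats) / sizeof(cats[0]); i++)
		print_cat(&cats[i], total);
	return 0;
}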

> $ perf mem -t store record make -j8
>
> <building kernel ..., monitoring memory store operation>
>
> $ perf mem -t store report
>
> Memory store operation statistics
> =================================
>                data-cache hit:     8138
>               data-cache miss:        0
>                      STLB hit:     8138
>                     STLB miss:        0
>                 Locked access:        0
>               Unlocked access:     8138
>
> Any comment is appreciated.
>
> Thanks,
> Lin Ming
>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 2/4] perf, x86: Add Intel Nhm/Wsm/Snb load latency support
  2011-07-04  8:02 ` [PATCH 2/4] perf, x86: Add Intel Nhm/Wsm/Snb load latency support Lin Ming
  2011-07-05 13:17   ` Peter Zijlstra
@ 2011-07-22 18:58   ` Stephane Eranian
  1 sibling, 0 replies; 37+ messages in thread
From: Stephane Eranian @ 2011-07-22 18:58 UTC (permalink / raw)
  To: Lin Ming
  Cc: Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Arnaldo Carvalho de Melo, linux-kernel

Lin,

On Mon, Jul 4, 2011 at 1:02 AM, Lin Ming <ming.m.lin@intel.com> wrote:
>
> Implements Intel memory load event for Nehalem/Westmere/SandyBridge.
>
> $ perf mem -t load record make -j8
>
> <building kernel ..., monitoring memory load operation>
>
> $ perf mem -t load report
>
> Memory load operation statistics
> ================================
>                      L1-local: total latency=   28027, count=    3355(avg=8)
>                      L2-snoop: total latency=    1430, count=      29(avg=49)
>                      L2-local: total latency=     124, count=       8(avg=15)
>             L3-snoop, found M: total latency=     452, count=       4(avg=113)
>          L3-snoop, found no M: total latency=       0, count=       0(avg=0)
> L3-snoop, no coherency actions: total latency=     875, count=      18(avg=48)
>        L3-miss, snoop, shared: total latency=       0, count=       0(avg=0)
>     L3-miss, local, exclusive: total latency=       0, count=       0(avg=0)
>        L3-miss, local, shared: total latency=       0, count=       0(avg=0)
>    L3-miss, remote, exclusive: total latency=       0, count=       0(avg=0)
>       L3-miss, remote, shared: total latency=       0, count=       0(avg=0)
>                    Unknown L3: total latency=       0, count=       0(avg=0)
>                            IO: total latency=       0, count=       0(avg=0)
>                      Uncached: total latency=     464, count=      30(avg=15)
>
> Signed-off-by: Lin Ming <ming.m.lin@intel.com>
> ---
>  arch/x86/include/asm/msr-index.h          |    2 +
>  arch/x86/kernel/cpu/perf_event.c          |   10 ++++++
>  arch/x86/kernel/cpu/perf_event_intel.c    |   20 +++++++++++-
>  arch/x86/kernel/cpu/perf_event_intel_ds.c |   49 ++++++++++++++++++++++++++--
>  4 files changed, 76 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> index 485b4f1..da93a9d 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -60,6 +60,8 @@
>  #define MSR_IA32_DS_AREA               0x00000600
>  #define MSR_IA32_PERF_CAPABILITIES     0x00000345
>
> +#define MSR_PEBS_LD_LAT_THRESHOLD      0x000003f6
> +
>  #define MSR_MTRRfix64K_00000           0x00000250
>  #define MSR_MTRRfix16K_80000           0x00000258
>  #define MSR_MTRRfix16K_A0000           0x00000259
> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
> index 3a0338b..ce380a7 100644
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -207,6 +207,9 @@ struct extra_reg {
>  #define INTEL_EVENT_EXTRA_REG(event, msr, vm)  \
>        EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
>  #define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
> +#define INTEL_EVENT_EXTRA_REG2(event, msr, vm)    \
> +       EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
> +                       ARCH_PERFMON_EVENTSEL_UMASK, vm)
>
>  union perf_capabilities {
>        struct {
> @@ -406,6 +409,11 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
>                        continue;
>                if (event->attr.config1 & ~er->valid_mask)
>                        return -EINVAL;
> +
> +               /* The minimum value that may be programmed into MSR_PEBS_LD_LAT is 3 */
> +               if (er->msr == MSR_PEBS_LD_LAT_THRESHOLD && event->attr.config1 < 3)
> +                       return -EINVAL;
> +
>                event->hw.extra_reg = er->msr;
>                event->hw.extra_config = event->attr.config1;
>                break;
> @@ -617,6 +625,8 @@ static int x86_setup_perfctr(struct perf_event *event)
>        if (config == -1LL)
>                return -EINVAL;
>
> +       x86_pmu_extra_regs(config, event);
> +
>        /*
>         * Branch tracing:
>         */
> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
> index 41178c8..dde9041 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -1,6 +1,6 @@
>  #ifdef CONFIG_CPU_SUP_INTEL
>
> -#define MAX_EXTRA_REGS 2
> +#define MAX_EXTRA_REGS 3
>
>  /*
>  * Per register state.
> @@ -89,6 +89,7 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
>  static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
>  {
>        INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
> +       INTEL_EVENT_EXTRA_REG2(0x100b, MSR_PEBS_LD_LAT_THRESHOLD, 0xffff),
>        EVENT_EXTRA_END
>  };
>
> @@ -123,10 +124,17 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
>        EVENT_CONSTRAINT_END
>  };
>
> +static struct extra_reg intel_snb_extra_regs[] __read_mostly =
> +{
> +       INTEL_EVENT_EXTRA_REG2(0x01cd, MSR_PEBS_LD_LAT_THRESHOLD, 0xffff),
> +       EVENT_EXTRA_END
> +};
> +

As I described in arch/x86/kernel/perf_event.c, LD_LAT is NOT shared
between HT threads.
It seems the way your patch is written, LD_LAT will end up being
treated as OFFCORE_RSP
on NHM/WSM when HT is on, i.e., considered shared. You need to handle this in
intel_pmu_cpu_starting().

>
>  static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
>  {
>        INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
>        INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff),
> +       INTEL_EVENT_EXTRA_REG2(0x100b, MSR_PEBS_LD_LAT_THRESHOLD, 0xffff),
>        EVENT_EXTRA_END
>  };
>
> @@ -1445,6 +1453,9 @@ static __init int intel_pmu_init(void)
>                /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
>                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
>
> +               /* Memory load latency */
> +               intel_perfmon_event_map[PERF_COUNT_HW_MEM_LOAD] = 0x100b;
> +
>                if (ebx & 0x40) {
>                        /*
>                         * Erratum AAJ80 detected, we work it around by using
> @@ -1491,6 +1502,9 @@ static __init int intel_pmu_init(void)
>                /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
>                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
>
> +               /* Memory load latency */
> +               intel_perfmon_event_map[PERF_COUNT_HW_MEM_LOAD] = 0x100b;
> +
>                pr_cont("Westmere events, ");
>                break;
>
> @@ -1502,12 +1516,16 @@ static __init int intel_pmu_init(void)
>
>                x86_pmu.event_constraints = intel_snb_event_constraints;
>                x86_pmu.pebs_constraints = intel_snb_pebs_events;
> +               x86_pmu.extra_regs = intel_snb_extra_regs;
>
>                /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
>                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
>                /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
>                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1;
>
> +               /* Memory load latency */
> +               intel_perfmon_event_map[PERF_COUNT_HW_MEM_LOAD] = 0x01cd;
> +
>                pr_cont("SandyBridge events, ");
>                break;
>
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> index bab491b..d2d3155 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> @@ -1,5 +1,28 @@
>  #ifdef CONFIG_CPU_SUP_INTEL
>
> +/* Indexed by Intel load latency data source encoding value */
> +
> +static u64 load_latency_data_source[] = {
> +       MEM_LOAD_UNKNOWN | MEM_LOAD_TOGGLE,                     /* 0x00: Unknown L3 */
> +       MEM_LOAD_L1 | MEM_LOAD_LOCAL,                           /* 0x01: L1-local */
> +       MEM_LOAD_L2 | MEM_LOAD_SNOOP,                           /* 0x02: L2-snoop */
> +       MEM_LOAD_L2 | MEM_LOAD_LOCAL,                           /* 0x03: L2-local */
> +       MEM_LOAD_L3 | MEM_LOAD_SNOOP | MEM_LOAD_INVALID,        /* 0x04: L3-snoop, no coherency actions */
> +       MEM_LOAD_L3 | MEM_LOAD_SNOOP | MEM_LOAD_SHARED,         /* 0x05: L3-snoop, found no M */
> +       MEM_LOAD_L3 | MEM_LOAD_SNOOP | MEM_LOAD_MODIFIED,       /* 0x06: L3-snoop, found M */
> +       MEM_LOAD_RESERVED,                                      /* 0x07: reserved */
> +       MEM_LOAD_RAM | MEM_LOAD_SNOOP | MEM_LOAD_SHARED,        /* 0x08: L3-miss, snoop, shared */
> +       MEM_LOAD_RESERVED,                                      /* 0x09: reserved */
> +       MEM_LOAD_RAM | MEM_LOAD_LOCAL | MEM_LOAD_SHARED,        /* 0x0A: L3-miss, local, shared */
> +       MEM_LOAD_RAM | MEM_LOAD_REMOTE | MEM_LOAD_SHARED,       /* 0x0B: L3-miss, remote, shared */
> +       MEM_LOAD_RAM | MEM_LOAD_LOCAL | MEM_LOAD_EXCLUSIVE,     /* 0x0C: L3-miss, local, exclusive */
> +       MEM_LOAD_RAM | MEM_LOAD_REMOTE | MEM_LOAD_EXCLUSIVE,    /* 0x0D: L3-miss, remote, exclusive */
> +       MEM_LOAD_IO | MEM_LOAD_TOGGLE,                          /* 0x0E: IO */
> +       MEM_LOAD_UNCACHED | MEM_LOAD_TOGGLE,                    /* 0x0F: Uncached */
> +};
> +
> +#define LOAD_LATENCY_DATA_SOURCE_MASK  0x0FULL
> +
>  /* The maximal number of PEBS events: */
>  #define MAX_PEBS_EVENTS                4
>
> @@ -454,6 +477,8 @@ static void intel_pmu_pebs_enable(struct perf_event *event)
>        hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
>
>        cpuc->pebs_enabled |= 1ULL << hwc->idx;
> +       if (hwc->extra_reg == MSR_PEBS_LD_LAT_THRESHOLD)
> +               cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
>        WARN_ON_ONCE(cpuc->enabled);
>
>        if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
> @@ -466,6 +491,8 @@ static void intel_pmu_pebs_disable(struct perf_event *event)
>        struct hw_perf_event *hwc = &event->hw;
>
>        cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
> +       if (hwc->extra_reg == MSR_PEBS_LD_LAT_THRESHOLD)
> +               cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));

Something I don't understand here. How is the precise_ip mode selected
when you use LD_LAT? You need PEBS but there is still that off-by-1
error on the RIP. You'd like the user to be able to choose whether or not
to apply the in-kernel correction. Yet, you've abstracted PEBS-LL in such
a way that the user is not even aware the kernel is enabling PEBS internally.
Something's wrong here.
That matters if you extract the load/store addresses. It does not if
you just look at the latency distribution but that's too limiting in my mind.

>
>        if (cpuc->enabled)
>                wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
>
> @@ -582,13 +609,13 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
>                                   struct pt_regs *iregs, void *__pebs)
>  {
>        /*
> -        * We cast to pebs_record_core since that is a subset of
> -        * both formats and we don't use the other fields in this
> -        * routine.
> +        * We cast to pebs_record_nhm to get the load latency data
> +        * if extra_reg MSR_PEBS_LD_LAT_THRESHOLD used
>         */
> -       struct pebs_record_core *pebs = __pebs;
> +       struct pebs_record_nhm *pebs = __pebs;
>        struct perf_sample_data data;
>        struct pt_regs regs;
> +       u64 sample_type;
>
>        if (!intel_pmu_save_and_restart(event))
>                return;
> @@ -596,6 +623,20 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
>        perf_sample_data_init(&data, 0);
>        data.period = event->hw.last_period;
>
> +       if (event->attr.config == PERF_COUNT_HW_MEM_LOAD) {
> +               sample_type = event->attr.sample_type;
> +
> +               if (sample_type & PERF_SAMPLE_ADDR)
> +                       data.addr = pebs->dla;
> +
> +               if (sample_type & PERF_SAMPLE_LATENCY)
> +                       data.latency = pebs->lat;
> +
> +               if (sample_type & PERF_SAMPLE_EXTRA)
> +                       data.extra = load_latency_data_source[pebs->dse &
> +                                       LOAD_LATENCY_DATA_SOURCE_MASK];
> +       }
> +
>        /*
>         * We use the interrupt regs as a base because the PEBS record
>         * does not contain a full regs set, specifically it seems to
> --
> 1.7.5.1
>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/4] perf: memory load/store events generalization
  2011-07-22 18:55 ` [PATCH 0/4] perf: memory load/store events generalization Stephane Eranian
@ 2011-07-22 21:01   ` Andi Kleen
  2011-07-22 21:14     ` Stephane Eranian
  0 siblings, 1 reply; 37+ messages in thread
From: Andi Kleen @ 2011-07-22 21:01 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: Lin Ming, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Arnaldo Carvalho de Melo, linux-kernel

> That looks okay as a first approach tool. But what people are most
> often interested in is to see where the misses occur, i.e., you need
> to display load/store addresses somehow, especially for the more

But that's what it already does?  (for loads, stores
are not in there yet) Did you try it?

Or did you mean resolving the addresses? While I agree that would
be useful, that's quite a big task and I don't think it should
be blocked on that.


-Andi

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/4] perf: memory load/store events generalization
  2011-07-22 21:01   ` Andi Kleen
@ 2011-07-22 21:14     ` Stephane Eranian
  2011-07-22 21:43       ` Andi Kleen
  0 siblings, 1 reply; 37+ messages in thread
From: Stephane Eranian @ 2011-07-22 21:14 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Lin Ming, Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	linux-kernel

On Fri, Jul 22, 2011 at 2:01 PM, Andi Kleen <andi@firstfloor.org> wrote:
>> That looks okay as a first approach tool. But what people are most
>> often interested in is to see where the misses occur, i.e., you need
>> to display load/store addresses somehow, especially for the more
>
> But that's what it already does?  (for loads, stores
> are not in there yet) Did you try it?
>
I meant displaying the load + data addresses.
I have not tried it yet, just looked at the patches.

> Or did you mean resolving the addresses? While I agree that would
> be useful, that's quite a big task and I don't think it should
> be blocked on that.
>
I agree with you, just wanted to check on the longer term goal.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/4] perf: memory load/store events generalization
  2011-07-22 21:14     ` Stephane Eranian
@ 2011-07-22 21:43       ` Andi Kleen
  2011-07-22 21:59         ` Stephane Eranian
  0 siblings, 1 reply; 37+ messages in thread
From: Andi Kleen @ 2011-07-22 21:43 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: Andi Kleen, Lin Ming, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, linux-kernel

On Fri, Jul 22, 2011 at 02:14:26PM -0700, Stephane Eranian wrote:
> On Fri, Jul 22, 2011 at 2:01 PM, Andi Kleen <andi@firstfloor.org> wrote:
> >> That looks okay as a first approach tool. But what people are most
> >> often interested in is to see where the misses occur, i.e., you need
> >> to display load/store addresses somehow, especially for the more
> >
> > But that's what it already does?  (for loads, stores
> > are not in there yet) Did you try it?
> >
> I meant displaying the load + data addresses.

That's what it does already.

Or did you mean the instructions that caused them?

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/4] perf: memory load/store events generalization
  2011-07-22 21:43       ` Andi Kleen
@ 2011-07-22 21:59         ` Stephane Eranian
  0 siblings, 0 replies; 37+ messages in thread
From: Stephane Eranian @ 2011-07-22 21:59 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Lin Ming, Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	linux-kernel

Hi,

On Fri, Jul 22, 2011 at 2:43 PM, Andi Kleen <andi@firstfloor.org> wrote:
> On Fri, Jul 22, 2011 at 02:14:26PM -0700, Stephane Eranian wrote:
>> On Fri, Jul 22, 2011 at 2:01 PM, Andi Kleen <andi@firstfloor.org> wrote:
>> >> That looks okay as a first approach tool. But what people are most
>> >> often interested in is to see where the misses occur, i.e., you need
>> >> to display load/store addresses somehow, especially for the more
>> >
>> > But that's what it already does?  (for loads, stores
>> > are not in there yet) Did you try it?
>> >
>> I meant displaying the load + data addresses.
>
> That's what it does already.
>
Are you talking about dump_load_data()?

You get:
   - load instruction addr
   - load data address
   - latency
   - data source

How do you display the load instruction address? Just using plain perf report?
How do you display the load data addresses?

But what's certainly more interesting is to get an output that shows
the correlation between the load and data addresses.


> Or did you mean the instructions that caused them?
>
yes.

> -Andi
> --
> ak@linux.intel.com -- Speaking for myself only.
>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-08 15:18               ` Peter Zijlstra
@ 2011-08-08 11:57                 ` Peter Zijlstra
  2011-08-08 11:59                 ` Peter Zijlstra
  1 sibling, 0 replies; 37+ messages in thread
From: Peter Zijlstra @ 2011-08-08 11:57 UTC (permalink / raw)
  To: Anton Blanchard
  Cc: Paul Mackerras, Lin Ming, Ingo Molnar, Andi Kleen,
	Stephane Eranian, Arnaldo Carvalho de Melo, linux-kernel

On Fri, 2011-07-08 at 17:18 +0200, Peter Zijlstra wrote:
> On Fri, 2011-07-08 at 17:18 +1000, Anton Blanchard wrote:
> > Hi Peter,
> > 
> > > The thing we're talking about is Intel PEBS Load Latency/Precise Store
> > > and AMD IBS where together with a mem op retired event (mem loads
> > > retired for Load-Latency, mem stores retired for Precise Store)
> > > provides an additional field describing where the load/store was
> > > sourced from.
> > > 
> > > Such additional data would require the addition of a
> > > PERF_SAMPLE_SOURCE field or similar, for some reason or other I was
> > > under the impression some of the PPC chips had something similar. But
> > > if not, it saves us having to worry about that.
> > 
> > It does sound a lot like our event vector, where we can have up to
> > 64 bits of information that goes with a sample. A lot of the fields
> > relate to loads and stores, but there are other fields (eg pipeline
> > information at the point the sample was taken).
> > 
> > So we could definitely use a field to capture this.
> 
> Happen to have a ref to some docs about that? We'd want to make sure our
> definition is wide enough to also work for ppc.

Anton, Paul, any word on this? I'd love to see the specs for that
power-pmu event vector thing.. It would be a terrible shame if we now
include an abstraction that somewhat matches your needs but not quite,
requiring us to either introduce another abi component later or.
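
(For illustration, a sketch of what the PERF_SAMPLE_SOURCE idea
mentioned earlier in this thread could look like from userspace. The
bit value, the field layout described in the comments and the config
remark are assumptions, not code from the posted patches.)

#include <linux/perf_event.h>
#include <string.h>
#include <stdio.h>

/* Hypothetical sample_type bit; the real value would be whichever
 * bit is free when such a patch lands. */
#define PERF_SAMPLE_SOURCE	(1ULL << 11)

int main(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	/* attr.config would be the PERF_COUNT_HW_MEM_LOAD that patch
	 * 1/4 proposes; it is not in mainline headers. */
	attr.sample_period = 1000;
	attr.sample_type = PERF_SAMPLE_IP |	/* the load/store insn */
			   PERF_SAMPLE_ADDR |	/* the data address */
			   PERF_SAMPLE_SOURCE;	/* where the data came from */

	/* Each PERF_RECORD_SAMPLE would then carry u64 ip, u64 addr and
	 * a u64 source whose encoding has to be generic enough to cover
	 * PEBS data sources, AMD IBS and the ppc event vector. */
	printf("sample_type = %#llx\n",
	       (unsigned long long)attr.sample_type);
	return 0;
}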

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/4] perf: Add memory load/store events generic code
  2011-07-08 15:18               ` Peter Zijlstra
  2011-08-08 11:57                 ` Peter Zijlstra
@ 2011-08-08 11:59                 ` Peter Zijlstra
  1 sibling, 0 replies; 37+ messages in thread
From: Peter Zijlstra @ 2011-08-08 11:59 UTC (permalink / raw)
  To: Anton Blanchard
  Cc: Paul Mackerras, Lin Ming, Ingo Molnar, Andi Kleen,
	Stephane Eranian, Arnaldo Carvalho de Melo, linux-kernel

On Mon, 2011-08-08 at 13:57 +0200, Peter Zijlstra wrote:
> 
> Anton, Paul, any word on this? I'd love to see the specs for that
> power-pmu event vector thing.. It would be a terrible shame if we now
> include an abstraction that somewhat matches your needs but not quite,
> requiring us to either introduce another abi component later or. 

Teaches me to not context switch and get confused about the email you
were writing... :-)

I think I wanted to say something about trying to version the ABI, which
is pretty much the same anyway, a royal pain in the butt ;-)

^ permalink raw reply	[flat|nested] 37+ messages in thread

end of thread, other threads:[~2011-08-08 11:59 UTC | newest]

Thread overview: 37+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-07-04  8:02 [PATCH 0/4] perf: memory load/store events generalization Lin Ming
2011-07-04  8:02 ` [PATCH 1/4] perf: Add memory load/store events generic code Lin Ming
2011-07-04  8:33   ` Peter Zijlstra
2011-07-04  8:44     ` Peter Zijlstra
2011-07-05 12:03       ` Peter Zijlstra
2011-07-05 23:02         ` Paul Mackerras
2011-07-06 13:58           ` Peter Zijlstra
2011-07-08  7:18             ` Anton Blanchard
2011-07-08 15:18               ` Peter Zijlstra
2011-08-08 11:57                 ` Peter Zijlstra
2011-08-08 11:59                 ` Peter Zijlstra
2011-07-04 22:01     ` Andi Kleen
2011-07-05  8:43       ` Peter Zijlstra
2011-07-04 11:08   ` Peter Zijlstra
2011-07-04 11:16   ` Peter Zijlstra
2011-07-04 21:52     ` Andi Kleen
2011-07-05 11:54     ` Lin Ming
2011-07-05 14:17       ` Peter Zijlstra
2011-07-06  5:53         ` Lin Ming
2011-07-06 13:51           ` Peter Zijlstra
2011-07-07  2:01             ` Lin Ming
2011-07-04  8:02 ` [PATCH 2/4] perf, x86: Add Intel Nhm/Wsm/Snb load latency support Lin Ming
2011-07-05 13:17   ` Peter Zijlstra
2011-07-05 13:34     ` Lin Ming
2011-07-22 18:58   ` Stephane Eranian
2011-07-04  8:02 ` [PATCH 3/4] perf, x86: Add Intel SandyBridge precise store support Lin Ming
2011-07-11  8:32   ` Peter Zijlstra
2011-07-11  8:57     ` Lin Ming
2011-07-11  8:52       ` Peter Zijlstra
2011-07-04  8:02 ` [PATCH 4/4] perf, tool: Add new command "perf mem" Lin Ming
2011-07-04 22:00   ` Andi Kleen
2011-07-05  1:35     ` Lin Ming
2011-07-22 18:55 ` [PATCH 0/4] perf: memory load/store events generalization Stephane Eranian
2011-07-22 21:01   ` Andi Kleen
2011-07-22 21:14     ` Stephane Eranian
2011-07-22 21:43       ` Andi Kleen
2011-07-22 21:59         ` Stephane Eranian

This is a public inbox; see mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).