All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR
@ 2016-01-06 11:04 kan.liang
  2016-01-06 11:04 ` [PATCH 2/8] perf tools: add option to record sample physical address kan.liang
                   ` (9 more replies)
  0 siblings, 10 replies; 17+ messages in thread
From: kan.liang @ 2016-01-06 11:04 UTC (permalink / raw)
  To: peterz, acme; +Cc: eranian, ak, jolsa, namhyung, linux-kernel, Kan Liang

From: Kan Liang <kan.liang@intel.com>

For understanding how the workload maps to memory channels and hardware
behavior, it's useful to collect address maps with physical addresses.
This is not intended for detecting page sharing (which can already be
done using the mmap inode), but for lower level hardware behavior
studies.
Perf supports load latency/DLA which can only collect virtual addresses.
This patch adds a new sample type PERF_SAMPLE_PHYS_ADDR to expose the
physical addresses.
For kernel direct mapping addresses, the patch uses virt_to_phys to
convert the virtual addresses from DLA to physical address.
For user virtual addresses, __get_user_pages_fast is used to walk the
page tables to obtain the user physical address.
This does not work for vmalloc addresses. Right now these are not
resolved, but code to do that could be added.
For security, the physical address can only be exposed to root or
privileged user.

Signed-off-by: Kan Liang <kan.liang@intel.com>
---
 arch/x86/kernel/cpu/perf_event.h          |  2 +-
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 23 +++++++++++++++++++++++
 include/linux/perf_event.h                |  3 +++
 include/uapi/linux/perf_event.h           |  4 +++-
 kernel/events/core.c                      | 11 +++++++++++
 5 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 799e6bd..164de68 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -90,7 +90,7 @@ struct amd_nb {
 	(PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
 	PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
 	PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
-	PERF_SAMPLE_TRANSACTION)
+	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
 
 /*
  * A debug store configuration.
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 5db1c77..2e333dc 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -986,6 +986,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
 	u64 sample_type;
 	int fll, fst, dsrc;
 	int fl = event->hw.flags;
+	struct page *p = NULL;
 
 	if (pebs == NULL)
 		return;
@@ -1071,6 +1072,28 @@ static void setup_pebs_sample_data(struct perf_event *event,
 	    x86_pmu.intel_cap.pebs_format >= 1)
 		data->addr = pebs->dla;
 
+	if ((sample_type & PERF_SAMPLE_PHYS_ADDR) && (data->addr != 0)) {
+		if (data->addr >= TASK_SIZE) {
+			/* If it's vmalloc()d memory, leave phys_addr as 0 */
+			if (virt_addr_valid(data->addr) &&
+			    !(data->addr >= VMALLOC_START && data->addr < VMALLOC_END))
+				data->phys_addr = (u64)virt_to_phys((void *)data->addr);
+		} else {
+			/*
+			 * Walking the pages tables for user address.
+			 * Interrupts are disabled, so it prevents any tear down
+			 * of the page tables.
+			 * Try IRQ-safe __get_user_pages_fast first.
+			 * If failed, leave phys_addr as 0.
+			 */
+			if ((current->mm != NULL) &&
+			    (__get_user_pages_fast(data->addr, 1, 0, &p) == 1))
+				data->phys_addr = page_to_phys(p) + data->addr % PAGE_SIZE;
+
+			if (p)
+				put_page(p);
+		}
+	}
 	if (x86_pmu.intel_cap.pebs_format >= 2) {
 		/* Only set the TSX weight when no memory weight. */
 		if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f9828a4..d9c0527 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -795,6 +795,8 @@ struct perf_sample_data {
 
 	struct perf_regs		regs_intr;
 	u64				stack_user_size;
+
+	u64				phys_addr;
 } ____cacheline_aligned;
 
 /* default value for data source */
@@ -815,6 +817,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
 	data->weight = 0;
 	data->data_src.val = PERF_MEM_NA;
 	data->txn = 0;
+	data->phys_addr = 0;
 }
 
 extern void perf_output_sample(struct perf_output_handle *handle,
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 1afe962..5afc572 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_IDENTIFIER			= 1U << 16,
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
+	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
 
-	PERF_SAMPLE_MAX = 1U << 19,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
 };
 
 /*
@@ -767,6 +768,7 @@ enum perf_event_type {
 	 *	{ u64			transaction; } && PERF_SAMPLE_TRANSACTION
 	 *	{ u64			abi; # enum perf_sample_regs_abi
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a627f36..9a922a2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1334,6 +1334,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_TRANSACTION)
 		size += sizeof(data->txn);
 
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		size += sizeof(data->phys_addr);
+
 	event->header_size = size;
 }
 
@@ -5432,6 +5435,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_DATA_SRC)
 		perf_output_put(handle, data->data_src.val);
 
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		perf_output_put(handle, data->phys_addr);
+
 	if (sample_type & PERF_SAMPLE_TRANSACTION)
 		perf_output_put(handle, data->txn);
 
@@ -8269,6 +8275,11 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EINVAL;
 	}
 
+	/* Only privileged users can get kernel addresses */
+	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
+	    !capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
 	/*
 	 * In cgroup mode, the pid argument is used to pass the fd
 	 * opened to the cgroup directory in cgroupfs. The cpu argument
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/8] perf tools: add option to record sample physical address
  2016-01-06 11:04 [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR kan.liang
@ 2016-01-06 11:04 ` kan.liang
  2016-01-07  9:56   ` Jiri Olsa
  2016-01-06 11:04 ` [PATCH 3/8] perf tools: add sort option phys_daddr kan.liang
                   ` (8 subsequent siblings)
  9 siblings, 1 reply; 17+ messages in thread
From: kan.liang @ 2016-01-06 11:04 UTC (permalink / raw)
  To: peterz, acme; +Cc: eranian, ak, jolsa, namhyung, linux-kernel, Kan Liang

From: Kan Liang <kan.liang@intel.com>

Add a new option --phys-data to perf record to record the sample physical
address. Specifying this option implies --data, so the sample virtual
address is recorded as well.

Signed-off-by: Kan Liang <kan.liang@intel.com>
---
 tools/perf/Documentation/perf-record.txt |  6 +++++-
 tools/perf/builtin-record.c              | 10 +++++++++-
 tools/perf/perf.h                        |  1 +
 tools/perf/util/event.h                  |  1 +
 tools/perf/util/evsel.c                  | 17 +++++++++++++++++
 tools/perf/util/session.c                |  3 +++
 6 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 3a1a32f..e40c9df 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -181,7 +181,11 @@ OPTIONS
 
 -d::
 --data::
-	Record the sample addresses.
+	Record the sample virtual addresses.
+
+--phys-data::
+	Record the sample physical addresses which implies
+	the virtual address is also recorded.
 
 -T::
 --timestamp::
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 9c5cdc2c4..b552f50 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1082,7 +1082,8 @@ struct option __record_options[] = {
 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
 		    "per thread counts"),
-	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
+	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample virtual addresses"),
+	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr, "Record the sample physical addresses"),
 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
 			&record.opts.sample_time_set,
 			"Record the sample timestamps"),
@@ -1223,6 +1224,13 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
 		goto out_symbol_exit;
 	}
 
+	/*
+	 * Record the sample physical addr which also implies
+	 * the virtual addr is recorded.
+	 */
+	if (rec->opts.sample_phys_addr)
+		rec->opts.sample_address = true;
+
 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
 		rec->opts.no_inherit = true;
 
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 90129ac..edfb938 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -49,6 +49,7 @@ struct record_opts {
 	bool	     no_samples;
 	bool	     raw_samples;
 	bool	     sample_address;
+	bool	     sample_phys_addr;
 	bool	     sample_weight;
 	bool	     sample_time;
 	bool	     sample_time_set;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index b7ffb7e..0715883 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -190,6 +190,7 @@ struct perf_sample {
 	u32 cpu;
 	u32 raw_size;
 	u64 data_src;
+	u64 phys_addr;
 	u32 flags;
 	u16 insn_len;
 	void *raw_data;
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 544e440..32f7b4f 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -838,6 +838,9 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts)
 	if (opts->sample_address)
 		perf_evsel__set_sample_bit(evsel, DATA_SRC);
 
+	if (opts->sample_phys_addr)
+		perf_evsel__set_sample_bit(evsel, PHYS_ADDR);
+
 	if (opts->no_buffering) {
 		attr->watermark = 0;
 		attr->wakeup_events = 1;
@@ -1847,6 +1850,12 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
 		array++;
 	}
 
+	data->phys_addr = 0;
+	if (type & PERF_SAMPLE_PHYS_ADDR) {
+		data->phys_addr = *array;
+		array++;
+	}
+
 	data->transaction = 0;
 	if (type & PERF_SAMPLE_TRANSACTION) {
 		OVERFLOW_CHECK_u64(array);
@@ -1963,6 +1972,9 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type,
 	if (type & PERF_SAMPLE_DATA_SRC)
 		result += sizeof(u64);
 
+	if (type & PERF_SAMPLE_PHYS_ADDR)
+		result += sizeof(u64);
+
 	if (type & PERF_SAMPLE_TRANSACTION)
 		result += sizeof(u64);
 
@@ -2149,6 +2161,11 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type,
 		array++;
 	}
 
+	if (type & PERF_SAMPLE_PHYS_ADDR) {
+		*array = sample->phys_addr;
+		array++;
+	}
+
 	if (type & PERF_SAMPLE_TRANSACTION) {
 		*array = sample->transaction;
 		array++;
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index d5636ba..670ff4f 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1122,6 +1122,9 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
 	if (sample_type & PERF_SAMPLE_DATA_SRC)
 		printf(" . data_src: 0x%"PRIx64"\n", sample->data_src);
 
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		printf(" .. phys_addr: 0x%"PRIx64"\n", sample->phys_addr);
+
 	if (sample_type & PERF_SAMPLE_TRANSACTION)
 		printf("... transaction: %" PRIx64 "\n", sample->transaction);
 
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 3/8] perf tools: add sort option phys_daddr
  2016-01-06 11:04 [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR kan.liang
  2016-01-06 11:04 ` [PATCH 2/8] perf tools: add option to record sample physical address kan.liang
@ 2016-01-06 11:04 ` kan.liang
  2016-01-06 11:04 ` [PATCH 4/8] perf mem: add option phys-data to record physical address kan.liang
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 17+ messages in thread
From: kan.liang @ 2016-01-06 11:04 UTC (permalink / raw)
  To: peterz, acme; +Cc: eranian, ak, jolsa, namhyung, linux-kernel, Kan Liang

From: Kan Liang <kan.liang@intel.com>

Add a new sort option phys_daddr for --mem-mode sort. With this option
applied, perf can sort and report by sample's physical address.

Signed-off-by: Kan Liang <kan.liang@intel.com>
---
 tools/perf/Documentation/perf-report.txt |  1 +
 tools/perf/util/hist.c                   |  3 +++
 tools/perf/util/hist.h                   |  1 +
 tools/perf/util/machine.c                |  7 ++++--
 tools/perf/util/sort.c                   | 42 ++++++++++++++++++++++++++++++++
 tools/perf/util/sort.h                   |  1 +
 tools/perf/util/symbol.h                 |  1 +
 7 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 8a301f6..91c387e 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -163,6 +163,7 @@ OPTIONS
 	- mem: type of memory access for the data at the time of sample
 	- snoop: type of snoop (if any) for the data at the time of sample
 	- dcacheline: the cacheline the data address is on at the time of sample
+	- phys_daddr: physical address of data being executed on at the time of sample
 
 	And default sort keys are changed to local_weight, mem, sym, dso,
 	symbol_daddr, dso_daddr, snoop, tlb, locked, see '--mem-mode'.
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index afc9b8f..3051f15 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -152,6 +152,9 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 			symlen = unresolved_col_width + 4 + 2;
 			hists__set_unres_dso_col_len(hists, HISTC_MEM_DADDR_DSO);
 		}
+
+		hists__new_col_len(hists, HISTC_MEM_PHYS_DADDR, unresolved_col_width + 4 + 2);
+
 	} else {
 		symlen = unresolved_col_width + 4 + 2;
 		hists__new_col_len(hists, HISTC_MEM_DADDR_SYMBOL, symlen);
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index cb8f373..3bb574a 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -44,6 +44,7 @@ enum hist_column {
 	HISTC_GLOBAL_WEIGHT,
 	HISTC_MEM_DADDR_SYMBOL,
 	HISTC_MEM_DADDR_DSO,
+	HISTC_MEM_PHYS_DADDR,
 	HISTC_MEM_LOCKED,
 	HISTC_MEM_TLB,
 	HISTC_MEM_LVL,
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index ad79297..6a516cc 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1560,10 +1560,12 @@ static void ip__resolve_ams(struct thread *thread,
 	ams->al_addr = al.addr;
 	ams->sym = al.sym;
 	ams->map = al.map;
+	ams->phys_addr = 0;
 }
 
 static void ip__resolve_data(struct thread *thread,
-			     u8 m, struct addr_map_symbol *ams, u64 addr)
+			     u8 m, struct addr_map_symbol *ams,
+			     u64 addr, u64 phys_addr)
 {
 	struct addr_location al;
 
@@ -1583,6 +1585,7 @@ static void ip__resolve_data(struct thread *thread,
 	ams->al_addr = al.addr;
 	ams->sym = al.sym;
 	ams->map = al.map;
+	ams->phys_addr = phys_addr;
 }
 
 struct mem_info *sample__resolve_mem(struct perf_sample *sample,
@@ -1594,7 +1597,7 @@ struct mem_info *sample__resolve_mem(struct perf_sample *sample,
 		return NULL;
 
 	ip__resolve_ams(al->thread, &mi->iaddr, sample->ip);
-	ip__resolve_data(al->thread, al->cpumode, &mi->daddr, sample->addr);
+	ip__resolve_data(al->thread, al->cpumode, &mi->daddr, sample->addr, sample->phys_addr);
 	mi->data_src.val = sample->data_src;
 
 	return mi;
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index a8a9588..4ce6827 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -832,6 +832,40 @@ sort__tlb_cmp(struct hist_entry *left, struct hist_entry *right)
 	return (int64_t)(data_src_r.mem_dtlb - data_src_l.mem_dtlb);
 }
 
+static int64_t
+sort__phys_daddr_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	uint64_t l = 0, r = 0;
+
+	if (left->mem_info)
+		l = left->mem_info->daddr.phys_addr;
+	if (right->mem_info)
+		r = right->mem_info->daddr.phys_addr;
+
+	return (int64_t)(r - l);
+}
+
+static int hist_entry__phys_daddr_snprintf(struct hist_entry *he, char *bf,
+				    size_t size, unsigned int width)
+{
+	uint64_t addr = 0;
+	size_t ret = 0;
+	size_t len = BITS_PER_LONG / 4;
+
+	addr = he->mem_info->daddr.phys_addr;
+
+	ret += repsep_snprintf(bf + ret, size - ret, "[%c] ", he->level);
+
+	ret += repsep_snprintf(bf + ret, size - ret, "%-#.*llx", len, addr);
+
+	ret += repsep_snprintf(bf + ret, size - ret, "%-*s", width - ret, "");
+
+	if (ret > width)
+		bf[width] = '\0';
+
+	return width;
+}
+
 static const char * const tlb_access[] = {
 	"N/A",
 	"HIT",
@@ -1222,6 +1256,13 @@ struct sort_entry sort_mem_dcacheline = {
 	.se_width_idx	= HISTC_MEM_DCACHELINE,
 };
 
+struct sort_entry sort_mem_phys_daddr = {
+	.se_header	= "Data Physical Address",
+	.se_cmp		= sort__phys_daddr_cmp,
+	.se_snprintf	= hist_entry__phys_daddr_snprintf,
+	.se_width_idx	= HISTC_MEM_PHYS_DADDR,
+};
+
 static int64_t
 sort__abort_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -1410,6 +1451,7 @@ static struct sort_dimension memory_sort_dimensions[] = {
 	DIM(SORT_MEM_LVL, "mem", sort_mem_lvl),
 	DIM(SORT_MEM_SNOOP, "snoop", sort_mem_snoop),
 	DIM(SORT_MEM_DCACHELINE, "dcacheline", sort_mem_dcacheline),
+	DIM(SORT_MEM_PHYS_DADDR, "phys_daddr", sort_mem_phys_daddr),
 };
 
 #undef DIM
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index dec536b..f6e1781 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -207,6 +207,7 @@ enum sort_type {
 	SORT_MEM_SNOOP,
 	SORT_MEM_DCACHELINE,
 	SORT_MEM_IADDR_SYMBOL,
+	SORT_MEM_PHYS_DADDR,
 };
 
 /*
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index ccd1caa..c35c3f0 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -170,6 +170,7 @@ struct addr_map_symbol {
 	struct symbol *sym;
 	u64	      addr;
 	u64	      al_addr;
+	u64	      phys_addr;
 };
 
 struct branch_info {
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 4/8] perf mem: add option phys-data to record physical address
  2016-01-06 11:04 [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR kan.liang
  2016-01-06 11:04 ` [PATCH 2/8] perf tools: add option to record sample physical address kan.liang
  2016-01-06 11:04 ` [PATCH 3/8] perf tools: add sort option phys_daddr kan.liang
@ 2016-01-06 11:04 ` kan.liang
  2016-01-06 11:04 ` [PATCH 5/8] perf mem: report physical addresses kan.liang
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 17+ messages in thread
From: kan.liang @ 2016-01-06 11:04 UTC (permalink / raw)
  To: peterz, acme; +Cc: eranian, ak, jolsa, namhyung, linux-kernel, Kan Liang

From: Kan Liang <kan.liang@intel.com>

Add option phys-data in perf mem to record physical address

Signed-off-by: Kan Liang <kan.liang@intel.com>
---
 tools/perf/builtin-mem.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 3901700..7d5ff3a 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -16,6 +16,7 @@ struct perf_mem {
 	bool			hide_unresolved;
 	bool			dump_raw;
 	bool			force;
+	bool			phys_addr;
 	int			operation;
 	const char		*cpu_list;
 	DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
@@ -39,6 +40,9 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
 
 	rec_argv[i++] = "-d";
 
+	if (mem->phys_addr)
+		rec_argv[i++] = "--phys-data";
+
 	if (mem->operation & MEM_OPERATION_LOAD) {
 		rec_argv[i++] = "-e";
 		rec_argv[i++] = "cpu/mem-loads/pp";
@@ -290,6 +294,7 @@ int cmd_mem(int argc, const char **argv, const char *prefix __maybe_unused)
 		   "separator for columns, no spaces will be added"
 		   " between columns '.' is reserved."),
 	OPT_BOOLEAN('f', "force", &mem.force, "don't complain, do it"),
+	OPT_BOOLEAN('p', "phys-data", &mem.phys_addr, "Record sample physical addresses"),
 	OPT_END()
 	};
 	const char *const mem_subcommands[] = { "record", "report", NULL };
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 5/8] perf mem: report physical addresses
  2016-01-06 11:04 [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR kan.liang
                   ` (2 preceding siblings ...)
  2016-01-06 11:04 ` [PATCH 4/8] perf mem: add option phys-data to record physical address kan.liang
@ 2016-01-06 11:04 ` kan.liang
  2016-01-07  8:27   ` Jiri Olsa
  2016-01-06 11:04 ` [PATCH 6/8] perf mem: dump " kan.liang
                   ` (5 subsequent siblings)
  9 siblings, 1 reply; 17+ messages in thread
From: kan.liang @ 2016-01-06 11:04 UTC (permalink / raw)
  To: peterz, acme; +Cc: eranian, ak, jolsa, namhyung, linux-kernel, Kan Liang

From: Kan Liang <kan.liang@intel.com>

perf mem report should support physical addresses by applying -p or
--phys-data. The default mem sort order for physical addresses is added
accordingly.

Signed-off-by: Kan Liang <kan.liang@intel.com>
---
 tools/perf/builtin-mem.c | 15 +++++++++++----
 tools/perf/util/sort.c   |  1 +
 tools/perf/util/sort.h   |  1 +
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 7d5ff3a..a81bdc2 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -6,6 +6,7 @@
 #include "util/tool.h"
 #include "util/session.h"
 #include "util/data.h"
+#include "util/sort.h"
 
 #define MEM_OPERATION_LOAD	0x1
 #define MEM_OPERATION_STORE	0x2
@@ -176,9 +177,15 @@ static int report_events(int argc, const char **argv, struct perf_mem *mem)
 	 * there is no weight (cost) associated with stores, so don't print
 	 * the column
 	 */
-	if (!(mem->operation & MEM_OPERATION_LOAD))
-		rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr,"
-				"dso_daddr,tlb,locked";
+	if (!(mem->operation & MEM_OPERATION_LOAD)) {
+		if (mem->phys_addr)
+			rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr,"
+					"dso_daddr,tlb,locked,phys_daddr";
+		else
+			rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr,"
+					"dso_daddr,tlb,locked";
+	} else if (mem->phys_addr)
+		rep_argv[i++] = default_phys_mem_sort_order;
 
 	for (j = 1; j < argc; j++, i++)
 		rep_argv[i] = argv[j];
@@ -294,7 +301,7 @@ int cmd_mem(int argc, const char **argv, const char *prefix __maybe_unused)
 		   "separator for columns, no spaces will be added"
 		   " between columns '.' is reserved."),
 	OPT_BOOLEAN('f', "force", &mem.force, "don't complain, do it"),
-	OPT_BOOLEAN('p', "phys-data", &mem.phys_addr, "Record sample physical addresses"),
+	OPT_BOOLEAN('p', "phys-data", &mem.phys_addr, "Record/Report sample physical addresses"),
 	OPT_END()
 	};
 	const char *const mem_subcommands[] = { "record", "report", NULL };
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 4ce6827..c9e7088 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -13,6 +13,7 @@ const char	*parent_pattern = default_parent_pattern;
 const char	default_sort_order[] = "comm,dso,symbol";
 const char	default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cycles";
 const char	default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked";
+const char	default_phys_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,phys_daddr";
 const char	default_top_sort_order[] = "dso,symbol";
 const char	default_diff_sort_order[] = "dso,symbol";
 const char	default_tracepoint_sort_order[] = "trace";
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index f6e1781..4421f76 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -46,6 +46,7 @@ extern struct sort_entry sort_sym_from;
 extern struct sort_entry sort_sym_to;
 extern enum sort_type sort__first_dimension;
 extern const char default_mem_sort_order[];
+extern const char default_phys_mem_sort_order[];
 
 struct he_stat {
 	u64			period;
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 6/8] perf mem: dump physical addresses
  2016-01-06 11:04 [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR kan.liang
                   ` (3 preceding siblings ...)
  2016-01-06 11:04 ` [PATCH 5/8] perf mem: report physical addresses kan.liang
@ 2016-01-06 11:04 ` kan.liang
  2016-01-06 11:04 ` [PATCH 7/8] perf script: support physical addresses in script kan.liang
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 17+ messages in thread
From: kan.liang @ 2016-01-06 11:04 UTC (permalink / raw)
  To: peterz, acme; +Cc: eranian, ak, jolsa, namhyung, linux-kernel, Kan Liang

From: Kan Liang <kan.liang@intel.com>

perf mem report should support dumping physical addresses by applying -p
or --phys-data.

Signed-off-by: Kan Liang <kan.liang@intel.com>
---
 tools/perf/Documentation/perf-mem.txt |  4 ++
 tools/perf/builtin-mem.c              | 81 ++++++++++++++++++++++++-----------
 2 files changed, 61 insertions(+), 24 deletions(-)

diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt
index 43310d8..929575a 100644
--- a/tools/perf/Documentation/perf-mem.txt
+++ b/tools/perf/Documentation/perf-mem.txt
@@ -48,6 +48,10 @@ OPTIONS
 	option can be passed in record mode. It will be interpreted the same way as perf
 	record.
 
+-p::
+--phys-data::
+	Record/Report/Dump sample physical addresses
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index a81bdc2..8dc6b13 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -84,30 +84,60 @@ dump_raw_samples(struct perf_tool *tool,
 	if (al.map != NULL)
 		al.map->dso->hit = 1;
 
-	if (symbol_conf.field_sep) {
-		fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s%"PRIu64
-		      "%s0x%"PRIx64"%s%s:%s\n";
+	if (mem->phys_addr) {
+		if (symbol_conf.field_sep) {
+			fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s0x%016"PRIx64
+			      "%s%"PRIu64"%s0x%"PRIx64"%s%s:%s\n";
+		} else {
+			fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64
+			      "%s0x%016"PRIx64"%s%5"PRIu64"%s0x%06"PRIx64
+			      "%s%s:%s\n";
+			symbol_conf.field_sep = " ";
+		}
+
+		printf(fmt,
+			sample->pid,
+			symbol_conf.field_sep,
+			sample->tid,
+			symbol_conf.field_sep,
+			sample->ip,
+			symbol_conf.field_sep,
+			sample->addr,
+			symbol_conf.field_sep,
+			sample->phys_addr,
+			symbol_conf.field_sep,
+			sample->weight,
+			symbol_conf.field_sep,
+			sample->data_src,
+			symbol_conf.field_sep,
+			al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???",
+			al.sym ? al.sym->name : "???");
 	} else {
-		fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64
-		      "%s%5"PRIu64"%s0x%06"PRIx64"%s%s:%s\n";
-		symbol_conf.field_sep = " ";
-	}
+		if (symbol_conf.field_sep) {
+			fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s%"PRIu64
+			      "%s0x%"PRIx64"%s%s:%s\n";
+		} else {
+			fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64
+			      "%s%5"PRIu64"%s0x%06"PRIx64"%s%s:%s\n";
+			symbol_conf.field_sep = " ";
+		}
 
-	printf(fmt,
-		sample->pid,
-		symbol_conf.field_sep,
-		sample->tid,
-		symbol_conf.field_sep,
-		sample->ip,
-		symbol_conf.field_sep,
-		sample->addr,
-		symbol_conf.field_sep,
-		sample->weight,
-		symbol_conf.field_sep,
-		sample->data_src,
-		symbol_conf.field_sep,
-		al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???",
-		al.sym ? al.sym->name : "???");
+		printf(fmt,
+			sample->pid,
+			symbol_conf.field_sep,
+			sample->tid,
+			symbol_conf.field_sep,
+			sample->ip,
+			symbol_conf.field_sep,
+			sample->addr,
+			symbol_conf.field_sep,
+			sample->weight,
+			symbol_conf.field_sep,
+			sample->data_src,
+			symbol_conf.field_sep,
+			al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???",
+			al.sym ? al.sym->name : "???");
+	}
 out_put:
 	addr_location__put(&al);
 	return 0;
@@ -147,7 +177,10 @@ static int report_raw_events(struct perf_mem *mem)
 	if (ret < 0)
 		goto out_delete;
 
-	printf("# PID, TID, IP, ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n");
+	if (mem->phys_addr)
+		printf("# PID, TID, IP, ADDR, PHYS ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n");
+	else
+		printf("# PID, TID, IP, ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n");
 
 	ret = perf_session__process_events(session);
 
@@ -301,7 +334,7 @@ int cmd_mem(int argc, const char **argv, const char *prefix __maybe_unused)
 		   "separator for columns, no spaces will be added"
 		   " between columns '.' is reserved."),
 	OPT_BOOLEAN('f', "force", &mem.force, "don't complain, do it"),
-	OPT_BOOLEAN('p', "phys-data", &mem.phys_addr, "Record/Report sample physical addresses"),
+	OPT_BOOLEAN('p', "phys-data", &mem.phys_addr, "Record/Report/Dump sample physical addresses"),
 	OPT_END()
 	};
 	const char *const mem_subcommands[] = { "record", "report", NULL };
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 7/8] perf script: support physical addresses in script
  2016-01-06 11:04 [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR kan.liang
                   ` (4 preceding siblings ...)
  2016-01-06 11:04 ` [PATCH 6/8] perf mem: dump " kan.liang
@ 2016-01-06 11:04 ` kan.liang
  2016-01-07  9:54   ` Jiri Olsa
  2016-01-06 11:04 ` [PATCH 8/8] perf test: add test case for PERF_SAMPLE_PHYS_ADDR kan.liang
                   ` (3 subsequent siblings)
  9 siblings, 1 reply; 17+ messages in thread
From: kan.liang @ 2016-01-06 11:04 UTC (permalink / raw)
  To: peterz, acme; +Cc: eranian, ak, jolsa, namhyung, linux-kernel, Kan Liang

From: Kan Liang <kan.liang@intel.com>

perf script prints out physical addresses when the phys_addr output field
is selected. The physical address is only displayed when the virtual
address (addr) is also selected, and it is printed right after the virtual
address.

Signed-off-by: Kan Liang <kan.liang@intel.com>
---
 tools/perf/builtin-script.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index c298cdc..1e292a7 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -58,6 +58,7 @@ enum perf_output_field {
 	PERF_OUTPUT_IREGS	    = 1U << 14,
 	PERF_OUTPUT_BRSTACK	    = 1U << 15,
 	PERF_OUTPUT_BRSTACKSYM	    = 1U << 16,
+	PERF_OUTPUT_PHYS_ADDR	    = 1U << 17,
 };
 
 struct output_option {
@@ -81,6 +82,7 @@ struct output_option {
 	{.str = "iregs", .field = PERF_OUTPUT_IREGS},
 	{.str = "brstack", .field = PERF_OUTPUT_BRSTACK},
 	{.str = "brstacksym", .field = PERF_OUTPUT_BRSTACKSYM},
+	{.str = "phys_addr", .field = PERF_OUTPUT_PHYS_ADDR},
 };
 
 /* default set to maintain compatibility with current format */
@@ -242,6 +244,12 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
 					   PERF_OUTPUT_ADDR, allow_user_set))
 		return -EINVAL;
 
+	if (PRINT_FIELD(PHYS_ADDR) && !PRINT_FIELD(ADDR)) {
+		pr_err("Display of sample physical address"
+		       "but sample address is not selected.\n");
+		return -EINVAL;
+	}
+
 	if (PRINT_FIELD(SYM) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR)) {
 		pr_err("Display of symbols requested but neither sample IP nor "
 			   "sample address\nis selected. Hence, no addresses to convert "
@@ -290,6 +298,11 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
 					PERF_OUTPUT_IREGS))
 		return -EINVAL;
 
+	if (PRINT_FIELD(PHYS_ADDR) &&
+		perf_evsel__do_check_stype(evsel, PERF_SAMPLE_PHYS_ADDR, "PHYS_ADDR",
+					   PERF_OUTPUT_PHYS_ADDR, allow_user_set))
+		return -EINVAL;
+
 	return 0;
 }
 
@@ -653,6 +666,9 @@ static void process_event(struct perf_script *script __maybe_unused, union perf_
 	if (PRINT_FIELD(ADDR))
 		print_sample_addr(event, sample, thread, attr);
 
+	if (PRINT_FIELD(PHYS_ADDR))
+		printf("%16" PRIx64, sample->phys_addr);
+
 	if (PRINT_FIELD(IP)) {
 		if (!symbol_conf.use_callchain)
 			printf(" ");
@@ -1893,7 +1909,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 		     "comma separated output fields prepend with 'type:'. "
 		     "Valid types: hw,sw,trace,raw. "
 		     "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
-		     "addr,symoff,period,iregs,brstack,brstacksym,flags", parse_output_fields),
+		     "addr,symoff,period,iregs,brstack,brstacksym,flags,phys_addr",
+		     parse_output_fields),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 		    "system-wide collection from all CPUs"),
 	OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 8/8] perf test: add test case for PERF_SAMPLE_PHYS_ADDR
  2016-01-06 11:04 [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR kan.liang
                   ` (5 preceding siblings ...)
  2016-01-06 11:04 ` [PATCH 7/8] perf script: support physical addresses in script kan.liang
@ 2016-01-06 11:04 ` kan.liang
  2016-01-06 19:21 ` [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR Stephane Eranian
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 17+ messages in thread
From: kan.liang @ 2016-01-06 11:04 UTC (permalink / raw)
  To: peterz, acme; +Cc: eranian, ak, jolsa, namhyung, linux-kernel, Kan Liang

From: Kan Liang <kan.liang@intel.com>

Extend sample-parsing test cases to support new sample type
PERF_SAMPLE_PHYS_ADDR.

Signed-off-by: Kan Liang <kan.liang@intel.com>
---
 tools/perf/tests/sample-parsing.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c
index 5f23710..8813fa1 100644
--- a/tools/perf/tests/sample-parsing.c
+++ b/tools/perf/tests/sample-parsing.c
@@ -45,6 +45,9 @@ static bool samples_same(const struct perf_sample *s1,
 	if (type & PERF_SAMPLE_ADDR)
 		COMP(addr);
 
+	if (type & PERF_SAMPLE_PHYS_ADDR)
+		COMP(phys_addr);
+
 	if (type & PERF_SAMPLE_ID)
 		COMP(id);
 
@@ -303,7 +306,7 @@ int test__sample_parsing(int subtest __maybe_unused)
 	 * were added.  Please actually update the test rather than just change
 	 * the condition below.
 	 */
-	if (PERF_SAMPLE_MAX > PERF_SAMPLE_REGS_INTR << 1) {
+	if (PERF_SAMPLE_MAX > PERF_SAMPLE_PHYS_ADDR << 1) {
 		pr_debug("sample format has changed, some new PERF_SAMPLE_ bit was introduced - test needs updating\n");
 		return -1;
 	}
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR
  2016-01-06 11:04 [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR kan.liang
                   ` (6 preceding siblings ...)
  2016-01-06 11:04 ` [PATCH 8/8] perf test: add test case for PERF_SAMPLE_PHYS_ADDR kan.liang
@ 2016-01-06 19:21 ` Stephane Eranian
  2016-01-07  8:33 ` Jiri Olsa
  2016-01-07 21:50 ` Stephane Eranian
  9 siblings, 0 replies; 17+ messages in thread
From: Stephane Eranian @ 2016-01-06 19:21 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Peter Zijlstra, Arnaldo Carvalho de Melo, ak, Jiri Olsa,
	Namhyung Kim, LKML

On Wed, Jan 6, 2016 at 3:04 AM,  <kan.liang@intel.com> wrote:
> From: Kan Liang <kan.liang@intel.com>
>
> For understanding how the workload maps to memory channels and hardware
> behavior, it's useful to collect address maps with physical addresses.
> This is not intended for detecting page sharing (which can be already
> done using the mmap inode), but for lower level hardware behavior
> studies.
> Perf supports load latency/DLA which can only collect virtual addresses.
> This patch add a new sample type PERF_SAMPLE_PHYS_ADDR to expose the
> physical addresses.
> For kernel direct mapping addresses, the patch uses virt_to_phys to
> convert the virtual addresses from DLA to physical address.
> For user virtual addresses, __get_user_pages_fast is used to walk the
> pages tables for user physical address.
> This does not work for vmalloc addresses. Right now these are not
> resolved, but code to do that could be added.
> For security, the physical address can only be exposed to root or
> privileged user.
>
> Signed-off-by: Kan Liang <kan.liang@intel.com>
> ---
>  arch/x86/kernel/cpu/perf_event.h          |  2 +-
>  arch/x86/kernel/cpu/perf_event_intel_ds.c | 23 +++++++++++++++++++++++
>  include/linux/perf_event.h                |  3 +++
>  include/uapi/linux/perf_event.h           |  4 +++-
>  kernel/events/core.c                      | 11 +++++++++++
>  5 files changed, 41 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
> index 799e6bd..164de68 100644
> --- a/arch/x86/kernel/cpu/perf_event.h
> +++ b/arch/x86/kernel/cpu/perf_event.h
> @@ -90,7 +90,7 @@ struct amd_nb {
>         (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
>         PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
>         PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
> -       PERF_SAMPLE_TRANSACTION)
> +       PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
>
>  /*
>   * A debug store configuration.
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> index 5db1c77..2e333dc 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> @@ -986,6 +986,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
>         u64 sample_type;
>         int fll, fst, dsrc;
>         int fl = event->hw.flags;
> +       struct page *p = NULL;
>
>         if (pebs == NULL)
>                 return;
> @@ -1071,6 +1072,28 @@ static void setup_pebs_sample_data(struct perf_event *event,
>             x86_pmu.intel_cap.pebs_format >= 1)
>                 data->addr = pebs->dla;
>
> +       if ((sample_type & PERF_SAMPLE_PHYS_ADDR) && (data->addr != 0)) {
> +               if (data->addr >= TASK_SIZE) {
> +                       /* If it's vmalloc()d memory, leave phys_addr as 0 */
> +                       if (virt_addr_valid(data->addr) &&
> +                           !(data->addr >= VMALLOC_START && data->addr < VMALLOC_END))
> +                               data->phys_addr = (u64)virt_to_phys((void *)data->addr);
> +               } else {
> +                       /*
> +                        * Walking the pages tables for user address.
> +                        * Interrupts are disabled, so it prevents any tear down
> +                        * of the page tables.
> +                        * Try IRQ-safe __get_user_pages_fast first.
> +                        * If failed, leave phys_addr as 0.
> +                        */
> +                       if ((current->mm != NULL) &&
> +                           (__get_user_pages_fast(data->addr, 1, 0, &p) == 1))
> +                               data->phys_addr = page_to_phys(p) + data->addr % PAGE_SIZE;
> +
> +                       if (p)
> +                               put_page(p);
> +               }
> +       }
>         if (x86_pmu.intel_cap.pebs_format >= 2) {
>                 /* Only set the TSX weight when no memory weight. */
>                 if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index f9828a4..d9c0527 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -795,6 +795,8 @@ struct perf_sample_data {
>
>         struct perf_regs                regs_intr;
>         u64                             stack_user_size;
> +
> +       u64                             phys_addr;
>  } ____cacheline_aligned;
>
>  /* default value for data source */
> @@ -815,6 +817,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
>         data->weight = 0;
>         data->data_src.val = PERF_MEM_NA;
>         data->txn = 0;
> +       data->phys_addr = 0;
>  }
>
>  extern void perf_output_sample(struct perf_output_handle *handle,
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 1afe962..5afc572 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -139,8 +139,9 @@ enum perf_event_sample_format {
>         PERF_SAMPLE_IDENTIFIER                  = 1U << 16,
>         PERF_SAMPLE_TRANSACTION                 = 1U << 17,
>         PERF_SAMPLE_REGS_INTR                   = 1U << 18,
> +       PERF_SAMPLE_PHYS_ADDR                   = 1U << 19,
>
> -       PERF_SAMPLE_MAX = 1U << 19,             /* non-ABI */
> +       PERF_SAMPLE_MAX = 1U << 20,             /* non-ABI */
>  };
>
>  /*
> @@ -767,6 +768,7 @@ enum perf_event_type {
>          *      { u64                   transaction; } && PERF_SAMPLE_TRANSACTION
>          *      { u64                   abi; # enum perf_sample_regs_abi
>          *        u64                   regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
> +        *      { u64                   phys_addr;} && PERF_SAMPLE_PHYS_ADDR

If you say PHYS_ADDR appears after SAMPLE_REGS_INTR, then you have to respect
that order in the perf_output_sample() routine, otherwise the tool cannot parse
the record correctly.

>          * };
>          */
>         PERF_RECORD_SAMPLE                      = 9,
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index a627f36..9a922a2 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1334,6 +1334,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
>         if (sample_type & PERF_SAMPLE_TRANSACTION)
>                 size += sizeof(data->txn);
>
> +       if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> +               size += sizeof(data->phys_addr);
> +
>         event->header_size = size;
>  }
>
> @@ -5432,6 +5435,9 @@ void perf_output_sample(struct perf_output_handle *handle,
>         if (sample_type & PERF_SAMPLE_DATA_SRC)
>                 perf_output_put(handle, data->data_src.val);
>
> +       if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> +               perf_output_put(handle, data->phys_addr);
> +
Need to order the put calls according to the order you define in the
header file,
unless this logic has changed recently.

>         if (sample_type & PERF_SAMPLE_TRANSACTION)
>                 perf_output_put(handle, data->txn);
>
> @@ -8269,6 +8275,11 @@ SYSCALL_DEFINE5(perf_event_open,
>                         return -EINVAL;
>         }
>
> +       /* Only privileged users can get kernel addresses */
> +       if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
> +           !capable(CAP_SYS_ADMIN))
> +               return -EACCES;
> +
>         /*
>          * In cgroup mode, the pid argument is used to pass the fd
>          * opened to the cgroup directory in cgroupfs. The cpu argument
> --
> 1.8.3.1
>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 5/8] perf mem: report physical addresses
  2016-01-06 11:04 ` [PATCH 5/8] perf mem: report physical addresses kan.liang
@ 2016-01-07  8:27   ` Jiri Olsa
  0 siblings, 0 replies; 17+ messages in thread
From: Jiri Olsa @ 2016-01-07  8:27 UTC (permalink / raw)
  To: kan.liang; +Cc: peterz, acme, eranian, ak, jolsa, namhyung, linux-kernel

On Wed, Jan 06, 2016 at 06:04:34AM -0500, kan.liang@intel.com wrote:
> From: Kan Liang <kan.liang@intel.com>
> 
> perf mem report should support physical addresses by applying -p or
> --phys-data. The default mem sort order for physical addresses is added
> accordingly.
> 
> Signed-off-by: Kan Liang <kan.liang@intel.com>
> ---
>  tools/perf/builtin-mem.c | 15 +++++++++++----
>  tools/perf/util/sort.c   |  1 +
>  tools/perf/util/sort.h   |  1 +
>  3 files changed, 13 insertions(+), 4 deletions(-)
> 
> diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
> index 7d5ff3a..a81bdc2 100644
> --- a/tools/perf/builtin-mem.c
> +++ b/tools/perf/builtin-mem.c
> @@ -6,6 +6,7 @@
>  #include "util/tool.h"
>  #include "util/session.h"
>  #include "util/data.h"
> +#include "util/sort.h"
>  
>  #define MEM_OPERATION_LOAD	0x1
>  #define MEM_OPERATION_STORE	0x2
> @@ -176,9 +177,15 @@ static int report_events(int argc, const char **argv, struct perf_mem *mem)
>  	 * there is no weight (cost) associated with stores, so don't print
>  	 * the column
>  	 */
> -	if (!(mem->operation & MEM_OPERATION_LOAD))
> -		rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr,"
> -				"dso_daddr,tlb,locked";
> +	if (!(mem->operation & MEM_OPERATION_LOAD)) {
> +		if (mem->phys_addr)
> +			rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr,"
> +					"dso_daddr,tlb,locked,phys_daddr";
> +		else
> +			rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr,"
> +					"dso_daddr,tlb,locked";
> +	} else if (mem->phys_addr)
> +		rep_argv[i++] = default_phys_mem_sort_order;

should this have "--sort=" as prefix ?

thanks,
jirka

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR
  2016-01-06 11:04 [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR kan.liang
                   ` (7 preceding siblings ...)
  2016-01-06 19:21 ` [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR Stephane Eranian
@ 2016-01-07  8:33 ` Jiri Olsa
  2016-01-07 15:55   ` Liang, Kan
  2016-01-07 21:50 ` Stephane Eranian
  9 siblings, 1 reply; 17+ messages in thread
From: Jiri Olsa @ 2016-01-07  8:33 UTC (permalink / raw)
  To: kan.liang; +Cc: peterz, acme, eranian, ak, jolsa, namhyung, linux-kernel

On Wed, Jan 06, 2016 at 06:04:30AM -0500, kan.liang@intel.com wrote:
> From: Kan Liang <kan.liang@intel.com>
> 
> For understanding how the workload maps to memory channels and hardware
> behavior, it's useful to collect address maps with physical addresses.
> This is not intended for detecting page sharing (which can be already
> done using the mmap inode), but for lower level hardware behavior
> studies.
> Perf supports load latency/DLA which can only collect virtual addresses.
> This patch add a new sample type PERF_SAMPLE_PHYS_ADDR to expose the
> physical addresses.
> For kernel direct mapping addresses, the patch uses virt_to_phys to
> convert the virtual addresses from DLA to physical address.
> For user virtual addresses, __get_user_pages_fast is used to walk the
> pages tables for user physical address.
> This does not work for vmalloc addresses. Right now these are not
> resolved, but code to do that could be added.
> For security, the physical address can only be exposed to root or
> privileged user.
> 
> Signed-off-by: Kan Liang <kan.liang@intel.com>
> ---
>  arch/x86/kernel/cpu/perf_event.h          |  2 +-
>  arch/x86/kernel/cpu/perf_event_intel_ds.c | 23 +++++++++++++++++++++++
>  include/linux/perf_event.h                |  3 +++
>  include/uapi/linux/perf_event.h           |  4 +++-
>  kernel/events/core.c                      | 11 +++++++++++
>  5 files changed, 41 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
> index 799e6bd..164de68 100644
> --- a/arch/x86/kernel/cpu/perf_event.h
> +++ b/arch/x86/kernel/cpu/perf_event.h
> @@ -90,7 +90,7 @@ struct amd_nb {
>  	(PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
>  	PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
>  	PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
> -	PERF_SAMPLE_TRANSACTION)
> +	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)

could you please add some probe code to detect kernel
support, so we get some sensible error message

[jolsa@krava perf]$ ./perf mem -p record ls
Error:
The sys_perf_event_open() syscall returned with 22 (Invalid argument) for event (cpu/mem-loads/pp).
/bin/dmesg may provide additional information.
No CONFIG_PERF_EVENTS=y kernel support configured?

thanks,
jirka

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 7/8] perf script: support physical addresses in script
  2016-01-06 11:04 ` [PATCH 7/8] perf script: support physical addresses in script kan.liang
@ 2016-01-07  9:54   ` Jiri Olsa
  0 siblings, 0 replies; 17+ messages in thread
From: Jiri Olsa @ 2016-01-07  9:54 UTC (permalink / raw)
  To: kan.liang; +Cc: peterz, acme, eranian, ak, jolsa, namhyung, linux-kernel

On Wed, Jan 06, 2016 at 06:04:36AM -0500, kan.liang@intel.com wrote:
> From: Kan Liang <kan.liang@intel.com>
> 
> perf script print out physical addresses by applying phys_addr.
> Only display physical address when virtual address is selected. The
> physical address will be printed out right after virtual address.
> 
> Signed-off-by: Kan Liang <kan.liang@intel.com>
> ---
>  tools/perf/builtin-script.c | 19 ++++++++++++++++++-
>  1 file changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
> index c298cdc..1e292a7 100644
> --- a/tools/perf/builtin-script.c
> +++ b/tools/perf/builtin-script.c
> @@ -58,6 +58,7 @@ enum perf_output_field {
>  	PERF_OUTPUT_IREGS	    = 1U << 14,
>  	PERF_OUTPUT_BRSTACK	    = 1U << 15,
>  	PERF_OUTPUT_BRSTACKSYM	    = 1U << 16,
> +	PERF_OUTPUT_PHYS_ADDR	    = 1U << 17,
>  };
>  
>  struct output_option {
> @@ -81,6 +82,7 @@ struct output_option {
>  	{.str = "iregs", .field = PERF_OUTPUT_IREGS},
>  	{.str = "brstack", .field = PERF_OUTPUT_BRSTACK},
>  	{.str = "brstacksym", .field = PERF_OUTPUT_BRSTACKSYM},
> +	{.str = "phys_addr", .field = PERF_OUTPUT_PHYS_ADDR},
>  };
>  
>  /* default set to maintain compatibility with current format */
> @@ -242,6 +244,12 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
>  					   PERF_OUTPUT_ADDR, allow_user_set))
>  		return -EINVAL;
>  
> +	if (PRINT_FIELD(PHYS_ADDR) && !PRINT_FIELD(ADDR)) {
> +		pr_err("Display of sample physical address"
> +		       "but sample address is not selected.\n");
> +		return -EINVAL;
> +	}

hum, why does ADDR matter here? we have PHYS_ADDR already stored
in perf.data so we should be all set to display it

[jolsa@ibm-x3650m4-01 perf]$ ./perf script -F addr,phys_addr | head -3
ffff8804744e5de9       4744e5de9
ffff88047fca5e90       47fca5e90
ffff8804710cfcb8       4710cfcb8

[jolsa@ibm-x3650m4-01 perf]$ ./perf script -F phys_addr
Display of sample physical addressbut sample address is not selected.


also you need to add an extra space after 'address' in the error message.

thanks,
jirka

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/8] perf tools: add option to record sample physical address
  2016-01-06 11:04 ` [PATCH 2/8] perf tools: add option to record sample physical address kan.liang
@ 2016-01-07  9:56   ` Jiri Olsa
  0 siblings, 0 replies; 17+ messages in thread
From: Jiri Olsa @ 2016-01-07  9:56 UTC (permalink / raw)
  To: kan.liang; +Cc: peterz, acme, eranian, ak, jolsa, namhyung, linux-kernel

On Wed, Jan 06, 2016 at 06:04:31AM -0500, kan.liang@intel.com wrote:

SNIP

> diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> index d5636ba..670ff4f 100644
> --- a/tools/perf/util/session.c
> +++ b/tools/perf/util/session.c
> @@ -1122,6 +1122,9 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
>  	if (sample_type & PERF_SAMPLE_DATA_SRC)
>  		printf(" . data_src: 0x%"PRIx64"\n", sample->data_src);
>  
> +	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> +		printf(" .. phys_addr: 0x%"PRIx64"\n", sample->phys_addr);

could you also please add PERF_SAMPLE_PHYS_ADDR to __p_sample_type
so perf evlist displays it?

thanks,
jirka

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR
  2016-01-07  8:33 ` Jiri Olsa
@ 2016-01-07 15:55   ` Liang, Kan
  2016-01-07 19:31     ` Jiri Olsa
  0 siblings, 1 reply; 17+ messages in thread
From: Liang, Kan @ 2016-01-07 15:55 UTC (permalink / raw)
  To: Jiri Olsa; +Cc: peterz, acme, eranian, ak, jolsa, namhyung, linux-kernel



> On Wed, Jan 06, 2016 at 06:04:30AM -0500, kan.liang@intel.com wrote:
> > From: Kan Liang <kan.liang@intel.com>
> >
> > For understanding how the workload maps to memory channels and
> > hardware behavior, it's useful to collect address maps with physical
> addresses.
> > This is not intended for detecting page sharing (which can be already
> > done using the mmap inode), but for lower level hardware behavior
> > studies.
> > Perf supports load latency/DLA which can only collect virtual addresses.
> > This patch add a new sample type PERF_SAMPLE_PHYS_ADDR to expose
> the
> > physical addresses.
> > For kernel direct mapping addresses, the patch uses virt_to_phys to
> > convert the virtual addresses from DLA to physical address.
> > For user virtual addresses, __get_user_pages_fast is used to walk the
> > pages tables for user physical address.
> > This does not work for vmalloc addresses. Right now these are not
> > resolved, but code to do that could be added.
> > For security, the physical address can only be exposed to root or
> > privileged user.
> >
> > Signed-off-by: Kan Liang <kan.liang@intel.com>
> > ---
> >  arch/x86/kernel/cpu/perf_event.h          |  2 +-
> >  arch/x86/kernel/cpu/perf_event_intel_ds.c | 23
> +++++++++++++++++++++++
> >  include/linux/perf_event.h                |  3 +++
> >  include/uapi/linux/perf_event.h           |  4 +++-
> >  kernel/events/core.c                      | 11 +++++++++++
> >  5 files changed, 41 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/x86/kernel/cpu/perf_event.h
> > b/arch/x86/kernel/cpu/perf_event.h
> > index 799e6bd..164de68 100644
> > --- a/arch/x86/kernel/cpu/perf_event.h
> > +++ b/arch/x86/kernel/cpu/perf_event.h
> > @@ -90,7 +90,7 @@ struct amd_nb {
> >  	(PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
> >  	PERF_SAMPLE_ID | PERF_SAMPLE_CPU |
> PERF_SAMPLE_STREAM_ID | \
> >  	PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
> > -	PERF_SAMPLE_TRANSACTION)
> > +	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
> 
> could you please add some probe code to detect kernel support, so we get
> some sensible error message
> 

I'm not quite sure what kind of probe code you mean.
Yes, the error message below is not accurate. The failure could come from
sample_type, read_format, branch_sample_type or something else, because
they all share the same error code, -EINVAL. The perf tool cannot know
more.
I think we could print a kernel error message when an invalid attr is
detected, so the user can get more detailed information from dmesg.

Or do you want me to add some extra code to detect the kernel capability
before opening the event? If so, I think we can do it in separate patches.

Thanks,
Kan

> [jolsa@krava perf]$ ./perf mem -p record ls
> Error:
> The sys_perf_event_open() syscall returned with 22 (Invalid argument) for
> event (cpu/mem-loads/pp).
> /bin/dmesg may provide additional information.
> No CONFIG_PERF_EVENTS=y kernel support configured?
> 
> thanks,
> jirka

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR
  2016-01-07 15:55   ` Liang, Kan
@ 2016-01-07 19:31     ` Jiri Olsa
  0 siblings, 0 replies; 17+ messages in thread
From: Jiri Olsa @ 2016-01-07 19:31 UTC (permalink / raw)
  To: Liang, Kan; +Cc: peterz, acme, eranian, ak, jolsa, namhyung, linux-kernel

On Thu, Jan 07, 2016 at 03:55:23PM +0000, Liang, Kan wrote:
> 
> 
> > On Wed, Jan 06, 2016 at 06:04:30AM -0500, kan.liang@intel.com wrote:
> > > From: Kan Liang <kan.liang@intel.com>
> > >
> > > For understanding how the workload maps to memory channels and
> > > hardware behavior, it's useful to collect address maps with physical
> > addresses.
> > > This is not intended for detecting page sharing (which can be already
> > > done using the mmap inode), but for lower level hardware behavior
> > > studies.
> > > Perf supports load latency/DLA which can only collect virtual addresses.
> > > This patch add a new sample type PERF_SAMPLE_PHYS_ADDR to expose
> > the
> > > physical addresses.
> > > For kernel direct mapping addresses, the patch uses virt_to_phys to
> > > convert the virtual addresses from DLA to physical address.
> > > For user virtual addresses, __get_user_pages_fast is used to walk the
> > > pages tables for user physical address.
> > > This does not work for vmalloc addresses. Right now these are not
> > > resolved, but code to do that could be added.
> > > For security, the physical address can only be exposed to root or
> > > privileged user.
> > >
> > > Signed-off-by: Kan Liang <kan.liang@intel.com>
> > > ---
> > >  arch/x86/kernel/cpu/perf_event.h          |  2 +-
> > >  arch/x86/kernel/cpu/perf_event_intel_ds.c | 23
> > +++++++++++++++++++++++
> > >  include/linux/perf_event.h                |  3 +++
> > >  include/uapi/linux/perf_event.h           |  4 +++-
> > >  kernel/events/core.c                      | 11 +++++++++++
> > >  5 files changed, 41 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/arch/x86/kernel/cpu/perf_event.h
> > > b/arch/x86/kernel/cpu/perf_event.h
> > > index 799e6bd..164de68 100644
> > > --- a/arch/x86/kernel/cpu/perf_event.h
> > > +++ b/arch/x86/kernel/cpu/perf_event.h
> > > @@ -90,7 +90,7 @@ struct amd_nb {
> > >  	(PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
> > >  	PERF_SAMPLE_ID | PERF_SAMPLE_CPU |
> > PERF_SAMPLE_STREAM_ID | \
> > >  	PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
> > > -	PERF_SAMPLE_TRANSACTION)
> > > +	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
> > 
> > could you please add some probe code to detect kernel support, so we get
> > some sensible error message
> > 
> 
> I'm not quite sure what kind of probe code do you mean.
> Yes, the error message below is not accurate. It could be some errors in
> sample_type, read_format, branch_sample_type or something else.
> Because they share the same error code -EINVAL. Perf tool cannot know
> more. 
> I think we may print some error kernel messages when invalid attr is
> detected. So the user can get more detail messages from dmesg.  
> 
> Or you want me to add some extra code to detect the kernel capability
> before opening event? If so, I think we may do it in separate patches.

yep, could be done later

jirka

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR
  2016-01-06 11:04 [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR kan.liang
                   ` (8 preceding siblings ...)
  2016-01-07  8:33 ` Jiri Olsa
@ 2016-01-07 21:50 ` Stephane Eranian
  2016-01-08 21:12   ` Liang, Kan
  9 siblings, 1 reply; 17+ messages in thread
From: Stephane Eranian @ 2016-01-07 21:50 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Peter Zijlstra, Arnaldo Carvalho de Melo, ak, Jiri Olsa,
	Namhyung Kim, LKML

On Wed, Jan 6, 2016 at 3:04 AM,  <kan.liang@intel.com> wrote:
> From: Kan Liang <kan.liang@intel.com>
>
> For understanding how the workload maps to memory channels and hardware
> behavior, it's useful to collect address maps with physical addresses.
> This is not intended for detecting page sharing (which can be already
> done using the mmap inode), but for lower level hardware behavior
> studies.
> Perf supports load latency/DLA which can only collect virtual addresses.
> This patch add a new sample type PERF_SAMPLE_PHYS_ADDR to expose the
> physical addresses.
> For kernel direct mapping addresses, the patch uses virt_to_phys to
> convert the virtual addresses from DLA to physical address.
> For user virtual addresses, __get_user_pages_fast is used to walk the
> pages tables for user physical address.
> This does not work for vmalloc addresses. Right now these are not
> resolved, but code to do that could be added.
> For security, the physical address can only be exposed to root or
> privileged user.
>
The other limitation of your patch series is that it provides physical
addresses for data only. It does not cover code, i.e., it is not possible
to obtain the physical address of a sampled IP. So maybe the new sample_type
should be renamed to PERF_SAMPLE_DATA_PHYS_ADDR instead.
Should you add physical addresses for IP (no need for PEBS for this), then
you'd have to have a new name, anyway.

Getting code and data physical addresses would provide better coverage
of the memory subsystem usage.

> Signed-off-by: Kan Liang <kan.liang@intel.com>
> ---
>  arch/x86/kernel/cpu/perf_event.h          |  2 +-
>  arch/x86/kernel/cpu/perf_event_intel_ds.c | 23 +++++++++++++++++++++++
>  include/linux/perf_event.h                |  3 +++
>  include/uapi/linux/perf_event.h           |  4 +++-
>  kernel/events/core.c                      | 11 +++++++++++
>  5 files changed, 41 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
> index 799e6bd..164de68 100644
> --- a/arch/x86/kernel/cpu/perf_event.h
> +++ b/arch/x86/kernel/cpu/perf_event.h
> @@ -90,7 +90,7 @@ struct amd_nb {
>         (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
>         PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
>         PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
> -       PERF_SAMPLE_TRANSACTION)
> +       PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
>
>  /*
>   * A debug store configuration.
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> index 5db1c77..2e333dc 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> @@ -986,6 +986,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
>         u64 sample_type;
>         int fll, fst, dsrc;
>         int fl = event->hw.flags;
> +       struct page *p = NULL;
>
>         if (pebs == NULL)
>                 return;
> @@ -1071,6 +1072,28 @@ static void setup_pebs_sample_data(struct perf_event *event,
>             x86_pmu.intel_cap.pebs_format >= 1)
>                 data->addr = pebs->dla;
>
> +       if ((sample_type & PERF_SAMPLE_PHYS_ADDR) && (data->addr != 0)) {
> +               if (data->addr >= TASK_SIZE) {
> +                       /* If it's vmalloc()d memory, leave phys_addr as 0 */
> +                       if (virt_addr_valid(data->addr) &&
> +                           !(data->addr >= VMALLOC_START && data->addr < VMALLOC_END))
> +                               data->phys_addr = (u64)virt_to_phys((void *)data->addr);
> +               } else {
> +                       /*
> +                        * Walking the pages tables for user address.
> +                        * Interrupts are disabled, so it prevents any tear down
> +                        * of the page tables.
> +                        * Try IRQ-safe __get_user_pages_fast first.
> +                        * If failed, leave phys_addr as 0.
> +                        */
> +                       if ((current->mm != NULL) &&
> +                           (__get_user_pages_fast(data->addr, 1, 0, &p) == 1))
> +                               data->phys_addr = page_to_phys(p) + data->addr % PAGE_SIZE;
> +
> +                       if (p)
> +                               put_page(p);
> +               }
> +       }
>         if (x86_pmu.intel_cap.pebs_format >= 2) {
>                 /* Only set the TSX weight when no memory weight. */
>                 if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index f9828a4..d9c0527 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -795,6 +795,8 @@ struct perf_sample_data {
>
>         struct perf_regs                regs_intr;
>         u64                             stack_user_size;
> +
> +       u64                             phys_addr;
>  } ____cacheline_aligned;
>
>  /* default value for data source */
> @@ -815,6 +817,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
>         data->weight = 0;
>         data->data_src.val = PERF_MEM_NA;
>         data->txn = 0;
> +       data->phys_addr = 0;
>  }
>
>  extern void perf_output_sample(struct perf_output_handle *handle,
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 1afe962..5afc572 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -139,8 +139,9 @@ enum perf_event_sample_format {
>         PERF_SAMPLE_IDENTIFIER                  = 1U << 16,
>         PERF_SAMPLE_TRANSACTION                 = 1U << 17,
>         PERF_SAMPLE_REGS_INTR                   = 1U << 18,
> +       PERF_SAMPLE_PHYS_ADDR                   = 1U << 19,
>
> -       PERF_SAMPLE_MAX = 1U << 19,             /* non-ABI */
> +       PERF_SAMPLE_MAX = 1U << 20,             /* non-ABI */
>  };
>
>  /*
> @@ -767,6 +768,7 @@ enum perf_event_type {
>          *      { u64                   transaction; } && PERF_SAMPLE_TRANSACTION
>          *      { u64                   abi; # enum perf_sample_regs_abi
>          *        u64                   regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
> +        *      { u64                   phys_addr;} && PERF_SAMPLE_PHYS_ADDR
>          * };
>          */
>         PERF_RECORD_SAMPLE                      = 9,
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index a627f36..9a922a2 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1334,6 +1334,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
>         if (sample_type & PERF_SAMPLE_TRANSACTION)
>                 size += sizeof(data->txn);
>
> +       if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> +               size += sizeof(data->phys_addr);
> +
>         event->header_size = size;
>  }
>
> @@ -5432,6 +5435,9 @@ void perf_output_sample(struct perf_output_handle *handle,
>         if (sample_type & PERF_SAMPLE_DATA_SRC)
>                 perf_output_put(handle, data->data_src.val);
>
> +       if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> +               perf_output_put(handle, data->phys_addr);
> +
>         if (sample_type & PERF_SAMPLE_TRANSACTION)
>                 perf_output_put(handle, data->txn);
>
> @@ -8269,6 +8275,11 @@ SYSCALL_DEFINE5(perf_event_open,
>                         return -EINVAL;
>         }
>
> +       /* Only privileged users can get kernel addresses */
> +       if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
> +           !capable(CAP_SYS_ADMIN))
> +               return -EACCES;
> +
>         /*
>          * In cgroup mode, the pid argument is used to pass the fd
>          * opened to the cgroup directory in cgroupfs. The cpu argument
> --
> 1.8.3.1
>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR
  2016-01-07 21:50 ` Stephane Eranian
@ 2016-01-08 21:12   ` Liang, Kan
  0 siblings, 0 replies; 17+ messages in thread
From: Liang, Kan @ 2016-01-08 21:12 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: Peter Zijlstra, Arnaldo Carvalho de Melo, ak, Jiri Olsa,
	Namhyung Kim, LKML


> 
> On Wed, Jan 6, 2016 at 3:04 AM,  <kan.liang@intel.com> wrote:
> > From: Kan Liang <kan.liang@intel.com>
> >
> > For understanding how the workload maps to memory channels and
> > hardware behavior, it's useful to collect address maps with physical
> addresses.
> > This is not intended for detecting page sharing (which can be already
> > done using the mmap inode), but for lower level hardware behavior
> > studies.
> > Perf supports load latency/DLA which can only collect virtual addresses.
> > This patch add a new sample type PERF_SAMPLE_PHYS_ADDR to expose
> the
> > physical addresses.
> > For kernel direct mapping addresses, the patch uses virt_to_phys to
> > convert the virtual addresses from DLA to physical address.
> > For user virtual addresses, __get_user_pages_fast is used to walk the
> > pages tables for user physical address.
> > This does not work for vmalloc addresses. Right now these are not
> > resolved, but code to do that could be added.
> > For security, the physical address can only be exposed to root or
> > privileged user.
> >
> The other limitation of your patch series is that it is providing physical
> address for data only. It does not cover code, i.e., it is not possible to
> obtain the physical address of a sampled IP. So maybe the new
> sample_type should be renamed to PERF_SAMPLE_DATA_PHYS_ADDR
> instead.

We use sample_type PERF_SAMPLE_ADDR for the data virtual address.
So in this patch I chose a similar name, PERF_SAMPLE_PHYS_ADDR, for the
data physical address. I think it should not lead to ambiguity.

> Should you add physical addresses for IP (no need for PEBS for this), then
> you'd have to have a new name, anyway.

We use another sample_type, PERF_SAMPLE_IP, to get data->ip.
So I think we could name the new sample_type for the physical address
of the IP PERF_SAMPLE_PHYS_IP.
Also, since code and data come from different sample_types, I think it
would be better to implement PERF_SAMPLE_PHYS_IP in a separate patch later.

Thanks,
Kan

> 
> Getting code and data physical addresses would provide better coverage
> of the memory subsystem usage.
> 
> > Signed-off-by: Kan Liang <kan.liang@intel.com>
> > ---
> >  arch/x86/kernel/cpu/perf_event.h          |  2 +-
> >  arch/x86/kernel/cpu/perf_event_intel_ds.c | 23
> +++++++++++++++++++++++
> >  include/linux/perf_event.h                |  3 +++
> >  include/uapi/linux/perf_event.h           |  4 +++-
> >  kernel/events/core.c                      | 11 +++++++++++
> >  5 files changed, 41 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/x86/kernel/cpu/perf_event.h
> > b/arch/x86/kernel/cpu/perf_event.h
> > index 799e6bd..164de68 100644
> > --- a/arch/x86/kernel/cpu/perf_event.h
> > +++ b/arch/x86/kernel/cpu/perf_event.h
> > @@ -90,7 +90,7 @@ struct amd_nb {
> >         (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
> >         PERF_SAMPLE_ID | PERF_SAMPLE_CPU |
> PERF_SAMPLE_STREAM_ID | \
> >         PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
> > -       PERF_SAMPLE_TRANSACTION)
> > +       PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
> >
> >  /*
> >   * A debug store configuration.
> > diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c
> > b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> > index 5db1c77..2e333dc 100644
> > --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
> > +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
> > @@ -986,6 +986,7 @@ static void setup_pebs_sample_data(struct
> perf_event *event,
> >         u64 sample_type;
> >         int fll, fst, dsrc;
> >         int fl = event->hw.flags;
> > +       struct page *p = NULL;
> >
> >         if (pebs == NULL)
> >                 return;
> > @@ -1071,6 +1072,28 @@ static void setup_pebs_sample_data(struct
> perf_event *event,
> >             x86_pmu.intel_cap.pebs_format >= 1)
> >                 data->addr = pebs->dla;
> >
> > +       if ((sample_type & PERF_SAMPLE_PHYS_ADDR) && (data->addr !=
> 0)) {
> > +               if (data->addr >= TASK_SIZE) {
> > +                       /* If it's vmalloc()d memory, leave phys_addr as 0 */
> > +                       if (virt_addr_valid(data->addr) &&
> > +                           !(data->addr >= VMALLOC_START && data->addr <
> VMALLOC_END))
> > +                               data->phys_addr = (u64)virt_to_phys((void *)data-
> >addr);
> > +               } else {
> > +                       /*
> > +                        * Walking the pages tables for user address.
> > +                        * Interrupts are disabled, so it prevents any tear down
> > +                        * of the page tables.
> > +                        * Try IRQ-safe __get_user_pages_fast first.
> > +                        * If failed, leave phys_addr as 0.
> > +                        */
> > +                       if ((current->mm != NULL) &&
> > +                           (__get_user_pages_fast(data->addr, 1, 0, &p) == 1))
> > +                               data->phys_addr = page_to_phys(p) +
> > + data->addr % PAGE_SIZE;
> > +
> > +                       if (p)
> > +                               put_page(p);
> > +               }
> > +       }
> >         if (x86_pmu.intel_cap.pebs_format >= 2) {
> >                 /* Only set the TSX weight when no memory weight. */
> >                 if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) diff
> > --git a/include/linux/perf_event.h b/include/linux/perf_event.h index
> > f9828a4..d9c0527 100644
> > --- a/include/linux/perf_event.h
> > +++ b/include/linux/perf_event.h
> > @@ -795,6 +795,8 @@ struct perf_sample_data {
> >
> >         struct perf_regs                regs_intr;
> >         u64                             stack_user_size;
> > +
> > +       u64                             phys_addr;
> >  } ____cacheline_aligned;
> >
> >  /* default value for data source */
> > @@ -815,6 +817,7 @@ static inline void perf_sample_data_init(struct
> perf_sample_data *data,
> >         data->weight = 0;
> >         data->data_src.val = PERF_MEM_NA;
> >         data->txn = 0;
> > +       data->phys_addr = 0;
> >  }
> >
> >  extern void perf_output_sample(struct perf_output_handle *handle,
> > diff --git a/include/uapi/linux/perf_event.h
> > b/include/uapi/linux/perf_event.h index 1afe962..5afc572 100644
> > --- a/include/uapi/linux/perf_event.h
> > +++ b/include/uapi/linux/perf_event.h
> > @@ -139,8 +139,9 @@ enum perf_event_sample_format {
> >         PERF_SAMPLE_IDENTIFIER                  = 1U << 16,
> >         PERF_SAMPLE_TRANSACTION                 = 1U << 17,
> >         PERF_SAMPLE_REGS_INTR                   = 1U << 18,
> > +       PERF_SAMPLE_PHYS_ADDR                   = 1U << 19,
> >
> > -       PERF_SAMPLE_MAX = 1U << 19,             /* non-ABI */
> > +       PERF_SAMPLE_MAX = 1U << 20,             /* non-ABI */
> >  };
> >
> >  /*
> > @@ -767,6 +768,7 @@ enum perf_event_type {
> >          *      { u64                   transaction; } && PERF_SAMPLE_TRANSACTION
> >          *      { u64                   abi; # enum perf_sample_regs_abi
> >          *        u64                   regs[weight(mask)]; } &&
> PERF_SAMPLE_REGS_INTR
> > +        *      { u64                   phys_addr;} && PERF_SAMPLE_PHYS_ADDR
> >          * };
> >          */
> >         PERF_RECORD_SAMPLE                      = 9,
> > diff --git a/kernel/events/core.c b/kernel/events/core.c index
> > a627f36..9a922a2 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -1334,6 +1334,9 @@ static void __perf_event_header_size(struct
> perf_event *event, u64 sample_type)
> >         if (sample_type & PERF_SAMPLE_TRANSACTION)
> >                 size += sizeof(data->txn);
> >
> > +       if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> > +               size += sizeof(data->phys_addr);
> > +
> >         event->header_size = size;
> >  }
> >
> > @@ -5432,6 +5435,9 @@ void perf_output_sample(struct
> perf_output_handle *handle,
> >         if (sample_type & PERF_SAMPLE_DATA_SRC)
> >                 perf_output_put(handle, data->data_src.val);
> >
> > +       if (sample_type & PERF_SAMPLE_PHYS_ADDR)
> > +               perf_output_put(handle, data->phys_addr);
> > +
> >         if (sample_type & PERF_SAMPLE_TRANSACTION)
> >                 perf_output_put(handle, data->txn);
> >
> > @@ -8269,6 +8275,11 @@ SYSCALL_DEFINE5(perf_event_open,
> >                         return -EINVAL;
> >         }
> >
> > +       /* Only privileged users can get kernel addresses */
> > +       if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
> > +           !capable(CAP_SYS_ADMIN))
> > +               return -EACCES;
> > +
> >         /*
> >          * In cgroup mode, the pid argument is used to pass the fd
> >          * opened to the cgroup directory in cgroupfs. The cpu
> > argument
> > --
> > 1.8.3.1
> >

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2016-01-08 21:12 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-01-06 11:04 [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR kan.liang
2016-01-06 11:04 ` [PATCH 2/8] perf tools: add option to record sample physical address kan.liang
2016-01-07  9:56   ` Jiri Olsa
2016-01-06 11:04 ` [PATCH 3/8] perf tools: add sort option phys_daddr kan.liang
2016-01-06 11:04 ` [PATCH 4/8] perf mem: add option phys-data to record physical address kan.liang
2016-01-06 11:04 ` [PATCH 5/8] perf mem: report physical addresses kan.liang
2016-01-07  8:27   ` Jiri Olsa
2016-01-06 11:04 ` [PATCH 6/8] perf mem: dump " kan.liang
2016-01-06 11:04 ` [PATCH 7/8] perf script: support physical addresses in script kan.liang
2016-01-07  9:54   ` Jiri Olsa
2016-01-06 11:04 ` [PATCH 8/8] perf test: add test case for PERF_SAMPLE_PHYS_ADDR kan.liang
2016-01-06 19:21 ` [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR Stephane Eranian
2016-01-07  8:33 ` Jiri Olsa
2016-01-07 15:55   ` Liang, Kan
2016-01-07 19:31     ` Jiri Olsa
2016-01-07 21:50 ` Stephane Eranian
2016-01-08 21:12   ` Liang, Kan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.