All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR
@ 2016-01-06 11:04 kan.liang
  2016-01-06 11:04 ` [PATCH 2/8] perf tools: add option to record sample physical address kan.liang
                   ` (9 more replies)
  0 siblings, 10 replies; 17+ messages in thread
From: kan.liang @ 2016-01-06 11:04 UTC (permalink / raw)
  To: peterz, acme; +Cc: eranian, ak, jolsa, namhyung, linux-kernel, Kan Liang

From: Kan Liang <kan.liang@intel.com>

For understanding how the workload maps to memory channels and hardware
behavior, it's useful to collect address maps with physical addresses.
This is not intended for detecting page sharing (which can already be
done using the mmap inode), but for lower-level hardware behavior
studies.
Perf supports load latency/DLA which can only collect virtual addresses.
This patch adds a new sample type PERF_SAMPLE_PHYS_ADDR to expose the
physical addresses.
For kernel direct mapping addresses, the patch uses virt_to_phys to
convert the virtual addresses from DLA to physical addresses.
For user virtual addresses, __get_user_pages_fast is used to walk the
page tables to obtain the user physical address.
This does not work for vmalloc addresses. Right now these are not
resolved, but code to do that could be added.
For security, the physical address can only be exposed to root or
a privileged user.

Signed-off-by: Kan Liang <kan.liang@intel.com>
---
 arch/x86/kernel/cpu/perf_event.h          |  2 +-
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 23 +++++++++++++++++++++++
 include/linux/perf_event.h                |  3 +++
 include/uapi/linux/perf_event.h           |  4 +++-
 kernel/events/core.c                      | 11 +++++++++++
 5 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 799e6bd..164de68 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -90,7 +90,7 @@ struct amd_nb {
 	(PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
 	PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
 	PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
-	PERF_SAMPLE_TRANSACTION)
+	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
 
 /*
  * A debug store configuration.
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 5db1c77..2e333dc 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -986,6 +986,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
 	u64 sample_type;
 	int fll, fst, dsrc;
 	int fl = event->hw.flags;
+	struct page *p = NULL;
 
 	if (pebs == NULL)
 		return;
@@ -1071,6 +1072,28 @@ static void setup_pebs_sample_data(struct perf_event *event,
 	    x86_pmu.intel_cap.pebs_format >= 1)
 		data->addr = pebs->dla;
 
+	if ((sample_type & PERF_SAMPLE_PHYS_ADDR) && (data->addr != 0)) {
+		if (data->addr >= TASK_SIZE) {
+			/* If it's vmalloc()d memory, leave phys_addr as 0 */
+			if (virt_addr_valid(data->addr) &&
+			    !(data->addr >= VMALLOC_START && data->addr < VMALLOC_END))
+				data->phys_addr = (u64)virt_to_phys((void *)data->addr);
+		} else {
+			/*
+			 * Walking the pages tables for user address.
+			 * Interrupts are disabled, so it prevents any tear down
+			 * of the page tables.
+			 * Try IRQ-safe __get_user_pages_fast first.
+			 * If failed, leave phys_addr as 0.
+			 */
+			if ((current->mm != NULL) &&
+			    (__get_user_pages_fast(data->addr, 1, 0, &p) == 1))
+				data->phys_addr = page_to_phys(p) + data->addr % PAGE_SIZE;
+
+			if (p)
+				put_page(p);
+		}
+	}
 	if (x86_pmu.intel_cap.pebs_format >= 2) {
 		/* Only set the TSX weight when no memory weight. */
 		if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f9828a4..d9c0527 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -795,6 +795,8 @@ struct perf_sample_data {
 
 	struct perf_regs		regs_intr;
 	u64				stack_user_size;
+
+	u64				phys_addr;
 } ____cacheline_aligned;
 
 /* default value for data source */
@@ -815,6 +817,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
 	data->weight = 0;
 	data->data_src.val = PERF_MEM_NA;
 	data->txn = 0;
+	data->phys_addr = 0;
 }
 
 extern void perf_output_sample(struct perf_output_handle *handle,
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 1afe962..5afc572 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_IDENTIFIER			= 1U << 16,
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
+	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
 
-	PERF_SAMPLE_MAX = 1U << 19,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
 };
 
 /*
@@ -767,6 +768,7 @@ enum perf_event_type {
 	 *	{ u64			transaction; } && PERF_SAMPLE_TRANSACTION
 	 *	{ u64			abi; # enum perf_sample_regs_abi
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a627f36..9a922a2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1334,6 +1334,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_TRANSACTION)
 		size += sizeof(data->txn);
 
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		size += sizeof(data->phys_addr);
+
 	event->header_size = size;
 }
 
@@ -5432,6 +5435,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_DATA_SRC)
 		perf_output_put(handle, data->data_src.val);
 
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		perf_output_put(handle, data->phys_addr);
+
 	if (sample_type & PERF_SAMPLE_TRANSACTION)
 		perf_output_put(handle, data->txn);
 
@@ -8269,6 +8275,11 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EINVAL;
 	}
 
+	/* Only privileged users can get kernel addresses */
+	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
+	    !capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
 	/*
 	 * In cgroup mode, the pid argument is used to pass the fd
 	 * opened to the cgroup directory in cgroupfs. The cpu argument
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2016-01-08 21:12 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-01-06 11:04 [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR kan.liang
2016-01-06 11:04 ` [PATCH 2/8] perf tools: add option to record sample physical address kan.liang
2016-01-07  9:56   ` Jiri Olsa
2016-01-06 11:04 ` [PATCH 3/8] perf tools: add sort option phys_daddr kan.liang
2016-01-06 11:04 ` [PATCH 4/8] perf mem: add option phys-data to record physical address kan.liang
2016-01-06 11:04 ` [PATCH 5/8] perf mem: report physical addresses kan.liang
2016-01-07  8:27   ` Jiri Olsa
2016-01-06 11:04 ` [PATCH 6/8] perf mem: dump " kan.liang
2016-01-06 11:04 ` [PATCH 7/8] perf script: support physical addresses in script kan.liang
2016-01-07  9:54   ` Jiri Olsa
2016-01-06 11:04 ` [PATCH 8/8] perf test: add test case for PERF_SAMPLE_PHYS_ADDR kan.liang
2016-01-06 19:21 ` [PATCH 1/8] perf: Add PERF_SAMPLE_PHYS_ADDR Stephane Eranian
2016-01-07  8:33 ` Jiri Olsa
2016-01-07 15:55   ` Liang, Kan
2016-01-07 19:31     ` Jiri Olsa
2016-01-07 21:50 ` Stephane Eranian
2016-01-08 21:12   ` Liang, Kan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.