All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE
@ 2019-01-31 20:27 kan.liang
  2019-01-31 20:27 ` [PATCH V4 02/13] perf tools: Support new sample type for data page size kan.liang
                   ` (12 more replies)
  0 siblings, 13 replies; 23+ messages in thread
From: kan.liang @ 2019-01-31 20:27 UTC (permalink / raw)
  To: peterz, acme, tglx, mingo, linux-kernel
  Cc: eranian, jolsa, namhyung, ak, luto, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

Current perf can report both virtual address and physical address, but
it doesn't report page size. Users have no idea how large the utilized
page is. They cannot promote/demote large pages to optimize memory use.

Add a new sample type for data page size.

Current perf already has a facility to collect the data virtual
address. A __weak function, which retrieves the page size for a given
virtual address, is introduced in the generic code. For now, it always
returns 0. The function must be IRQ-safe.
This patch only implements an x86-specific version, which does a full
page-table walk of a given virtual address to retrieve the page size.
For x86, disabling IRQs over the walk is sufficient to prevent any
teardown of the page tables.
Other architectures can implement their own functions separately later.

The new sample type requires collecting the virtual address. The
virtual address will not be output unless SAMPLE_ADDR is applied.

A u64 type is used for page_size because struct perf_sample_data must
be cacheline-aligned.

Large PEBS will be disabled with this sample type, because we need to
track munmap to flush the PEBS buffer for large PEBS, and perf doesn't
support munmap tracking yet. Large PEBS can be enabled separately
later, once munmap tracking is supported.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V3
- Use the real page size to replace enum.
- Modify the changelog to mention the generic support of
  __weak perf_get_page_size()

 arch/x86/events/core.c          | 31 +++++++++++++++++++++++++++++++
 arch/x86/events/intel/ds.c      |  3 ++-
 include/linux/perf_event.h      |  1 +
 include/uapi/linux/perf_event.h |  4 +++-
 kernel/events/core.c            | 15 +++++++++++++++
 5 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 374a197..229a73b 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2578,3 +2578,34 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
 	cap->events_mask_len	= x86_pmu.events_mask_len;
 }
 EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
+
+u64 perf_get_page_size(u64 virt)
+{
+	unsigned long flags;
+	unsigned int level;
+	pte_t *pte;
+
+	if (!virt)
+		return 0;
+
+	/*
+	 * Interrupts are disabled, so it prevents any tear down
+	 * of the page tables.
+	 * See the comment near struct mmu_table_batch.
+	 */
+	local_irq_save(flags);
+	if (virt >= TASK_SIZE)
+		pte = lookup_address(virt, &level);
+	else {
+		if (current->mm) {
+			pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
+						    virt, &level);
+		} else
+			level = PG_LEVEL_NUM;
+	}
+	local_irq_restore(flags);
+	if (level >= PG_LEVEL_NUM)
+		return 0;
+
+	return (u64)page_level_size(level);
+}
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index e9acf1d..720dc9e 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1274,7 +1274,8 @@ static void setup_pebs_sample_data(struct perf_event *event,
 	}
 
 
-	if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) &&
+	if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR
+			    | PERF_SAMPLE_DATA_PAGE_SIZE)) &&
 	    x86_pmu.intel_cap.pebs_format >= 1)
 		data->addr = pebs->dla;
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a79e59f..0e048ab 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -937,6 +937,7 @@ struct perf_sample_data {
 	u64				stack_user_size;
 
 	u64				phys_addr;
+	u64				data_page_size;
 } ____cacheline_aligned;
 
 /* default value for data source */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 7198ddd..0e8d222 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -141,8 +141,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
 	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
+	PERF_SAMPLE_DATA_PAGE_SIZE		= 1U << 20,
 
-	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
 
 	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
 };
@@ -863,6 +864,7 @@ enum perf_event_type {
 	 *	{ u64			abi; # enum perf_sample_regs_abi
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
 	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
+	 *	{ u64			data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 236bb8d..d233f45 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1753,6 +1753,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		size += sizeof(data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+		size += sizeof(data->data_page_size);
+
 	event->header_size = size;
 }
 
@@ -6305,6 +6308,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		perf_output_put(handle, data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+		perf_output_put(handle, data->data_page_size);
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -6352,6 +6358,12 @@ static u64 perf_virt_to_phys(u64 virt)
 	return phys_addr;
 }
 
+/* Return page size of given virtual address. IRQ-safe required. */
+u64 __weak perf_get_page_size(u64 virt)
+{
+	return 0;
+}
+
 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
 
 struct perf_callchain_entry *
@@ -6493,6 +6505,9 @@ void perf_prepare_sample(struct perf_event_header *header,
 
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		data->phys_addr = perf_virt_to_phys(data->addr);
+
+	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+		data->data_page_size = perf_get_page_size(data->addr);
 }
 
 static __always_inline int
-- 
2.7.4


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH V4 02/13] perf tools: Support new sample type for data page size
  2019-01-31 20:27 [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE kan.liang
@ 2019-01-31 20:27 ` kan.liang
  2019-01-31 20:27 ` [PATCH V4 03/13] perf script: Support " kan.liang
                   ` (11 subsequent siblings)
  12 siblings, 0 replies; 23+ messages in thread
From: kan.liang @ 2019-01-31 20:27 UTC (permalink / raw)
  To: peterz, acme, tglx, mingo, linux-kernel
  Cc: eranian, jolsa, namhyung, ak, luto, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

Support new sample type PERF_SAMPLE_DATA_PAGE_SIZE for page size.

Add new option --data-page-size to record sample data page size.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V3
- Use the real page size to replace enum.

 tools/include/uapi/linux/perf_event.h    |  4 +++-
 tools/perf/Documentation/perf-record.txt |  3 +++
 tools/perf/builtin-record.c              |  2 ++
 tools/perf/perf.h                        |  1 +
 tools/perf/util/event.h                  |  1 +
 tools/perf/util/evsel.c                  | 19 ++++++++++++++++++-
 6 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 7198ddd..0e8d222 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -141,8 +141,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
 	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
+	PERF_SAMPLE_DATA_PAGE_SIZE		= 1U << 20,
 
-	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
 
 	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
 };
@@ -863,6 +864,7 @@ enum perf_event_type {
 	 *	{ u64			abi; # enum perf_sample_regs_abi
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
 	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
+	 *	{ u64			data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index d232b13..c2a2875 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -264,6 +264,9 @@ OPTIONS
 --phys-data::
 	Record the sample physical addresses.
 
+--data-page-size::
+	Record the sampled data address data page size
+
 -T::
 --timestamp::
 	Record the sample timestamps. Use it with 'perf report -D' to see the
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 88ea11d..b9df9b9 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1870,6 +1870,8 @@ static struct option __record_options[] = {
 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
 		    "Record the sample physical addresses"),
+	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
+		    "Record the sampled data address data page size"),
 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
 			&record.opts.sample_time_set,
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 5941fb6..87a345a 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -48,6 +48,7 @@ struct record_opts {
 	bool	     raw_samples;
 	bool	     sample_address;
 	bool	     sample_phys_addr;
+	bool	     sample_data_page_size;
 	bool	     sample_weight;
 	bool	     sample_time;
 	bool	     sample_time_set;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index feba1aa..53d1a41 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -207,6 +207,7 @@ struct perf_sample {
 	u32 raw_size;
 	u64 data_src;
 	u64 phys_addr;
+	u64 data_page_size;
 	u32 flags;
 	u16 insn_len;
 	u8  cpumode;
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 684c893..4d9e29e 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1019,6 +1019,9 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts,
 	if (opts->sample_phys_addr)
 		perf_evsel__set_sample_bit(evsel, PHYS_ADDR);
 
+	if (opts->sample_data_page_size)
+		perf_evsel__set_sample_bit(evsel, DATA_PAGE_SIZE);
+
 	if (opts->no_buffering) {
 		attr->watermark = 0;
 		attr->wakeup_events = 1;
@@ -1561,7 +1564,7 @@ static void __p_sample_type(char *buf, size_t size, u64 value)
 		bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW),
 		bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER),
 		bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC),
-		bit_name(WEIGHT), bit_name(PHYS_ADDR),
+		bit_name(WEIGHT), bit_name(PHYS_ADDR), bit_name(DATA_PAGE_SIZE),
 		{ .name = NULL, }
 	};
 #undef bit_name
@@ -2392,6 +2395,12 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
 		array++;
 	}
 
+	data->data_page_size = 0;
+	if (type & PERF_SAMPLE_DATA_PAGE_SIZE) {
+		data->data_page_size = *array;
+		array++;
+	}
+
 	return 0;
 }
 
@@ -2544,6 +2553,9 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type,
 	if (type & PERF_SAMPLE_PHYS_ADDR)
 		result += sizeof(u64);
 
+	if (type & PERF_SAMPLE_DATA_PAGE_SIZE)
+		result += sizeof(u64);
+
 	return result;
 }
 
@@ -2713,6 +2725,11 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type,
 		array++;
 	}
 
+	if (type & PERF_SAMPLE_DATA_PAGE_SIZE) {
+		*array = sample->data_page_size;
+		array++;
+	}
+
 	return 0;
 }
 
-- 
2.7.4


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH V4 03/13] perf script: Support data page size
  2019-01-31 20:27 [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE kan.liang
  2019-01-31 20:27 ` [PATCH V4 02/13] perf tools: Support new sample type for data page size kan.liang
@ 2019-01-31 20:27 ` kan.liang
  2019-01-31 20:27 ` [PATCH V4 04/13] perf sort: Add sort option for " kan.liang
                   ` (10 subsequent siblings)
  12 siblings, 0 replies; 23+ messages in thread
From: kan.liang @ 2019-01-31 20:27 UTC (permalink / raw)
  To: peterz, acme, tglx, mingo, linux-kernel
  Cc: eranian, jolsa, namhyung, ak, luto, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

Display the data page size if it is available.

Can be configured by the user, for example:
  perf script --fields comm,event,phys_addr,data_page_size
            dtlb mem-loads:uP:        3fec82ea8 4K
            dtlb mem-loads:uP:        3fec82e90 4K
            dtlb mem-loads:uP:        3e23700a4 4K
            dtlb mem-loads:uP:        3fec82f20 4K
            dtlb mem-loads:uP:        3e23700a4 4K
            dtlb mem-loads:uP:        3b4211bec 4K
            dtlb mem-loads:uP:        382205dc0 2M
            dtlb mem-loads:uP:        36fa082c0 2M
            dtlb mem-loads:uP:        377607340 2M
            dtlb mem-loads:uP:        330010180 2M
            dtlb mem-loads:uP:        33200fd80 2M
            dtlb mem-loads:uP:        31b012b80 2M

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V3
- Use the real page size to replace enum.
- Modify the get_page_size_name()

 tools/perf/Documentation/perf-script.txt |  5 +++--
 tools/perf/builtin-script.c              | 19 +++++++++++++++++--
 tools/perf/util/event.h                  |  3 +++
 tools/perf/util/session.c                | 23 +++++++++++++++++++++++
 4 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 9e4def0..14ae84c1 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -116,8 +116,9 @@ OPTIONS
 --fields::
         Comma separated list of fields to print. Options are:
         comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
-        srcline, period, iregs, uregs, brstack, brstacksym, flags, bpf-output, brstackinsn,
-        brstackoff, callindent, insn, insnlen, synth, phys_addr, metric, misc, srccode.
+        srcline, period, iregs, uregs, brstack, brstacksym, flags, bpf-output,
+        brstackinsn, brstackoff, callindent, insn, insnlen, synth, phys_addr,
+        metric, misc, srccode, data_page_size.
         Field list can be prepended with the type, trace, sw or hw,
         to indicate to which event type the field list applies.
         e.g., -F sw:comm,tid,time,ip,sym  and -F trace:time,cpu,trace
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index d079f36..440ae80 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -27,6 +27,7 @@
 #include "util/thread-stack.h"
 #include "util/time-utils.h"
 #include "util/path.h"
+#include "util/event.h"
 #include "print_binary.h"
 #include <linux/bitmap.h>
 #include <linux/kernel.h>
@@ -97,6 +98,7 @@ enum perf_output_field {
 	PERF_OUTPUT_METRIC	    = 1U << 28,
 	PERF_OUTPUT_MISC            = 1U << 29,
 	PERF_OUTPUT_SRCCODE	    = 1U << 30,
+	PERF_OUTPUT_DATA_PAGE_SIZE  = 1U << 31,
 };
 
 struct output_option {
@@ -134,6 +136,7 @@ struct output_option {
 	{.str = "metric", .field = PERF_OUTPUT_METRIC},
 	{.str = "misc", .field = PERF_OUTPUT_MISC},
 	{.str = "srccode", .field = PERF_OUTPUT_SRCCODE},
+	{.str = "data_page_size", .field = PERF_OUTPUT_DATA_PAGE_SIZE},
 };
 
 enum {
@@ -204,7 +207,8 @@ static struct {
 			      PERF_OUTPUT_SYM | PERF_OUTPUT_SYMOFFSET |
 			      PERF_OUTPUT_DSO | PERF_OUTPUT_PERIOD |
 			      PERF_OUTPUT_ADDR | PERF_OUTPUT_DATA_SRC |
-			      PERF_OUTPUT_WEIGHT | PERF_OUTPUT_PHYS_ADDR,
+			      PERF_OUTPUT_WEIGHT | PERF_OUTPUT_PHYS_ADDR |
+			      PERF_OUTPUT_DATA_PAGE_SIZE,
 
 		.invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
 	},
@@ -468,6 +472,11 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
 					PERF_OUTPUT_PHYS_ADDR))
 		return -EINVAL;
 
+	if (PRINT_FIELD(DATA_PAGE_SIZE) &&
+		perf_evsel__check_stype(evsel, PERF_SAMPLE_DATA_PAGE_SIZE, "DATA_PAGE_SIZE",
+					PERF_OUTPUT_DATA_PAGE_SIZE))
+		return -EINVAL;
+
 	return 0;
 }
 
@@ -1762,6 +1771,7 @@ static void process_event(struct perf_script *script,
 	unsigned int type = output_type(attr->type);
 	struct perf_evsel_script *es = evsel->priv;
 	FILE *fp = es->fp;
+	char str[PAGE_SIZE_NAME_LEN];
 
 	if (output[type].fields == 0)
 		return;
@@ -1842,6 +1852,10 @@ static void process_event(struct perf_script *script,
 
 	if (PRINT_FIELD(PHYS_ADDR))
 		fprintf(fp, "%16" PRIx64, sample->phys_addr);
+
+	if (PRINT_FIELD(DATA_PAGE_SIZE))
+		fprintf(fp, " %s", get_page_size_name(sample->data_page_size, str));
+
 	fprintf(fp, "\n");
 
 	if (PRINT_FIELD(SRCCODE)) {
@@ -3337,7 +3351,8 @@ int cmd_script(int argc, const char **argv)
 		     "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
 		     "addr,symoff,srcline,period,iregs,uregs,brstack,"
 		     "brstacksym,flags,bpf-output,brstackinsn,brstackoff,"
-		     "callindent,insn,insnlen,synth,phys_addr,metric,misc",
+		     "callindent,insn,insnlen,synth,phys_addr,metric,misc,"
+		     "data_page_size",
 		     parse_output_fields),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 		    "system-wide collection from all CPUs"),
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 53d1a41..7ef06fc 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -846,4 +846,7 @@ extern int sysctl_perf_event_max_stack;
 extern int sysctl_perf_event_max_contexts_per_stack;
 extern unsigned int proc_map_timeout;
 
+#define PAGE_SIZE_NAME_LEN	10
+char *get_page_size_name(u64 size, char *str);
+
 #endif /* __PERF_RECORD_H */
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 24fd625..9ad3686 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1082,10 +1082,30 @@ static void dump_event(struct perf_evlist *evlist, union perf_event *event,
 	       event->header.size, perf_event__name(event->header.type));
 }
 
+char *get_page_size_name(u64 size, char *str)
+{
+	const char suffixes[5] = { 'B', 'K', 'M', 'G', 'T' };
+	int i;
+
+	if (size == 0) {
+		snprintf(str, PAGE_SIZE_NAME_LEN, "%s", "N/A");
+		return str;
+	}
+	for (i = 0; i < 5; i++) {
+		if (size < 1024)
+			break;
+		size /= 1024;
+	}
+
+	snprintf(str, PAGE_SIZE_NAME_LEN, "%lu%c", size, suffixes[i]);
+	return str;
+}
+
 static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
 			struct perf_sample *sample)
 {
 	u64 sample_type;
+	char str[PAGE_SIZE_NAME_LEN];
 
 	if (!dump_trace)
 		return;
@@ -1120,6 +1140,9 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		printf(" .. phys_addr: 0x%"PRIx64"\n", sample->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+		printf(" .. data page size: %s\n", get_page_size_name(sample->data_page_size, str));
+
 	if (sample_type & PERF_SAMPLE_TRANSACTION)
 		printf("... transaction: %" PRIx64 "\n", sample->transaction);
 
-- 
2.7.4


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH V4 04/13] perf sort: Add sort option for data page size
  2019-01-31 20:27 [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE kan.liang
  2019-01-31 20:27 ` [PATCH V4 02/13] perf tools: Support new sample type for data page size kan.liang
  2019-01-31 20:27 ` [PATCH V4 03/13] perf script: Support " kan.liang
@ 2019-01-31 20:27 ` kan.liang
  2019-01-31 20:27 ` [PATCH V4 05/13] perf mem: Factor out a function to generate sort order kan.liang
                   ` (9 subsequent siblings)
  12 siblings, 0 replies; 23+ messages in thread
From: kan.liang @ 2019-01-31 20:27 UTC (permalink / raw)
  To: peterz, acme, tglx, mingo, linux-kernel
  Cc: eranian, jolsa, namhyung, ak, luto, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

Add a new sort option "data_page_size" for --mem-mode sort.  With this
option applied, perf can sort and report by sample's data page size.

Here is an example.
perf report --stdio --mem-mode --sort=comm,symbol,phys_daddr,data_page_size

 # To display the perf.data header info, please use
 # --header/--header-only options.
 #
 #
 # Total Lost Samples: 0
 #
 # Samples: 9K of event 'mem-loads:uP'
 # Total weight : 9028
 # Sort order   : comm,symbol,phys_daddr,data_page_size
 #
 # Overhead  Command  Symbol                        Data Physical Address   Data Page Size
 # ........  .......  ............................  ......................  ......................
 #
    11.19%  dtlb     [.] touch_buffer              [.] 0x00000003fec82ea8  4K
     8.61%  dtlb     [.] GetTickCount              [.] 0x00000003c4f2c8a8  4K
     4.52%  dtlb     [.] GetTickCount              [.] 0x00000003fec82f58  4K
     4.33%  dtlb     [.] __gettimeofday            [.] 0x00000003fec82f48  4K
     4.32%  dtlb     [.] GetTickCount              [.] 0x00000003fec82f78  4K
     4.28%  dtlb     [.] GetTickCount              [.] 0x00000003fec82f50  4K
     4.23%  dtlb     [.] GetTickCount              [.] 0x00000003fec82f70  4K
     4.11%  dtlb     [.] GetTickCount              [.] 0x00000003fec82f68  4K
     4.00%  dtlb     [.] Calibrate                 [.] 0x00000003fec82f98  4K
     3.91%  dtlb     [.] Calibrate                 [.] 0x00000003fec82f90  4K
     3.43%  dtlb     [.] touch_buffer              [.] 0x00000003fec82e98  4K
     3.42%  dtlb     [.] touch_buffer              [.] 0x00000003fec82e90  4K
     0.09%  dtlb     [.] DoDependentLoads          [.] 0x000000036ea084c0  2M
     0.08%  dtlb     [.] DoDependentLoads          [.] 0x000000032b010b80  2M

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V3
- Apply modified get_page_size_name()

 tools/perf/Documentation/perf-report.txt |  1 +
 tools/perf/util/hist.c                   |  3 +++
 tools/perf/util/hist.h                   |  1 +
 tools/perf/util/machine.c                |  7 +++++--
 tools/perf/util/sort.c                   | 30 ++++++++++++++++++++++++++++++
 tools/perf/util/sort.h                   |  1 +
 tools/perf/util/symbol.h                 |  1 +
 7 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 1a27bfe..2ca0477 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -147,6 +147,7 @@ OPTIONS
 	- snoop: type of snoop (if any) for the data at the time of the sample
 	- dcacheline: the cacheline the data address is on at the time of the sample
 	- phys_daddr: physical address of data being executed on at the time of sample
+	- data_page_size: the data page size of data being executed on at the time of sample
 
 	And the default sort keys are changed to local_weight, mem, sym, dso,
 	symbol_daddr, dso_daddr, snoop, tlb, locked, see '--mem-mode'.
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 9e7a8e0..253bf3f 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -174,6 +174,9 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 		hists__new_col_len(hists, HISTC_MEM_PHYS_DADDR,
 				   unresolved_col_width + 4 + 2);
 
+		hists__new_col_len(hists, HISTC_MEM_DATA_PAGE_SIZE,
+				   unresolved_col_width + 4 + 2);
+
 	} else {
 		symlen = unresolved_col_width + 4 + 2;
 		hists__new_col_len(hists, HISTC_MEM_DADDR_SYMBOL, symlen);
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 08267af..2b72d03 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -49,6 +49,7 @@ enum hist_column {
 	HISTC_MEM_DADDR_SYMBOL,
 	HISTC_MEM_DADDR_DSO,
 	HISTC_MEM_PHYS_DADDR,
+	HISTC_MEM_DATA_PAGE_SIZE,
 	HISTC_MEM_LOCKED,
 	HISTC_MEM_TLB,
 	HISTC_MEM_LVL,
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 66f019f..9e54725 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1914,11 +1914,12 @@ static void ip__resolve_ams(struct thread *thread,
 	ams->sym = al.sym;
 	ams->map = al.map;
 	ams->phys_addr = 0;
+	ams->data_page_size = 0;
 }
 
 static void ip__resolve_data(struct thread *thread,
 			     u8 m, struct addr_map_symbol *ams,
-			     u64 addr, u64 phys_addr)
+			     u64 addr, u64 phys_addr, u64 daddr_page_size)
 {
 	struct addr_location al;
 
@@ -1931,6 +1932,7 @@ static void ip__resolve_data(struct thread *thread,
 	ams->sym = al.sym;
 	ams->map = al.map;
 	ams->phys_addr = phys_addr;
+	ams->data_page_size = daddr_page_size;
 }
 
 struct mem_info *sample__resolve_mem(struct perf_sample *sample,
@@ -1943,7 +1945,8 @@ struct mem_info *sample__resolve_mem(struct perf_sample *sample,
 
 	ip__resolve_ams(al->thread, &mi->iaddr, sample->ip);
 	ip__resolve_data(al->thread, al->cpumode, &mi->daddr,
-			 sample->addr, sample->phys_addr);
+			 sample->addr, sample->phys_addr,
+			 sample->data_page_size);
 	mi->data_src.val = sample->data_src;
 
 	return mi;
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 6c1a837..1e0bb0c 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -1393,6 +1393,35 @@ struct sort_entry sort_mem_phys_daddr = {
 };
 
 static int64_t
+sort__data_page_size_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	uint64_t l = 0, r = 0;
+
+	if (left->mem_info)
+		l = left->mem_info->daddr.data_page_size;
+	if (right->mem_info)
+		r = right->mem_info->daddr.data_page_size;
+
+	return (int64_t)(r - l);
+}
+
+static int hist_entry__data_page_size_snprintf(struct hist_entry *he, char *bf,
+					  size_t size, unsigned int width)
+{
+	char str[PAGE_SIZE_NAME_LEN];
+
+	return repsep_snprintf(bf, size, "%-*s", width,
+			       get_page_size_name(he->mem_info->daddr.data_page_size, str));
+}
+
+struct sort_entry sort_mem_data_page_size = {
+	.se_header	= "Data Page Size",
+	.se_cmp		= sort__data_page_size_cmp,
+	.se_snprintf	= hist_entry__data_page_size_snprintf,
+	.se_width_idx	= HISTC_MEM_DATA_PAGE_SIZE,
+};
+
+static int64_t
 sort__abort_cmp(struct hist_entry *left, struct hist_entry *right)
 {
 	if (!left->branch_info || !right->branch_info)
@@ -1668,6 +1697,7 @@ static struct sort_dimension memory_sort_dimensions[] = {
 	DIM(SORT_MEM_SNOOP, "snoop", sort_mem_snoop),
 	DIM(SORT_MEM_DCACHELINE, "dcacheline", sort_mem_dcacheline),
 	DIM(SORT_MEM_PHYS_DADDR, "phys_daddr", sort_mem_phys_daddr),
+	DIM(SORT_MEM_DATA_PAGE_SIZE, "data_page_size", sort_mem_data_page_size),
 };
 
 #undef DIM
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index dd63128..2e324ae 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -256,6 +256,7 @@ enum sort_type {
 	SORT_MEM_DCACHELINE,
 	SORT_MEM_IADDR_SYMBOL,
 	SORT_MEM_PHYS_DADDR,
+	SORT_MEM_DATA_PAGE_SIZE,
 };
 
 /*
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 56e2bcb..3c563ab 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -126,6 +126,7 @@ struct addr_map_symbol {
 	u64	      addr;
 	u64	      al_addr;
 	u64	      phys_addr;
+	u64	      data_page_size;
 };
 
 struct branch_info {
-- 
2.7.4


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH V4 05/13] perf mem: Factor out a function to generate sort order
  2019-01-31 20:27 [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE kan.liang
                   ` (2 preceding siblings ...)
  2019-01-31 20:27 ` [PATCH V4 04/13] perf sort: Add sort option for " kan.liang
@ 2019-01-31 20:27 ` kan.liang
  2019-01-31 20:27 ` [PATCH V4 06/13] perf mem: Clean up output format kan.liang
                   ` (8 subsequent siblings)
  12 siblings, 0 replies; 23+ messages in thread
From: kan.liang @ 2019-01-31 20:27 UTC (permalink / raw)
  To: peterz, acme, tglx, mingo, linux-kernel
  Cc: eranian, jolsa, namhyung, ak, luto, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

Now, "--phys-data" is the only option which impacts the sort order.
A simple "if else" is enough to handle the option. But there will be
more options added, e.g. "--data-page-size", which also impact the sort
order. The code will become too complex to be maintained.

Divide the sort order string into several small pieces.
The first piece is always the default sort string for LOAD/STORE.
Append the specific sort string if the related option is applied.

No functional change.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

No changes since V3

 tools/perf/builtin-mem.c | 41 +++++++++++++++++++++++++++--------------
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 57393e9..0647bd7 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -273,11 +273,35 @@ static int report_raw_events(struct perf_mem *mem)
 	perf_session__delete(session);
 	return ret;
 }
+static char *get_sort_order(struct perf_mem *mem)
+{
+	bool has_extra_options = mem->phys_addr ? true : false;
+	char sort[128];
+
+	/*
+	 * there is no weight (cost) associated with stores, so don't print
+	 * the column
+	 */
+	if (!(mem->operation & MEM_OPERATION_LOAD)) {
+		strcpy(sort, "--sort=mem,sym,dso,symbol_daddr,"
+			     "dso_daddr,tlb,locked");
+	} else if (has_extra_options) {
+		strcpy(sort, "--sort=local_weight,mem,sym,dso,symbol_daddr,"
+			     "dso_daddr,snoop,tlb,locked");
+	} else
+		return NULL;
+
+	if (mem->phys_addr)
+		strcat(sort, ",phys_daddr");
+
+	return strdup(sort);
+}
 
 static int report_events(int argc, const char **argv, struct perf_mem *mem)
 {
 	const char **rep_argv;
 	int ret, i = 0, j, rep_argc;
+	char *new_sort_order;
 
 	if (mem->dump_raw)
 		return report_raw_events(mem);
@@ -291,20 +315,9 @@ static int report_events(int argc, const char **argv, struct perf_mem *mem)
 	rep_argv[i++] = "--mem-mode";
 	rep_argv[i++] = "-n"; /* display number of samples */
 
-	/*
-	 * there is no weight (cost) associated with stores, so don't print
-	 * the column
-	 */
-	if (!(mem->operation & MEM_OPERATION_LOAD)) {
-		if (mem->phys_addr)
-			rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr,"
-					"dso_daddr,tlb,locked,phys_daddr";
-		else
-			rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr,"
-					"dso_daddr,tlb,locked";
-	} else if (mem->phys_addr)
-		rep_argv[i++] = "--sort=local_weight,mem,sym,dso,symbol_daddr,"
-				"dso_daddr,snoop,tlb,locked,phys_daddr";
+	new_sort_order = get_sort_order(mem);
+	if (new_sort_order)
+		rep_argv[i++] = new_sort_order;
 
 	for (j = 1; j < argc; j++, i++)
 		rep_argv[i] = argv[j];
-- 
2.7.4


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH V4 06/13] perf mem: Clean up output format
  2019-01-31 20:27 [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE kan.liang
                   ` (3 preceding siblings ...)
  2019-01-31 20:27 ` [PATCH V4 05/13] perf mem: Factor out a function to generate sort order kan.liang
@ 2019-01-31 20:27 ` kan.liang
  2019-01-31 20:28 ` [PATCH V4 07/13] perf mem: Support data page size kan.liang
                   ` (7 subsequent siblings)
  12 siblings, 0 replies; 23+ messages in thread
From: kan.liang @ 2019-01-31 20:27 UTC (permalink / raw)
  To: peterz, acme, tglx, mingo, linux-kernel
  Cc: eranian, jolsa, namhyung, ak, luto, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

Currently, "--phys-data" is the only option that affects the output
format, so a simple "if else" is enough to handle it. But more options
will be added, e.g. "--data-page-size", which also affect the output
format, and the code would become too complex to maintain.

Divide the big printf into several small pieces. Output the specific
piece only if the related option is applied.

No functional change.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

No changes since V3

 tools/perf/builtin-mem.c | 93 ++++++++++++++++++++----------------------------
 1 file changed, 38 insertions(+), 55 deletions(-)

diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 0647bd7..24acc2a 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -153,7 +153,7 @@ dump_raw_samples(struct perf_tool *tool,
 {
 	struct perf_mem *mem = container_of(tool, struct perf_mem, tool);
 	struct addr_location al;
-	const char *fmt;
+	const char *fmt, *field_sep;
 
 	if (machine__resolve(machine, &al, sample) < 0) {
 		fprintf(stderr, "problem processing %d event, skipping it.\n",
@@ -167,60 +167,41 @@ dump_raw_samples(struct perf_tool *tool,
 	if (al.map != NULL)
 		al.map->dso->hit = 1;
 
-	if (mem->phys_addr) {
-		if (symbol_conf.field_sep) {
-			fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s0x%016"PRIx64
-			      "%s%"PRIu64"%s0x%"PRIx64"%s%s:%s\n";
-		} else {
-			fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64
-			      "%s0x%016"PRIx64"%s%5"PRIu64"%s0x%06"PRIx64
-			      "%s%s:%s\n";
-			symbol_conf.field_sep = " ";
-		}
-
-		printf(fmt,
-			sample->pid,
-			symbol_conf.field_sep,
-			sample->tid,
-			symbol_conf.field_sep,
-			sample->ip,
-			symbol_conf.field_sep,
-			sample->addr,
-			symbol_conf.field_sep,
-			sample->phys_addr,
-			symbol_conf.field_sep,
-			sample->weight,
-			symbol_conf.field_sep,
-			sample->data_src,
-			symbol_conf.field_sep,
-			al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???",
-			al.sym ? al.sym->name : "???");
+	field_sep = symbol_conf.field_sep;
+	if (field_sep) {
+		fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s";
 	} else {
-		if (symbol_conf.field_sep) {
-			fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s%"PRIu64
-			      "%s0x%"PRIx64"%s%s:%s\n";
-		} else {
-			fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64
-			      "%s%5"PRIu64"%s0x%06"PRIx64"%s%s:%s\n";
-			symbol_conf.field_sep = " ";
-		}
+		fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64"%s";
+		symbol_conf.field_sep = " ";
+	}
+	printf(fmt,
+		sample->pid,
+		symbol_conf.field_sep,
+		sample->tid,
+		symbol_conf.field_sep,
+		sample->ip,
+		symbol_conf.field_sep,
+		sample->addr,
+		symbol_conf.field_sep);
 
-		printf(fmt,
-			sample->pid,
-			symbol_conf.field_sep,
-			sample->tid,
-			symbol_conf.field_sep,
-			sample->ip,
-			symbol_conf.field_sep,
-			sample->addr,
-			symbol_conf.field_sep,
-			sample->weight,
-			symbol_conf.field_sep,
-			sample->data_src,
-			symbol_conf.field_sep,
-			al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???",
-			al.sym ? al.sym->name : "???");
+	if (mem->phys_addr) {
+		printf("0x%016"PRIx64"%s",
+			sample->phys_addr,
+			symbol_conf.field_sep);
 	}
+
+	if (field_sep)
+		fmt = "%"PRIu64"%s0x%"PRIx64"%s%s:%s\n";
+	else
+		fmt = "%5"PRIu64"%s0x%06"PRIx64"%s%s:%s\n";
+
+	printf(fmt,
+		sample->weight,
+		symbol_conf.field_sep,
+		sample->data_src,
+		symbol_conf.field_sep,
+		al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???",
+		al.sym ? al.sym->name : "???");
 out_put:
 	addr_location__put(&al);
 	return 0;
@@ -262,10 +243,12 @@ static int report_raw_events(struct perf_mem *mem)
 	if (ret < 0)
 		goto out_delete;
 
+	printf("# PID, TID, IP, ADDR, ");
+
 	if (mem->phys_addr)
-		printf("# PID, TID, IP, ADDR, PHYS ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n");
-	else
-		printf("# PID, TID, IP, ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n");
+		printf("PHYS ADDR, ");
+
+	printf("LOCAL WEIGHT, DSRC, SYMBOL\n");
 
 	ret = perf_session__process_events(session);
 
-- 
2.7.4


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH V4 07/13] perf mem: Support data page size
  2019-01-31 20:27 [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE kan.liang
                   ` (4 preceding siblings ...)
  2019-01-31 20:27 ` [PATCH V4 06/13] perf mem: Clean up output format kan.liang
@ 2019-01-31 20:28 ` kan.liang
  2019-01-31 20:28 ` [PATCH V4 08/13] perf test: Add test case for PERF_SAMPLE_DATA_PAGE_SIZE kan.liang
                   ` (6 subsequent siblings)
  12 siblings, 0 replies; 23+ messages in thread
From: kan.liang @ 2019-01-31 20:28 UTC (permalink / raw)
  To: peterz, acme, tglx, mingo, linux-kernel
  Cc: eranian, jolsa, namhyung, ak, luto, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

Add option --data-page-size in "perf mem" to record/report data page
size.

Here are some examples.
perf mem --phys-data --data-page-size report -D

 # PID, TID, IP, ADDR, PHYS ADDR, DATA PAGE SIZE, LOCAL WEIGHT, DSRC,
 # SYMBOL
20134 20134 0xffffffffb5bd2fd0 0x016ffff9a274e96a308 0x000000044e96a308
4K  1168 0x5080144
/lib/modules/4.18.0-rc7+/build/vmlinux:perf_ctx_unlock
20134 20134 0xffffffffb63f645c 0xffffffffb752b814 0xcfb52b814 2M 225
0x26a100142 /lib/modules/4.18.0-rc7+/build/vmlinux:_raw_spin_lock
20134 20134 0xffffffffb660300c 0xfffffe00016b8bb0 0x0 4K 0 0x5080144
/lib/modules/4.18.0-rc7+/build/vmlinux:__x86_indirect_thunk_rax

perf mem --phys-data --data-page-size report --stdio

 # To display the perf.data header info, please use
 # --header/--header-only options.
 #
 #
 # Total Lost Samples: 0
 #
 # Samples: 5K of event 'cpu/mem-loads,ldlat=30/P'
 # Total weight : 281234
 # Sort order   :
 # mem,sym,dso,symbol_daddr,dso_daddr,tlb,locked,phys_daddr,data_page_size
 #
 # Overhead       Samples  Memory access             Symbol
 # Shared Object     Data Symbol                                  Data
 # Object              TLB access              Locked  Data Physical
 # Address   Data Page Size
 # ........  ............  ........................
 # ................................  ................
 # ...........................................  .......................
 # ......................  ......  ......................
 # ......................
 #
    28.54%          1826  L1 or L1 hit              [k]
__x86_indirect_thunk_rax      [kernel.vmlinux]  [k] 0xffffb0df31b0ff28
[unknown]                L1 or L2 hit            No      [k]
0000000000000000    4K
     6.02%           256  L1 or L1 hit              [.] touch_buffer
dtlb              [.] 0x00007ffd50109da8                       [stack]
L1 or L2 hit            No      [.] 0x000000042454ada8  4K
     3.23%             5  L1 or L1 hit              [k] clear_huge_page
[kernel.vmlinux]  [k] 0xffff9a2753b8ce60                       [unknown]
L1 or L2 hit            No      [k] 0x0000000453b8ce60  2M
     2.98%             4  L1 or L1 hit              [k] clear_page_erms
[kernel.vmlinux]  [k] 0xffffb0df31b0fd00                       [unknown]
L1 or L2 hit            No      [k] 0000000000000000    4K

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V3
- Apply modified get_page_size_name()

 tools/perf/Documentation/perf-mem.txt |  3 +++
 tools/perf/builtin-mem.c              | 20 +++++++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt
index f8d2167..9110692 100644
--- a/tools/perf/Documentation/perf-mem.txt
+++ b/tools/perf/Documentation/perf-mem.txt
@@ -63,6 +63,9 @@ OPTIONS
 --phys-data::
 	Record/Report sample physical addresses
 
+--data-page-size::
+	Record/Report sample data address page size
+
 RECORD OPTIONS
 --------------
 -e::
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 24acc2a..d14c28f 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -25,6 +25,7 @@ struct perf_mem {
 	bool			dump_raw;
 	bool			force;
 	bool			phys_addr;
+	bool			data_page_size;
 	int			operation;
 	const char		*cpu_list;
 	DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
@@ -106,6 +107,9 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
 	if (mem->phys_addr)
 		rec_argv[i++] = "--phys-data";
 
+	if (mem->data_page_size)
+		rec_argv[i++] = "--data-page-size";
+
 	for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
 		if (!perf_mem_events[j].record)
 			continue;
@@ -154,6 +158,7 @@ dump_raw_samples(struct perf_tool *tool,
 	struct perf_mem *mem = container_of(tool, struct perf_mem, tool);
 	struct addr_location al;
 	const char *fmt, *field_sep;
+	char str[PAGE_SIZE_NAME_LEN];
 
 	if (machine__resolve(machine, &al, sample) < 0) {
 		fprintf(stderr, "problem processing %d event, skipping it.\n",
@@ -190,6 +195,12 @@ dump_raw_samples(struct perf_tool *tool,
 			symbol_conf.field_sep);
 	}
 
+	if (mem->data_page_size) {
+		printf("%s%s",
+			get_page_size_name(sample->data_page_size, str),
+			symbol_conf.field_sep);
+	}
+
 	if (field_sep)
 		fmt = "%"PRIu64"%s0x%"PRIx64"%s%s:%s\n";
 	else
@@ -248,6 +259,9 @@ static int report_raw_events(struct perf_mem *mem)
 	if (mem->phys_addr)
 		printf("PHYS ADDR, ");
 
+	if (mem->data_page_size)
+		printf("DATA PAGE SIZE, ");
+
 	printf("LOCAL WEIGHT, DSRC, SYMBOL\n");
 
 	ret = perf_session__process_events(session);
@@ -258,7 +272,7 @@ static int report_raw_events(struct perf_mem *mem)
 }
 static char *get_sort_order(struct perf_mem *mem)
 {
-	bool has_extra_options = mem->phys_addr ? true : false;
+	bool has_extra_options = (mem->phys_addr | mem->data_page_size) ? true : false;
 	char sort[128];
 
 	/*
@@ -277,6 +291,9 @@ static char *get_sort_order(struct perf_mem *mem)
 	if (mem->phys_addr)
 		strcat(sort, ",phys_daddr");
 
+	if (mem->data_page_size)
+		strcat(sort, ",data_page_size");
+
 	return strdup(sort);
 }
 
@@ -418,6 +435,7 @@ int cmd_mem(int argc, const char **argv)
 		   " between columns '.' is reserved."),
 	OPT_BOOLEAN('f', "force", &mem.force, "don't complain, do it"),
 	OPT_BOOLEAN('p', "phys-data", &mem.phys_addr, "Record/Report sample physical addresses"),
+	OPT_BOOLEAN(0, "data-page-size", &mem.data_page_size, "Record/Report sample data address page size"),
 	OPT_END()
 	};
 	const char *const mem_subcommands[] = { "record", "report", NULL };
-- 
2.7.4


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH V4 08/13] perf test: Add test case for PERF_SAMPLE_DATA_PAGE_SIZE
  2019-01-31 20:27 [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE kan.liang
                   ` (5 preceding siblings ...)
  2019-01-31 20:28 ` [PATCH V4 07/13] perf mem: Support data page size kan.liang
@ 2019-01-31 20:28 ` kan.liang
  2019-01-31 20:28 ` [PATCH V4 09/13] perf/core, x86: Add support for PERF_SAMPLE_CODE_PAGE_SIZE kan.liang
                   ` (5 subsequent siblings)
  12 siblings, 0 replies; 23+ messages in thread
From: kan.liang @ 2019-01-31 20:28 UTC (permalink / raw)
  To: peterz, acme, tglx, mingo, linux-kernel
  Cc: eranian, jolsa, namhyung, ak, luto, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

Extend sample-parsing test cases to support new sample type
PERF_SAMPLE_DATA_PAGE_SIZE.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V3
- Use the real page size to replace enum.

 tools/perf/tests/sample-parsing.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c
index 0e2d00d..cf74f9d 100644
--- a/tools/perf/tests/sample-parsing.c
+++ b/tools/perf/tests/sample-parsing.c
@@ -145,6 +145,9 @@ static bool samples_same(const struct perf_sample *s1,
 	if (type & PERF_SAMPLE_PHYS_ADDR)
 		COMP(phys_addr);
 
+	if (type & PERF_SAMPLE_DATA_PAGE_SIZE)
+		COMP(data_page_size);
+
 	return true;
 }
 
@@ -210,7 +213,9 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format)
 			.mask	= sample_regs,
 			.regs	= regs,
 		},
+
 		.phys_addr	= 113,
+		.data_page_size	= 4096,
 	};
 	struct sample_read_value values[] = {{1, 5}, {9, 3}, {2, 7}, {6, 4},};
 	struct perf_sample sample_out;
@@ -310,7 +315,7 @@ int test__sample_parsing(struct test *test __maybe_unused, int subtest __maybe_u
 	 * were added.  Please actually update the test rather than just change
 	 * the condition below.
 	 */
-	if (PERF_SAMPLE_MAX > PERF_SAMPLE_PHYS_ADDR << 1) {
+	if (PERF_SAMPLE_MAX > PERF_SAMPLE_DATA_PAGE_SIZE << 1) {
 		pr_debug("sample format has changed, some new PERF_SAMPLE_ bit was introduced - test needs updating\n");
 		return -1;
 	}
-- 
2.7.4


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH V4 09/13] perf/core, x86: Add support for PERF_SAMPLE_CODE_PAGE_SIZE
  2019-01-31 20:27 [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE kan.liang
                   ` (6 preceding siblings ...)
  2019-01-31 20:28 ` [PATCH V4 08/13] perf test: Add test case for PERF_SAMPLE_DATA_PAGE_SIZE kan.liang
@ 2019-01-31 20:28 ` kan.liang
  2019-01-31 20:28 ` [PATCH V4 10/13] perf tools: " kan.liang
                   ` (4 subsequent siblings)
  12 siblings, 0 replies; 23+ messages in thread
From: kan.liang @ 2019-01-31 20:28 UTC (permalink / raw)
  To: peterz, acme, tglx, mingo, linux-kernel
  Cc: eranian, jolsa, namhyung, ak, luto, Kan Liang

From: Stephane Eranian <eranian@google.com>

When studying code layout, it is useful to capture the page size of the
sampled code address.

Add a new sample type for code page size.
The new sample type requires collecting the ip. The code page size can
be calculated from the IRQ-safe perf_get_page_size().

Only the generic support is covered. The large PEBS will be disabled
with this sample type.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

No changes since V3

 include/linux/perf_event.h      |  1 +
 include/uapi/linux/perf_event.h |  4 +++-
 kernel/events/core.c            | 11 ++++++++++-
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0e048ab..10c23f9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -938,6 +938,7 @@ struct perf_sample_data {
 
 	u64				phys_addr;
 	u64				data_page_size;
+	u64				code_page_size;
 } ____cacheline_aligned;
 
 /* default value for data source */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 0e8d222..045e218 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -142,8 +142,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
 	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
 	PERF_SAMPLE_DATA_PAGE_SIZE		= 1U << 20,
+	PERF_SAMPLE_CODE_PAGE_SIZE		= 1U << 21,
 
-	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 22,		/* non-ABI */
 
 	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
 };
@@ -865,6 +866,7 @@ enum perf_event_type {
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
 	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
 	 *	{ u64			data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
+	 *	{ u64			code_page_size;} && PERF_SAMPLE_CODE_PAGE_SIZE
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d233f45..a1575b4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1756,6 +1756,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
 		size += sizeof(data->data_page_size);
 
+	if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+		size += sizeof(data->code_page_size);
+
 	event->header_size = size;
 }
 
@@ -6311,6 +6314,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
 		perf_output_put(handle, data->data_page_size);
 
+	if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+		perf_output_put(handle, data->code_page_size);
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -6399,7 +6405,7 @@ void perf_prepare_sample(struct perf_event_header *header,
 
 	__perf_event_header__init_id(header, data, event);
 
-	if (sample_type & PERF_SAMPLE_IP)
+	if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
 		data->ip = perf_instruction_pointer(regs);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
@@ -6508,6 +6514,9 @@ void perf_prepare_sample(struct perf_event_header *header,
 
 	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
 		data->data_page_size = perf_get_page_size(data->addr);
+
+	if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+		data->code_page_size = perf_get_page_size(data->ip);
 }
 
 static __always_inline int
-- 
2.7.4


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH V4 10/13] perf tools: Add support for PERF_SAMPLE_CODE_PAGE_SIZE
  2019-01-31 20:27 [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE kan.liang
                   ` (7 preceding siblings ...)
  2019-01-31 20:28 ` [PATCH V4 09/13] perf/core, x86: Add support for PERF_SAMPLE_CODE_PAGE_SIZE kan.liang
@ 2019-01-31 20:28 ` kan.liang
  2019-01-31 20:28 ` [PATCH V4 11/13] perf script: " kan.liang
                   ` (3 subsequent siblings)
  12 siblings, 0 replies; 23+ messages in thread
From: kan.liang @ 2019-01-31 20:28 UTC (permalink / raw)
  To: peterz, acme, tglx, mingo, linux-kernel
  Cc: eranian, jolsa, namhyung, ak, luto, Kan Liang

From: Stephane Eranian <eranian@google.com>

Adds the infrastructure to sample the code address page size.

Introduce a new --code-page-size option for perf record.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V3
- Use the real page size to replace enum.

 tools/include/uapi/linux/perf_event.h    |  4 +++-
 tools/perf/Documentation/perf-record.txt |  3 +++
 tools/perf/builtin-record.c              |  2 ++
 tools/perf/perf.h                        |  1 +
 tools/perf/util/event.h                  |  1 +
 tools/perf/util/evsel.c                  | 18 ++++++++++++++++++
 6 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 0e8d222..045e218 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -142,8 +142,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
 	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
 	PERF_SAMPLE_DATA_PAGE_SIZE		= 1U << 20,
+	PERF_SAMPLE_CODE_PAGE_SIZE		= 1U << 21,
 
-	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 22,		/* non-ABI */
 
 	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
 };
@@ -865,6 +866,7 @@ enum perf_event_type {
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
 	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
 	 *	{ u64			data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
+	 *	{ u64			code_page_size;} && PERF_SAMPLE_CODE_PAGE_SIZE
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index c2a2875..edaaeca 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -267,6 +267,9 @@ OPTIONS
 --data-page-size::
 	Record the sampled data address data page size
 
+--code-page-size::
+	Record the sampled code address (ip) page size
+
 -T::
 --timestamp::
 	Record the sample timestamps. Use it with 'perf report -D' to see the
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index b9df9b9..cac5804 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1872,6 +1872,8 @@ static struct option __record_options[] = {
 		    "Record the sample physical addresses"),
 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
 		    "Record the sampled data address data page size"),
+	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
+		    "Record the sampled code address page size"),
 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
 			&record.opts.sample_time_set,
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 87a345a..9a2348d 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -49,6 +49,7 @@ struct record_opts {
 	bool	     sample_address;
 	bool	     sample_phys_addr;
 	bool	     sample_data_page_size;
+	bool	     sample_code_page_size;
 	bool	     sample_weight;
 	bool	     sample_time;
 	bool	     sample_time_set;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 7ef06fc..601e085 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -208,6 +208,7 @@ struct perf_sample {
 	u64 data_src;
 	u64 phys_addr;
 	u64 data_page_size;
+	u64 code_page_size;
 	u32 flags;
 	u16 insn_len;
 	u8  cpumode;
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 4d9e29e..541de5b 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1022,6 +1022,9 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts,
 	if (opts->sample_data_page_size)
 		perf_evsel__set_sample_bit(evsel, DATA_PAGE_SIZE);
 
+	if (opts->sample_code_page_size)
+		perf_evsel__set_sample_bit(evsel, CODE_PAGE_SIZE);
+
 	if (opts->no_buffering) {
 		attr->watermark = 0;
 		attr->wakeup_events = 1;
@@ -1565,6 +1568,7 @@ static void __p_sample_type(char *buf, size_t size, u64 value)
 		bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER),
 		bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC),
 		bit_name(WEIGHT), bit_name(PHYS_ADDR), bit_name(DATA_PAGE_SIZE),
+		bit_name(CODE_PAGE_SIZE),
 		{ .name = NULL, }
 	};
 #undef bit_name
@@ -2401,6 +2405,12 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
 		array++;
 	}
 
+	data->code_page_size = 0;
+	if (type & PERF_SAMPLE_CODE_PAGE_SIZE) {
+		data->code_page_size = *array;
+		array++;
+	}
+
 	return 0;
 }
 
@@ -2556,6 +2566,9 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type,
 	if (type & PERF_SAMPLE_DATA_PAGE_SIZE)
 		result += sizeof(u64);
 
+	if (type & PERF_SAMPLE_CODE_PAGE_SIZE)
+		result += sizeof(u64);
+
 	return result;
 }
 
@@ -2730,6 +2743,11 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type,
 		array++;
 	}
 
+	if (type & PERF_SAMPLE_CODE_PAGE_SIZE) {
+		*array = sample->code_page_size;
+		array++;
+	}
+
 	return 0;
 }
 
-- 
2.7.4


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH V4 11/13] perf script: Add support for PERF_SAMPLE_CODE_PAGE_SIZE
  2019-01-31 20:27 [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE kan.liang
                   ` (8 preceding siblings ...)
  2019-01-31 20:28 ` [PATCH V4 10/13] perf tools: " kan.liang
@ 2019-01-31 20:28 ` kan.liang
  2019-01-31 20:28 ` [PATCH V4 12/13] perf report: " kan.liang
                   ` (2 subsequent siblings)
  12 siblings, 0 replies; 23+ messages in thread
From: kan.liang @ 2019-01-31 20:28 UTC (permalink / raw)
  To: peterz, acme, tglx, mingo, linux-kernel
  Cc: eranian, jolsa, namhyung, ak, luto, Kan Liang

From: Stephane Eranian <eranian@google.com>

Add a new perf script field called code_page_size. With it there are
more than 32 fields, so widen the enum perf_output_field constants from
1U to 1ULL.
Display the sampled code page size when PERF_SAMPLE_CODE_PAGE_SIZE is
set.

For example,
perf script --fields comm,event,ip,code_page_size
            dtlb mem-loads:uP:            445777 4K
            dtlb mem-loads:uP:            40f724 4K
            dtlb mem-loads:uP:            474926 4K
            dtlb mem-loads:uP:            401075 4K
            dtlb mem-loads:uP:            401095 4K
            dtlb mem-loads:uP:            401095 4K
            dtlb mem-loads:uP:            4010cc 4K
            dtlb mem-loads:uP:            440b6f 4K

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V3
- Apply modified get_page_size_name()

 tools/perf/Documentation/perf-script.txt |  2 +-
 tools/perf/builtin-script.c              | 76 ++++++++++++++++++--------------
 tools/perf/util/session.c                |  3 ++
 3 files changed, 46 insertions(+), 35 deletions(-)

diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 14ae84c1..08c6deb 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -118,7 +118,7 @@ OPTIONS
         comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
         srcline, period, iregs, uregs, brstack, brstacksym, flags, bpf-output,
         brstackinsn, brstackoff, callindent, insn, insnlen, synth, phys_addr,
-        metric, misc, srccode, data_page_size.
+        metric, misc, srccode, data_page_size, code_page_size.
         Field list can be prepended with the type, trace, sw or hw,
         to indicate to which event type the field list applies.
         e.g., -F sw:comm,tid,time,ip,sym  and -F trace:time,cpu,trace
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 440ae80..5db8582 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -67,38 +67,39 @@ static int			max_blocks;
 unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH;
 
 enum perf_output_field {
-	PERF_OUTPUT_COMM            = 1U << 0,
-	PERF_OUTPUT_TID             = 1U << 1,
-	PERF_OUTPUT_PID             = 1U << 2,
-	PERF_OUTPUT_TIME            = 1U << 3,
-	PERF_OUTPUT_CPU             = 1U << 4,
-	PERF_OUTPUT_EVNAME          = 1U << 5,
-	PERF_OUTPUT_TRACE           = 1U << 6,
-	PERF_OUTPUT_IP              = 1U << 7,
-	PERF_OUTPUT_SYM             = 1U << 8,
-	PERF_OUTPUT_DSO             = 1U << 9,
-	PERF_OUTPUT_ADDR            = 1U << 10,
-	PERF_OUTPUT_SYMOFFSET       = 1U << 11,
-	PERF_OUTPUT_SRCLINE         = 1U << 12,
-	PERF_OUTPUT_PERIOD          = 1U << 13,
-	PERF_OUTPUT_IREGS	    = 1U << 14,
-	PERF_OUTPUT_BRSTACK	    = 1U << 15,
-	PERF_OUTPUT_BRSTACKSYM	    = 1U << 16,
-	PERF_OUTPUT_DATA_SRC	    = 1U << 17,
-	PERF_OUTPUT_WEIGHT	    = 1U << 18,
-	PERF_OUTPUT_BPF_OUTPUT	    = 1U << 19,
-	PERF_OUTPUT_CALLINDENT	    = 1U << 20,
-	PERF_OUTPUT_INSN	    = 1U << 21,
-	PERF_OUTPUT_INSNLEN	    = 1U << 22,
-	PERF_OUTPUT_BRSTACKINSN	    = 1U << 23,
-	PERF_OUTPUT_BRSTACKOFF	    = 1U << 24,
-	PERF_OUTPUT_SYNTH           = 1U << 25,
-	PERF_OUTPUT_PHYS_ADDR       = 1U << 26,
-	PERF_OUTPUT_UREGS	    = 1U << 27,
-	PERF_OUTPUT_METRIC	    = 1U << 28,
-	PERF_OUTPUT_MISC            = 1U << 29,
-	PERF_OUTPUT_SRCCODE	    = 1U << 30,
-	PERF_OUTPUT_DATA_PAGE_SIZE  = 1U << 31,
+	PERF_OUTPUT_COMM            = 1ULL << 0,
+	PERF_OUTPUT_TID             = 1ULL << 1,
+	PERF_OUTPUT_PID             = 1ULL << 2,
+	PERF_OUTPUT_TIME            = 1ULL << 3,
+	PERF_OUTPUT_CPU             = 1ULL << 4,
+	PERF_OUTPUT_EVNAME          = 1ULL << 5,
+	PERF_OUTPUT_TRACE           = 1ULL << 6,
+	PERF_OUTPUT_IP              = 1ULL << 7,
+	PERF_OUTPUT_SYM             = 1ULL << 8,
+	PERF_OUTPUT_DSO             = 1ULL << 9,
+	PERF_OUTPUT_ADDR            = 1ULL << 10,
+	PERF_OUTPUT_SYMOFFSET       = 1ULL << 11,
+	PERF_OUTPUT_SRCLINE         = 1ULL << 12,
+	PERF_OUTPUT_PERIOD          = 1ULL << 13,
+	PERF_OUTPUT_IREGS	    = 1ULL << 14,
+	PERF_OUTPUT_BRSTACK	    = 1ULL << 15,
+	PERF_OUTPUT_BRSTACKSYM	    = 1ULL << 16,
+	PERF_OUTPUT_DATA_SRC	    = 1ULL << 17,
+	PERF_OUTPUT_WEIGHT	    = 1ULL << 18,
+	PERF_OUTPUT_BPF_OUTPUT	    = 1ULL << 19,
+	PERF_OUTPUT_CALLINDENT	    = 1ULL << 20,
+	PERF_OUTPUT_INSN	    = 1ULL << 21,
+	PERF_OUTPUT_INSNLEN	    = 1ULL << 22,
+	PERF_OUTPUT_BRSTACKINSN	    = 1ULL << 23,
+	PERF_OUTPUT_BRSTACKOFF	    = 1ULL << 24,
+	PERF_OUTPUT_SYNTH           = 1ULL << 25,
+	PERF_OUTPUT_PHYS_ADDR       = 1ULL << 26,
+	PERF_OUTPUT_UREGS	    = 1ULL << 27,
+	PERF_OUTPUT_METRIC	    = 1ULL << 28,
+	PERF_OUTPUT_MISC            = 1ULL << 29,
+	PERF_OUTPUT_SRCCODE	    = 1ULL << 30,
+	PERF_OUTPUT_DATA_PAGE_SIZE  = 1ULL << 31,
+	PERF_OUTPUT_CODE_PAGE_SIZE  = 1ULL << 32,
 };
 
 struct output_option {
@@ -137,6 +138,7 @@ struct output_option {
 	{.str = "misc", .field = PERF_OUTPUT_MISC},
 	{.str = "srccode", .field = PERF_OUTPUT_SRCCODE},
 	{.str = "data_page_size", .field = PERF_OUTPUT_DATA_PAGE_SIZE},
+	{.str = "code_page_size", .field = PERF_OUTPUT_CODE_PAGE_SIZE},
 };
 
 enum {
@@ -208,7 +210,7 @@ static struct {
 			      PERF_OUTPUT_DSO | PERF_OUTPUT_PERIOD |
 			      PERF_OUTPUT_ADDR | PERF_OUTPUT_DATA_SRC |
 			      PERF_OUTPUT_WEIGHT | PERF_OUTPUT_PHYS_ADDR |
-			      PERF_OUTPUT_DATA_PAGE_SIZE,
+			      PERF_OUTPUT_DATA_PAGE_SIZE | PERF_OUTPUT_CODE_PAGE_SIZE,
 
 		.invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
 	},
@@ -475,7 +477,10 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
 	if (PRINT_FIELD(DATA_PAGE_SIZE) &&
 		perf_evsel__check_stype(evsel, PERF_SAMPLE_DATA_PAGE_SIZE, "DATA_PAGE_SIZE",
 					PERF_OUTPUT_DATA_PAGE_SIZE))
-		return -EINVAL;
+	if (PRINT_FIELD(CODE_PAGE_SIZE) &&
+		perf_evsel__check_stype(evsel, PERF_SAMPLE_CODE_PAGE_SIZE, "CODE_PAGE_SIZE",
+					PERF_OUTPUT_CODE_PAGE_SIZE))
+			return -EINVAL;
 
 	return 0;
 }
@@ -1856,6 +1861,9 @@ static void process_event(struct perf_script *script,
 	if (PRINT_FIELD(DATA_PAGE_SIZE))
 		fprintf(fp, " %s", get_page_size_name(sample->data_page_size, str));
 
+	if (PRINT_FIELD(CODE_PAGE_SIZE))
+		fprintf(fp, " %s", get_page_size_name(sample->code_page_size, str));
+
 	fprintf(fp, "\n");
 
 	if (PRINT_FIELD(SRCCODE)) {
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 9ad3686..c7428fe 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1143,6 +1143,9 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
 	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
 		printf(" .. data page size: %s\n", get_page_size_name(sample->data_page_size, str));
 
+	if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+		printf(" .. code page size: %s\n", get_page_size_name(sample->code_page_size, str));
+
 	if (sample_type & PERF_SAMPLE_TRANSACTION)
 		printf("... transaction: %" PRIx64 "\n", sample->transaction);
 
-- 
2.7.4


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH V4 12/13] perf report: Add support for PERF_SAMPLE_CODE_PAGE_SIZE
  2019-01-31 20:27 [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE kan.liang
                   ` (9 preceding siblings ...)
  2019-01-31 20:28 ` [PATCH V4 11/13] perf script: " kan.liang
@ 2019-01-31 20:28 ` kan.liang
  2019-01-31 20:28 ` [PATCH V4 13/13] perf test: Add test case " kan.liang
  2019-02-01  9:22 ` [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE Peter Zijlstra
  12 siblings, 0 replies; 23+ messages in thread
From: kan.liang @ 2019-01-31 20:28 UTC (permalink / raw)
  To: peterz, acme, tglx, mingo, linux-kernel
  Cc: eranian, jolsa, namhyung, ak, luto, Kan Liang

From: Stephane Eranian <eranian@google.com>

Add a new sort dimension "code_page_size" for common sort.
With this option applied, perf can sort and report by sample's code page
size.

For example,
perf report --stdio --sort=comm,symbol,code_page_size
 # To display the perf.data header info, please use
 # --header/--header-only options.
 #
 #
 # Total Lost Samples: 0
 #
 # Samples: 3K of event 'mem-loads:uP'
 # Event count (approx.): 1470769
 #
 # Overhead  Command  Symbol                        Code Page Size IPC
 # [IPC Coverage]
 # ........  .......  ............................  ..............
 # ....................
 #
     69.56%  dtlb     [.] GetTickCount              4K             -

     17.93%  dtlb     [.] Calibrate                 4K             -
 -
     11.40%  dtlb     [.] __gettimeofday            4K             -
 -

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V3
- Apply modified get_page_size_name()

 tools/perf/Documentation/perf-report.txt |  1 +
 tools/perf/util/hist.c                   |  2 ++
 tools/perf/util/hist.h                   |  1 +
 tools/perf/util/sort.c                   | 26 ++++++++++++++++++++++++++
 tools/perf/util/sort.h                   |  2 ++
 5 files changed, 32 insertions(+)

diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 2ca0477..b6ecdbc 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -105,6 +105,7 @@ OPTIONS
 	guest machine
 	- sample: Number of sample
 	- period: Raw number of event count of sample
+	- code_page_size: the code page size of sampled code address (ip)
 
 	By default, comm, dso and symbol keys are used.
 	(i.e. --sort comm,dso,symbol)
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 253bf3f..e4ab496 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -193,6 +193,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 	hists__new_col_len(hists, HISTC_MEM_LVL, 21 + 3);
 	hists__new_col_len(hists, HISTC_LOCAL_WEIGHT, 12);
 	hists__new_col_len(hists, HISTC_GLOBAL_WEIGHT, 12);
+	hists__new_col_len(hists, HISTC_CODE_PAGE_SIZE, 6);
 
 	if (h->srcline) {
 		len = MAX(strlen(h->srcline), strlen(sort_srcline.se_header));
@@ -613,6 +614,7 @@ __hists__add_entry(struct hists *hists,
 		.cpumode = al->cpumode,
 		.ip	 = al->addr,
 		.level	 = al->level,
+		.code_page_size = sample->code_page_size,
 		.stat = {
 			.nr_events = 1,
 			.period	= sample->period,
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 2b72d03..76640fc 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -46,6 +46,7 @@ enum hist_column {
 	HISTC_DSO_TO,
 	HISTC_LOCAL_WEIGHT,
 	HISTC_GLOBAL_WEIGHT,
+	HISTC_CODE_PAGE_SIZE,
 	HISTC_MEM_DADDR_SYMBOL,
 	HISTC_MEM_DADDR_DSO,
 	HISTC_MEM_PHYS_DADDR,
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 1e0bb0c..c6d5e5c 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -1422,6 +1422,31 @@ struct sort_entry sort_mem_data_page_size = {
 };
 
 static int64_t
+sort__code_page_size_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	uint64_t l = left->code_page_size;
+	uint64_t r = right->code_page_size;
+
+	return (int64_t)(r - l);
+}
+
+static int hist_entry__code_page_size_snprintf(struct hist_entry *he, char *bf,
+					  size_t size, unsigned int width)
+{
+	char str[PAGE_SIZE_NAME_LEN];
+
+	return repsep_snprintf(bf, size, "%-*s", width,
+			       get_page_size_name(he->code_page_size, str));
+}
+
+struct sort_entry sort_code_page_size = {
+	.se_header	= "Code Page Size",
+	.se_cmp		= sort__code_page_size_cmp,
+	.se_snprintf	= hist_entry__code_page_size_snprintf,
+	.se_width_idx	= HISTC_CODE_PAGE_SIZE,
+};
+
+static int64_t
 sort__abort_cmp(struct hist_entry *left, struct hist_entry *right)
 {
 	if (!left->branch_info || !right->branch_info)
@@ -1663,6 +1688,7 @@ static struct sort_dimension common_sort_dimensions[] = {
 	DIM(SORT_DSO_SIZE, "dso_size", sort_dso_size),
 	DIM(SORT_CGROUP_ID, "cgroup_id", sort_cgroup_id),
 	DIM(SORT_SYM_IPC_NULL, "ipc_null", sort_sym_ipc_null),
+	DIM(SORT_CODE_PAGE_SIZE, "code_page_size", sort_code_page_size),
 };
 
 #undef DIM
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 2e324ae..33e5b46 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -103,6 +103,7 @@ struct hist_entry {
 	u64			transaction;
 	s32			socket;
 	s32			cpu;
+	u64			code_page_size;
 	u8			cpumode;
 	u8			depth;
 
@@ -230,6 +231,7 @@ enum sort_type {
 	SORT_DSO_SIZE,
 	SORT_CGROUP_ID,
 	SORT_SYM_IPC_NULL,
+	SORT_CODE_PAGE_SIZE,
 
 	/* branch stack specific sort keys */
 	__SORT_BRANCH_STACK,
-- 
2.7.4


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH V4 13/13] perf test: Add test case for PERF_SAMPLE_CODE_PAGE_SIZE
  2019-01-31 20:27 [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE kan.liang
                   ` (10 preceding siblings ...)
  2019-01-31 20:28 ` [PATCH V4 12/13] perf report: " kan.liang
@ 2019-01-31 20:28 ` kan.liang
  2019-02-01  9:22 ` [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE Peter Zijlstra
  12 siblings, 0 replies; 23+ messages in thread
From: kan.liang @ 2019-01-31 20:28 UTC (permalink / raw)
  To: peterz, acme, tglx, mingo, linux-kernel
  Cc: eranian, jolsa, namhyung, ak, luto, Kan Liang

From: Stephane Eranian <eranian@google.com>

Extend sample-parsing test cases to support new sample type
PERF_SAMPLE_CODE_PAGE_SIZE.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---

Changes since V3
- Use the real page size to replace enum.

 tools/perf/tests/sample-parsing.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c
index cf74f9d..1545392 100644
--- a/tools/perf/tests/sample-parsing.c
+++ b/tools/perf/tests/sample-parsing.c
@@ -148,6 +148,9 @@ static bool samples_same(const struct perf_sample *s1,
 	if (type & PERF_SAMPLE_DATA_PAGE_SIZE)
 		COMP(data_page_size);
 
+	if (type & PERF_SAMPLE_CODE_PAGE_SIZE)
+		COMP(code_page_size);
+
 	return true;
 }
 
@@ -216,6 +219,7 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format)
 
 		.phys_addr	= 113,
 		.data_page_size	= 4096,
+		.code_page_size = 4096,
 	};
 	struct sample_read_value values[] = {{1, 5}, {9, 3}, {2, 7}, {6, 4},};
 	struct perf_sample sample_out;
@@ -315,7 +319,7 @@ int test__sample_parsing(struct test *test __maybe_unused, int subtest __maybe_u
 	 * were added.  Please actually update the test rather than just change
 	 * the condition below.
 	 */
-	if (PERF_SAMPLE_MAX > PERF_SAMPLE_DATA_PAGE_SIZE << 1) {
+	if (PERF_SAMPLE_MAX > PERF_SAMPLE_CODE_PAGE_SIZE << 1) {
 		pr_debug("sample format has changed, some new PERF_SAMPLE_ bit was introduced - test needs updating\n");
 		return -1;
 	}
-- 
2.7.4


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE
  2019-01-31 20:27 [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE kan.liang
                   ` (11 preceding siblings ...)
  2019-01-31 20:28 ` [PATCH V4 13/13] perf test: Add test case " kan.liang
@ 2019-02-01  9:22 ` Peter Zijlstra
  2019-02-01 10:03   ` Peter Zijlstra
                     ` (2 more replies)
  12 siblings, 3 replies; 23+ messages in thread
From: Peter Zijlstra @ 2019-02-01  9:22 UTC (permalink / raw)
  To: kan.liang
  Cc: acme, tglx, mingo, linux-kernel, eranian, jolsa, namhyung, ak,
	luto, Vlastimil Babka, kirill

On Thu, Jan 31, 2019 at 12:27:54PM -0800, kan.liang@linux.intel.com wrote:
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 374a197..229a73b 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -2578,3 +2578,34 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
>  	cap->events_mask_len	= x86_pmu.events_mask_len;
>  }
>  EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
> +
> +u64 perf_get_page_size(u64 virt)
> +{
> +	unsigned long flags;
> +	unsigned int level;
> +	pte_t *pte;
> +
> +	if (!virt)
> +		return 0;
> +
> +	/*
> +	 * Interrupts are disabled, so it prevents any tear down
> +	 * of the page tables.
> +	 * See the comment near struct mmu_table_batch.
> +	 */
> +	local_irq_save(flags);
> +	if (virt >= TASK_SIZE)
> +		pte = lookup_address(virt, &level);
> +	else {
> +		if (current->mm) {
> +			pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
> +						    virt, &level);
> +		} else
> +			level = PG_LEVEL_NUM;
> +	}
> +	local_irq_restore(flags);
> +	if (level >= PG_LEVEL_NUM)
> +		return 0;
> +
> +	return (u64)page_level_size(level);
> +}

*sigh* there really isn't anything x86 specific there.

> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 236bb8d..d233f45 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6352,6 +6358,12 @@ static u64 perf_virt_to_phys(u64 virt)
>  	return phys_addr;
>  }
>  
> +/* Return page size of given virtual address. IRQ-safe required. */
> +u64 __weak perf_get_page_size(u64 virt)
> +{
> +	return 0;
> +}
> +
>  static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
>  
>  struct perf_callchain_entry *

How about something like so instead?

(completely untested, will likely make your grandma eat puppies)

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6357,10 +6357,72 @@ static u64 perf_virt_to_phys(u64 virt)
 	return phys_addr;
 }
 
-/* Return page size of given virtual address. IRQ-safe required. */
-u64 __weak perf_get_page_size(u64 virt)
+static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
 {
-	return 0;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd_none(*pgd))
+		return 0;
+
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d))
+		return 0;
+
+	if (p4d_large(*p4d))
+		return 1ULL << P4D_SHIFT;
+
+	if (!p4d_present(*p4d))
+		return 0;
+
+	pud = pud_offset(p4d, addr);
+	if (pud_none(*pud))
+		return 0;
+
+	if (pud_large(*pud))
+		return 1ULL << PUD_SHIFT;
+
+	if (!pud_present(*pud))
+		return 0;
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd))
+		return 0;
+
+	if (pmd_large(*pmd))
+		return 1ULL << PMD_SHIFT;
+
+	if (!pmd_present(*pmd))
+		return 0;
+
+	return 1ULL << PAGE_SHIFT;
+}
+
+static u64 perf_get_page_size(unsigned long addr)
+{
+	struct mm_struct *mm;
+	unsigned long flags;
+	u64 ret;
+
+	/*
+	 * Software page-table walkers must disable IRQs, see asm-generic/tlb.h.
+	 */
+	local_irq_save(flags);
+	mm = current->mm;
+	if (!mm) {
+		/*
+		 * For kernel threads and the like, use init_mm so that
+		 * we can find kernel memory.
+		 */
+		mm = &init_mm;
+	}
+	ret = __perf_get_page_size(mm, addr);
+	local_irq_restore(flags);
+
+	return ret;
 }
 
 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE
  2019-02-01  9:22 ` [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE Peter Zijlstra
@ 2019-02-01 10:03   ` Peter Zijlstra
  2019-02-01 10:36     ` Kirill A. Shutemov
  2019-02-01 10:34   ` Kirill A. Shutemov
  2019-02-01 14:45   ` Liang, Kan
  2 siblings, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2019-02-01 10:03 UTC (permalink / raw)
  To: kan.liang
  Cc: acme, tglx, mingo, linux-kernel, eranian, jolsa, namhyung, ak,
	luto, Vlastimil Babka, kirill, Will Deacon

On Fri, Feb 01, 2019 at 10:22:40AM +0100, Peter Zijlstra wrote:

> +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
>  {
> +	pgd_t *pgd;
> +	p4d_t *p4d;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +
> +	pgd = pgd_offset(mm, addr);
> +	if (pgd_none(*pgd))
> +		return 0;
> +
> +	p4d = p4d_offset(pgd, addr);
> +	if (p4d_none(*p4d))
> +		return 0;
> +
> +	if (p4d_large(*p4d));
> +		return 1ULL << P4D_SHIFT;
> +
> +	if (!p4d_present(*p4d))
> +		return 0;
> +
> +	pud = pud_offset(p4d, addr);
> +	if (pud_none(*pud))
> +		return 0;
> +
> +	if (pud_large(*pud))
> +		return 1ULL << PUD_SHIFT;

Will just mentioned a lovely feature where some archs have multi entry
large pages.

Possibly something like:

	if (pud_large(*pud)) {
		struct page *page = pud_page(*pud);
		int order = PUD_SHIFT;

		if (PageHuge(page)) {
			page = compound_head(page);
			order += compound_order(page);
		}

		return 1ULL << order;
	}

works correctly.

> +
> +	if (!pud_present(*pud))
> +		return 0;
> +
> +	pmd = pmd_offset(pud, addr);
> +	if (pmd_none(*pmd))
> +		return 0;
> +
> +	if (pmd_large(*pmd))
> +		return 1ULL << PMD_SHIFT;

And same here I suppose..

> +
> +	if (!pmd_present(*pmd))
> +		return 0;
> +
> +	return 1ULL << PAGE_SHIFT;
> +}

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE
  2019-02-01  9:22 ` [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE Peter Zijlstra
  2019-02-01 10:03   ` Peter Zijlstra
@ 2019-02-01 10:34   ` Kirill A. Shutemov
  2019-02-01 14:45   ` Liang, Kan
  2 siblings, 0 replies; 23+ messages in thread
From: Kirill A. Shutemov @ 2019-02-01 10:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: kan.liang, acme, tglx, mingo, linux-kernel, eranian, jolsa,
	namhyung, ak, luto, Vlastimil Babka

On Fri, Feb 01, 2019 at 10:22:40AM +0100, Peter Zijlstra wrote:
> On Thu, Jan 31, 2019 at 12:27:54PM -0800, kan.liang@linux.intel.com wrote:
> > diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> > index 374a197..229a73b 100644
> > --- a/arch/x86/events/core.c
> > +++ b/arch/x86/events/core.c
> > @@ -2578,3 +2578,34 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
> >  	cap->events_mask_len	= x86_pmu.events_mask_len;
> >  }
> >  EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
> > +
> > +u64 perf_get_page_size(u64 virt)
> > +{
> > +	unsigned long flags;
> > +	unsigned int level;
> > +	pte_t *pte;
> > +
> > +	if (!virt)
> > +		return 0;
> > +
> > +	/*
> > +	 * Interrupts are disabled, so it prevents any tear down
> > +	 * of the page tables.
> > +	 * See the comment near struct mmu_table_batch.
> > +	 */
> > +	local_irq_save(flags);
> > +	if (virt >= TASK_SIZE)
> > +		pte = lookup_address(virt, &level);
> > +	else {
> > +		if (current->mm) {
> > +			pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
> > +						    virt, &level);
> > +		} else
> > +			level = PG_LEVEL_NUM;
> > +	}
> > +	local_irq_restore(flags);
> > +	if (level >= PG_LEVEL_NUM)
> > +		return 0;
> > +
> > +	return (u64)page_level_size(level);
> > +}
> 
> *sigh* there really isn't anything x86 specific there.
> 
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 236bb8d..d233f45 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -6352,6 +6358,12 @@ static u64 perf_virt_to_phys(u64 virt)
> >  	return phys_addr;
> >  }
> >  
> > +/* Return page size of given virtual address. IRQ-safe required. */
> > +u64 __weak perf_get_page_size(u64 virt)
> > +{
> > +	return 0;
> > +}
> > +
> >  static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
> >  
> >  struct perf_callchain_entry *
> 
> How about something like so instead?
> 
> (completely untested, will likely make your grandma eat puppies)
> 
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6357,10 +6357,72 @@ static u64 perf_virt_to_phys(u64 virt)
>  	return phys_addr;
>  }
>  
> -/* Return page size of given virtual address. IRQ-safe required. */
> -u64 __weak perf_get_page_size(u64 virt)
> +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
>  {
> -	return 0;
> +	pgd_t *pgd;
> +	p4d_t *p4d;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +
> +	pgd = pgd_offset(mm, addr);
> +	if (pgd_none(*pgd))
> +		return 0;
> +
> +	p4d = p4d_offset(pgd, addr);
> +	if (p4d_none(*p4d))
> +		return 0;
> +
> +	if (p4d_large(*p4d));

We don't have 512GiB pages yet.

> +		return 1ULL << P4D_SHIFT;

		return P4D_SIZE;

And the same P?D_SIZE below.

> +
> +	if (!p4d_present(*p4d))
> +		return 0;

No need to check p4d_none() *and* p4d_present(). Just p4d_present() should
be enough. Large is still supposed to be present. The same for other levels.

> +
> +	pud = pud_offset(p4d, addr);
> +	if (pud_none(*pud))
> +		return 0;
> +
> +	if (pud_large(*pud))
> +		return 1ULL << PUD_SHIFT;
> +
> +	if (!pud_present(*pud))
> +		return 0;
> +
> +	pmd = pmd_offset(pud, addr);
> +	if (pmd_none(*pmd))
> +		return 0;
> +
> +	if (pmd_large(*pmd))
> +		return 1ULL << PMD_SHIFT;
> +
> +	if (!pmd_present(*pmd))
> +		return 0;
> +
> +	return 1ULL << PAGE_SHIFT;
> +}
> +
> +static u64 perf_get_page_size(unsigned long addr)
> +{
> +	struct mm_struct *mm;
> +	unsigned long flags;
> +	u64 ret;
> +
> +	/*
> +	 * Software page-table walkers must disable IRQs, see asm-generic/tlb.h.
> +	 */
> +	local_irq_save(flags);
> +	mm = current->mm;
> +	if (!mm) {
> +		/*
> +		 * For kernel threads and the like, use init_mm so that
> +		 * we can find kernel memory.
> +		 */
> +		mm = &init_mm;
> +	}
> +	ret = __perf_get_page_size(mm, addr);
> +	local_irq_restore(flags);
> +
> +	return ret;
>  }
>  
>  static struct perf_callchain_entry __empty_callchain = { .nr = 0, };

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE
  2019-02-01 10:03   ` Peter Zijlstra
@ 2019-02-01 10:36     ` Kirill A. Shutemov
  2019-02-01 12:43       ` Peter Zijlstra
  0 siblings, 1 reply; 23+ messages in thread
From: Kirill A. Shutemov @ 2019-02-01 10:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: kan.liang, acme, tglx, mingo, linux-kernel, eranian, jolsa,
	namhyung, ak, luto, Vlastimil Babka, Will Deacon

On Fri, Feb 01, 2019 at 11:03:58AM +0100, Peter Zijlstra wrote:
> On Fri, Feb 01, 2019 at 10:22:40AM +0100, Peter Zijlstra wrote:
> 
> > +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> >  {
> > +	pgd_t *pgd;
> > +	p4d_t *p4d;
> > +	pud_t *pud;
> > +	pmd_t *pmd;
> > +
> > +	pgd = pgd_offset(mm, addr);
> > +	if (pgd_none(*pgd))
> > +		return 0;
> > +
> > +	p4d = p4d_offset(pgd, addr);
> > +	if (p4d_none(*p4d))
> > +		return 0;
> > +
> > +	if (p4d_large(*p4d));
> > +		return 1ULL << P4D_SHIFT;
> > +
> > +	if (!p4d_present(*p4d))
> > +		return 0;
> > +
> > +	pud = pud_offset(p4d, addr);
> > +	if (pud_none(*pud))
> > +		return 0;
> > +
> > +	if (pud_large(*pud))
> > +		return 1ULL << PUD_SHIFT;
> 
> Will just mentioned a lovely feature where some archs have multi entry
> large pages.
> 
> Possible something like:
> 
> 	if (pud_large(*pud)) {
> 		struct page *page = pud_page(*pud);
> 		int order = PUD_SHIFT;
> 
> 		if (PageHuge(page)) {
> 			page = compound_head(page);
> 			order += compound_order(page);
> 		}
> 
> 		return 1ULL << order;
> 	}
> 
> works correctly.

For more fun: some compound pages can be mapped with page table entries
not matching their compound size, i.e. 2M pages mapped with PTEs.

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE
  2019-02-01 10:36     ` Kirill A. Shutemov
@ 2019-02-01 12:43       ` Peter Zijlstra
  2019-02-01 12:47         ` Peter Zijlstra
  2019-02-01 16:16         ` Liang, Kan
  0 siblings, 2 replies; 23+ messages in thread
From: Peter Zijlstra @ 2019-02-01 12:43 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: kan.liang, acme, tglx, mingo, linux-kernel, eranian, jolsa,
	namhyung, ak, luto, Vlastimil Babka, Will Deacon

On Fri, Feb 01, 2019 at 01:36:00PM +0300, Kirill A. Shutemov wrote:
> On Fri, Feb 01, 2019 at 11:03:58AM +0100, Peter Zijlstra wrote:

> > Will just mentioned a lovely feature where some archs have multi entry
> > large pages.
> > 
> > Possible something like:
> > 
> > 	if (pud_large(*pud)) {
> > 		struct page *page = pud_page(*pud);
> > 		int order = PUD_SHIFT;
> > 
> > 		if (PageHuge(page)) {
> > 			page = compound_head(page);
> > 			order += compound_order(page);
> > 		}
> > 
> > 		return 1ULL << order;
> > 	}
> > 
> > works correctly.
> 
> For more fun: some compound pages can be mapped withe page table entries
> not matching it's compound size, i.e. 2M pages mapped with PTE.

Surely not for PageHuge() ?! I thought the point of hugetlbfs was to
guarantee page granularity.

How is the below?

static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset(mm, addr);
	if (pgd_none(*pgd))
		return 0;

	p4d = p4d_offset(pgd, addr);
	if (!p4d_present(*p4d))
		return 0;

	if (p4d_large(*p4d)) {
		struct page *page = p4d_page(*p4d);
		int shift = P4D_SHIFT;

		if (PageHuge(page)) {
			page = compound_head(page);
			shift = PAGE_SHIFT + compound_order(page);
		}

		return 1ULL << shift;
	}

	if (!p4d_present(*p4d))
		return 0;

	pud = pud_offset(p4d, addr);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud)) {
		struct page *page = pud_page(*pud);
		int shift = P4D_SHIFT;

		if (PageHuge(page)) {
			page = compound_head(page);
			shift = PAGE_SHIFT + compound_order(page);
		}

		return 1ULL << shift;
	}

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd)) {
		struct page *page = pud_page(*pud);
		int shift = P4D_SHIFT;

		if (PageHuge(page)) {
			page = compound_head(page);
			shift = PAGE_SHIFT + compound_order(page);
		}

		return 1ULL << shift;
	}

	pte = pte_offset_map(pmd, addr);
	if (!pte_present(*pte)) {
		pte_unmap(pte);
		return 0;
	}

	pte_unmap(pte);
	return PAGE_SIZE;
}

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE
  2019-02-01 12:43       ` Peter Zijlstra
@ 2019-02-01 12:47         ` Peter Zijlstra
  2019-02-01 16:16         ` Liang, Kan
  1 sibling, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2019-02-01 12:47 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: kan.liang, acme, tglx, mingo, linux-kernel, eranian, jolsa,
	namhyung, ak, luto, Vlastimil Babka, Will Deacon

On Fri, Feb 01, 2019 at 01:43:48PM +0100, Peter Zijlstra wrote:

> static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> {
> 	pgd_t *pgd;
> 	p4d_t *p4d;
> 	pud_t *pud;
> 	pmd_t *pmd;
> 	pte_t *pte;
> 
> 	pgd = pgd_offset(mm, addr);
> 	if (pgd_none(*pgd))
> 		return 0;
> 
> 	p4d = p4d_offset(pgd, addr);
> 	if (!p4d_present(*p4d))
> 		return 0;
> 
> 	if (p4d_large(*p4d)) {
> 		struct page *page = p4d_page(*p4d);
> 		int shift = P4D_SHIFT;
> 
> 		if (PageHuge(page)) {
> 			page = compound_head(page);
> 			shift = PAGE_SHIFT + compound_order(page);
> 		}
> 
> 		return 1ULL << shift;
> 	}
> 
> 	if (!p4d_present(*p4d))
> 		return 0;
> 
> 	pud = pud_offset(p4d, addr);
> 	if (!pud_present(*pud))
> 		return 0;
> 
> 	if (pud_large(*pud)) {
> 		struct page *page = pud_page(*pud);
> 		int shift = P4D_SHIFT;

PUD_SHIFT

> 
> 		if (PageHuge(page)) {
> 			page = compound_head(page);
> 			shift = PAGE_SHIFT + compound_order(page);
> 		}
> 
> 		return 1ULL << shift;
> 	}
> 
> 	pmd = pmd_offset(pud, addr);
> 	if (!pmd_present(*pmd))
> 		return 0;
> 
> 	if (pmd_large(*pmd)) {
> 		struct page *page = pud_page(*pud);
> 		int shift = P4D_SHIFT;

PMD_SHIFT

> 
> 		if (PageHuge(page)) {
> 			page = compound_head(page);
> 			shift = PAGE_SHIFT + compound_order(page);
> 		}
> 
> 		return 1ULL << shift;
> 	}
> 
> 	pte = pte_offset_map(pmd, addr);
> 	if (!pte_present(*pte)) {
> 		pte_unmap(pte);
> 		return 0;
> 	}
> 
> 	pte_unmap(pte);
> 	return PAGE_SIZE;
> }

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE
  2019-02-01  9:22 ` [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE Peter Zijlstra
  2019-02-01 10:03   ` Peter Zijlstra
  2019-02-01 10:34   ` Kirill A. Shutemov
@ 2019-02-01 14:45   ` Liang, Kan
  2 siblings, 0 replies; 23+ messages in thread
From: Liang, Kan @ 2019-02-01 14:45 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: acme, tglx, mingo, linux-kernel, eranian, jolsa, namhyung, ak,
	luto, Vlastimil Babka, kirill



On 2/1/2019 4:22 AM, Peter Zijlstra wrote:
> On Thu, Jan 31, 2019 at 12:27:54PM -0800, kan.liang@linux.intel.com wrote:
>> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
>> index 374a197..229a73b 100644
>> --- a/arch/x86/events/core.c
>> +++ b/arch/x86/events/core.c
>> @@ -2578,3 +2578,34 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
>>   	cap->events_mask_len	= x86_pmu.events_mask_len;
>>   }
>>   EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
>> +
>> +u64 perf_get_page_size(u64 virt)
>> +{
>> +	unsigned long flags;
>> +	unsigned int level;
>> +	pte_t *pte;
>> +
>> +	if (!virt)
>> +		return 0;
>> +
>> +	/*
>> +	 * Interrupts are disabled, so it prevents any tear down
>> +	 * of the page tables.
>> +	 * See the comment near struct mmu_table_batch.
>> +	 */
>> +	local_irq_save(flags);
>> +	if (virt >= TASK_SIZE)
>> +		pte = lookup_address(virt, &level);
>> +	else {
>> +		if (current->mm) {
>> +			pte = lookup_address_in_pgd(pgd_offset(current->mm, virt),
>> +						    virt, &level);
>> +		} else
>> +			level = PG_LEVEL_NUM;
>> +	}
>> +	local_irq_restore(flags);
>> +	if (level >= PG_LEVEL_NUM)
>> +		return 0;
>> +
>> +	return (u64)page_level_size(level);
>> +}
> 
> *sigh* there really isn't anything x86 specific there.

OK. I will split the patch and move the common code to a dedicated patch 
in V5. I will try the proposed code and do some tests on X86.

> >> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 236bb8d..d233f45 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -6352,6 +6358,12 @@ static u64 perf_virt_to_phys(u64 virt)
>>   	return phys_addr;
>>   }
>>   
>> +/* Return page size of given virtual address. IRQ-safe required. */
>> +u64 __weak perf_get_page_size(u64 virt)
>> +{
>> +	return 0;
>> +}
>> +
>>   static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
>>   
>>   struct perf_callchain_entry *
> 
> How about something like so instead?
> 
> (completely untested, will likely make your grandma eat puppies
That's not funny!


Kan


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE
  2019-02-01 12:43       ` Peter Zijlstra
  2019-02-01 12:47         ` Peter Zijlstra
@ 2019-02-01 16:16         ` Liang, Kan
  2019-02-04 10:54           ` Peter Zijlstra
  1 sibling, 1 reply; 23+ messages in thread
From: Liang, Kan @ 2019-02-01 16:16 UTC (permalink / raw)
  To: Peter Zijlstra, Kirill A. Shutemov
  Cc: acme, tglx, mingo, linux-kernel, eranian, jolsa, namhyung, ak,
	luto, Vlastimil Babka, Will Deacon



On 2/1/2019 7:43 AM, Peter Zijlstra wrote:
> On Fri, Feb 01, 2019 at 01:36:00PM +0300, Kirill A. Shutemov wrote:
>> On Fri, Feb 01, 2019 at 11:03:58AM +0100, Peter Zijlstra wrote:
> 
>>> Will just mentioned a lovely feature where some archs have multi entry
>>> large pages.
>>>
>>> Possible something like:
>>>
>>> 	if (pud_large(*pud)) {
>>> 		struct page *page = pud_page(*pud);
>>> 		int order = PUD_SHIFT;
>>>
>>> 		if (PageHuge(page)) {
>>> 			page = compound_head(page);
>>> 			order += compound_order(page);
>>> 		}
>>>
>>> 		return 1ULL << order;
>>> 	}
>>>
>>> works correctly.
>>
>> For more fun: some compound pages can be mapped withe page table entries
>> not matching it's compound size, i.e. 2M pages mapped with PTE.
> 
> Surely not for PageHuge() ?! I thought the point of hugetlbfs was to
> guarantee page granularity.
> 
> How is the below?
> 
> static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> {
> 	pgd_t *pgd;
> 	p4d_t *p4d;
> 	pud_t *pud;
> 	pmd_t *pmd;
> 	pte_t *pte;
> 
> 	pgd = pgd_offset(mm, addr);
> 	if (pgd_none(*pgd))
> 		return 0;
> 
> 	p4d = p4d_offset(pgd, addr);
> 	if (!p4d_present(*p4d))
> 		return 0;
> 
> 	if (p4d_large(*p4d)) {

This one looks like x86 specific?

Kan

> 		struct page *page = p4d_page(*p4d);
> 		int shift = P4D_SHIFT;
> 
> 		if (PageHuge(page)) {
> 			page = compound_head(page);
> 			shift = PAGE_SHIFT + compound_order(page);
> 		}
> 
> 		return 1ULL << shift;
> 	}
> 
> 	if (!p4d_present(*p4d))
> 		return 0;
> 
> 	pud = pud_offset(p4d, addr);
> 	if (!pud_present(*pud))
> 		return 0;
> 
> 	if (pud_large(*pud)) {
> 		struct page *page = pud_page(*pud);
> 		int shift = P4D_SHIFT;
> 
> 		if (PageHuge(page)) {
> 			page = compound_head(page);
> 			shift = PAGE_SHIFT + compound_order(page);
> 		}
> 
> 		return 1ULL << shift;
> 	}
> 
> 	pmd = pmd_offset(pud, addr);
> 	if (!pmd_present(*pmd))
> 		return 0;
> 
> 	if (pmd_large(*pmd)) {
> 		struct page *page = pud_page(*pud);
> 		int shift = P4D_SHIFT;
> 
> 		if (PageHuge(page)) {
> 			page = compound_head(page);
> 			shift = PAGE_SHIFT + compound_order(page);
> 		}
> 
> 		return 1ULL << shift;
> 	}
> 
> 	pte = pte_offset_map(pmd, addr);
> 	if (!pte_present(*pte)) {
> 		pte_unmap(pte);
> 		return 0;
> 	}
> 
> 	pte_unmap(pte);
> 	return PAGE_SIZE;
> }
> 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE
  2019-02-01 16:16         ` Liang, Kan
@ 2019-02-04 10:54           ` Peter Zijlstra
  2019-02-06 20:23             ` Liang, Kan
  0 siblings, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2019-02-04 10:54 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Kirill A. Shutemov, acme, tglx, mingo, linux-kernel, eranian,
	jolsa, namhyung, ak, luto, Vlastimil Babka, Will Deacon

On Fri, Feb 01, 2019 at 11:16:51AM -0500, Liang, Kan wrote:

> > 	if (p4d_large(*p4d)) {
> 
> This one looks like x86 specific?

> > 	if (pud_large(*pud)) {

> > 	if (pmd_large(*pmd)) {

Kirill did indeed note that p*_large() isn't universally available (but
there's definitely !x86 archs that have them). He also said it would
probably make sense to have them universally available and might help
clean up mm/gup.c a little.

A quick grep shows that: ARM, PowerPC, S390, Sparc and x86 have
'pmd_large'.

Anyway; it probably makes sense (and shouldn't be too hard) to fix up
all architectures to provide this.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE
  2019-02-04 10:54           ` Peter Zijlstra
@ 2019-02-06 20:23             ` Liang, Kan
  0 siblings, 0 replies; 23+ messages in thread
From: Liang, Kan @ 2019-02-06 20:23 UTC (permalink / raw)
  To: Peter Zijlstra, Kirill A. Shutemov
  Cc: acme, tglx, mingo, linux-kernel, eranian, jolsa, namhyung, ak,
	luto, Vlastimil Babka, Will Deacon



On 2/4/2019 5:54 AM, Peter Zijlstra wrote:
> On Fri, Feb 01, 2019 at 11:16:51AM -0500, Liang, Kan wrote:
> 
>>> 	if (p4d_large(*p4d)) {
>>
>> This one looks like x86 specific?
> 
>>> 	if (pud_large(*pud)) {
> 
>>> 	if (pmd_large(*pmd)) {
> 
> Kirill did indeed note that p*_large() isn't universally available (but
> there's definitely !x86 archs that have them). He also said it would
> probably make sense to have them universally available and might help
> clean up mm/gup.c a little.
> 
> A quick grep shows that: ARM, PowerPC, S390, Sparc and x86 have
> 'pmd_large'.
> 
> Anyway; it probably makes sense (and shouldn't be too hard) to fix up
> all architectures to provide this.
> 
Hi Peter and Kirill,

It looks like it's not easy to support get_page_size() universally.
Even the 'pmd_large' you mentioned is not universal. I got an error message
when building with ARCH=riscv.
There is even less support for pud_large and p4d_large.
We have to check and add something like "#define p*d_large(a) 0" in the 
pg headers for each ARCH. I think it's ugly.


> +               if (PageHuge(page)) {
> +                       page = compound_head(page);
> +                       shift = PAGE_SHIFT + compound_order(page);
> +               }

PageHuge() only returns true for hugetlbfs. I think the transparent huge 
pages should also use compound pages, right? Besides hugetlbfs and THP, 
are there any other cases which also use compound pages?
Can the code handle all these cases?

> +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long
> +addr) {
> +	pgd_t *pgd;
> +	p4d_t *p4d;

A universal get_page_size() function should not be implemented in perf.
It will be a problem for future maintenance.

All in all, I think we are far away from a universal get_page_size(). A
__weak function + x86 implementation solution proposed in this patch 
series should be a better choice.
- Other ARCH can have their own implementation later if they want this 
feature.
- Standard pg table helper functions are used for x86. Maintenance will 
not be a problem.

What do you think?

Thanks,
Kan

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2019-02-06 20:23 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-31 20:27 [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE kan.liang
2019-01-31 20:27 ` [PATCH V4 02/13] perf tools: Support new sample type for data page size kan.liang
2019-01-31 20:27 ` [PATCH V4 03/13] perf script: Support " kan.liang
2019-01-31 20:27 ` [PATCH V4 04/13] perf sort: Add sort option for " kan.liang
2019-01-31 20:27 ` [PATCH V4 05/13] perf mem: Factor out a function to generate sort order kan.liang
2019-01-31 20:27 ` [PATCH V4 06/13] perf mem: Clean up output format kan.liang
2019-01-31 20:28 ` [PATCH V4 07/13] perf mem: Support data page size kan.liang
2019-01-31 20:28 ` [PATCH V4 08/13] perf test: Add test case for PERF_SAMPLE_DATA_PAGE_SIZE kan.liang
2019-01-31 20:28 ` [PATCH V4 09/13] perf/core, x86: Add support for PERF_SAMPLE_CODE_PAGE_SIZE kan.liang
2019-01-31 20:28 ` [PATCH V4 10/13] perf tools: " kan.liang
2019-01-31 20:28 ` [PATCH V4 11/13] perf script: " kan.liang
2019-01-31 20:28 ` [PATCH V4 12/13] perf report: " kan.liang
2019-01-31 20:28 ` [PATCH V4 13/13] perf test: Add test case " kan.liang
2019-02-01  9:22 ` [PATCH V4 01/13] perf/core, x86: Add PERF_SAMPLE_DATA_PAGE_SIZE Peter Zijlstra
2019-02-01 10:03   ` Peter Zijlstra
2019-02-01 10:36     ` Kirill A. Shutemov
2019-02-01 12:43       ` Peter Zijlstra
2019-02-01 12:47         ` Peter Zijlstra
2019-02-01 16:16         ` Liang, Kan
2019-02-04 10:54           ` Peter Zijlstra
2019-02-06 20:23             ` Liang, Kan
2019-02-01 10:34   ` Kirill A. Shutemov
2019-02-01 14:45   ` Liang, Kan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.