linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/5] perf, tools: Add support for skipping itrace instructions
@ 2016-03-28 17:45 Andi Kleen
  2016-03-28 17:45 ` [PATCH 2/5] perf, tools: Add probing for udev86 Andi Kleen
                   ` (5 more replies)
  0 siblings, 6 replies; 8+ messages in thread
From: Andi Kleen @ 2016-03-28 17:45 UTC (permalink / raw)
  To: acme; +Cc: jolsa, linux-kernel, eranian, Andi Kleen, Adrian Hunter

From: Andi Kleen <ak@linux.intel.com>

When using perf script to look at PT traces it is often
useful to ignore the initialization code at the beginning.

On larger traces, which may have many millions of instructions
in initialization code, doing that in a pipeline can be very
slow, with perf script spending a lot of CPU time calling
printf and writing data.

This patch adds an extension to the --itrace argument
that skips 'n' events (instructions, branches or transactions)
at the beginning. This is much more efficient.

Cc: Adrian Hunter <adrian.hunter@intel.com>

v2:
Add support for BTS (Adrian Hunter)
Document in itrace.txt
Fix branch check
Check transactions and instructions too
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 tools/perf/Documentation/intel-pt.txt |  7 +++++++
 tools/perf/Documentation/itrace.txt   |  8 ++++++++
 tools/perf/util/auxtrace.c            |  7 +++++++
 tools/perf/util/auxtrace.h            |  2 ++
 tools/perf/util/intel-bts.c           |  5 +++++
 tools/perf/util/intel-pt.c            | 22 ++++++++++++++++++++--
 6 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt
index be764f9..c6c8318 100644
--- a/tools/perf/Documentation/intel-pt.txt
+++ b/tools/perf/Documentation/intel-pt.txt
@@ -672,6 +672,7 @@ The letters are:
 	d	create a debug log
 	g	synthesize a call chain (use with i or x)
 	l	synthesize last branch entries (use with i or x)
+	s	skip initial number of events
 
 "Instructions" events look like they were recorded by "perf record -e
 instructions".
@@ -730,6 +731,12 @@ from one sample to the next.
 
 To disable trace decoding entirely, use the option --no-itrace.
 
+It is also possible to skip events generated (instructions, branches, transactions)
+at the beginning. This is useful to ignore initialization code.
+
+	--itrace=i0nss1000000
+
+skips the first million instructions.
 
 dump option
 -----------
diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt
index 65453f4..e2a4c5e 100644
--- a/tools/perf/Documentation/itrace.txt
+++ b/tools/perf/Documentation/itrace.txt
@@ -7,6 +7,7 @@
 		d	create a debug log
 		g	synthesize a call chain (use with i or x)
 		l	synthesize last branch entries (use with i or x)
+		s       skip initial number of events
 
 	The default is all events i.e. the same as --itrace=ibxe
 
@@ -24,3 +25,10 @@
 
 	Also the number of last branch entries (default 64, max. 1024) for
 	instructions or transactions events can be specified.
+
+	It is also possible to skip events generated (instructions, branches, transactions)
+	at the beginning. This is useful to ignore initialization code.
+
+	--itrace=i0nss1000000
+
+	skips the first million instructions.
diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
index ec164fe..c916901 100644
--- a/tools/perf/util/auxtrace.c
+++ b/tools/perf/util/auxtrace.c
@@ -940,6 +940,7 @@ void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts)
 	synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD;
 	synth_opts->callchain_sz = PERF_ITRACE_DEFAULT_CALLCHAIN_SZ;
 	synth_opts->last_branch_sz = PERF_ITRACE_DEFAULT_LAST_BRANCH_SZ;
+	synth_opts->initial_skip = 0;
 }
 
 /*
@@ -1064,6 +1065,12 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str,
 				synth_opts->last_branch_sz = val;
 			}
 			break;
+		case 's':
+			synth_opts->initial_skip = strtoul(p, &endptr, 10);
+			if (p == endptr)
+				goto out_err;
+			p = endptr;
+			break;
 		case ' ':
 		case ',':
 			break;
diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h
index 57ff31e..767989e 100644
--- a/tools/perf/util/auxtrace.h
+++ b/tools/perf/util/auxtrace.h
@@ -68,6 +68,7 @@ enum itrace_period_type {
  * @last_branch_sz: branch context size
  * @period: 'instructions' events period
  * @period_type: 'instructions' events period type
+ * @initial_skip: skip N events at the beginning.
  */
 struct itrace_synth_opts {
 	bool			set;
@@ -86,6 +87,7 @@ struct itrace_synth_opts {
 	unsigned int		last_branch_sz;
 	unsigned long long	period;
 	enum itrace_period_type	period_type;
+	unsigned long		initial_skip;
 };
 
 /**
diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c
index 6bc3ecd..3ff2f72f 100644
--- a/tools/perf/util/intel-bts.c
+++ b/tools/perf/util/intel-bts.c
@@ -66,6 +66,7 @@ struct intel_bts {
 	u64				branches_id;
 	size_t				branches_event_size;
 	bool				synth_needs_swap;
+	unsigned long			num_events;
 };
 
 struct intel_bts_queue {
@@ -275,6 +276,10 @@ static int intel_bts_synth_branch_sample(struct intel_bts_queue *btsq,
 	union perf_event event;
 	struct perf_sample sample = { .ip = 0, };
 
+	if (bts->synth_opts.initial_skip &&
+	    bts->num_events++ <= bts->synth_opts.initial_skip)
+		return 0;
+
 	event.sample.header.type = PERF_RECORD_SAMPLE;
 	event.sample.header.misc = PERF_RECORD_MISC_USER;
 	event.sample.header.size = sizeof(struct perf_event_header);
diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index 05d8158..07f0020 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -100,6 +100,8 @@ struct intel_pt {
 	u64 cyc_bit;
 	u64 noretcomp_bit;
 	unsigned max_non_turbo_ratio;
+
+	unsigned long num_events;
 };
 
 enum switch_state {
@@ -972,6 +974,10 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
 	if (pt->branches_filter && !(pt->branches_filter & ptq->flags))
 		return 0;
 
+	if (pt->synth_opts.initial_skip &&
+	    pt->num_events++ < pt->synth_opts.initial_skip)
+		return 0;
+
 	event->sample.header.type = PERF_RECORD_SAMPLE;
 	event->sample.header.misc = PERF_RECORD_MISC_USER;
 	event->sample.header.size = sizeof(struct perf_event_header);
@@ -1028,6 +1034,10 @@ static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq)
 	union perf_event *event = ptq->event_buf;
 	struct perf_sample sample = { .ip = 0, };
 
+	if (pt->synth_opts.initial_skip &&
+	    pt->num_events++ < pt->synth_opts.initial_skip)
+		return 0;
+
 	event->sample.header.type = PERF_RECORD_SAMPLE;
 	event->sample.header.misc = PERF_RECORD_MISC_USER;
 	event->sample.header.size = sizeof(struct perf_event_header);
@@ -1085,6 +1095,10 @@ static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq)
 	union perf_event *event = ptq->event_buf;
 	struct perf_sample sample = { .ip = 0, };
 
+	if (pt->synth_opts.initial_skip &&
+	    pt->num_events++ < pt->synth_opts.initial_skip)
+		return 0;
+
 	event->sample.header.type = PERF_RECORD_SAMPLE;
 	event->sample.header.misc = PERF_RECORD_MISC_USER;
 	event->sample.header.size = sizeof(struct perf_event_header);
@@ -1196,14 +1210,18 @@ static int intel_pt_sample(struct intel_pt_queue *ptq)
 	ptq->have_sample = false;
 
 	if (pt->sample_instructions &&
-	    (state->type & INTEL_PT_INSTRUCTION)) {
+	    (state->type & INTEL_PT_INSTRUCTION) &&
+	    (!pt->synth_opts.initial_skip ||
+	     pt->num_events++ >= pt->synth_opts.initial_skip)) {
 		err = intel_pt_synth_instruction_sample(ptq);
 		if (err)
 			return err;
 	}
 
 	if (pt->sample_transactions &&
-	    (state->type & INTEL_PT_TRANSACTION)) {
+	    (state->type & INTEL_PT_TRANSACTION) &&
+	    (!pt->synth_opts.initial_skip ||
+	     pt->num_events++ >= pt->synth_opts.initial_skip)) {
 		err = intel_pt_synth_transaction_sample(ptq);
 		if (err)
 			return err;
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/5] perf, tools: Add probing for udev86
  2016-03-28 17:45 [PATCH 1/5] perf, tools: Add support for skipping itrace instructions Andi Kleen
@ 2016-03-28 17:45 ` Andi Kleen
  2016-03-28 17:45 ` [PATCH 3/5] perf, tools, script: Add support for printing assembler Andi Kleen
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 8+ messages in thread
From: Andi Kleen @ 2016-03-28 17:45 UTC (permalink / raw)
  To: acme; +Cc: jolsa, linux-kernel, eranian, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Add autoprobing for the udis86 disassembler library.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 tools/build/Makefile.feature      | 6 ++++--
 tools/build/feature/Makefile      | 8 ++++++--
 tools/build/feature/test-all.c    | 5 +++++
 tools/build/feature/test-udis86.c | 8 ++++++++
 tools/perf/config/Makefile        | 5 +++++
 5 files changed, 28 insertions(+), 4 deletions(-)
 create mode 100644 tools/build/feature/test-udis86.c

diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index 6b77072..db4f426 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -55,7 +55,8 @@ FEATURE_TESTS_BASIC :=			\
 	zlib				\
 	lzma				\
 	get_cpuid			\
-	bpf
+	bpf				\
+	udis86
 
 # FEATURE_TESTS_BASIC + FEATURE_TESTS_EXTRA is the complete list
 # of all feature tests
@@ -94,7 +95,8 @@ FEATURE_DISPLAY ?=			\
 	zlib				\
 	lzma				\
 	get_cpuid			\
-	bpf
+	bpf				\
+	udis86
 
 # Set FEATURE_CHECK_(C|LD)FLAGS-all for all FEATURE_TESTS features.
 # If in the future we need per-feature checks/flags for features not
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index c5f4c41..d05c312 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -36,7 +36,8 @@ FILES=					\
 	test-zlib.bin			\
 	test-lzma.bin			\
 	test-bpf.bin			\
-	test-get_cpuid.bin
+	test-get_cpuid.bin		\
+	test-udis86.bin
 
 FILES := $(addprefix $(OUTPUT),$(FILES))
 
@@ -51,7 +52,7 @@ __BUILD = $(CC) $(CFLAGS) -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(LDFL
 ###############################
 
 $(OUTPUT)test-all.bin:
-	$(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -laudit -I/usr/include/slang -lslang $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null) $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma
+	$(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -laudit -I/usr/include/slang -lslang $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null) $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma -ludis86
 
 $(OUTPUT)test-hello.bin:
 	$(BUILD)
@@ -97,6 +98,9 @@ $(OUTPUT)test-numa_num_possible_cpus.bin:
 $(OUTPUT)test-libunwind.bin:
 	$(BUILD) -lelf
 
+$(OUTPUT)test-udis86.bin:
+	$(BUILD) -ludis86
+
 $(OUTPUT)test-libunwind-debug-frame.bin:
 	$(BUILD) -lelf
 
diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c
index e499a36..76b0de3 100644
--- a/tools/build/feature/test-all.c
+++ b/tools/build/feature/test-all.c
@@ -133,6 +133,10 @@
 # include "test-libcrypto.c"
 #undef main
 
+#define main main_test_udis86
+#  include "test-udis86.c"
+#endif
+
 int main(int argc, char *argv[])
 {
 	main_test_libpython();
@@ -163,6 +167,7 @@ int main(int argc, char *argv[])
 	main_test_get_cpuid();
 	main_test_bpf();
 	main_test_libcrypto();
+	main_test_udis86();
 
 	return 0;
 }
diff --git a/tools/build/feature/test-udis86.c b/tools/build/feature/test-udis86.c
new file mode 100644
index 0000000..623c545
--- /dev/null
+++ b/tools/build/feature/test-udis86.c
@@ -0,0 +1,8 @@
+#include <udis86.h>
+
+int main(void)
+{
+	ud_t ud;
+	ud_init(&ud);
+	return 0;
+}
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile
index f7d7f5a..399ada8 100644
--- a/tools/perf/config/Makefile
+++ b/tools/perf/config/Makefile
@@ -587,6 +587,11 @@ ifneq ($(filter -lbfd,$(EXTLIBS)),)
   CFLAGS += -DHAVE_LIBBFD_SUPPORT
 endif
 
+ifeq ($(feature-udis86), 1)
+  CFLAGS += -DHAVE_UDIS86
+  EXTLIBS += -ludis86
+endif
+
 ifndef NO_ZLIB
   ifeq ($(feature-zlib), 1)
     CFLAGS += -DHAVE_ZLIB_SUPPORT
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 3/5] perf, tools, script: Add support for printing assembler
  2016-03-28 17:45 [PATCH 1/5] perf, tools: Add support for skipping itrace instructions Andi Kleen
  2016-03-28 17:45 ` [PATCH 2/5] perf, tools: Add probing for udev86 Andi Kleen
@ 2016-03-28 17:45 ` Andi Kleen
  2016-03-28 17:45 ` [PATCH 4/5] perf, tools, script: Add brstackasm output for branch stacks Andi Kleen
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 8+ messages in thread
From: Andi Kleen @ 2016-03-28 17:45 UTC (permalink / raw)
  To: acme; +Cc: jolsa, linux-kernel, eranian, Andi Kleen, adrian.hunter

From: Andi Kleen <ak@linux.intel.com>

When dumping PT traces with perf script it is very useful to see the
assembler for each sample, so that it is easily possible to follow
the control flow.

As using objdump is difficult and inefficient from perf script this
patch uses the udis86 library to implement assembler output.
The library can be downloaded from http://udis86.sourceforge.net/

The library is probed as an external dependency in the usual way. Then perf
script calls into it when needed, and handles callbacks to resolve
symbols.

% perf record -e intel_pt//u true
% perf script -F sym,symoff,ip,asm --itrace=i0ns | head
     7fc7188b4190 _start+0x0	mov %rsp, %rdi
     7fc7188b4193 _start+0x3	call _dl_start
     7fc7188b7710 _dl_start+0x0	push %rbp
     7fc7188b7711 _dl_start+0x1	mov %rsp, %rbp
     7fc7188b7714 _dl_start+0x4	push %r15
     7fc7188b7716 _dl_start+0x6	push %r14
     7fc7188b7718 _dl_start+0x8	push %r13
     7fc7188b771a _dl_start+0xa	push %r12
     7fc7188b771c _dl_start+0xc	mov %rdi, %r12
     7fc7188b771f _dl_start+0xf	push %rbx

Current issues:
- Some jump references do not get resolved to symbols.
- udis86 release does not support STAC/CLAC, which are used in the kernel,
but there is a pending patch for it.

Cc: adrian.hunter@intel.com
v2: Fix address resolution. Port to latest acme/perf/core
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 tools/perf/Documentation/perf-script.txt |   4 +-
 tools/perf/builtin-script.c              | 107 +++++++++++++++++++++++++++++--
 2 files changed, 105 insertions(+), 6 deletions(-)

diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 22ef393..f2b81d8 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -116,7 +116,7 @@ OPTIONS
 --fields::
         Comma separated list of fields to print. Options are:
         comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
-	srcline, period, iregs, brstack, brstacksym, flags.
+	srcline, period, iregs, brstack, brstacksym, flags, asm.
         Field list can be prepended with the type, trace, sw or hw,
         to indicate to which event type the field list applies.
         e.g., -f sw:comm,tid,time,ip,sym  and -f trace:time,cpu,trace
@@ -185,6 +185,8 @@ OPTIONS
 
 	The brstacksym is identical to brstack, except that the FROM and TO addresses are printed in a symbolic form if possible.
 
+	When asm is specified the assembler instruction of each sample is printed in disassembled form.
+
 -k::
 --vmlinux=<file>::
         vmlinux pathname
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 3770c3d..323572e 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -25,6 +25,10 @@
 #include "asm/bug.h"
 #include "util/mem-events.h"
 
+#ifdef HAVE_UDIS86
+#include <udis86.h>
+#endif
+
 static char const		*script_name;
 static char const		*generate_script_lang;
 static bool			debug_mode;
@@ -62,6 +66,7 @@ enum perf_output_field {
 	PERF_OUTPUT_DATA_SRC	    = 1U << 17,
 	PERF_OUTPUT_WEIGHT	    = 1U << 18,
 	PERF_OUTPUT_BPF_OUTPUT	    = 1U << 19,
+	PERF_OUTPUT_ASM		    = 1U << 20,
 };
 
 struct output_option {
@@ -88,6 +93,7 @@ struct output_option {
 	{.str = "data_src", .field = PERF_OUTPUT_DATA_SRC},
 	{.str = "weight",   .field = PERF_OUTPUT_WEIGHT},
 	{.str = "bpf-output",   .field = PERF_OUTPUT_BPF_OUTPUT},
+	{.str = "asm", .field = PERF_OUTPUT_ASM},
 };
 
 /* default set to maintain compatibility with current format */
@@ -282,7 +288,11 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
 		       "selected. Hence, no address to lookup the source line number.\n");
 		return -EINVAL;
 	}
-
+	if (PRINT_FIELD(ASM) && !PRINT_FIELD(IP)) {
+		pr_err("Display of assembler requested but sample IP is not\n"
+		       "selected.\n");
+		return -EINVAL;
+	}
 	if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) &&
 		perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID",
 					PERF_OUTPUT_TID|PERF_OUTPUT_PID))
@@ -421,6 +431,88 @@ static void print_sample_iregs(struct perf_sample *sample,
 	}
 }
 
+#ifdef HAVE_UDIS86
+
+struct perf_ud {
+	ud_t ud_obj;
+	struct thread *thread;
+	u8 cpumode;
+	int cpu;
+};
+
+static const char *dis_resolve(struct ud *u, uint64_t addr, int64_t *off)
+{
+	struct perf_ud *ud = container_of(u, struct perf_ud, ud_obj);
+	struct addr_location al;
+
+	memset(&al, 0, sizeof(struct addr_location));
+
+	thread__find_addr_map(ud->thread, ud->cpumode, MAP__FUNCTION, addr, &al);
+	if (!al.map)
+		thread__find_addr_map(ud->thread, ud->cpumode, MAP__VARIABLE,
+					addr, &al);
+	al.cpu = ud->cpu;
+	al.sym = NULL;
+
+	if (al.map)
+		al.sym = map__find_symbol(al.map, al.addr, NULL);
+
+	if (!al.sym)
+		return NULL;
+
+	if (al.addr < al.sym->end)
+		*off = al.addr - al.sym->start;
+	else
+		*off = al.addr - al.map->start - al.sym->start;
+	return al.sym->name;
+}
+#endif
+
+static void print_sample_asm(struct perf_sample *sample __maybe_unused,
+			     struct thread *thread __maybe_unused,
+			     struct perf_event_attr *attr __maybe_unused,
+			     struct addr_location *al __maybe_unused,
+			     struct machine *machine __maybe_unused)
+{
+#ifdef HAVE_UDIS86
+	static bool ud_initialized = false;
+	static struct perf_ud ud;
+	u8 buffer[32];
+	int len;
+	u64 offset;
+
+	if (!ud_initialized) {
+		ud_initialized = true;
+		ud_init(&ud.ud_obj);
+		ud_set_syntax(&ud.ud_obj, UD_SYN_ATT);
+		ud_set_sym_resolver(&ud.ud_obj, dis_resolve);
+	}
+	ud.thread = thread;
+	ud.cpumode = sample->cpumode;
+	ud.cpu = sample->cpu;
+
+	if (!al->map || !al->map->dso)
+		return;
+	if (al->map->dso->data.status == DSO_DATA_STATUS_ERROR)
+		return;
+
+	/* Load maps to ensure dso->is_64_bit has been updated */
+	map__load(al->map, machine->symbol_filter);
+
+	offset = al->map->map_ip(al->map, sample->ip);
+	len = dso__data_read_offset(al->map->dso, machine,
+				    offset, buffer, 32);
+	if (len <= 0)
+		return;
+
+	ud_set_mode(&ud.ud_obj, al->map->dso->is_64_bit ? 64 : 32);
+	ud_set_pc(&ud.ud_obj, sample->ip);
+	ud_set_input_buffer(&ud.ud_obj, buffer, len);
+	ud_disassemble(&ud.ud_obj);
+	printf("\t%s", ud_insn_asm(&ud.ud_obj));
+#endif
+}
+
 static void print_sample_start(struct perf_sample *sample,
 			       struct thread *thread,
 			       struct perf_evsel *evsel)
@@ -739,7 +831,8 @@ static size_t data_src__printf(u64 data_src)
 
 static void process_event(struct perf_script *script,
 			  struct perf_sample *sample, struct perf_evsel *evsel,
-			  struct addr_location *al)
+			  struct addr_location *al,
+			  struct machine *machine)
 {
 	struct thread *thread = al->thread;
 	struct perf_event_attr *attr = &evsel->attr;
@@ -767,7 +860,7 @@ static void process_event(struct perf_script *script,
 
 	if (is_bts_event(attr)) {
 		print_sample_bts(sample, evsel, thread, al);
-		return;
+		goto print_rest;
 	}
 
 	if (PRINT_FIELD(TRACE))
@@ -796,6 +889,7 @@ static void process_event(struct perf_script *script,
 	if (PRINT_FIELD(IREGS))
 		print_sample_iregs(sample, attr);
 
+print_rest:
 	if (PRINT_FIELD(BRSTACK))
 		print_sample_brstack(sample);
 	else if (PRINT_FIELD(BRSTACKSYM))
@@ -804,6 +898,9 @@ static void process_event(struct perf_script *script,
 	if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
 		print_sample_bpf_output(sample);
 
+	if (PRINT_FIELD(ASM))
+		print_sample_asm(sample, thread, attr, al, machine);
+
 	printf("\n");
 }
 
@@ -910,7 +1007,7 @@ static int process_sample_event(struct perf_tool *tool,
 	if (scripting_ops)
 		scripting_ops->process_event(event, sample, evsel, &al);
 	else
-		process_event(scr, sample, evsel, &al);
+		process_event(scr, sample, evsel, &al, machine);
 
 out_put:
 	addr_location__put(&al);
@@ -2010,7 +2107,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 		     "comma separated output fields prepend with 'type:'. "
 		     "Valid types: hw,sw,trace,raw. "
 		     "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
-		     "addr,symoff,period,iregs,brstack,brstacksym,flags", parse_output_fields),
+		     "addr,symoff,period,iregs,brstack,brstacksym,flags,asm", parse_output_fields),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 		    "system-wide collection from all CPUs"),
 	OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 4/5] perf, tools, script: Add brstackasm output for branch stacks
  2016-03-28 17:45 [PATCH 1/5] perf, tools: Add support for skipping itrace instructions Andi Kleen
  2016-03-28 17:45 ` [PATCH 2/5] perf, tools: Add probing for udev86 Andi Kleen
  2016-03-28 17:45 ` [PATCH 3/5] perf, tools, script: Add support for printing assembler Andi Kleen
@ 2016-03-28 17:45 ` Andi Kleen
  2016-03-28 17:45 ` [PATCH 5/5] perf, tools, report: Add srcline_from/to branch sort keys Andi Kleen
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 8+ messages in thread
From: Andi Kleen @ 2016-03-28 17:45 UTC (permalink / raw)
  To: acme; +Cc: jolsa, linux-kernel, eranian, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Implement printing full disassembled sequences for branch stacks in perf
script. This allows directly printing hot paths for individual samples,
together with branch misprediction and even cycle count information.

% perf record -b ...
% perf script -F brstackasm
...
        00007f0668d54e88        movsx (%rsi), %ecx
        00007f0668d54e8b        lea -0x30(%rcx), %eax
        00007f0668d54e8e        cmp $0x9, %al
        00007f0668d54e90        jbe 0x68d54eaf
        00007f0668d54e92        cmp %cl, %dl
        00007f0668d54e94        jnz 0x68d54eb5
        00007f0668d54e96        add $0x1, %rdi
        00007f0668d54e9a        movsx (%rdi), %edx
        00007f0668d54e9d        add $0x1, %rsi
        00007f0668d54ea1        test %dl, %dl
        00007f0668d54ea3        jnz _dl_cache_libcmp+11       # PRED 21 cycles
        00007f0668d54dfb        lea -0x30(%rdx), %eax
        00007f0668d54dfe        cmp $0x9, %al
        00007f0668d54e00        ja _dl_cache_libcmp+152       # PRED 2 cycles
        00007f0668d54e88        movsx (%rsi), %ecx
        00007f0668d54e8b        lea -0x30(%rcx), %eax
        00007f0668d54e8e        cmp $0x9, %al
        00007f0668d54e90        jbe 0x68d54eaf
        00007f0668d54e92        cmp %cl, %dl
        00007f0668d54e94        jnz 0x68d54eb5                # PRED 3 cycles
        00007f0668d54eb5        movsx %dl, %eax
        00007f0668d54eb8        sub %ecx, %eax
        00007f0668d54eba        ret                           # PRED 1 cycles
        00007f0668d54fae        test %eax, %eax
        00007f0668d54fb0        jz _dl_load_cache_lookup+688
        00007f0668d54fb6        jns 0x68d54f70
        00007f0668d54fb8        lea 0x1(%r14), %ebx
        00007f0668d54fbc        cmp %r15d, %ebx
        00007f0668d54fbf        nop
        00007f0668d54fc0        jle 0x68d54f79                # PRED 2 cycles

Open issues:
- Occasionally the path does not reach up to the sample IP, as the LBRs
may be frozen earlier. Use precise events to avoid that.

v2: Remove bogus hunk. Document --max-blocks. Fix some printfs.
Port to latest tree.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 tools/perf/Documentation/perf-script.txt |  14 ++-
 tools/perf/builtin-script.c              | 183 +++++++++++++++++++++++++++++++
 2 files changed, 195 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index f2b81d8..0903985 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -116,7 +116,7 @@ OPTIONS
 --fields::
         Comma separated list of fields to print. Options are:
         comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
-	srcline, period, iregs, brstack, brstacksym, flags, asm.
+	srcline, period, iregs, brstack, brstacksym, flags, asm, brstackasm
         Field list can be prepended with the type, trace, sw or hw,
         to indicate to which event type the field list applies.
         e.g., -f sw:comm,tid,time,ip,sym  and -f trace:time,cpu,trace
@@ -176,17 +176,24 @@ OPTIONS
 	i.e., -f "" is not allowed.
 
 	The brstack output includes branch related information with raw addresses using the
-	/v/v/v/v/ syntax in the following order:
+	/v/v/v/v/cycles syntax in the following order:
 	FROM: branch source instruction
 	TO  : branch target instruction
         M/P/-: M=branch target mispredicted or branch direction was mispredicted, P=target predicted or direction predicted, -=not supported
 	X/- : X=branch inside a transactional region, -=not in transaction region or not supported
 	A/- : A=TSX abort entry, -=not aborted region or not supported
+	cycles
 
 	The brstacksym is identical to brstack, except that the FROM and TO addresses are printed in a symbolic form if possible.
 
 	When asm is specified the assembler instruction of each sample is printed in disassembled form.
 
+	When brstackasm is specified the full assembler sequences of branch blocks for each sample
+	is printed (a branch block is a sequence of instructions not containing taken branches).
+	This is the full execution path leading to the sample. This is only supported when the
+	sample was recorded with perf record -b or -j any.
+	The maximum number of branch blocks to print can be configured with the --max-blocks option.
+
 -k::
 --vmlinux=<file>::
         vmlinux pathname
@@ -268,6 +275,9 @@ include::itrace.txt[]
 --force::
 	Don't do ownership validation.
 
+--max-blocks=N::
+	Maximum number of branch blocks to print with -F brstackasm
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-script-perl[1],
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 323572e..1072cbb 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -42,6 +42,7 @@ static bool			nanosecs;
 static const char		*cpu_list;
 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 static struct perf_stat_config	stat_config;
+static int 			max_blocks;
 
 unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH;
 
@@ -67,6 +68,7 @@ enum perf_output_field {
 	PERF_OUTPUT_WEIGHT	    = 1U << 18,
 	PERF_OUTPUT_BPF_OUTPUT	    = 1U << 19,
 	PERF_OUTPUT_ASM		    = 1U << 20,
+	PERF_OUTPUT_BRSTACKASM	    = 1U << 21,
 };
 
 struct output_option {
@@ -94,6 +96,7 @@ struct output_option {
 	{.str = "weight",   .field = PERF_OUTPUT_WEIGHT},
 	{.str = "bpf-output",   .field = PERF_OUTPUT_BPF_OUTPUT},
 	{.str = "asm", .field = PERF_OUTPUT_ASM},
+	{.str = "brstackasm", .field = PERF_OUTPUT_BRSTACKASM},
 };
 
 /* default set to maintain compatibility with current format */
@@ -293,6 +296,13 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
 		       "selected.\n");
 		return -EINVAL;
 	}
+	if (PRINT_FIELD(BRSTACKASM) &&
+	    !(perf_evlist__combined_branch_type(session->evlist) &
+	      PERF_SAMPLE_BRANCH_ANY)) {
+		pr_err("Display of branch stack assembler requested, but non all-branch filter set\n");
+		return -EINVAL;
+	}
+
 	if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) &&
 		perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID",
 					PERF_OUTPUT_TID|PERF_OUTPUT_PID))
@@ -621,6 +631,175 @@ static void print_sample_brstacksym(struct perf_sample *sample,
 	}
 }
 
+#ifdef HAVE_UDIS86
+#define MAXBB 16384UL
+#define MAXINSN 16
+
+static int grab_bb(char *buffer, u64 start, u64 end,
+		    struct machine *machine, struct thread *thread,
+		    bool *is64bit, u8 *cpumode)
+{
+	int offset, len;
+	struct addr_location al;
+	bool kernel;
+
+	if (!start || !end)
+		return 0;
+
+	kernel = machine__kernel_ip(machine, start);
+	if (kernel)
+		*cpumode = PERF_RECORD_MISC_KERNEL;
+	else
+		*cpumode = PERF_RECORD_MISC_USER;
+	if (kernel != machine__kernel_ip(machine, end))
+		return 0;
+
+	memset(&al, 0, sizeof(al));
+	if (end - start > MAXBB - MAXINSN) {
+		printf("\tbasic block %" PRIx64 "-%" PRIx64 " (%ld) too long to dump\n",
+		       start, end, end - start);
+		return 0;
+	}
+
+	thread__find_addr_map(thread, *cpumode, MAP__FUNCTION, start, &al);
+	if (!al.map || !al.map->dso) {
+		printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n",
+				start, end);
+		return 0;
+	}
+	if (al.map->dso->data.status == DSO_DATA_STATUS_ERROR) {
+		printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n",
+				start, end);
+		return 0;
+	}
+
+	/* Load maps to ensure dso->is_64_bit has been updated */
+	map__load(al.map, machine->symbol_filter);
+
+	offset = al.map->map_ip(al.map, start);
+	len = dso__data_read_offset(al.map->dso, machine,
+				    offset, (u8 *)buffer,
+				    end - start + MAXINSN);
+
+	*is64bit = al.map->dso->is_64_bit;
+	return len;
+}
+#endif
+
+static void print_sample_brstackasm(struct perf_sample *sample,
+				    struct thread *thread __maybe_unused,
+				    struct perf_event_attr *attr __maybe_unused,
+				    struct machine *machine __maybe_unused)
+{
+#ifdef HAVE_UDIS86
+	struct branch_stack *br = sample->branch_stack;
+	u64 start, end;
+	int i;
+	static bool ud_initialized = false;
+	static struct perf_ud ud;
+	char buffer[MAXBB];
+	int len;
+	bool last;
+	bool is64bit;
+	int nr;
+
+	if (!(br && br->nr))
+		return;
+	nr = br->nr;
+	if (max_blocks && nr > max_blocks + 1)
+		nr = max_blocks + 1;
+
+	if (!ud_initialized) {
+		ud_initialized = true;
+		ud_init(&ud.ud_obj);
+		ud_set_syntax(&ud.ud_obj, UD_SYN_ATT);
+		ud_set_sym_resolver(&ud.ud_obj, dis_resolve);
+	}
+	ud.thread = thread;
+	ud.cpu = sample->cpu;
+
+	putchar('\n');
+	for (i = nr - 2; i >= 0; i--) {
+		if (br->entries[i].from || br->entries[i].to)
+			printf("%d: %lx-%lx\n", i,
+				br->entries[i].from,
+				br->entries[i].to);
+		start = br->entries[i + 1].to;
+		end = br->entries[i].from;
+
+		/*
+		 * Leave extra bytes for the final jump instruction for
+		 * which we don't know the length
+		 */
+		len = grab_bb(buffer, start, end + MAXINSN,
+				machine, thread, &is64bit,
+				&ud.cpumode);
+		if (len <= 0)
+			continue;
+
+		ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+		ud_set_pc(&ud.ud_obj, start);
+		ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+		last = false;
+		while (ud_disassemble(&ud.ud_obj) && !last) {
+			if (ud_insn_ptr(&ud.ud_obj) ==
+					(uint8_t *)buffer + end - start) {
+				printf("\t%016" PRIx64 "\t%-30s\t#%s%s%s%s\n",
+					ud_insn_off(&ud.ud_obj),
+					ud_insn_asm(&ud.ud_obj),
+					br->entries[i].flags.predicted ? " PRED" : "",
+					br->entries[i].flags.mispred ? " MISPRED" : "",
+					br->entries[i].flags.in_tx ? " INTX" : "",
+					br->entries[i].flags.abort ? " ABORT" : "");
+				if (br->entries[i].flags.cycles)
+					printf(" %d cycles", br->entries[i].flags.cycles);
+				last = true;
+			} else {
+				printf("\t%016" PRIx64 "\t%s\n",
+						ud_insn_off(&ud.ud_obj),
+					ud_insn_asm(&ud.ud_obj));
+			}
+		}
+	}
+
+	/*
+	 * Hit the branch? In this case we are already done, and the target
+	 * has not been executed yet.
+	 */
+	if (br->entries[0].from == sample->ip)
+		return;
+	if (br->entries[0].flags.abort)
+		return;
+
+	/*
+	 * Print final block upto sample
+	 */
+	start = br->entries[0].to;
+	end = sample->ip;
+	len = grab_bb(buffer, start, end, machine, thread, &is64bit,
+			&ud.cpumode);
+	ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+	if (len <= 0) {
+		/* Print at least last IP if basic block did not work */
+		len = grab_bb(buffer, sample->ip, sample->ip + MAXINSN,
+				machine, thread, &is64bit, &ud.cpumode);
+		if (len <= 0)
+			return;
+		ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+		ud_set_pc(&ud.ud_obj, sample->ip);
+		if (ud_disassemble(&ud.ud_obj))
+			printf("\t%016" PRIx64 "\t%s\n", ud_insn_off(&ud.ud_obj),
+			       ud_insn_asm(&ud.ud_obj));
+		return;
+	}
+	ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+	ud_set_pc(&ud.ud_obj, start);
+	while (ud_disassemble(&ud.ud_obj) &&
+		ud_insn_ptr(&ud.ud_obj) <= (uint8_t *)buffer + end - start)
+		printf("\t%016" PRIx64 "\t%s\n", ud_insn_off(&ud.ud_obj),
+			       ud_insn_asm(&ud.ud_obj));
+#endif
+}
 
 static void print_sample_addr(struct perf_sample *sample,
 			  struct thread *thread,
@@ -898,6 +1077,8 @@ print_rest:
 	if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
 		print_sample_bpf_output(sample);
 
+	if (PRINT_FIELD(BRSTACKASM))
+		print_sample_brstackasm(sample, thread, attr, machine);
 	if (PRINT_FIELD(ASM))
 		print_sample_asm(sample, thread, attr, al, machine);
 
@@ -2129,6 +2310,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 		    "Show the mmap events"),
 	OPT_BOOLEAN('\0', "show-switch-events", &script.show_switch_events,
 		    "Show context switch events (if recorded)"),
+	OPT_INTEGER(0, "max-blocks", &max_blocks,
+		    "Maximum number of code blocks to dump with brstackasm"),
 	OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
 	OPT_BOOLEAN(0, "ns", &nanosecs,
 		    "Use 9 decimal places when displaying time"),
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 5/5] perf, tools, report: Add srcline_from/to branch sort keys
  2016-03-28 17:45 [PATCH 1/5] perf, tools: Add support for skipping itrace instructions Andi Kleen
                   ` (2 preceding siblings ...)
  2016-03-28 17:45 ` [PATCH 4/5] perf, tools, script: Add brstackasm output for branch stacks Andi Kleen
@ 2016-03-28 17:45 ` Andi Kleen
  2016-03-29 16:27 ` [PATCH 1/5] perf, tools: Add support for skipping itrace instructions Arnaldo Carvalho de Melo
  2016-03-31  6:54 ` [tip:perf/core] perf " tip-bot for Andi Kleen
  5 siblings, 0 replies; 8+ messages in thread
From: Andi Kleen @ 2016-03-28 17:45 UTC (permalink / raw)
  To: acme; +Cc: jolsa, linux-kernel, eranian, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Add srcline_from and srcline_to branch sort keys that show the
source lines a branch jumps from and to. That makes it much easier
to track down where particular branches happen in the program,
for example to examine branch mispredictions, or to associate
them with cycle counts:

% perf record -b -e cycles:p ./tcall
% perf report --sort srcline_from,srcline_to,mispredict
...
    15.10%  tcall.c:18                                 tcall.c:10                                  N
    14.83%  tcall.c:11                                 tcall.c:5                                   N
    14.12%  tcall.c:7                                  tcall.c:12                                  N
    14.04%  tcall.c:12                                 tcall.c:5                                   N
    12.42%  tcall.c:17                                 tcall.c:18                                  N
    12.39%  tcall.c:7                                  tcall.c:13                                  N
    12.27%  tcall.c:13                                 tcall.c:17                                  N
...

% perf report --sort srcline_from,srcline_to,cycles
...
    17.12%  tcall.c:18                                 tcall.c:11                                                  1
    17.01%  tcall.c:12                                 tcall.c:6                                                   1
    16.98%  tcall.c:11                                 tcall.c:6                                                   1
    15.91%  tcall.c:17                                 tcall.c:18                                                  1
     6.38%  tcall.c:7                                  tcall.c:17                                                  7
     4.80%  tcall.c:7                                  tcall.c:12                                                  8
     4.21%  tcall.c:7                                  tcall.c:17                                                  8
     2.67%  tcall.c:7                                  tcall.c:12                                                  7
     2.62%  tcall.c:7                                  tcall.c:12                                                  10
     2.10%  tcall.c:7                                  tcall.c:17                                                  9
     1.58%  tcall.c:7                                  tcall.c:12                                                  6
     1.44%  tcall.c:7                                  tcall.c:12                                                  5
     1.38%  tcall.c:7                                  tcall.c:12                                                  9
     1.06%  tcall.c:7                                  tcall.c:17                                                  13
     1.05%  tcall.c:7                                  tcall.c:12                                                  4
     1.01%  tcall.c:7                                  tcall.c:17                                                  6

Open issues:
- Some kernel symbols get misresolved.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 tools/perf/Documentation/perf-report.txt |  3 +-
 tools/perf/util/hist.c                   |  9 ++++
 tools/perf/util/hist.h                   |  2 +
 tools/perf/util/sort.c                   | 84 ++++++++++++++++++++++++++++++++
 tools/perf/util/sort.h                   |  2 +
 tools/perf/util/symbol.h                 |  2 +
 6 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 496d42c..9cbddc2 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -103,12 +103,13 @@ OPTIONS
 
 	If --branch-stack option is used, following sort keys are also
 	available:
-	dso_from, dso_to, symbol_from, symbol_to, mispredict.
 
 	- dso_from: name of library or module branched from
 	- dso_to: name of library or module branched to
 	- symbol_from: name of function branched from
 	- symbol_to: name of function branched to
+	- srcline_from: source file and line branched from
+	- srcline_to: source file and line branched to
 	- mispredict: "N" for predicted branch, "Y" for mispredicted branch
 	- in_tx: branch in TSX transaction
 	- abort: TSX transaction abort.
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 3d34c57..58caa69 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -117,6 +117,13 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 			hists__new_col_len(hists, HISTC_SYMBOL_TO, symlen);
 			hists__set_unres_dso_col_len(hists, HISTC_DSO_TO);
 		}
+
+		if (h->branch_info->srcline_from)
+			hists__new_col_len(hists, HISTC_SRCLINE_FROM,
+					strlen(h->branch_info->srcline_from));
+		if (h->branch_info->srcline_to)
+			hists__new_col_len(hists, HISTC_SRCLINE_TO,
+					strlen(h->branch_info->srcline_to));
 	}
 
 	if (h->mem_info) {
@@ -1042,6 +1049,8 @@ void hist_entry__delete(struct hist_entry *he)
 	if (he->branch_info) {
 		map__zput(he->branch_info->from.map);
 		map__zput(he->branch_info->to.map);
+		free_srcline(he->branch_info->srcline_from);
+		free_srcline(he->branch_info->srcline_to);
 		zfree(&he->branch_info);
 	}
 
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 5885965..4dc87aa 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -52,6 +52,8 @@ enum hist_column {
 	HISTC_MEM_IADDR_SYMBOL,
 	HISTC_TRANSACTION,
 	HISTC_CYCLES,
+	HISTC_SRCLINE_FROM,
+	HISTC_SRCLINE_TO,
 	HISTC_TRACE,
 	HISTC_NR_COLS, /* Last entry */
 };
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 47966a1..04093c4 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -360,6 +360,88 @@ struct sort_entry sort_srcline = {
 	.se_width_idx	= HISTC_SRCLINE,
 };
 
+/* --sort srcline_from: the source line of each branch origin is resolved
+ * lazily in the cmp callback and cached in branch_info->srcline_from. */
+static int64_t
+sort__srcline_from_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	if (!left->branch_info->srcline_from) {
+		struct map *map = left->branch_info->from.map;
+		if (!map)
+			left->branch_info->srcline_from = SRCLINE_UNKNOWN;
+		else
+			left->branch_info->srcline_from = get_srcline(map->dso,
+					   map__rip_2objdump(map,
+							     left->branch_info->from.al_addr),
+							 left->branch_info->from.sym, true);
+	}
+	if (!right->branch_info->srcline_from) {
+		struct map *map = right->branch_info->from.map;
+		if (!map)
+			right->branch_info->srcline_from = SRCLINE_UNKNOWN;
+		else
+			right->branch_info->srcline_from = get_srcline(map->dso,
+					     map__rip_2objdump(map,
+							       right->branch_info->from.al_addr),
+						     right->branch_info->from.sym, true);
+	}
+	return strcmp(right->branch_info->srcline_from, left->branch_info->srcline_from);
+}
+
+static int hist_entry__srcline_from_snprintf(struct hist_entry *he, char *bf,
+					size_t size, unsigned int width)
+{
+	return repsep_snprintf(bf, size, "%-*.*s", width, width, he->branch_info->srcline_from);
+}
+
+struct sort_entry sort_srcline_from = {
+	.se_header	= "From Source:Line",
+	.se_cmp		= sort__srcline_from_cmp,
+	.se_snprintf	= hist_entry__srcline_from_snprintf,
+	.se_width_idx	= HISTC_SRCLINE_FROM,
+};
+
+/* --sort srcline_to: the source line of each branch target is resolved lazily
+ * from the 'to' map/addr/sym and cached in branch_info->srcline_to. */
+static int64_t
+sort__srcline_to_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	if (!left->branch_info->srcline_to) {
+		struct map *map = left->branch_info->to.map;
+		if (!map)
+			left->branch_info->srcline_to = SRCLINE_UNKNOWN;
+		else
+			left->branch_info->srcline_to = get_srcline(map->dso,
+					   map__rip_2objdump(map,
+							     left->branch_info->to.al_addr),
+							 left->branch_info->to.sym, true);
+	}
+	if (!right->branch_info->srcline_to) {
+		struct map *map = right->branch_info->to.map;
+		if (!map)
+			right->branch_info->srcline_to = SRCLINE_UNKNOWN;
+		else
+			right->branch_info->srcline_to = get_srcline(map->dso,
+					     map__rip_2objdump(map,
+							       right->branch_info->to.al_addr),
+						     right->branch_info->to.sym, true);
+	}
+	return strcmp(right->branch_info->srcline_to, left->branch_info->srcline_to);
+}
+
+static int hist_entry__srcline_to_snprintf(struct hist_entry *he, char *bf,
+					size_t size, unsigned int width)
+{
+	return repsep_snprintf(bf, size, "%-*.*s", width, width, he->branch_info->srcline_to);
+}
+
+struct sort_entry sort_srcline_to = {
+	.se_header	= "To Source:Line",
+	.se_cmp		= sort__srcline_to_cmp,
+	.se_snprintf	= hist_entry__srcline_to_snprintf,
+	.se_width_idx	= HISTC_SRCLINE_TO,
+};
+
 /* --sort srcfile */
 
 static char no_srcfile[1];
@@ -1354,6 +1436,8 @@ static struct sort_dimension bstack_sort_dimensions[] = {
 	DIM(SORT_IN_TX, "in_tx", sort_in_tx),
 	DIM(SORT_ABORT, "abort", sort_abort),
 	DIM(SORT_CYCLES, "cycles", sort_cycles),
+	DIM(SORT_SRCLINE_FROM, "srcline_from", sort_srcline_from),
+	DIM(SORT_SRCLINE_TO, "srcline_to", sort_srcline_to),
 };
 
 #undef DIM
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 3f4e359..9e3d80f 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -222,6 +222,8 @@ enum sort_type {
 	SORT_ABORT,
 	SORT_IN_TX,
 	SORT_CYCLES,
+	SORT_SRCLINE_FROM,
+	SORT_SRCLINE_TO,
 
 	/* memory mode specific sort keys */
 	__SORT_MEMORY_MODE,
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index c8b7544..fecc30c 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -177,6 +177,8 @@ struct branch_info {
 	struct addr_map_symbol from;
 	struct addr_map_symbol to;
 	struct branch_flags flags;
+	char			*srcline_from;
+	char			*srcline_to;
 };
 
 struct mem_info {
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/5] perf, tools: Add support for skipping itrace instructions
  2016-03-28 17:45 [PATCH 1/5] perf, tools: Add support for skipping itrace instructions Andi Kleen
                   ` (3 preceding siblings ...)
  2016-03-28 17:45 ` [PATCH 5/5] perf, tools, report: Add srcline_from/to branch sort keys Andi Kleen
@ 2016-03-29 16:27 ` Arnaldo Carvalho de Melo
  2016-03-29 17:33   ` Adrian Hunter
  2016-03-31  6:54 ` [tip:perf/core] perf " tip-bot for Andi Kleen
  5 siblings, 1 reply; 8+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-03-29 16:27 UTC (permalink / raw)
  To: Adrian Hunter; +Cc: Andi Kleen, jolsa, linux-kernel, eranian, Andi Kleen

Em Mon, Mar 28, 2016 at 10:45:38AM -0700, Andi Kleen escreveu:
> From: Andi Kleen <ak@linux.intel.com>
> 
> When using perf script to look at PT traces it is often
> useful to ignore the initialization code at the beginning.
> 
> On larger traces which may have many millions of instructions
> in initialization code doing that in a pipeline can be very
> slow, with perf script spending a lot of CPU time calling
> printf and writing data.
> 
> This patch adds an extension to the --itrace argument
> that skips 'n' events (instructions, branches or transactions)
> at the beginning. This is much more efficient.
> 
> Cc: Adrian Hunter <adrian.hunter@intel.com>

Adrian, are you ok now? Can I have your Acked-by?

- Arnaldo
 
> v2:
> Add support for BTS (Adrian Hunter)
> Document in itrace.txt
> Fix branch check
> Check transactions and instructions too
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> ---
>  tools/perf/Documentation/intel-pt.txt |  7 +++++++
>  tools/perf/Documentation/itrace.txt   |  8 ++++++++
>  tools/perf/util/auxtrace.c            |  7 +++++++
>  tools/perf/util/auxtrace.h            |  2 ++
>  tools/perf/util/intel-bts.c           |  5 +++++
>  tools/perf/util/intel-pt.c            | 22 ++++++++++++++++++++--
>  6 files changed, 49 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt
> index be764f9..c6c8318 100644
> --- a/tools/perf/Documentation/intel-pt.txt
> +++ b/tools/perf/Documentation/intel-pt.txt
> @@ -672,6 +672,7 @@ The letters are:
>  	d	create a debug log
>  	g	synthesize a call chain (use with i or x)
>  	l	synthesize last branch entries (use with i or x)
> +	s	skip initial number of events
>  
>  "Instructions" events look like they were recorded by "perf record -e
>  instructions".
> @@ -730,6 +731,12 @@ from one sample to the next.
>  
>  To disable trace decoding entirely, use the option --no-itrace.
>  
> +It is also possible to skip events generated (instructions, branches, transactions)
> +at the beginning. This is useful to ignore initialization code.
> +
> +	--itrace=i0nss1000000
> +
> +skips the first million instructions.
>  
>  dump option
>  -----------
> diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt
> index 65453f4..e2a4c5e 100644
> --- a/tools/perf/Documentation/itrace.txt
> +++ b/tools/perf/Documentation/itrace.txt
> @@ -7,6 +7,7 @@
>  		d	create a debug log
>  		g	synthesize a call chain (use with i or x)
>  		l	synthesize last branch entries (use with i or x)
> +		s       skip initial number of events
>  
>  	The default is all events i.e. the same as --itrace=ibxe
>  
> @@ -24,3 +25,10 @@
>  
>  	Also the number of last branch entries (default 64, max. 1024) for
>  	instructions or transactions events can be specified.
> +
> +	It is also possible to skip events generated (instructions, branches, transactions)
> +	at the beginning. This is useful to ignore initialization code.
> +
> +	--itrace=i0nss1000000
> +
> +	skips the first million instructions.
> diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
> index ec164fe..c916901 100644
> --- a/tools/perf/util/auxtrace.c
> +++ b/tools/perf/util/auxtrace.c
> @@ -940,6 +940,7 @@ void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts)
>  	synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD;
>  	synth_opts->callchain_sz = PERF_ITRACE_DEFAULT_CALLCHAIN_SZ;
>  	synth_opts->last_branch_sz = PERF_ITRACE_DEFAULT_LAST_BRANCH_SZ;
> +	synth_opts->initial_skip = 0;
>  }
>  
>  /*
> @@ -1064,6 +1065,12 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str,
>  				synth_opts->last_branch_sz = val;
>  			}
>  			break;
> +		case 's':
> +			synth_opts->initial_skip = strtoul(p, &endptr, 10);
> +			if (p == endptr)
> +				goto out_err;
> +			p = endptr;
> +			break;
>  		case ' ':
>  		case ',':
>  			break;
> diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h
> index 57ff31e..767989e 100644
> --- a/tools/perf/util/auxtrace.h
> +++ b/tools/perf/util/auxtrace.h
> @@ -68,6 +68,7 @@ enum itrace_period_type {
>   * @last_branch_sz: branch context size
>   * @period: 'instructions' events period
>   * @period_type: 'instructions' events period type
> + * @initial_skip: skip N events at the beginning.
>   */
>  struct itrace_synth_opts {
>  	bool			set;
> @@ -86,6 +87,7 @@ struct itrace_synth_opts {
>  	unsigned int		last_branch_sz;
>  	unsigned long long	period;
>  	enum itrace_period_type	period_type;
> +	unsigned long		initial_skip;
>  };
>  
>  /**
> diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c
> index 6bc3ecd..3ff2f72f 100644
> --- a/tools/perf/util/intel-bts.c
> +++ b/tools/perf/util/intel-bts.c
> @@ -66,6 +66,7 @@ struct intel_bts {
>  	u64				branches_id;
>  	size_t				branches_event_size;
>  	bool				synth_needs_swap;
> +	unsigned long			num_events;
>  };
>  
>  struct intel_bts_queue {
> @@ -275,6 +276,10 @@ static int intel_bts_synth_branch_sample(struct intel_bts_queue *btsq,
>  	union perf_event event;
>  	struct perf_sample sample = { .ip = 0, };
>  
> +	if (bts->synth_opts.initial_skip &&
> +	    bts->num_events++ <= bts->synth_opts.initial_skip)
> +		return 0;
> +
>  	event.sample.header.type = PERF_RECORD_SAMPLE;
>  	event.sample.header.misc = PERF_RECORD_MISC_USER;
>  	event.sample.header.size = sizeof(struct perf_event_header);
> diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
> index 05d8158..07f0020 100644
> --- a/tools/perf/util/intel-pt.c
> +++ b/tools/perf/util/intel-pt.c
> @@ -100,6 +100,8 @@ struct intel_pt {
>  	u64 cyc_bit;
>  	u64 noretcomp_bit;
>  	unsigned max_non_turbo_ratio;
> +
> +	unsigned long num_events;
>  };
>  
>  enum switch_state {
> @@ -972,6 +974,10 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
>  	if (pt->branches_filter && !(pt->branches_filter & ptq->flags))
>  		return 0;
>  
> +	if (pt->synth_opts.initial_skip &&
> +	    pt->num_events++ < pt->synth_opts.initial_skip)
> +		return 0;
> +
>  	event->sample.header.type = PERF_RECORD_SAMPLE;
>  	event->sample.header.misc = PERF_RECORD_MISC_USER;
>  	event->sample.header.size = sizeof(struct perf_event_header);
> @@ -1028,6 +1034,10 @@ static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq)
>  	union perf_event *event = ptq->event_buf;
>  	struct perf_sample sample = { .ip = 0, };
>  
> +	if (pt->synth_opts.initial_skip &&
> +	    pt->num_events++ < pt->synth_opts.initial_skip)
> +		return 0;
> +
>  	event->sample.header.type = PERF_RECORD_SAMPLE;
>  	event->sample.header.misc = PERF_RECORD_MISC_USER;
>  	event->sample.header.size = sizeof(struct perf_event_header);
> @@ -1085,6 +1095,10 @@ static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq)
>  	union perf_event *event = ptq->event_buf;
>  	struct perf_sample sample = { .ip = 0, };
>  
> +	if (pt->synth_opts.initial_skip &&
> +	    pt->num_events++ < pt->synth_opts.initial_skip)
> +		return 0;
> +
>  	event->sample.header.type = PERF_RECORD_SAMPLE;
>  	event->sample.header.misc = PERF_RECORD_MISC_USER;
>  	event->sample.header.size = sizeof(struct perf_event_header);
> @@ -1196,14 +1210,18 @@ static int intel_pt_sample(struct intel_pt_queue *ptq)
>  	ptq->have_sample = false;
>  
>  	if (pt->sample_instructions &&
> -	    (state->type & INTEL_PT_INSTRUCTION)) {
> +	    (state->type & INTEL_PT_INSTRUCTION) &&
> +	    (!pt->synth_opts.initial_skip ||
> +	     pt->num_events++ >= pt->synth_opts.initial_skip)) {
>  		err = intel_pt_synth_instruction_sample(ptq);
>  		if (err)
>  			return err;
>  	}
>  
>  	if (pt->sample_transactions &&
> -	    (state->type & INTEL_PT_TRANSACTION)) {
> +	    (state->type & INTEL_PT_TRANSACTION) &&
> +	    (!pt->synth_opts.initial_skip ||
> +	     pt->num_events++ >= pt->synth_opts.initial_skip)) {
>  		err = intel_pt_synth_transaction_sample(ptq);
>  		if (err)
>  			return err;
> -- 
> 2.5.5

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/5] perf, tools: Add support for skipping itrace instructions
  2016-03-29 16:27 ` [PATCH 1/5] perf, tools: Add support for skipping itrace instructions Arnaldo Carvalho de Melo
@ 2016-03-29 17:33   ` Adrian Hunter
  0 siblings, 0 replies; 8+ messages in thread
From: Adrian Hunter @ 2016-03-29 17:33 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Andi Kleen, jolsa, linux-kernel, eranian, Andi Kleen

On 29/03/2016 7:27 p.m., Arnaldo Carvalho de Melo wrote:
> Em Mon, Mar 28, 2016 at 10:45:38AM -0700, Andi Kleen escreveu:
>> From: Andi Kleen <ak@linux.intel.com>
>>
>> When using perf script to look at PT traces it is often
>> useful to ignore the initialization code at the beginning.
>>
>> On larger traces which may have many millions of instructions
>> in initialization code doing that in a pipeline can be very
>> slow, with perf script spending a lot of CPU time calling
>> printf and writing data.
>>
>> This patch adds an extension to the --itrace argument
>> that skips 'n' events (instructions, branches or transactions)
>> at the beginning. This is much more efficient.
>>
>> Cc: Adrian Hunter <adrian.hunter@intel.com>
>
> Adrian, are you ok now? Can I have your Acked-by?

The last hunk isn't actually needed, but apart from that:

Acked-by: Adrian Hunter <adrian.hunter@intel.com>

>
> - Arnaldo
>
>> v2:
>> Add support for BTS (Adrian Hunter)
>> Document in itrace.txt
>> Fix branch check
>> Check transactions and instructions too
>> Signed-off-by: Andi Kleen <ak@linux.intel.com>
>> ---
>>   tools/perf/Documentation/intel-pt.txt |  7 +++++++
>>   tools/perf/Documentation/itrace.txt   |  8 ++++++++
>>   tools/perf/util/auxtrace.c            |  7 +++++++
>>   tools/perf/util/auxtrace.h            |  2 ++
>>   tools/perf/util/intel-bts.c           |  5 +++++
>>   tools/perf/util/intel-pt.c            | 22 ++++++++++++++++++++--
>>   6 files changed, 49 insertions(+), 2 deletions(-)
>>
>> diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt
>> index be764f9..c6c8318 100644
>> --- a/tools/perf/Documentation/intel-pt.txt
>> +++ b/tools/perf/Documentation/intel-pt.txt
>> @@ -672,6 +672,7 @@ The letters are:
>>   	d	create a debug log
>>   	g	synthesize a call chain (use with i or x)
>>   	l	synthesize last branch entries (use with i or x)
>> +	s	skip initial number of events
>>
>>   "Instructions" events look like they were recorded by "perf record -e
>>   instructions".
>> @@ -730,6 +731,12 @@ from one sample to the next.
>>
>>   To disable trace decoding entirely, use the option --no-itrace.
>>
>> +It is also possible to skip events generated (instructions, branches, transactions)
>> +at the beginning. This is useful to ignore initialization code.
>> +
>> +	--itrace=i0nss1000000
>> +
>> +skips the first million instructions.
>>
>>   dump option
>>   -----------
>> diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt
>> index 65453f4..e2a4c5e 100644
>> --- a/tools/perf/Documentation/itrace.txt
>> +++ b/tools/perf/Documentation/itrace.txt
>> @@ -7,6 +7,7 @@
>>   		d	create a debug log
>>   		g	synthesize a call chain (use with i or x)
>>   		l	synthesize last branch entries (use with i or x)
>> +		s       skip initial number of events
>>
>>   	The default is all events i.e. the same as --itrace=ibxe
>>
>> @@ -24,3 +25,10 @@
>>
>>   	Also the number of last branch entries (default 64, max. 1024) for
>>   	instructions or transactions events can be specified.
>> +
>> +	It is also possible to skip events generated (instructions, branches, transactions)
>> +	at the beginning. This is useful to ignore initialization code.
>> +
>> +	--itrace=i0nss1000000
>> +
>> +	skips the first million instructions.
>> diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
>> index ec164fe..c916901 100644
>> --- a/tools/perf/util/auxtrace.c
>> +++ b/tools/perf/util/auxtrace.c
>> @@ -940,6 +940,7 @@ void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts)
>>   	synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD;
>>   	synth_opts->callchain_sz = PERF_ITRACE_DEFAULT_CALLCHAIN_SZ;
>>   	synth_opts->last_branch_sz = PERF_ITRACE_DEFAULT_LAST_BRANCH_SZ;
>> +	synth_opts->initial_skip = 0;
>>   }
>>
>>   /*
>> @@ -1064,6 +1065,12 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str,
>>   				synth_opts->last_branch_sz = val;
>>   			}
>>   			break;
>> +		case 's':
>> +			synth_opts->initial_skip = strtoul(p, &endptr, 10);
>> +			if (p == endptr)
>> +				goto out_err;
>> +			p = endptr;
>> +			break;
>>   		case ' ':
>>   		case ',':
>>   			break;
>> diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h
>> index 57ff31e..767989e 100644
>> --- a/tools/perf/util/auxtrace.h
>> +++ b/tools/perf/util/auxtrace.h
>> @@ -68,6 +68,7 @@ enum itrace_period_type {
>>    * @last_branch_sz: branch context size
>>    * @period: 'instructions' events period
>>    * @period_type: 'instructions' events period type
>> + * @initial_skip: skip N events at the beginning.
>>    */
>>   struct itrace_synth_opts {
>>   	bool			set;
>> @@ -86,6 +87,7 @@ struct itrace_synth_opts {
>>   	unsigned int		last_branch_sz;
>>   	unsigned long long	period;
>>   	enum itrace_period_type	period_type;
>> +	unsigned long		initial_skip;
>>   };
>>
>>   /**
>> diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c
>> index 6bc3ecd..3ff2f72f 100644
>> --- a/tools/perf/util/intel-bts.c
>> +++ b/tools/perf/util/intel-bts.c
>> @@ -66,6 +66,7 @@ struct intel_bts {
>>   	u64				branches_id;
>>   	size_t				branches_event_size;
>>   	bool				synth_needs_swap;
>> +	unsigned long			num_events;
>>   };
>>
>>   struct intel_bts_queue {
>> @@ -275,6 +276,10 @@ static int intel_bts_synth_branch_sample(struct intel_bts_queue *btsq,
>>   	union perf_event event;
>>   	struct perf_sample sample = { .ip = 0, };
>>
>> +	if (bts->synth_opts.initial_skip &&
>> +	    bts->num_events++ <= bts->synth_opts.initial_skip)
>> +		return 0;
>> +
>>   	event.sample.header.type = PERF_RECORD_SAMPLE;
>>   	event.sample.header.misc = PERF_RECORD_MISC_USER;
>>   	event.sample.header.size = sizeof(struct perf_event_header);
>> diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
>> index 05d8158..07f0020 100644
>> --- a/tools/perf/util/intel-pt.c
>> +++ b/tools/perf/util/intel-pt.c
>> @@ -100,6 +100,8 @@ struct intel_pt {
>>   	u64 cyc_bit;
>>   	u64 noretcomp_bit;
>>   	unsigned max_non_turbo_ratio;
>> +
>> +	unsigned long num_events;
>>   };
>>
>>   enum switch_state {
>> @@ -972,6 +974,10 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
>>   	if (pt->branches_filter && !(pt->branches_filter & ptq->flags))
>>   		return 0;
>>
>> +	if (pt->synth_opts.initial_skip &&
>> +	    pt->num_events++ < pt->synth_opts.initial_skip)
>> +		return 0;
>> +
>>   	event->sample.header.type = PERF_RECORD_SAMPLE;
>>   	event->sample.header.misc = PERF_RECORD_MISC_USER;
>>   	event->sample.header.size = sizeof(struct perf_event_header);
>> @@ -1028,6 +1034,10 @@ static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq)
>>   	union perf_event *event = ptq->event_buf;
>>   	struct perf_sample sample = { .ip = 0, };
>>
>> +	if (pt->synth_opts.initial_skip &&
>> +	    pt->num_events++ < pt->synth_opts.initial_skip)
>> +		return 0;
>> +
>>   	event->sample.header.type = PERF_RECORD_SAMPLE;
>>   	event->sample.header.misc = PERF_RECORD_MISC_USER;
>>   	event->sample.header.size = sizeof(struct perf_event_header);
>> @@ -1085,6 +1095,10 @@ static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq)
>>   	union perf_event *event = ptq->event_buf;
>>   	struct perf_sample sample = { .ip = 0, };
>>
>> +	if (pt->synth_opts.initial_skip &&
>> +	    pt->num_events++ < pt->synth_opts.initial_skip)
>> +		return 0;
>> +
>>   	event->sample.header.type = PERF_RECORD_SAMPLE;
>>   	event->sample.header.misc = PERF_RECORD_MISC_USER;
>>   	event->sample.header.size = sizeof(struct perf_event_header);
>> @@ -1196,14 +1210,18 @@ static int intel_pt_sample(struct intel_pt_queue *ptq)
>>   	ptq->have_sample = false;
>>
>>   	if (pt->sample_instructions &&
>> -	    (state->type & INTEL_PT_INSTRUCTION)) {
>> +	    (state->type & INTEL_PT_INSTRUCTION) &&
>> +	    (!pt->synth_opts.initial_skip ||
>> +	     pt->num_events++ >= pt->synth_opts.initial_skip)) {
>>   		err = intel_pt_synth_instruction_sample(ptq);
>>   		if (err)
>>   			return err;
>>   	}
>>
>>   	if (pt->sample_transactions &&
>> -	    (state->type & INTEL_PT_TRANSACTION)) {
>> +	    (state->type & INTEL_PT_TRANSACTION) &&
>> +	    (!pt->synth_opts.initial_skip ||
>> +	     pt->num_events++ >= pt->synth_opts.initial_skip)) {
>>   		err = intel_pt_synth_transaction_sample(ptq);
>>   		if (err)
>>   			return err;

This last hunk (@@ -1196,14 +1210,18 @@) isn't needed since
intel_pt_synth_instruction_sample() and
intel_pt_synth_transaction_sample() check initial_skip themselves.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [tip:perf/core] perf tools: Add support for skipping itrace instructions
  2016-03-28 17:45 [PATCH 1/5] perf, tools: Add support for skipping itrace instructions Andi Kleen
                   ` (4 preceding siblings ...)
  2016-03-29 16:27 ` [PATCH 1/5] perf, tools: Add support for skipping itrace instructions Arnaldo Carvalho de Melo
@ 2016-03-31  6:54 ` tip-bot for Andi Kleen
  5 siblings, 0 replies; 8+ messages in thread
From: tip-bot for Andi Kleen @ 2016-03-31  6:54 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: tglx, acme, hpa, eranian, linux-kernel, jolsa, adrian.hunter, mingo, ak

Commit-ID:  d1706b39f0af6901ab2a5e2ebb210b53c1a5bdc7
Gitweb:     http://git.kernel.org/tip/d1706b39f0af6901ab2a5e2ebb210b53c1a5bdc7
Author:     Andi Kleen <ak@linux.intel.com>
AuthorDate: Mon, 28 Mar 2016 10:45:38 -0700
Committer:  Arnaldo Carvalho de Melo <acme@redhat.com>
CommitDate: Wed, 30 Mar 2016 11:14:09 -0300

perf tools: Add support for skipping itrace instructions

When using 'perf script' to look at PT traces it is often useful to
ignore the initialization code at the beginning.

On larger traces which may have many millions of instructions in
initialization code doing that in a pipeline can be very slow, with perf
script spending a lot of CPU time calling printf and writing data.

This patch adds an extension to the --itrace argument that skips 'n'
events (instructions, branches or transactions) at the beginning. This
is much more efficient.

v2:
Add support for BTS (Adrian Hunter)
Document in itrace.txt
Fix branch check
Check transactions and instructions too

Committer note:

To test intel_pt one needs to make sure VT-x isn't active, i.e.
stopping KVM guests on the test machine, as described by Andi Kleen
at http://lkml.kernel.org/r/20160301234953.GD23621@tassilo.jf.intel.com

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lkml.kernel.org/r/1459187142-20035-1-git-send-email-andi@firstfloor.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/intel-pt.txt |  7 +++++++
 tools/perf/Documentation/itrace.txt   |  8 ++++++++
 tools/perf/util/auxtrace.c            |  7 +++++++
 tools/perf/util/auxtrace.h            |  2 ++
 tools/perf/util/intel-bts.c           |  5 +++++
 tools/perf/util/intel-pt.c            | 22 ++++++++++++++++++++--
 6 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt
index be764f9..c6c8318 100644
--- a/tools/perf/Documentation/intel-pt.txt
+++ b/tools/perf/Documentation/intel-pt.txt
@@ -672,6 +672,7 @@ The letters are:
 	d	create a debug log
 	g	synthesize a call chain (use with i or x)
 	l	synthesize last branch entries (use with i or x)
+	s	skip initial number of events
 
 "Instructions" events look like they were recorded by "perf record -e
 instructions".
@@ -730,6 +731,12 @@ from one sample to the next.
 
 To disable trace decoding entirely, use the option --no-itrace.
 
+It is also possible to skip events generated (instructions, branches, transactions)
+at the beginning. This is useful to ignore initialization code.
+
+	--itrace=i0nss1000000
+
+skips the first million instructions.
 
 dump option
 -----------
diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt
index 65453f4..e2a4c5e 100644
--- a/tools/perf/Documentation/itrace.txt
+++ b/tools/perf/Documentation/itrace.txt
@@ -7,6 +7,7 @@
 		d	create a debug log
 		g	synthesize a call chain (use with i or x)
 		l	synthesize last branch entries (use with i or x)
+		s       skip initial number of events
 
 	The default is all events i.e. the same as --itrace=ibxe
 
@@ -24,3 +25,10 @@
 
 	Also the number of last branch entries (default 64, max. 1024) for
 	instructions or transactions events can be specified.
+
+	It is also possible to skip events generated (instructions, branches, transactions)
+	at the beginning. This is useful to ignore initialization code.
+
+	--itrace=i0nss1000000
+
+	skips the first million instructions.
diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
index ec164fe..c916901 100644
--- a/tools/perf/util/auxtrace.c
+++ b/tools/perf/util/auxtrace.c
@@ -940,6 +940,7 @@ void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts)
 	synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD;
 	synth_opts->callchain_sz = PERF_ITRACE_DEFAULT_CALLCHAIN_SZ;
 	synth_opts->last_branch_sz = PERF_ITRACE_DEFAULT_LAST_BRANCH_SZ;
+	synth_opts->initial_skip = 0;
 }
 
 /*
@@ -1064,6 +1065,12 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str,
 				synth_opts->last_branch_sz = val;
 			}
 			break;
+		case 's':
+			synth_opts->initial_skip = strtoul(p, &endptr, 10);
+			if (p == endptr)
+				goto out_err;
+			p = endptr;
+			break;
 		case ' ':
 		case ',':
 			break;
diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h
index 57ff31e..767989e 100644
--- a/tools/perf/util/auxtrace.h
+++ b/tools/perf/util/auxtrace.h
@@ -68,6 +68,7 @@ enum itrace_period_type {
  * @last_branch_sz: branch context size
  * @period: 'instructions' events period
  * @period_type: 'instructions' events period type
+ * @initial_skip: skip N events at the beginning.
  */
 struct itrace_synth_opts {
 	bool			set;
@@ -86,6 +87,7 @@ struct itrace_synth_opts {
 	unsigned int		last_branch_sz;
 	unsigned long long	period;
 	enum itrace_period_type	period_type;
+	unsigned long		initial_skip;
 };
 
 /**
diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c
index abf1366..9df9960 100644
--- a/tools/perf/util/intel-bts.c
+++ b/tools/perf/util/intel-bts.c
@@ -66,6 +66,7 @@ struct intel_bts {
 	u64				branches_id;
 	size_t				branches_event_size;
 	bool				synth_needs_swap;
+	unsigned long			num_events;
 };
 
 struct intel_bts_queue {
@@ -275,6 +276,10 @@ static int intel_bts_synth_branch_sample(struct intel_bts_queue *btsq,
 	union perf_event event;
 	struct perf_sample sample = { .ip = 0, };
 
+	if (bts->synth_opts.initial_skip &&
+	    bts->num_events++ <= bts->synth_opts.initial_skip)
+		return 0;
+
 	event.sample.header.type = PERF_RECORD_SAMPLE;
 	event.sample.header.misc = PERF_RECORD_MISC_USER;
 	event.sample.header.size = sizeof(struct perf_event_header);
diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index 407f11b..ddec87f 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -100,6 +100,8 @@ struct intel_pt {
 	u64 cyc_bit;
 	u64 noretcomp_bit;
 	unsigned max_non_turbo_ratio;
+
+	unsigned long num_events;
 };
 
 enum switch_state {
@@ -972,6 +974,10 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
 	if (pt->branches_filter && !(pt->branches_filter & ptq->flags))
 		return 0;
 
+	if (pt->synth_opts.initial_skip &&
+	    pt->num_events++ < pt->synth_opts.initial_skip)
+		return 0;
+
 	event->sample.header.type = PERF_RECORD_SAMPLE;
 	event->sample.header.misc = PERF_RECORD_MISC_USER;
 	event->sample.header.size = sizeof(struct perf_event_header);
@@ -1029,6 +1035,10 @@ static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq)
 	union perf_event *event = ptq->event_buf;
 	struct perf_sample sample = { .ip = 0, };
 
+	if (pt->synth_opts.initial_skip &&
+	    pt->num_events++ < pt->synth_opts.initial_skip)
+		return 0;
+
 	event->sample.header.type = PERF_RECORD_SAMPLE;
 	event->sample.header.misc = PERF_RECORD_MISC_USER;
 	event->sample.header.size = sizeof(struct perf_event_header);
@@ -1087,6 +1097,10 @@ static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq)
 	union perf_event *event = ptq->event_buf;
 	struct perf_sample sample = { .ip = 0, };
 
+	if (pt->synth_opts.initial_skip &&
+	    pt->num_events++ < pt->synth_opts.initial_skip)
+		return 0;
+
 	event->sample.header.type = PERF_RECORD_SAMPLE;
 	event->sample.header.misc = PERF_RECORD_MISC_USER;
 	event->sample.header.size = sizeof(struct perf_event_header);
@@ -1199,14 +1213,18 @@ static int intel_pt_sample(struct intel_pt_queue *ptq)
 	ptq->have_sample = false;
 
 	if (pt->sample_instructions &&
-	    (state->type & INTEL_PT_INSTRUCTION)) {
+	    (state->type & INTEL_PT_INSTRUCTION) &&
+	    (!pt->synth_opts.initial_skip ||
+	     pt->num_events++ >= pt->synth_opts.initial_skip)) {
 		err = intel_pt_synth_instruction_sample(ptq);
 		if (err)
 			return err;
 	}
 
 	if (pt->sample_transactions &&
-	    (state->type & INTEL_PT_TRANSACTION)) {
+	    (state->type & INTEL_PT_TRANSACTION) &&
+	    (!pt->synth_opts.initial_skip ||
+	     pt->num_events++ >= pt->synth_opts.initial_skip)) {
 		err = intel_pt_synth_transaction_sample(ptq);
 		if (err)
 			return err;

^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2016-03-31  6:55 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-03-28 17:45 [PATCH 1/5] perf, tools: Add support for skipping itrace instructions Andi Kleen
2016-03-28 17:45 ` [PATCH 2/5] perf, tools: Add probing for udev86 Andi Kleen
2016-03-28 17:45 ` [PATCH 3/5] perf, tools, script: Add support for printing assembler Andi Kleen
2016-03-28 17:45 ` [PATCH 4/5] perf, tools, script: Add brstackasm output for branch stacks Andi Kleen
2016-03-28 17:45 ` [PATCH 5/5] perf, tools, report: Add srcline_from/to branch sort keys Andi Kleen
2016-03-29 16:27 ` [PATCH 1/5] perf, tools: Add support for skipping itrace instructions Arnaldo Carvalho de Melo
2016-03-29 17:33   ` Adrian Hunter
2016-03-31  6:54 ` [tip:perf/core] perf " tip-bot for Andi Kleen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).