All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] perf, tools, script: Add brstackinsn for branch stacks
@ 2017-02-23 23:46 Andi Kleen
  2017-03-16  2:16 ` Michael Ellerman
  2017-03-16 16:36 ` [tip:perf/core] perf script: Add 'brstackinsn' " tip-bot for Andi Kleen
  0 siblings, 2 replies; 5+ messages in thread
From: Andi Kleen @ 2017-02-23 23:46 UTC (permalink / raw)
  To: acme; +Cc: mingo, jolsa, linux-kernel, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Implement printing instruction sequences as hex dump for branch stacks.
This relies on the x86 instruction decoder used by the PT decoder to find
the lengths of instructions to dump them individually.

This is good enough for pattern matching.

This allows to study hot paths for individual samples, together with branch
misprediction and cycle count / IPC information if available (on Skylake systems).

% perf record -b ...
% perf script -F brstackinsn
...
  read_hpet+67:
        ffffffff9905b843        insn: 74 ea                     # PRED
        ffffffff9905b82f        insn: 85 c9
        ffffffff9905b831        insn: 74 12
        ffffffff9905b833        insn: f3 90
        ffffffff9905b835        insn: 48 8b 0f
        ffffffff9905b838        insn: 48 89 ca
        ffffffff9905b83b        insn: 48 c1 ea 20
        ffffffff9905b83f        insn: 39 f2
        ffffffff9905b841        insn: 89 d0
        ffffffff9905b843        insn: 74 ea                     # PRED

Only works when no special branch filters are specified.

Occasionally the path does not reach up to the sample IP, as the LBRs
may be frozen before executing a final jump. In this case we print
a special message.

The instruction dumper piggy backs on the existing infrastructure from
the IP PT decoder.

An earlier iteration of this patch relied on a disassembler, but
this version only uses the existing instruction decoder.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 tools/perf/Documentation/perf-script.txt           |  13 +-
 tools/perf/builtin-script.c                        | 289 ++++++++++++++++++++-
 tools/perf/util/Build                              |   1 +
 tools/perf/util/dump-insn.c                        |  16 ++
 tools/perf/util/dump-insn.h                        |  19 ++
 .../util/intel-pt-decoder/intel-pt-insn-decoder.c  |  24 ++
 6 files changed, 351 insertions(+), 11 deletions(-)
 create mode 100644 tools/perf/util/dump-insn.c
 create mode 100644 tools/perf/util/dump-insn.h

diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 4ed5f239ba7d..2ad57201ea02 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -116,7 +116,7 @@ OPTIONS
 --fields::
         Comma separated list of fields to print. Options are:
         comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
-        srcline, period, iregs, brstack, brstacksym, flags, bpf-output,
+        srcline, period, iregs, brstack, brstacksym, flags, bpf-output, brstackinsn,
         callindent, insn, insnlen. Field list can be prepended with the type, trace, sw or hw,
         to indicate to which event type the field list applies.
         e.g., -F sw:comm,tid,time,ip,sym  and -F trace:time,cpu,trace
@@ -189,15 +189,20 @@ OPTIONS
 	i.e., -F "" is not allowed.
 
 	The brstack output includes branch related information with raw addresses using the
-	/v/v/v/v/ syntax in the following order:
+	/v/v/v/v/cycles syntax in the following order:
 	FROM: branch source instruction
 	TO  : branch target instruction
         M/P/-: M=branch target mispredicted or branch direction was mispredicted, P=target predicted or direction predicted, -=not supported
 	X/- : X=branch inside a transactional region, -=not in transaction region or not supported
 	A/- : A=TSX abort entry, -=not aborted region or not supported
+	cycles
 
 	The brstacksym is identical to brstack, except that the FROM and TO addresses are printed in a symbolic form if possible.
 
+	When brstackinsn is specified the full assembler sequences of branch sequences for each sample
+	is printed. This is the full execution path leading to the sample. This is only supported when the
+	sample was recorded with perf record -b or -j any.
+
 -k::
 --vmlinux=<file>::
         vmlinux pathname
@@ -299,6 +304,10 @@ include::itrace.txt[]
 	stop time is not given (i.e, time string is 'x.y,') then analysis goes
 	to end of file.
 
+--max-blocks::
+	Set the maximum number of program blocks to print with brstackasm for
+	each sample.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-script-perl[1],
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index c0783b4f7b6c..b0fdb6c032a5 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -28,6 +28,7 @@
 #include <linux/time64.h>
 #include "asm/bug.h"
 #include "util/mem-events.h"
+#include "util/dump-insn.h"
 
 static char const		*script_name;
 static char const		*generate_script_lang;
@@ -42,6 +43,7 @@ static bool			nanosecs;
 static const char		*cpu_list;
 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 static struct perf_stat_config	stat_config;
+static int			max_blocks;
 
 unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH;
 
@@ -69,6 +71,7 @@ enum perf_output_field {
 	PERF_OUTPUT_CALLINDENT	    = 1U << 20,
 	PERF_OUTPUT_INSN	    = 1U << 21,
 	PERF_OUTPUT_INSNLEN	    = 1U << 22,
+	PERF_OUTPUT_BRSTACKINSN	    = 1U << 23,
 };
 
 struct output_option {
@@ -98,6 +101,7 @@ struct output_option {
 	{.str = "callindent", .field = PERF_OUTPUT_CALLINDENT},
 	{.str = "insn", .field = PERF_OUTPUT_INSN},
 	{.str = "insnlen", .field = PERF_OUTPUT_INSNLEN},
+	{.str = "brstackinsn", .field = PERF_OUTPUT_BRSTACKINSN},
 };
 
 /* default set to maintain compatibility with current format */
@@ -292,7 +296,12 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
 		       "selected. Hence, no address to lookup the source line number.\n");
 		return -EINVAL;
 	}
-
+	if (PRINT_FIELD(BRSTACKINSN) &&
+	    !(perf_evlist__combined_branch_type(session->evlist) &
+	      PERF_SAMPLE_BRANCH_ANY)) {
+		pr_err("Display of branch stack assembler requested, but non all-branch filter set\n");
+		return -EINVAL;
+	}
 	if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) &&
 		perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID",
 					PERF_OUTPUT_TID|PERF_OUTPUT_PID))
@@ -546,6 +555,259 @@ static void print_sample_brstacksym(struct perf_sample *sample,
 	}
 }
 
+#define MAXBB 16384UL
+
+static int grab_bb(u8 *buffer, u64 start, u64 end,
+		    struct machine *machine, struct thread *thread,
+		    bool *is64bit, u8 *cpumode, bool last)
+{
+	long offset, len;
+	struct addr_location al;
+	bool kernel;
+
+	if (!start || !end)
+		return 0;
+
+	kernel = machine__kernel_ip(machine, start);
+	if (kernel)
+		*cpumode = PERF_RECORD_MISC_KERNEL;
+	else
+		*cpumode = PERF_RECORD_MISC_USER;
+
+	/*
+	 * Block overlaps between kernel and user.
+	 * This can happen due to ring filtering
+	 * On Intel CPUs the entry into the kernel is filtered,
+	 * but the exit is not. Let the caller patch it up.
+	 */
+	if (kernel != machine__kernel_ip(machine, end)) {
+		printf("\tblock %" PRIx64 "-%" PRIx64 " transfers between kernel and user\n",
+				start, end);
+		return -ENXIO;
+	}
+
+	memset(&al, 0, sizeof(al));
+	if (end - start > MAXBB - MAXINSN) {
+		if (last)
+			printf("\tbrstack does not reach to final jump (%" PRIx64 "-%" PRIx64 ")\n",
+					start, end);
+		else
+			printf("\tblock %" PRIx64 "-%" PRIx64 " (%ld) too long to dump\n",
+					start, end, end - start);
+		return 0;
+	}
+
+	thread__find_addr_map(thread, *cpumode, MAP__FUNCTION, start, &al);
+	if (!al.map || !al.map->dso) {
+		printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n",
+				start, end);
+		return 0;
+	}
+	if (al.map->dso->data.status == DSO_DATA_STATUS_ERROR) {
+		printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n",
+				start, end);
+		return 0;
+	}
+
+	/* Load maps to ensure dso->is_64_bit has been updated */
+	map__load(al.map);
+
+	offset = al.map->map_ip(al.map, start);
+	len = dso__data_read_offset(al.map->dso, machine,
+				    offset, (u8 *)buffer,
+				    end - start + MAXINSN);
+
+	*is64bit = al.map->dso->is_64_bit;
+	if (len <= 0)
+		printf("\tcannot fetch code for block at %" PRIx64 "-%" PRIx64 "\n",
+			start, end);
+	return len;
+}
+
+static void print_jump(uint64_t ip, struct branch_entry *en,
+		       struct perf_insn *x, u8 *inbuf, int len,
+		       int insn)
+{
+	printf("\t%016" PRIx64 "\t%-30s\t#%s%s%s%s",
+	       ip,
+	       dump_insn(x, ip, inbuf, len, NULL),
+	       en->flags.predicted ? " PRED" : "",
+	       en->flags.mispred ? " MISPRED" : "",
+	       en->flags.in_tx ? " INTX" : "",
+	       en->flags.abort ? " ABORT" : "");
+	if (en->flags.cycles) {
+		printf(" %d cycles", en->flags.cycles);
+		if (insn)
+			printf(" %.2f IPC", (float)insn / en->flags.cycles);
+	}
+	putchar('\n');
+}
+
+static void print_ip_sym(struct thread *thread,
+			 u8 cpumode, int cpu,
+			 uint64_t addr,
+			 struct symbol **lastsym,
+			 struct perf_event_attr *attr)
+{
+	struct addr_location al;
+	int off;
+
+	memset(&al, 0, sizeof(struct addr_location));
+
+	thread__find_addr_map(thread, cpumode, MAP__FUNCTION, addr, &al);
+	if (!al.map)
+		thread__find_addr_map(thread, cpumode, MAP__VARIABLE,
+				      addr, &al);
+	if ((*lastsym) && al.addr >= (*lastsym)->start && al.addr < (*lastsym)->end)
+		return;
+
+	al.cpu = cpu;
+	al.sym = NULL;
+	if (al.map)
+		al.sym = map__find_symbol(al.map, al.addr);
+
+	if (!al.sym)
+		return;
+
+	if (al.addr < al.sym->end)
+		off = al.addr - al.sym->start;
+	else
+		off = al.addr - al.map->start - al.sym->start;
+	printf("\t%s", al.sym->name);
+	if (off)
+		printf("%+d", off);
+	putchar(':');
+	if (PRINT_FIELD(SRCLINE))
+		map__fprintf_srcline(al.map, al.addr, "\t", stdout);
+	putchar('\n');
+	*lastsym = al.sym;
+}
+
+static void print_sample_brstackinsn(struct perf_sample *sample,
+				    struct thread *thread,
+				    struct perf_event_attr *attr,
+				    struct machine *machine)
+{
+	struct branch_stack *br = sample->branch_stack;
+	u64 start, end;
+	int i, insn;
+	struct perf_insn x;
+	u8 buffer[MAXBB];
+	int len;
+	int nr;
+	unsigned off;
+	int ilen;
+	struct symbol *lastsym = NULL;
+
+	if (!(br && br->nr))
+		return;
+	nr = br->nr;
+	if (max_blocks && nr > max_blocks + 1)
+		nr = max_blocks + 1;
+
+	x.thread = thread;
+	x.cpu = sample->cpu;
+
+	putchar('\n');
+
+	/* Handle first from jump, of which we don't know the entry. */
+	len = grab_bb(buffer, br->entries[nr-1].from,
+			br->entries[nr-1].from,
+			machine, thread, &x.is64bit, &x.cpumode, false);
+	if (len > 0) {
+		print_ip_sym(thread, x.cpumode, x.cpu,
+			     br->entries[nr - 1].from,
+			     &lastsym, attr);
+		print_jump(br->entries[nr - 1].from, &br->entries[nr - 1],
+			&x, buffer, len, 0);
+	}
+
+	/* Print all blocks */
+	for (i = nr - 2; i >= 0; i--) {
+		if (br->entries[i].from || br->entries[i].to)
+			pr_debug("%d: %" PRIx64 "-%" PRIx64 "\n", i,
+				 br->entries[i].from,
+				 br->entries[i].to);
+		start = br->entries[i + 1].to;
+		end = br->entries[i].from;
+
+		len = grab_bb(buffer, start, end,
+				machine, thread, &x.is64bit,
+				&x.cpumode, false);
+		/* Patch up missing kernel transfers due to ring filters */
+		if (len == -ENXIO && i > 0) {
+			end = br->entries[--i].from;
+			pr_debug("\tpatching up to %" PRIx64 "-%" PRIx64 "\n",
+					start, end);
+			len = grab_bb(buffer, start, end,
+				      machine, thread, &x.is64bit,
+				      &x.cpumode, false);
+		}
+		if (len <= 0)
+			continue;
+
+		insn = 0;
+		for (off = 0;; off += ilen) {
+			uint64_t ip = start + off;
+
+			print_ip_sym(thread, x.cpumode, x.cpu,
+				     ip,
+				     &lastsym, attr);
+			if (ip == end) {
+				print_jump(ip, &br->entries[i], &x,
+					   buffer + off,
+					   len - off, insn);
+				break;
+			} else {
+				printf("\t%016" PRIx64 "\t%s\n", ip,
+					dump_insn(&x, ip, buffer + off,
+						   len - off, &ilen));
+				if (ilen == 0)
+					break;
+				insn++;
+			}
+		}
+	}
+
+	/*
+	 * Hit the branch? In this case we are already done, and the target
+	 * has not been executed yet.
+	 */
+	if (br->entries[0].from == sample->ip)
+		return;
+	if (br->entries[0].flags.abort)
+		return;
+
+	/*
+	 * Print final block upto sample
+	 */
+	start = br->entries[0].to;
+	end = sample->ip;
+	len = grab_bb(buffer, start, end, machine, thread, &x.is64bit,
+			&x.cpumode, true);
+	print_ip_sym(thread, x.cpumode, x.cpu,
+		     start,
+		     &lastsym, attr);
+	if (len <= 0) {
+		/* Print at least last IP if basic block did not work */
+		len = grab_bb(buffer, sample->ip, sample->ip,
+				machine, thread, &x.is64bit, &x.cpumode,
+				false);
+		if (len <= 0)
+			return;
+
+		printf("\t%016" PRIx64 "\t%s\n", sample->ip,
+			dump_insn(&x, sample->ip, buffer, len, NULL));
+		return;
+	}
+	for (off = 0; off <= end - start; off += ilen) {
+		printf("\t%016" PRIx64 "\t%s\n", start + off,
+				dump_insn(&x, start + off, buffer + off,
+					   len - off, &ilen));
+		if (ilen == 0)
+			break;
+	}
+}
 
 static void print_sample_addr(struct perf_sample *sample,
 			  struct thread *thread,
@@ -632,7 +894,9 @@ static void print_sample_callindent(struct perf_sample *sample,
 }
 
 static void print_insn(struct perf_sample *sample,
-		       struct perf_event_attr *attr)
+		       struct perf_event_attr *attr,
+		       struct thread *thread,
+		       struct machine *machine)
 {
 	if (PRINT_FIELD(INSNLEN))
 		printf(" ilen: %d", sample->insn_len);
@@ -643,12 +907,15 @@ static void print_insn(struct perf_sample *sample,
 		for (i = 0; i < sample->insn_len; i++)
 			printf(" %02x", (unsigned char)sample->insn[i]);
 	}
+	if (PRINT_FIELD(BRSTACKINSN))
+		print_sample_brstackinsn(sample, thread, attr, machine);
 }
 
 static void print_sample_bts(struct perf_sample *sample,
 			     struct perf_evsel *evsel,
 			     struct thread *thread,
-			     struct addr_location *al)
+			     struct addr_location *al,
+			     struct machine *machine)
 {
 	struct perf_event_attr *attr = &evsel->attr;
 	bool print_srcline_last = false;
@@ -689,7 +956,7 @@ static void print_sample_bts(struct perf_sample *sample,
 	if (print_srcline_last)
 		map__fprintf_srcline(al->map, al->addr, "\n  ", stdout);
 
-	print_insn(sample, attr);
+	print_insn(sample, attr, thread, machine);
 
 	printf("\n");
 }
@@ -871,7 +1138,8 @@ static size_t data_src__printf(u64 data_src)
 
 static void process_event(struct perf_script *script,
 			  struct perf_sample *sample, struct perf_evsel *evsel,
-			  struct addr_location *al)
+			  struct addr_location *al,
+			  struct machine *machine)
 {
 	struct thread *thread = al->thread;
 	struct perf_event_attr *attr = &evsel->attr;
@@ -898,7 +1166,7 @@ static void process_event(struct perf_script *script,
 		print_sample_flags(sample->flags);
 
 	if (is_bts_event(attr)) {
-		print_sample_bts(sample, evsel, thread, al);
+		print_sample_bts(sample, evsel, thread, al, machine);
 		return;
 	}
 
@@ -936,7 +1204,7 @@ static void process_event(struct perf_script *script,
 
 	if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
 		print_sample_bpf_output(sample);
-	print_insn(sample, attr);
+	print_insn(sample, attr, thread, machine);
 	printf("\n");
 }
 
@@ -1046,7 +1314,7 @@ static int process_sample_event(struct perf_tool *tool,
 	if (scripting_ops)
 		scripting_ops->process_event(event, sample, evsel, &al);
 	else
-		process_event(scr, sample, evsel, &al);
+		process_event(scr, sample, evsel, &al, machine);
 
 out_put:
 	addr_location__put(&al);
@@ -2152,7 +2420,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 		     "Valid types: hw,sw,trace,raw. "
 		     "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
 		     "addr,symoff,period,iregs,brstack,brstacksym,flags,"
-		     "bpf-output,callindent,insn,insnlen", parse_output_fields),
+		     "bpf-output,callindent,insn,insnlen,brstackinsn",
+		     parse_output_fields),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 		    "system-wide collection from all CPUs"),
 	OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
@@ -2181,6 +2450,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_BOOLEAN('\0', "show-switch-events", &script.show_switch_events,
 		    "Show context switch events (if recorded)"),
 	OPT_BOOLEAN('f', "force", &symbol_conf.force, "don't complain, do it"),
+	OPT_INTEGER(0, "max-blocks", &max_blocks,
+		    "Maximum number of code blocks to dump with brstackinsn"),
 	OPT_BOOLEAN(0, "ns", &nanosecs,
 		    "Use 9 decimal places when displaying time"),
 	OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts",
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 5da376bc1afc..47ff85d393bb 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -81,6 +81,7 @@ libperf-$(CONFIG_AUXTRACE) += intel-pt-decoder/
 libperf-$(CONFIG_AUXTRACE) += intel-pt.o
 libperf-$(CONFIG_AUXTRACE) += intel-bts.o
 libperf-y += parse-branch-options.o
+libperf-y += dump-insn.o
 libperf-y += parse-regs-options.o
 libperf-y += term.o
 libperf-y += help-unknown-cmd.o
diff --git a/tools/perf/util/dump-insn.c b/tools/perf/util/dump-insn.c
new file mode 100644
index 000000000000..400d767c5dbb
--- /dev/null
+++ b/tools/perf/util/dump-insn.c
@@ -0,0 +1,16 @@
+#include "perf.h"
+#include "dump-insn.h"
+
+/* Fallback code */
+
+__weak
+const char *dump_insn(struct perf_insn *x __maybe_unused,
+		      uint64_t ip __maybe_unused,
+		      u8 *inbuf __maybe_unused,
+		      int inlen __maybe_unused,
+		      int *lenp __maybe_unused)
+{
+	if (lenp)
+		*lenp = 0;
+	return "?";
+}
diff --git a/tools/perf/util/dump-insn.h b/tools/perf/util/dump-insn.h
new file mode 100644
index 000000000000..39485cdd43e9
--- /dev/null
+++ b/tools/perf/util/dump-insn.h
@@ -0,0 +1,19 @@
+#ifndef DUMP_INSN_H
+#define DUMP_INSN_H 1
+
+#define MAXINSN 15
+
+struct perf_insn {
+	/* Initialized by callers: */
+	struct thread *thread;
+	u8 cpumode;
+	int cpu;
+	bool is64bit;
+	/* Temporary */
+	char out[256];
+};
+
+const char *dump_insn(struct perf_insn *x, uint64_t ip,
+		      u8 *inbuf, int inlen, int *lenp);
+
+#endif
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c
index 7913363bde5c..6b102d3117f7 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c
@@ -26,6 +26,7 @@
 #include "insn.c"
 
 #include "intel-pt-insn-decoder.h"
+#include "dump-insn.h"
 
 #if INTEL_PT_INSN_BUF_SZ < MAX_INSN_SIZE || INTEL_PT_INSN_BUF_SZ > MAX_INSN
 #error Instruction buffer size too small
@@ -177,6 +178,29 @@ int intel_pt_get_insn(const unsigned char *buf, size_t len, int x86_64,
 	return 0;
 }
 
+const char *dump_insn(struct perf_insn *x, uint64_t ip __maybe_unused,
+		      u8 *inbuf, int inlen, int *lenp)
+{
+	struct insn insn;
+	int n, i;
+	int left;
+
+	insn_init(&insn, inbuf, inlen, x->is64bit);
+	insn_get_length(&insn);
+	if (!insn_complete(&insn) || insn.length > inlen)
+		return "<bad>";
+	if (lenp)
+		*lenp = insn.length;
+	left = sizeof(x->out);
+	n = snprintf(x->out, left, "insn: ");
+	left -= n;
+	for (i = 0; i < insn.length; i++) {
+		n += snprintf(x->out + n, left, "%02x ", inbuf[i]);
+		left -= n;
+	}
+	return x->out;
+}
+
 const char *branch_name[] = {
 	[INTEL_PT_OP_OTHER]	= "Other",
 	[INTEL_PT_OP_CALL]	= "Call",
-- 
2.9.3

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] perf, tools, script: Add brstackinsn for branch stacks
  2017-02-23 23:46 [PATCH] perf, tools, script: Add brstackinsn for branch stacks Andi Kleen
@ 2017-03-16  2:16 ` Michael Ellerman
  2017-03-16  2:22   ` Andi Kleen
  2017-03-16 16:36 ` [tip:perf/core] perf script: Add 'brstackinsn' " tip-bot for Andi Kleen
  1 sibling, 1 reply; 5+ messages in thread
From: Michael Ellerman @ 2017-03-16  2:16 UTC (permalink / raw)
  To: acme; +Cc: mingo, jolsa, linux-kernel, Andi Kleen, Andi Kleen

Hi acme,

Andi Kleen <andi@firstfloor.org> writes:

> diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
> index c0783b4f7b6c..b0fdb6c032a5 100644
> --- a/tools/perf/builtin-script.c
> +++ b/tools/perf/builtin-script.c
> @@ -546,6 +555,259 @@ static void print_sample_brstacksym(struct perf_sample *sample,
>  	}
>  }
>  
> +#define MAXBB 16384UL
> +
> +static int grab_bb(u8 *buffer, u64 start, u64 end,
> +		    struct machine *machine, struct thread *thread,
> +		    bool *is64bit, u8 *cpumode, bool last)
> +{
> +	long offset, len;
> +	struct addr_location al;
> +	bool kernel;
> +
> +	if (!start || !end)
> +		return 0;
> +
> +	kernel = machine__kernel_ip(machine, start);
> +	if (kernel)
> +		*cpumode = PERF_RECORD_MISC_KERNEL;
> +	else
> +		*cpumode = PERF_RECORD_MISC_USER;
> +
> +	/*
> +	 * Block overlaps between kernel and user.
> +	 * This can happen due to ring filtering
> +	 * On Intel CPUs the entry into the kernel is filtered,
> +	 * but the exit is not. Let the caller patch it up.
> +	 */
> +	if (kernel != machine__kernel_ip(machine, end)) {
> +		printf("\tblock %" PRIx64 "-%" PRIx64 " transfers between kernel and user\n",
> +				start, end);
> +		return -ENXIO;
> +	}
> +
> +	memset(&al, 0, sizeof(al));
> +	if (end - start > MAXBB - MAXINSN) {
> +		if (last)
> +			printf("\tbrstack does not reach to final jump (%" PRIx64 "-%" PRIx64 ")\n",
> +					start, end);
> +		else
> +			printf("\tblock %" PRIx64 "-%" PRIx64 " (%ld) too long to dump\n",
> +					start, end, end - start);

This looks to be breaking the build on ppc64:

  builtin-script.c: In function ‘grab_bb’:
  builtin-script.c:595:11: error: format ‘%ld’ expects argument of type ‘long int’, but argument 4 has type ‘u64 {aka long long unsigned int}’ [-Werror=format=]
      printf("\tblock %" PRIx64 "-%" PRIx64 " (%ld) too long to dump\n", start, end, end - start);
             ^

cheers

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] perf, tools, script: Add brstackinsn for branch stacks
  2017-03-16  2:16 ` Michael Ellerman
@ 2017-03-16  2:22   ` Andi Kleen
  2017-03-16 12:25     ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 5+ messages in thread
From: Andi Kleen @ 2017-03-16  2:22 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: acme, mingo, jolsa, linux-kernel, Andi Kleen, Andi Kleen

> > +	memset(&al, 0, sizeof(al));
> > +	if (end - start > MAXBB - MAXINSN) {
> > +		if (last)
> > +			printf("\tbrstack does not reach to final jump (%" PRIx64 "-%" PRIx64 ")\n",
> > +					start, end);
> > +		else
> > +			printf("\tblock %" PRIx64 "-%" PRIx64 " (%ld) too long to dump\n",

Thanks. Should be %" PRIu64 " of course.

Arnaldo, can you do the change or should I send a patch?

-Andi

> > +					start, end, end - start);
> 
> This looks to be breaking the build on ppc64:
> 
>   builtin-script.c: In function ‘grab_bb’:
>   builtin-script.c:595:11: error: format ‘%ld’ expects argument of type ‘long int’, but argument 4 has type ‘u64 {aka long long unsigned int}’ [-Werror=format=]
>       printf("\tblock %" PRIx64 "-%" PRIx64 " (%ld) too long to dump\n", start, end, end - start);
>              ^
> 
> cheers
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] perf, tools, script: Add brstackinsn for branch stacks
  2017-03-16  2:22   ` Andi Kleen
@ 2017-03-16 12:25     ` Arnaldo Carvalho de Melo
  0 siblings, 0 replies; 5+ messages in thread
From: Arnaldo Carvalho de Melo @ 2017-03-16 12:25 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Michael Ellerman, mingo, jolsa, linux-kernel, Andi Kleen

Em Wed, Mar 15, 2017 at 07:22:18PM -0700, Andi Kleen escreveu:
> > > +	memset(&al, 0, sizeof(al));
> > > +	if (end - start > MAXBB - MAXINSN) {
> > > +		if (last)
> > > +			printf("\tbrstack does not reach to final jump (%" PRIx64 "-%" PRIx64 ")\n",
> > > +					start, end);
> > > +		else
> > > +			printf("\tblock %" PRIx64 "-%" PRIx64 " (%ld) too long to dump\n",
> 
> Thanks. Should be %" PRIu64 " of course.
> 
> Arnaldo, can you do the change 

Done.

> or should I send a patch?

- Arnaldo

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [tip:perf/core] perf script: Add 'brstackinsn' for branch stacks
  2017-02-23 23:46 [PATCH] perf, tools, script: Add brstackinsn for branch stacks Andi Kleen
  2017-03-16  2:16 ` Michael Ellerman
@ 2017-03-16 16:36 ` tip-bot for Andi Kleen
  1 sibling, 0 replies; 5+ messages in thread
From: tip-bot for Andi Kleen @ 2017-03-16 16:36 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: hpa, tglx, mpe, mingo, linux-kernel, acme, jolsa, ak

Commit-ID:  48d02a1d5c137d362defd11a5d57d0af4a75a983
Gitweb:     http://git.kernel.org/tip/48d02a1d5c137d362defd11a5d57d0af4a75a983
Author:     Andi Kleen <ak@linux.intel.com>
AuthorDate: Thu, 23 Feb 2017 15:46:34 -0800
Committer:  Arnaldo Carvalho de Melo <acme@redhat.com>
CommitDate: Thu, 16 Mar 2017 09:24:35 -0300

perf script: Add 'brstackinsn' for branch stacks

Implement printing instruction sequences as hex dump for branch stacks.

This relies on the x86 instruction decoder used by the PT decoder to
find the lengths of instructions to dump them individually.

This is good enough for pattern matching.

This allows to study hot paths for individual samples, together with
branch misprediction and cycle count / IPC information if available (on
Skylake systems).

  % perf record -b ...
  % perf script -F brstackinsn
  ...
    read_hpet+67:
          ffffffff9905b843        insn: 74 ea                     # PRED
          ffffffff9905b82f        insn: 85 c9
          ffffffff9905b831        insn: 74 12
          ffffffff9905b833        insn: f3 90
          ffffffff9905b835        insn: 48 8b 0f
          ffffffff9905b838        insn: 48 89 ca
          ffffffff9905b83b        insn: 48 c1 ea 20
          ffffffff9905b83f        insn: 39 f2
          ffffffff9905b841        insn: 89 d0
          ffffffff9905b843        insn: 74 ea                     # PRED

Only works when no special branch filters are specified.

Occasionally the path does not reach up to the sample IP, as the LBRs
may be frozen before executing a final jump. In this case we print a
special message.

The instruction dumper piggy backs on the existing infrastructure from
the IP PT decoder.

An earlier iteration of this patch relied on a disassembler, but this
version only uses the existing instruction decoder.

Committer note:

Added hint about how to get suitable perf.data files for use with
'-F brstackinsm':

  $ perf record usleep 1
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.018 MB perf.data (8 samples) ]
  $
  $ perf script -F brstackinsn
  Display of branch stack assembler requested, but non all-branch filter set
  Hint: run 'perf record -b ...'
  $

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Link: http://lkml.kernel.org/r/20170223234634.583-1-andi@firstfloor.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-script.txt           |  13 +-
 tools/perf/builtin-script.c                        | 264 ++++++++++++++++++++-
 tools/perf/util/Build                              |   1 +
 tools/perf/util/dump-insn.c                        |  14 ++
 tools/perf/util/dump-insn.h                        |  22 ++
 .../util/intel-pt-decoder/intel-pt-insn-decoder.c  |  24 ++
 6 files changed, 327 insertions(+), 11 deletions(-)

diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 62c9b0c..cb0eda3 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -116,7 +116,7 @@ OPTIONS
 --fields::
         Comma separated list of fields to print. Options are:
         comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
-        srcline, period, iregs, brstack, brstacksym, flags, bpf-output,
+        srcline, period, iregs, brstack, brstacksym, flags, bpf-output, brstackinsn,
         callindent, insn, insnlen. Field list can be prepended with the type, trace, sw or hw,
         to indicate to which event type the field list applies.
         e.g., -F sw:comm,tid,time,ip,sym  and -F trace:time,cpu,trace
@@ -189,15 +189,20 @@ OPTIONS
 	i.e., -F "" is not allowed.
 
 	The brstack output includes branch related information with raw addresses using the
-	/v/v/v/v/ syntax in the following order:
+	/v/v/v/v/cycles syntax in the following order:
 	FROM: branch source instruction
 	TO  : branch target instruction
         M/P/-: M=branch target mispredicted or branch direction was mispredicted, P=target predicted or direction predicted, -=not supported
 	X/- : X=branch inside a transactional region, -=not in transaction region or not supported
 	A/- : A=TSX abort entry, -=not aborted region or not supported
+	cycles
 
 	The brstacksym is identical to brstack, except that the FROM and TO addresses are printed in a symbolic form if possible.
 
+	When brstackinsn is specified the full assembler sequences of branch sequences for each sample
+	is printed. This is the full execution path leading to the sample. This is only supported when the
+	sample was recorded with perf record -b or -j any.
+
 -k::
 --vmlinux=<file>::
         vmlinux pathname
@@ -302,6 +307,10 @@ include::itrace.txt[]
 	stop time is not given (i.e, time string is 'x.y,') then analysis goes
 	to end of file.
 
+--max-blocks::
+	Set the maximum number of program blocks to print with brstackasm for
+	each sample.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-script-perl[1],
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 66d62c9..c98e166 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -28,6 +28,7 @@
 #include <linux/time64.h>
 #include "asm/bug.h"
 #include "util/mem-events.h"
+#include "util/dump-insn.h"
 
 static char const		*script_name;
 static char const		*generate_script_lang;
@@ -42,6 +43,7 @@ static bool			nanosecs;
 static const char		*cpu_list;
 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 static struct perf_stat_config	stat_config;
+static int			max_blocks;
 
 unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH;
 
@@ -69,6 +71,7 @@ enum perf_output_field {
 	PERF_OUTPUT_CALLINDENT	    = 1U << 20,
 	PERF_OUTPUT_INSN	    = 1U << 21,
 	PERF_OUTPUT_INSNLEN	    = 1U << 22,
+	PERF_OUTPUT_BRSTACKINSN	    = 1U << 23,
 };
 
 struct output_option {
@@ -98,6 +101,7 @@ struct output_option {
 	{.str = "callindent", .field = PERF_OUTPUT_CALLINDENT},
 	{.str = "insn", .field = PERF_OUTPUT_INSN},
 	{.str = "insnlen", .field = PERF_OUTPUT_INSNLEN},
+	{.str = "brstackinsn", .field = PERF_OUTPUT_BRSTACKINSN},
 };
 
 /* default set to maintain compatibility with current format */
@@ -292,7 +296,13 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
 		       "selected. Hence, no address to lookup the source line number.\n");
 		return -EINVAL;
 	}
-
+	if (PRINT_FIELD(BRSTACKINSN) &&
+	    !(perf_evlist__combined_branch_type(session->evlist) &
+	      PERF_SAMPLE_BRANCH_ANY)) {
+		pr_err("Display of branch stack assembler requested, but non all-branch filter set\n"
+		       "Hint: run 'perf record -b ...'\n");
+		return -EINVAL;
+	}
 	if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) &&
 		perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID",
 					PERF_OUTPUT_TID|PERF_OUTPUT_PID))
@@ -546,6 +556,233 @@ static void print_sample_brstacksym(struct perf_sample *sample,
 	}
 }
 
+#define MAXBB 16384UL
+
+static int grab_bb(u8 *buffer, u64 start, u64 end,
+		    struct machine *machine, struct thread *thread,
+		    bool *is64bit, u8 *cpumode, bool last)
+{
+	long offset, len;
+	struct addr_location al;
+	bool kernel;
+
+	if (!start || !end)
+		return 0;
+
+	kernel = machine__kernel_ip(machine, start);
+	if (kernel)
+		*cpumode = PERF_RECORD_MISC_KERNEL;
+	else
+		*cpumode = PERF_RECORD_MISC_USER;
+
+	/*
+	 * Block overlaps between kernel and user.
+	 * This can happen due to ring filtering
+	 * On Intel CPUs the entry into the kernel is filtered,
+	 * but the exit is not. Let the caller patch it up.
+	 */
+	if (kernel != machine__kernel_ip(machine, end)) {
+		printf("\tblock %" PRIx64 "-%" PRIx64 " transfers between kernel and user\n",
+				start, end);
+		return -ENXIO;
+	}
+
+	memset(&al, 0, sizeof(al));
+	if (end - start > MAXBB - MAXINSN) {
+		if (last)
+			printf("\tbrstack does not reach to final jump (%" PRIx64 "-%" PRIx64 ")\n", start, end);
+		else
+			printf("\tblock %" PRIx64 "-%" PRIx64 " (%" PRIu64 ") too long to dump\n", start, end, end - start);
+		return 0;
+	}
+
+	thread__find_addr_map(thread, *cpumode, MAP__FUNCTION, start, &al);
+	if (!al.map || !al.map->dso) {
+		printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end);
+		return 0;
+	}
+	if (al.map->dso->data.status == DSO_DATA_STATUS_ERROR) {
+		printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end);
+		return 0;
+	}
+
+	/* Load maps to ensure dso->is_64_bit has been updated */
+	map__load(al.map);
+
+	offset = al.map->map_ip(al.map, start);
+	len = dso__data_read_offset(al.map->dso, machine, offset, (u8 *)buffer,
+				    end - start + MAXINSN);
+
+	*is64bit = al.map->dso->is_64_bit;
+	if (len <= 0)
+		printf("\tcannot fetch code for block at %" PRIx64 "-%" PRIx64 "\n",
+			start, end);
+	return len;
+}
+
+static void print_jump(uint64_t ip, struct branch_entry *en,
+		       struct perf_insn *x, u8 *inbuf, int len,
+		       int insn)
+{
+	printf("\t%016" PRIx64 "\t%-30s\t#%s%s%s%s",
+	       ip,
+	       dump_insn(x, ip, inbuf, len, NULL),
+	       en->flags.predicted ? " PRED" : "",
+	       en->flags.mispred ? " MISPRED" : "",
+	       en->flags.in_tx ? " INTX" : "",
+	       en->flags.abort ? " ABORT" : "");
+	if (en->flags.cycles) {
+		printf(" %d cycles", en->flags.cycles);
+		if (insn)
+			printf(" %.2f IPC", (float)insn / en->flags.cycles);
+	}
+	putchar('\n');
+}
+
+static void print_ip_sym(struct thread *thread, u8 cpumode, int cpu,
+			 uint64_t addr, struct symbol **lastsym,
+			 struct perf_event_attr *attr)
+{
+	struct addr_location al;
+	int off;
+
+	memset(&al, 0, sizeof(al));
+
+	thread__find_addr_map(thread, cpumode, MAP__FUNCTION, addr, &al);
+	if (!al.map)
+		thread__find_addr_map(thread, cpumode, MAP__VARIABLE,
+				      addr, &al);
+	if ((*lastsym) && al.addr >= (*lastsym)->start && al.addr < (*lastsym)->end)
+		return;
+
+	al.cpu = cpu;
+	al.sym = NULL;
+	if (al.map)
+		al.sym = map__find_symbol(al.map, al.addr);
+
+	if (!al.sym)
+		return;
+
+	if (al.addr < al.sym->end)
+		off = al.addr - al.sym->start;
+	else
+		off = al.addr - al.map->start - al.sym->start;
+	printf("\t%s", al.sym->name);
+	if (off)
+		printf("%+d", off);
+	putchar(':');
+	if (PRINT_FIELD(SRCLINE))
+		map__fprintf_srcline(al.map, al.addr, "\t", stdout);
+	putchar('\n');
+	*lastsym = al.sym;
+}
+
+static void print_sample_brstackinsn(struct perf_sample *sample,
+				     struct thread *thread,
+				     struct perf_event_attr *attr,
+				     struct machine *machine)
+{
+	struct branch_stack *br = sample->branch_stack;
+	u64 start, end;
+	int i, insn, len, nr, ilen;
+	struct perf_insn x;
+	u8 buffer[MAXBB];
+	unsigned off;
+	struct symbol *lastsym = NULL;
+
+	if (!(br && br->nr))
+		return;
+	nr = br->nr;
+	if (max_blocks && nr > max_blocks + 1)
+		nr = max_blocks + 1;
+
+	x.thread = thread;
+	x.cpu = sample->cpu;
+
+	putchar('\n');
+
+	/* Handle first from jump, of which we don't know the entry. */
+	len = grab_bb(buffer, br->entries[nr-1].from,
+			br->entries[nr-1].from,
+			machine, thread, &x.is64bit, &x.cpumode, false);
+	if (len > 0) {
+		print_ip_sym(thread, x.cpumode, x.cpu,
+			     br->entries[nr - 1].from, &lastsym, attr);
+		print_jump(br->entries[nr - 1].from, &br->entries[nr - 1],
+			    &x, buffer, len, 0);
+	}
+
+	/* Print all blocks */
+	for (i = nr - 2; i >= 0; i--) {
+		if (br->entries[i].from || br->entries[i].to)
+			pr_debug("%d: %" PRIx64 "-%" PRIx64 "\n", i,
+				 br->entries[i].from,
+				 br->entries[i].to);
+		start = br->entries[i + 1].to;
+		end   = br->entries[i].from;
+
+		len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, false);
+		/* Patch up missing kernel transfers due to ring filters */
+		if (len == -ENXIO && i > 0) {
+			end = br->entries[--i].from;
+			pr_debug("\tpatching up to %" PRIx64 "-%" PRIx64 "\n", start, end);
+			len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, false);
+		}
+		if (len <= 0)
+			continue;
+
+		insn = 0;
+		for (off = 0;; off += ilen) {
+			uint64_t ip = start + off;
+
+			print_ip_sym(thread, x.cpumode, x.cpu, ip, &lastsym, attr);
+			if (ip == end) {
+				print_jump(ip, &br->entries[i], &x, buffer + off, len - off, insn);
+				break;
+			} else {
+				printf("\t%016" PRIx64 "\t%s\n", ip,
+					dump_insn(&x, ip, buffer + off, len - off, &ilen));
+				if (ilen == 0)
+					break;
+				insn++;
+			}
+		}
+	}
+
+	/*
+	 * Hit the branch? In this case we are already done, and the target
+	 * has not been executed yet.
+	 */
+	if (br->entries[0].from == sample->ip)
+		return;
+	if (br->entries[0].flags.abort)
+		return;
+
+	/*
+	 * Print final block upto sample
+	 */
+	start = br->entries[0].to;
+	end = sample->ip;
+	len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, true);
+	print_ip_sym(thread, x.cpumode, x.cpu, start, &lastsym, attr);
+	if (len <= 0) {
+		/* Print at least last IP if basic block did not work */
+		len = grab_bb(buffer, sample->ip, sample->ip,
+			      machine, thread, &x.is64bit, &x.cpumode, false);
+		if (len <= 0)
+			return;
+
+		printf("\t%016" PRIx64 "\t%s\n", sample->ip,
+			dump_insn(&x, sample->ip, buffer, len, NULL));
+		return;
+	}
+	for (off = 0; off <= end - start; off += ilen) {
+		printf("\t%016" PRIx64 "\t%s\n", start + off,
+			dump_insn(&x, start + off, buffer + off, len - off, &ilen));
+		if (ilen == 0)
+			break;
+	}
+}
 
 static void print_sample_addr(struct perf_sample *sample,
 			  struct thread *thread,
@@ -632,7 +869,9 @@ static void print_sample_callindent(struct perf_sample *sample,
 }
 
 static void print_insn(struct perf_sample *sample,
-		       struct perf_event_attr *attr)
+		       struct perf_event_attr *attr,
+		       struct thread *thread,
+		       struct machine *machine)
 {
 	if (PRINT_FIELD(INSNLEN))
 		printf(" ilen: %d", sample->insn_len);
@@ -643,12 +882,15 @@ static void print_insn(struct perf_sample *sample,
 		for (i = 0; i < sample->insn_len; i++)
 			printf(" %02x", (unsigned char)sample->insn[i]);
 	}
+	if (PRINT_FIELD(BRSTACKINSN))
+		print_sample_brstackinsn(sample, thread, attr, machine);
 }
 
 static void print_sample_bts(struct perf_sample *sample,
 			     struct perf_evsel *evsel,
 			     struct thread *thread,
-			     struct addr_location *al)
+			     struct addr_location *al,
+			     struct machine *machine)
 {
 	struct perf_event_attr *attr = &evsel->attr;
 	bool print_srcline_last = false;
@@ -689,7 +931,7 @@ static void print_sample_bts(struct perf_sample *sample,
 	if (print_srcline_last)
 		map__fprintf_srcline(al->map, al->addr, "\n  ", stdout);
 
-	print_insn(sample, attr);
+	print_insn(sample, attr, thread, machine);
 
 	printf("\n");
 }
@@ -872,7 +1114,8 @@ static size_t data_src__printf(u64 data_src)
 
 static void process_event(struct perf_script *script,
 			  struct perf_sample *sample, struct perf_evsel *evsel,
-			  struct addr_location *al)
+			  struct addr_location *al,
+			  struct machine *machine)
 {
 	struct thread *thread = al->thread;
 	struct perf_event_attr *attr = &evsel->attr;
@@ -899,7 +1142,7 @@ static void process_event(struct perf_script *script,
 		print_sample_flags(sample->flags);
 
 	if (is_bts_event(attr)) {
-		print_sample_bts(sample, evsel, thread, al);
+		print_sample_bts(sample, evsel, thread, al, machine);
 		return;
 	}
 
@@ -937,7 +1180,7 @@ static void process_event(struct perf_script *script,
 
 	if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
 		print_sample_bpf_output(sample);
-	print_insn(sample, attr);
+	print_insn(sample, attr, thread, machine);
 	printf("\n");
 }
 
@@ -1047,7 +1290,7 @@ static int process_sample_event(struct perf_tool *tool,
 	if (scripting_ops)
 		scripting_ops->process_event(event, sample, evsel, &al);
 	else
-		process_event(scr, sample, evsel, &al);
+		process_event(scr, sample, evsel, &al, machine);
 
 out_put:
 	addr_location__put(&al);
@@ -2191,7 +2434,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 		     "Valid types: hw,sw,trace,raw. "
 		     "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
 		     "addr,symoff,period,iregs,brstack,brstacksym,flags,"
-		     "bpf-output,callindent,insn,insnlen", parse_output_fields),
+		     "bpf-output,callindent,insn,insnlen,brstackinsn",
+		     parse_output_fields),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 		    "system-wide collection from all CPUs"),
 	OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
@@ -2222,6 +2466,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_BOOLEAN('\0', "show-namespace-events", &script.show_namespace_events,
 		    "Show namespace events (if recorded)"),
 	OPT_BOOLEAN('f', "force", &symbol_conf.force, "don't complain, do it"),
+	OPT_INTEGER(0, "max-blocks", &max_blocks,
+		    "Maximum number of code blocks to dump with brstackinsn"),
 	OPT_BOOLEAN(0, "ns", &nanosecs,
 		    "Use 9 decimal places when displaying time"),
 	OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts",
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 2ea5ee1..fb4f42f 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -82,6 +82,7 @@ libperf-$(CONFIG_AUXTRACE) += intel-pt-decoder/
 libperf-$(CONFIG_AUXTRACE) += intel-pt.o
 libperf-$(CONFIG_AUXTRACE) += intel-bts.o
 libperf-y += parse-branch-options.o
+libperf-y += dump-insn.o
 libperf-y += parse-regs-options.o
 libperf-y += term.o
 libperf-y += help-unknown-cmd.o
diff --git a/tools/perf/util/dump-insn.c b/tools/perf/util/dump-insn.c
new file mode 100644
index 0000000..ffbdb19
--- /dev/null
+++ b/tools/perf/util/dump-insn.c
@@ -0,0 +1,14 @@
+#include <linux/compiler.h>
+#include "dump-insn.h"
+
+/* Fallback code */
+
+__weak
+const char *dump_insn(struct perf_insn *x __maybe_unused,
+		      u64 ip __maybe_unused, u8 *inbuf __maybe_unused,
+		      int inlen __maybe_unused, int *lenp)
+{
+	if (lenp)
+		*lenp = 0;
+	return "?";
+}
diff --git a/tools/perf/util/dump-insn.h b/tools/perf/util/dump-insn.h
new file mode 100644
index 0000000..90fb115
--- /dev/null
+++ b/tools/perf/util/dump-insn.h
@@ -0,0 +1,22 @@
+#ifndef __PERF_DUMP_INSN_H
+#define __PERF_DUMP_INSN_H 1
+
+#define MAXINSN 15
+
+#include <linux/types.h>
+
+struct thread;
+
+struct perf_insn {
+	/* Initialized by callers: */
+	struct thread *thread;
+	u8	      cpumode;
+	bool	      is64bit;
+	int	      cpu;
+	/* Temporary */
+	char	      out[256];
+};
+
+const char *dump_insn(struct perf_insn *x, u64 ip,
+		      u8 *inbuf, int inlen, int *lenp);
+#endif
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c
index 55b6250..a5f35b2 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c
@@ -26,6 +26,7 @@
 #include "insn.c"
 
 #include "intel-pt-insn-decoder.h"
+#include "dump-insn.h"
 
 #if INTEL_PT_INSN_BUF_SZ < MAX_INSN_SIZE || INTEL_PT_INSN_BUF_SZ > MAX_INSN
 #error Instruction buffer size too small
@@ -179,6 +180,29 @@ int intel_pt_get_insn(const unsigned char *buf, size_t len, int x86_64,
 	return 0;
 }
 
+const char *dump_insn(struct perf_insn *x, uint64_t ip __maybe_unused,
+		      u8 *inbuf, int inlen, int *lenp)
+{
+	struct insn insn;
+	int n, i;
+	int left;
+
+	insn_init(&insn, inbuf, inlen, x->is64bit);
+	insn_get_length(&insn);
+	if (!insn_complete(&insn) || insn.length > inlen)
+		return "<bad>";
+	if (lenp)
+		*lenp = insn.length;
+	left = sizeof(x->out);
+	n = snprintf(x->out, left, "insn: ");
+	left -= n;
+	for (i = 0; i < insn.length; i++) {
+		n += snprintf(x->out + n, left, "%02x ", inbuf[i]);
+		left -= n;
+	}
+	return x->out;
+}
+
 const char *branch_name[] = {
 	[INTEL_PT_OP_OTHER]	= "Other",
 	[INTEL_PT_OP_CALL]	= "Call",

^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2017-03-16 16:36 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-02-23 23:46 [PATCH] perf, tools, script: Add brstackinsn for branch stacks Andi Kleen
2017-03-16  2:16 ` Michael Ellerman
2017-03-16  2:22   ` Andi Kleen
2017-03-16 12:25     ` Arnaldo Carvalho de Melo
2017-03-16 16:36 ` [tip:perf/core] perf script: Add 'brstackinsn' " tip-bot for Andi Kleen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.