Re: [PATCH 7/8] perf stat: Basic support for TopDown in perf stat

From: Arnaldo Carvalho de Melo <acme@kernel.org>
To: Andi Kleen <andi@firstfloor.org>
Cc: peterz@infradead.org, jolsa@kernel.org,
	linux-kernel@vger.kernel.org, mingo@kernel.org,
	Andi Kleen <ak@linux.intel.com>
Subject: Re: [PATCH 7/8] perf stat: Basic support for TopDown in perf stat
Date: Mon, 23 May 2016 11:40:07 -0300	[thread overview]
Message-ID: <20160523144007.GJ8897@kernel.org> (raw)
In-Reply-To: <1463703002-19686-8-git-send-email-andi@firstfloor.org>

Em Thu, May 19, 2016 at 05:10:01PM -0700, Andi Kleen escreveu:
> From: Andi Kleen <ak@linux.intel.com>
> 
> Add basic plumbing for TopDown in perf stat
> 
> Add a new --topdown options to enable events.
> When --topdown is specified set up events for all topdown
> events supported by the kernel.
> Add topdown-* as a special case to the event parser, as is
> needed for all events containing -.
> 
> The actual code to compute the metrics is in follow-on patches.
> 
> v2: Use standard sysctl read function.
> v3: Move x86 specific code to arch/
> v4: Enable --metric-only implicitly for topdown.
> v5: Add --single-thread option to not force per core mode
> v6: Fix output order of topdown metrics
> v7: Allow combining with -d
> v8: Remove --single-thread again
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> ---
>  tools/perf/Documentation/perf-stat.txt |  16 +++++
>  tools/perf/arch/x86/util/Build         |   1 +
>  tools/perf/arch/x86/util/group.c       |  27 ++++++++
>  tools/perf/builtin-stat.c              | 114 ++++++++++++++++++++++++++++++++-
>  tools/perf/util/group.h                |   7 ++
>  tools/perf/util/parse-events.l         |   1 +
>  6 files changed, 163 insertions(+), 3 deletions(-)
>  create mode 100644 tools/perf/arch/x86/util/group.c
>  create mode 100644 tools/perf/util/group.h
> 
> diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
> index 04f23b404bbc..3aaa2916f604 100644
> --- a/tools/perf/Documentation/perf-stat.txt
> +++ b/tools/perf/Documentation/perf-stat.txt
> @@ -204,6 +204,22 @@ Aggregate counts per physical processor for system-wide mode measurements.
>  --no-aggr::
>  Do not aggregate counts across all monitored CPUs.
>  
> +--topdown::
> +Print top down level 1 metrics if supported by the CPU. This allows to
> +determine bottle necks in the CPU pipeline for CPU bound workloads,
> +by breaking it down into frontend bound, backend bound, bad speculation
> +and retiring. Metrics are only printed when they cross a threshold.
> +
> +The top down metrics may be collected per core instead of per
> +CPU thread. In this case per core mode is automatically enabled
> +and -a (global monitoring) is needed, requiring root rights or
> +perf.perf_event_paranoid=-1.
> +
> +This enables --metric-only, unless overriden with --no-metric-only.
> +
> +To interpret the results it is usually needed to know on which
> +CPUs the workload runs on. If needed the CPUs can be forced using
> +taskset.
>  
>  EXAMPLES
>  --------
> diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build
> index 465970370f3e..4cd8a16b1b7b 100644
> --- a/tools/perf/arch/x86/util/Build
> +++ b/tools/perf/arch/x86/util/Build
> @@ -3,6 +3,7 @@ libperf-y += tsc.o
>  libperf-y += pmu.o
>  libperf-y += kvm-stat.o
>  libperf-y += perf_regs.o
> +libperf-y += group.o
>  
>  libperf-$(CONFIG_DWARF) += dwarf-regs.o
>  libperf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o
> diff --git a/tools/perf/arch/x86/util/group.c b/tools/perf/arch/x86/util/group.c
> new file mode 100644
> index 000000000000..f3039b5ce8b1
> --- /dev/null
> +++ b/tools/perf/arch/x86/util/group.c
> @@ -0,0 +1,27 @@
> +#include <stdio.h>
> +#include "api/fs/fs.h"
> +#include "util/group.h"
> +
> +/*
> + * Check whether we can use a group for top down.
> + * Without a group may get bad results due to multiplexing.
> + */
> +bool check_group(bool *warn)

Please rename this: "check_group" is way too generic, and functions that
arch code may override, of which there are plenty in the tree, usually
come prefixed by "arch_" so that we know at a glance that they may be
overridden by arch code.

> +{
> +	int n;
> +
> +	if (sysctl__read_int("kernel/nmi_watchdog", &n) < 0)
> +		return false;
> +	if (n > 0) {
> +		*warn = true;
> +		return false;
> +	}
> +	return true;
> +}
> +
> +void group_warn(void)
> +{
> +	fprintf(stderr,
> +		"nmi_watchdog enabled with topdown. May give wrong results.\n"
> +		"Disable with echo 0 > /proc/sys/kernel/nmi_watchdog\n");
> +}
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index db84bfc0a478..7c5c50b61b28 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -59,10 +59,13 @@
>  #include "util/thread.h"
>  #include "util/thread_map.h"
>  #include "util/counts.h"
> +#include "util/group.h"
>  #include "util/session.h"
>  #include "util/tool.h"
> +#include "util/group.h"
>  #include "asm/bug.h"
>  
> +#include <api/fs/fs.h>
>  #include <stdlib.h>
>  #include <sys/prctl.h>
>  #include <locale.h>
> @@ -98,6 +101,15 @@ static const char * transaction_limited_attrs = {
>  	"}"
>  };
>  
> +static const char * topdown_attrs[] = {
> +	"topdown-total-slots",
> +	"topdown-slots-retired",
> +	"topdown-recovery-bubbles",
> +	"topdown-fetch-bubbles",
> +	"topdown-slots-issued",
> +	NULL,
> +};
> +
>  static struct perf_evlist	*evsel_list;
>  
>  static struct target target = {
> @@ -112,6 +124,7 @@ static volatile pid_t		child_pid			= -1;
>  static bool			null_run			=  false;
>  static int			detailed_run			=  0;
>  static bool			transaction_run;
> +static bool			topdown_run			= false;
>  static bool			big_num				=  true;
>  static int			big_num_opt			=  -1;
>  static const char		*csv_sep			= NULL;
> @@ -124,6 +137,7 @@ static unsigned int		initial_delay			= 0;
>  static unsigned int		unit_width			= 4; /* strlen("unit") */
>  static bool			forever				= false;
>  static bool			metric_only			= false;
> +static bool			force_metric_only		= false;
>  static struct timespec		ref_time;
>  static struct cpu_map		*aggr_map;
>  static aggr_get_id_t		aggr_get_id;
> @@ -1515,6 +1529,14 @@ static int stat__set_big_num(const struct option *opt __maybe_unused,
>  	return 0;
>  }
>  
> +static int enable_metric_only(const struct option *opt __maybe_unused,
> +			      const char *s __maybe_unused, int unset)
> +{
> +	force_metric_only = true;
> +	metric_only = !unset;
> +	return 0;
> +}
> +
>  static const struct option stat_options[] = {
>  	OPT_BOOLEAN('T', "transaction", &transaction_run,
>  		    "hardware transaction statistics"),
> @@ -1573,8 +1595,10 @@ static const struct option stat_options[] = {
>  		     "aggregate counts per thread", AGGR_THREAD),
>  	OPT_UINTEGER('D', "delay", &initial_delay,
>  		     "ms to wait before starting measurement after program start"),
> -	OPT_BOOLEAN(0, "metric-only", &metric_only,
> -			"Only print computed metrics. No raw values"),
> +	OPT_CALLBACK_NOOPT(0, "metric-only", &metric_only, NULL,
> +			"Only print computed metrics. No raw values", enable_metric_only),
> +	OPT_BOOLEAN(0, "topdown", &topdown_run,
> +			"measure topdown level 1 statistics"),
>  	OPT_END()
>  };
>  
> @@ -1767,12 +1791,61 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st)
>  	return 0;
>  }
>  
> +static void filter_events(const char **attr, char **str, bool use_group)

Is this really a generic function, or is it something topdown specific? If
the latter, please prefix it with "topdown_".

> +{
> +	int off = 0;
> +	int i;
> +	int len = 0;
> +	char *s;
> +
> +	for (i = 0; attr[i]; i++) {
> +		if (pmu_have_event("cpu", attr[i])) {
> +			len += strlen(attr[i]) + 1;
> +			attr[i - off] = attr[i];
> +		} else
> +			off++;
> +	}
> +	attr[i - off] = NULL;
> +
> +	*str = malloc(len + 1 + 2);
> +	if (!*str)
> +		return;
> +	s = *str;
> +	if (i - off == 0) {
> +		*s = 0;
> +		return;
> +	}
> +	if (use_group)
> +		*s++ = '{';
> +	for (i = 0; attr[i]; i++) {
> +		strcpy(s, attr[i]);
> +		s += strlen(s);
> +		*s++ = ',';
> +	}
> +	if (use_group) {
> +		s[-1] = '}';
> +		*s = 0;
> +	} else
> +		s[-1] = 0;
> +}
> +
> +__weak bool check_group(bool *warn)
> +{
> +	*warn = false;
> +	return false;
> +}
> +
> +__weak void group_warn(void)
> +{
> +}
> +
>  /*
>   * Add default attributes, if there were no attributes specified or
>   * if -d/--detailed, -d -d or -d -d -d is used:
>   */
>  static int add_default_attributes(void)
>  {
> +	int err;
>  	struct perf_event_attr default_attrs0[] = {
>  
>    { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK		},
> @@ -1891,7 +1964,6 @@ static int add_default_attributes(void)
>  		return 0;
>  
>  	if (transaction_run) {
> -		int err;
>  		if (pmu_have_event("cpu", "cycles-ct") &&
>  		    pmu_have_event("cpu", "el-start"))
>  			err = parse_events(evsel_list, transaction_attrs, NULL);
> @@ -1904,6 +1976,42 @@ static int add_default_attributes(void)
>  		return 0;
>  	}
>  
> +	if (topdown_run) {
> +		char *str = NULL;
> +		bool warn = false;
> +
> +		if (stat_config.aggr_mode != AGGR_GLOBAL &&
> +		    stat_config.aggr_mode != AGGR_CORE) {
> +			pr_err("top down event configuration requires --per-core mode\n");
> +			return -1;
> +		}
> +		stat_config.aggr_mode = AGGR_CORE;
> +		if (nr_cgroups || !target__has_cpu(&target)) {
> +			pr_err("top down event configuration requires system-wide mode (-a)\n");
> +			return -1;
> +		}
> +
> +		if (!force_metric_only)
> +			metric_only = true;
> +		filter_events(topdown_attrs, &str, check_group(&warn));
> +		if (topdown_attrs[0] && str) {
> +			if (warn)
> +				group_warn();
> +			err = parse_events(evsel_list, str, NULL);
> +			if (err) {
> +				fprintf(stderr,
> +					"Cannot set up top down events %s: %d\n",
> +					str, err);
> +				free(str);
> +				return -1;
> +			}
> +		} else {
> +			fprintf(stderr, "System does not support topdown\n");
> +			return -1;
> +		}
> +		free(str);
> +	}
> +
>  	if (!evsel_list->nr_entries) {
>  		if (perf_evlist__add_default_attrs(evsel_list, default_attrs0) < 0)
>  			return -1;
> diff --git a/tools/perf/util/group.h b/tools/perf/util/group.h
> new file mode 100644
> index 000000000000..daad3ffdc68d
> --- /dev/null
> +++ b/tools/perf/util/group.h
> @@ -0,0 +1,7 @@
> +#ifndef GROUP_H
> +#define GROUP_H 1
> +
> +bool check_group(bool *warn);
> +void group_warn(void);
> +
> +#endif
> diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
> index 1477fbc78993..744ebe3fa30f 100644
> --- a/tools/perf/util/parse-events.l
> +++ b/tools/perf/util/parse-events.l
> @@ -259,6 +259,7 @@ cycles-ct					{ return str(yyscanner, PE_KERNEL_PMU_EVENT); }
>  cycles-t					{ return str(yyscanner, PE_KERNEL_PMU_EVENT); }
>  mem-loads					{ return str(yyscanner, PE_KERNEL_PMU_EVENT); }
>  mem-stores					{ return str(yyscanner, PE_KERNEL_PMU_EVENT); }
> +topdown-[a-z-]+					{ return str(yyscanner, PE_KERNEL_PMU_EVENT); }
>  
>  L1-dcache|l1-d|l1d|L1-data		|
>  L1-icache|l1-i|l1i|L1-instruction	|
> -- 
> 2.5.5