linux-perf-users.vger.kernel.org archive mirror
* [PATCH] perf stat: Support per-cluster aggregation
@ 2023-03-13  8:59 Yicong Yang
  2023-03-23 13:03 ` Yicong Yang
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Yicong Yang @ 2023-03-13  8:59 UTC (permalink / raw)
  To: acme, mark.rutland, peterz, mingo, james.clark,
	alexander.shishkin, linux-perf-users, linux-kernel
  Cc: Jonathan.Cameron, 21cnbao, tim.c.chen, prime.zeng, shenyang39,
	linuxarm, yangyicong

From: Yicong Yang <yangyicong@hisilicon.com>

Some platforms have 'cluster' topology and CPUs in the cluster will
share resources like L3 Cache Tag (for HiSilicon Kunpeng SoC) or L2
cache (for Intel Jacobsville). Parsing and building cluster
topology have been supported since [1].

perf stat has already supported aggregation for other topologies like
die or socket, etc. It'll be useful to aggregate per-cluster to find
problems like L3T bandwidth contention or imbalance.

This patch adds support for "--per-cluster" option for per-cluster
aggregation. Also update the docs and related test. The output will
be like:

[root@localhost tmp]# perf stat -a -e LLC-load --per-cluster -- sleep 5

 Performance counter stats for 'system wide':

S56-D0-CLS158    4      1,321,521,570      LLC-load
S56-D0-CLS594    4        794,211,453      LLC-load
S56-D0-CLS1030    4             41,623      LLC-load
S56-D0-CLS1466    4             41,646      LLC-load
S56-D0-CLS1902    4             16,863      LLC-load
S56-D0-CLS2338    4             15,721      LLC-load
S56-D0-CLS2774    4             22,671      LLC-load
[...]
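
The cluster ID is read from the CPU topology sysfs. A quick way to see
which CPUs a given cluster spans (paths per the sysfs ABI; the values
below are just illustrative for this platform):

[root@localhost tmp]# cat /sys/devices/system/cpu/cpu0/topology/cluster_id
158
[root@localhost tmp]# cat /sys/devices/system/cpu/cpu0/topology/cluster_cpus_list
0-3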

[1] commit c5e22feffdd7 ("topology: Represent clusters of CPUs within a die")

Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
---
 tools/perf/Documentation/perf-stat.txt        | 10 ++++
 tools/perf/builtin-stat.c                     | 52 +++++++++++++++++--
 .../tests/shell/lib/perf_json_output_lint.py  |  4 +-
 tools/perf/tests/shell/stat+csv_output.sh     | 14 +++++
 tools/perf/tests/shell/stat+json_output.sh    | 13 +++++
 tools/perf/util/cpumap.c                      | 28 +++++++++-
 tools/perf/util/cpumap.h                      | 19 +++++--
 tools/perf/util/env.h                         |  1 +
 tools/perf/util/stat-display.c                | 13 +++++
 tools/perf/util/stat.h                        |  1 +
 10 files changed, 146 insertions(+), 9 deletions(-)

diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 18abdc1dce05..688433c3756c 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -308,6 +308,13 @@ use --per-die in addition to -a. (system-wide).  The output includes the
 die number and the number of online processors on that die. This is
 useful to gauge the amount of aggregation.
 
+--per-cluster::
+Aggregate counts per processor cluster for system-wide mode measurements.  This
+is a useful mode to detect imbalance between clusters.  To enable this mode,
+use --per-cluster in addition to -a. (system-wide).  The output includes the
+cluster number and the number of online processors on that cluster. This is
+useful to gauge the amount of aggregation.
+
 --per-core::
 Aggregate counts per physical processor for system-wide mode measurements.  This
 is a useful mode to detect imbalance between physical cores.  To enable this mode,
@@ -379,6 +386,9 @@ Aggregate counts per processor socket for system-wide mode measurements.
 --per-die::
 Aggregate counts per processor die for system-wide mode measurements.
 
+--per-cluster::
+Aggregate counts per processor cluster for system-wide mode measurements.
+
 --per-core::
 Aggregate counts per physical processor for system-wide mode measurements.
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index fa7c40956d0f..e5630cb28985 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1237,6 +1237,8 @@ static struct option stat_options[] = {
 		     "aggregate counts per processor socket", AGGR_SOCKET),
 	OPT_SET_UINT(0, "per-die", &stat_config.aggr_mode,
 		     "aggregate counts per processor die", AGGR_DIE),
+	OPT_SET_UINT(0, "per-cluster", &stat_config.aggr_mode,
+		     "aggregate counts per processor cluster", AGGR_CLUSTER),
 	OPT_SET_UINT(0, "per-core", &stat_config.aggr_mode,
 		     "aggregate counts per physical processor core", AGGR_CORE),
 	OPT_SET_UINT(0, "per-thread", &stat_config.aggr_mode,
@@ -1298,6 +1300,7 @@ static struct option stat_options[] = {
 
 static const char *const aggr_mode__string[] = {
 	[AGGR_CORE] = "core",
+	[AGGR_CLUSTER] = "cluster",
 	[AGGR_DIE] = "die",
 	[AGGR_GLOBAL] = "global",
 	[AGGR_NODE] = "node",
@@ -1319,6 +1322,12 @@ static struct aggr_cpu_id perf_stat__get_die(struct perf_stat_config *config __m
 	return aggr_cpu_id__die(cpu, /*data=*/NULL);
 }
 
+static struct aggr_cpu_id perf_stat__get_cluster(struct perf_stat_config *config __maybe_unused,
+						 struct perf_cpu cpu)
+{
+	return aggr_cpu_id__cluster(cpu, /*data=*/NULL);
+}
+
 static struct aggr_cpu_id perf_stat__get_core(struct perf_stat_config *config __maybe_unused,
 					      struct perf_cpu cpu)
 {
@@ -1371,6 +1380,12 @@ static struct aggr_cpu_id perf_stat__get_die_cached(struct perf_stat_config *con
 	return perf_stat__get_aggr(config, perf_stat__get_die, cpu);
 }
 
+static struct aggr_cpu_id perf_stat__get_cluster_cached(struct perf_stat_config *config,
+							struct perf_cpu cpu)
+{
+	return perf_stat__get_aggr(config, perf_stat__get_cluster, cpu);
+}
+
 static struct aggr_cpu_id perf_stat__get_core_cached(struct perf_stat_config *config,
 						     struct perf_cpu cpu)
 {
@@ -1402,6 +1417,8 @@ static aggr_cpu_id_get_t aggr_mode__get_aggr(enum aggr_mode aggr_mode)
 		return aggr_cpu_id__socket;
 	case AGGR_DIE:
 		return aggr_cpu_id__die;
+	case AGGR_CLUSTER:
+		return aggr_cpu_id__cluster;
 	case AGGR_CORE:
 		return aggr_cpu_id__core;
 	case AGGR_NODE:
@@ -1425,6 +1442,8 @@ static aggr_get_id_t aggr_mode__get_id(enum aggr_mode aggr_mode)
 		return perf_stat__get_socket_cached;
 	case AGGR_DIE:
 		return perf_stat__get_die_cached;
+	case AGGR_CLUSTER:
+		return perf_stat__get_cluster_cached;
 	case AGGR_CORE:
 		return perf_stat__get_core_cached;
 	case AGGR_NODE:
@@ -1537,6 +1556,21 @@ static struct aggr_cpu_id perf_env__get_die_aggr_by_cpu(struct perf_cpu cpu, voi
 	return id;
 }
 
+static struct aggr_cpu_id perf_env__get_cluster_aggr_by_cpu(struct perf_cpu cpu,
+							    void *data)
+{
+	struct perf_env *env = data;
+	struct aggr_cpu_id id = aggr_cpu_id__empty();
+
+	if (cpu.cpu != -1) {
+		id.socket = env->cpu[cpu.cpu].socket_id;
+		id.die = env->cpu[cpu.cpu].die_id;
+		id.cluster = env->cpu[cpu.cpu].cluster_id;
+	}
+
+	return id;
+}
+
 static struct aggr_cpu_id perf_env__get_core_aggr_by_cpu(struct perf_cpu cpu, void *data)
 {
 	struct perf_env *env = data;
@@ -1544,12 +1578,12 @@ static struct aggr_cpu_id perf_env__get_core_aggr_by_cpu(struct perf_cpu cpu, vo
 
 	if (cpu.cpu != -1) {
 		/*
-		 * core_id is relative to socket and die,
-		 * we need a global id. So we set
-		 * socket, die id and core id
+		 * core_id is relative to socket, die and cluster, we need a
+		 * global id. So we set socket, die id, cluster id and core id.
 		 */
 		id.socket = env->cpu[cpu.cpu].socket_id;
 		id.die = env->cpu[cpu.cpu].die_id;
+		id.cluster = env->cpu[cpu.cpu].cluster_id;
 		id.core = env->cpu[cpu.cpu].core_id;
 	}
 
@@ -1605,6 +1639,12 @@ static struct aggr_cpu_id perf_stat__get_die_file(struct perf_stat_config *confi
 	return perf_env__get_die_aggr_by_cpu(cpu, &perf_stat.session->header.env);
 }
 
+static struct aggr_cpu_id perf_stat__get_cluster_file(struct perf_stat_config *config __maybe_unused,
+						      struct perf_cpu cpu)
+{
+	return perf_env__get_cluster_aggr_by_cpu(cpu, &perf_stat.session->header.env);
+}
+
 static struct aggr_cpu_id perf_stat__get_core_file(struct perf_stat_config *config __maybe_unused,
 						   struct perf_cpu cpu)
 {
@@ -1636,6 +1676,8 @@ static aggr_cpu_id_get_t aggr_mode__get_aggr_file(enum aggr_mode aggr_mode)
 		return perf_env__get_socket_aggr_by_cpu;
 	case AGGR_DIE:
 		return perf_env__get_die_aggr_by_cpu;
+	case AGGR_CLUSTER:
+		return perf_env__get_cluster_aggr_by_cpu;
 	case AGGR_CORE:
 		return perf_env__get_core_aggr_by_cpu;
 	case AGGR_NODE:
@@ -1659,6 +1701,8 @@ static aggr_get_id_t aggr_mode__get_id_file(enum aggr_mode aggr_mode)
 		return perf_stat__get_socket_file;
 	case AGGR_DIE:
 		return perf_stat__get_die_file;
+	case AGGR_CLUSTER:
+		return perf_stat__get_cluster_file;
 	case AGGR_CORE:
 		return perf_stat__get_core_file;
 	case AGGR_NODE:
@@ -2219,6 +2263,8 @@ static int __cmd_report(int argc, const char **argv)
 		     "aggregate counts per processor socket", AGGR_SOCKET),
 	OPT_SET_UINT(0, "per-die", &perf_stat.aggr_mode,
 		     "aggregate counts per processor die", AGGR_DIE),
+	OPT_SET_UINT(0, "per-cluster", &perf_stat.aggr_mode,
+		     "aggregate counts per processor cluster", AGGR_CLUSTER),
 	OPT_SET_UINT(0, "per-core", &perf_stat.aggr_mode,
 		     "aggregate counts per physical processor core", AGGR_CORE),
 	OPT_SET_UINT(0, "per-node", &perf_stat.aggr_mode,
diff --git a/tools/perf/tests/shell/lib/perf_json_output_lint.py b/tools/perf/tests/shell/lib/perf_json_output_lint.py
index 97598d14e532..1869ff9b92c1 100644
--- a/tools/perf/tests/shell/lib/perf_json_output_lint.py
+++ b/tools/perf/tests/shell/lib/perf_json_output_lint.py
@@ -14,6 +14,7 @@ ap.add_argument('--system-wide', action='store_true')
 ap.add_argument('--event', action='store_true')
 ap.add_argument('--per-core', action='store_true')
 ap.add_argument('--per-thread', action='store_true')
+ap.add_argument('--per-cluster', action='store_true')
 ap.add_argument('--per-die', action='store_true')
 ap.add_argument('--per-node', action='store_true')
 ap.add_argument('--per-socket', action='store_true')
@@ -46,6 +47,7 @@ def check_json_output(expected_items):
       'counter-value': lambda x: is_counter_value(x),
       'cgroup': lambda x: True,
       'cpu': lambda x: isint(x),
+      'cluster': lambda x: True,
       'die': lambda x: True,
       'event': lambda x: True,
       'event-runtime': lambda x: isfloat(x),
@@ -82,7 +84,7 @@ try:
     expected_items = 7
   elif args.interval or args.per_thread or args.system_wide_no_aggr:
     expected_items = 8
-  elif args.per_core or args.per_socket or args.per_node or args.per_die:
+  elif args.per_core or args.per_socket or args.per_node or args.per_die or args.per_cluster:
     expected_items = 9
   else:
     # If no option is specified, don't check the number of items.
diff --git a/tools/perf/tests/shell/stat+csv_output.sh b/tools/perf/tests/shell/stat+csv_output.sh
index 324fc9e6edd7..7311bc835280 100755
--- a/tools/perf/tests/shell/stat+csv_output.sh
+++ b/tools/perf/tests/shell/stat+csv_output.sh
@@ -26,6 +26,7 @@ function commachecker()
 	;; "--per-socket")	exp=8
 	;; "--per-node")	exp=8
 	;; "--per-die")		exp=8
+	;; "--per-cluster")	exp=8
 	esac
 
 	while read line
@@ -123,6 +124,18 @@ check_per_thread()
 	echo "[Success]"
 }
 
+check_per_cluster()
+{
+	echo -n "Checking CSV output: per cluster "
+	if ParanoidAndNotRoot 0
+	then
+		echo "[Skip] paranoid and not root"
+		return
+	fi
+	perf stat -x$csv_sep --per-cluster -a true 2>&1 | commachecker --per-cluster
+	echo "[Success]"
+}
+
 check_per_die()
 {
 	echo -n "Checking CSV output: per die "
@@ -197,6 +210,7 @@ if [ $skip_test -ne 1 ]
 then
 	check_system_wide_no_aggr
 	check_per_core
+	check_per_cluster
 	check_per_die
 	check_per_socket
 else
diff --git a/tools/perf/tests/shell/stat+json_output.sh b/tools/perf/tests/shell/stat+json_output.sh
index 2c4212c641ed..c74bfd32abcb 100755
--- a/tools/perf/tests/shell/stat+json_output.sh
+++ b/tools/perf/tests/shell/stat+json_output.sh
@@ -100,6 +100,18 @@ check_per_thread()
 	echo "[Success]"
 }
 
+check_per_cluster()
+{
+	echo -n "Checking json output: per cluster "
+	if ParanoidAndNotRoot 0
+	then
+		echo "[Skip] paranoid and not root"
+		return
+	fi
+	perf stat -j --per-cluster -a true 2>&1 | $PYTHON $pythonchecker --per-cluster
+	echo "[Success]"
+}
+
 check_per_die()
 {
 	echo -n "Checking json output: per die "
@@ -174,6 +186,7 @@ if [ $skip_test -ne 1 ]
 then
 	check_system_wide_no_aggr
 	check_per_core
+	check_per_cluster
 	check_per_die
 	check_per_socket
 else
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 5e564974fba4..636b3bc0cc04 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -227,6 +227,8 @@ static int aggr_cpu_id__cmp(const void *a_pointer, const void *b_pointer)
 		return a->socket - b->socket;
 	else if (a->die != b->die)
 		return a->die - b->die;
+	else if (a->cluster != b->cluster)
+		return a->cluster - b->cluster;
 	else if (a->core != b->core)
 		return a->core - b->core;
 	else
@@ -310,6 +312,25 @@ struct aggr_cpu_id aggr_cpu_id__die(struct perf_cpu cpu, void *data)
 	return id;
 }
 
+int cpu__get_cluster_id(struct perf_cpu cpu)
+{
+	int value, ret = cpu__get_topology_int(cpu.cpu, "cluster_id", &value);
+	return ret ?: value;
+}
+
+struct aggr_cpu_id aggr_cpu_id__cluster(struct perf_cpu cpu, void *data)
+{
+	int cluster = cpu__get_cluster_id(cpu);
+	struct aggr_cpu_id id;
+
+	id = aggr_cpu_id__die(cpu, data);
+	if (aggr_cpu_id__is_empty(&id))
+		return id;
+
+	id.cluster = cluster;
+	return id;
+}
+
 int cpu__get_core_id(struct perf_cpu cpu)
 {
 	int value, ret = cpu__get_topology_int(cpu.cpu, "core_id", &value);
@@ -321,8 +342,8 @@ struct aggr_cpu_id aggr_cpu_id__core(struct perf_cpu cpu, void *data)
 	struct aggr_cpu_id id;
 	int core = cpu__get_core_id(cpu);
 
-	/* aggr_cpu_id__die returns a struct with socket and die set. */
-	id = aggr_cpu_id__die(cpu, data);
+	/* aggr_cpu_id__cluster returns a struct with socket, die and cluster set. */
+	id = aggr_cpu_id__cluster(cpu, data);
 	if (aggr_cpu_id__is_empty(&id))
 		return id;
 
@@ -684,6 +705,7 @@ bool aggr_cpu_id__equal(const struct aggr_cpu_id *a, const struct aggr_cpu_id *b
 		a->node == b->node &&
 		a->socket == b->socket &&
 		a->die == b->die &&
+		a->cluster == b->cluster &&
 		a->core == b->core &&
 		a->cpu.cpu == b->cpu.cpu;
 }
@@ -694,6 +716,7 @@ bool aggr_cpu_id__is_empty(const struct aggr_cpu_id *a)
 		a->node == -1 &&
 		a->socket == -1 &&
 		a->die == -1 &&
+		a->cluster == -1 &&
 		a->core == -1 &&
 		a->cpu.cpu == -1;
 }
@@ -705,6 +728,7 @@ struct aggr_cpu_id aggr_cpu_id__empty(void)
 		.node = -1,
 		.socket = -1,
 		.die = -1,
+		.cluster = -1,
 		.core = -1,
 		.cpu = (struct perf_cpu){ .cpu = -1 },
 	};
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index c2f5824a3a22..e3b6b8c1b0b4 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -20,6 +20,8 @@ struct aggr_cpu_id {
 	int socket;
 	/** The die id as read from /sys/devices/system/cpu/cpuX/topology/die_id. */
 	int die;
+	/** The cluster id as read from /sys/devices/system/cpu/cpuX/topology/cluster_id. */
+	int cluster;
 	/** The core id as read from /sys/devices/system/cpu/cpuX/topology/core_id. */
 	int core;
 	/** CPU aggregation, note there is one CPU for each SMT thread. */
@@ -76,6 +78,11 @@ int cpu__get_socket_id(struct perf_cpu cpu);
  * /sys/devices/system/cpu/cpuX/topology/die_id for the given CPU.
  */
 int cpu__get_die_id(struct perf_cpu cpu);
+/**
+ * cpu__get_cluster_id - Returns the cluster id as read from
+ * /sys/devices/system/cpu/cpuX/topology/cluster_id for the given CPU.
+ */
+int cpu__get_cluster_id(struct perf_cpu cpu);
 /**
  * cpu__get_core_id - Returns the core id as read from
  * /sys/devices/system/cpu/cpuX/topology/core_id for the given CPU.
@@ -117,9 +124,15 @@ struct aggr_cpu_id aggr_cpu_id__socket(struct perf_cpu cpu, void *data);
  */
 struct aggr_cpu_id aggr_cpu_id__die(struct perf_cpu cpu, void *data);
 /**
- * aggr_cpu_id__core - Create an aggr_cpu_id with the core, die and socket
- * populated with the core, die and socket for cpu. The function signature is
- * compatible with aggr_cpu_id_get_t.
+ * aggr_cpu_id__cluster - Create an aggr_cpu_id with cluster, die and socket
+ * populated with the cluster, die and socket for cpu. The function signature
+ * is compatible with aggr_cpu_id_get_t.
+ */
+struct aggr_cpu_id aggr_cpu_id__cluster(struct perf_cpu cpu, void *data);
+/**
+ * aggr_cpu_id__core - Create an aggr_cpu_id with the core, cluster, die and
+ * socket populated with the core, cluster, die and socket for cpu. The
+ * function signature is compatible with aggr_cpu_id_get_t.
  */
 struct aggr_cpu_id aggr_cpu_id__core(struct perf_cpu cpu, void *data);
 /**
diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h
index 4566c51f2fd9..e288649627d5 100644
--- a/tools/perf/util/env.h
+++ b/tools/perf/util/env.h
@@ -12,6 +12,7 @@ struct perf_cpu_map;
 struct cpu_topology_map {
 	int	socket_id;
 	int	die_id;
+	int	cluster_id;
 	int	core_id;
 };
 
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 1b5cb20efd23..a6aca21ab040 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -193,6 +193,9 @@ static void print_aggr_id_std(struct perf_stat_config *config,
 	case AGGR_CORE:
 		snprintf(buf, sizeof(buf), "S%d-D%d-C%d", id.socket, id.die, id.core);
 		break;
+	case AGGR_CLUSTER:
+		snprintf(buf, sizeof(buf), "S%d-D%d-CLS%d", id.socket, id.die, id.cluster);
+		break;
 	case AGGR_DIE:
 		snprintf(buf, sizeof(buf), "S%d-D%d", id.socket, id.die);
 		break;
@@ -239,6 +242,10 @@ static void print_aggr_id_csv(struct perf_stat_config *config,
 		fprintf(output, "S%d-D%d-C%d%s%d%s",
 			id.socket, id.die, id.core, sep, nr, sep);
 		break;
+	case AGGR_CLUSTER:
+		fprintf(output, "S%d-D%d-CLS%d%s%d%s",
+			id.socket, id.die, id.cluster, sep, nr, sep);
+		break;
 	case AGGR_DIE:
 		fprintf(output, "S%d-D%d%s%d%s",
 			id.socket, id.die, sep, nr, sep);
@@ -284,6 +291,10 @@ static void print_aggr_id_json(struct perf_stat_config *config,
 		fprintf(output, "\"core\" : \"S%d-D%d-C%d\", \"aggregate-number\" : %d, ",
 			id.socket, id.die, id.core, nr);
 		break;
+	case AGGR_CLUSTER:
+		fprintf(output, "\"cluster\" : \"S%d-D%d-CLS%d\", \"aggregate-number\" : %d, ",
+			id.socket, id.die, id.cluster, nr);
+		break;
 	case AGGR_DIE:
 		fprintf(output, "\"die\" : \"S%d-D%d\", \"aggregate-number\" : %d, ",
 			id.socket, id.die, nr);
@@ -1126,6 +1137,7 @@ static void print_header_interval_std(struct perf_stat_config *config,
 	case AGGR_NODE:
 	case AGGR_SOCKET:
 	case AGGR_DIE:
+	case AGGR_CLUSTER:
 	case AGGR_CORE:
 		fprintf(output, "#%*s %-*s cpus",
 			INTERVAL_LEN - 1, "time",
@@ -1422,6 +1434,7 @@ void evlist__print_counters(struct evlist *evlist, struct perf_stat_config *conf
 
 	switch (config->aggr_mode) {
 	case AGGR_CORE:
+	case AGGR_CLUSTER:
 	case AGGR_DIE:
 	case AGGR_SOCKET:
 	case AGGR_NODE:
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index bf1794ebc916..9efbfbc25bc7 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -74,6 +74,7 @@ enum aggr_mode {
 	AGGR_GLOBAL,
 	AGGR_SOCKET,
 	AGGR_DIE,
+	AGGR_CLUSTER,
 	AGGR_CORE,
 	AGGR_THREAD,
 	AGGR_UNSET,
-- 
2.24.0



* Re: [PATCH] perf stat: Support per-cluster aggregation
  2023-03-13  8:59 [PATCH] perf stat: Support per-cluster aggregation Yicong Yang
@ 2023-03-23 13:03 ` Yicong Yang
  2023-03-24  2:34 ` Jie Zhan
  2023-03-24 18:05 ` Chen, Tim C
  2 siblings, 0 replies; 10+ messages in thread
From: Yicong Yang @ 2023-03-23 13:03 UTC (permalink / raw)
  To: acme, mark.rutland, peterz, mingo, james.clark,
	alexander.shishkin, linux-perf-users, linux-kernel
  Cc: yangyicong, Jonathan.Cameron, 21cnbao, tim.c.chen, prime.zeng,
	shenyang39, linuxarm

Hi perf maintainers,

A gentle ping. I'd appreciate any comments!

Thanks

On 2023/3/13 16:59, Yicong Yang wrote:
> From: Yicong Yang <yangyicong@hisilicon.com>
> 
> Some platforms have 'cluster' topology and CPUs in the cluster will
> share resources like L3 Cache Tag (for HiSilicon Kunpeng SoC) or L2
> cache (for Intel Jacobsville). Parsing and building cluster
> topology have been supported since [1].
> 
> [...]


* Re: [PATCH] perf stat: Support per-cluster aggregation
  2023-03-13  8:59 [PATCH] perf stat: Support per-cluster aggregation Yicong Yang
  2023-03-23 13:03 ` Yicong Yang
@ 2023-03-24  2:34 ` Jie Zhan
  2023-03-24 12:24   ` Jonathan Cameron
  2023-03-24 18:05 ` Chen, Tim C
  2 siblings, 1 reply; 10+ messages in thread
From: Jie Zhan @ 2023-03-24  2:34 UTC (permalink / raw)
  To: Yicong Yang, acme, mark.rutland, peterz, mingo, james.clark,
	alexander.shishkin, linux-perf-users, linux-kernel
  Cc: Jonathan.Cameron, 21cnbao, tim.c.chen, prime.zeng, shenyang39,
	linuxarm, yangyicong


On 13/03/2023 16:59, Yicong Yang wrote:
> From: Yicong Yang <yangyicong@hisilicon.com>
>
> Some platforms have 'cluster' topology and CPUs in the cluster will
> share resources like L3 Cache Tag (for HiSilicon Kunpeng SoC) or L2
> cache (for Intel Jacobsville). Parsing and building cluster
> topology have been supported since [1].
>
> perf stat has already supported aggregation for other topologies like
> die or socket, etc. It'll be useful to aggregate per-cluster to find
> problems like L3T bandwidth contention or imbalance.
>
> This patch adds support for "--per-cluster" option for per-cluster
> aggregation. Also update the docs and related test. The output will
> be like:
>
> [root@localhost tmp]# perf stat -a -e LLC-load --per-cluster -- sleep 5
>
>   Performance counter stats for 'system wide':
>
> S56-D0-CLS158    4      1,321,521,570      LLC-load
> S56-D0-CLS594    4        794,211,453      LLC-load
> S56-D0-CLS1030    4             41,623      LLC-load
> S56-D0-CLS1466    4             41,646      LLC-load
> S56-D0-CLS1902    4             16,863      LLC-load
> S56-D0-CLS2338    4             15,721      LLC-load
> S56-D0-CLS2774    4             22,671      LLC-load
> [...]
>
> [1] commit c5e22feffdd7 ("topology: Represent clusters of CPUs within a die")
>
> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>

An end user may have to check sysfs to figure out what CPUs those 
cluster IDs account for.

Any better method to show the mapping between CPUs and cluster IDs?

Perhaps adding a conditional cluster id (when there are clusters) in the 
"--per-core" output may help.

Apart from that, this works well on my aarch64.

Tested-by: Jie Zhan <zhanjie9@hisilicon.com>




* Re: [PATCH] perf stat: Support per-cluster aggregation
  2023-03-24  2:34 ` Jie Zhan
@ 2023-03-24 12:24   ` Jonathan Cameron
  2023-03-24 12:30     ` Jonathan Cameron
  0 siblings, 1 reply; 10+ messages in thread
From: Jonathan Cameron @ 2023-03-24 12:24 UTC (permalink / raw)
  To: Jie Zhan
  Cc: Yicong Yang, acme, mark.rutland, peterz, mingo, james.clark,
	alexander.shishkin, linux-perf-users, linux-kernel, 21cnbao,
	tim.c.chen, prime.zeng, shenyang39, linuxarm, yangyicong

On Fri, 24 Mar 2023 10:34:33 +0800
Jie Zhan <zhanjie9@hisilicon.com> wrote:

> On 13/03/2023 16:59, Yicong Yang wrote:
> > From: Yicong Yang <yangyicong@hisilicon.com>
> >
> > Some platforms have 'cluster' topology and CPUs in the cluster will
> > share resources like L3 Cache Tag (for HiSilicon Kunpeng SoC) or L2
> > cache (for Intel Jacobsville). Parsing and building cluster
> > topology have been supported since [1].
> >
> > perf stat has already supported aggregation for other topologies like
> > die or socket, etc. It'll be useful to aggregate per-cluster to find
> > problems like L3T bandwidth contention or imbalance.
> >
> > This patch adds support for "--per-cluster" option for per-cluster
> > aggregation. Also update the docs and related test. The output will
> > be like:
> >
> > [root@localhost tmp]# perf stat -a -e LLC-load --per-cluster -- sleep 5
> >
> >   Performance counter stats for 'system wide':
> >
> > S56-D0-CLS158    4      1,321,521,570      LLC-load
> > S56-D0-CLS594    4        794,211,453      LLC-load
> > S56-D0-CLS1030    4             41,623      LLC-load
> > S56-D0-CLS1466    4             41,646      LLC-load
> > S56-D0-CLS1902    4             16,863      LLC-load
> > S56-D0-CLS2338    4             15,721      LLC-load
> > S56-D0-CLS2774    4             22,671      LLC-load
> > [...]
> >
> > [1] commit c5e22feffdd7 ("topology: Represent clusters of CPUs within a die")
> >
> > Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>  
> 
> An end user may have to check sysfs to figure out what CPUs those 
> cluster IDs account for.
> 
> Any better method to show the mapping between CPUs and cluster IDs?

The cluster code is capable of using the ACPI_PPTT_ACPI_PROCESSOR_ID field
if valid for the cluster level of PPTT.

The numbers in the example above look like offsets into the PPTT table
so I think the PPTT table is missing that information.
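
(If you want to double check what the firmware provided, the table can
be pulled apart with the standard acpica tools; roughly:

  acpidump -n PPTT -o pptt.out
  acpixtract -a pptt.out
  iasl -d pptt.dat

then look at whether the "ACPI Processor ID valid" flag is set on the
cluster level nodes.)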

Whilst not a great description anyway (it's just an index), the UUID
that would be in there can convey more info on which cluster this is.


> 
> Perhaps adding a conditional cluster id (when there are clusters) in the 
> "--per-core" output may help.

That's an interesting idea.  You'd want to include the other levels
if doing that.  So whenever you do a --per-xxx it also provides the
cluster / die / node / socket etc. as relevant 'above' the level of xxx.
The fun part is that node and die can flip, which would make this tricky to do.

Jonathan

> 
> Apart from that, this works well on my aarch64.
> 
> Tested-by: Jie Zhan <zhanjie9@hisilicon.com>

> 
> 



* Re: [PATCH] perf stat: Support per-cluster aggregation
  2023-03-24 12:24   ` Jonathan Cameron
@ 2023-03-24 12:30     ` Jonathan Cameron
  2023-03-27  6:20       ` Yicong Yang
  0 siblings, 1 reply; 10+ messages in thread
From: Jonathan Cameron @ 2023-03-24 12:30 UTC (permalink / raw)
  To: Jie Zhan
  Cc: Yicong Yang, acme, mark.rutland, peterz, mingo, james.clark,
	alexander.shishkin, linux-perf-users, linux-kernel, 21cnbao,
	tim.c.chen, prime.zeng, shenyang39, linuxarm, yangyicong

On Fri, 24 Mar 2023 12:24:22 +0000
Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote:

> On Fri, 24 Mar 2023 10:34:33 +0800
> Jie Zhan <zhanjie9@hisilicon.com> wrote:
> 
> > On 13/03/2023 16:59, Yicong Yang wrote:  
> > > From: Yicong Yang <yangyicong@hisilicon.com>
> > >
> > > Some platforms have 'cluster' topology and CPUs in the cluster will
> > > share resources like L3 Cache Tag (for HiSilicon Kunpeng SoC) or L2
> > > cache (for Intel Jacobsville). Parsing and building cluster
> > > topology have been supported since [1].
> > >
> > > perf stat has already supported aggregation for other topologies like
> > > die or socket, etc. It'll be useful to aggregate per-cluster to find
> > > problems like L3T bandwidth contention or imbalance.
> > >
> > > This patch adds support for "--per-cluster" option for per-cluster
> > > aggregation. Also update the docs and related test. The output will
> > > be like:
> > >
> > > [root@localhost tmp]# perf stat -a -e LLC-load --per-cluster -- sleep 5
> > >
> > >   Performance counter stats for 'system wide':
> > >
> > > S56-D0-CLS158    4      1,321,521,570      LLC-load
> > > S56-D0-CLS594    4        794,211,453      LLC-load
> > > S56-D0-CLS1030    4             41,623      LLC-load
> > > S56-D0-CLS1466    4             41,646      LLC-load
> > > S56-D0-CLS1902    4             16,863      LLC-load
> > > S56-D0-CLS2338    4             15,721      LLC-load
> > > S56-D0-CLS2774    4             22,671      LLC-load
> > > [...]
> > >
> > > [1] commit c5e22feffdd7 ("topology: Represent clusters of CPUs within a die")
> > >
> > > Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>    
> > 
> > An end user may have to check sysfs to figure out what CPUs those 
> > cluster IDs account for.
> > 
> > Any better method to show the mapping between CPUs and cluster IDs?  
> 
> The cluster code is capable of using the ACPI_PPTT_ACPI_PROCESSOR_ID field
> if valid for the cluster level of PPTT.
> 
> The numbers in the example above look like offsets into the PPTT table
> so I think the PPTT table is missing that information.
> 
> Whilst not a great description anyway (it's just an index), the UUID
> that would be in there can convey more info on which cluster this is.
> 
> 
> > 
> > Perhaps adding a conditional cluster id (when there are clusters) in the 
> > "--per-core" output may help.  
> 
> That's an interesting idea.  You'd want to include the other levels
> if doing that.  So whenever you do a --per-xxx it also provides the
> cluster / die / node / socket etc as relevant 'above' the level of xxx
> Fun is that node and die can flip which would make this tricky to do.

Ignore me on this.  I'd not looked at the patch closely when I wrote
this.  Clearly a lot of this information is already provided - the
suggestion was to consider adding cluster to that mix which makes
sense to me.

Jonathan

> 
> Jonathan
> 
> > 
> > Apart from that, this works well on my aarch64.
> > 
> > Tested-by: Jie Zhan <zhanjie9@hisilicon.com>  
> 
> > 
> >   
> 



* RE: [PATCH] perf stat: Support per-cluster aggregation
  2023-03-13  8:59 [PATCH] perf stat: Support per-cluster aggregation Yicong Yang
  2023-03-23 13:03 ` Yicong Yang
  2023-03-24  2:34 ` Jie Zhan
@ 2023-03-24 18:05 ` Chen, Tim C
  2023-03-27  4:03   ` Yicong Yang
  2023-03-29  6:47   ` Namhyung Kim
  2 siblings, 2 replies; 10+ messages in thread
From: Chen, Tim C @ 2023-03-24 18:05 UTC (permalink / raw)
  To: Yicong Yang, acme, mark.rutland, peterz, mingo, james.clark,
	alexander.shishkin, linux-perf-users, linux-kernel
  Cc: Jonathan.Cameron, 21cnbao, prime.zeng, shenyang39, linuxarm, yangyicong

>
>From: Yicong Yang <yangyicong@hisilicon.com>
>
>Some platforms have 'cluster' topology and CPUs in the cluster will share
>resources like L3 Cache Tag (for HiSilicon Kunpeng SoC) or L2 cache (for Intel
>Jacobsville). Parsing and building cluster topology have been
>supported since [1].
>
>perf stat has already supported aggregation for other topologies like die or
>socket, etc. It'll be useful to aggregate per-cluster to find problems like L3T
>bandwidth contention or imbalance.
>
>This patch adds support for "--per-cluster" option for per-cluster aggregation.
>Also update the docs and related test. The output will be like:
>
>[root@localhost tmp]# perf stat -a -e LLC-load --per-cluster -- sleep 5
>
> Performance counter stats for 'system wide':
>
>S56-D0-CLS158    4      1,321,521,570      LLC-load
>S56-D0-CLS594    4        794,211,453      LLC-load
>S56-D0-CLS1030    4             41,623      LLC-load
>S56-D0-CLS1466    4             41,646      LLC-load
>S56-D0-CLS1902    4             16,863      LLC-load
>S56-D0-CLS2338    4             15,721      LLC-load
>S56-D0-CLS2774    4             22,671      LLC-load
>[...]

Overall it looks good.  You can add my reviewed-by.

I wonder if we could enhance the help message
in perf stat to tell the user to refer to
/sys/devices/system/cpu/cpuX/topology/*_id
to map the relevant ids back to the overall cpu topology.

For example, in the output above, cluster S56-D0-CLS158 has
a really heavy load. It took me a while
going through the code to figure out how to find
the info that maps a cluster id to cpus.
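
Something along these lines would have saved me the search (assuming a
kernel new enough to expose cluster_id in sysfs):

  grep . /sys/devices/system/cpu/cpu*/topology/cluster_id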

Tim


* Re: [PATCH] perf stat: Support per-cluster aggregation
  2023-03-24 18:05 ` Chen, Tim C
@ 2023-03-27  4:03   ` Yicong Yang
  2023-03-29  6:47   ` Namhyung Kim
  1 sibling, 0 replies; 10+ messages in thread
From: Yicong Yang @ 2023-03-27  4:03 UTC (permalink / raw)
  To: Chen, Tim C, acme, mark.rutland, peterz, mingo, james.clark,
	alexander.shishkin, linux-perf-users, linux-kernel
  Cc: yangyicong, Jonathan.Cameron, 21cnbao, prime.zeng, shenyang39, linuxarm

Hi Tim,

On 2023/3/25 2:05, Chen, Tim C wrote:
>>
>> From: Yicong Yang <yangyicong@hisilicon.com>
>>
>> Some platforms have 'cluster' topology and CPUs in the cluster will share
>> resources like L3 Cache Tag (for HiSilicon Kunpeng SoC) or L2 cache (for Intel
>> Jacobsville). Parsing and building cluster topology have been
>> supported since [1].
>>
>> perf stat has already supported aggregation for other topologies like die or
>> socket, etc. It'll be useful to aggregate per-cluster to find problems like L3T
>> bandwidth contention or imbalance.
>>
>> This patch adds support for "--per-cluster" option for per-cluster aggregation.
>> Also update the docs and related test. The output will be like:
>>
>> [root@localhost tmp]# perf stat -a -e LLC-load --per-cluster -- sleep 5
>>
>> Performance counter stats for 'system wide':
>>
>> S56-D0-CLS158    4      1,321,521,570      LLC-load
>> S56-D0-CLS594    4        794,211,453      LLC-load
>> S56-D0-CLS1030    4             41,623      LLC-load
>> S56-D0-CLS1466    4             41,646      LLC-load
>> S56-D0-CLS1902    4             16,863      LLC-load
>> S56-D0-CLS2338    4             15,721      LLC-load
>> S56-D0-CLS2774    4             22,671      LLC-load
>> [...]
> 
> Overall it looks good.  You can add my reviewed-by.
> 

Thanks.

> I wonder if we could enhance the help message
> in perf stat to tell the user to refer to
> /sys/devices/system/cpu/cpuX/topology/*_id
> to map the relevant ids back to the overall cpu topology.
> 
> For example, in the output above, cluster S56-D0-CLS158 has
> a really heavy load. It took me a while
> going through the code to figure out how to find
> the info that maps a cluster id to cpus.
> 

Yes, indeed. Actually this is because my BIOS doesn't report a valid
ID for these topologies, so ACPI uses the offset of the topology
node in the PPTT as a fallback. Other topologies suffer the same:
On my machine:
[root@localhost debug]# perf stat --per-socket -e cycles -a -- sleep 1

 Performance counter stats for 'system wide':

S56      64         21,563,375      cycles
S7182    64         32,140,641      cycles

       1.008520310 seconds time elapsed

On x86:
root@ubuntu204:/home/yang/linux/tools/perf# ./perf stat -a --per-socket -e cycles -- sleep 1

 Performance counter stats for 'system wide':

S0       40        137,205,897      cycles
S1       40         67,720,731      cycles

       1.003546720 seconds time elapsed

Maybe I can try to add a separate patch for describing the source of the
topology ids in the perf manual.

Thanks,
Yicong



* Re: [PATCH] perf stat: Support per-cluster aggregation
  2023-03-24 12:30     ` Jonathan Cameron
@ 2023-03-27  6:20       ` Yicong Yang
  0 siblings, 0 replies; 10+ messages in thread
From: Yicong Yang @ 2023-03-27  6:20 UTC (permalink / raw)
  To: Jonathan Cameron, Jie Zhan
  Cc: yangyicong, acme, mark.rutland, peterz, mingo, james.clark,
	alexander.shishkin, linux-perf-users, linux-kernel, 21cnbao,
	tim.c.chen, prime.zeng, shenyang39, linuxarm

Hi Jie and Jonathan,

On 2023/3/24 20:30, Jonathan Cameron wrote:
> On Fri, 24 Mar 2023 12:24:22 +0000
> Jonathan Cameron <Jonathan.Cameron@Huawei.com> wrote:
> 
>> On Fri, 24 Mar 2023 10:34:33 +0800
>> Jie Zhan <zhanjie9@hisilicon.com> wrote:
>>
>>> On 13/03/2023 16:59, Yicong Yang wrote:  
>>>> From: Yicong Yang <yangyicong@hisilicon.com>
>>>>
>>>> Some platforms have 'cluster' topology and CPUs in the cluster will
>>>> share resources like L3 Cache Tag (for HiSilicon Kunpeng SoC) or L2
>>>> cache (for Intel Jacobsville). Parsing and building cluster
>>>> topology have been supported since [1].
>>>>
>>>> perf stat has already supported aggregation for other topologies like
>>>> die or socket, etc. It'll be useful to aggregate per-cluster to find
>>>> problems like L3T bandwidth contention or imbalance.
>>>>
>>>> This patch adds support for "--per-cluster" option for per-cluster
>>>> aggregation. Also update the docs and related test. The output will
>>>> be like:
>>>>
>>>> [root@localhost tmp]# perf stat -a -e LLC-load --per-cluster -- sleep 5
>>>>
>>>>   Performance counter stats for 'system wide':
>>>>
>>>> S56-D0-CLS158    4      1,321,521,570      LLC-load
>>>> S56-D0-CLS594    4        794,211,453      LLC-load
>>>> S56-D0-CLS1030    4             41,623      LLC-load
>>>> S56-D0-CLS1466    4             41,646      LLC-load
>>>> S56-D0-CLS1902    4             16,863      LLC-load
>>>> S56-D0-CLS2338    4             15,721      LLC-load
>>>> S56-D0-CLS2774    4             22,671      LLC-load
>>>> [...]
>>>>
>>>> [1] commit c5e22feffdd7 ("topology: Represent clusters of CPUs within a die")
>>>>
>>>> Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>    
>>>
>>> An end user may have to check sysfs to figure out what CPUs those 
>>> cluster IDs account for.
>>>
>>> Any better method to show the mapping between CPUs and cluster IDs?  
>>
>> The cluster code is capable of using the ACPI_PPTT_ACPI_PROCESSOR_ID field
>> if valid for the cluster level of PPTT.
>>
>> The numbers in the example above look like offsets into the PPTT table
>> so I think the PPTT table is missing that information.
>>

Yes it is; the PPTT doesn't give a valid ID on my machine for cluster
and other topologies. It's not a problem with this patch.

>> Whilst not a great description anyway (it's just an index), the UUID
>> that would be in there can convey more info on which cluster this is.
>>
>>
>>>
>>> Perhaps adding a conditional cluster id (when there are clusters) in the 
>>> "--per-core" output may help.  
>>
>> That's an interesting idea.  You'd want to include the other levels
>> if doing that.  So whenever you do a --per-xxx it also provides the
>> cluster / die / node / socket etc. as relevant 'above' the level of xxx.
>> The fun part is that node and die can flip, which would make this tricky to do.
> 
> Ignore me on this.  I'd not looked at the patch closely when I wrote
> this.  Clearly a lot of this information is already provided - the
> suggestion was to consider adding cluster to that mix which makes
> sense to me.
> 

In an early version of this patch I added the cluster info to the "--per-core"
output as "Sxxx-Dxxx-CLSxxx-Cxxx". But I decided to keep the output as is so
as not to break existing tools/scripts that parse the --per-core output. Maybe
we can add it later if there's a requirement.
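
For illustration, that earlier (dropped) format would have printed lines
like these (the counts are hypothetical):

S56-D0-CLS158-C0    2          1,321,521      LLC-load
S56-D0-CLS158-C1    2            794,211      LLC-load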

Thanks,
Yicong


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] perf stat: Support per-cluster aggregation
  2023-03-24 18:05 ` Chen, Tim C
  2023-03-27  4:03   ` Yicong Yang
@ 2023-03-29  6:47   ` Namhyung Kim
  2023-03-29 12:46     ` Yicong Yang
  1 sibling, 1 reply; 10+ messages in thread
From: Namhyung Kim @ 2023-03-29  6:47 UTC (permalink / raw)
  To: Chen, Tim C
  Cc: Yicong Yang, acme, mark.rutland, peterz, mingo, james.clark,
	alexander.shishkin, linux-perf-users, linux-kernel,
	Jonathan.Cameron, 21cnbao, prime.zeng, shenyang39, linuxarm,
	yangyicong

Hello,

On Fri, Mar 24, 2023 at 11:09 AM Chen, Tim C <tim.c.chen@intel.com> wrote:
>
> >
> >From: Yicong Yang <yangyicong@hisilicon.com>
> >
> >Some platforms have 'cluster' topology and CPUs in the cluster will share
> >resources like L3 Cache Tag (for HiSilicon Kunpeng SoC) or L2 cache (for Intel
> >Jacobsville). Currently parsing and building cluster topology have been
> >supported since [1].
> >
> >perf stat has already supported aggregation for other topologies like die or
> >socket, etc. It'll be useful to aggregate per-cluster to find problems like L3T
> >bandwidth contention or imbalance.
> >
> >This patch adds support for "--per-cluster" option for per-cluster aggregation.
> >Also update the docs and related test. The output will be like:
> >
> >[root@localhost tmp]# perf stat -a -e LLC-load --per-cluster -- sleep 5
> >
> > Performance counter stats for 'system wide':
> >
> >S56-D0-CLS158    4      1,321,521,570      LLC-load
> >S56-D0-CLS594    4        794,211,453      LLC-load
> >S56-D0-CLS1030    4             41,623      LLC-load
> >S56-D0-CLS1466    4             41,646      LLC-load
> >S56-D0-CLS1902    4             16,863      LLC-load
> >S56-D0-CLS2338    4             15,721      LLC-load
> >S56-D0-CLS2774    4             22,671      LLC-load
> >[...]
>
> Overall it looks good.  You can add my reviewed-by.
>
> I wonder if we could enhance the help message
> in perf stat to tell users to refer to
> /sys/devices/system/cpu/cpuX/topology/*_id
> to map the relevant IDs back to the overall CPU topology.
>
> For example, in the output above, cluster S56-D0-CLS158 has
> a really heavy load. It took me a while going through the code
> to figure out how to find the info that maps a cluster ID to CPUs.

Maybe we could enhance the cpu filter to accept something
like -C S56-D0-CLS158.
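
Something like this (a hypothetical invocation of the proposed syntax,
not an option perf accepts today):

  # perf stat -a -e LLC-load -C S56-D0-CLS158 -- sleep 5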

I also wonder what happens if it runs on an old kernel which doesn't
have the cluster_id file.

Thanks,
Namhyung

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] perf stat: Support per-cluster aggregation
  2023-03-29  6:47   ` Namhyung Kim
@ 2023-03-29 12:46     ` Yicong Yang
  0 siblings, 0 replies; 10+ messages in thread
From: Yicong Yang @ 2023-03-29 12:46 UTC (permalink / raw)
  To: Namhyung Kim, Chen, Tim C
  Cc: yangyicong, acme, mark.rutland, peterz, mingo, james.clark,
	alexander.shishkin, linux-perf-users, linux-kernel,
	Jonathan.Cameron, 21cnbao, prime.zeng, shenyang39, linuxarm

On 2023/3/29 14:47, Namhyung Kim wrote:
> Hello,
> 
> On Fri, Mar 24, 2023 at 11:09 AM Chen, Tim C <tim.c.chen@intel.com> wrote:
>>
>>>
>>> From: Yicong Yang <yangyicong@hisilicon.com>
>>>
>>> Some platforms have 'cluster' topology and CPUs in the cluster will share
>>> resources like L3 Cache Tag (for HiSilicon Kunpeng SoC) or L2 cache (for Intel
>>> Jacobsville). Currently parsing and building cluster topology have been
>>> supported since [1].
>>>
>>> perf stat has already supported aggregation for other topologies like die or
>>> socket, etc. It'll be useful to aggregate per-cluster to find problems like L3T
>>> bandwidth contention or imbalance.
>>>
>>> This patch adds support for "--per-cluster" option for per-cluster aggregation.
>>> Also update the docs and related test. The output will be like:
>>>
>>> [root@localhost tmp]# perf stat -a -e LLC-load --per-cluster -- sleep 5
>>>
>>> Performance counter stats for 'system wide':
>>>
>>> S56-D0-CLS158    4      1,321,521,570      LLC-load
>>> S56-D0-CLS594    4        794,211,453      LLC-load
>>> S56-D0-CLS1030    4             41,623      LLC-load
>>> S56-D0-CLS1466    4             41,646      LLC-load
>>> S56-D0-CLS1902    4             16,863      LLC-load
>>> S56-D0-CLS2338    4             15,721      LLC-load
>>> S56-D0-CLS2774    4             22,671      LLC-load
>>> [...]
>>
>> Overall it looks good.  You can add my reviewed-by.
>>
>> I wonder if we could enhance the help message
>> in perf stat to tell users to refer to
>> /sys/devices/system/cpu/cpuX/topology/*_id
>> to map the relevant IDs back to the overall CPU topology.
>>
>> For example, in the output above, cluster S56-D0-CLS158 has
>> a really heavy load. It took me a while going through the code
>> to figure out how to find the info that maps a cluster ID to CPUs.
> 
> Maybe we could enhance the cpu filter to accept something
> like -C S56-D0-CLS158.
> 

You mean specifying the CPUs by a topology ID like S56-D0-CLS158, and
then we actually filter the CPUs in cluster 158?

> I also wonder what happens if it runs on an old kernel which doesn't
> have the cluster_id file.

It should still work, though the result may not be meaningful for the
cluster. For example, there's no die topology nor the related sysfs
attributes on arm64, but --per-die still works like this:

[root@localhost perf]# perf stat -a -e cycles --per-die -- sleep 1

 Performance counter stats for 'system wide':

S56-D0         64         12,700,186      cycles
S7182-D0       64         20,297,320      cycles

       1.003638080 seconds time elapsed

On a legacy kernel without the cluster sysfs attributes, the output will
look like:

[root@localhost perf]# perf stat -a -e cycles --per-cluster -- sleep 1

 Performance counter stats for 'system wide':

S56-D0-CLS-1   64         12,634,251      cycles
S7182-D0-CLS-1   64         16,348,322      cycles

       1.003696680 seconds time elapsed

The patch currently just assigns -1 as the cluster ID. I'll modify this
to keep it consistent with the output of --per-die. Thanks for catching
this!
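
A minimal sketch of that fix (assuming the helper names this patch adds
to tools/perf/util/cpumap.c; not the final code):

static struct aggr_cpu_id aggr_cpu_id__cluster(struct perf_cpu cpu, void *data)
{
	/* Start from the die-level ID and extend it with the cluster. */
	struct aggr_cpu_id id = aggr_cpu_id__die(cpu, data);
	int cluster = cpu__get_cluster_id(cpu);

	/*
	 * Legacy kernels have no cluster_id sysfs file, so the read
	 * reports -1. Treat that as cluster 0 to match how --per-die
	 * prints D0 when there is no die topology.
	 */
	if (cluster == -1)
		cluster = 0;

	if (!aggr_cpu_id__is_empty(&id))
		id.cluster = cluster;

	return id;
}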

Thanks,
Yicong



^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2023-03-29 12:47 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-03-13  8:59 [PATCH] perf stat: Support per-cluster aggregation Yicong Yang
2023-03-23 13:03 ` Yicong Yang
2023-03-24  2:34 ` Jie Zhan
2023-03-24 12:24   ` Jonathan Cameron
2023-03-24 12:30     ` Jonathan Cameron
2023-03-27  6:20       ` Yicong Yang
2023-03-24 18:05 ` Chen, Tim C
2023-03-27  4:03   ` Yicong Yang
2023-03-29  6:47   ` Namhyung Kim
2023-03-29 12:46     ` Yicong Yang
