[PATCH 0/3] perf stat: Add --per-numa option

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 0/3] perf stat: Add --per-numa option
@ 2019-09-02 12:12 Jiri Olsa
  2019-09-02 12:12 ` [PATCH 1/3] libperf: Add perf_cpu_map__max function Jiri Olsa
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Jiri Olsa @ 2019-09-02 12:12 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: lkml, Ingo Molnar, Namhyung Kim, Alexander Shishkin,
	Peter Zijlstra, Michael Petlan, Joe Mario, Kan Liang, Andi Kleen

hi,
adding --per-numa option to aggregate stats per NUMA node,
you can now use the stat command like:
    
  # perf stat  -a -I 1000 -e cycles --per-numa
  #           time numa   cpus             counts unit events
       1.000542550 N0       20          6,202,097      cycles
       1.000542550 N1       20            639,559      cycles
       2.002040063 N0       20          7,412,495      cycles
       2.002040063 N1       20          2,185,577      cycles
       3.003451699 N0       20          6,508,917      cycles
       3.003451699 N1       20            765,607      cycles
  ...

thanks,
jirka


---
Jiri Olsa (3):
      libperf: Add perf_cpu_map__max function
      perf tools: Add perf_env__numa_node function
      perf stat: Add --per-numa agregation support

 tools/perf/Documentation/perf-stat.txt |  5 +++++
 tools/perf/builtin-stat.c              | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++----------
 tools/perf/lib/cpumap.c                | 12 ++++++++++++
 tools/perf/lib/include/perf/cpumap.h   |  1 +
 tools/perf/lib/libperf.map             |  1 +
 tools/perf/util/cpumap.c               | 18 ++++++++++++++++++
 tools/perf/util/cpumap.h               |  3 +++
 tools/perf/util/env.c                  | 35 +++++++++++++++++++++++++++++++++++
 tools/perf/util/env.h                  |  6 ++++++
 tools/perf/util/stat-display.c         | 15 +++++++++++++++
 tools/perf/util/stat.c                 |  1 +
 tools/perf/util/stat.h                 |  1 +
 12 files changed, 148 insertions(+), 10 deletions(-)

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 1/3] libperf: Add perf_cpu_map__max function
  2019-09-02 12:12 [PATCH 0/3] perf stat: Add --per-numa option Jiri Olsa
@ 2019-09-02 12:12 ` Jiri Olsa
  2019-09-02 13:57   ` Arnaldo Carvalho de Melo
  2019-09-20 16:21   ` [tip: perf/urgent] libperf: Adopt perf_cpu_map__max() function tip-bot2 for Jiri Olsa
  2019-09-02 12:12 ` [PATCH 2/3] perf tools: Add perf_env__numa_node function Jiri Olsa
  2019-09-02 12:12 ` [PATCH 3/3] perf stat: Add --per-numa agregation support Jiri Olsa
  2 siblings, 2 replies; 12+ messages in thread
From: Jiri Olsa @ 2019-09-02 12:12 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: lkml, Ingo Molnar, Namhyung Kim, Alexander Shishkin,
	Peter Zijlstra, Michael Petlan, Joe Mario, Kan Liang, Andi Kleen

So it can be used from multiple places.

Link: http://lkml.kernel.org/n/tip-yp3h5rl9e8piybufq41zqnla@git.kernel.org
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/perf/builtin-stat.c            | 14 +-------------
 tools/perf/lib/cpumap.c              | 12 ++++++++++++
 tools/perf/lib/include/perf/cpumap.h |  1 +
 tools/perf/lib/libperf.map           |  1 +
 4 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 7e17bf9f700a..5bc0c570b7b6 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -822,18 +822,6 @@ static int perf_stat__get_core(struct perf_stat_config *config __maybe_unused,
 	return cpu_map__get_core(map, cpu, NULL);
 }
 
-static int cpu_map__get_max(struct perf_cpu_map *map)
-{
-	int i, max = -1;
-
-	for (i = 0; i < map->nr; i++) {
-		if (map->map[i] > max)
-			max = map->map[i];
-	}
-
-	return max;
-}
-
 static int perf_stat__get_aggr(struct perf_stat_config *config,
 			       aggr_get_id_t get_id, struct perf_cpu_map *map, int idx)
 {
@@ -928,7 +916,7 @@ static int perf_stat_init_aggr_mode(void)
 	 * taking the highest cpu number to be the size of
 	 * the aggregation translate cpumap.
 	 */
-	nr = cpu_map__get_max(evsel_list->core.cpus);
+	nr = perf_cpu_map__max(evsel_list->core.cpus);
 	stat_config.cpus_aggr_map = perf_cpu_map__empty_new(nr + 1);
 	return stat_config.cpus_aggr_map ? 0 : -ENOMEM;
 }
diff --git a/tools/perf/lib/cpumap.c b/tools/perf/lib/cpumap.c
index 1f0e6f334237..2ca1fafa620d 100644
--- a/tools/perf/lib/cpumap.c
+++ b/tools/perf/lib/cpumap.c
@@ -260,3 +260,15 @@ int perf_cpu_map__idx(struct perf_cpu_map *cpus, int cpu)
 
 	return -1;
 }
+
+int perf_cpu_map__max(struct perf_cpu_map *map)
+{
+	int i, max = -1;
+
+	for (i = 0; i < map->nr; i++) {
+		if (map->map[i] > max)
+			max = map->map[i];
+	}
+
+	return max;
+}
diff --git a/tools/perf/lib/include/perf/cpumap.h b/tools/perf/lib/include/perf/cpumap.h
index 8aa995c59498..ac9aa497f84a 100644
--- a/tools/perf/lib/include/perf/cpumap.h
+++ b/tools/perf/lib/include/perf/cpumap.h
@@ -16,6 +16,7 @@ LIBPERF_API void perf_cpu_map__put(struct perf_cpu_map *map);
 LIBPERF_API int perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx);
 LIBPERF_API int perf_cpu_map__nr(const struct perf_cpu_map *cpus);
 LIBPERF_API bool perf_cpu_map__empty(const struct perf_cpu_map *map);
+LIBPERF_API int perf_cpu_map__max(struct perf_cpu_map *map);
 
 #define perf_cpu_map__for_each_cpu(cpu, idx, cpus)		\
 	for ((idx) = 0, (cpu) = perf_cpu_map__cpu(cpus, idx);	\
diff --git a/tools/perf/lib/libperf.map b/tools/perf/lib/libperf.map
index dc4d66363bc4..cd0d17b996c8 100644
--- a/tools/perf/lib/libperf.map
+++ b/tools/perf/lib/libperf.map
@@ -9,6 +9,7 @@ LIBPERF_0.0.1 {
 		perf_cpu_map__nr;
 		perf_cpu_map__cpu;
 		perf_cpu_map__empty;
+		perf_cpu_map__max;
 		perf_thread_map__new_dummy;
 		perf_thread_map__set_pid;
 		perf_thread_map__comm;
-- 
2.21.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 2/3] perf tools: Add perf_env__numa_node function
  2019-09-02 12:12 [PATCH 0/3] perf stat: Add --per-numa option Jiri Olsa
  2019-09-02 12:12 ` [PATCH 1/3] libperf: Add perf_cpu_map__max function Jiri Olsa
@ 2019-09-02 12:12 ` Jiri Olsa
  2019-09-02 13:57   ` Arnaldo Carvalho de Melo
  2019-09-02 12:12 ` [PATCH 3/3] perf stat: Add --per-numa agregation support Jiri Olsa
  2 siblings, 1 reply; 12+ messages in thread
From: Jiri Olsa @ 2019-09-02 12:12 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: lkml, Ingo Molnar, Namhyung Kim, Alexander Shishkin,
	Peter Zijlstra, Michael Petlan, Joe Mario, Kan Liang, Andi Kleen

To speed up the cpu to node lookup, add the perf_env__numa_node
function, which creates a cpu-indexed array on the first lookup
that holds the numa node for each stored cpu.

Link: http://lkml.kernel.org/n/tip-qqwxklhissf3yjyuaszh6480@git.kernel.org
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/perf/util/env.c | 35 +++++++++++++++++++++++++++++++++++
 tools/perf/util/env.h |  6 ++++++
 2 files changed, 41 insertions(+)

diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index 3baca06786fb..6385961e45df 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -179,6 +179,7 @@ void perf_env__exit(struct perf_env *env)
 	zfree(&env->sibling_threads);
 	zfree(&env->pmu_mappings);
 	zfree(&env->cpu);
+	zfree(&env->numa_map);
 
 	for (i = 0; i < env->nr_numa_nodes; i++)
 		perf_cpu_map__put(env->numa_nodes[i].map);
@@ -338,3 +339,37 @@ const char *perf_env__arch(struct perf_env *env)
 
 	return normalize_arch(arch_name);
 }
+
+
+int perf_env__numa_node(struct perf_env *env, int cpu)
+{
+	if (!env->nr_numa_map) {
+		struct numa_node *nn;
+		int i, nr = 0;
+
+		for (i = 0; i < env->nr_numa_nodes; i++) {
+			nn = &env->numa_nodes[i];
+			nr = max(nr, perf_cpu_map__max(nn->map));
+		}
+
+		nr++;
+		env->numa_map = zalloc(nr * sizeof(int));
+		if (!env->numa_map)
+			return -1;
+
+		for (i = 0; i < nr; i++)
+			env->numa_map[i] = -1;
+
+		env->nr_numa_map = nr;
+
+		for (i = 0; i < env->nr_numa_nodes; i++) {
+			int tmp, j;
+
+			nn = &env->numa_nodes[i];
+			perf_cpu_map__for_each_cpu(j, tmp, nn->map)
+				env->numa_map[j] = i;
+		}
+	}
+
+	return cpu >= 0 && cpu < env->nr_numa_map ? env->numa_map[cpu] : -1;
+}
diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h
index d8e083d42610..777008f8007a 100644
--- a/tools/perf/util/env.h
+++ b/tools/perf/util/env.h
@@ -86,6 +86,10 @@ struct perf_env {
 		struct rb_root		btfs;
 		u32			btfs_cnt;
 	} bpf_progs;
+
+	/* For fast cpu to numa node lookup via perf_env__numa_node */
+	int			*numa_map;
+	int			 nr_numa_map;
 };
 
 enum perf_compress_type {
@@ -118,4 +122,6 @@ struct bpf_prog_info_node *perf_env__find_bpf_prog_info(struct perf_env *env,
 							__u32 prog_id);
 void perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node);
 struct btf_node *perf_env__find_btf(struct perf_env *env, __u32 btf_id);
+
+int perf_env__numa_node(struct perf_env *env, int cpu);
 #endif /* __PERF_ENV_H */
-- 
2.21.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 3/3] perf stat: Add --per-numa agregation support
  2019-09-02 12:12 [PATCH 0/3] perf stat: Add --per-numa option Jiri Olsa
  2019-09-02 12:12 ` [PATCH 1/3] libperf: Add perf_cpu_map__max function Jiri Olsa
  2019-09-02 12:12 ` [PATCH 2/3] perf tools: Add perf_env__numa_node function Jiri Olsa
@ 2019-09-02 12:12 ` Jiri Olsa
  2019-09-02 15:13   ` Alexey Budankov
  2 siblings, 1 reply; 12+ messages in thread
From: Jiri Olsa @ 2019-09-02 12:12 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: lkml, Ingo Molnar, Namhyung Kim, Alexander Shishkin,
	Peter Zijlstra, Michael Petlan, Joe Mario, Kan Liang, Andi Kleen

Adding new --per-numa option to aggregate counts per NUMA
nodes for system-wide mode measurements.

You can specify --per-numa in live mode:

  # perf stat  -a -I 1000 -e cycles --per-numa
  #           time numa   cpus             counts unit events
       1.000542550 N0       20          6,202,097      cycles
       1.000542550 N1       20            639,559      cycles
       2.002040063 N0       20          7,412,495      cycles
       2.002040063 N1       20          2,185,577      cycles
       3.003451699 N0       20          6,508,917      cycles
       3.003451699 N1       20            765,607      cycles
  ...

Or in the record/report stat session:

  # perf stat record -a -I 1000 -e cycles
  #           time             counts unit events
       1.000536937         10,008,468      cycles
       2.002090152          9,578,539      cycles
       3.003625233          7,647,869      cycles
       4.005135036          7,032,086      cycles
  ^C     4.340902364          3,923,893      cycles

  # perf stat report --per-numa
  #           time numa   cpus             counts unit events
       1.000536937 N0       20          9,355,086      cycles
       1.000536937 N1       20            653,382      cycles
       2.002090152 N0       20          7,712,838      cycles
       2.002090152 N1       20          1,865,701      cycles
       3.003625233 N0       20          6,604,441      cycles
       3.003625233 N1       20          1,043,428      cycles
       4.005135036 N0       20          6,350,522      cycles
       4.005135036 N1       20            681,564      cycles
       4.340902364 N0       20          3,403,188      cycles
       4.340902364 N1       20            520,705      cycles

Link: http://lkml.kernel.org/n/tip-h57ftv8vmqrgzz3kdvlvh4yk@git.kernel.org
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/perf/Documentation/perf-stat.txt |  5 +++
 tools/perf/builtin-stat.c              | 52 ++++++++++++++++++++++++++
 tools/perf/util/cpumap.c               | 18 +++++++++
 tools/perf/util/cpumap.h               |  3 ++
 tools/perf/util/stat-display.c         | 15 ++++++++
 tools/perf/util/stat.c                 |  1 +
 tools/perf/util/stat.h                 |  1 +
 7 files changed, 95 insertions(+)

diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 930c51c01201..74299dc2ffd1 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -217,6 +217,11 @@ core number and the number of online logical processors on that physical process
 Aggregate counts per monitored threads, when monitoring threads (-t option)
 or processes (-p option).
 
+--per-numa::
+Aggregate counts per NUMA nodes for system-wide mode measurements. This
+is a useful mode to detect imbalance between NUMA nodes. To enable this
+mode, use --per-numa in addition to -a (system-wide).
+
 -D msecs::
 --delay msecs::
 After starting the program, wait msecs before measuring. This is useful to
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 5bc0c570b7b6..5c30e9e3de19 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -790,6 +790,8 @@ static struct option stat_options[] = {
 		     "aggregate counts per physical processor core", AGGR_CORE),
 	OPT_SET_UINT(0, "per-thread", &stat_config.aggr_mode,
 		     "aggregate counts per thread", AGGR_THREAD),
+	OPT_SET_UINT(0, "per-numa", &stat_config.aggr_mode,
+		     "aggregate counts per numa node", AGGR_NUMA),
 	OPT_UINTEGER('D', "delay", &stat_config.initial_delay,
 		     "ms to wait before starting measurement after program start"),
 	OPT_CALLBACK_NOOPT(0, "metric-only", &stat_config.metric_only, NULL,
@@ -822,6 +824,12 @@ static int perf_stat__get_core(struct perf_stat_config *config __maybe_unused,
 	return cpu_map__get_core(map, cpu, NULL);
 }
 
+static int perf_stat__get_numa(struct perf_stat_config *config __maybe_unused,
+			       struct perf_cpu_map *map, int cpu)
+{
+	return cpu_map__get_numa(map, cpu, NULL);
+}
+
 static int perf_stat__get_aggr(struct perf_stat_config *config,
 			       aggr_get_id_t get_id, struct perf_cpu_map *map, int idx)
 {
@@ -856,6 +864,12 @@ static int perf_stat__get_core_cached(struct perf_stat_config *config,
 	return perf_stat__get_aggr(config, perf_stat__get_core, map, idx);
 }
 
+static int perf_stat__get_numa_cached(struct perf_stat_config *config,
+				      struct perf_cpu_map *map, int idx)
+{
+	return perf_stat__get_aggr(config, perf_stat__get_numa, map, idx);
+}
+
 static bool term_percore_set(void)
 {
 	struct evsel *counter;
@@ -894,6 +908,13 @@ static int perf_stat_init_aggr_mode(void)
 		}
 		stat_config.aggr_get_id = perf_stat__get_core_cached;
 		break;
+	case AGGR_NUMA:
+		if (cpu_map__build_numa_map(evsel_list->core.cpus, &stat_config.aggr_map)) {
+			perror("cannot build core map");
+			return -1;
+		}
+		stat_config.aggr_get_id = perf_stat__get_numa_cached;
+		break;
 	case AGGR_NONE:
 		if (term_percore_set()) {
 			if (cpu_map__build_core_map(evsel_list->core.cpus,
@@ -1006,6 +1027,13 @@ static int perf_env__get_core(struct perf_cpu_map *map, int idx, void *data)
 	return core;
 }
 
+static int perf_env__get_numa(struct perf_cpu_map *map, int idx, void *data)
+{
+	int cpu = perf_env__get_cpu(data, map, idx);
+
+	return perf_env__numa_node(data, cpu);
+}
+
 static int perf_env__build_socket_map(struct perf_env *env, struct perf_cpu_map *cpus,
 				      struct perf_cpu_map **sockp)
 {
@@ -1024,6 +1052,12 @@ static int perf_env__build_core_map(struct perf_env *env, struct perf_cpu_map *c
 	return cpu_map__build_map(cpus, corep, perf_env__get_core, env);
 }
 
+static int perf_env__build_numa_map(struct perf_env *env, struct perf_cpu_map *cpus,
+				    struct perf_cpu_map **numap)
+{
+	return cpu_map__build_map(cpus, numap, perf_env__get_numa, env);
+}
+
 static int perf_stat__get_socket_file(struct perf_stat_config *config __maybe_unused,
 				      struct perf_cpu_map *map, int idx)
 {
@@ -1041,6 +1075,12 @@ static int perf_stat__get_core_file(struct perf_stat_config *config __maybe_unus
 	return perf_env__get_core(map, idx, &perf_stat.session->header.env);
 }
 
+static int perf_stat__get_numa_file(struct perf_stat_config *config __maybe_unused,
+				    struct perf_cpu_map *map, int idx)
+{
+	return perf_env__get_numa(map, idx, &perf_stat.session->header.env);
+}
+
 static int perf_stat_init_aggr_mode_file(struct perf_stat *st)
 {
 	struct perf_env *env = &st->session->header.env;
@@ -1067,6 +1107,13 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st)
 		}
 		stat_config.aggr_get_id = perf_stat__get_core_file;
 		break;
+	case AGGR_NUMA:
+		if (perf_env__build_numa_map(env, evsel_list->core.cpus, &stat_config.aggr_map)) {
+			perror("cannot build core map");
+			return -1;
+		}
+		stat_config.aggr_get_id = perf_stat__get_numa_file;
+		break;
 	case AGGR_NONE:
 	case AGGR_GLOBAL:
 	case AGGR_THREAD:
@@ -1614,6 +1661,8 @@ static int __cmd_report(int argc, const char **argv)
 		     "aggregate counts per processor die", AGGR_DIE),
 	OPT_SET_UINT(0, "per-core", &perf_stat.aggr_mode,
 		     "aggregate counts per physical processor core", AGGR_CORE),
+	OPT_SET_UINT(0, "per-numa", &perf_stat.aggr_mode,
+		     "aggregate counts per numa node", AGGR_NUMA),
 	OPT_SET_UINT('A', "no-aggr", &perf_stat.aggr_mode,
 		     "disable CPU count aggregation", AGGR_NONE),
 	OPT_END()
@@ -1888,6 +1937,9 @@ int cmd_stat(int argc, const char **argv)
 		}
 	}
 
+	if (stat_config.aggr_mode == AGGR_NUMA)
+		cpu__setup_cpunode_map();
+
 	if (stat_config.times && interval)
 		interval_count = true;
 	else if (stat_config.times && !interval) {
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index a22c1114e880..dbca1ee069b8 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -206,6 +206,11 @@ int cpu_map__get_core_id(int cpu)
 	return ret ?: value;
 }
 
+int cpu_map__get_numa_id(int cpu)
+{
+	return cpu__get_node(cpu);
+}
+
 int cpu_map__get_core(struct perf_cpu_map *map, int idx, void *data)
 {
 	int cpu, s_die;
@@ -235,6 +240,14 @@ int cpu_map__get_core(struct perf_cpu_map *map, int idx, void *data)
 	return (s_die << 16) | (cpu & 0xffff);
 }
 
+int cpu_map__get_numa(struct perf_cpu_map *map, int idx, void *data __maybe_unused)
+{
+	if (idx < 0 || idx >= map->nr)
+		return -1;
+
+	return cpu_map__get_numa_id(map->map[idx]);
+}
+
 int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct perf_cpu_map **sockp)
 {
 	return cpu_map__build_map(cpus, sockp, cpu_map__get_socket, NULL);
@@ -250,6 +263,11 @@ int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct perf_cpu_map **cor
 	return cpu_map__build_map(cpus, corep, cpu_map__get_core, NULL);
 }
 
+int cpu_map__build_numa_map(struct perf_cpu_map *cpus, struct perf_cpu_map **numap)
+{
+	return cpu_map__build_map(cpus, numap, cpu_map__get_numa, NULL);
+}
+
 /* setup simple routines to easily access node numbers given a cpu number */
 static int get_max_num(char *path, int *max)
 {
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 2553bef1279d..6122fd6588d1 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -20,9 +20,12 @@ int cpu_map__get_die_id(int cpu);
 int cpu_map__get_die(struct perf_cpu_map *map, int idx, void *data);
 int cpu_map__get_core_id(int cpu);
 int cpu_map__get_core(struct perf_cpu_map *map, int idx, void *data);
+int cpu_map__get_numa_id(int cpu);
+int cpu_map__get_numa(struct perf_cpu_map *map, int idx, void *data);
 int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct perf_cpu_map **sockp);
 int cpu_map__build_die_map(struct perf_cpu_map *cpus, struct perf_cpu_map **diep);
 int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct perf_cpu_map **corep);
+int cpu_map__build_numa_map(struct perf_cpu_map *cpus, struct perf_cpu_map **numap);
 const struct perf_cpu_map *cpu_map__online(void); /* thread unsafe */
 
 static inline int cpu_map__socket(struct perf_cpu_map *sock, int s)
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index ed3b0ac2f785..adbd80f54fee 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -100,6 +100,15 @@ static void aggr_printout(struct perf_stat_config *config,
 			nr,
 			config->csv_sep);
 			break;
+	case AGGR_NUMA:
+		fprintf(config->output, "N%*d%s%*d%s",
+			config->csv_output ? 0 : -5,
+			id,
+			config->csv_sep,
+			config->csv_output ? 0 : 4,
+			nr,
+			config->csv_sep);
+			break;
 	case AGGR_NONE:
 		if (evsel->percore) {
 			fprintf(config->output, "S%d-D%d-C%*d%s",
@@ -965,6 +974,11 @@ static void print_interval(struct perf_stat_config *config,
 
 	if ((num_print_interval == 0 && !config->csv_output) || config->interval_clear) {
 		switch (config->aggr_mode) {
+		case AGGR_NUMA:
+			fprintf(output, "#           time numa   cpus");
+			if (!metric_only)
+				fprintf(output, "             counts %*s events\n", unit_width, "unit");
+			break;
 		case AGGR_SOCKET:
 			fprintf(output, "#           time socket cpus");
 			if (!metric_only)
@@ -1188,6 +1202,7 @@ perf_evlist__print_counters(struct evlist *evlist,
 	case AGGR_CORE:
 	case AGGR_DIE:
 	case AGGR_SOCKET:
+	case AGGR_NUMA:
 		print_aggr(config, evlist, prefix);
 		break;
 	case AGGR_THREAD:
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 8f1ea27f976f..cde91e0842b2 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -281,6 +281,7 @@ process_counter_values(struct perf_stat_config *config, struct evsel *evsel,
 	case AGGR_CORE:
 	case AGGR_DIE:
 	case AGGR_SOCKET:
+	case AGGR_NUMA:
 	case AGGR_NONE:
 		if (!evsel->snapshot)
 			perf_evsel__compute_deltas(evsel, cpu, thread, count);
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 14fe3e548229..388c90ca7855 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -46,6 +46,7 @@ enum aggr_mode {
 	AGGR_CORE,
 	AGGR_THREAD,
 	AGGR_UNSET,
+	AGGR_NUMA,
 };
 
 enum {
-- 
2.21.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/3] perf tools: Add perf_env__numa_node function
  2019-09-02 12:12 ` [PATCH 2/3] perf tools: Add perf_env__numa_node function Jiri Olsa
@ 2019-09-02 13:57   ` Arnaldo Carvalho de Melo
  2019-09-02 14:16     ` Jiri Olsa
  0 siblings, 1 reply; 12+ messages in thread
From: Arnaldo Carvalho de Melo @ 2019-09-02 13:57 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: lkml, Ingo Molnar, Namhyung Kim, Alexander Shishkin,
	Peter Zijlstra, Michael Petlan, Joe Mario, Kan Liang, Andi Kleen

Em Mon, Sep 02, 2019 at 02:12:54PM +0200, Jiri Olsa escreveu:
> To speed up cpu to node lookup, adding perf_env__numa_node
> function, that creates cpu array on the first lookup, that
> holds numa nodes for each stored cpu.
> 
> Link: http://lkml.kernel.org/n/tip-qqwxklhissf3yjyuaszh6480@git.kernel.org
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  tools/perf/util/env.c | 35 +++++++++++++++++++++++++++++++++++
>  tools/perf/util/env.h |  6 ++++++
>  2 files changed, 41 insertions(+)
> 
> diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
> index 3baca06786fb..6385961e45df 100644
> --- a/tools/perf/util/env.c
> +++ b/tools/perf/util/env.c
> @@ -179,6 +179,7 @@ void perf_env__exit(struct perf_env *env)
>  	zfree(&env->sibling_threads);
>  	zfree(&env->pmu_mappings);
>  	zfree(&env->cpu);
> +	zfree(&env->numa_map);
>  
>  	for (i = 0; i < env->nr_numa_nodes; i++)
>  		perf_cpu_map__put(env->numa_nodes[i].map);
> @@ -338,3 +339,37 @@ const char *perf_env__arch(struct perf_env *env)
>  
>  	return normalize_arch(arch_name);
>  }
> +
> +
> +int perf_env__numa_node(struct perf_env *env, int cpu)
> +{
> +	if (!env->nr_numa_map) {
> +		struct numa_node *nn;
> +		int i, nr = 0;
> +
> +		for (i = 0; i < env->nr_numa_nodes; i++) {
> +			nn = &env->numa_nodes[i];
> +			nr = max(nr, perf_cpu_map__max(nn->map));
> +		}
> +
> +		nr++;
> +		env->numa_map = zalloc(nr * sizeof(int));

Why do you use zalloc()...

> +		if (!env->numa_map)
> +			return -1;

Only to set all entries to -1 right after allocating it?

That zalloc() should be downgraded to a plain malloc(), right?

The setting to -1 is because we may have holes in the array, right? I
think this deserves a comment here as well.

> +		for (i = 0; i < nr; i++)
> +			env->numa_map[i] = -1;
> +
> +		env->nr_numa_map = nr;
> +
> +		for (i = 0; i < env->nr_numa_nodes; i++) {
> +			int tmp, j;
> +
> +			nn = &env->numa_nodes[i];
> +			perf_cpu_map__for_each_cpu(j, tmp, nn->map)
> +				env->numa_map[j] = i;
> +		}
> +	}
> +
> +	return cpu >= 0 && cpu < env->nr_numa_map ? env->numa_map[cpu] : -1;
> +}
> diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h
> index d8e083d42610..777008f8007a 100644
> --- a/tools/perf/util/env.h
> +++ b/tools/perf/util/env.h
> @@ -86,6 +86,10 @@ struct perf_env {
>  		struct rb_root		btfs;
>  		u32			btfs_cnt;
>  	} bpf_progs;
> +
> +	/* For fast cpu to numa node lookup via perf_env__numa_node */
> +	int			*numa_map;
> +	int			 nr_numa_map;
>  };
>  
>  enum perf_compress_type {
> @@ -118,4 +122,6 @@ struct bpf_prog_info_node *perf_env__find_bpf_prog_info(struct perf_env *env,
>  							__u32 prog_id);
>  void perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node);
>  struct btf_node *perf_env__find_btf(struct perf_env *env, __u32 btf_id);
> +
> +int perf_env__numa_node(struct perf_env *env, int cpu);
>  #endif /* __PERF_ENV_H */
> -- 
> 2.21.0

-- 

- Arnaldo

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/3] libperf: Add perf_cpu_map__max function
  2019-09-02 12:12 ` [PATCH 1/3] libperf: Add perf_cpu_map__max function Jiri Olsa
@ 2019-09-02 13:57   ` Arnaldo Carvalho de Melo
  2019-09-20 16:21   ` [tip: perf/urgent] libperf: Adopt perf_cpu_map__max() function tip-bot2 for Jiri Olsa
  1 sibling, 0 replies; 12+ messages in thread
From: Arnaldo Carvalho de Melo @ 2019-09-02 13:57 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: lkml, Ingo Molnar, Namhyung Kim, Alexander Shishkin,
	Peter Zijlstra, Michael Petlan, Joe Mario, Kan Liang, Andi Kleen

Em Mon, Sep 02, 2019 at 02:12:53PM +0200, Jiri Olsa escreveu:
> So it can be used from multiple places.

Applied.

- Arnaldo
 
> Link: http://lkml.kernel.org/n/tip-yp3h5rl9e8piybufq41zqnla@git.kernel.org
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  tools/perf/builtin-stat.c            | 14 +-------------
>  tools/perf/lib/cpumap.c              | 12 ++++++++++++
>  tools/perf/lib/include/perf/cpumap.h |  1 +
>  tools/perf/lib/libperf.map           |  1 +
>  4 files changed, 15 insertions(+), 13 deletions(-)
> 
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index 7e17bf9f700a..5bc0c570b7b6 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -822,18 +822,6 @@ static int perf_stat__get_core(struct perf_stat_config *config __maybe_unused,
>  	return cpu_map__get_core(map, cpu, NULL);
>  }
>  
> -static int cpu_map__get_max(struct perf_cpu_map *map)
> -{
> -	int i, max = -1;
> -
> -	for (i = 0; i < map->nr; i++) {
> -		if (map->map[i] > max)
> -			max = map->map[i];
> -	}
> -
> -	return max;
> -}
> -
>  static int perf_stat__get_aggr(struct perf_stat_config *config,
>  			       aggr_get_id_t get_id, struct perf_cpu_map *map, int idx)
>  {
> @@ -928,7 +916,7 @@ static int perf_stat_init_aggr_mode(void)
>  	 * taking the highest cpu number to be the size of
>  	 * the aggregation translate cpumap.
>  	 */
> -	nr = cpu_map__get_max(evsel_list->core.cpus);
> +	nr = perf_cpu_map__max(evsel_list->core.cpus);
>  	stat_config.cpus_aggr_map = perf_cpu_map__empty_new(nr + 1);
>  	return stat_config.cpus_aggr_map ? 0 : -ENOMEM;
>  }
> diff --git a/tools/perf/lib/cpumap.c b/tools/perf/lib/cpumap.c
> index 1f0e6f334237..2ca1fafa620d 100644
> --- a/tools/perf/lib/cpumap.c
> +++ b/tools/perf/lib/cpumap.c
> @@ -260,3 +260,15 @@ int perf_cpu_map__idx(struct perf_cpu_map *cpus, int cpu)
>  
>  	return -1;
>  }
> +
> +int perf_cpu_map__max(struct perf_cpu_map *map)
> +{
> +	int i, max = -1;
> +
> +	for (i = 0; i < map->nr; i++) {
> +		if (map->map[i] > max)
> +			max = map->map[i];
> +	}
> +
> +	return max;
> +}
> diff --git a/tools/perf/lib/include/perf/cpumap.h b/tools/perf/lib/include/perf/cpumap.h
> index 8aa995c59498..ac9aa497f84a 100644
> --- a/tools/perf/lib/include/perf/cpumap.h
> +++ b/tools/perf/lib/include/perf/cpumap.h
> @@ -16,6 +16,7 @@ LIBPERF_API void perf_cpu_map__put(struct perf_cpu_map *map);
>  LIBPERF_API int perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx);
>  LIBPERF_API int perf_cpu_map__nr(const struct perf_cpu_map *cpus);
>  LIBPERF_API bool perf_cpu_map__empty(const struct perf_cpu_map *map);
> +LIBPERF_API int perf_cpu_map__max(struct perf_cpu_map *map);
>  
>  #define perf_cpu_map__for_each_cpu(cpu, idx, cpus)		\
>  	for ((idx) = 0, (cpu) = perf_cpu_map__cpu(cpus, idx);	\
> diff --git a/tools/perf/lib/libperf.map b/tools/perf/lib/libperf.map
> index dc4d66363bc4..cd0d17b996c8 100644
> --- a/tools/perf/lib/libperf.map
> +++ b/tools/perf/lib/libperf.map
> @@ -9,6 +9,7 @@ LIBPERF_0.0.1 {
>  		perf_cpu_map__nr;
>  		perf_cpu_map__cpu;
>  		perf_cpu_map__empty;
> +		perf_cpu_map__max;
>  		perf_thread_map__new_dummy;
>  		perf_thread_map__set_pid;
>  		perf_thread_map__comm;
> -- 
> 2.21.0

-- 

- Arnaldo

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/3] perf tools: Add perf_env__numa_node function
  2019-09-02 13:57   ` Arnaldo Carvalho de Melo
@ 2019-09-02 14:16     ` Jiri Olsa
  0 siblings, 0 replies; 12+ messages in thread
From: Jiri Olsa @ 2019-09-02 14:16 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Jiri Olsa, lkml, Ingo Molnar, Namhyung Kim, Alexander Shishkin,
	Peter Zijlstra, Michael Petlan, Joe Mario, Kan Liang, Andi Kleen

On Mon, Sep 02, 2019 at 10:57:10AM -0300, Arnaldo Carvalho de Melo wrote:
> Em Mon, Sep 02, 2019 at 02:12:54PM +0200, Jiri Olsa escreveu:
> > To speed up cpu to node lookup, adding perf_env__numa_node
> > function, that creates cpu array on the first lookup, that
> > holds numa nodes for each stored cpu.
> > 
> > Link: http://lkml.kernel.org/n/tip-qqwxklhissf3yjyuaszh6480@git.kernel.org
> > Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> > ---
> >  tools/perf/util/env.c | 35 +++++++++++++++++++++++++++++++++++
> >  tools/perf/util/env.h |  6 ++++++
> >  2 files changed, 41 insertions(+)
> > 
> > diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
> > index 3baca06786fb..6385961e45df 100644
> > --- a/tools/perf/util/env.c
> > +++ b/tools/perf/util/env.c
> > @@ -179,6 +179,7 @@ void perf_env__exit(struct perf_env *env)
> >  	zfree(&env->sibling_threads);
> >  	zfree(&env->pmu_mappings);
> >  	zfree(&env->cpu);
> > +	zfree(&env->numa_map);
> >  
> >  	for (i = 0; i < env->nr_numa_nodes; i++)
> >  		perf_cpu_map__put(env->numa_nodes[i].map);
> > @@ -338,3 +339,37 @@ const char *perf_env__arch(struct perf_env *env)
> >  
> >  	return normalize_arch(arch_name);
> >  }
> > +
> > +
> > +int perf_env__numa_node(struct perf_env *env, int cpu)
> > +{
> > +	if (!env->nr_numa_map) {
> > +		struct numa_node *nn;
> > +		int i, nr = 0;
> > +
> > +		for (i = 0; i < env->nr_numa_nodes; i++) {
> > +			nn = &env->numa_nodes[i];
> > +			nr = max(nr, perf_cpu_map__max(nn->map));
> > +		}
> > +
> > +		nr++;
> > +		env->numa_map = zalloc(nr * sizeof(int));
> 
> Why do you use zalloc()...
> 
> > +		if (!env->numa_map)
> > +			return -1;
> 
> Only to right after allocating it set all entries to -1?
> 
> That zalloc() should be downgraded to a plain malloc(), right?
> 
> The setting to -1 is because we may have holes in the array, right? I
> think this deserves a comment here as well.

yea, I added that later on and missed the zalloc above ;-)

I'll send new version

thanks,
jirka

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/3] perf stat: Add --per-numa agregation support
  2019-09-02 12:12 ` [PATCH 3/3] perf stat: Add --per-numa agregation support Jiri Olsa
@ 2019-09-02 15:13   ` Alexey Budankov
  2019-09-02 15:43     ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 12+ messages in thread
From: Alexey Budankov @ 2019-09-02 15:13 UTC (permalink / raw)
  To: Jiri Olsa, Arnaldo Carvalho de Melo
  Cc: lkml, Ingo Molnar, Namhyung Kim, Alexander Shishkin,
	Peter Zijlstra, Michael Petlan, Joe Mario, Kan Liang, Andi Kleen


On 02.09.2019 15:12, Jiri Olsa wrote:
> Adding new --per-numa option to aggregate counts per NUMA
> nodes for system-wide mode measurements.
> 
> You can specify --per-numa in live mode:
> 
>   # perf stat  -a -I 1000 -e cycles --per-numa
>   #           time numa   cpus             counts unit events

It would probably be better to use 'node' instead of 'numa', both in the
option name ('--per-node') and in the table header, like this:

    #           time node     cpus             counts unit events
         1.000542550 0        20          6,202,097      cycles
         1.000542550 1        20            639,559      cycles
         2.002040063 0        20          7,412,495      cycles
         2.002040063 1        20          2,185,577      cycles
         3.003451699 0        20          6,508,917      cycles
         3.003451699 1        20            765,607      cycles
   ...

BR,
Alexey

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/3] perf stat: Add --per-numa agregation support
  2019-09-02 15:13   ` Alexey Budankov
@ 2019-09-02 15:43     ` Arnaldo Carvalho de Melo
  2019-09-02 17:46       ` Jiri Olsa
  0 siblings, 1 reply; 12+ messages in thread
From: Arnaldo Carvalho de Melo @ 2019-09-02 15:43 UTC (permalink / raw)
  To: Alexey Budankov
  Cc: Jiri Olsa, lkml, Ingo Molnar, Namhyung Kim, Alexander Shishkin,
	Peter Zijlstra, Michael Petlan, Joe Mario, Kan Liang, Andi Kleen

Em Mon, Sep 02, 2019 at 06:13:17PM +0300, Alexey Budankov escreveu:
> 
> On 02.09.2019 15:12, Jiri Olsa wrote:
> > Adding new --per-numa option to aggregate counts per NUMA
> > nodes for system-wide mode measurements.
> > 
> > You can specify --per-numa in live mode:
> > 
> >   # perf stat  -a -I 1000 -e cycles --per-numa
> >   #           time numa   cpus             counts unit events
> 
> It would probably be better to use 'node' instead of 'numa', both in the
> option name ('--per-node') and in the table header, like this:

Agreed

> 
>     #           time node     cpus             counts unit events
>          1.000542550 0        20          6,202,097      cycles
>          1.000542550 1        20            639,559      cycles
>          2.002040063 0        20          7,412,495      cycles
>          2.002040063 1        20          2,185,577      cycles
>          3.003451699 0        20          6,508,917      cycles
>          3.003451699 1        20            765,607      cycles
>    ...
> 
> BR,
> Alexey

-- 

- Arnaldo

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/3] perf stat: Add --per-numa agregation support
  2019-09-02 15:43     ` Arnaldo Carvalho de Melo
@ 2019-09-02 17:46       ` Jiri Olsa
  0 siblings, 0 replies; 12+ messages in thread
From: Jiri Olsa @ 2019-09-02 17:46 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Alexey Budankov, Jiri Olsa, lkml, Ingo Molnar, Namhyung Kim,
	Alexander Shishkin, Peter Zijlstra, Michael Petlan, Joe Mario,
	Kan Liang, Andi Kleen

On Mon, Sep 02, 2019 at 12:43:29PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Mon, Sep 02, 2019 at 06:13:17PM +0300, Alexey Budankov escreveu:
> > 
> > On 02.09.2019 15:12, Jiri Olsa wrote:
> > > Adding new --per-numa option to aggregate counts per NUMA
> > > nodes for system-wide mode measurements.
> > > 
> > > You can specify --per-numa in live mode:
> > > 
> > >   # perf stat  -a -I 1000 -e cycles --per-numa
> > >   #           time numa   cpus             counts unit events
> > 
> > It would probably be better to use 'node' instead of 'numa', both in the
> > option name ('--per-node') and in the table header, like this:
> 
> Agreed

ok, will change

jirka

> 
> > 
> >     #           time node     cpus             counts unit events
> >          1.000542550 0        20          6,202,097      cycles
> >          1.000542550 1        20            639,559      cycles
> >          2.002040063 0        20          7,412,495      cycles
> >          2.002040063 1        20          2,185,577      cycles
> >          3.003451699 0        20          6,508,917      cycles
> >          3.003451699 1        20            765,607      cycles
> >    ...
> > 
> > BR,
> > Alexey
> 
> -- 
> 
> - Arnaldo

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [tip: perf/urgent] libperf: Adopt perf_cpu_map__max() function
  2019-09-02 12:12 ` [PATCH 1/3] libperf: Add perf_cpu_map__max function Jiri Olsa
  2019-09-02 13:57   ` Arnaldo Carvalho de Melo
@ 2019-09-20 16:21   ` tip-bot2 for Jiri Olsa
  1 sibling, 0 replies; 12+ messages in thread
From: tip-bot2 for Jiri Olsa @ 2019-09-20 16:21 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Jiri Olsa, Alexander Shishkin, Andi Kleen, Joe Mario, Kan Liang,
	Michael Petlan, Namhyung Kim, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Ingo Molnar, Borislav Petkov,
	linux-kernel

The following commit has been merged into the perf/urgent branch of tip:

Commit-ID:     4256d434935e9c85a731823be562785494ca364b
Gitweb:        https://git.kernel.org/tip/4256d434935e9c85a731823be562785494ca364b
Author:        Jiri Olsa <jolsa@kernel.org>
AuthorDate:    Mon, 02 Sep 2019 14:12:53 +02:00
Committer:     Arnaldo Carvalho de Melo <acme@redhat.com>
CommitterDate: Tue, 10 Sep 2019 14:33:32 +01:00

libperf: Adopt perf_cpu_map__max() function

Moved from 'perf stat', so that it can be used from multiple places.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Joe Mario <jmario@redhat.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Michael Petlan <mpetlan@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lore.kernel.org/lkml/20190902121255.536-2-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-stat.c            | 14 +-------------
 tools/perf/lib/cpumap.c              | 12 ++++++++++++
 tools/perf/lib/include/perf/cpumap.h |  1 +
 tools/perf/lib/libperf.map           |  1 +
 4 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 7e17bf9..5bc0c57 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -822,18 +822,6 @@ static int perf_stat__get_core(struct perf_stat_config *config __maybe_unused,
 	return cpu_map__get_core(map, cpu, NULL);
 }
 
-static int cpu_map__get_max(struct perf_cpu_map *map)
-{
-	int i, max = -1;
-
-	for (i = 0; i < map->nr; i++) {
-		if (map->map[i] > max)
-			max = map->map[i];
-	}
-
-	return max;
-}
-
 static int perf_stat__get_aggr(struct perf_stat_config *config,
 			       aggr_get_id_t get_id, struct perf_cpu_map *map, int idx)
 {
@@ -928,7 +916,7 @@ static int perf_stat_init_aggr_mode(void)
 	 * taking the highest cpu number to be the size of
 	 * the aggregation translate cpumap.
 	 */
-	nr = cpu_map__get_max(evsel_list->core.cpus);
+	nr = perf_cpu_map__max(evsel_list->core.cpus);
 	stat_config.cpus_aggr_map = perf_cpu_map__empty_new(nr + 1);
 	return stat_config.cpus_aggr_map ? 0 : -ENOMEM;
 }
diff --git a/tools/perf/lib/cpumap.c b/tools/perf/lib/cpumap.c
index 1f0e6f3..2ca1faf 100644
--- a/tools/perf/lib/cpumap.c
+++ b/tools/perf/lib/cpumap.c
@@ -260,3 +260,15 @@ int perf_cpu_map__idx(struct perf_cpu_map *cpus, int cpu)
 
 	return -1;
 }
+
+int perf_cpu_map__max(struct perf_cpu_map *map)
+{
+	int i, max = -1;
+
+	for (i = 0; i < map->nr; i++) {
+		if (map->map[i] > max)
+			max = map->map[i];
+	}
+
+	return max;
+}
diff --git a/tools/perf/lib/include/perf/cpumap.h b/tools/perf/lib/include/perf/cpumap.h
index 8aa995c..ac9aa49 100644
--- a/tools/perf/lib/include/perf/cpumap.h
+++ b/tools/perf/lib/include/perf/cpumap.h
@@ -16,6 +16,7 @@ LIBPERF_API void perf_cpu_map__put(struct perf_cpu_map *map);
 LIBPERF_API int perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx);
 LIBPERF_API int perf_cpu_map__nr(const struct perf_cpu_map *cpus);
 LIBPERF_API bool perf_cpu_map__empty(const struct perf_cpu_map *map);
+LIBPERF_API int perf_cpu_map__max(struct perf_cpu_map *map);
 
 #define perf_cpu_map__for_each_cpu(cpu, idx, cpus)		\
 	for ((idx) = 0, (cpu) = perf_cpu_map__cpu(cpus, idx);	\
diff --git a/tools/perf/lib/libperf.map b/tools/perf/lib/libperf.map
index dc4d663..cd0d17b 100644
--- a/tools/perf/lib/libperf.map
+++ b/tools/perf/lib/libperf.map
@@ -9,6 +9,7 @@ LIBPERF_0.0.1 {
 		perf_cpu_map__nr;
 		perf_cpu_map__cpu;
 		perf_cpu_map__empty;
+		perf_cpu_map__max;
 		perf_thread_map__new_dummy;
 		perf_thread_map__set_pid;
 		perf_thread_map__comm;

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 2/3] perf tools: Add perf_env__numa_node function
  2019-09-04  7:34 [PATCHv2 0/3] perf stat: Add --per-node option Jiri Olsa
@ 2019-09-04  7:34 ` Jiri Olsa
  0 siblings, 0 replies; 12+ messages in thread
From: Jiri Olsa @ 2019-09-04  7:34 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: lkml, Ingo Molnar, Namhyung Kim, Alexander Shishkin,
	Peter Zijlstra, Michael Petlan, Joe Mario, Kan Liang, Andi Kleen,
	Alexey Budankov

To speed up cpu to node lookup, add the perf_env__numa_node()
function, which creates a cpu array on the first lookup that
holds the numa node for each stored cpu.

Link: http://lkml.kernel.org/n/tip-qqwxklhissf3yjyuaszh6480@git.kernel.org
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/perf/util/env.c | 40 ++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/env.h |  6 ++++++
 2 files changed, 46 insertions(+)

diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index 3baca06786fb..ee53e89a9535 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -179,6 +179,7 @@ void perf_env__exit(struct perf_env *env)
 	zfree(&env->sibling_threads);
 	zfree(&env->pmu_mappings);
 	zfree(&env->cpu);
+	zfree(&env->numa_map);
 
 	for (i = 0; i < env->nr_numa_nodes; i++)
 		perf_cpu_map__put(env->numa_nodes[i].map);
@@ -338,3 +339,42 @@ const char *perf_env__arch(struct perf_env *env)
 
 	return normalize_arch(arch_name);
 }
+
+
+int perf_env__numa_node(struct perf_env *env, int cpu)
+{
+	if (!env->nr_numa_map) {
+		struct numa_node *nn;
+		int i, nr = 0;
+
+		for (i = 0; i < env->nr_numa_nodes; i++) {
+			nn = &env->numa_nodes[i];
+			nr = max(nr, perf_cpu_map__max(nn->map));
+		}
+
+		nr++;
+
+		/*
+		 * We initialize every numa_map entry to -1, so that
+		 * cpus missing from all node maps report node -1.
+		 */
+		env->numa_map = malloc(nr * sizeof(int));
+		if (!env->numa_map)
+			return -1;
+
+		for (i = 0; i < nr; i++)
+			env->numa_map[i] = -1;
+
+		env->nr_numa_map = nr;
+
+		for (i = 0; i < env->nr_numa_nodes; i++) {
+			int tmp, j;
+
+			nn = &env->numa_nodes[i];
+			perf_cpu_map__for_each_cpu(j, tmp, nn->map)
+				env->numa_map[j] = i;
+		}
+	}
+
+	return cpu >= 0 && cpu < env->nr_numa_map ? env->numa_map[cpu] : -1;
+}
diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h
index d8e083d42610..777008f8007a 100644
--- a/tools/perf/util/env.h
+++ b/tools/perf/util/env.h
@@ -86,6 +86,10 @@ struct perf_env {
 		struct rb_root		btfs;
 		u32			btfs_cnt;
 	} bpf_progs;
+
+	/* For fast cpu to numa node lookup via perf_env__numa_node */
+	int			*numa_map;
+	int			 nr_numa_map;
 };
 
 enum perf_compress_type {
@@ -118,4 +122,6 @@ struct bpf_prog_info_node *perf_env__find_bpf_prog_info(struct perf_env *env,
 							__u32 prog_id);
 void perf_env__insert_btf(struct perf_env *env, struct btf_node *btf_node);
 struct btf_node *perf_env__find_btf(struct perf_env *env, __u32 btf_id);
+
+int perf_env__numa_node(struct perf_env *env, int cpu);
 #endif /* __PERF_ENV_H */
-- 
2.21.0


^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2019-09-20 16:21 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-09-02 12:12 [PATCH 0/3] perf stat: Add --per-numa option Jiri Olsa
2019-09-02 12:12 ` [PATCH 1/3] libperf: Add perf_cpu_map__max function Jiri Olsa
2019-09-02 13:57   ` Arnaldo Carvalho de Melo
2019-09-20 16:21   ` [tip: perf/urgent] libperf: Adopt perf_cpu_map__max() function tip-bot2 for Jiri Olsa
2019-09-02 12:12 ` [PATCH 2/3] perf tools: Add perf_env__numa_node function Jiri Olsa
2019-09-02 13:57   ` Arnaldo Carvalho de Melo
2019-09-02 14:16     ` Jiri Olsa
2019-09-02 12:12 ` [PATCH 3/3] perf stat: Add --per-numa agregation support Jiri Olsa
2019-09-02 15:13   ` Alexey Budankov
2019-09-02 15:43     ` Arnaldo Carvalho de Melo
2019-09-02 17:46       ` Jiri Olsa
2019-09-04  7:34 [PATCHv2 0/3] perf stat: Add --per-node option Jiri Olsa
2019-09-04  7:34 ` [PATCH 2/3] perf tools: Add perf_env__numa_node function Jiri Olsa

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).