* [PATCH v2 0/3] perf stat: add per-core count aggregation
@ 2013-02-14 12:57 Stephane Eranian
  2013-02-14 12:57 ` [PATCH v2 1/3] perf stat: refactor aggregation code Stephane Eranian
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Stephane Eranian @ 2013-02-14 12:57 UTC (permalink / raw)
  To: linux-kernel; +Cc: peterz, mingo, ak, acme, jolsa, namhyung.kim

This patch series contains improvements to the aggregation support
in perf stat.

First, the aggregation code is refactored and an aggr_mode enum
is defined. There is also an important bug fix for the existing
per-socket aggregation.

Second, the option --aggr-socket is renamed --per-socket.

Third, the patch adds a new --per-core option to perf stat.
It aggregates counts per physical core, which is useful on
systems with hyper-threading. The cores are presented per
socket: S0-C1 means socket 0, core 1. Note that the core number
is the physical core id, so the numbers may not always be
contiguous. All of this is based on topology information
available in sysfs.
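
For reference, the topology comes from the same sysfs files perf
already parses: physical_package_id and core_id. A minimal standalone
sketch of that lookup (illustration only, not part of this series;
assumes sysfs is mounted at /sys):

  #include <stdio.h>

  /* read one integer topology attribute of a logical cpu */
  static int read_topology(int cpu, const char *item)
  {
          char path[256];
          FILE *fp;
          int val = -1;

          snprintf(path, sizeof(path),
                   "/sys/devices/system/cpu/cpu%d/topology/%s", cpu, item);
          fp = fopen(path, "r");
          if (!fp)
                  return -1;
          if (fscanf(fp, "%d", &val) != 1)
                  val = -1;
          fclose(fp);
          return val;
  }

  int main(void)
  {
          int cpu = 0;

          printf("cpu%d: socket %d core %d\n", cpu,
                 read_topology(cpu, "physical_package_id"),
                 read_topology(cpu, "core_id"));
          return 0;
  }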

Per-core aggregation can be combined with interval printing:

 # perf stat -a --per-core -I 1000 -e cycles sleep 100
 #           time core         cpus             counts events
      1.000101160 S0-C0           2      6,051,254,899 cycles                   
      1.000101160 S0-C1           2      6,379,230,776 cycles                   
      1.000101160 S0-C2           2      6,480,268,471 cycles                   
      1.000101160 S0-C3           2      6,110,514,321 cycles                   
      2.000663750 S0-C0           2      6,572,533,016 cycles                   
      2.000663750 S0-C1           2      6,378,623,674 cycles                   
      2.000663750 S0-C2           2      6,264,127,589 cycles                   
      2.000663750 S0-C3           2      6,305,346,613 cycles                   

For instance, here on this SNB machine, we can see that the load
is evenly balanced across all 4 physical cores (HT is on).

In v2, we print events across all cores or sockets, and we renamed
--aggr-socket to --per-socket and --aggr-core to --per-core.

Signed-off-by: Stephane Eranian <eranian@google.com>

Stephane Eranian (3):
  perf stat: refactor aggregation code
  perf stat: rename --aggr-socket to --per-socket
  perf stat: add per-core aggregation

 tools/perf/Documentation/perf-stat.txt |   10 +-
 tools/perf/builtin-stat.c              |  237 ++++++++++++++++++++------------
 tools/perf/util/cpumap.c               |   86 ++++++++++--
 tools/perf/util/cpumap.h               |   12 ++
 4 files changed, 241 insertions(+), 104 deletions(-)

-- 
1.7.9.5



* [PATCH v2 1/3] perf stat: refactor aggregation code
  2013-02-14 12:57 [PATCH v2 0/3] perf stat: add per-core count aggregation Stephane Eranian
@ 2013-02-14 12:57 ` Stephane Eranian
  2013-03-07 21:38   ` Jiri Olsa
                     ` (2 more replies)
  2013-02-14 12:57 ` [PATCH v2 2/3] perf stat: rename --aggr-socket to --per-socket Stephane Eranian
                   ` (2 subsequent siblings)
  3 siblings, 3 replies; 11+ messages in thread
From: Stephane Eranian @ 2013-02-14 12:57 UTC (permalink / raw)
  To: linux-kernel; +Cc: peterz, mingo, ak, acme, jolsa, namhyung.kim

Refactor aggregation code by introducing
a single aggr_mode variable and an enum
for aggregation.

Also refactor the cpumap code having to do
with cpu to socket mappings, all in preparation
for extended modes, such as cpu -> core.

Also fix socket aggregation and ensure
that sockets are printed in increasing order.

Signed-off-by: Stephane Eranian <eranian@google.com>
---
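For illustration only (this is not perf code and the names are
simplified), the refactor boils down to a single pattern: a per-mode
cpu -> id getter (aggr_get_id) plus a map of the unique ids
(aggr_map), with the counts of all cpus sharing an id summed
together. A self-contained toy model of that dispatch:

  #include <stdio.h>

  /* pretend topology: two logical cpus per socket */
  static int get_socket(int cpu) { return cpu / 2; }

  int main(void)
  {
          long counts[4] = { 10, 20, 30, 40 };      /* per-cpu counts */
          int (*aggr_get_id)(int cpu) = get_socket; /* the mode picks the getter */
          int ids[] = { 0, 1 };                     /* aggr_map: unique, sorted ids */

          for (int s = 0; s < 2; s++) {
                  long sum = 0;
                  for (int cpu = 0; cpu < 4; cpu++)
                          if (aggr_get_id(cpu) == ids[s])
                                  sum += counts[cpu];
                  printf("S%d: %ld\n", ids[s], sum);
          }
          return 0;
  }
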
 tools/perf/builtin-stat.c |  208 ++++++++++++++++++++++++++-------------------
 tools/perf/util/cpumap.c  |   40 ++++++---
 2 files changed, 148 insertions(+), 100 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 9984876..a19f8d5 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -68,7 +68,7 @@
 static void print_stat(int argc, const char **argv);
 static void print_counter_aggr(struct perf_evsel *counter, char *prefix);
 static void print_counter(struct perf_evsel *counter, char *prefix);
-static void print_aggr_socket(char *prefix);
+static void print_aggr(char *prefix);
 
 static struct perf_evlist	*evsel_list;
 
@@ -76,11 +76,16 @@ static struct perf_target	target = {
 	.uid	= UINT_MAX,
 };
 
+enum aggr_mode {
+	AGGR_NONE,
+	AGGR_GLOBAL,
+	AGGR_SOCKET,
+};
+
 static int			run_count			=  1;
 static bool			no_inherit			= false;
 static bool			scale				=  true;
-static bool			no_aggr				= false;
-static bool			aggr_socket			= false;
+static enum aggr_mode		aggr_mode			= AGGR_GLOBAL;
 static pid_t			child_pid			= -1;
 static bool			null_run			=  false;
 static int			detailed_run			=  0;
@@ -95,7 +100,8 @@ static const char		*post_cmd			= NULL;
 static bool			sync_run			= false;
 static unsigned int		interval			= 0;
 static struct timespec		ref_time;
-static struct cpu_map		*sock_map;
+static struct cpu_map		*aggr_map;
+static int			(*aggr_get_id)(struct cpu_map *m, int cpu);
 
 static volatile int done = 0;
 
@@ -297,41 +303,51 @@ static void print_interval(void)
 	struct timespec ts, rs;
 	char prefix[64];
 
-	if (no_aggr) {
+	if (aggr_mode == AGGR_GLOBAL) {
 		list_for_each_entry(counter, &evsel_list->entries, node) {
 			ps = counter->priv;
 			memset(ps->res_stats, 0, sizeof(ps->res_stats));
-			read_counter(counter);
+			read_counter_aggr(counter);
 		}
-	} else {
+	} else	{
 		list_for_each_entry(counter, &evsel_list->entries, node) {
 			ps = counter->priv;
 			memset(ps->res_stats, 0, sizeof(ps->res_stats));
-			read_counter_aggr(counter);
+			read_counter(counter);
 		}
 	}
+
 	clock_gettime(CLOCK_MONOTONIC, &ts);
 	diff_timespec(&rs, &ts, &ref_time);
 	sprintf(prefix, "%6lu.%09lu%s", rs.tv_sec, rs.tv_nsec, csv_sep);
 
 	if (num_print_interval == 0 && !csv_output) {
-		if (aggr_socket)
+		switch (aggr_mode) {
+		case AGGR_SOCKET:
 			fprintf(output, "#           time socket cpus             counts events\n");
-		else if (no_aggr)
+			break;
+		case AGGR_NONE:
 			fprintf(output, "#           time CPU                 counts events\n");
-		else
+			break;
+		case AGGR_GLOBAL:
+		default:
 			fprintf(output, "#           time             counts events\n");
+		}
 	}
 
 	if (++num_print_interval == 25)
 		num_print_interval = 0;
 
-	if (aggr_socket)
-		print_aggr_socket(prefix);
-	else if (no_aggr) {
+	switch (aggr_mode) {
+	case AGGR_SOCKET:
+		print_aggr(prefix);
+		break;
+	case AGGR_NONE:
 		list_for_each_entry(counter, &evsel_list->entries, node)
 			print_counter(counter, prefix);
-	} else {
+		break;
+	case AGGR_GLOBAL:
+	default:
 		list_for_each_entry(counter, &evsel_list->entries, node)
 			print_counter_aggr(counter, prefix);
 	}
@@ -356,12 +372,6 @@ static int __run_perf_stat(int argc __maybe_unused, const char **argv)
 		ts.tv_nsec = 0;
 	}
 
-	if (aggr_socket
-	    && cpu_map__build_socket_map(evsel_list->cpus, &sock_map)) {
-		perror("cannot build socket map");
-		return -1;
-	}
-
 	if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
 		perror("failed to create pipes");
 		return -1;
@@ -479,17 +489,18 @@ static int __run_perf_stat(int argc __maybe_unused, const char **argv)
 
 	update_stats(&walltime_nsecs_stats, t1 - t0);
 
-	if (no_aggr) {
-		list_for_each_entry(counter, &evsel_list->entries, node) {
-			read_counter(counter);
-			perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 1);
-		}
-	} else {
+	if (aggr_mode == AGGR_GLOBAL) {
 		list_for_each_entry(counter, &evsel_list->entries, node) {
 			read_counter_aggr(counter);
 			perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter),
 					     evsel_list->threads->nr);
 		}
+	} else {
+		list_for_each_entry(counter, &evsel_list->entries, node) {
+			read_counter(counter);
+			perf_evsel__close_fd(counter,
+					     perf_evsel__nr_cpus(counter), 1);
+		}
 	}
 
 	return WEXITSTATUS(status);
@@ -542,26 +553,37 @@ static void print_noise(struct perf_evsel *evsel, double avg)
 	print_noise_pct(stddev_stats(&ps->res_stats[0]), avg);
 }
 
-static void nsec_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
+static void aggr_printout(int cpu, int nr)
 {
-	double msecs = avg / 1e6;
-	char cpustr[16] = { '\0', };
-	const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-25s";
-
-	if (aggr_socket)
-		sprintf(cpustr, "S%*d%s%*d%s",
+	switch (aggr_mode) {
+	case AGGR_SOCKET:
+		fprintf(output, "S%*d%s%*d%s",
 			csv_output ? 0 : -5,
 			cpu,
 			csv_sep,
 			csv_output ? 0 : 4,
 			nr,
 			csv_sep);
-	else if (no_aggr)
-		sprintf(cpustr, "CPU%*d%s",
+			break;
+	case AGGR_NONE:
+		fprintf(output, "CPU%*d%s",
 			csv_output ? 0 : -4,
 			perf_evsel__cpus(evsel)->map[cpu], csv_sep);
+		break;
+	case AGGR_GLOBAL:
+	default:
+		break;
+	}
+}
+
+static void nsec_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
+{
+	double msecs = avg / 1e6;
+	const char *fmt = csv_output ? "%.6f%s%s" : "%18.6f%s%-25s";
 
-	fprintf(output, fmt, cpustr, msecs, csv_sep, perf_evsel__name(evsel));
+	aggr_printout(cpu, nr);
+
+	fprintf(output, fmt, msecs, csv_sep, perf_evsel__name(evsel));
 
 	if (evsel->cgrp)
 		fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
@@ -758,32 +780,21 @@ static void print_ll_cache_misses(int cpu,
 static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
 {
 	double total, ratio = 0.0;
-	char cpustr[16] = { '\0', };
 	const char *fmt;
 
 	if (csv_output)
-		fmt = "%s%.0f%s%s";
+		fmt = "%.0f%s%s";
 	else if (big_num)
-		fmt = "%s%'18.0f%s%-25s";
+		fmt = "%'18.0f%s%-25s";
 	else
-		fmt = "%s%18.0f%s%-25s";
+		fmt = "%18.0f%s%-25s";
 
-	if (aggr_socket)
-		sprintf(cpustr, "S%*d%s%*d%s",
-			csv_output ? 0 : -5,
-			cpu,
-			csv_sep,
-			csv_output ? 0 : 4,
-			nr,
-			csv_sep);
-	else if (no_aggr)
-		sprintf(cpustr, "CPU%*d%s",
-			csv_output ? 0 : -4,
-			perf_evsel__cpus(evsel)->map[cpu], csv_sep);
-	else
+	aggr_printout(cpu, nr);
+
+	if (aggr_mode == AGGR_GLOBAL)
 		cpu = 0;
 
-	fprintf(output, fmt, cpustr, avg, csv_sep, perf_evsel__name(evsel));
+	fprintf(output, fmt, avg, csv_sep, perf_evsel__name(evsel));
 
 	if (evsel->cgrp)
 		fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
@@ -882,23 +893,23 @@ static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
 	}
 }
 
-static void print_aggr_socket(char *prefix)
+static void print_aggr(char *prefix)
 {
 	struct perf_evsel *counter;
+	int cpu, s, s2, id, nr;
 	u64 ena, run, val;
-	int cpu, s, s2, sock, nr;
 
-	if (!sock_map)
+	if (!(aggr_map || aggr_get_id))
 		return;
 
-	for (s = 0; s < sock_map->nr; s++) {
-		sock = cpu_map__socket(sock_map, s);
+	for (s = 0; s < aggr_map->nr; s++) {
+		id = aggr_map->map[s];
 		list_for_each_entry(counter, &evsel_list->entries, node) {
 			val = ena = run = 0;
 			nr = 0;
 			for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
-				s2 = cpu_map__get_socket(evsel_list->cpus, cpu);
-				if (s2 != sock)
+				s2 = aggr_get_id(evsel_list->cpus, cpu);
+				if (s2 != id)
 					continue;
 				val += counter->counts->cpu[cpu].val;
 				ena += counter->counts->cpu[cpu].ena;
@@ -909,18 +920,15 @@ static void print_aggr_socket(char *prefix)
 				fprintf(output, "%s", prefix);
 
 			if (run == 0 || ena == 0) {
-				fprintf(output, "S%*d%s%*d%s%*s%s%*s",
-					csv_output ? 0 : -5,
-					s,
-					csv_sep,
-					csv_output ? 0 : 4,
-					nr,
-					csv_sep,
+				aggr_printout(cpu, nr);
+
+				fprintf(output, "%*s%s%*s",
 					csv_output ? 0 : 18,
 					counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
 					csv_sep,
 					csv_output ? 0 : -24,
 					perf_evsel__name(counter));
+
 				if (counter->cgrp)
 					fprintf(output, "%s%s",
 						csv_sep, counter->cgrp->name);
@@ -930,9 +938,9 @@ static void print_aggr_socket(char *prefix)
 			}
 
 			if (nsec_counter(counter))
-				nsec_printout(sock, nr, counter, val);
+				nsec_printout(id, nr, counter, val);
 			else
-				abs_printout(sock, nr, counter, val);
+				abs_printout(id, nr, counter, val);
 
 			if (!csv_output) {
 				print_noise(counter, 1.0);
@@ -1073,14 +1081,20 @@ static void print_stat(int argc, const char **argv)
 		fprintf(output, ":\n\n");
 	}
 
-	if (aggr_socket)
-		print_aggr_socket(NULL);
-	else if (no_aggr) {
-		list_for_each_entry(counter, &evsel_list->entries, node)
-			print_counter(counter, NULL);
-	} else {
+	switch (aggr_mode) {
+	case AGGR_SOCKET:
+		print_aggr(NULL);
+		break;
+	case AGGR_GLOBAL:
 		list_for_each_entry(counter, &evsel_list->entries, node)
 			print_counter_aggr(counter, NULL);
+		break;
+	case AGGR_NONE:
+		list_for_each_entry(counter, &evsel_list->entries, node)
+			print_counter(counter, NULL);
+		break;
+	default:
+		break;
 	}
 
 	if (!csv_output) {
@@ -1126,6 +1140,25 @@ static int stat__set_big_num(const struct option *opt __maybe_unused,
 	return 0;
 }
 
+static int perf_stat_init_aggr_mode(void)
+{
+	switch (aggr_mode) {
+	case AGGR_SOCKET:
+		if (cpu_map__build_socket_map(evsel_list->cpus, &aggr_map)) {
+			perror("cannot build socket map");
+			return -1;
+		}
+		aggr_get_id = cpu_map__get_socket;
+		break;
+	case AGGR_NONE:
+	case AGGR_GLOBAL:
+	default:
+		break;
+	}
+	return 0;
+}
+
+
 /*
  * Add default attributes, if there were no attributes specified or
  * if -d/--detailed, -d -d or -d -d -d is used:
@@ -1308,7 +1341,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
 			   stat__set_big_num),
 	OPT_STRING('C', "cpu", &target.cpu_list, "cpu",
 		    "list of cpus to monitor in system-wide"),
-	OPT_BOOLEAN('A', "no-aggr", &no_aggr, "disable CPU count aggregation"),
+	OPT_SET_UINT('A', "no-aggr", &aggr_mode,
+		    "disable CPU count aggregation", AGGR_NONE),
 	OPT_STRING('x', "field-separator", &csv_sep, "separator",
 		   "print counts with custom separator"),
 	OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
@@ -1323,7 +1357,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
 			"command to run after to the measured command"),
 	OPT_UINTEGER('I', "interval-print", &interval,
 		    "print counts at regular interval in ms (>= 100)"),
-	OPT_BOOLEAN(0, "aggr-socket", &aggr_socket, "aggregate counts per processor socket"),
+	OPT_SET_UINT(0, "aggr-socket", &aggr_mode,
+		     "aggregate counts per processor socket", AGGR_SOCKET),
 	OPT_END()
 	};
 	const char * const stat_usage[] = {
@@ -1403,19 +1438,13 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
 		usage_with_options(stat_usage, options);
 
 	/* no_aggr, cgroup are for system-wide only */
-	if ((no_aggr || nr_cgroups) && !perf_target__has_cpu(&target)) {
+	if ((aggr_mode != AGGR_GLOBAL || nr_cgroups)
+	     && !perf_target__has_cpu(&target)) {
 		fprintf(stderr, "both cgroup and no-aggregation "
 			"modes only available in system-wide mode\n");
 
 		usage_with_options(stat_usage, options);
-	}
-
-	if (aggr_socket) {
-		if (!perf_target__has_cpu(&target)) {
-			fprintf(stderr, "--aggr-socket only available in system-wide mode (-a)\n");
-			usage_with_options(stat_usage, options);
-		}
-		no_aggr = true;
+		return -1;
 	}
 
 	if (add_default_attributes())
@@ -1450,6 +1479,9 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
 		}
 	}
 
+	if (perf_stat_init_aggr_mode())
+		goto out;
+
 	/*
 	 * We dont want to block the signals - that would cause
 	 * child tasks to inherit that and Ctrl-C would not work.
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index f817046..7bb8e87 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -4,6 +4,7 @@
 #include "cpumap.h"
 #include <assert.h>
 #include <stdio.h>
+#include <stdlib.h>
 
 static struct cpu_map *cpu_map__default_new(void)
 {
@@ -219,7 +220,7 @@ int cpu_map__get_socket(struct cpu_map *map, int idx)
 	if (!mnt)
 		return -1;
 
-	sprintf(path,
+	snprintf(path, PATH_MAX,
 		"%s/devices/system/cpu/cpu%d/topology/physical_package_id",
 		mnt, cpu);
 
@@ -231,27 +232,42 @@ int cpu_map__get_socket(struct cpu_map *map, int idx)
 	return ret == 1 ? cpu : -1;
 }
 
-int cpu_map__build_socket_map(struct cpu_map *cpus, struct cpu_map **sockp)
+static int cmp_ids(const void *a, const void *b)
 {
-	struct cpu_map *sock;
+	return *(int *)a - *(int *)b;
+}
+
+static int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
+			      int (*f)(struct cpu_map *map, int cpu))
+{
+	struct cpu_map *c;
 	int nr = cpus->nr;
 	int cpu, s1, s2;
 
-	sock = calloc(1, sizeof(*sock) + nr * sizeof(int));
-	if (!sock)
+	/* allocate as much as possible */
+	c = calloc(1, sizeof(*c) + nr * sizeof(int));
+	if (!c)
 		return -1;
 
 	for (cpu = 0; cpu < nr; cpu++) {
-		s1 = cpu_map__get_socket(cpus, cpu);
-		for (s2 = 0; s2 < sock->nr; s2++) {
-			if (s1 == sock->map[s2])
+		s1 = f(cpus, cpu);
+		for (s2 = 0; s2 < c->nr; s2++) {
+			if (s1 == c->map[s2])
 				break;
 		}
-		if (s2 == sock->nr) {
-			sock->map[sock->nr] = s1;
-			sock->nr++;
+		if (s2 == c->nr) {
+			c->map[c->nr] = s1;
+			c->nr++;
 		}
 	}
-	*sockp = sock;
+	/* ensure we process id in increasing order */
+	qsort(c->map, c->nr, sizeof(int), cmp_ids);
+
+	*res = c;
 	return 0;
 }
+
+int cpu_map__build_socket_map(struct cpu_map *cpus, struct cpu_map **sockp)
+{
+	return cpu_map__build_map(cpus, sockp, cpu_map__get_socket);
+}
-- 
1.7.9.5



* [PATCH v2 2/3] perf stat: rename --aggr-socket to --per-socket
  2013-02-14 12:57 [PATCH v2 0/3] perf stat: add per-core count aggregation Stephane Eranian
  2013-02-14 12:57 ` [PATCH v2 1/3] perf stat: refactor aggregation code Stephane Eranian
@ 2013-02-14 12:57 ` Stephane Eranian
  2013-04-02  9:34   ` [tip:perf/core] perf stat: Rename " tip-bot for Stephane Eranian
  2013-02-14 12:57 ` [PATCH v2 3/3] perf stat: add per-core aggregation Stephane Eranian
  2013-03-07 16:22 ` [PATCH v2 0/3] perf stat: add per-core count aggregation Stephane Eranian
  3 siblings, 1 reply; 11+ messages in thread
From: Stephane Eranian @ 2013-02-14 12:57 UTC (permalink / raw)
  To: linux-kernel; +Cc: peterz, mingo, ak, acme, jolsa, namhyung.kim

To make it more obvious what this option does, as
suggested by Andi on LKML.

Signed-off-by: Stephane Eranian <eranian@google.com>
---
 tools/perf/Documentation/perf-stat.txt |    4 ++--
 tools/perf/builtin-stat.c              |    2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index faf4f4f..01117c5 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -119,10 +119,10 @@ perf stat --repeat 10 --null --sync --pre 'make -s O=defconfig-build/clean' -- m
 	Print count deltas every N milliseconds (minimum: 100ms)
 	example: perf stat -I 1000 -e cycles -a sleep 5
 
---aggr-socket::
+--per-socket::
 Aggregate counts per processor socket for system-wide mode measurements.  This
 is a useful mode to detect imbalance between sockets.  To enable this mode,
-use --aggr-socket in addition to -a. (system-wide).  The output includes the
+use --per-socket in addition to -a. (system-wide).  The output includes the
 socket number and the number of online processors on that socket. This is
 useful to gauge the amount of aggregation.
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a19f8d5..508d9b4 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1357,7 +1357,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
 			"command to run after to the measured command"),
 	OPT_UINTEGER('I', "interval-print", &interval,
 		    "print counts at regular interval in ms (>= 100)"),
-	OPT_SET_UINT(0, "aggr-socket", &aggr_mode,
+	OPT_SET_UINT(0, "per-socket", &aggr_mode,
 		     "aggregate counts per processor socket", AGGR_SOCKET),
 	OPT_END()
 	};
-- 
1.7.9.5



* [PATCH v2 3/3] perf stat: add per-core aggregation
  2013-02-14 12:57 [PATCH v2 0/3] perf stat: add per-core count aggregation Stephane Eranian
  2013-02-14 12:57 ` [PATCH v2 1/3] perf stat: refactor aggregation code Stephane Eranian
  2013-02-14 12:57 ` [PATCH v2 2/3] perf stat: rename --aggr-socket to --per-socket Stephane Eranian
@ 2013-02-14 12:57 ` Stephane Eranian
  2013-04-02  9:36   ` [tip:perf/core] perf stat: Add " tip-bot for Stephane Eranian
  2013-03-07 16:22 ` [PATCH v2 0/3] perf stat: add per-core count aggregation Stephane Eranian
  3 siblings, 1 reply; 11+ messages in thread
From: Stephane Eranian @ 2013-02-14 12:57 UTC (permalink / raw)
  To: linux-kernel
  Cc: peterz, mingo, ak, acme, jolsa, namhyung.kim, Stephane Eranian

From: Stephane Eranian <eranian@gmail.com>

This patch adds the --per-core option to perf stat.

This option is used to aggregate system-wide counts
on a per physical core basis. On processors with
hyperthreading, this means counts of all HT threads
running on a physical core are aggregated.

This mode is useful to find imbalance between physical
cores running a uniform workload. Cores are identified
by socket: S0-C1 means physical core 1 on socket 0. Note
that cores are identified using their physical core id,
thus their numbering may not be contiguous.

Per core aggregation can be combined with interval printing:

 # perf stat -a --per-core -I 1000 -e cycles sleep 1000
 #           time core         cpus             counts events
      1.000090030 S0-C0           1          4,765,747 cycles
      1.000090030 S0-C1           1          5,580,647 cycles
      1.000090030 S0-C2           1            221,181 cycles
      1.000090030 S0-C3           1            266,092 cycles

Signed-off-by: Stephane Eranian <eranian@google.com>
---
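For reference (illustration only, simplified names; the real helpers
are cpu_map__get_core(), cpu_map__id_to_socket() and
cpu_map__id_to_cpu() in the diff below), the per-core id packs the
socket into the upper 16 bits, so core ids, which are only unique
within a socket, become globally unique. A standalone sketch:

  #include <assert.h>
  #include <stdio.h>

  static int encode(int socket, int core) { return (socket << 16) | (core & 0xffff); }
  static int id_to_socket(int id) { return id >> 16; }
  static int id_to_core(int id)   { return id & 0xffff; }

  int main(void)
  {
          int id = encode(1, 3);                /* socket 1, core 3 -> 0x10003 */

          assert(id_to_socket(id) == 1);
          assert(id_to_core(id) == 3);
          printf("S%d-C%d\n", id_to_socket(id), id_to_core(id)); /* S1-C3 */
          return 0;
  }
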
 tools/perf/Documentation/perf-stat.txt |    6 +++++
 tools/perf/builtin-stat.c              |   37 ++++++++++++++++++++-----
 tools/perf/util/cpumap.c               |   46 ++++++++++++++++++++++++++++++++
 tools/perf/util/cpumap.h               |   12 +++++++++
 4 files changed, 95 insertions(+), 6 deletions(-)

diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 01117c5..8059d43 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -126,6 +126,12 @@ use --per-socket in addition to -a. (system-wide).  The output includes the
 socket number and the number of online processors on that socket. This is
 useful to gauge the amount of aggregation.
 
+--per-core::
+Aggregate counts per physical processor for system-wide mode measurements.  This
+is a useful mode to detect imbalance between physical cores.  To enable this mode,
+use --per-core in addition to -a. (system-wide).  The output includes the
+core number and the number of online logical processors on that physical processor.
+
 EXAMPLES
 --------
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 508d9b4..578f711 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -80,6 +80,7 @@ enum aggr_mode {
 	AGGR_NONE,
 	AGGR_GLOBAL,
 	AGGR_SOCKET,
+	AGGR_CORE,
 };
 
 static int			run_count			=  1;
@@ -326,6 +327,9 @@ static void print_interval(void)
 		case AGGR_SOCKET:
 			fprintf(output, "#           time socket cpus             counts events\n");
 			break;
+		case AGGR_CORE:
+			fprintf(output, "#           time core         cpus             counts events\n");
+			break;
 		case AGGR_NONE:
 			fprintf(output, "#           time CPU                 counts events\n");
 			break;
@@ -339,6 +343,7 @@ static void print_interval(void)
 		num_print_interval = 0;
 
 	switch (aggr_mode) {
+	case AGGR_CORE:
 	case AGGR_SOCKET:
 		print_aggr(prefix);
 		break;
@@ -553,13 +558,23 @@ static void print_noise(struct perf_evsel *evsel, double avg)
 	print_noise_pct(stddev_stats(&ps->res_stats[0]), avg);
 }
 
-static void aggr_printout(int cpu, int nr)
+static void aggr_printout(struct perf_evsel *evsel, int id, int nr)
 {
 	switch (aggr_mode) {
+	case AGGR_CORE:
+		fprintf(output, "S%d-C%*d%s%*d%s",
+			cpu_map__id_to_socket(id),
+			csv_output ? 0 : -8,
+			cpu_map__id_to_cpu(id),
+			csv_sep,
+			csv_output ? 0 : 4,
+			nr,
+			csv_sep);
+		break;
 	case AGGR_SOCKET:
 		fprintf(output, "S%*d%s%*d%s",
 			csv_output ? 0 : -5,
-			cpu,
+			id,
 			csv_sep,
 			csv_output ? 0 : 4,
 			nr,
@@ -568,7 +583,7 @@ static void aggr_printout(int cpu, int nr)
 	case AGGR_NONE:
 		fprintf(output, "CPU%*d%s",
 			csv_output ? 0 : -4,
-			perf_evsel__cpus(evsel)->map[cpu], csv_sep);
+			perf_evsel__cpus(evsel)->map[id], csv_sep);
 		break;
 	case AGGR_GLOBAL:
 	default:
@@ -581,7 +596,7 @@ static void nsec_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
 	double msecs = avg / 1e6;
 	const char *fmt = csv_output ? "%.6f%s%s" : "%18.6f%s%-25s";
 
-	aggr_printout(cpu, nr);
+	aggr_printout(evsel, cpu, nr);
 
 	fprintf(output, fmt, msecs, csv_sep, perf_evsel__name(evsel));
 
@@ -789,7 +804,7 @@ static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
 	else
 		fmt = "%18.0f%s%-25s";
 
-	aggr_printout(cpu, nr);
+	aggr_printout(evsel, cpu, nr);
 
 	if (aggr_mode == AGGR_GLOBAL)
 		cpu = 0;
@@ -920,7 +935,7 @@ static void print_aggr(char *prefix)
 				fprintf(output, "%s", prefix);
 
 			if (run == 0 || ena == 0) {
-				aggr_printout(cpu, nr);
+				aggr_printout(counter, cpu, nr);
 
 				fprintf(output, "%*s%s%*s",
 					csv_output ? 0 : 18,
@@ -1082,6 +1097,7 @@ static void print_stat(int argc, const char **argv)
 	}
 
 	switch (aggr_mode) {
+	case AGGR_CORE:
 	case AGGR_SOCKET:
 		print_aggr(NULL);
 		break;
@@ -1150,6 +1166,13 @@ static int perf_stat_init_aggr_mode(void)
 		}
 		aggr_get_id = cpu_map__get_socket;
 		break;
+	case AGGR_CORE:
+		if (cpu_map__build_core_map(evsel_list->cpus, &aggr_map)) {
+			perror("cannot build core map");
+			return -1;
+		}
+		aggr_get_id = cpu_map__get_core;
+		break;
 	case AGGR_NONE:
 	case AGGR_GLOBAL:
 	default:
@@ -1359,6 +1382,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
 		    "print counts at regular interval in ms (>= 100)"),
 	OPT_SET_UINT(0, "per-socket", &aggr_mode,
 		     "aggregate counts per processor socket", AGGR_SOCKET),
+	OPT_SET_UINT(0, "per-core", &aggr_mode,
+		     "aggregate counts per physical processor core", AGGR_CORE),
 	OPT_END()
 	};
 	const char * const stat_usage[] = {
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 7bb8e87..beb8cf9 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -267,7 +267,53 @@ static int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
 	return 0;
 }
 
+int cpu_map__get_core(struct cpu_map *map, int idx)
+{
+	FILE *fp;
+	const char *mnt;
+	char path[PATH_MAX];
+	int cpu, ret, s;
+
+	if (idx > map->nr)
+		return -1;
+
+	cpu = map->map[idx];
+
+	mnt = sysfs_find_mountpoint();
+	if (!mnt)
+		return -1;
+
+	snprintf(path, PATH_MAX,
+		"%s/devices/system/cpu/cpu%d/topology/core_id",
+		mnt, cpu);
+
+	fp = fopen(path, "r");
+	if (!fp)
+		return -1;
+	ret = fscanf(fp, "%d", &cpu);
+	fclose(fp);
+	if (ret != 1)
+		return -1;
+
+	s = cpu_map__get_socket(map, idx);
+	if (s == -1)
+		return -1;
+
+	/*
+	 * encode socket in upper 16 bits
+	 * core_id is relative to socket, and
+	 * we need a global id. So we combine
+	 * socket+ core id
+	 */
+	return (s << 16) | (cpu & 0xffff);
+}
+
 int cpu_map__build_socket_map(struct cpu_map *cpus, struct cpu_map **sockp)
 {
 	return cpu_map__build_map(cpus, sockp, cpu_map__get_socket);
 }
+
+int cpu_map__build_core_map(struct cpu_map *cpus, struct cpu_map **corep)
+{
+	return cpu_map__build_map(cpus, corep, cpu_map__get_core);
+}
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 161b007..9bed02e 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -15,7 +15,9 @@ void cpu_map__delete(struct cpu_map *map);
 struct cpu_map *cpu_map__read(FILE *file);
 size_t cpu_map__fprintf(struct cpu_map *map, FILE *fp);
 int cpu_map__get_socket(struct cpu_map *map, int idx);
+int cpu_map__get_core(struct cpu_map *map, int idx);
 int cpu_map__build_socket_map(struct cpu_map *cpus, struct cpu_map **sockp);
+int cpu_map__build_core_map(struct cpu_map *cpus, struct cpu_map **corep);
 
 static inline int cpu_map__socket(struct cpu_map *sock, int s)
 {
@@ -24,6 +26,16 @@ static inline int cpu_map__socket(struct cpu_map *sock, int s)
 	return sock->map[s];
 }
 
+static inline int cpu_map__id_to_socket(int id)
+{
+	return id >> 16;
+}
+
+static inline int cpu_map__id_to_cpu(int id)
+{
+	return id & 0xffff;
+}
+
 static inline int cpu_map__nr(const struct cpu_map *map)
 {
 	return map ? map->nr : 1;
-- 
1.7.9.5



* Re: [PATCH v2 0/3] perf stat: add per-core count aggregation
  2013-02-14 12:57 [PATCH v2 0/3] perf stat: add per-core count aggregation Stephane Eranian
                   ` (2 preceding siblings ...)
  2013-02-14 12:57 ` [PATCH v2 3/3] perf stat: add per-core aggregation Stephane Eranian
@ 2013-03-07 16:22 ` Stephane Eranian
  2013-03-25 13:57   ` Stephane Eranian
  3 siblings, 1 reply; 11+ messages in thread
From: Stephane Eranian @ 2013-03-07 16:22 UTC (permalink / raw)
  To: LKML
  Cc: Peter Zijlstra, mingo, ak, Arnaldo Carvalho de Melo, Jiri Olsa,
	Namhyung Kim

Arnaldo,

Any comments on this series?


On Thu, Feb 14, 2013 at 1:57 PM, Stephane Eranian <eranian@google.com> wrote:
> This patch series contains improvements to the aggregation support
> in perf stat.
>
> First, the aggregation code is refactored and an aggr_mode enum
> is defined. There is also an important bug fix for the existing
> per-socket aggregation.
>
> Second, the option --aggr-socket is renamed --per-socket.
>
> Third, the patch adds a new --per-core option to perf stat.
> It aggregates counts per physical core, which is useful on
> systems with hyper-threading. The cores are presented per
> socket: S0-C1 means socket 0, core 1. Note that the core number
> is the physical core id, so the numbers may not always be
> contiguous. All of this is based on topology information
> available in sysfs.
>
> Per-core aggregation can be combined with interval printing:
>
>  # perf stat -a --per-core -I 1000 -e cycles sleep 100
>  #           time core         cpus             counts events
>       1.000101160 S0-C0           2      6,051,254,899 cycles
>       1.000101160 S0-C1           2      6,379,230,776 cycles
>       1.000101160 S0-C2           2      6,480,268,471 cycles
>       1.000101160 S0-C3           2      6,110,514,321 cycles
>       2.000663750 S0-C0           2      6,572,533,016 cycles
>       2.000663750 S0-C1           2      6,378,623,674 cycles
>       2.000663750 S0-C2           2      6,264,127,589 cycles
>       2.000663750 S0-C3           2      6,305,346,613 cycles
>
> For instance, here on this SNB machine, we can see that the load
> is evenly balanced across all 4 physical cores (HT is on).
>
> In v2, we print events across all cores or sockets, and we renamed
> --aggr-socket to --per-socket and --aggr-core to --per-core.
>
> Signed-off-by: Stephane Eranian <eranian@google.com>
>
> Stephane Eranian (3):
>   perf stat: refactor aggregation code
>   perf stat: rename --aggr-socket to --per-socket
>   perf stat: add per-core aggregation
>
>  tools/perf/Documentation/perf-stat.txt |   10 +-
>  tools/perf/builtin-stat.c              |  237 ++++++++++++++++++++------------
>  tools/perf/util/cpumap.c               |   86 ++++++++++--
>  tools/perf/util/cpumap.h               |   12 ++
>  4 files changed, 241 insertions(+), 104 deletions(-)
>
> --
> 1.7.9.5
>


* Re: [PATCH v2 1/3] perf stat: refactor aggregation code
  2013-02-14 12:57 ` [PATCH v2 1/3] perf stat: refactor aggregation code Stephane Eranian
@ 2013-03-07 21:38   ` Jiri Olsa
  2013-03-25 16:22   ` Arnaldo Carvalho de Melo
  2013-04-02  9:33   ` [tip:perf/core] perf stat: Refactor " tip-bot for Stephane Eranian
  2 siblings, 0 replies; 11+ messages in thread
From: Jiri Olsa @ 2013-03-07 21:38 UTC (permalink / raw)
  To: Stephane Eranian; +Cc: linux-kernel, peterz, mingo, ak, acme, namhyung.kim

On Thu, Feb 14, 2013 at 01:57:27PM +0100, Stephane Eranian wrote:

SNIP

> -		if (aggr_socket)
> +		switch (aggr_mode) {
> +		case AGGR_SOCKET:
>  			fprintf(output, "#           time socket cpus             counts events\n");
> -		else if (no_aggr)
> +			break;
> +		case AGGR_NONE:
>  			fprintf(output, "#           time CPU                 counts events\n");
> -		else
> +			break;
> +		case AGGR_GLOBAL:
> +		default:
>  			fprintf(output, "#           time             counts events\n");
> +		}
>  	}
>  
>  	if (++num_print_interval == 25)
>  		num_print_interval = 0;
>  
> -	if (aggr_socket)
> -		print_aggr_socket(prefix);
> -	else if (no_aggr) {



this:

---
> +	switch (aggr_mode) {
> +	case AGGR_SOCKET:
> +		print_aggr(prefix);
> +		break;
> +	case AGGR_NONE:
>  		list_for_each_entry(counter, &evsel_list->entries, node)
>  			print_counter(counter, prefix);
> -	} else {
> +		break;
> +	case AGGR_GLOBAL:
> +	default:
>  		list_for_each_entry(counter, &evsel_list->entries, node)
>  			print_counter_aggr(counter, prefix);
>  	}
---



> @@ -356,12 +372,6 @@ static int __run_perf_stat(int argc __maybe_unused, const char **argv)
>  		ts.tv_nsec = 0;
>  	}

SNIP

> -	if (aggr_socket)
> -		print_aggr_socket(NULL);
> -	else if (no_aggr) {
> -		list_for_each_entry(counter, &evsel_list->entries, node)
> -			print_counter(counter, NULL);
> -	} else {


and this:
---
> +	switch (aggr_mode) {
> +	case AGGR_SOCKET:
> +		print_aggr(NULL);
> +		break;
> +	case AGGR_GLOBAL:
>  		list_for_each_entry(counter, &evsel_list->entries, node)
>  			print_counter_aggr(counter, NULL);
> +		break;
> +	case AGGR_NONE:
> +		list_for_each_entry(counter, &evsel_list->entries, node)
> +			print_counter(counter, NULL);
> +		break;
> +	default:
> +		break;
>  	}
---

could be in a single function with 'prefix' arg
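
e.g. something along these lines (untested sketch, function name made
up, reusing the helpers already present in the patch):

  static void print_counters(char *prefix)
  {
          struct perf_evsel *counter;

          switch (aggr_mode) {
          case AGGR_SOCKET:
                  print_aggr(prefix);
                  break;
          case AGGR_NONE:
                  list_for_each_entry(counter, &evsel_list->entries, node)
                          print_counter(counter, prefix);
                  break;
          case AGGR_GLOBAL:
          default:
                  list_for_each_entry(counter, &evsel_list->entries, node)
                          print_counter_aggr(counter, prefix);
          }
  }

with print_interval() passing the computed prefix and print_stat()
passing NULL.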

Also, in non-interval mode no column headers are printed and the
output is not very user friendly; I think we could print the
header the same way as for the interval case.

[jolsa@krava perf]$ sudo ./perf stat -a --per-socket -e cycles /bin/true 

 Performance counter stats for '/bin/true':

S0        4          1,472,345 cycles                    #    0.000 GHz                    

^^^^^^^^^^^, versus:

[jolsa@krava perf]$ sudo ./perf stat -a --per-socket -I 100 -e cycles
sleep 10
#           time socket cpus             counts events
     0.100283713 S0        4         12,975,188 cycles                   
     0.200881622 S0        4         15,982,354 cycles                   

thanks,
jirka


* Re: [PATCH v2 0/3] perf stat: add per-core count aggregation
  2013-03-07 16:22 ` [PATCH v2 0/3] perf stat: add per-core count aggregation Stephane Eranian
@ 2013-03-25 13:57   ` Stephane Eranian
  0 siblings, 0 replies; 11+ messages in thread
From: Stephane Eranian @ 2013-03-25 13:57 UTC (permalink / raw)
  To: LKML
  Cc: Peter Zijlstra, mingo, ak, Arnaldo Carvalho de Melo, Jiri Olsa,
	Namhyung Kim

Arnaldo,

Where are we with this one?


On Thu, Mar 7, 2013 at 5:22 PM, Stephane Eranian <eranian@google.com> wrote:
> Arnaldo,
>
> Any comments on this series?
>
>
> On Thu, Feb 14, 2013 at 1:57 PM, Stephane Eranian <eranian@google.com> wrote:
>> This patch series contains improvements to the aggregation support
>> in perf stat.
>>
>> First, the aggregation code is refactored and an aggr_mode enum
>> is defined. There is also an important bug fix for the existing
>> per-socket aggregation.
>>
>> Second, the option --aggr-socket is renamed --per-socket.
>>
>> Third, the patch adds a new --per-core option to perf stat.
>> It aggregates counts per physical core, which is useful on
>> systems with hyper-threading. The cores are presented per
>> socket: S0-C1 means socket 0, core 1. Note that the core number
>> is the physical core id, so the numbers may not always be
>> contiguous. All of this is based on topology information
>> available in sysfs.
>>
>> Per-core aggregation can be combined with interval printing:
>>
>>  # perf stat -a --per-core -I 1000 -e cycles sleep 100
>>  #           time core         cpus             counts events
>>       1.000101160 S0-C0           2      6,051,254,899 cycles
>>       1.000101160 S0-C1           2      6,379,230,776 cycles
>>       1.000101160 S0-C2           2      6,480,268,471 cycles
>>       1.000101160 S0-C3           2      6,110,514,321 cycles
>>       2.000663750 S0-C0           2      6,572,533,016 cycles
>>       2.000663750 S0-C1           2      6,378,623,674 cycles
>>       2.000663750 S0-C2           2      6,264,127,589 cycles
>>       2.000663750 S0-C3           2      6,305,346,613 cycles
>>
>> For instance, here on this SNB machine, we can see that the load
>> is evenly balanced across all 4 physical cores (HT is on).
>>
>> In v2, we print events across all cores or sockets, and we renamed
>> --aggr-socket to --per-socket and --aggr-core to --per-core.
>>
>> Signed-off-by: Stephane Eranian <eranian@google.com>
>>
>> Stephane Eranian (3):
>>   perf stat: refactor aggregation code
>>   perf stat: rename --aggr-socket to --per-socket
>>   perf stat: add per-core aggregation
>>
>>  tools/perf/Documentation/perf-stat.txt |   10 +-
>>  tools/perf/builtin-stat.c              |  237 ++++++++++++++++++++------------
>>  tools/perf/util/cpumap.c               |   86 ++++++++++--
>>  tools/perf/util/cpumap.h               |   12 ++
>>  4 files changed, 241 insertions(+), 104 deletions(-)
>>
>> --
>> 1.7.9.5
>>


* Re: [PATCH v2 1/3] perf stat: refactor aggregation code
  2013-02-14 12:57 ` [PATCH v2 1/3] perf stat: refactor aggregation code Stephane Eranian
  2013-03-07 21:38   ` Jiri Olsa
@ 2013-03-25 16:22   ` Arnaldo Carvalho de Melo
  2013-04-02  9:33   ` [tip:perf/core] perf stat: Refactor " tip-bot for Stephane Eranian
  2 siblings, 0 replies; 11+ messages in thread
From: Arnaldo Carvalho de Melo @ 2013-03-25 16:22 UTC (permalink / raw)
  To: Stephane Eranian; +Cc: linux-kernel, peterz, mingo, ak, jolsa, namhyung.kim

Em Thu, Feb 14, 2013 at 01:57:27PM +0100, Stephane Eranian escreveu:
> Refactor aggregation code by introducing a single aggr_mode variable
> and an enum for aggregation.

<SNIP>
 
> @@ -542,26 +553,37 @@ static void print_noise(struct perf_evsel *evsel, double avg)
>  	print_noise_pct(stddev_stats(&ps->res_stats[0]), avg);
>  }
>  
> -static void nsec_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
> +static void aggr_printout(int cpu, int nr)
>  {
> -	double msecs = avg / 1e6;
> -	char cpustr[16] = { '\0', };
> -	const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-25s";
> -
> -	if (aggr_socket)
> -		sprintf(cpustr, "S%*d%s%*d%s",
> +	switch (aggr_mode) {
> +	case AGGR_SOCKET:
> +		fprintf(output, "S%*d%s%*d%s",
>  			csv_output ? 0 : -5,
>  			cpu,
>  			csv_sep,
>  			csv_output ? 0 : 4,
>  			nr,
>  			csv_sep);
> -	else if (no_aggr)
> -		sprintf(cpustr, "CPU%*d%s",
> +			break;
> +	case AGGR_NONE:
> +		fprintf(output, "CPU%*d%s",
>  			csv_output ? 0 : -4,
>  			perf_evsel__cpus(evsel)->map[cpu], csv_sep);

I'm fixing this up - how would evsel work here if it is not passed
to aggr_printout()?

I'm also fixing it up wrt the --forever patch and the one Namhyung
sent that makes perf stat use perf_evlist__prepare_workload, which I
had applied before this one. Will do some tests and put it on a
separate branch for you to check that all is still ok and works as
expected,

thanks,

- Arnaldo


* [tip:perf/core] perf stat: Refactor aggregation code
  2013-02-14 12:57 ` [PATCH v2 1/3] perf stat: refactor aggregation code Stephane Eranian
  2013-03-07 21:38   ` Jiri Olsa
  2013-03-25 16:22   ` Arnaldo Carvalho de Melo
@ 2013-04-02  9:33   ` tip-bot for Stephane Eranian
  2 siblings, 0 replies; 11+ messages in thread
From: tip-bot for Stephane Eranian @ 2013-04-02  9:33 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: acme, linux-kernel, eranian, hpa, mingo, peterz, namhyung.kim,
	jolsa, ak, tglx, mingo

Commit-ID:  86ee6e18f6cb43ab0cb67347bda5b6f5b016121d
Gitweb:     http://git.kernel.org/tip/86ee6e18f6cb43ab0cb67347bda5b6f5b016121d
Author:     Stephane Eranian <eranian@google.com>
AuthorDate: Thu, 14 Feb 2013 13:57:27 +0100
Committer:  Arnaldo Carvalho de Melo <acme@redhat.com>
CommitDate: Mon, 25 Mar 2013 15:29:53 -0300

perf stat: Refactor aggregation code

Refactor aggregation code by introducing a single aggr_mode variable and an
enum for aggregation.

Also refactor the cpumap code having to do with cpu to socket mappings, all
in preparation for extended modes, such as cpu -> core.

Also fix socket aggregation and ensure that sockets are printed in increasing
order.

Signed-off-by: Stephane Eranian <eranian@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1360846649-6411-2-git-send-email-eranian@google.com
[ committer note: Fixup conflicts with a7e191c "--repeat forever" and
  acf2892 "Use perf_evlist__prepare/start_workload()" ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-stat.c | 207 ++++++++++++++++++++++++++--------------------
 tools/perf/util/cpumap.c  |  40 ++++++---
 2 files changed, 147 insertions(+), 100 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index ba0bdd8..ded34fc 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -68,7 +68,7 @@
 static void print_stat(int argc, const char **argv);
 static void print_counter_aggr(struct perf_evsel *counter, char *prefix);
 static void print_counter(struct perf_evsel *counter, char *prefix);
-static void print_aggr_socket(char *prefix);
+static void print_aggr(char *prefix);
 
 static struct perf_evlist	*evsel_list;
 
@@ -76,11 +76,16 @@ static struct perf_target	target = {
 	.uid	= UINT_MAX,
 };
 
+enum aggr_mode {
+	AGGR_NONE,
+	AGGR_GLOBAL,
+	AGGR_SOCKET,
+};
+
 static int			run_count			=  1;
 static bool			no_inherit			= false;
 static bool			scale				=  true;
-static bool			no_aggr				= false;
-static bool			aggr_socket			= false;
+static enum aggr_mode		aggr_mode			= AGGR_GLOBAL;
 static pid_t			child_pid			= -1;
 static bool			null_run			=  false;
 static int			detailed_run			=  0;
@@ -96,7 +101,8 @@ static bool			sync_run			= false;
 static unsigned int		interval			= 0;
 static bool			forever				= false;
 static struct timespec		ref_time;
-static struct cpu_map		*sock_map;
+static struct cpu_map		*aggr_map;
+static int			(*aggr_get_id)(struct cpu_map *m, int cpu);
 
 static volatile int done = 0;
 
@@ -355,41 +361,51 @@ static void print_interval(void)
 	struct timespec ts, rs;
 	char prefix[64];
 
-	if (no_aggr) {
+	if (aggr_mode == AGGR_GLOBAL) {
 		list_for_each_entry(counter, &evsel_list->entries, node) {
 			ps = counter->priv;
 			memset(ps->res_stats, 0, sizeof(ps->res_stats));
-			read_counter(counter);
+			read_counter_aggr(counter);
 		}
-	} else {
+	} else	{
 		list_for_each_entry(counter, &evsel_list->entries, node) {
 			ps = counter->priv;
 			memset(ps->res_stats, 0, sizeof(ps->res_stats));
-			read_counter_aggr(counter);
+			read_counter(counter);
 		}
 	}
+
 	clock_gettime(CLOCK_MONOTONIC, &ts);
 	diff_timespec(&rs, &ts, &ref_time);
 	sprintf(prefix, "%6lu.%09lu%s", rs.tv_sec, rs.tv_nsec, csv_sep);
 
 	if (num_print_interval == 0 && !csv_output) {
-		if (aggr_socket)
+		switch (aggr_mode) {
+		case AGGR_SOCKET:
 			fprintf(output, "#           time socket cpus             counts events\n");
-		else if (no_aggr)
+			break;
+		case AGGR_NONE:
 			fprintf(output, "#           time CPU                 counts events\n");
-		else
+			break;
+		case AGGR_GLOBAL:
+		default:
 			fprintf(output, "#           time             counts events\n");
+		}
 	}
 
 	if (++num_print_interval == 25)
 		num_print_interval = 0;
 
-	if (aggr_socket)
-		print_aggr_socket(prefix);
-	else if (no_aggr) {
+	switch (aggr_mode) {
+	case AGGR_SOCKET:
+		print_aggr(prefix);
+		break;
+	case AGGR_NONE:
 		list_for_each_entry(counter, &evsel_list->entries, node)
 			print_counter(counter, prefix);
-	} else {
+		break;
+	case AGGR_GLOBAL:
+	default:
 		list_for_each_entry(counter, &evsel_list->entries, node)
 			print_counter_aggr(counter, prefix);
 	}
@@ -412,12 +428,6 @@ static int __run_perf_stat(int argc, const char **argv)
 		ts.tv_nsec = 0;
 	}
 
-	if (aggr_socket
-	    && cpu_map__build_socket_map(evsel_list->cpus, &sock_map)) {
-		perror("cannot build socket map");
-		return -1;
-	}
-
 	if (forks) {
 		if (perf_evlist__prepare_workload(evsel_list, &target, argv,
 						  false, false) < 0) {
@@ -493,17 +503,17 @@ static int __run_perf_stat(int argc, const char **argv)
 
 	update_stats(&walltime_nsecs_stats, t1 - t0);
 
-	if (no_aggr) {
-		list_for_each_entry(counter, &evsel_list->entries, node) {
-			read_counter(counter);
-			perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 1);
-		}
-	} else {
+	if (aggr_mode == AGGR_GLOBAL) {
 		list_for_each_entry(counter, &evsel_list->entries, node) {
 			read_counter_aggr(counter);
 			perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter),
 					     thread_map__nr(evsel_list->threads));
 		}
+	} else {
+		list_for_each_entry(counter, &evsel_list->entries, node) {
+			read_counter(counter);
+			perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 1);
+		}
 	}
 
 	return WEXITSTATUS(status);
@@ -556,26 +566,37 @@ static void print_noise(struct perf_evsel *evsel, double avg)
 	print_noise_pct(stddev_stats(&ps->res_stats[0]), avg);
 }
 
-static void nsec_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
+static void aggr_printout(struct perf_evsel *evsel, int cpu, int nr)
 {
-	double msecs = avg / 1e6;
-	char cpustr[16] = { '\0', };
-	const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-25s";
-
-	if (aggr_socket)
-		sprintf(cpustr, "S%*d%s%*d%s",
+	switch (aggr_mode) {
+	case AGGR_SOCKET:
+		fprintf(output, "S%*d%s%*d%s",
 			csv_output ? 0 : -5,
 			cpu,
 			csv_sep,
 			csv_output ? 0 : 4,
 			nr,
 			csv_sep);
-	else if (no_aggr)
-		sprintf(cpustr, "CPU%*d%s",
+			break;
+	case AGGR_NONE:
+		fprintf(output, "CPU%*d%s",
 			csv_output ? 0 : -4,
 			perf_evsel__cpus(evsel)->map[cpu], csv_sep);
+		break;
+	case AGGR_GLOBAL:
+	default:
+		break;
+	}
+}
+
+static void nsec_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
+{
+	double msecs = avg / 1e6;
+	const char *fmt = csv_output ? "%.6f%s%s" : "%18.6f%s%-25s";
+
+	aggr_printout(evsel, cpu, nr);
 
-	fprintf(output, fmt, cpustr, msecs, csv_sep, perf_evsel__name(evsel));
+	fprintf(output, fmt, msecs, csv_sep, perf_evsel__name(evsel));
 
 	if (evsel->cgrp)
 		fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
@@ -772,32 +793,21 @@ static void print_ll_cache_misses(int cpu,
 static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
 {
 	double total, ratio = 0.0;
-	char cpustr[16] = { '\0', };
 	const char *fmt;
 
 	if (csv_output)
-		fmt = "%s%.0f%s%s";
+		fmt = "%.0f%s%s";
 	else if (big_num)
-		fmt = "%s%'18.0f%s%-25s";
+		fmt = "%'18.0f%s%-25s";
 	else
-		fmt = "%s%18.0f%s%-25s";
+		fmt = "%18.0f%s%-25s";
 
-	if (aggr_socket)
-		sprintf(cpustr, "S%*d%s%*d%s",
-			csv_output ? 0 : -5,
-			cpu,
-			csv_sep,
-			csv_output ? 0 : 4,
-			nr,
-			csv_sep);
-	else if (no_aggr)
-		sprintf(cpustr, "CPU%*d%s",
-			csv_output ? 0 : -4,
-			perf_evsel__cpus(evsel)->map[cpu], csv_sep);
-	else
+	aggr_printout(evsel, cpu, nr);
+
+	if (aggr_mode == AGGR_GLOBAL)
 		cpu = 0;
 
-	fprintf(output, fmt, cpustr, avg, csv_sep, perf_evsel__name(evsel));
+	fprintf(output, fmt, avg, csv_sep, perf_evsel__name(evsel));
 
 	if (evsel->cgrp)
 		fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
@@ -896,23 +906,23 @@ static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
 	}
 }
 
-static void print_aggr_socket(char *prefix)
+static void print_aggr(char *prefix)
 {
 	struct perf_evsel *counter;
+	int cpu, s, s2, id, nr;
 	u64 ena, run, val;
-	int cpu, s, s2, sock, nr;
 
-	if (!sock_map)
+	if (!(aggr_map || aggr_get_id))
 		return;
 
-	for (s = 0; s < sock_map->nr; s++) {
-		sock = cpu_map__socket(sock_map, s);
+	for (s = 0; s < aggr_map->nr; s++) {
+		id = aggr_map->map[s];
 		list_for_each_entry(counter, &evsel_list->entries, node) {
 			val = ena = run = 0;
 			nr = 0;
 			for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
-				s2 = cpu_map__get_socket(evsel_list->cpus, cpu);
-				if (s2 != sock)
+				s2 = aggr_get_id(evsel_list->cpus, cpu);
+				if (s2 != id)
 					continue;
 				val += counter->counts->cpu[cpu].val;
 				ena += counter->counts->cpu[cpu].ena;
@@ -923,18 +933,15 @@ static void print_aggr_socket(char *prefix)
 				fprintf(output, "%s", prefix);
 
 			if (run == 0 || ena == 0) {
-				fprintf(output, "S%*d%s%*d%s%*s%s%*s",
-					csv_output ? 0 : -5,
-					s,
-					csv_sep,
-					csv_output ? 0 : 4,
-					nr,
-					csv_sep,
+				aggr_printout(counter, cpu, nr);
+
+				fprintf(output, "%*s%s%*s",
 					csv_output ? 0 : 18,
 					counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
 					csv_sep,
 					csv_output ? 0 : -24,
 					perf_evsel__name(counter));
+
 				if (counter->cgrp)
 					fprintf(output, "%s%s",
 						csv_sep, counter->cgrp->name);
@@ -944,9 +951,9 @@ static void print_aggr_socket(char *prefix)
 			}
 
 			if (nsec_counter(counter))
-				nsec_printout(sock, nr, counter, val);
+				nsec_printout(id, nr, counter, val);
 			else
-				abs_printout(sock, nr, counter, val);
+				abs_printout(id, nr, counter, val);
 
 			if (!csv_output) {
 				print_noise(counter, 1.0);
@@ -1087,14 +1094,20 @@ static void print_stat(int argc, const char **argv)
 		fprintf(output, ":\n\n");
 	}
 
-	if (aggr_socket)
-		print_aggr_socket(NULL);
-	else if (no_aggr) {
-		list_for_each_entry(counter, &evsel_list->entries, node)
-			print_counter(counter, NULL);
-	} else {
+	switch (aggr_mode) {
+	case AGGR_SOCKET:
+		print_aggr(NULL);
+		break;
+	case AGGR_GLOBAL:
 		list_for_each_entry(counter, &evsel_list->entries, node)
 			print_counter_aggr(counter, NULL);
+		break;
+	case AGGR_NONE:
+		list_for_each_entry(counter, &evsel_list->entries, node)
+			print_counter(counter, NULL);
+		break;
+	default:
+		break;
 	}
 
 	if (!csv_output) {
@@ -1140,6 +1153,25 @@ static int stat__set_big_num(const struct option *opt __maybe_unused,
 	return 0;
 }
 
+static int perf_stat_init_aggr_mode(void)
+{
+	switch (aggr_mode) {
+	case AGGR_SOCKET:
+		if (cpu_map__build_socket_map(evsel_list->cpus, &aggr_map)) {
+			perror("cannot build socket map");
+			return -1;
+		}
+		aggr_get_id = cpu_map__get_socket;
+		break;
+	case AGGR_NONE:
+	case AGGR_GLOBAL:
+	default:
+		break;
+	}
+	return 0;
+}
+
+
 /*
  * Add default attributes, if there were no attributes specified or
  * if -d/--detailed, -d -d or -d -d -d is used:
@@ -1322,7 +1354,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
 			   stat__set_big_num),
 	OPT_STRING('C', "cpu", &target.cpu_list, "cpu",
 		    "list of cpus to monitor in system-wide"),
-	OPT_BOOLEAN('A', "no-aggr", &no_aggr, "disable CPU count aggregation"),
+	OPT_SET_UINT('A', "no-aggr", &aggr_mode,
+		    "disable CPU count aggregation", AGGR_NONE),
 	OPT_STRING('x', "field-separator", &csv_sep, "separator",
 		   "print counts with custom separator"),
 	OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
@@ -1337,7 +1370,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
 			"command to run after to the measured command"),
 	OPT_UINTEGER('I', "interval-print", &interval,
 		    "print counts at regular interval in ms (>= 100)"),
-	OPT_BOOLEAN(0, "aggr-socket", &aggr_socket, "aggregate counts per processor socket"),
+	OPT_SET_UINT(0, "aggr-socket", &aggr_mode,
+		     "aggregate counts per processor socket", AGGR_SOCKET),
 	OPT_END()
 	};
 	const char * const stat_usage[] = {
@@ -1420,19 +1454,13 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
 	}
 
 	/* no_aggr, cgroup are for system-wide only */
-	if ((no_aggr || nr_cgroups) && !perf_target__has_cpu(&target)) {
+	if ((aggr_mode != AGGR_GLOBAL || nr_cgroups)
+	     && !perf_target__has_cpu(&target)) {
 		fprintf(stderr, "both cgroup and no-aggregation "
 			"modes only available in system-wide mode\n");
 
 		usage_with_options(stat_usage, options);
-	}
-
-	if (aggr_socket) {
-		if (!perf_target__has_cpu(&target)) {
-			fprintf(stderr, "--aggr-socket only available in system-wide mode (-a)\n");
-			usage_with_options(stat_usage, options);
-		}
-		no_aggr = true;
+		return -1;
 	}
 
 	if (add_default_attributes())
@@ -1458,6 +1486,9 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
 	if (perf_evlist__alloc_stats(evsel_list, interval))
 		goto out_free_maps;
 
+	if (perf_stat_init_aggr_mode())
+		goto out;
+
 	/*
 	 * We dont want to block the signals - that would cause
 	 * child tasks to inherit that and Ctrl-C would not work.
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index f817046..7bb8e87 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -4,6 +4,7 @@
 #include "cpumap.h"
 #include <assert.h>
 #include <stdio.h>
+#include <stdlib.h>
 
 static struct cpu_map *cpu_map__default_new(void)
 {
@@ -219,7 +220,7 @@ int cpu_map__get_socket(struct cpu_map *map, int idx)
 	if (!mnt)
 		return -1;
 
-	sprintf(path,
+	snprintf(path, PATH_MAX,
 		"%s/devices/system/cpu/cpu%d/topology/physical_package_id",
 		mnt, cpu);
 
@@ -231,27 +232,42 @@ int cpu_map__get_socket(struct cpu_map *map, int idx)
 	return ret == 1 ? cpu : -1;
 }
 
-int cpu_map__build_socket_map(struct cpu_map *cpus, struct cpu_map **sockp)
+static int cmp_ids(const void *a, const void *b)
 {
-	struct cpu_map *sock;
+	return *(int *)a - *(int *)b;
+}
+
+static int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
+			      int (*f)(struct cpu_map *map, int cpu))
+{
+	struct cpu_map *c;
 	int nr = cpus->nr;
 	int cpu, s1, s2;
 
-	sock = calloc(1, sizeof(*sock) + nr * sizeof(int));
-	if (!sock)
+	/* allocate as much as possible */
+	c = calloc(1, sizeof(*c) + nr * sizeof(int));
+	if (!c)
 		return -1;
 
 	for (cpu = 0; cpu < nr; cpu++) {
-		s1 = cpu_map__get_socket(cpus, cpu);
-		for (s2 = 0; s2 < sock->nr; s2++) {
-			if (s1 == sock->map[s2])
+		s1 = f(cpus, cpu);
+		for (s2 = 0; s2 < c->nr; s2++) {
+			if (s1 == c->map[s2])
 				break;
 		}
-		if (s2 == sock->nr) {
-			sock->map[sock->nr] = s1;
-			sock->nr++;
+		if (s2 == c->nr) {
+			c->map[c->nr] = s1;
+			c->nr++;
 		}
 	}
-	*sockp = sock;
+	/* ensure we process id in increasing order */
+	qsort(c->map, c->nr, sizeof(int), cmp_ids);
+
+	*res = c;
 	return 0;
 }
+
+int cpu_map__build_socket_map(struct cpu_map *cpus, struct cpu_map **sockp)
+{
+	return cpu_map__build_map(cpus, sockp, cpu_map__get_socket);
+}


* [tip:perf/core] perf stat: Rename --aggr-socket to --per-socket
  2013-02-14 12:57 ` [PATCH v2 2/3] perf stat: rename --aggr-socket to --per-socket Stephane Eranian
@ 2013-04-02  9:34   ` tip-bot for Stephane Eranian
  0 siblings, 0 replies; 11+ messages in thread
From: tip-bot for Stephane Eranian @ 2013-04-02  9:34 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: acme, linux-kernel, eranian, hpa, mingo, peterz, namhyung.kim,
	jolsa, ak, tglx, mingo

Commit-ID:  d4304958a25414a6e67b8a41c0f230e05cafafb6
Gitweb:     http://git.kernel.org/tip/d4304958a25414a6e67b8a41c0f230e05cafafb6
Author:     Stephane Eranian <eranian@google.com>
AuthorDate: Thu, 14 Feb 2013 13:57:28 +0100
Committer:  Arnaldo Carvalho de Melo <acme@redhat.com>
CommitDate: Mon, 25 Mar 2013 16:09:24 -0300

perf stat: Rename --aggr-socket to --per-socket

To make it more obvious what this option does, as suggested by Andi on
LKML.

Signed-off-by: Stephane Eranian <eranian@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1360846649-6411-3-git-send-email-eranian@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-stat.txt | 4 ++--
 tools/perf/builtin-stat.c              | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 23e587a..46027e1 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -119,10 +119,10 @@ perf stat --repeat 10 --null --sync --pre 'make -s O=defconfig-build/clean' -- m
 	Print count deltas every N milliseconds (minimum: 100ms)
 	example: perf stat -I 1000 -e cycles -a sleep 5
 
---aggr-socket::
+--per-socket::
 Aggregate counts per processor socket for system-wide mode measurements.  This
 is a useful mode to detect imbalance between sockets.  To enable this mode,
-use --aggr-socket in addition to -a. (system-wide).  The output includes the
+use --per-socket in addition to -a. (system-wide).  The output includes the
 socket number and the number of online processors on that socket. This is
 useful to gauge the amount of aggregation.
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index ded34fc..6f6ea931 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1370,7 +1370,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
 			"command to run after to the measured command"),
 	OPT_UINTEGER('I', "interval-print", &interval,
 		    "print counts at regular interval in ms (>= 100)"),
-	OPT_SET_UINT(0, "aggr-socket", &aggr_mode,
+	OPT_SET_UINT(0, "per-socket", &aggr_mode,
 		     "aggregate counts per processor socket", AGGR_SOCKET),
 	OPT_END()
 	};

^ permalink raw reply related	[flat|nested] 11+ messages in thread
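
For completeness, a hypothetical invocation of the renamed option; the
output shape follows the per-core examples elsewhere in this thread
(socket, number of online CPUs on that socket, counts, event), and the
counts below are invented:

 # perf stat -a --per-socket -e cycles sleep 5
 ...
 S0        8     12,345,678,901      cycles
 S1        8     11,987,654,321      cycles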

* [tip:perf/core] perf stat: Add per-core aggregation
  2013-02-14 12:57 ` [PATCH v2 3/3] perf stat: add per-core aggregation Stephane Eranian
@ 2013-04-02  9:36   ` tip-bot for Stephane Eranian
  0 siblings, 0 replies; 11+ messages in thread
From: tip-bot for Stephane Eranian @ 2013-04-02  9:36 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: acme, linux-kernel, eranian, hpa, mingo, peterz, namhyung.kim,
	eranian, jolsa, ak, tglx, mingo

Commit-ID:  12c08a9f591aeda57fb3b05897169e7da5439a79
Gitweb:     http://git.kernel.org/tip/12c08a9f591aeda57fb3b05897169e7da5439a79
Author:     Stephane Eranian <eranian@gmail.com>
AuthorDate: Thu, 14 Feb 2013 13:57:29 +0100
Committer:  Arnaldo Carvalho de Melo <acme@redhat.com>
CommitDate: Mon, 25 Mar 2013 16:13:26 -0300

perf stat: Add per-core aggregation

This patch adds the --per-core option to perf stat.

This option is used to aggregate system-wide counts
on a per physical core basis. On processors with
hyperthreading, this means counts of all HT threads
running on a physical core are aggregated.

This mode is useful to find imbalance between physical
cores running a uniform workload. Cores are identified
by socket: S0-C1 means physical core 1 on socket 0. Note
that cores are identified using their physical core id,
thus their numbering may not be contiguous.

Per-core aggregation can be combined with interval printing:

 # perf stat -a --per-core -I 1000 -e cycles sleep 1000
 #           time core         cpus             counts events
      1.000090030 S0-C0           1          4,765,747 cycles
      1.000090030 S0-C1           1          5,580,647 cycles
      1.000090030 S0-C2           1            221,181 cycles
      1.000090030 S0-C3           1            266,092 cycles

Signed-off-by: Stephane Eranian <eranian@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung.kim@lge.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1360846649-6411-4-git-send-email-eranian@google.com
[ committer note: Remove parts already applied on 86ee6e1 to keep bisectability ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-stat.txt |  6 +++++
 tools/perf/builtin-stat.c              | 31 ++++++++++++++++++++---
 tools/perf/util/cpumap.c               | 46 ++++++++++++++++++++++++++++++++++
 tools/perf/util/cpumap.h               | 12 +++++++++
 4 files changed, 92 insertions(+), 3 deletions(-)

diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 46027e1..2fe87fb 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -126,6 +126,12 @@ use --per-socket in addition to -a. (system-wide).  The output includes the
 socket number and the number of online processors on that socket. This is
 useful to gauge the amount of aggregation.
 
+--per-core::
+Aggregate counts per physical processor for system-wide mode measurements.  This
+is a useful mode to detect imbalance between physical cores.  To enable this mode,
+use --per-core in addition to -a. (system-wide).  The output includes the
+core number and the number of online logical processors on that physical processor.
+
 EXAMPLES
 --------
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 6f6ea931..7e910ba 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -80,6 +80,7 @@ enum aggr_mode {
 	AGGR_NONE,
 	AGGR_GLOBAL,
 	AGGR_SOCKET,
+	AGGR_CORE,
 };
 
 static int			run_count			=  1;
@@ -384,6 +385,9 @@ static void print_interval(void)
 		case AGGR_SOCKET:
 			fprintf(output, "#           time socket cpus             counts events\n");
 			break;
+		case AGGR_CORE:
+			fprintf(output, "#           time core         cpus             counts events\n");
+			break;
 		case AGGR_NONE:
 			fprintf(output, "#           time CPU                 counts events\n");
 			break;
@@ -397,6 +401,7 @@ static void print_interval(void)
 		num_print_interval = 0;
 
 	switch (aggr_mode) {
+	case AGGR_CORE:
 	case AGGR_SOCKET:
 		print_aggr(prefix);
 		break;
@@ -566,13 +571,23 @@ static void print_noise(struct perf_evsel *evsel, double avg)
 	print_noise_pct(stddev_stats(&ps->res_stats[0]), avg);
 }
 
-static void aggr_printout(struct perf_evsel *evsel, int cpu, int nr)
+static void aggr_printout(struct perf_evsel *evsel, int id, int nr)
 {
 	switch (aggr_mode) {
+	case AGGR_CORE:
+		fprintf(output, "S%d-C%*d%s%*d%s",
+			cpu_map__id_to_socket(id),
+			csv_output ? 0 : -8,
+			cpu_map__id_to_cpu(id),
+			csv_sep,
+			csv_output ? 0 : 4,
+			nr,
+			csv_sep);
+		break;
 	case AGGR_SOCKET:
 		fprintf(output, "S%*d%s%*d%s",
 			csv_output ? 0 : -5,
-			cpu,
+			id,
 			csv_sep,
 			csv_output ? 0 : 4,
 			nr,
@@ -581,7 +596,7 @@ static void aggr_printout(struct perf_evsel *evsel, int cpu, int nr)
 	case AGGR_NONE:
 		fprintf(output, "CPU%*d%s",
 			csv_output ? 0 : -4,
-			perf_evsel__cpus(evsel)->map[cpu], csv_sep);
+			perf_evsel__cpus(evsel)->map[id], csv_sep);
 		break;
 	case AGGR_GLOBAL:
 	default:
@@ -1095,6 +1110,7 @@ static void print_stat(int argc, const char **argv)
 	}
 
 	switch (aggr_mode) {
+	case AGGR_CORE:
 	case AGGR_SOCKET:
 		print_aggr(NULL);
 		break;
@@ -1163,6 +1179,13 @@ static int perf_stat_init_aggr_mode(void)
 		}
 		aggr_get_id = cpu_map__get_socket;
 		break;
+	case AGGR_CORE:
+		if (cpu_map__build_core_map(evsel_list->cpus, &aggr_map)) {
+			perror("cannot build core map");
+			return -1;
+		}
+		aggr_get_id = cpu_map__get_core;
+		break;
 	case AGGR_NONE:
 	case AGGR_GLOBAL:
 	default:
@@ -1372,6 +1395,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
 		    "print counts at regular interval in ms (>= 100)"),
 	OPT_SET_UINT(0, "per-socket", &aggr_mode,
 		     "aggregate counts per processor socket", AGGR_SOCKET),
+	OPT_SET_UINT(0, "per-core", &aggr_mode,
+		     "aggregate counts per physical processor core", AGGR_CORE),
 	OPT_END()
 	};
 	const char * const stat_usage[] = {
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 7bb8e87..beb8cf9 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -267,7 +267,53 @@ static int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
 	return 0;
 }
 
+int cpu_map__get_core(struct cpu_map *map, int idx)
+{
+	FILE *fp;
+	const char *mnt;
+	char path[PATH_MAX];
+	int cpu, ret, s;
+
+	if (idx > map->nr)
+		return -1;
+
+	cpu = map->map[idx];
+
+	mnt = sysfs_find_mountpoint();
+	if (!mnt)
+		return -1;
+
+	snprintf(path, PATH_MAX,
+		"%s/devices/system/cpu/cpu%d/topology/core_id",
+		mnt, cpu);
+
+	fp = fopen(path, "r");
+	if (!fp)
+		return -1;
+	ret = fscanf(fp, "%d", &cpu);
+	fclose(fp);
+	if (ret != 1)
+		return -1;
+
+	s = cpu_map__get_socket(map, idx);
+	if (s == -1)
+		return -1;
+
+	/*
+	 * encode socket in upper 16 bits
+	 * core_id is relative to socket, and
+	 * we need a global id. So we combine
+	 * socket+ core id
+	 */
+	return (s << 16) | (cpu & 0xffff);
+}
+
 int cpu_map__build_socket_map(struct cpu_map *cpus, struct cpu_map **sockp)
 {
 	return cpu_map__build_map(cpus, sockp, cpu_map__get_socket);
 }
+
+int cpu_map__build_core_map(struct cpu_map *cpus, struct cpu_map **corep)
+{
+	return cpu_map__build_map(cpus, corep, cpu_map__get_core);
+}
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 161b007..9bed02e 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -15,7 +15,9 @@ void cpu_map__delete(struct cpu_map *map);
 struct cpu_map *cpu_map__read(FILE *file);
 size_t cpu_map__fprintf(struct cpu_map *map, FILE *fp);
 int cpu_map__get_socket(struct cpu_map *map, int idx);
+int cpu_map__get_core(struct cpu_map *map, int idx);
 int cpu_map__build_socket_map(struct cpu_map *cpus, struct cpu_map **sockp);
+int cpu_map__build_core_map(struct cpu_map *cpus, struct cpu_map **corep);
 
 static inline int cpu_map__socket(struct cpu_map *sock, int s)
 {
@@ -24,6 +26,16 @@ static inline int cpu_map__socket(struct cpu_map *sock, int s)
 	return sock->map[s];
 }
 
+static inline int cpu_map__id_to_socket(int id)
+{
+	return id >> 16;
+}
+
+static inline int cpu_map__id_to_cpu(int id)
+{
+	return id & 0xffff;
+}
+
 static inline int cpu_map__nr(const struct cpu_map *map)
 {
 	return map ? map->nr : 1;

^ permalink raw reply related	[flat|nested] 11+ messages in thread
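
The per-core ids used above pack the socket number into the upper 16
bits and the physical core id into the lower 16 bits, matching
cpu_map__get_core() and the cpu_map__id_to_socket()/cpu_map__id_to_cpu()
helpers added to cpumap.h. A minimal stand-alone sketch of that round
trip (the socket and core values are invented for illustration):

 #include <assert.h>
 #include <stdio.h>

 /* same packing as cpu_map__get_core(): socket in bits 31..16, core id in bits 15..0 */
 static int encode_id(int socket, int core)
 {
 	return (socket << 16) | (core & 0xffff);
 }

 static int id_to_socket(int id) { return id >> 16; }
 static int id_to_cpu(int id)    { return id & 0xffff; }

 int main(void)
 {
 	int id = encode_id(1, 3);	/* hypothetical: physical core 3 on socket 1 */

 	assert(id_to_socket(id) == 1);
 	assert(id_to_cpu(id) == 3);

 	/* printed the same way aggr_printout() labels a core: S<socket>-C<core> */
 	printf("S%d-C%d\n", id_to_socket(id), id_to_cpu(id));	/* S1-C3 */
 	return 0;
 }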

end of thread, other threads:[~2013-04-02  9:36 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-02-14 12:57 [PATCH v2 0/3] perf stat: add per-core count aggregation Stephane Eranian
2013-02-14 12:57 ` [PATCH v2 1/3] perf stat: refactor aggregation code Stephane Eranian
2013-03-07 21:38   ` Jiri Olsa
2013-03-25 16:22   ` Arnaldo Carvalho de Melo
2013-04-02  9:33   ` [tip:perf/core] perf stat: Refactor " tip-bot for Stephane Eranian
2013-02-14 12:57 ` [PATCH v2 2/3] perf stat: rename --aggr-socket to --per-socket Stephane Eranian
2013-04-02  9:34   ` [tip:perf/core] perf stat: Rename " tip-bot for Stephane Eranian
2013-02-14 12:57 ` [PATCH v2 3/3] perf stat: add per-core aggregation Stephane Eranian
2013-04-02  9:36   ` [tip:perf/core] perf stat: Add " tip-bot for Stephane Eranian
2013-03-07 16:22 ` [PATCH v2 0/3] perf stat: add per-core count aggregation Stephane Eranian
2013-03-25 13:57   ` Stephane Eranian
