* [PATCH v2] perf stat: Show percore counts in per CPU output
@ 2020-02-11 2:34 Jin Yao
2020-02-11 23:29 ` Jiri Olsa
0 siblings, 1 reply; 3+ messages in thread
From: Jin Yao @ 2020-02-11 2:34 UTC (permalink / raw)
To: acme, jolsa, peterz, mingo, alexander.shishkin
Cc: Linux-kernel, ak, kan.liang, yao.jin, Jin Yao
We have supported the event modifier "percore" which sums up the
event counts for all hardware threads in a core and show the counts
per core.
For example,
# perf stat -e cpu/event=cpu-cycles,percore/ -a -A -- sleep 1
Performance counter stats for 'system wide':
S0-D0-C0 395,072 cpu/event=cpu-cycles,percore/
S0-D0-C1 851,248 cpu/event=cpu-cycles,percore/
S0-D0-C2 954,226 cpu/event=cpu-cycles,percore/
S0-D0-C3 1,233,659 cpu/event=cpu-cycles,percore/
This patch provides a new option "--percore-show-thread". It is
used with event modifier "percore" together to sum up the event counts
for all hardware threads in a core but show the counts per hardware
thread.
This is essentially a replacement for the any bit (which is gone in
Icelake). Per core counts are useful for some formulas, e.g. CoreIPC.
The original percore version was inconvenient to post process. This
variant matches the output of the any bit.
With this patch, for example,
# perf stat -e cpu/event=cpu-cycles,percore/ -a -A --percore-show-thread -- sleep 1
Performance counter stats for 'system wide':
CPU0 2,453,061 cpu/event=cpu-cycles,percore/
CPU1 1,823,921 cpu/event=cpu-cycles,percore/
CPU2 1,383,166 cpu/event=cpu-cycles,percore/
CPU3 1,102,652 cpu/event=cpu-cycles,percore/
CPU4 2,453,061 cpu/event=cpu-cycles,percore/
CPU5 1,823,921 cpu/event=cpu-cycles,percore/
CPU6 1,383,166 cpu/event=cpu-cycles,percore/
CPU7 1,102,652 cpu/event=cpu-cycles,percore/
We can see counts are duplicated in CPU pairs
(CPU0/CPU4, CPU1/CPU5, CPU2/CPU6, CPU3/CPU7).
v2:
---
Add the explanation in change log. This is essentially a replacement
for the any bit. No code change.
Signed-off-by: Jin Yao <yao.jin@linux.intel.com>
---
tools/perf/Documentation/perf-stat.txt | 9 ++++
tools/perf/builtin-stat.c | 4 ++
tools/perf/util/stat-display.c | 57 ++++++++++++++++++++++----
tools/perf/util/stat.h | 1 +
4 files changed, 62 insertions(+), 9 deletions(-)
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 9431b8066fb4..86d1fd92f017 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -334,6 +334,15 @@ Configure all used events to run in kernel space.
--all-user::
Configure all used events to run in user space.
+--percore-show-thread::
+The event modifier "percore" has supported to sum up the event counts
+for all hardware threads in a core and show the counts per core.
+
+This option with event modifier "percore" enabled also sums up the event
+counts for all hardware threads in a core but show the sum counts per
+hardware thread. This is essentially a replacement for the any bit and
+convenient for posting process.
+
EXAMPLES
--------
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a098c2ebf4ea..ec053dc1e35c 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -929,6 +929,10 @@ static struct option stat_options[] = {
OPT_BOOLEAN_FLAG(0, "all-user", &stat_config.all_user,
"Configure all used events to run in user space.",
PARSE_OPT_EXCLUSIVE),
+ OPT_BOOLEAN(0, "percore-show-thread", &stat_config.percore_show_thread,
+ "Use with 'percore' event qualifier to show the event "
+ "counts of one hardware thread by sum up total hardware "
+ "threads of same physical core"),
OPT_END()
};
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index bc31fccc0057..ca603e59dfe1 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -110,7 +110,7 @@ static void aggr_printout(struct perf_stat_config *config,
config->csv_sep);
break;
case AGGR_NONE:
- if (evsel->percore) {
+ if (evsel->percore && !config->percore_show_thread) {
fprintf(config->output, "S%d-D%d-C%*d%s",
cpu_map__id_to_socket(id),
cpu_map__id_to_die(id),
@@ -628,7 +628,7 @@ static void aggr_cb(struct perf_stat_config *config,
static void print_counter_aggrdata(struct perf_stat_config *config,
struct evsel *counter, int s,
char *prefix, bool metric_only,
- bool *first)
+ bool *first, int cpu)
{
struct aggr_data ad;
FILE *output = config->output;
@@ -654,8 +654,15 @@ static void print_counter_aggrdata(struct perf_stat_config *config,
fprintf(output, "%s", prefix);
uval = val * counter->scale;
- printout(config, id, nr, counter, uval, prefix,
- run, ena, 1.0, &rt_stat);
+
+ if (cpu == -1) {
+ printout(config, id, nr, counter, uval, prefix,
+ run, ena, 1.0, &rt_stat);
+ } else {
+ printout(config, cpu, nr, counter, uval, prefix,
+ run, ena, 1.0, &rt_stat);
+ }
+
if (!metric_only)
fputc('\n', output);
}
@@ -687,7 +694,7 @@ static void print_aggr(struct perf_stat_config *config,
evlist__for_each_entry(evlist, counter) {
print_counter_aggrdata(config, counter, s,
prefix, metric_only,
- &first);
+ &first, -1);
}
if (metric_only)
fputc('\n', output);
@@ -1163,13 +1170,38 @@ static void print_percore(struct perf_stat_config *config,
print_counter_aggrdata(config, counter, s,
prefix, metric_only,
- &first);
+ &first, -1);
}
if (metric_only)
fputc('\n', output);
}
+static void print_percore_thread(struct perf_stat_config *config,
+ struct evsel *counter, char *prefix)
+{
+ int cpu, s, s2, id;
+ bool first = true;
+ FILE *output = config->output;
+
+ for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
+ s2 = config->aggr_get_id(config, evsel__cpus(counter), cpu);
+
+ for (s = 0; s < config->aggr_map->nr; s++) {
+ id = config->aggr_map->map[s];
+ if (s2 == id)
+ break;
+ }
+
+ if (prefix)
+ fprintf(output, "%s", prefix);
+
+ print_counter_aggrdata(config, counter, s,
+ prefix, false,
+ &first, cpu);
+ }
+}
+
void
perf_evlist__print_counters(struct evlist *evlist,
struct perf_stat_config *config,
@@ -1222,9 +1254,16 @@ perf_evlist__print_counters(struct evlist *evlist,
print_no_aggr_metric(config, evlist, prefix);
else {
evlist__for_each_entry(evlist, counter) {
- if (counter->percore)
- print_percore(config, counter, prefix);
- else
+ if (counter->percore) {
+ if (config->percore_show_thread) {
+ print_percore_thread(config,
+ counter,
+ prefix);
+ } else {
+ print_percore(config, counter,
+ prefix);
+ }
+ } else
print_counter(config, counter, prefix);
}
}
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index fb990efa54a8..b4fdfaa7f2c0 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -109,6 +109,7 @@ struct perf_stat_config {
bool walltime_run_table;
bool all_kernel;
bool all_user;
+ bool percore_show_thread;
FILE *output;
unsigned int interval;
unsigned int timeout;
--
2.17.1
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH v2] perf stat: Show percore counts in per CPU output
2020-02-11 2:34 [PATCH v2] perf stat: Show percore counts in per CPU output Jin Yao
@ 2020-02-11 23:29 ` Jiri Olsa
2020-02-12 4:41 ` Jin, Yao
0 siblings, 1 reply; 3+ messages in thread
From: Jiri Olsa @ 2020-02-11 23:29 UTC (permalink / raw)
To: Jin Yao
Cc: acme, jolsa, peterz, mingo, alexander.shishkin, Linux-kernel, ak,
kan.liang, yao.jin
On Tue, Feb 11, 2020 at 10:34:34AM +0800, Jin Yao wrote:
> We have supported the event modifier "percore" which sums up the
> event counts for all hardware threads in a core and show the counts
> per core.
>
> For example,
>
> # perf stat -e cpu/event=cpu-cycles,percore/ -a -A -- sleep 1
>
> Performance counter stats for 'system wide':
>
> S0-D0-C0 395,072 cpu/event=cpu-cycles,percore/
> S0-D0-C1 851,248 cpu/event=cpu-cycles,percore/
> S0-D0-C2 954,226 cpu/event=cpu-cycles,percore/
> S0-D0-C3 1,233,659 cpu/event=cpu-cycles,percore/
>
> This patch provides a new option "--percore-show-thread". It is
> used with event modifier "percore" together to sum up the event counts
> for all hardware threads in a core but show the counts per hardware
> thread.
>
> This is essentially a replacement for the any bit (which is gone in
> Icelake). Per core counts are useful for some formulas, e.g. CoreIPC.
> The original percore version was inconvenient to post process. This
> variant matches the output of the any bit.
>
> With this patch, for example,
>
> # perf stat -e cpu/event=cpu-cycles,percore/ -a -A --percore-show-thread -- sleep 1
>
> Performance counter stats for 'system wide':
>
> CPU0 2,453,061 cpu/event=cpu-cycles,percore/
> CPU1 1,823,921 cpu/event=cpu-cycles,percore/
> CPU2 1,383,166 cpu/event=cpu-cycles,percore/
> CPU3 1,102,652 cpu/event=cpu-cycles,percore/
> CPU4 2,453,061 cpu/event=cpu-cycles,percore/
> CPU5 1,823,921 cpu/event=cpu-cycles,percore/
> CPU6 1,383,166 cpu/event=cpu-cycles,percore/
> CPU7 1,102,652 cpu/event=cpu-cycles,percore/
>
> We can see counts are duplicated in CPU pairs
> (CPU0/CPU4, CPU1/CPU5, CPU2/CPU6, CPU3/CPU7).
>
> v2:
> ---
> Add the explanation in change log. This is essentially a replacement
> for the any bit. No code change.
-I output is still wrong:
$ sudo ./perf stat -e cpu/event=cpu-cycles,percore/ -a -A --percore-show-thread -I 1000
# time CPU counts unit events
1.000251427 1.000251427 CPU0 43,474,700 cpu/event=cpu-cycles,percore/
1.000251427 1.000251427 CPU1 66,495,270 cpu/event=cpu-cycles,percore/
1.000251427 1.000251427 CPU2 42,342,367 cpu/event=cpu-cycles,percore/
1.000251427 1.000251427 CPU3 43,247,607 cpu/event=cpu-cycles,percore/
plus some comments below,
jirka
SNIP
> @@ -628,7 +628,7 @@ static void aggr_cb(struct perf_stat_config *config,
> static void print_counter_aggrdata(struct perf_stat_config *config,
> struct evsel *counter, int s,
> char *prefix, bool metric_only,
> - bool *first)
> + bool *first, int cpu)
> {
> struct aggr_data ad;
> FILE *output = config->output;
> @@ -654,8 +654,15 @@ static void print_counter_aggrdata(struct perf_stat_config *config,
> fprintf(output, "%s", prefix);
>
> uval = val * counter->scale;
> - printout(config, id, nr, counter, uval, prefix,
> - run, ena, 1.0, &rt_stat);
> +
> + if (cpu == -1) {
> + printout(config, id, nr, counter, uval, prefix,
> + run, ena, 1.0, &rt_stat);
> + } else {
> + printout(config, cpu, nr, counter, uval, prefix,
> + run, ena, 1.0, &rt_stat);
> + }
this would be shorter instad of above:
printout(config, cpu != -1 ?: id, nr, counter, uval, prefix, run, ena, 1.0, &rt_stat);
> +
> if (!metric_only)
> fputc('\n', output);
> }
> @@ -687,7 +694,7 @@ static void print_aggr(struct perf_stat_config *config,
> evlist__for_each_entry(evlist, counter) {
> print_counter_aggrdata(config, counter, s,
> prefix, metric_only,
> - &first);
> + &first, -1);
> }
> if (metric_only)
> fputc('\n', output);
> @@ -1163,13 +1170,38 @@ static void print_percore(struct perf_stat_config *config,
>
> print_counter_aggrdata(config, counter, s,
> prefix, metric_only,
> - &first);
> + &first, -1);
> }
>
> if (metric_only)
> fputc('\n', output);
> }
>
> +static void print_percore_thread(struct perf_stat_config *config,
> + struct evsel *counter, char *prefix)
> +{
> + int cpu, s, s2, id;
> + bool first = true;
> + FILE *output = config->output;
> +
missing check for
if (!(config->aggr_map || config->aggr_get_id))
> + for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
> + s2 = config->aggr_get_id(config, evsel__cpus(counter), cpu);
should you use real cpu valu in here instead of an index? like value of:
perf_cpu_map__cpu(,evsel__cpus(counter), cpu)
instead of 'cpu' above
> +
> + for (s = 0; s < config->aggr_map->nr; s++) {
> + id = config->aggr_map->map[s];
> + if (s2 == id)
> + break;
> + }
> +
> + if (prefix)
> + fprintf(output, "%s", prefix);
> +
> + print_counter_aggrdata(config, counter, s,
> + prefix, false,
> + &first, cpu);
> + }
> +}
> +
> void
> perf_evlist__print_counters(struct evlist *evlist,
> struct perf_stat_config *config,
> @@ -1222,9 +1254,16 @@ perf_evlist__print_counters(struct evlist *evlist,
> print_no_aggr_metric(config, evlist, prefix);
> else {
> evlist__for_each_entry(evlist, counter) {
> - if (counter->percore)
> - print_percore(config, counter, prefix);
> - else
> + if (counter->percore) {
> + if (config->percore_show_thread) {
> + print_percore_thread(config,
> + counter,
> + prefix);
> + } else {
> + print_percore(config, counter,
> + prefix);
please keep the print_percore call in here and check/call
for the percore_show_thread in it
> + }
> + } else
> print_counter(config, counter, prefix);
> }
> }
> diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
> index fb990efa54a8..b4fdfaa7f2c0 100644
> --- a/tools/perf/util/stat.h
> +++ b/tools/perf/util/stat.h
> @@ -109,6 +109,7 @@ struct perf_stat_config {
> bool walltime_run_table;
> bool all_kernel;
> bool all_user;
> + bool percore_show_thread;
> FILE *output;
> unsigned int interval;
> unsigned int timeout;
> --
> 2.17.1
>
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH v2] perf stat: Show percore counts in per CPU output
2020-02-11 23:29 ` Jiri Olsa
@ 2020-02-12 4:41 ` Jin, Yao
0 siblings, 0 replies; 3+ messages in thread
From: Jin, Yao @ 2020-02-12 4:41 UTC (permalink / raw)
To: Jiri Olsa
Cc: acme, jolsa, peterz, mingo, alexander.shishkin, Linux-kernel, ak,
kan.liang, yao.jin
On 2/12/2020 7:29 AM, Jiri Olsa wrote:
> On Tue, Feb 11, 2020 at 10:34:34AM +0800, Jin Yao wrote:
>> We have supported the event modifier "percore" which sums up the
>> event counts for all hardware threads in a core and show the counts
>> per core.
>>
>> For example,
>>
>> # perf stat -e cpu/event=cpu-cycles,percore/ -a -A -- sleep 1
>>
>> Performance counter stats for 'system wide':
>>
>> S0-D0-C0 395,072 cpu/event=cpu-cycles,percore/
>> S0-D0-C1 851,248 cpu/event=cpu-cycles,percore/
>> S0-D0-C2 954,226 cpu/event=cpu-cycles,percore/
>> S0-D0-C3 1,233,659 cpu/event=cpu-cycles,percore/
>>
>> This patch provides a new option "--percore-show-thread". It is
>> used with event modifier "percore" together to sum up the event counts
>> for all hardware threads in a core but show the counts per hardware
>> thread.
>>
>> This is essentially a replacement for the any bit (which is gone in
>> Icelake). Per core counts are useful for some formulas, e.g. CoreIPC.
>> The original percore version was inconvenient to post process. This
>> variant matches the output of the any bit.
>>
>> With this patch, for example,
>>
>> # perf stat -e cpu/event=cpu-cycles,percore/ -a -A --percore-show-thread -- sleep 1
>>
>> Performance counter stats for 'system wide':
>>
>> CPU0 2,453,061 cpu/event=cpu-cycles,percore/
>> CPU1 1,823,921 cpu/event=cpu-cycles,percore/
>> CPU2 1,383,166 cpu/event=cpu-cycles,percore/
>> CPU3 1,102,652 cpu/event=cpu-cycles,percore/
>> CPU4 2,453,061 cpu/event=cpu-cycles,percore/
>> CPU5 1,823,921 cpu/event=cpu-cycles,percore/
>> CPU6 1,383,166 cpu/event=cpu-cycles,percore/
>> CPU7 1,102,652 cpu/event=cpu-cycles,percore/
>>
>> We can see counts are duplicated in CPU pairs
>> (CPU0/CPU4, CPU1/CPU5, CPU2/CPU6, CPU3/CPU7).
>>
>> v2:
>> ---
>> Add the explanation in change log. This is essentially a replacement
>> for the any bit. No code change.
>
> -I output is still wrong:
>
> $ sudo ./perf stat -e cpu/event=cpu-cycles,percore/ -a -A --percore-show-thread -I 1000
> # time CPU counts unit events
> 1.000251427 1.000251427 CPU0 43,474,700 cpu/event=cpu-cycles,percore/
> 1.000251427 1.000251427 CPU1 66,495,270 cpu/event=cpu-cycles,percore/
> 1.000251427 1.000251427 CPU2 42,342,367 cpu/event=cpu-cycles,percore/
> 1.000251427 1.000251427 CPU3 43,247,607 cpu/event=cpu-cycles,percore/
>
Ah, double time output, I just realize this issue, very sorry about that :(
It doesn't need to print the prefix.
> plus some comments below,
> jirka
>
>
> SNIP
>
>> @@ -628,7 +628,7 @@ static void aggr_cb(struct perf_stat_config *config,
>> static void print_counter_aggrdata(struct perf_stat_config *config,
>> struct evsel *counter, int s,
>> char *prefix, bool metric_only,
>> - bool *first)
>> + bool *first, int cpu)
>> {
>> struct aggr_data ad;
>> FILE *output = config->output;
>> @@ -654,8 +654,15 @@ static void print_counter_aggrdata(struct perf_stat_config *config,
>> fprintf(output, "%s", prefix);
>>
>> uval = val * counter->scale;
>> - printout(config, id, nr, counter, uval, prefix,
>> - run, ena, 1.0, &rt_stat);
>> +
>> + if (cpu == -1) {
>> + printout(config, id, nr, counter, uval, prefix,
>> + run, ena, 1.0, &rt_stat);
>> + } else {
>> + printout(config, cpu, nr, counter, uval, prefix,
>> + run, ena, 1.0, &rt_stat);
>> + }
>
> this would be shorter instad of above:
>
> printout(config, cpu != -1 ?: id, nr, counter, uval, prefix, run, ena, 1.0, &rt_stat);
>
Yes, that would be shorter.
>> +
>> if (!metric_only)
>> fputc('\n', output);
>> }
>> @@ -687,7 +694,7 @@ static void print_aggr(struct perf_stat_config *config,
>> evlist__for_each_entry(evlist, counter) {
>> print_counter_aggrdata(config, counter, s,
>> prefix, metric_only,
>> - &first);
>> + &first, -1);
>> }
>> if (metric_only)
>> fputc('\n', output);
>> @@ -1163,13 +1170,38 @@ static void print_percore(struct perf_stat_config *config,
>>
>> print_counter_aggrdata(config, counter, s,
>> prefix, metric_only,
>> - &first);
>> + &first, -1);
>> }
>>
>> if (metric_only)
>> fputc('\n', output);
>> }
>>
>> +static void print_percore_thread(struct perf_stat_config *config,
>> + struct evsel *counter, char *prefix)
>> +{
>> + int cpu, s, s2, id;
>> + bool first = true;
>> + FILE *output = config->output;
>> +
>
> missing check for
> if (!(config->aggr_map || config->aggr_get_id))
>
>
Yes, we should add checking for config->aggr_map and
config->aggr_get_id, like what the print_aggr does.
>> + for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
>> + s2 = config->aggr_get_id(config, evsel__cpus(counter), cpu);
>
> should you use real cpu valu in here instead of an index? like value of:
>
> perf_cpu_map__cpu(,evsel__cpus(counter), cpu)
>
> instead of 'cpu' above
>
Yes, my fault. I should use the cpu value here but not the cpu idx.
Thanks for pointing this issue out.
>> +
>> + for (s = 0; s < config->aggr_map->nr; s++) {
>> + id = config->aggr_map->map[s];
>> + if (s2 == id)
>> + break;
>> + }
>> +
>> + if (prefix)
>> + fprintf(output, "%s", prefix);
>> +
>> + print_counter_aggrdata(config, counter, s,
>> + prefix, false,
>> + &first, cpu);
>> + }
>> +}
>> +
>> void
>> perf_evlist__print_counters(struct evlist *evlist,
>> struct perf_stat_config *config,
>> @@ -1222,9 +1254,16 @@ perf_evlist__print_counters(struct evlist *evlist,
>> print_no_aggr_metric(config, evlist, prefix);
>> else {
>> evlist__for_each_entry(evlist, counter) {
>> - if (counter->percore)
>> - print_percore(config, counter, prefix);
>> - else
>> + if (counter->percore) {
>> + if (config->percore_show_thread) {
>> + print_percore_thread(config,
>> + counter,
>> + prefix);
>> + } else {
>> + print_percore(config, counter,
>> + prefix);
>
> please keep the print_percore call in here and check/call
> for the percore_show_thread in it
>
OK, Yes, that's fine. I will keep the print_percore here and call
percore_show_thread from it.
Thanks
Jin Yao
>> + }
>> + } else
>> print_counter(config, counter, prefix);
>> }
>> }
>> diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
>> index fb990efa54a8..b4fdfaa7f2c0 100644
>> --- a/tools/perf/util/stat.h
>> +++ b/tools/perf/util/stat.h
>> @@ -109,6 +109,7 @@ struct perf_stat_config {
>> bool walltime_run_table;
>> bool all_kernel;
>> bool all_user;
>> + bool percore_show_thread;
>> FILE *output;
>> unsigned int interval;
>> unsigned int timeout;
>> --
>> 2.17.1
>>
>
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2020-02-12 4:41 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-02-11 2:34 [PATCH v2] perf stat: Show percore counts in per CPU output Jin Yao
2020-02-11 23:29 ` Jiri Olsa
2020-02-12 4:41 ` Jin, Yao
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.