* [PATCH v13 1/8] perf tools: Drop redundant evsel->overwrite indicator
2016-07-04 6:20 [PATCH v13 0/8] perf tools: Support overwritable ring buffer Wang Nan
@ 2016-07-04 6:20 ` Wang Nan
2016-07-06 10:53 ` Jiri Olsa
2016-07-04 6:20 ` [PATCH v13 2/8] perf evlist: Introduce aux evlist Wang Nan
` (6 subsequent siblings)
7 siblings, 1 reply; 20+ messages in thread
From: Wang Nan @ 2016-07-04 6:20 UTC (permalink / raw)
To: acme, jolsa
Cc: linux-kernel, pi3orama, lizefan, Arnaldo Carvalho de Melo,
Wang Nan, He Kuang, Jiri Olsa, Masami Hiramatsu, Namhyung Kim,
Nilay Vaish
From: Arnaldo Carvalho de Melo <acme@redhat.com>
The evsel->overwrite indicator means an event should be put into an
overwritable ring buffer. In the current implementation, it is equal to
evsel->attr.write_backward. To reduce complexity, remove
evsel->overwrite and use evsel->attr.write_backward instead.
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Zefan Li <lizefan@huawei.com>
Cc: Nilay Vaish <nilayvaish@gmail.com>
Cc: pi3orama@163.com
---
tools/perf/tests/backward-ring-buffer.c | 1 +
tools/perf/util/evlist.c | 4 ++--
tools/perf/util/evsel.c | 12 +++++-------
tools/perf/util/evsel.h | 1 -
4 files changed, 8 insertions(+), 10 deletions(-)
diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c
index e70313f..1750ef2 100644
--- a/tools/perf/tests/backward-ring-buffer.c
+++ b/tools/perf/tests/backward-ring-buffer.c
@@ -101,6 +101,7 @@ int test__backward_ring_buffer(int subtest __maybe_unused)
return TEST_FAIL;
}
+ evlist->backward = true;
err = perf_evlist__create_maps(evlist, &opts.target);
if (err < 0) {
pr_debug("Not enough memory to create thread/cpu maps\n");
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 1135077..7228596 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1003,7 +1003,7 @@ static bool
perf_evlist__should_poll(struct perf_evlist *evlist __maybe_unused,
struct perf_evsel *evsel)
{
- if (evsel->overwrite)
+ if (evsel->attr.write_backward)
return false;
return true;
}
@@ -1018,7 +1018,7 @@ static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int idx,
evlist__for_each_entry(evlist, evsel) {
int fd;
- if (evsel->overwrite != (evlist->overwrite && evlist->backward))
+ if (!!evsel->attr.write_backward != (evlist->overwrite && evlist->backward))
continue;
if (evsel->system_wide && thread)
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 0fea724..3abe519 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1359,6 +1359,9 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
int pid = -1, err;
enum { NO_CHANGE, SET_TO_MAX, INCREASED_MAX } set_rlimit = NO_CHANGE;
+ if (perf_missing_features.write_backward && evsel->attr.write_backward)
+ return -EINVAL;
+
if (evsel->system_wide)
nthreads = 1;
else
@@ -1389,11 +1392,6 @@ fallback_missing_features:
if (perf_missing_features.lbr_flags)
evsel->attr.branch_sample_type &= ~(PERF_SAMPLE_BRANCH_NO_FLAGS |
PERF_SAMPLE_BRANCH_NO_CYCLES);
- if (perf_missing_features.write_backward) {
- if (evsel->overwrite)
- return -EINVAL;
- evsel->attr.write_backward = false;
- }
retry_sample_id:
if (perf_missing_features.sample_id_all)
evsel->attr.sample_id_all = 0;
@@ -1495,7 +1493,7 @@ try_fallback:
*/
if (!perf_missing_features.write_backward && evsel->attr.write_backward) {
perf_missing_features.write_backward = true;
- goto fallback_missing_features;
+ goto out_close;
} else if (!perf_missing_features.clockid_wrong && evsel->attr.use_clockid) {
perf_missing_features.clockid_wrong = true;
goto fallback_missing_features;
@@ -2404,7 +2402,7 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
"We found oprofile daemon running, please stop it and try again.");
break;
case EINVAL:
- if (evsel->overwrite && perf_missing_features.write_backward)
+ if (evsel->attr.write_backward && perf_missing_features.write_backward)
return scnprintf(msg, size, "Reading from overwrite event is not supported by this kernel.");
if (perf_missing_features.clockid)
return scnprintf(msg, size, "clockid feature not supported.");
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 86fed7a..a31ee2d 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -114,7 +114,6 @@ struct perf_evsel {
bool tracking;
bool per_pkg;
bool precise_max;
- bool overwrite;
/* parse modifier helper */
int exclude_GH;
int nr_members;
--
1.8.3.4
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH v13 1/8] perf tools: Drop redundant evsel->overwrite indicator
2016-07-04 6:20 ` [PATCH v13 1/8] perf tools: Drop redundant evsel->overwrite indicator Wang Nan
@ 2016-07-06 10:53 ` Jiri Olsa
2016-07-06 10:55 ` Wangnan (F)
0 siblings, 1 reply; 20+ messages in thread
From: Jiri Olsa @ 2016-07-06 10:53 UTC (permalink / raw)
To: Wang Nan
Cc: acme, linux-kernel, pi3orama, lizefan, Arnaldo Carvalho de Melo,
He Kuang, Jiri Olsa, Masami Hiramatsu, Namhyung Kim, Nilay Vaish
On Mon, Jul 04, 2016 at 06:20:02AM +0000, Wang Nan wrote:
SNIP
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index 0fea724..3abe519 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -1359,6 +1359,9 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
> int pid = -1, err;
> enum { NO_CHANGE, SET_TO_MAX, INCREASED_MAX } set_rlimit = NO_CHANGE;
>
> + if (perf_missing_features.write_backward && evsel->attr.write_backward)
> + return -EINVAL;
> +
> if (evsel->system_wide)
> nthreads = 1;
> else
> @@ -1389,11 +1392,6 @@ fallback_missing_features:
> if (perf_missing_features.lbr_flags)
> evsel->attr.branch_sample_type &= ~(PERF_SAMPLE_BRANCH_NO_FLAGS |
> PERF_SAMPLE_BRANCH_NO_CYCLES);
> - if (perf_missing_features.write_backward) {
> - if (evsel->overwrite)
> - return -EINVAL;
> - evsel->attr.write_backward = false;
> - }
so we don't change the attr.write_backward anymore?
based on the kernel support..
we do it for other features, why not here? also the changelog
did not mention it at all ;-)
thanks,
jirka
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v13 1/8] perf tools: Drop redundant evsel->overwrite indicator
2016-07-06 10:53 ` Jiri Olsa
@ 2016-07-06 10:55 ` Wangnan (F)
0 siblings, 0 replies; 20+ messages in thread
From: Wangnan (F) @ 2016-07-06 10:55 UTC (permalink / raw)
To: Jiri Olsa
Cc: acme, linux-kernel, pi3orama, lizefan, Arnaldo Carvalho de Melo,
He Kuang, Jiri Olsa, Masami Hiramatsu, Namhyung Kim, Nilay Vaish
On 2016/7/6 18:53, Jiri Olsa wrote:
> On Mon, Jul 04, 2016 at 06:20:02AM +0000, Wang Nan wrote:
>
> SNIP
>
>> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
>> index 0fea724..3abe519 100644
>> --- a/tools/perf/util/evsel.c
>> +++ b/tools/perf/util/evsel.c
>> @@ -1359,6 +1359,9 @@ static int __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
>> int pid = -1, err;
>> enum { NO_CHANGE, SET_TO_MAX, INCREASED_MAX } set_rlimit = NO_CHANGE;
>>
>> + if (perf_missing_features.write_backward && evsel->attr.write_backward)
>> + return -EINVAL;
>> +
>> if (evsel->system_wide)
>> nthreads = 1;
>> else
>> @@ -1389,11 +1392,6 @@ fallback_missing_features:
>> if (perf_missing_features.lbr_flags)
>> evsel->attr.branch_sample_type &= ~(PERF_SAMPLE_BRANCH_NO_FLAGS |
>> PERF_SAMPLE_BRANCH_NO_CYCLES);
>> - if (perf_missing_features.write_backward) {
>> - if (evsel->overwrite)
>> - return -EINVAL;
>> - evsel->attr.write_backward = false;
>> - }
> so we don't change the attr.write_backward anymore?
> based on the kernel support..
We don't need fallback here. If evsel->attr.write_backward is selected
and it is missing, we should fail.
> we do it for other features, why not here? also changelog
> did not mentioned it at all ;-)
>
> thanks,
> jirka
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH v13 2/8] perf evlist: Introduce aux evlist
2016-07-04 6:20 [PATCH v13 0/8] perf tools: Support overwritable ring buffer Wang Nan
2016-07-04 6:20 ` [PATCH v13 1/8] perf tools: Drop redundant evsel->overwrite indicator Wang Nan
@ 2016-07-04 6:20 ` Wang Nan
2016-07-06 11:36 ` Jiri Olsa
2016-07-04 6:20 ` [PATCH v13 3/8] perf tests: Add testcase for auxiliary evlist Wang Nan
` (5 subsequent siblings)
7 siblings, 1 reply; 20+ messages in thread
From: Wang Nan @ 2016-07-04 6:20 UTC (permalink / raw)
To: acme, jolsa
Cc: linux-kernel, pi3orama, lizefan, Wang Nan, He Kuang, Jiri Olsa,
Masami Hiramatsu, Namhyung Kim, Nilay Vaish
An auxiliary evlist is created by perf_evlist__new_aux() using an
existing evlist as its parent. An auxiliary evlist can have its own
'struct perf_mmap', but can't have any other data. User should use its
parent instead when accessing other data.
Auxiliary evlists are containers of 'struct perf_mmap'. They are introduced
to allow a parent evlist to map different events into separate mmaps.
Following commits create an auxiliary evlist for overwritable
events, because overwritable events need a read-only, backward ring
buffer, which is different from the one used by normal events.
To achieve this goal, this patch carefully changes 'evlist' to
'evlist->parent' in all functions in the path of 'perf_evlist__mmap_ex',
except for 'evlist->mmap' related operations, to make sure all evlist
modifications (like pollfd and event id hash tables) go to the original
evlist.
An 'evlist->parent' pointer is added to 'struct perf_evlist'; for normal
evlists it points to the evlist itself.
Children of one evlist are linked into it so one can find all children
from its parent.
To avoid potential complexity, forbid creating aux evlist from another
aux evlist.
Improve perf_evlist__munmap_filtered() so that, when recording, if an
event is terminated, its mmaps are unmapped from both the parent and its
children.
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Zefan Li <lizefan@huawei.com>
Cc: Nilay Vaish <nilayvaish@gmail.com>
Cc: pi3orama@163.com
---
tools/perf/util/evlist.c | 49 +++++++++++++++++++++++++++++++++++++-----------
tools/perf/util/evlist.h | 12 ++++++++++++
2 files changed, 50 insertions(+), 11 deletions(-)
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 7228596..7000fe2 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -41,10 +41,12 @@ void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus,
for (i = 0; i < PERF_EVLIST__HLIST_SIZE; ++i)
INIT_HLIST_HEAD(&evlist->heads[i]);
INIT_LIST_HEAD(&evlist->entries);
+ INIT_LIST_HEAD(&evlist->children);
perf_evlist__set_maps(evlist, cpus, threads);
fdarray__init(&evlist->pollfd, 64);
evlist->workload.pid = -1;
evlist->backward = false;
+ evlist->parent = evlist;
}
struct perf_evlist *perf_evlist__new(void)
@@ -490,13 +492,17 @@ static void perf_evlist__munmap_filtered(struct fdarray *fda, int fd,
void *arg __maybe_unused)
{
struct perf_evlist *evlist = container_of(fda, struct perf_evlist, pollfd);
+ struct perf_evlist *child;
perf_evlist__mmap_put(evlist, fda->priv[fd].idx);
+ list_for_each_entry(child, &evlist->children, list)
+ perf_evlist__mmap_put(child, fda->priv[fd].idx);
+
}
int perf_evlist__filter_pollfd(struct perf_evlist *evlist, short revents_and_mask)
{
- return fdarray__filter(&evlist->pollfd, revents_and_mask,
+ return fdarray__filter(&evlist->parent->pollfd, revents_and_mask,
perf_evlist__munmap_filtered, NULL);
}
@@ -1015,7 +1021,7 @@ static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int idx,
struct perf_evsel *evsel;
int revent;
- evlist__for_each_entry(evlist, evsel) {
+ evlist__for_each_entry(evlist->parent, evsel) {
int fd;
if (!!evsel->attr.write_backward != (evlist->overwrite && evlist->backward))
@@ -1047,16 +1053,16 @@ static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int idx,
* Therefore don't add it for polling.
*/
if (!evsel->system_wide &&
- __perf_evlist__add_pollfd(evlist, fd, idx, revent) < 0) {
+ __perf_evlist__add_pollfd(evlist->parent, fd, idx, revent) < 0) {
perf_evlist__mmap_put(evlist, idx);
return -1;
}
if (evsel->attr.read_format & PERF_FORMAT_ID) {
- if (perf_evlist__id_add_fd(evlist, evsel, cpu, thread,
+ if (perf_evlist__id_add_fd(evlist->parent, evsel, cpu, thread,
fd) < 0)
return -1;
- perf_evlist__set_sid_idx(evlist, evsel, idx, cpu,
+ perf_evlist__set_sid_idx(evlist->parent, evsel, idx, cpu,
thread);
}
}
@@ -1097,13 +1103,13 @@ static int perf_evlist__mmap_per_thread(struct perf_evlist *evlist,
struct mmap_params *mp)
{
int thread;
- int nr_threads = thread_map__nr(evlist->threads);
+ int nr_threads = thread_map__nr(evlist->parent->threads);
pr_debug2("perf event ring buffer mmapped per thread\n");
for (thread = 0; thread < nr_threads; thread++) {
int output = -1;
- auxtrace_mmap_params__set_idx(&mp->auxtrace_mp, evlist, thread,
+ auxtrace_mmap_params__set_idx(&mp->auxtrace_mp, evlist->parent, thread,
false);
if (perf_evlist__mmap_per_evsel(evlist, thread, mp, 0, thread,
@@ -1242,8 +1248,8 @@ int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
bool auxtrace_overwrite)
{
struct perf_evsel *evsel;
- const struct cpu_map *cpus = evlist->cpus;
- const struct thread_map *threads = evlist->threads;
+ const struct cpu_map *cpus = evlist->parent->cpus;
+ const struct thread_map *threads = evlist->parent->threads;
struct mmap_params mp = {
.prot = PROT_READ | (overwrite ? 0 : PROT_WRITE),
};
@@ -1251,7 +1257,7 @@ int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
if (evlist->mmap == NULL && perf_evlist__alloc_mmap(evlist) < 0)
return -ENOMEM;
- if (evlist->pollfd.entries == NULL && perf_evlist__alloc_pollfd(evlist) < 0)
+ if (evlist->parent->pollfd.entries == NULL && perf_evlist__alloc_pollfd(evlist->parent) < 0)
return -ENOMEM;
evlist->overwrite = overwrite;
@@ -1262,7 +1268,7 @@ int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
auxtrace_mmap_params__init(&mp.auxtrace_mp, evlist->mmap_len,
auxtrace_pages, auxtrace_overwrite);
- evlist__for_each_entry(evlist, evsel) {
+ evlist__for_each_entry(evlist->parent, evsel) {
if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
evsel->sample_id == NULL &&
perf_evsel__alloc_id(evsel, cpu_map__nr(cpus), threads->nr) < 0)
@@ -1919,3 +1925,24 @@ perf_evlist__find_evsel_by_str(struct perf_evlist *evlist,
return NULL;
}
+
+struct perf_evlist *perf_evlist__new_aux(struct perf_evlist *parent)
+{
+ struct perf_evlist *evlist;
+
+ if (perf_evlist__is_aux(parent)) {
+ pr_err("Internal error: create aux evlist from another aux evlist\n");
+ return NULL;
+ }
+
+ evlist = zalloc(sizeof(*evlist));
+ if (!evlist)
+ return NULL;
+
+ perf_evlist__init(evlist, parent->cpus, parent->threads);
+ evlist->parent = parent;
+ INIT_LIST_HEAD(&evlist->list);
+ list_add(&evlist->list, &parent->children);
+
+ return evlist;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 872912b..99bcc02 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -37,6 +37,10 @@ struct perf_mmap {
struct perf_evlist {
struct list_head entries;
+ union {
+ struct list_head children;
+ struct list_head list;
+ };
struct hlist_head heads[PERF_EVLIST__HLIST_SIZE];
int nr_entries;
int nr_groups;
@@ -60,8 +64,14 @@ struct perf_evlist {
struct perf_evsel *selected;
struct events_stats stats;
struct perf_env *env;
+ struct perf_evlist *parent;
};
+static inline bool perf_evlist__is_aux(struct perf_evlist *evlist)
+{
+ return evlist->parent != evlist;
+}
+
struct perf_evsel_str_handler {
const char *name;
void *handler;
@@ -70,6 +80,8 @@ struct perf_evsel_str_handler {
struct perf_evlist *perf_evlist__new(void);
struct perf_evlist *perf_evlist__new_default(void);
struct perf_evlist *perf_evlist__new_dummy(void);
+struct perf_evlist *perf_evlist__new_aux(struct perf_evlist *);
+
void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus,
struct thread_map *threads);
void perf_evlist__exit(struct perf_evlist *evlist);
--
1.8.3.4
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH v13 2/8] perf evlist: Introduce aux evlist
2016-07-04 6:20 ` [PATCH v13 2/8] perf evlist: Introduce aux evlist Wang Nan
@ 2016-07-06 11:36 ` Jiri Olsa
2016-07-06 12:16 ` Wangnan (F)
0 siblings, 1 reply; 20+ messages in thread
From: Jiri Olsa @ 2016-07-06 11:36 UTC (permalink / raw)
To: Wang Nan
Cc: acme, linux-kernel, pi3orama, lizefan, He Kuang, Jiri Olsa,
Masami Hiramatsu, Namhyung Kim, Nilay Vaish
On Mon, Jul 04, 2016 at 06:20:03AM +0000, Wang Nan wrote:
SNIP
> +struct perf_evlist *perf_evlist__new_aux(struct perf_evlist *parent)
> +{
> + struct perf_evlist *evlist;
> +
> + if (perf_evlist__is_aux(parent)) {
> + pr_err("Internal error: create aux evlist from another aux evlist\n");
> + return NULL;
> + }
> +
> + evlist = zalloc(sizeof(*evlist));
> + if (!evlist)
> + return NULL;
> +
> + perf_evlist__init(evlist, parent->cpus, parent->threads);
> + evlist->parent = parent;
> + INIT_LIST_HEAD(&evlist->list);
> + list_add(&evlist->list, &parent->children);
I understand there's some reason for separating maps with and
without overwrite set, but I'm missing it.. why is that?
thanks,
jirka
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v13 2/8] perf evlist: Introduce aux evlist
2016-07-06 11:36 ` Jiri Olsa
@ 2016-07-06 12:16 ` Wangnan (F)
2016-07-08 14:46 ` Jiri Olsa
0 siblings, 1 reply; 20+ messages in thread
From: Wangnan (F) @ 2016-07-06 12:16 UTC (permalink / raw)
To: Jiri Olsa
Cc: acme, linux-kernel, pi3orama, lizefan, He Kuang, Jiri Olsa,
Masami Hiramatsu, Namhyung Kim, Nilay Vaish
On 2016/7/6 19:36, Jiri Olsa wrote:
> On Mon, Jul 04, 2016 at 06:20:03AM +0000, Wang Nan wrote:
>
> SNIP
>
>> +struct perf_evlist *perf_evlist__new_aux(struct perf_evlist *parent)
>> +{
>> + struct perf_evlist *evlist;
>> +
>> + if (perf_evlist__is_aux(parent)) {
>> + pr_err("Internal error: create aux evlist from another aux evlist\n");
>> + return NULL;
>> + }
>> +
>> + evlist = zalloc(sizeof(*evlist));
>> + if (!evlist)
>> + return NULL;
>> +
>> + perf_evlist__init(evlist, parent->cpus, parent->threads);
>> + evlist->parent = parent;
>> + INIT_LIST_HEAD(&evlist->list);
>> + list_add(&evlist->list, &parent->children);
> I understand there's some reason for separating maps with and
> without overwrite set, but I'm missing it.. why is that?
You are asking about overwrite, not write_backward?
An overwrite mapping needs to be mapped without PROT_WRITE, so its
control page is also read-only. That means perf_evlist__mmap_consume()
cannot be used, and there is no way to tell the kernel how far we have
read, so the kernel overwrites old records when the buffer is full.
Compare this with a normal mapping: perf uses perf_evlist__mmap_consume()
to tell the kernel the last byte it has read, so the kernel stops writing
data when the buffer is full and issues a LOST event. This is the reason
we need to separate maps with and without overwrite set.
For write backward: the kernel writes data in the opposite direction, so
it also requires map separation.
Thank you.
> thanks,
> jirka
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v13 2/8] perf evlist: Introduce aux evlist
2016-07-06 12:16 ` Wangnan (F)
@ 2016-07-08 14:46 ` Jiri Olsa
2016-07-11 10:20 ` Wangnan (F)
0 siblings, 1 reply; 20+ messages in thread
From: Jiri Olsa @ 2016-07-08 14:46 UTC (permalink / raw)
To: Wangnan (F)
Cc: acme, linux-kernel, pi3orama, lizefan, He Kuang, Jiri Olsa,
Masami Hiramatsu, Namhyung Kim, Nilay Vaish
On Wed, Jul 06, 2016 at 08:16:52PM +0800, Wangnan (F) wrote:
>
>
> On 2016/7/6 19:36, Jiri Olsa wrote:
> > On Mon, Jul 04, 2016 at 06:20:03AM +0000, Wang Nan wrote:
> >
> > SNIP
> >
> > > +struct perf_evlist *perf_evlist__new_aux(struct perf_evlist *parent)
> > > +{
> > > + struct perf_evlist *evlist;
> > > +
> > > + if (perf_evlist__is_aux(parent)) {
> > > + pr_err("Internal error: create aux evlist from another aux evlist\n");
> > > + return NULL;
> > > + }
> > > +
> > > + evlist = zalloc(sizeof(*evlist));
> > > + if (!evlist)
> > > + return NULL;
> > > +
> > > + perf_evlist__init(evlist, parent->cpus, parent->threads);
> > > + evlist->parent = parent;
> > > + INIT_LIST_HEAD(&evlist->list);
> > > + list_add(&evlist->list, &parent->children);
> > I understand there's some reason for separating maps with and
> > without overwrite set, but I'm missing it.. why is that?
>
> You are asking overwrite, not write_backward?
>
> Overwrite mapping needs to be mapped without PROT_WRITE, so its
> control page is also read only, so perf_evlist__mmap_consume() is
> not able to use, and there's no way to tell kernel to where we have
> read. Kernel overwrite old records when its full. Compare with normal
> mapping: perf uses perf_evlist__mmap_consume() to tell kernel the
> last byte it has read, so kernel stop writing data to it when it full,
> and issues LOST event. This is the reason we need to separate maps
> with and without overwrite set.
>
> For write backward: kernel write data in different direction, so
> requires map separation.
I don't like the idea of duplicating the whole perf_evlist
just in order to map some events with overwrite/backward
perf_evlist carries all the other info about events,
not just memory mapping..
I think it'd be better to do it some other way, like:
- we have mmaps for events/evsels, so you're able to map
it differently with or without PROT_WRITE even in current
design.. there's struct perf_mmap that can carry that info
then it's the matter of reading/processing those maps
that needs to change.. new perf_evlist interface
- we could keep separate struct perf_mmap arrays for forward
and backward/overwrite maps
- ...
I understand both mapping need different treatment,
but I think that should be encapsulated within the
struct perf_evlist interface
thanks,
jirka
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v13 2/8] perf evlist: Introduce aux evlist
2016-07-08 14:46 ` Jiri Olsa
@ 2016-07-11 10:20 ` Wangnan (F)
0 siblings, 0 replies; 20+ messages in thread
From: Wangnan (F) @ 2016-07-11 10:20 UTC (permalink / raw)
To: Jiri Olsa
Cc: acme, linux-kernel, pi3orama, lizefan, He Kuang, Jiri Olsa,
Masami Hiramatsu, Namhyung Kim, Nilay Vaish
On 2016/7/8 22:46, Jiri Olsa wrote:
> On Wed, Jul 06, 2016 at 08:16:52PM +0800, Wangnan (F) wrote:
>>
>> On 2016/7/6 19:36, Jiri Olsa wrote:
>>> On Mon, Jul 04, 2016 at 06:20:03AM +0000, Wang Nan wrote:
>>>
>>> SNIP
>>>
>>>> +struct perf_evlist *perf_evlist__new_aux(struct perf_evlist *parent)
>>>> +{
>>>> + struct perf_evlist *evlist;
>>>> +
>>>> + if (perf_evlist__is_aux(parent)) {
>>>> + pr_err("Internal error: create aux evlist from another aux evlist\n");
>>>> + return NULL;
>>>> + }
>>>> +
>>>> + evlist = zalloc(sizeof(*evlist));
>>>> + if (!evlist)
>>>> + return NULL;
>>>> +
>>>> + perf_evlist__init(evlist, parent->cpus, parent->threads);
>>>> + evlist->parent = parent;
>>>> + INIT_LIST_HEAD(&evlist->list);
>>>> + list_add(&evlist->list, &parent->children);
>>> I understand there's some reason for separating maps with and
>>> without overwrite set, but I'm missing it.. why is that?
>> You are asking overwrite, not write_backward?
>>
>> Overwrite mapping needs to be mapped without PROT_WRITE, so its
>> control page is also read only, so perf_evlist__mmap_consume() is
>> not able to use, and there's no way to tell kernel to where we have
>> read. Kernel overwrite old records when its full. Compare with normal
>> mapping: perf uses perf_evlist__mmap_consume() to tell kernel the
>> last byte it has read, so kernel stop writing data to it when it full,
>> and issues LOST event. This is the reason we need to separate maps
>> with and without overwrite set.
>>
>> For write backward: kernel write data in different direction, so
>> requires map separation.
> I dont like the idea of duplicating whole perf_evlist
> in order just to map some events with overwrite/backward
>
> perf_evlist carries all the other info about events,
> not just memory maping..
>
> I think it'd be better to do it some other way, like:
>
> - we have mmaps for events/evsels, so you're able to map
> it differently with or without PROT_WRITE even in current
> design.. there's struct perf_mmap that can carry that info
> then it's the matter of reading/processing those maps
> that needs to change.. new perf_evlist interface
>
> - we could keep separate struct perf_mmap arrays for forward
> and backward/overwrite maps
>
> - ...
>
> I understand both mapping need different treatment,
> but I think that should be encapsulated within the
> struct perf_evlist interface
I don't like it either, but aux_evlist is the easiest way to
do this work. Other potential solutions require heavy API changes.
Thank you.
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH v13 3/8] perf tests: Add testcase for auxiliary evlist
2016-07-04 6:20 [PATCH v13 0/8] perf tools: Support overwritable ring buffer Wang Nan
2016-07-04 6:20 ` [PATCH v13 1/8] perf tools: Drop redundant evsel->overwrite indicator Wang Nan
2016-07-04 6:20 ` [PATCH v13 2/8] perf evlist: Introduce aux evlist Wang Nan
@ 2016-07-04 6:20 ` Wang Nan
2016-07-04 6:20 ` [PATCH v13 4/8] perf record: Introduce rec->overwrite_evlist for overwritable events Wang Nan
` (4 subsequent siblings)
7 siblings, 0 replies; 20+ messages in thread
From: Wang Nan @ 2016-07-04 6:20 UTC (permalink / raw)
To: acme, jolsa
Cc: linux-kernel, pi3orama, lizefan, Wang Nan,
Arnaldo Carvalho de Melo, Jiri Olsa, Masami Hiramatsu,
Namhyung Kim, Nilay Vaish, He Kuang
Improve the backward-ring-buffer test: trace both the enter and exit
events of the prctl() syscall, and use an auxiliary evlist to mmap the
enter and exit events into separate mmaps.
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Zefan Li <lizefan@huawei.com>
Cc: Nilay Vaish <nilayvaish@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
---
tools/perf/tests/backward-ring-buffer.c | 85 ++++++++++++++++++++++++++-------
tools/perf/util/evlist.h | 8 ++++
2 files changed, 75 insertions(+), 18 deletions(-)
diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c
index 1750ef2..db7393c 100644
--- a/tools/perf/tests/backward-ring-buffer.c
+++ b/tools/perf/tests/backward-ring-buffer.c
@@ -31,7 +31,11 @@ static int count_samples(struct perf_evlist *evlist, int *sample_count,
for (i = 0; i < evlist->nr_mmaps; i++) {
union perf_event *event;
- perf_evlist__mmap_read_catchup(evlist, i);
+ /*
+ * Before calling count_samples(), ring buffers in backward
+ * evlist should have catched up with newest record
+ * using perf_evlist__mmap_read_catchup_all().
+ */
while ((event = perf_evlist__mmap_read_backward(evlist, i)) != NULL) {
const u32 type = event->header.type;
@@ -51,34 +55,54 @@ static int count_samples(struct perf_evlist *evlist, int *sample_count,
return TEST_OK;
}
-static int do_test(struct perf_evlist *evlist, int mmap_pages,
- int *sample_count, int *comm_count)
+static int do_test(struct perf_evlist *evlist,
+ struct perf_evlist *aux_evlist,
+ int mmap_pages,
+ int *enter_sample_count,
+ int *exit_sample_count,
+ int *comm_count)
{
- int err;
+ int err, dummy;
char sbuf[STRERR_BUFSIZE];
- err = perf_evlist__mmap(evlist, mmap_pages, true);
+ err = perf_evlist__mmap(evlist, mmap_pages, false);
if (err < 0) {
pr_debug("perf_evlist__mmap: %s\n",
strerror_r(errno, sbuf, sizeof(sbuf)));
return TEST_FAIL;
}
+ err = perf_evlist__mmap(aux_evlist, mmap_pages, true);
+ if (err < 0) {
+ pr_debug("perf_evlist__mmap for aux_evlist: %s\n",
+ strerror_r(errno, sbuf, sizeof(sbuf)));
+ return TEST_FAIL;
+ }
+
perf_evlist__enable(evlist);
testcase();
perf_evlist__disable(evlist);
- err = count_samples(evlist, sample_count, comm_count);
+ perf_evlist__mmap_read_catchup_all(aux_evlist);
+ err = count_samples(aux_evlist, exit_sample_count, comm_count);
+ if (err)
+ goto errout;
+ err = count_samples(evlist, enter_sample_count, &dummy);
+ if (err)
+ goto errout;
+errout:
perf_evlist__munmap(evlist);
+ perf_evlist__munmap(aux_evlist);
return err;
}
int test__backward_ring_buffer(int subtest __maybe_unused)
{
- int ret = TEST_SKIP, err, sample_count = 0, comm_count = 0;
+ int ret = TEST_SKIP, err, dummy;
+ int enter_sample_count = 0, exit_sample_count = 0, comm_count = 0;
char pid[16], sbuf[STRERR_BUFSIZE];
- struct perf_evlist *evlist;
+ struct perf_evlist *evlist, *aux_evlist = NULL;
struct perf_evsel *evsel __maybe_unused;
struct parse_events_error parse_error;
struct record_opts opts = {
@@ -101,7 +125,6 @@ int test__backward_ring_buffer(int subtest __maybe_unused)
return TEST_FAIL;
}
- evlist->backward = true;
err = perf_evlist__create_maps(evlist, &opts.target);
if (err < 0) {
pr_debug("Not enough memory to create thread/cpu maps\n");
@@ -116,11 +139,21 @@ int test__backward_ring_buffer(int subtest __maybe_unused)
goto out_delete_evlist;
}
- perf_evlist__config(evlist, &opts, NULL);
+ /*
+ * Set backward bit, ring buffer should be writing from end. Record
+ * it in aux evlist
+ */
+ perf_evlist__last(evlist)->attr.write_backward = 1;
- /* Set backward bit, ring buffer should be writing from end */
- evlist__for_each_entry(evlist, evsel)
- evsel->attr.write_backward = 1;
+ err = parse_events(evlist, "syscalls:sys_exit_prctl", &parse_error);
+ if (err) {
+ pr_debug("Failed to parse tracepoint event, try use root\n");
+ ret = TEST_SKIP;
+ goto out_delete_evlist;
+ }
+ /* Don't set backward bit for exit event. Record it in main evlist */
+
+ perf_evlist__config(evlist, &opts, NULL);
err = perf_evlist__open(evlist);
if (err < 0) {
@@ -129,24 +162,40 @@ int test__backward_ring_buffer(int subtest __maybe_unused)
goto out_delete_evlist;
}
+ aux_evlist = perf_evlist__new_aux(evlist);
+ if (!aux_evlist) {
+ pr_debug("perf_evlist__new_aux failed\n");
+ goto out_delete_evlist;
+ }
+ aux_evlist->backward = true;
+
ret = TEST_FAIL;
- err = do_test(evlist, opts.mmap_pages, &sample_count,
+ err = do_test(evlist, aux_evlist, opts.mmap_pages,
+ &enter_sample_count, &exit_sample_count,
&comm_count);
if (err != TEST_OK)
goto out_delete_evlist;
- if ((sample_count != NR_ITERS) || (comm_count != NR_ITERS)) {
- pr_err("Unexpected counter: sample_count=%d, comm_count=%d\n",
- sample_count, comm_count);
+ if (enter_sample_count != exit_sample_count) {
+ pr_err("Unexpected counter: enter_sample_count=%d, exit_sample_count=%d\n",
+ enter_sample_count, exit_sample_count);
+ goto out_delete_evlist;
+ }
+
+ if ((exit_sample_count != NR_ITERS) || (comm_count != NR_ITERS)) {
+ pr_err("Unexpected counter: exit_sample_count=%d, comm_count=%d\n",
+ exit_sample_count, comm_count);
goto out_delete_evlist;
}
- err = do_test(evlist, 1, &sample_count, &comm_count);
+ err = do_test(evlist, aux_evlist, 1, &dummy, &dummy, &dummy);
if (err != TEST_OK)
goto out_delete_evlist;
ret = TEST_OK;
out_delete_evlist:
+ if (aux_evlist)
+ perf_evlist__delete(aux_evlist);
perf_evlist__delete(evlist);
return ret;
}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 99bcc02..d736cd3 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -149,6 +149,14 @@ union perf_event *perf_evlist__mmap_read_backward(struct perf_evlist *evlist,
int idx);
void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx);
+static inline void perf_evlist__mmap_read_catchup_all(struct perf_evlist *evlist)
+{
+ int i;
+
+ for (i = 0; i < evlist->nr_mmaps; i++)
+ perf_evlist__mmap_read_catchup(evlist, i);
+}
+
void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx);
int perf_evlist__pause(struct perf_evlist *evlist);
--
1.8.3.4
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH v13 4/8] perf record: Introduce rec->overwrite_evlist for overwritable events
2016-07-04 6:20 [PATCH v13 0/8] perf tools: Support overwritable ring buffer Wang Nan
` (2 preceding siblings ...)
2016-07-04 6:20 ` [PATCH v13 3/8] perf tests: Add testcase for auxiliary evlist Wang Nan
@ 2016-07-04 6:20 ` Wang Nan
2016-07-04 6:20 ` [PATCH v13 5/8] perf record: Read from overwritable ring buffer Wang Nan
` (3 subsequent siblings)
7 siblings, 0 replies; 20+ messages in thread
From: Wang Nan @ 2016-07-04 6:20 UTC (permalink / raw)
To: acme, jolsa
Cc: linux-kernel, pi3orama, lizefan, Wang Nan, He Kuang,
Arnaldo Carvalho de Melo, Jiri Olsa, Masami Hiramatsu,
Namhyung Kim, Nilay Vaish
Create an auxiliary evlist for overwritable events.
Before mmap, build this evlist and set its 'overwrite' and 'backward'
attributes. Since perf_evlist__mmap_ex() only maps an event when its
evsel->attr.write_backward matches the evlist's corresponding attribute,
with these two evlists each event goes to either rec->evlist or
rec->overwrite_evlist.
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Zefan Li <lizefan@huawei.com>
Cc: Nilay Vaish <nilayvaish@gmail.com>
Cc: pi3orama@163.com
---
tools/perf/builtin-record.c | 59 ++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 56 insertions(+), 3 deletions(-)
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index b2b3b60..3b62295 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -51,6 +51,7 @@ struct record {
struct perf_data_file file;
struct auxtrace_record *itr;
struct perf_evlist *evlist;
+ struct perf_evlist *overwrite_evlist;
struct perf_session *session;
const char *progname;
int realtime_prio;
@@ -342,13 +343,41 @@ int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
#endif
+static int record__create_overwrite_evlist(struct record *rec)
+{
+ struct perf_evlist *evlist = rec->evlist;
+ struct perf_evsel *pos;
+
+ evlist__for_each_entry(evlist, pos) {
+ if (!pos->attr.write_backward)
+ continue;
+
+ if (!rec->overwrite_evlist) {
+ rec->overwrite_evlist = perf_evlist__new_aux(evlist);
+ if (rec->overwrite_evlist) {
+ rec->overwrite_evlist->backward = true;
+ rec->overwrite_evlist->overwrite = true;
+ return 0;
+ } else
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
static int record__mmap_evlist(struct record *rec,
- struct perf_evlist *evlist)
+ struct perf_evlist *evlist,
+ bool overwrite)
{
struct record_opts *opts = &rec->opts;
char msg[512];
- if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, false,
+ /*
+ * Don't use evlist->overwrite because it is logically an
+ * internal attribute and is set by perf_evlist__mmap_ex().
+ * Avoid circular dependency.
+ */
+ if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, overwrite,
opts->auxtrace_mmap_pages,
opts->auxtrace_snapshot_mode) < 0) {
if (errno == EPERM) {
@@ -373,7 +402,23 @@ static int record__mmap_evlist(struct record *rec,
static int record__mmap(struct record *rec)
{
- return record__mmap_evlist(rec, rec->evlist);
+ int err;
+
+ err = record__create_overwrite_evlist(rec);
+ if (err)
+ return err;
+
+ err = record__mmap_evlist(rec, rec->evlist, false);
+ if (err)
+ return err;
+
+ if (!rec->overwrite_evlist)
+ return 0;
+
+ err = record__mmap_evlist(rec, rec->overwrite_evlist, true);
+ if (err)
+ return err;
+ return 0;
}
static int record__open(struct record *rec)
@@ -698,9 +743,14 @@ static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
{
const struct perf_event_mmap_page *pc;
+ /* Change it to a loop if a new aux evlist is added */
pc = perf_evlist__pick_pc(rec->evlist);
if (pc)
return pc;
+ pc = perf_evlist__pick_pc(rec->overwrite_evlist);
+ if (pc)
+ return pc;
+
return NULL;
}
@@ -1311,6 +1361,7 @@ static struct record record = {
.mmap2 = perf_event__process_mmap2,
.ordered_events = true,
},
+ .overwrite_evlist = NULL,
};
const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
@@ -1614,6 +1665,8 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
err = __cmd_record(&record, argc, argv);
out_symbol_exit:
perf_evlist__delete(rec->evlist);
+ if (rec->overwrite_evlist)
+ perf_evlist__delete(rec->overwrite_evlist);
symbol__exit();
auxtrace_record__free(rec->itr);
return err;
--
1.8.3.4
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH v13 5/8] perf record: Read from overwritable ring buffer
2016-07-04 6:20 [PATCH v13 0/8] perf tools: Support overwritable ring buffer Wang Nan
` (3 preceding siblings ...)
2016-07-04 6:20 ` [PATCH v13 4/8] perf record: Introduce rec->overwrite_evlist for overwritable events Wang Nan
@ 2016-07-04 6:20 ` Wang Nan
2016-07-06 11:37 ` Jiri Olsa
2016-07-06 11:38 ` Jiri Olsa
2016-07-04 6:20 ` [PATCH v13 6/8] perf tools: Enable overwrite settings Wang Nan
` (2 subsequent siblings)
7 siblings, 2 replies; 20+ messages in thread
From: Wang Nan @ 2016-07-04 6:20 UTC (permalink / raw)
To: acme, jolsa
Cc: linux-kernel, pi3orama, lizefan, Wang Nan, He Kuang,
Arnaldo Carvalho de Melo, Jiri Olsa, Masami Hiramatsu,
Namhyung Kim, Nilay Vaish
overwrite_evt_state is introduced to reflect the state of overwritable
ring buffers. It is a state machine with 3 states:
.________________(forbid)_____________.
| |
| V
RUNNING --(1)--> DATA_PENDING --(2)--> EMPTY
^ ^ | ^ |
| |__(forbid)____/ |___(forbid)___/|
| |
\_________________(3)_______________/
RUNNING : Overwritable ring buffers are recording
DATA_PENDING : We are required to collect overwritable ring buffers
EMPTY : We have collected data from those ring buffers.
(1): Pause ring buffers for reading
(2): Read from ring buffers
(3): Resume ring buffers for recording
We can't avoid this complexity. Since we deliberately drop records from
the overwritable ring buffer, there's no way for us to check what remains
in the ring buffer itself (by checking the head and old pointers).
Therefore, we need the DATA_PENDING and EMPTY states to help us record
what we have done to the ring buffer.
With the above state machine, this patch improves record__mmap_read_all()
to read from the overwritable ring buffer when the DATA_PENDING state is
observed.
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: He Kuang <hekuang@huawei.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Zefan Li <lizefan@huawei.com>
Cc: Nilay Vaish <nilayvaish@gmail.com>
Cc: pi3orama@163.com
---
tools/perf/builtin-record.c | 137 +++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 136 insertions(+), 1 deletion(-)
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 3b62295..2a1b3c0 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -43,6 +43,30 @@
#include <sys/mman.h>
#include <asm/bug.h>
+/*
+ * State machine of overwrite_evt_state:
+ *
+ * .________________(forbid)_____________.
+ * | V
+ * RUNNING --(1)--> DATA_PENDING --(2)--> EMPTY
+ * ^ ^ | ^ |
+ * | |__(forbid)____/ |___(forbid)___/|
+ * | |
+ * \_________________(3)_______________/
+ *
+ * RUNNING : Overwritable ring buffers are recording
+ * DATA_PENDING : We are required to collect overwritable ring buffers
+ * EMPTY : We have collected data from those ring buffers.
+ *
+ * (1): Pause ring buffers for reading
+ * (2): Read from ring buffers
+ * (3): Resume ring buffers for recording
+ */
+enum overwrite_evt_state {
+ OVERWRITE_EVT_RUNNING,
+ OVERWRITE_EVT_DATA_PENDING,
+ OVERWRITE_EVT_EMPTY,
+};
struct record {
struct perf_tool tool;
@@ -62,6 +86,7 @@ struct record {
bool buildid_all;
bool timestamp_filename;
bool switch_output;
+ enum overwrite_evt_state overwrite_evt_state;
unsigned long long samples;
};
@@ -463,6 +488,7 @@ try_again:
session->evlist = evlist;
perf_session__set_id_hdr_size(session);
+ rec->overwrite_evt_state = OVERWRITE_EVT_RUNNING;
out:
return rc;
}
@@ -543,6 +569,79 @@ static struct perf_event_header finished_round_event = {
.type = PERF_RECORD_FINISHED_ROUND,
};
+static void
+record__toggle_overwrite_evsels(struct record *rec,
+ enum overwrite_evt_state state)
+{
+ struct perf_evlist *evlist = rec->overwrite_evlist;
+ enum overwrite_evt_state old_state = rec->overwrite_evt_state;
+ enum action {
+ NONE,
+ PAUSE,
+ RESUME,
+ } action = NONE;
+
+ switch (old_state) {
+ case OVERWRITE_EVT_RUNNING: {
+ switch (state) {
+ case OVERWRITE_EVT_DATA_PENDING:
+ action = PAUSE;
+ break;
+ case OVERWRITE_EVT_RUNNING:
+ case OVERWRITE_EVT_EMPTY:
+ default:
+ goto state_err;
+ }
+ break;
+ }
+ case OVERWRITE_EVT_DATA_PENDING: {
+ switch (state) {
+ case OVERWRITE_EVT_EMPTY:
+ break;
+ case OVERWRITE_EVT_RUNNING:
+ case OVERWRITE_EVT_DATA_PENDING:
+ default:
+ goto state_err;
+ }
+ break;
+ }
+ case OVERWRITE_EVT_EMPTY: {
+ switch (state) {
+ case OVERWRITE_EVT_RUNNING:
+ action = RESUME;
+ break;
+ case OVERWRITE_EVT_EMPTY:
+ case OVERWRITE_EVT_DATA_PENDING:
+ default:
+ goto state_err;
+ }
+ break;
+ }
+ default:
+ WARN_ONCE(1, "Shouldn't get there\n");
+ }
+
+ rec->overwrite_evt_state = state;
+
+ if (!evlist)
+ return;
+
+ switch (action) {
+ case PAUSE:
+ perf_evlist__pause(evlist);
+ break;
+ case RESUME:
+ perf_evlist__resume(evlist);
+ break;
+ case NONE:
+ default:
+ break;
+ }
+
+state_err:
+ return;
+}
+
static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist)
{
u64 bytes_written = rec->bytes_written;
@@ -588,7 +687,13 @@ static int record__mmap_read_all(struct record *rec)
if (err)
return err;
- return err;
+ if (rec->overwrite_evt_state == OVERWRITE_EVT_DATA_PENDING) {
+ err = record__mmap_read_evlist(rec, rec->overwrite_evlist);
+ if (err)
+ return err;
+ record__toggle_overwrite_evsels(rec, OVERWRITE_EVT_EMPTY);
+ }
+ return 0;
}
static void record__init_features(struct record *rec)
@@ -987,6 +1092,17 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
for (;;) {
unsigned long long hits = rec->samples;
+ /*
+ * rec->overwrite_evt_state is possible to be
+ * OVERWRITE_EVT_EMPTY here: when done == true and
+ * hits != rec->samples in previous round.
+ *
+ * record__toggle_overwrite_evsels ensure we never
+ * convert OVERWRITE_EVT_EMPTY to OVERWRITE_EVT_DATA_PENDING.
+ */
+ if (trigger_is_hit(&switch_output_trigger) || done || draining)
+ record__toggle_overwrite_evsels(rec, OVERWRITE_EVT_DATA_PENDING);
+
if (record__mmap_read_all(rec) < 0) {
trigger_error(&auxtrace_snapshot_trigger);
trigger_error(&switch_output_trigger);
@@ -1006,8 +1122,27 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
}
if (trigger_is_hit(&switch_output_trigger)) {
+ /*
+ * If switch_output_trigger is hit, the data in
+ * overwritable ring buffer should have been collected,
+ * so overwrite_evt_state should be set to
+ * OVERWRITE_EVT_EMPTY.
+ *
+ * If SIGUSR2 raise after or during record__mmap_read_all(),
+ * record__mmap_read_all() didn't collect data from
+ * overwritable ring buffer. Read again.
+ */
+ if (rec->overwrite_evt_state == OVERWRITE_EVT_RUNNING)
+ continue;
trigger_ready(&switch_output_trigger);
+ /*
+ * Reenable events in overwrite ring buffer after
+ * record__mmap_read_all(): we should have collected
+ * data from it.
+ */
+ record__toggle_overwrite_evsels(rec, OVERWRITE_EVT_RUNNING);
+
if (!quiet)
fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
waking);
--
1.8.3.4
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH v13 5/8] perf record: Read from overwritable ring buffer
2016-07-04 6:20 ` [PATCH v13 5/8] perf record: Read from overwritable ring buffer Wang Nan
@ 2016-07-06 11:37 ` Jiri Olsa
2016-07-06 11:38 ` Jiri Olsa
1 sibling, 0 replies; 20+ messages in thread
From: Jiri Olsa @ 2016-07-06 11:37 UTC (permalink / raw)
To: Wang Nan
Cc: acme, linux-kernel, pi3orama, lizefan, He Kuang,
Arnaldo Carvalho de Melo, Jiri Olsa, Masami Hiramatsu,
Namhyung Kim, Nilay Vaish
On Mon, Jul 04, 2016 at 06:20:06AM +0000, Wang Nan wrote:
SNIP
> @@ -463,6 +488,7 @@ try_again:
>
> session->evlist = evlist;
> perf_session__set_id_hdr_size(session);
> + rec->overwrite_evt_state = OVERWRITE_EVT_RUNNING;
> out:
> return rc;
> }
> @@ -543,6 +569,79 @@ static struct perf_event_header finished_round_event = {
> .type = PERF_RECORD_FINISHED_ROUND,
> };
>
> +static void
> +record__toggle_overwrite_evsels(struct record *rec,
> + enum overwrite_evt_state state)
> +{
record__toggle_overwrite_evlist might be a better name
jirka
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v13 5/8] perf record: Read from overwritable ring buffer
2016-07-04 6:20 ` [PATCH v13 5/8] perf record: Read from overwritable ring buffer Wang Nan
2016-07-06 11:37 ` Jiri Olsa
@ 2016-07-06 11:38 ` Jiri Olsa
2016-07-06 12:03 ` Wangnan (F)
1 sibling, 1 reply; 20+ messages in thread
From: Jiri Olsa @ 2016-07-06 11:38 UTC (permalink / raw)
To: Wang Nan
Cc: acme, linux-kernel, pi3orama, lizefan, He Kuang,
Arnaldo Carvalho de Melo, Jiri Olsa, Masami Hiramatsu,
Namhyung Kim, Nilay Vaish
On Mon, Jul 04, 2016 at 06:20:06AM +0000, Wang Nan wrote:
SNIP
> +static void
> +record__toggle_overwrite_evsels(struct record *rec,
> + enum overwrite_evt_state state)
> +{
> + struct perf_evlist *evlist = rec->overwrite_evlist;
> + enum overwrite_evt_state old_state = rec->overwrite_evt_state;
> + enum action {
> + NONE,
> + PAUSE,
> + RESUME,
> + } action = NONE;
> +
> + switch (old_state) {
> + case OVERWRITE_EVT_RUNNING: {
> + switch (state) {
> + case OVERWRITE_EVT_DATA_PENDING:
> + action = PAUSE;
> + break;
> + case OVERWRITE_EVT_RUNNING:
> + case OVERWRITE_EVT_EMPTY:
> + default:
> + goto state_err;
> + }
> + break;
> + }
> + case OVERWRITE_EVT_DATA_PENDING: {
> + switch (state) {
> + case OVERWRITE_EVT_EMPTY:
> + break;
> + case OVERWRITE_EVT_RUNNING:
> + case OVERWRITE_EVT_DATA_PENDING:
> + default:
> + goto state_err;
> + }
> + break;
> + }
> + case OVERWRITE_EVT_EMPTY: {
> + switch (state) {
> + case OVERWRITE_EVT_RUNNING:
> + action = RESUME;
> + break;
> + case OVERWRITE_EVT_EMPTY:
> + case OVERWRITE_EVT_DATA_PENDING:
> + default:
> + goto state_err;
> + }
> + break;
> + }
> + default:
> + WARN_ONCE(1, "Shouldn't get there\n");
> + }
> +
> + rec->overwrite_evt_state = state;
> +
> + if (!evlist)
> + return;
I'd expect this check at the beginning
jirka
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v13 5/8] perf record: Read from overwritable ring buffer
2016-07-06 11:38 ` Jiri Olsa
@ 2016-07-06 12:03 ` Wangnan (F)
2016-07-06 12:34 ` Jiri Olsa
0 siblings, 1 reply; 20+ messages in thread
From: Wangnan (F) @ 2016-07-06 12:03 UTC (permalink / raw)
To: Jiri Olsa
Cc: acme, linux-kernel, pi3orama, lizefan, He Kuang,
Arnaldo Carvalho de Melo, Jiri Olsa, Masami Hiramatsu,
Namhyung Kim, Nilay Vaish
On 2016/7/6 19:38, Jiri Olsa wrote:
> On Mon, Jul 04, 2016 at 06:20:06AM +0000, Wang Nan wrote:
>
> SNIP
>
>> +static void
>> +record__toggle_overwrite_evsels(struct record *rec,
>> + enum overwrite_evt_state state)
>> +{
>> + struct perf_evlist *evlist = rec->overwrite_evlist;
>> + enum overwrite_evt_state old_state = rec->overwrite_evt_state;
>> + enum action {
>> + NONE,
>> + PAUSE,
>> + RESUME,
>> + } action = NONE;
>> +
>> + switch (old_state) {
>> + case OVERWRITE_EVT_RUNNING: {
>> + switch (state) {
>> + case OVERWRITE_EVT_DATA_PENDING:
>> + action = PAUSE;
>> + break;
>> + case OVERWRITE_EVT_RUNNING:
>> + case OVERWRITE_EVT_EMPTY:
>> + default:
>> + goto state_err;
>> + }
>> + break;
>> + }
>> + case OVERWRITE_EVT_DATA_PENDING: {
>> + switch (state) {
>> + case OVERWRITE_EVT_EMPTY:
>> + break;
>> + case OVERWRITE_EVT_RUNNING:
>> + case OVERWRITE_EVT_DATA_PENDING:
>> + default:
>> + goto state_err;
>> + }
>> + break;
>> + }
>> + case OVERWRITE_EVT_EMPTY: {
>> + switch (state) {
>> + case OVERWRITE_EVT_RUNNING:
>> + action = RESUME;
>> + break;
>> + case OVERWRITE_EVT_EMPTY:
>> + case OVERWRITE_EVT_DATA_PENDING:
>> + default:
>> + goto state_err;
>> + }
>> + break;
>> + }
>> + default:
>> + WARN_ONCE(1, "Shouldn't get there\n");
>> + }
>> +
>> + rec->overwrite_evt_state = state;
>> +
>> + if (!evlist)
>> + return;
> I'd expect this check at the beginning
I think the state change is still required even if evlist is NULL.
Actually, the state machine is independent of the aux evlist. Even
without overwritable evsels the state machine is still valid.
So let the state machine run unconditionally.
> jirka
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v13 5/8] perf record: Read from overwritable ring buffer
2016-07-06 12:03 ` Wangnan (F)
@ 2016-07-06 12:34 ` Jiri Olsa
2016-07-07 4:59 ` Wangnan (F)
0 siblings, 1 reply; 20+ messages in thread
From: Jiri Olsa @ 2016-07-06 12:34 UTC (permalink / raw)
To: Wangnan (F)
Cc: acme, linux-kernel, pi3orama, lizefan, He Kuang,
Arnaldo Carvalho de Melo, Jiri Olsa, Masami Hiramatsu,
Namhyung Kim, Nilay Vaish
On Wed, Jul 06, 2016 at 08:03:28PM +0800, Wangnan (F) wrote:
>
>
> On 2016/7/6 19:38, Jiri Olsa wrote:
> > On Mon, Jul 04, 2016 at 06:20:06AM +0000, Wang Nan wrote:
> >
> > SNIP
> >
> > > +static void
> > > +record__toggle_overwrite_evsels(struct record *rec,
> > > + enum overwrite_evt_state state)
> > > +{
> > > + struct perf_evlist *evlist = rec->overwrite_evlist;
> > > + enum overwrite_evt_state old_state = rec->overwrite_evt_state;
> > > + enum action {
> > > + NONE,
> > > + PAUSE,
> > > + RESUME,
> > > + } action = NONE;
> > > +
> > > + switch (old_state) {
> > > + case OVERWRITE_EVT_RUNNING: {
> > > + switch (state) {
> > > + case OVERWRITE_EVT_DATA_PENDING:
> > > + action = PAUSE;
> > > + break;
> > > + case OVERWRITE_EVT_RUNNING:
> > > + case OVERWRITE_EVT_EMPTY:
> > > + default:
> > > + goto state_err;
> > > + }
> > > + break;
> > > + }
> > > + case OVERWRITE_EVT_DATA_PENDING: {
> > > + switch (state) {
> > > + case OVERWRITE_EVT_EMPTY:
> > > + break;
> > > + case OVERWRITE_EVT_RUNNING:
> > > + case OVERWRITE_EVT_DATA_PENDING:
> > > + default:
> > > + goto state_err;
> > > + }
> > > + break;
> > > + }
> > > + case OVERWRITE_EVT_EMPTY: {
> > > + switch (state) {
> > > + case OVERWRITE_EVT_RUNNING:
> > > + action = RESUME;
> > > + break;
> > > + case OVERWRITE_EVT_EMPTY:
> > > + case OVERWRITE_EVT_DATA_PENDING:
> > > + default:
> > > + goto state_err;
> > > + }
> > > + break;
> > > + }
> > > + default:
> > > + WARN_ONCE(1, "Shouldn't get there\n");
> > > + }
> > > +
> > > + rec->overwrite_evt_state = state;
> > > +
> > > + if (!evlist)
> > > + return;
> > > I'd expect this check at the beginning
>
> I think even evlist is NULL the state changing is still required.
> Actually, the state machine is independent with aux evlist. Even
> we without overwritable evsels the state machine is still valid.
> So let the state machine runs unconditionally.
hum, can't see that.. it's state machine to govern overwrite evlist, right?
if there's no overwrite evlist we should keep the current processing
if it's meant to govern the mmap reading in general
we should at least rename it
jirka
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v13 5/8] perf record: Read from overwritable ring buffer
2016-07-06 12:34 ` Jiri Olsa
@ 2016-07-07 4:59 ` Wangnan (F)
0 siblings, 0 replies; 20+ messages in thread
From: Wangnan (F) @ 2016-07-07 4:59 UTC (permalink / raw)
To: Jiri Olsa
Cc: acme, linux-kernel, pi3orama, lizefan, He Kuang,
Arnaldo Carvalho de Melo, Jiri Olsa, Masami Hiramatsu,
Namhyung Kim, Nilay Vaish
On 2016/7/6 20:34, Jiri Olsa wrote:
> On Wed, Jul 06, 2016 at 08:03:28PM +0800, Wangnan (F) wrote:
>>
>> On 2016/7/6 19:38, Jiri Olsa wrote:
>>> On Mon, Jul 04, 2016 at 06:20:06AM +0000, Wang Nan wrote:
>>>
>>> SNIP
>>>
>>>> +static void
>>>> +record__toggle_overwrite_evsels(struct record *rec,
>>>> + enum overwrite_evt_state state)
>>>> +{
>>>> + struct perf_evlist *evlist = rec->overwrite_evlist;
>>>> + enum overwrite_evt_state old_state = rec->overwrite_evt_state;
>>>> + enum action {
>>>> + NONE,
>>>> + PAUSE,
>>>> + RESUME,
>>>> + } action = NONE;
>>>> +
>>>> + switch (old_state) {
>>>> + case OVERWRITE_EVT_RUNNING: {
>>>> + switch (state) {
>>>> + case OVERWRITE_EVT_DATA_PENDING:
>>>> + action = PAUSE;
>>>> + break;
>>>> + case OVERWRITE_EVT_RUNNING:
>>>> + case OVERWRITE_EVT_EMPTY:
>>>> + default:
>>>> + goto state_err;
>>>> + }
>>>> + break;
>>>> + }
>>>> + case OVERWRITE_EVT_DATA_PENDING: {
>>>> + switch (state) {
>>>> + case OVERWRITE_EVT_EMPTY:
>>>> + break;
>>>> + case OVERWRITE_EVT_RUNNING:
>>>> + case OVERWRITE_EVT_DATA_PENDING:
>>>> + default:
>>>> + goto state_err;
>>>> + }
>>>> + break;
>>>> + }
>>>> + case OVERWRITE_EVT_EMPTY: {
>>>> + switch (state) {
>>>> + case OVERWRITE_EVT_RUNNING:
>>>> + action = RESUME;
>>>> + break;
>>>> + case OVERWRITE_EVT_EMPTY:
>>>> + case OVERWRITE_EVT_DATA_PENDING:
>>>> + default:
>>>> + goto state_err;
>>>> + }
>>>> + break;
>>>> + }
>>>> + default:
>>>> + WARN_ONCE(1, "Shouldn't get there\n");
>>>> + }
>>>> +
>>>> + rec->overwrite_evt_state = state;
>>>> +
>>>> + if (!evlist)
>>>> + return;
>>> I'd expect this check at the beginning
>> I think even evlist is NULL the state changing is still required.
>> Actually, the state machine is independent with aux evlist. Even
>> we without overwritable evsels the state machine is still valid.
>> So let the state machine runs unconditionally.
> hum, can't see that.. it's state machine to govern overwrite evlist, right?
> if there's no overwrite evlist we should keep the current processing
Not as easy as I thought. Look at following code:
>@@ -1006,8 +1122,27 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
> }
>
> if (trigger_is_hit(&switch_output_trigger)) {
>+ /*
>+ * If switch_output_trigger is hit, the data in
>+ * overwritable ring buffer should have been collected,
>+ * so overwrite_evt_state should be set to
>+ * OVERWRITE_EVT_EMPTY.
>+ *
>+ * If SIGUSR2 raise after or during record__mmap_read_all(),
>+ * record__mmap_read_all() didn't collect data from
>+ * overwritable ring buffer. Read again.
>+ */
>+ if (rec->overwrite_evt_state == OVERWRITE_EVT_RUNNING)
>+ continue;
> trigger_ready(&switch_output_trigger);
>
>+ /*
>+ * Reenable events in overwrite ring buffer after
>+ * record__mmap_read_all(): we should have collected
>+ * data from it.
>+ */
>+ record__toggle_overwrite_evsels(rec, OVERWRITE_EVT_RUNNING);
>+
> if (!quiet)
> fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
> waking);
Here perf tests whether reading from the overwritable ring buffer is
required. If SIGUSR2 is received just before the above trigger_is_hit
check, we should read from the overwritable ring buffer again. The
OVERWRITE_EVT_RUNNING check is there for this reason.
Now if we stop the state machine, the state stays at OVERWRITE_EVT_RUNNING,
which causes perf to loop forever.
We can check rec->overwrite_evlist first, but it is ugly, since I believe
the overwritable state is independent of the overwrite evlist. So I have
decided to introduce a new state indicating that the overwrite evlist is
not ready.
Thank you.
> if it's meant to govern the mmap reading in general
> we should at least rename it
> jirka
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH v13 6/8] perf tools: Enable overwrite settings
2016-07-04 6:20 [PATCH v13 0/8] perf tools: Support overwritable ring buffer Wang Nan
` (4 preceding siblings ...)
2016-07-04 6:20 ` [PATCH v13 5/8] perf record: Read from overwritable ring buffer Wang Nan
@ 2016-07-04 6:20 ` Wang Nan
2016-07-04 6:20 ` [PATCH v13 7/8] perf tools: Don't warn about out of order event if write_backward is used Wang Nan
2016-07-04 6:20 ` [PATCH v13 8/8] perf tools: Add --tail-synthesize option Wang Nan
7 siblings, 0 replies; 20+ messages in thread
From: Wang Nan @ 2016-07-04 6:20 UTC (permalink / raw)
To: acme, jolsa
Cc: linux-kernel, pi3orama, lizefan, Wang Nan, He Kuang,
Arnaldo Carvalho de Melo, Jiri Olsa, Masami Hiramatsu,
Namhyung Kim, Nilay Vaish
This patch allows the following option and config terms:
Globally set all events to overwrite:
# perf record --overwrite ...
Set specific events to be overwrite or no-overwrite:
# perf record --event cycles/overwrite/ ...
# perf record --event cycles/no-overwrite/ ...
Add the missing config terms and update the config term array size,
because the length of the longest term name has changed.
For overwritable events, automatically select attr.write_backward since
perf requires it to be backward for reading.
Test result:
# perf record --overwrite -e syscalls:*enter_nanosleep* usleep 1
[ perf record: Woken up 2 times to write data ]
[ perf record: Captured and wrote 0.011 MB perf.data (1 samples) ]
# perf evlist -v
syscalls:sys_enter_nanosleep: type: 2, size: 112, config: 0x134, { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CPU|PERIOD|RAW, disabled: 1, inherit: 1, mmap: 1, comm: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, write_backward: 1
# Tip: use 'perf evlist --trace-fields' to show fields for tracepoint events
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: He Kuang <hekuang@huawei.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Zefan Li <lizefan@huawei.com>
Cc: Nilay Vaish <nilayvaish@gmail.com>
Cc: pi3orama@163.com
---
tools/perf/Documentation/perf-record.txt | 14 ++++++++++++++
tools/perf/builtin-record.c | 1 +
tools/perf/perf.h | 1 +
tools/perf/tests/backward-ring-buffer.c | 14 ++++++--------
tools/perf/util/evsel.c | 4 ++++
tools/perf/util/evsel.h | 2 ++
tools/perf/util/parse-events.c | 20 ++++++++++++++++++--
tools/perf/util/parse-events.h | 2 ++
tools/perf/util/parse-events.l | 2 ++
9 files changed, 50 insertions(+), 10 deletions(-)
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 5b46b1d..384c630 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -367,6 +367,20 @@ options.
'perf record --dry-run -e' can act as a BPF script compiler if llvm.dump-obj
in config file is set to true.
+--overwrite::
+Makes all events use an overwritable ring buffer. An overwritable ring
+buffer works like a flight recorder: when it gets full, the kernel will
+overwrite the oldest records, that thus will never make it to the
+perf.data file.
+
+When '--overwrite' and '--switch-output' are used perf records and drops
+events until it receives a signal, meaning that something unusual was
+detected that warrants taking a snapshot of the most current events,
+those fitting in the ring buffer at that moment.
+
+'overwrite' attribute can also be set or canceled for an event using
+config terms. For example: 'cycles/overwrite/' and 'instructions/no-overwrite/'.
+
SEE ALSO
--------
linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 2a1b3c0..19f13ba 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1540,6 +1540,7 @@ struct option __record_options[] = {
OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
&record.opts.no_inherit_set,
"child tasks do not inherit counters"),
+ OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
"number of mmap data pages and AUX area tracing mmap pages",
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index cd8f1b1..608b42b 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -59,6 +59,7 @@ struct record_opts {
bool record_switch_events;
bool all_kernel;
bool all_user;
+ bool overwrite;
unsigned int freq;
unsigned int mmap_pages;
unsigned int auxtrace_mmap_pages;
diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c
index db7393c..c0618c7 100644
--- a/tools/perf/tests/backward-ring-buffer.c
+++ b/tools/perf/tests/backward-ring-buffer.c
@@ -132,26 +132,24 @@ int test__backward_ring_buffer(int subtest __maybe_unused)
}
bzero(&parse_error, sizeof(parse_error));
- err = parse_events(evlist, "syscalls:sys_enter_prctl", &parse_error);
+ /*
+ * Set backward bit, ring buffer should be writing from end. Record
+ * it in aux evlist
+ */
+ err = parse_events(evlist, "syscalls:sys_enter_prctl/overwrite/", &parse_error);
if (err) {
pr_debug("Failed to parse tracepoint event, try use root\n");
ret = TEST_SKIP;
goto out_delete_evlist;
}
- /*
- * Set backward bit, ring buffer should be writing from end. Record
- * it in aux evlist
- */
- perf_evlist__last(evlist)->attr.write_backward = 1;
-
+ /* Don't set backward bit for exit event. Record it in main evlist */
err = parse_events(evlist, "syscalls:sys_exit_prctl", &parse_error);
if (err) {
pr_debug("Failed to parse tracepoint event, try use root\n");
ret = TEST_SKIP;
goto out_delete_evlist;
}
- /* Don't set backward bit for exit event. Record it in main evlist */
perf_evlist__config(evlist, &opts, NULL);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 3abe519..b5ed8cc 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -677,6 +677,9 @@ static void apply_config_terms(struct perf_evsel *evsel,
*/
attr->inherit = term->val.inherit ? 1 : 0;
break;
+ case PERF_EVSEL__CONFIG_TERM_OVERWRITE:
+ attr->write_backward = term->val.overwrite ? 1 : 0;
+ break;
default:
break;
}
@@ -758,6 +761,7 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts,
attr->sample_id_all = perf_missing_features.sample_id_all ? 0 : 1;
attr->inherit = !opts->no_inherit;
+ attr->write_backward = opts->overwrite ? 1 : 0;
perf_evsel__set_sample_bit(evsel, IP);
perf_evsel__set_sample_bit(evsel, TID);
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index a31ee2d..87912b6 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -45,6 +45,7 @@ enum {
PERF_EVSEL__CONFIG_TERM_STACK_USER,
PERF_EVSEL__CONFIG_TERM_INHERIT,
PERF_EVSEL__CONFIG_TERM_MAX_STACK,
+ PERF_EVSEL__CONFIG_TERM_OVERWRITE,
PERF_EVSEL__CONFIG_TERM_MAX,
};
@@ -59,6 +60,7 @@ struct perf_evsel_config_term {
u64 stack_user;
int max_stack;
bool inherit;
+ bool overwrite;
} val;
};
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index ebd87b7..4801b8b 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -901,6 +901,8 @@ static const char *config_term_names[__PARSE_EVENTS__TERM_TYPE_NR] = {
[PARSE_EVENTS__TERM_TYPE_NOINHERIT] = "no-inherit",
[PARSE_EVENTS__TERM_TYPE_INHERIT] = "inherit",
[PARSE_EVENTS__TERM_TYPE_MAX_STACK] = "max-stack",
+ [PARSE_EVENTS__TERM_TYPE_OVERWRITE] = "overwrite",
+ [PARSE_EVENTS__TERM_TYPE_NOOVERWRITE] = "no-overwrite",
};
static bool config_term_shrinked;
@@ -993,6 +995,12 @@ do { \
case PARSE_EVENTS__TERM_TYPE_NOINHERIT:
CHECK_TYPE_VAL(NUM);
break;
+ case PARSE_EVENTS__TERM_TYPE_OVERWRITE:
+ CHECK_TYPE_VAL(NUM);
+ break;
+ case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE:
+ CHECK_TYPE_VAL(NUM);
+ break;
case PARSE_EVENTS__TERM_TYPE_NAME:
CHECK_TYPE_VAL(STR);
break;
@@ -1045,6 +1053,8 @@ static int config_term_tracepoint(struct perf_event_attr *attr,
case PARSE_EVENTS__TERM_TYPE_INHERIT:
case PARSE_EVENTS__TERM_TYPE_NOINHERIT:
case PARSE_EVENTS__TERM_TYPE_MAX_STACK:
+ case PARSE_EVENTS__TERM_TYPE_OVERWRITE:
+ case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE:
return config_term_common(attr, term, err);
default:
if (err) {
@@ -1117,6 +1127,12 @@ do { \
case PARSE_EVENTS__TERM_TYPE_MAX_STACK:
ADD_CONFIG_TERM(MAX_STACK, max_stack, term->val.num);
break;
+ case PARSE_EVENTS__TERM_TYPE_OVERWRITE:
+ ADD_CONFIG_TERM(OVERWRITE, overwrite, term->val.num ? 1 : 0);
+ break;
+ case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE:
+ ADD_CONFIG_TERM(OVERWRITE, overwrite, term->val.num ? 0 : 1);
+ break;
default:
break;
}
@@ -2330,9 +2346,9 @@ static void config_terms_list(char *buf, size_t buf_sz)
char *parse_events_formats_error_string(char *additional_terms)
{
char *str;
- /* "branch_type" is the longest name */
+ /* "no-overwrite" is the longest name */
char static_terms[__PARSE_EVENTS__TERM_TYPE_NR *
- (sizeof("branch_type") - 1)];
+ (sizeof("no-overwrite") - 1)];
config_terms_list(static_terms, sizeof(static_terms));
/* valid terms */
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 46c05cc..1b04d82 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -69,6 +69,8 @@ enum {
PARSE_EVENTS__TERM_TYPE_NOINHERIT,
PARSE_EVENTS__TERM_TYPE_INHERIT,
PARSE_EVENTS__TERM_TYPE_MAX_STACK,
+ PARSE_EVENTS__TERM_TYPE_NOOVERWRITE,
+ PARSE_EVENTS__TERM_TYPE_OVERWRITE,
__PARSE_EVENTS__TERM_TYPE_NR,
};
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index 3c15b33..7a25194 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -202,6 +202,8 @@ stack-size { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_STACKSIZE); }
max-stack { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_MAX_STACK); }
inherit { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_INHERIT); }
no-inherit { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NOINHERIT); }
+overwrite { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_OVERWRITE); }
+no-overwrite { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NOOVERWRITE); }
, { return ','; }
"/" { BEGIN(INITIAL); return '/'; }
{name_minus} { return str(yyscanner, PE_NAME); }
--
1.8.3.4
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH v13 7/8] perf tools: Don't warn about out of order event if write_backward is used
2016-07-04 6:20 [PATCH v13 0/8] perf tools: Support overwritable ring buffer Wang Nan
` (5 preceding siblings ...)
2016-07-04 6:20 ` [PATCH v13 6/8] perf tools: Enable overwrite settings Wang Nan
@ 2016-07-04 6:20 ` Wang Nan
2016-07-04 6:20 ` [PATCH v13 8/8] perf tools: Add --tail-synthesize option Wang Nan
7 siblings, 0 replies; 20+ messages in thread
From: Wang Nan @ 2016-07-04 6:20 UTC (permalink / raw)
To: acme, jolsa
Cc: linux-kernel, pi3orama, lizefan, Wang Nan, He Kuang,
Arnaldo Carvalho de Melo, Jiri Olsa, Masami Hiramatsu,
Namhyung Kim, Nilay Vaish
If the write_backward attribute is set, records are written into the kernel
ring buffer from end to beginning, but read from beginning to end.
To avoid the 'XX out of order events recorded' warning message (timestamps
of records are in reverse order when using write_backward), suppress the
warning message if write_backward is selected by at least one event.
Result:
Before this patch:
# perf record -m 1 -e raw_syscalls:sys_exit/overwrite/ \
-e raw_syscalls:sys_enter \
dd if=/dev/zero of=/dev/null count=300
300+0 records in
300+0 records out
153600 bytes (154 kB) copied, 0.000601617 s, 255 MB/s
[ perf record: Woken up 5 times to write data ]
Warning:
40 out of order events recorded.
[ perf record: Captured and wrote 0.096 MB perf.data (696 samples) ]
After this patch:
# perf record -m 1 -e raw_syscalls:sys_exit/overwrite/ \
-e raw_syscalls:sys_enter \
dd if=/dev/zero of=/dev/null count=300
300+0 records in
300+0 records out
153600 bytes (154 kB) copied, 0.000644873 s, 238 MB/s
[ perf record: Woken up 5 times to write data ]
[ perf record: Captured and wrote 0.096 MB perf.data (696 samples) ]
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: He Kuang <hekuang@huawei.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Zefan Li <lizefan@huawei.com>
Cc: Nilay Vaish <nilayvaish@gmail.com>
Cc: pi3orama@163.com
---
tools/perf/util/session.c | 22 +++++++++++++++++++---
1 file changed, 19 insertions(+), 3 deletions(-)
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 078d496..5d61242 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1499,10 +1499,27 @@ int perf_session__register_idle_thread(struct perf_session *session)
return err;
}
+static void
+perf_session__warn_order(const struct perf_session *session)
+{
+ const struct ordered_events *oe = &session->ordered_events;
+ struct perf_evsel *evsel;
+ bool should_warn = true;
+
+ evlist__for_each_entry(session->evlist, evsel) {
+ if (evsel->attr.write_backward)
+ should_warn = false;
+ }
+
+ if (!should_warn)
+ return;
+ if (oe->nr_unordered_events != 0)
+ ui__warning("%u out of order events recorded.\n", oe->nr_unordered_events);
+}
+
static void perf_session__warn_about_errors(const struct perf_session *session)
{
const struct events_stats *stats = &session->evlist->stats;
- const struct ordered_events *oe = &session->ordered_events;
if (session->tool->lost == perf_event__process_lost &&
stats->nr_events[PERF_RECORD_LOST] != 0) {
@@ -1559,8 +1576,7 @@ static void perf_session__warn_about_errors(const struct perf_session *session)
stats->nr_unprocessable_samples);
}
- if (oe->nr_unordered_events != 0)
- ui__warning("%u out of order events recorded.\n", oe->nr_unordered_events);
+ perf_session__warn_order(session);
events_stats__auxtrace_error_warn(stats);
--
1.8.3.4
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH v13 8/8] perf tools: Add --tail-synthesize option
2016-07-04 6:20 [PATCH v13 0/8] perf tools: Support overwritable ring buffer Wang Nan
` (6 preceding siblings ...)
2016-07-04 6:20 ` [PATCH v13 7/8] perf tools: Don't warn about out of order event if write_backward is used Wang Nan
@ 2016-07-04 6:20 ` Wang Nan
7 siblings, 0 replies; 20+ messages in thread
From: Wang Nan @ 2016-07-04 6:20 UTC (permalink / raw)
To: acme, jolsa
Cc: linux-kernel, pi3orama, lizefan, Wang Nan, He Kuang,
Arnaldo Carvalho de Melo, Jiri Olsa, Masami Hiramatsu,
Namhyung Kim, Nilay Vaish
When working with an overwritable ring buffer there is an inconvenience:
if perf dumps data a long time after it starts, non-sample events may
have been lost, which makes a following 'perf report' unable to identify
the proc name and mmap layout. For example:
# perf record -m 4 -e raw_syscalls:* -g --overwrite --switch-output \
dd if=/dev/zero of=/dev/null
Send SIGUSR2 after dd has run long enough. The resulting perf.data has lost
the correct comm and mmap events:
# perf script -i perf.data.2016061522374354
perf 24478 [004] 2581325.601789: raw_syscalls:sys_exit: NR 0 = 512
^^^^
Should be 'dd'
27b2e8 syscall_slow_exit_work+0xfe2000e3 (/lib/modules/4.6.0-rc3+/build/vmlinux)
203cc7 do_syscall_64+0xfe200117 (/lib/modules/4.6.0-rc3+/build/vmlinux)
b18d83 return_from_SYSCALL_64+0xfe200000 (/lib/modules/4.6.0-rc3+/build/vmlinux)
7f47c417edf0 [unknown] ([unknown])
^^^^^^^^^^^^
Fail to unwind
This patch provides a '--tail-synthesize' option, which allows perf to collect
system status when finalizing the output file. In the resulting output file, the
non-sample events reflect system status when dumping data.
After this patch:
# perf record -m 4 -e raw_syscalls:* -g --overwrite --switch-output --tail-synthesize \
dd if=/dev/zero of=/dev/null
# perf script -i perf.data.2016061600544998
dd 27364 [004] 2583244.994464: raw_syscalls:sys_enter: NR 1 (1, ...
^^
Correct comm
203a18 syscall_trace_enter_phase2+0xfe2001a8 ([kernel.kallsyms])
203aa5 syscall_trace_enter+0xfe200055 ([kernel.kallsyms])
203caa do_syscall_64+0xfe2000fa ([kernel.kallsyms])
b18d83 return_from_SYSCALL_64+0xfe200000 ([kernel.kallsyms])
d8e50 __GI___libc_write+0xffff01d9639f4010 (/tmp/oxygen_root-w00229757/lib64/libc-2.18.so)
^^^^^
Correct unwind
This option doesn't aim to solve this problem completely. If a process
terminates before SIGUSR2, we still lose its COMM and MMAP events. For
example, we can't unwind correctly from the final perf.data we get from
the previous example, because when perf collects the final output file
(when we press C-c), 'dd' has been terminated so its
'/proc/<pid>/maps' becomes empty. However, this is a cheaper choice. To
completely solve this problem we need to continuously output non-sample
events. To satisfy the requirement of daemonization, we need to merge
them periodically. It is possible but requires much more code and cycles.
Automatically select --tail-synthesize when --overwrite is provided.
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Zefan Li <lizefan@huawei.com>
Cc: Nilay Vaish <nilayvaish@gmail.com>
Cc: pi3orama@163.com
---
tools/perf/Documentation/perf-record.txt | 8 ++++++++
tools/perf/builtin-record.c | 31 +++++++++++++++++++++++++------
tools/perf/perf.h | 1 +
3 files changed, 34 insertions(+), 6 deletions(-)
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 384c630..69966ab 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -367,6 +367,12 @@ options.
'perf record --dry-run -e' can act as a BPF script compiler if llvm.dump-obj
in config file is set to true.
+--tail-synthesize::
+Instead of collecting non-sample events (for example, fork, comm, mmap) at
+the beginning of record, collect them during finalizing an output file.
+The collected non-sample events reflect the status of the system when
+record is finished.
+
--overwrite::
Makes all events use an overwritable ring buffer. An overwritable ring
buffer works like a flight recorder: when it gets full, the kernel will
@@ -381,6 +387,8 @@ those fitting in the ring buffer at that moment.
'overwrite' attribute can also be set or canceled for an event using
config terms. For example: 'cycles/overwrite/' and 'instructions/no-overwrite/'.
+Implies --tail-synthesize.
+
SEE ALSO
--------
linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 19f13ba..7d0a726 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -742,13 +742,16 @@ record__finish_output(struct record *rec)
return;
}
-static int record__synthesize_workload(struct record *rec)
+static int record__synthesize_workload(struct record *rec, bool tail)
{
struct {
struct thread_map map;
struct thread_map_data map_data;
} thread_map;
+ if (rec->opts.tail_synthesize != tail)
+ return 0;
+
thread_map.map.nr = 1;
thread_map.map.map[0].pid = rec->evlist->workload.pid;
thread_map.map.map[0].comm = NULL;
@@ -759,7 +762,7 @@ static int record__synthesize_workload(struct record *rec)
rec->opts.proc_map_timeout);
}
-static int record__synthesize(struct record *rec);
+static int record__synthesize(struct record *rec, bool tail);
static int
record__switch_output(struct record *rec, bool at_exit)
@@ -770,6 +773,10 @@ record__switch_output(struct record *rec, bool at_exit)
/* Same Size: "2015122520103046"*/
char timestamp[] = "InvalidTimestamp";
+ record__synthesize(rec, true);
+ if (target__none(&rec->opts.target))
+ record__synthesize_workload(rec, true);
+
rec->samples = 0;
record__finish_output(rec);
err = fetch_current_timestamp(timestamp, sizeof(timestamp));
@@ -792,7 +799,7 @@ record__switch_output(struct record *rec, bool at_exit)
/* Output tracking events */
if (!at_exit) {
- record__synthesize(rec);
+ record__synthesize(rec, false);
/*
* In 'perf record --switch-output' without -a,
@@ -804,7 +811,7 @@ record__switch_output(struct record *rec, bool at_exit)
* perf_event__synthesize_thread_map() for those events.
*/
if (target__none(&rec->opts.target))
- record__synthesize_workload(rec);
+ record__synthesize_workload(rec, false);
}
return fd;
}
@@ -859,7 +866,7 @@ static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
return NULL;
}
-static int record__synthesize(struct record *rec)
+static int record__synthesize(struct record *rec, bool tail)
{
struct perf_session *session = rec->session;
struct machine *machine = &session->machines.host;
@@ -869,6 +876,9 @@ static int record__synthesize(struct record *rec)
int fd = perf_data_file__fd(file);
int err = 0;
+ if (rec->opts.tail_synthesize != tail)
+ return 0;
+
if (file->is_pipe) {
err = perf_event__synthesize_attrs(tool, session,
process_synthesized_event);
@@ -1032,7 +1042,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
machine = &session->machines.host;
- err = record__synthesize(rec);
+ err = record__synthesize(rec, false);
if (err < 0)
goto out_child;
@@ -1197,6 +1207,9 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
if (!quiet)
fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
+ if (target__none(&rec->opts.target))
+ record__synthesize_workload(rec, true);
+
out_child:
if (forks) {
int exit_status;
@@ -1215,6 +1228,7 @@ out_child:
} else
status = err;
+ record__synthesize(rec, true);
/* this will be recalculated during process_buildids() */
rec->samples = 0;
@@ -1540,6 +1554,8 @@ struct option __record_options[] = {
OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
&record.opts.no_inherit_set,
"child tasks do not inherit counters"),
+ OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
+ "synthesize non-sample events at the end of output"),
OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
@@ -1751,6 +1767,9 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
}
}
+ if (record.opts.overwrite)
+ record.opts.tail_synthesize = true;
+
if (rec->evlist->nr_entries == 0 &&
perf_evlist__add_default(rec->evlist) < 0) {
pr_err("Not enough memory for event selector list\n");
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 608b42b..a7e0f14 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -59,6 +59,7 @@ struct record_opts {
bool record_switch_events;
bool all_kernel;
bool all_user;
+ bool tail_synthesize;
bool overwrite;
unsigned int freq;
unsigned int mmap_pages;
--
1.8.3.4
^ permalink raw reply related [flat|nested] 20+ messages in thread