linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] perf: update perf_cgroup time for ancestor cgroup(s)
@ 2018-03-12 16:59 Song Liu
  2018-03-14 21:30 ` Song Liu
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Song Liu @ 2018-03-12 16:59 UTC (permalink / raw)
  To: linux-kernel, peterz, jolsa; +Cc: kernel-team, ephiepark, Song Liu

When a perf_event is attached to parent cgroup, it should count events
for all children cgroups:

 parent_group   <---- perf_event
   \
    - child_group  <---- process(es)

However, in our tests, we found this perf_event cannot report reliable
results. Here is an example case:

  # create cgroups
  mkdir -p /sys/fs/cgroup/p/c
  # start perf for parent group
  perf stat -e instructions -G "p"

  # on another console, run test process in child cgroup:
  stressapptest -s 2 -M 1000 & echo $! > /sys/fs/cgroup/p/c/cgroup.procs

  # after the test process is done, stopping perf in the first console shows

       <not counted>      instructions              p

The instructions event should not be reported as "not counted", because the
process runs in the child cgroup.

We found this is because perf_event->cgrp and cpuctx->cgrp are not
identical, so perf_event->cgrp is not updated properly.

This patch fixes this by updating perf_cgroup properly for ancestor
cgroup(s).

Signed-off-by: Song Liu <songliubraving@fb.com>
Reported-by: Ephraim Park <ephiepark@fb.com>
---
 kernel/events/core.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5789810..6f015ff 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -724,9 +724,14 @@ static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
 
 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 {
-	struct perf_cgroup *cgrp_out = cpuctx->cgrp;
-	if (cgrp_out)
-		__update_cgrp_time(cgrp_out);
+	struct perf_cgroup *cgrp = cpuctx->cgrp;
+	struct cgroup_subsys_state *css;
+
+	if (cgrp)
+		for (css = &cgrp->css; css; css = css->parent) {
+			cgrp = container_of(css, struct perf_cgroup, css);
+			__update_cgrp_time(cgrp);
+		}
 }
 
 static inline void update_cgrp_time_from_event(struct perf_event *event)
@@ -754,6 +759,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 {
 	struct perf_cgroup *cgrp;
 	struct perf_cgroup_info *info;
+	struct cgroup_subsys_state *css;
 
 	/*
 	 * ctx->lock held by caller
@@ -764,8 +770,12 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 		return;
 
 	cgrp = perf_cgroup_from_task(task, ctx);
-	info = this_cpu_ptr(cgrp->info);
-	info->timestamp = ctx->timestamp;
+
+	for (css = &cgrp->css; css; css = css->parent) {
+		cgrp = container_of(css, struct perf_cgroup, css);
+		info = this_cpu_ptr(cgrp->info);
+		info->timestamp = ctx->timestamp;
+	}
 }
 
 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
-- 
2.9.5

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] perf: update perf_cgroup time for ancestor cgroup(s)
  2018-03-12 16:59 [PATCH v2] perf: update perf_cgroup time for ancestor cgroup(s) Song Liu
@ 2018-03-14 21:30 ` Song Liu
  2018-03-16 14:24 ` Peter Zijlstra
  2018-03-20 11:18 ` [tip:perf/urgent] perf/cgroup: Fix child event counting bug tip-bot for Song Liu
  2 siblings, 0 replies; 4+ messages in thread
From: Song Liu @ 2018-03-14 21:30 UTC (permalink / raw)
  To: LKML, peterz, jolsa; +Cc: Kernel Team, Ephraim Park

Dear Peter, 

Could you please share your comments/suggestions on this patch? We would
like to fix this issue in our kernel, as we are using perf events with 
nested cgroups. 

Thanks,
Song

> On Mar 12, 2018, at 9:59 AM, Song Liu <songliubraving@fb.com> wrote:
> 
> When a perf_event is attached to parent cgroup, it should count events
> for all children cgroups:
> 
> parent_group   <---- perf_event
>   \
>    - child_group  <---- process(es)
> 
> However, in our tests, we found this perf_event cannot report reliable
> results. Here is an example case:
> 
>  # create cgroups
>  mkdir -p /sys/fs/cgroup/p/c
>  # start perf for parent group
>  perf stat -e instructions -G "p"
> 
>  # on another console, run test process in child cgroup:
>  stressapptest -s 2 -M 1000 & echo $! > /sys/fs/cgroup/p/c/cgroup.procs
> 
>  # after the test process is done, stop perf in the first console shows
> 
>       <not counted>      instructions              p
> 
> The instruction should not be "not counted" as the process runs in the
> child cgroup.
> 
> We found this is because perf_event->cgrp and cpuctx->cgrp are not
> identical, thus perf_event->cgrp are not updated properly.
> 
> This patch fixes this by updating perf_cgroup properly for ancestor
> cgroup(s).
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> Reported-by: Ephraim Park <ephiepark@fb.com>
> ---
> kernel/events/core.c | 20 +++++++++++++++-----
> 1 file changed, 15 insertions(+), 5 deletions(-)
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 5789810..6f015ff 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -724,9 +724,14 @@ static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
> 
> static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
> {
> -	struct perf_cgroup *cgrp_out = cpuctx->cgrp;
> -	if (cgrp_out)
> -		__update_cgrp_time(cgrp_out);
> +	struct perf_cgroup *cgrp = cpuctx->cgrp;
> +	struct cgroup_subsys_state *css;
> +
> +	if (cgrp)
> +		for (css = &cgrp->css; css; css = css->parent) {
> +			cgrp = container_of(css, struct perf_cgroup, css);
> +			__update_cgrp_time(cgrp);
> +		}
> }
> 
> static inline void update_cgrp_time_from_event(struct perf_event *event)
> @@ -754,6 +759,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
> {
> 	struct perf_cgroup *cgrp;
> 	struct perf_cgroup_info *info;
> +	struct cgroup_subsys_state *css;
> 
> 	/*
> 	 * ctx->lock held by caller
> @@ -764,8 +770,12 @@ perf_cgroup_set_timestamp(struct task_struct *task,
> 		return;
> 
> 	cgrp = perf_cgroup_from_task(task, ctx);
> -	info = this_cpu_ptr(cgrp->info);
> -	info->timestamp = ctx->timestamp;
> +
> +	for (css = &cgrp->css; css; css = css->parent) {
> +		cgrp = container_of(css, struct perf_cgroup, css);
> +		info = this_cpu_ptr(cgrp->info);
> +		info->timestamp = ctx->timestamp;
> +	}
> }
> 
> static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
> -- 
> 2.9.5
> 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] perf: update perf_cgroup time for ancestor cgroup(s)
  2018-03-12 16:59 [PATCH v2] perf: update perf_cgroup time for ancestor cgroup(s) Song Liu
  2018-03-14 21:30 ` Song Liu
@ 2018-03-16 14:24 ` Peter Zijlstra
  2018-03-20 11:18 ` [tip:perf/urgent] perf/cgroup: Fix child event counting bug tip-bot for Song Liu
  2 siblings, 0 replies; 4+ messages in thread
From: Peter Zijlstra @ 2018-03-16 14:24 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-kernel, jolsa, kernel-team, ephiepark

On Mon, Mar 12, 2018 at 09:59:43AM -0700, Song Liu wrote:
> When a perf_event is attached to parent cgroup, it should count events
> for all children cgroups:
> 
>  parent_group   <---- perf_event
>    \
>     - child_group  <---- process(es)
> 
> However, in our tests, we found this perf_event cannot report reliable
> results. Here is an example case:
> 
>   # create cgroups
>   mkdir -p /sys/fs/cgroup/p/c
>   # start perf for parent group
>   perf stat -e instructions -G "p"
> 
>   # on another console, run test process in child cgroup:
>   stressapptest -s 2 -M 1000 & echo $! > /sys/fs/cgroup/p/c/cgroup.procs
> 
>   # after the test process is done, stop perf in the first console shows
> 
>        <not counted>      instructions              p
> 
> The instruction should not be "not counted" as the process runs in the
> child cgroup.
> 
> We found this is because perf_event->cgrp and cpuctx->cgrp are not
> identical, thus perf_event->cgrp are not updated properly.
> 
> This patch fixes this by updating perf_cgroup properly for ancestor
> cgroup(s).
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> Reported-by: Ephraim Park <ephiepark@fb.com>

Yeah, that looks about right, Thanks!

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [tip:perf/urgent] perf/cgroup: Fix child event counting bug
  2018-03-12 16:59 [PATCH v2] perf: update perf_cgroup time for ancestor cgroup(s) Song Liu
  2018-03-14 21:30 ` Song Liu
  2018-03-16 14:24 ` Peter Zijlstra
@ 2018-03-20 11:18 ` tip-bot for Song Liu
  2 siblings, 0 replies; 4+ messages in thread
From: tip-bot for Song Liu @ 2018-03-20 11:18 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: hpa, peterz, mingo, vincent.weaver, songliubraving, acme,
	torvalds, linux-kernel, tglx, kernel-team, eranian,
	alexander.shishkin, jolsa, ephiepark

Commit-ID:  c917e0f259908e75bd2a65877e25f9d90c22c848
Gitweb:     https://git.kernel.org/tip/c917e0f259908e75bd2a65877e25f9d90c22c848
Author:     Song Liu <songliubraving@fb.com>
AuthorDate: Mon, 12 Mar 2018 09:59:43 -0700
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Tue, 20 Mar 2018 08:58:47 +0100

perf/cgroup: Fix child event counting bug

When a perf_event is attached to parent cgroup, it should count events
for all children cgroups:

   parent_group   <---- perf_event
     \
      - child_group  <---- process(es)

However, in our tests, we found this perf_event cannot report reliable
results. Here is an example case:

  # create cgroups
  mkdir -p /sys/fs/cgroup/p/c
  # start perf for parent group
  perf stat -e instructions -G "p"

  # on another console, run test process in child cgroup:
  stressapptest -s 2 -M 1000 & echo $! > /sys/fs/cgroup/p/c/cgroup.procs

  # after the test process is done, stopping perf in the first console shows

       <not counted>      instructions              p

The instructions event should not be reported as "not counted", because the
process runs in the child cgroup.

We found this is because perf_event->cgrp and cpuctx->cgrp are not
identical, so perf_event->cgrp is not updated properly.

This patch fixes this by updating perf_cgroup properly for ancestor
cgroup(s).

Reported-by: Ephraim Park <ephiepark@fb.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <jolsa@redhat.com>
Cc: <kernel-team@fb.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/20180312165943.1057894-1-songliubraving@fb.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4b838470fac4..709a55b9ad97 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -724,9 +724,15 @@ static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
 
 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 {
-	struct perf_cgroup *cgrp_out = cpuctx->cgrp;
-	if (cgrp_out)
-		__update_cgrp_time(cgrp_out);
+	struct perf_cgroup *cgrp = cpuctx->cgrp;
+	struct cgroup_subsys_state *css;
+
+	if (cgrp) {
+		for (css = &cgrp->css; css; css = css->parent) {
+			cgrp = container_of(css, struct perf_cgroup, css);
+			__update_cgrp_time(cgrp);
+		}
+	}
 }
 
 static inline void update_cgrp_time_from_event(struct perf_event *event)
@@ -754,6 +760,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 {
 	struct perf_cgroup *cgrp;
 	struct perf_cgroup_info *info;
+	struct cgroup_subsys_state *css;
 
 	/*
 	 * ctx->lock held by caller
@@ -764,8 +771,12 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 		return;
 
 	cgrp = perf_cgroup_from_task(task, ctx);
-	info = this_cpu_ptr(cgrp->info);
-	info->timestamp = ctx->timestamp;
+
+	for (css = &cgrp->css; css; css = css->parent) {
+		cgrp = container_of(css, struct perf_cgroup, css);
+		info = this_cpu_ptr(cgrp->info);
+		info->timestamp = ctx->timestamp;
+	}
 }
 
 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);

^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2018-03-20 11:18 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-12 16:59 [PATCH v2] perf: update perf_cgroup time for ancestor cgroup(s) Song Liu
2018-03-14 21:30 ` Song Liu
2018-03-16 14:24 ` Peter Zijlstra
2018-03-20 11:18 ` [tip:perf/urgent] perf/cgroup: Fix child event counting bug tip-bot for Song Liu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).