* [PATCH v2] perf/core: fix RCU issues with cgroup monitoring mode
@ 2015-10-27 19:25 Stephane Eranian
  2015-10-27 20:24 ` Eric Dumazet
  0 siblings, 1 reply; 6+ messages in thread
From: Stephane Eranian @ 2015-10-27 19:25 UTC (permalink / raw)
  To: linux-kernel; +Cc: peterz, mingo, ak, edumazet, acme


This patch eliminates all known RCU violations detected
by the RCU checker (PROVE_RCU). The impacted code paths
were all related to cgroup mode monitoring and involved
accessing a task's cgrp.

V2 is updated to include suggestions from PeterZ to eliminate
some of the warnings without grabbing the RCU read lock, because
we know we are already holding the ctx->lock, which prevents
the cgroup from disappearing while we are accessing it.
The trick, as suggested by Peter, is to modify
perf_cgroup_from_task() to take an extra boolean parameter
to allow bypassing the lockdep test in the task_css_check()
macro. This patch uses this approach to update all calls to
perf_cgroup_from_task().
The V2 patch is relative to:
8b3c8e6 Revert "rculist: Make list_entry_rcu() use lockless_dereference()"

Signed-off-by: Stephane Eranian <eranian@google.com>
---
 arch/x86/kernel/cpu/perf_event_intel_cqm.c |  2 +-
 include/linux/perf_event.h                 |  4 ++--
 kernel/events/core.c                       | 27 +++++++++++++++++----------
 3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 377e8f8..d96bbf1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -298,7 +298,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b)
 static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
 {
 	if (event->attach_state & PERF_ATTACH_TASK)
-		return perf_cgroup_from_task(event->hw.target);
+		return perf_cgroup_from_task(event->hw.target, false);
 
 	return event->cgrp;
 }
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d841d33..24f3539 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -697,9 +697,9 @@ struct perf_cgroup {
  * if there is no cgroup event for the current CPU context.
  */
 static inline struct perf_cgroup *
-perf_cgroup_from_task(struct task_struct *task)
+perf_cgroup_from_task(struct task_struct *task, bool safe)
 {
-	return container_of(task_css(task, perf_event_cgrp_id),
+	return container_of(task_css_check(task, perf_event_cgrp_id, safe),
 			    struct perf_cgroup, css);
 }
 #endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ea02109..2003240 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -435,7 +435,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
 	if (!is_cgroup_event(event))
 		return;
 
-	cgrp = perf_cgroup_from_task(current);
+	/* holding ctx->lock, so cgroup access is safe */
+	cgrp = perf_cgroup_from_task(current, true);
 	/*
 	 * Do not update time when cgroup is not active
 	 */
@@ -458,7 +459,8 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 	if (!task || !ctx->nr_cgroups)
 		return;
 
-	cgrp = perf_cgroup_from_task(task);
+	/* holding ctx->lock, so cgroup access is safe */
+	cgrp = perf_cgroup_from_task(task, true);
 	info = this_cpu_ptr(cgrp->info);
 	info->timestamp = ctx->timestamp;
 }
@@ -489,7 +491,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 	 * we reschedule only in the presence of cgroup
 	 * constrained events.
 	 */
-	rcu_read_lock();
 
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
@@ -523,7 +524,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 				 * event_filter_match() to not have to pass
 				 * task around
 				 */
-				cpuctx->cgrp = perf_cgroup_from_task(task);
+				cpuctx->cgrp = perf_cgroup_from_task(task, false);
 				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 			}
 			perf_pmu_enable(cpuctx->ctx.pmu);
@@ -531,8 +532,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 		}
 	}
 
-	rcu_read_unlock();
-
 	local_irq_restore(flags);
 }
 
@@ -542,17 +541,18 @@ static inline void perf_cgroup_sched_out(struct task_struct *task,
 	struct perf_cgroup *cgrp1;
 	struct perf_cgroup *cgrp2 = NULL;
 
+	rcu_read_lock();
 	/*
 	 * we come here when we know perf_cgroup_events > 0
 	 */
-	cgrp1 = perf_cgroup_from_task(task);
+	cgrp1 = perf_cgroup_from_task(task, false);
 
 	/*
 	 * next is NULL when called from perf_event_enable_on_exec()
 	 * that will systematically cause a cgroup_switch()
 	 */
 	if (next)
-		cgrp2 = perf_cgroup_from_task(next);
+		cgrp2 = perf_cgroup_from_task(next, false);
 
 	/*
 	 * only schedule out current cgroup events if we know
@@ -561,6 +561,8 @@ static inline void perf_cgroup_sched_out(struct task_struct *task,
 	 */
 	if (cgrp1 != cgrp2)
 		perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+
+	rcu_read_unlock();
 }
 
 static inline void perf_cgroup_sched_in(struct task_struct *prev,
@@ -569,13 +571,14 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
 	struct perf_cgroup *cgrp1;
 	struct perf_cgroup *cgrp2 = NULL;
 
+	rcu_read_lock();
 	/*
 	 * we come here when we know perf_cgroup_events > 0
 	 */
-	cgrp1 = perf_cgroup_from_task(task);
+	cgrp1 = perf_cgroup_from_task(task, false);
 
 	/* prev can never be NULL */
-	cgrp2 = perf_cgroup_from_task(prev);
+	cgrp2 = perf_cgroup_from_task(prev, false);
 
 	/*
 	 * only need to schedule in cgroup events if we are changing
@@ -584,6 +587,8 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
 	 */
 	if (cgrp1 != cgrp2)
 		perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+
+	rcu_read_unlock();
 }
 
 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
@@ -9442,7 +9447,9 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
 static int __perf_cgroup_move(void *info)
 {
 	struct task_struct *task = info;
+	rcu_read_lock();
 	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+	rcu_read_unlock();
 	return 0;
 }
 
-- 
2.1.4



* Re: [PATCH v2] perf/core: fix RCU issues with cgroup monitoring mode
  2015-10-27 19:25 [PATCH v2] perf/core: fix RCU issues with cgroup monitoring mode Stephane Eranian
@ 2015-10-27 20:24 ` Eric Dumazet
  2015-10-27 22:34   ` Peter Zijlstra
  2015-10-27 23:37   ` Stephane Eranian
  0 siblings, 2 replies; 6+ messages in thread
From: Eric Dumazet @ 2015-10-27 20:24 UTC (permalink / raw)
  To: Stephane Eranian; +Cc: linux-kernel, peterz, mingo, ak, edumazet, acme

On Tue, 2015-10-27 at 20:25 +0100, Stephane Eranian wrote:
> This patch eliminates all known RCU violations detected
> by the RCU checker (PROVE_RCU). The impacted code paths
> were all related to cgroup mode monitoring and involved
> accessing a task's cgrp.
> 
> V2 is updated to include suggestions from PeterZ to eliminate
> some of the warnings without grabbing the RCU read lock, because
> we know we are already holding the ctx->lock, which prevents
> the cgroup from disappearing while we are accessing it.
> The trick, as suggested by Peter, is to modify
> perf_cgroup_from_task() to take an extra boolean parameter
> to allow bypassing the lockdep test in the task_css_check()
> macro. This patch uses this approach to update all calls to
> perf_cgroup_from_task().
> The V2 patch is relative to:
> 8b3c8e6 Revert "rculist: Make list_entry_rcu() use lockless_dereference()"
> 
> Signed-off-by: Stephane Eranian <eranian@google.com>
> ---


Instead of trusting the caller to provide a correct 'safe' boolean,
what about using lockdep_is_held()?

This way, you keep full lockdep support.

A random example is 

#define rcu_dereference_rtnl(p) rcu_dereference_check(p, lockdep_rtnl_is_held())
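
For example, a hypothetical reader that may legitimately run either
under rcu_read_lock() or with RTNL held (my_cfg is a made-up __rcu
field, just to illustrate the pattern):

	/* no PROVE_RCU splat from either kind of caller, and lockdep
	 * still catches callers holding neither protection
	 */
	struct my_cfg *cfg = rcu_dereference_rtnl(priv->my_cfg);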





* Re: [PATCH v2] perf/core: fix RCU issues with cgroup monitoring mode
  2015-10-27 20:24 ` Eric Dumazet
@ 2015-10-27 22:34   ` Peter Zijlstra
  2015-10-27 23:37   ` Stephane Eranian
  1 sibling, 0 replies; 6+ messages in thread
From: Peter Zijlstra @ 2015-10-27 22:34 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Stephane Eranian, linux-kernel, mingo, ak, edumazet, acme

On Tue, Oct 27, 2015 at 01:24:05PM -0700, Eric Dumazet wrote:
> On Tue, 2015-10-27 at 20:25 +0100, Stephane Eranian wrote:
> > This patch eliminates all known RCU violations detected
> > by the RCU checker (PROVE_RCU). The impacted code paths
> > were all related to cgroup mode monitoring and involved
> > accessing a task's cgrp.
> > 
> > V2 is updated to include suggestions from PeterZ to eliminate
> > some of the warnings without grabbing the RCU read lock, because
> > we know we are already holding the ctx->lock, which prevents
> > the cgroup from disappearing while we are accessing it.
> > The trick, as suggested by Peter, is to modify
> > perf_cgroup_from_task() to take an extra boolean parameter
> > to allow bypassing the lockdep test in the task_css_check()
> > macro. This patch uses this approach to update all calls to
> > perf_cgroup_from_task().
> > The V2 patch is relative to:
> > 8b3c8e6 Revert "rculist: Make list_entry_rcu() use lockless_dereference()"
> > 
> > Signed-off-by: Stephane Eranian <eranian@google.com>
> > ---
> 
> 
> Instead of trusting the caller to provide a correct 'safe' boolean,
> what about using lockdep_is_held()?
> 
> This way, you keep full lockdep support.

Because it's really, really hard to get from a random task to the right
lock in this case :/

The connection is something like:

	task <-> cgroup <-> event <-> ctx <-> lock

So for any given task, we need to find its cgroup (easy, but this
already requires knowing the lock), we need to find all events for that
cgroup and locate the one that is for the cpu the task runs on. Then we
need to find its context and see if the lock is taken.

So aside from it being rather hard, there's also a chicken-and-egg
problem.

Now I suppose we can pass the right ctx in, and when !NULL use
lock_is_held() on it. But we'd need to validate that the ctx passed
matches the cgroup, and that's not entirely trivial either.
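
Something like this, say (untested sketch; when no ctx is passed we
fall back to the plain RCU check, otherwise lockdep gets to verify
the lock):

	static inline struct perf_cgroup *
	perf_cgroup_from_task(struct task_struct *task,
			      struct perf_event_context *ctx)
	{
		/* task_css_check() ORs this condition with the usual
		 * RCU read-side checks via rcu_dereference_check()
		 */
		return container_of(task_css_check(task, perf_event_cgrp_id,
					ctx && lockdep_is_held(&ctx->lock)),
				    struct perf_cgroup, css);
	}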

Ah well :/



* Re: [PATCH v2] perf/core: fix RCU issues with cgroup monitoring mode
  2015-10-27 20:24 ` Eric Dumazet
  2015-10-27 22:34   ` Peter Zijlstra
@ 2015-10-27 23:37   ` Stephane Eranian
  2015-10-28  0:17     ` Eric Dumazet
  1 sibling, 1 reply; 6+ messages in thread
From: Stephane Eranian @ 2015-10-27 23:37 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: LKML, Peter Zijlstra, mingo, ak, Eric Dumazet, Arnaldo Carvalho de Melo

On Tue, Oct 27, 2015 at 1:24 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Tue, 2015-10-27 at 20:25 +0100, Stephane Eranian wrote:
>> This patch eliminates all known RCU violations detected
>> by the RCU checker (PROVE_RCU). The impacted code paths
>> were all related to cgroup mode monitoring and involved
>> accessing a task's cgrp.
>>
>> V2 is updated to include suggestions from PeterZ to eliminate
>> some of the warnings without grabbing the RCU read lock, because
>> we know we are already holding the ctx->lock, which prevents
>> the cgroup from disappearing while we are accessing it.
>> The trick, as suggested by Peter, is to modify
>> perf_cgroup_from_task() to take an extra boolean parameter
>> to allow bypassing the lockdep test in the task_css_check()
>> macro. This patch uses this approach to update all calls to
>> perf_cgroup_from_task().
>> The V2 patch is relative to:
>> 8b3c8e6 Revert "rculist: Make list_entry_rcu() use lockless_dereference()"
>>
>> Signed-off-by: Stephane Eranian <eranian@google.com>
>> ---
>
>
> Instead of trusting the caller to provide a correct 'safe' boolean,
> what about using lockdep_is_held()?
>
But lockdep_is_held() is already embedded in the task_css_check() macro.
What we are saying here is that we have another way of ensuring the
cgroup cannot disappear here: RCU lockdep detects that we are not
holding the RCU read lock, but we know this is okay. We are trying
to avoid grabbing the RCU read lock when it is not really needed
to guarantee correct execution. So we are just silencing lockdep
in these particular cases by passing true as the safe argument value.
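
Concretely, the boolean ends up as the extra condition of
rcu_dereference_check() inside task_css_check(), so the PROVE_RCU
splat is roughly gated like this (paraphrase, not the literal macro):

	/* warn only if we hold no RCU read lock AND the caller
	 * did not vouch for the access with safe == true
	 */
	RCU_LOCKDEP_WARN(!safe && !rcu_read_lock_held(),
			 "suspicious task_css_check() usage");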

> This way, you keep full lockdep support.
>
> A random example is
>
> #define rcu_dereference_rtnl(p) rcu_dereference_check(p, lockdep_rtnl_is_held())
>
>
>


* Re: [PATCH v2] perf/core: fix RCU issues with cgroup monitoring mode
  2015-10-27 23:37   ` Stephane Eranian
@ 2015-10-28  0:17     ` Eric Dumazet
  2015-10-28  0:27       ` Stephane Eranian
  0 siblings, 1 reply; 6+ messages in thread
From: Eric Dumazet @ 2015-10-28  0:17 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: LKML, Peter Zijlstra, mingo, ak, Eric Dumazet, Arnaldo Carvalho de Melo

On Tue, 2015-10-27 at 16:37 -0700, Stephane Eranian wrote:

> But lockdep_is_held() is already embedded in the task_css_check() macro.
> What we are saying here is that we have another way of ensuring the
> cgroup cannot disappear here: RCU lockdep detects that we are not
> holding the RCU read lock, but we know this is okay. We are trying
> to avoid grabbing the RCU read lock when it is not really needed
> to guarantee correct execution. So we are just silencing lockdep
> in these particular cases by passing true as the safe argument value.

Sure, but in lockdep terms, the following:

+       /* holding ctx->lock, so cgroup access is safe */
+       cgrp = perf_cgroup_from_task(task, true);

would map to:

cgrp = perf_cgroup_from_task(task, lockdep_is_held(&ctx->lock));

Notice that the comment becomes unnecessary.





* Re: [PATCH v2] perf/core: fix RCU issues with cgroup monitoring mode
  2015-10-28  0:17     ` Eric Dumazet
@ 2015-10-28  0:27       ` Stephane Eranian
  0 siblings, 0 replies; 6+ messages in thread
From: Stephane Eranian @ 2015-10-28  0:27 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: LKML, Peter Zijlstra, mingo, ak, Eric Dumazet, Arnaldo Carvalho de Melo

On Tue, Oct 27, 2015 at 5:17 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Tue, 2015-10-27 at 16:37 -0700, Stephane Eranian wrote:
>
>> But lockdep_is_held() is already embedded in the task_css_check() macro.
>> What we are saying here is that we have another way of ensuring the
>> cgroup cannot disappear here: RCU lockdep detects that we are not
>> holding the RCU read lock, but we know this is okay. We are trying
>> to avoid grabbing the RCU read lock when it is not really needed
>> to guarantee correct execution. So we are just silencing lockdep
>> in these particular cases by passing true as the safe argument value.
>
> Sure, but in lockdep terms, the following:
>
> +       /* holding ctx->lock, so cgroup access is safe */
> +       cgrp = perf_cgroup_from_task(task, true);
>
> would map to:
>
> cgrp = perf_cgroup_from_task(task, lockdep_is_held(&ctx->lock));
>
> Notice that the comment becomes unnecessary.
>
Ok, this is more explicit for sure, and should anything change, we would
catch it. I misunderstood what Eric was proposing. I will modify this
for V3 and test again.
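
For V3, a call site that runs under ctx->lock would then become
something like this (sketch of the direction only, not the final
patch):

	/* lockdep verifies ctx->lock instead of trusting a boolean */
	cgrp = perf_cgroup_from_task(current, ctx);

with perf_cgroup_from_task() taking a struct perf_event_context *
and doing the lockdep_is_held() check itself, as sketched earlier
in the thread.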
Thanks.
