* [RFC PATCH] perf/core: fix cpuctx cgrp warning
@ 2022-03-08 13:59 Chengming Zhou
  2022-03-10  9:25 ` Namhyung Kim
  0 siblings, 1 reply; 5+ messages in thread
From: Chengming Zhou @ 2022-03-08 13:59 UTC (permalink / raw)
  To: peterz, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	namhyung, eranian
  Cc: linux-perf-users, linux-kernel, songmuchun, Chengming Zhou

There is a race problem that can trigger WARN_ON_ONCE(cpuctx->cgrp)
in perf_cgroup_switch().

CPU1					CPU2
(in context_switch)			(attach running task)
perf_cgroup_sched_out(task, next)
	if (cgrp1 != cgrp2) True
					task->cgroups = xxx
					perf_cgroup_attach()
perf_cgroup_sched_in(prev, task)
	if (cgrp1 != cgrp2) False

Commit a8d757ef076f ("perf events: Fix slow and broken cgroup
context switch code") skips the cpuctx switch out/in when the
perf_cgroup of "prev" and "next" are the same.

But the perf_cgroup of a task can change concurrently with
context_switch(). If cgrp1 == cgrp2 in sched_out(), cpuctx won't be
switched out; if the task's perf_cgroup then changes so that
cgrp1 != cgrp2 in sched_in(), cpuctx will be switched in and trigger
WARN_ON_ONCE(cpuctx->cgrp).

Since the perf_cgroup of "prev" and "next" can change at any time, we
first have to merge perf_cgroup_sched_in() into perf_cgroup_sched_out()
so that the condition (cgrp1 == cgrp2) is evaluated on one consistent
snapshot.

We also introduce a percpu "cpu_perf_cgroups" to track the perf_cgroup
currently in use, instead of relying on the unstable perf_cgroup of
"prev", which may not be the cpuctx->cgrp whose cgroup events are
scheduled on this CPU.

Fixes: a8d757ef076f ("perf events: Fix slow and broken cgroup context switch code")
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
 kernel/events/core.c | 95 +++++++++++---------------------------------
 1 file changed, 23 insertions(+), 72 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6859229497b1..f3bc2841141f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -826,6 +826,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 	}
 }
 
+static DEFINE_PER_CPU(struct perf_cgroup *, cpu_perf_cgroups);
 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
 
 #define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
@@ -837,8 +838,9 @@ static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
  * mode SWOUT : schedule out everything
  * mode SWIN : schedule in based on cgroup for next
  */
-static void perf_cgroup_switch(struct task_struct *task, int mode)
+static void perf_cgroup_switch(struct task_struct *task)
 {
+	struct perf_cgroup *cgrp;
 	struct perf_cpu_context *cpuctx, *tmp;
 	struct list_head *list;
 	unsigned long flags;
@@ -849,6 +851,9 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 	 */
 	local_irq_save(flags);
 
+	cgrp = perf_cgroup_from_task(task, NULL);
+	__this_cpu_write(cpu_perf_cgroups, cgrp);
+
 	list = this_cpu_ptr(&cgrp_cpuctx_list);
 	list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
 		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
@@ -856,28 +861,15 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 		perf_pmu_disable(cpuctx->ctx.pmu);
 
-		if (mode & PERF_CGROUP_SWOUT) {
-			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-			/*
-			 * must not be done before ctxswout due
-			 * to event_filter_match() in event_sched_out()
-			 */
-			cpuctx->cgrp = NULL;
-		}
+		cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+		/*
+		 * must not be done before ctxswout due
+		 * to event_filter_match() in event_sched_out()
+		 */
+		cpuctx->cgrp = cgrp;
+
+		cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 
-		if (mode & PERF_CGROUP_SWIN) {
-			WARN_ON_ONCE(cpuctx->cgrp);
-			/*
-			 * set cgrp before ctxsw in to allow
-			 * event_filter_match() to not have to pass
-			 * task around
-			 * we pass the cpuctx->ctx to perf_cgroup_from_task()
-			 * because cgorup events are only per-cpu
-			 */
-			cpuctx->cgrp = perf_cgroup_from_task(task,
-							     &cpuctx->ctx);
-			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
-		}
 		perf_pmu_enable(cpuctx->ctx.pmu);
 		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 	}
@@ -885,8 +877,8 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 	local_irq_restore(flags);
 }
 
-static inline void perf_cgroup_sched_out(struct task_struct *task,
-					 struct task_struct *next)
+static inline void perf_cgroup_sched_switch(struct task_struct *task,
+					    struct task_struct *next)
 {
 	struct perf_cgroup *cgrp1;
 	struct perf_cgroup *cgrp2 = NULL;
@@ -897,7 +889,7 @@ static inline void perf_cgroup_sched_out(struct task_struct *task,
 	 * we do not need to pass the ctx here because we know
 	 * we are holding the rcu lock
 	 */
-	cgrp1 = perf_cgroup_from_task(task, NULL);
+	cgrp1 = __this_cpu_read(cpu_perf_cgroups);
 	cgrp2 = perf_cgroup_from_task(next, NULL);
 
 	/*
@@ -906,33 +898,7 @@ static inline void perf_cgroup_sched_out(struct task_struct *task,
 	 * do no touch the cgroup events.
 	 */
 	if (cgrp1 != cgrp2)
-		perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
-
-	rcu_read_unlock();
-}
-
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
-					struct task_struct *task)
-{
-	struct perf_cgroup *cgrp1;
-	struct perf_cgroup *cgrp2 = NULL;
-
-	rcu_read_lock();
-	/*
-	 * we come here when we know perf_cgroup_events > 0
-	 * we do not need to pass the ctx here because we know
-	 * we are holding the rcu lock
-	 */
-	cgrp1 = perf_cgroup_from_task(task, NULL);
-	cgrp2 = perf_cgroup_from_task(prev, NULL);
-
-	/*
-	 * only need to schedule in cgroup events if we are changing
-	 * cgroup during ctxsw. Cgroup events were not scheduled
-	 * out of ctxsw out if that was not the case.
-	 */
-	if (cgrp1 != cgrp2)
-		perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+		perf_cgroup_switch(task);
 
 	rcu_read_unlock();
 }
@@ -1100,13 +1066,8 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
 {
 }
 
-static inline void perf_cgroup_sched_out(struct task_struct *task,
-					 struct task_struct *next)
-{
-}
-
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
-					struct task_struct *task)
+static inline void perf_cgroup_sched_switch(struct task_struct *task,
+					    struct task_struct *next)
 {
 }
 
@@ -1124,7 +1085,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 }
 
 static inline void
-perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
+perf_cgroup_sched_switch(struct task_struct *task, struct task_struct *next)
 {
 }
 
@@ -3668,7 +3629,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
 	 * cgroup event are system-wide mode only
 	 */
 	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
-		perf_cgroup_sched_out(task, next);
+		perf_cgroup_sched_switch(task, next);
 }
 
 /*
@@ -3984,16 +3945,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 	struct perf_event_context *ctx;
 	int ctxn;
 
-	/*
-	 * If cgroup events exist on this CPU, then we need to check if we have
-	 * to switch in PMU state; cgroup event are system-wide mode only.
-	 *
-	 * Since cgroup events are CPU events, we must schedule these in before
-	 * we schedule in the task events.
-	 */
-	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
-		perf_cgroup_sched_in(prev, task);
-
 	for_each_task_context_nr(ctxn) {
 		ctx = task->perf_event_ctxp[ctxn];
 		if (likely(!ctx))
@@ -13564,7 +13515,7 @@ static int __perf_cgroup_move(void *info)
 {
 	struct task_struct *task = info;
 	rcu_read_lock();
-	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+	perf_cgroup_switch(task);
 	rcu_read_unlock();
 	return 0;
 }
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [RFC PATCH] perf/core: fix cpuctx cgrp warning
  2022-03-08 13:59 [RFC PATCH] perf/core: fix cpuctx cgrp warning Chengming Zhou
@ 2022-03-10  9:25 ` Namhyung Kim
  2022-03-10 12:01   ` [Phishing Risk] [External] " Chengming Zhou
  0 siblings, 1 reply; 5+ messages in thread
From: Namhyung Kim @ 2022-03-10  9:25 UTC (permalink / raw)
  To: Chengming Zhou
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Mark Rutland, Alexander Shishkin, Jiri Olsa, Stephane Eranian,
	linux-perf-users, linux-kernel, songmuchun

Hello,

On Tue, Mar 8, 2022 at 6:00 AM Chengming Zhou
<zhouchengming@bytedance.com> wrote:
>
> There is a race problem that can trigger WARN_ON_ONCE(cpuctx->cgrp)
> in perf_cgroup_switch().
>
> CPU1                                    CPU2
> (in context_switch)                     (attach running task)
> perf_cgroup_sched_out(task, next)
>         if (cgrp1 != cgrp2) True
>                                         task->cgroups = xxx
>                                         perf_cgroup_attach()
> perf_cgroup_sched_in(prev, task)
>         if (cgrp1 != cgrp2) False

But perf_cgroup_switch() will be synchronized since the context switch
disables interrupts.  And right, it can still see task->cgroups
changing in the middle.

>
> The commit a8d757ef076f ("perf events: Fix slow and broken cgroup
> context switch code") would save cpuctx switch in/out when the
> perf_cgroup of "prev" and "next" are the same.
>
> But perf_cgroup of task can change in concurrent with context_switch.
> If cgrp1 == cgrp2 in sched_out(), cpuctx won't do switch out, then
> task perf_cgroup changed cause cgrp1 != cgrp2 in sched_in(), cpuctx
> will do switch in, and trigger WARN_ON_ONCE(cpuctx->cgrp).
>
> The perf_cgroup of "prev" and "next" can be changed at any time, so we
> first have to combine perf_cgroup_sched_in() into perf_cgroup_sched_out(),
> so we can get a consistent value of condition (cgrp1 == cgrp2).
>
> And we introduce a percpu "cpu_perf_cgroups" to track the current used
> perf_cgroup, instead of using the unstable perf_cgroup of "prev", which
> maybe not the cpuctx->cgrp we used to schedule cgroup events on cpu.

Is this really needed?  I think the warning comes because the two
cgroups were the same at sched-out, but they became
different at sched-in.  So just combining sched-in/out should
be ok, shouldn't it?

>
> Fixes: a8d757ef076f ("perf events: Fix slow and broken cgroup context
> switch code")
> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> ---
>  kernel/events/core.c | 95 +++++++++++---------------------------------
>  1 file changed, 23 insertions(+), 72 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 6859229497b1..f3bc2841141f 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -826,6 +826,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
>         }
>  }
>
> +static DEFINE_PER_CPU(struct perf_cgroup *, cpu_perf_cgroups);
>  static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
>
>  #define PERF_CGROUP_SWOUT      0x1 /* cgroup switch out every event */
> @@ -837,8 +838,9 @@ static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
>   * mode SWOUT : schedule out everything
>   * mode SWIN : schedule in based on cgroup for next

You can remove this comment now.

>   */
> -static void perf_cgroup_switch(struct task_struct *task, int mode)
> +static void perf_cgroup_switch(struct task_struct *task)
>  {
> +       struct perf_cgroup *cgrp;
>         struct perf_cpu_context *cpuctx, *tmp;
>         struct list_head *list;
>         unsigned long flags;
> @@ -849,6 +851,9 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>          */
>         local_irq_save(flags);
>
> +       cgrp = perf_cgroup_from_task(task, NULL);
> +       __this_cpu_write(cpu_perf_cgroups, cgrp);
> +
>         list = this_cpu_ptr(&cgrp_cpuctx_list);
>         list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
>                 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
> @@ -856,28 +861,15 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>                 perf_pmu_disable(cpuctx->ctx.pmu);
>
> -               if (mode & PERF_CGROUP_SWOUT) {
> -                       cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> -                       /*
> -                        * must not be done before ctxswout due
> -                        * to event_filter_match() in event_sched_out()

Unrelated, but I don't see the event_filter_match() in
event_sched_out() anymore.  Does it sched-out all
non-cgroup cpu events here?

> -                        */
> -                       cpuctx->cgrp = NULL;
> -               }
> +               cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> +               /*
> +                * must not be done before ctxswout due
> +                * to event_filter_match() in event_sched_out()
> +                */
> +               cpuctx->cgrp = cgrp;

Maybe we can check whether cpuctx->cgrp is the same as the task's
cgroup before accessing the pmu.  As noted in the commit message,
perf_cgroup_switch() can be called after the context switch, so the
cgroup events might already be scheduled.
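
A rough, untested sketch of what I mean, at the top of the cpuctx
loop (reusing the cgrp local this patch already computes):

		list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
			WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

			/* this cgroup's events are already scheduled here */
			if (cpuctx->cgrp == cgrp)
				continue;

			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
			...
		}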

Thanks,
Namhyung


> +
> +               cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
>
> -               if (mode & PERF_CGROUP_SWIN) {
> -                       WARN_ON_ONCE(cpuctx->cgrp);
> -                       /*
> -                        * set cgrp before ctxsw in to allow
> -                        * event_filter_match() to not have to pass
> -                        * task around
> -                        * we pass the cpuctx->ctx to perf_cgroup_from_task()
> -                        * because cgorup events are only per-cpu
> -                        */
> -                       cpuctx->cgrp = perf_cgroup_from_task(task,
> -                                                            &cpuctx->ctx);
> -                       cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
> -               }
>                 perf_pmu_enable(cpuctx->ctx.pmu);
>                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>         }

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [Phishing Risk] [External] Re: [RFC PATCH] perf/core: fix cpuctx cgrp warning
  2022-03-10  9:25 ` Namhyung Kim
@ 2022-03-10 12:01   ` Chengming Zhou
  2022-03-10 18:02     ` Namhyung Kim
  0 siblings, 1 reply; 5+ messages in thread
From: Chengming Zhou @ 2022-03-10 12:01 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Mark Rutland, Alexander Shishkin, Jiri Olsa, Stephane Eranian,
	linux-perf-users, linux-kernel, songmuchun

Hello,

On 2022/3/10 5:25 PM, Namhyung Kim wrote:
> Hello,
> 
> On Tue, Mar 8, 2022 at 6:00 AM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
>>
>> There is a race problem that can trigger WARN_ON_ONCE(cpuctx->cgrp)
>> in perf_cgroup_switch().
>>
>> CPU1                                    CPU2
>> (in context_switch)                     (attach running task)
>> perf_cgroup_sched_out(task, next)
>>         if (cgrp1 != cgrp2) True
>>                                         task->cgroups = xxx
>>                                         perf_cgroup_attach()
>> perf_cgroup_sched_in(prev, task)
>>         if (cgrp1 != cgrp2) False
> 
> But perf_cgroup_switch will be synchronized as the context switch
> disables the interrupt.  And right, it still can see the task->cgroups
> is changing in the middle.
> 
>>
>> The commit a8d757ef076f ("perf events: Fix slow and broken cgroup
>> context switch code") would save cpuctx switch in/out when the
>> perf_cgroup of "prev" and "next" are the same.
>>
>> But perf_cgroup of task can change in concurrent with context_switch.
>> If cgrp1 == cgrp2 in sched_out(), cpuctx won't do switch out, then
>> task perf_cgroup changed cause cgrp1 != cgrp2 in sched_in(), cpuctx
>> will do switch in, and trigger WARN_ON_ONCE(cpuctx->cgrp).
>>
>> The perf_cgroup of "prev" and "next" can be changed at any time, so we
>> first have to combine perf_cgroup_sched_in() into perf_cgroup_sched_out(),
>> so we can get a consistent value of condition (cgrp1 == cgrp2).
>>
>> And we introduce a percpu "cpu_perf_cgroups" to track the current used
>> perf_cgroup, instead of using the unstable perf_cgroup of "prev", which
>> maybe not the cpuctx->cgrp we used to schedule cgroup events on cpu.
> 
> Is this really needed?  I think the warning comes because the two
> cgroups were the same when in sched-out, but they became
> different when in sched-in.  So just combining sched-in/out should
> be ok, isn't it?

If we get the perf_cgroup from prev->cgroups, which can change during
context_switch(), and that makes the condition (cgrp1 == cgrp2) true,
then we won't do sched_out/in. So the events of prev's previous cgrp
will still be on the CPU.

Even though that CPU would receive an IPI from perf_cgroup_attach()
after context_switch(), remote_function() will do nothing because the
prev task is no longer the current running task.
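
(For reference, remote_function() bails out roughly like below when the
target task is no longer current on that CPU; this is paraphrased from
kernel/events/core.c, not a verbatim copy:)

	static void remote_function(void *data)
	{
		struct remote_function_call *tfc = data;
		struct task_struct *p = tfc->p;

		if (p) {
			/* -EAGAIN: we are no longer on the task's CPU */
			if (task_cpu(p) != smp_processor_id())
				return;

			/* -ESRCH: the task is no longer current there */
			tfc->ret = -ESRCH;
			if (p != current)
				return;
		}

		tfc->ret = tfc->func(tfc->info);
	}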

> 
>>
>> Fixes: a8d757ef076f ("perf events: Fix slow and broken cgroup context
>> switch code")
>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>> ---
>>  kernel/events/core.c | 95 +++++++++++---------------------------------
>>  1 file changed, 23 insertions(+), 72 deletions(-)
>>
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 6859229497b1..f3bc2841141f 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -826,6 +826,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
>>         }
>>  }
>>
>> +static DEFINE_PER_CPU(struct perf_cgroup *, cpu_perf_cgroups);
>>  static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
>>
>>  #define PERF_CGROUP_SWOUT      0x1 /* cgroup switch out every event */
>> @@ -837,8 +838,9 @@ static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
>>   * mode SWOUT : schedule out everything
>>   * mode SWIN : schedule in based on cgroup for next
> 
> You can remove this comment now.

Ok, will do.

> 
>>   */
>> -static void perf_cgroup_switch(struct task_struct *task, int mode)
>> +static void perf_cgroup_switch(struct task_struct *task)
>>  {
>> +       struct perf_cgroup *cgrp;
>>         struct perf_cpu_context *cpuctx, *tmp;
>>         struct list_head *list;
>>         unsigned long flags;
>> @@ -849,6 +851,9 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>>          */
>>         local_irq_save(flags);
>>
>> +       cgrp = perf_cgroup_from_task(task, NULL);
>> +       __this_cpu_write(cpu_perf_cgroups, cgrp);
>> +
>>         list = this_cpu_ptr(&cgrp_cpuctx_list);
>>         list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
>>                 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
>> @@ -856,28 +861,15 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>>                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>>                 perf_pmu_disable(cpuctx->ctx.pmu);
>>
>> -               if (mode & PERF_CGROUP_SWOUT) {
>> -                       cpu_ctx_sched_out(cpuctx, EVENT_ALL);
>> -                       /*
>> -                        * must not be done before ctxswout due
>> -                        * to event_filter_match() in event_sched_out()
> 
> Unrelated, but I don't see the event_filter_match() in
> event_sched_out() anymore.  Does it sched-out all
> non-cgroup cpu events here?

Yes, I reviewed the code and didn't find event_filter_match(),
so cpu_ctx_sched_out() will sched-out all cpu events.

And I find event_filter_match() won't work here either,
because perf_cgroup_match() returns matched for any
non-cgroup event. Maybe we can add another function
like perf_cgroup_match_sched_out() to use when scheduling out.
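
Just to illustrate the idea, something like below (the name
perf_cgroup_match_sched_out() is made up and this is untested):

	static inline bool
	perf_cgroup_match_sched_out(struct perf_event *event)
	{
		/* leave non-cgroup cpu events alone on a cgroup switch */
		if (!event->cgrp)
			return false;

		return perf_cgroup_match(event);
	}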

> 
>> -                        */
>> -                       cpuctx->cgrp = NULL;
>> -               }
>> +               cpu_ctx_sched_out(cpuctx, EVENT_ALL);
>> +               /*
>> +                * must not be done before ctxswout due
>> +                * to event_filter_match() in event_sched_out()
>> +                */
>> +               cpuctx->cgrp = cgrp;
> 
> Maybe we can check cpuctx->cgrp is the same as task's
> cgroup before accessing the pmu.  As in the commit message
> it can call perf_cgroup_switch() after the context switch so
> the cgroup events might be scheduled already.

Good point, will do.

Thanks.

> 
> Thanks,
> Namhyung
> 
> 
>> +
>> +               cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
>>
>> -               if (mode & PERF_CGROUP_SWIN) {
>> -                       WARN_ON_ONCE(cpuctx->cgrp);
>> -                       /*
>> -                        * set cgrp before ctxsw in to allow
>> -                        * event_filter_match() to not have to pass
>> -                        * task around
>> -                        * we pass the cpuctx->ctx to perf_cgroup_from_task()
>> -                        * because cgorup events are only per-cpu
>> -                        */
>> -                       cpuctx->cgrp = perf_cgroup_from_task(task,
>> -                                                            &cpuctx->ctx);
>> -                       cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
>> -               }
>>                 perf_pmu_enable(cpuctx->ctx.pmu);
>>                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>>         }

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [Phishing Risk] [External] Re: [RFC PATCH] perf/core: fix cpuctx cgrp warning
  2022-03-10 12:01   ` [Phishing Risk] [External] " Chengming Zhou
@ 2022-03-10 18:02     ` Namhyung Kim
  2022-03-11 12:24       ` [Phishing Risk] " Chengming Zhou
  0 siblings, 1 reply; 5+ messages in thread
From: Namhyung Kim @ 2022-03-10 18:02 UTC (permalink / raw)
  To: Chengming Zhou
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Mark Rutland, Alexander Shishkin, Jiri Olsa, Stephane Eranian,
	linux-perf-users, linux-kernel, songmuchun

On Thu, Mar 10, 2022 at 4:01 AM Chengming Zhou
<zhouchengming@bytedance.com> wrote:
>
> Hello,
>
> On 2022/3/10 5:25 PM, Namhyung Kim wrote:
> > Hello,
> >
> > On Tue, Mar 8, 2022 at 6:00 AM Chengming Zhou
> > <zhouchengming@bytedance.com> wrote:
> >>
> >> There is a race problem that can trigger WARN_ON_ONCE(cpuctx->cgrp)
> >> in perf_cgroup_switch().
> >>
> >> CPU1                                    CPU2
> >> (in context_switch)                     (attach running task)
> >> perf_cgroup_sched_out(task, next)
> >>         if (cgrp1 != cgrp2) True
> >>                                         task->cgroups = xxx
> >>                                         perf_cgroup_attach()
> >> perf_cgroup_sched_in(prev, task)
> >>         if (cgrp1 != cgrp2) False
> >
> > But perf_cgroup_switch will be synchronized as the context switch
> > disables the interrupt.  And right, it still can see the task->cgroups
> > is changing in the middle.
> >
> >>
> >> The commit a8d757ef076f ("perf events: Fix slow and broken cgroup
> >> context switch code") would save cpuctx switch in/out when the
> >> perf_cgroup of "prev" and "next" are the same.
> >>
> >> But perf_cgroup of task can change in concurrent with context_switch.
> >> If cgrp1 == cgrp2 in sched_out(), cpuctx won't do switch out, then
> >> task perf_cgroup changed cause cgrp1 != cgrp2 in sched_in(), cpuctx
> >> will do switch in, and trigger WARN_ON_ONCE(cpuctx->cgrp).
> >>
> >> The perf_cgroup of "prev" and "next" can be changed at any time, so we
> >> first have to combine perf_cgroup_sched_in() into perf_cgroup_sched_out(),
> >> so we can get a consistent value of condition (cgrp1 == cgrp2).
> >>
> >> And we introduce a percpu "cpu_perf_cgroups" to track the current used
> >> perf_cgroup, instead of using the unstable perf_cgroup of "prev", which
> >> maybe not the cpuctx->cgrp we used to schedule cgroup events on cpu.
> >
> > Is this really needed?  I think the warning comes because the two
> > cgroups were the same when in sched-out, but they became
> > different when in sched-in.  So just combining sched-in/out should
> > be ok, isn't it?
>
> If we get perf_cgroup from prev->cgroups that can be changed in the
> context_switch(), make the condition (cgrp1 == cgrp2) is true, then
> we won't do sched_out/in. So the events of prev's previous cgrp will
> still be on the CPU.
>
> Even that CPU would receive IPI from perf_cgroup_attach() after
> context_switch(), remote_function() will do nothing because prev task
> is not current running anymore.

Right, so I don't care about changing prev->cgroups.  I can see these
two scenarios.

1. (cgrp1 == cgrp2) --> (cgrp1 != cgrp2)
  This means the next task's cgroup (cgrp2) is the same as the
  previous one, so there is no need to reschedule events even if
  cgrp1 changed.

2. (cgrp1 != cgrp2) --> (cgrp1 == cgrp2)
  This will trigger rescheduling anyway, and we are fine.

>
> >
> >>
> >> Fixes: a8d757ef076f ("perf events: Fix slow and broken cgroup context
> >> switch code")
> >> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> >> ---
> >>  kernel/events/core.c | 95 +++++++++++---------------------------------
> >>  1 file changed, 23 insertions(+), 72 deletions(-)
> >>
> >> diff --git a/kernel/events/core.c b/kernel/events/core.c
> >> index 6859229497b1..f3bc2841141f 100644
> >> --- a/kernel/events/core.c
> >> +++ b/kernel/events/core.c
> >> @@ -826,6 +826,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
> >>         }
> >>  }
> >>
> >> +static DEFINE_PER_CPU(struct perf_cgroup *, cpu_perf_cgroups);
> >>  static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
> >>
> >>  #define PERF_CGROUP_SWOUT      0x1 /* cgroup switch out every event */
> >> @@ -837,8 +838,9 @@ static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
> >>   * mode SWOUT : schedule out everything
> >>   * mode SWIN : schedule in based on cgroup for next
> >
> > You can remove this comment now.
>
> Ok, will do.
>
> >
> >>   */
> >> -static void perf_cgroup_switch(struct task_struct *task, int mode)
> >> +static void perf_cgroup_switch(struct task_struct *task)
> >>  {
> >> +       struct perf_cgroup *cgrp;
> >>         struct perf_cpu_context *cpuctx, *tmp;
> >>         struct list_head *list;
> >>         unsigned long flags;
> >> @@ -849,6 +851,9 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
> >>          */
> >>         local_irq_save(flags);
> >>
> >> +       cgrp = perf_cgroup_from_task(task, NULL);
> >> +       __this_cpu_write(cpu_perf_cgroups, cgrp);
> >> +
> >>         list = this_cpu_ptr(&cgrp_cpuctx_list);
> >>         list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
> >>                 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
> >> @@ -856,28 +861,15 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
> >>                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> >>                 perf_pmu_disable(cpuctx->ctx.pmu);
> >>
> >> -               if (mode & PERF_CGROUP_SWOUT) {
> >> -                       cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> >> -                       /*
> >> -                        * must not be done before ctxswout due
> >> -                        * to event_filter_match() in event_sched_out()
> >
> > Unrelated, but I don't see the event_filter_match() in
> > event_sched_out() anymore.  Does it sched-out all
> > non-cgroup cpu events here?
>
> Yes, I review the code and don't find event_filter_match(),
> so cpu_ctx_sched_out() will sched-out all cpu events.
>
> And I find event_filter_match() won't work here too,
> because perf_cgroup_match() return matched for any
> non-cgroup event. Maybe we can add another function
> like perf_cgroup_match_sched_out() to use when sched-out.

And for sched-in too.

But we should consider multiplexing in the timer as well.
In that case it cannot know whether it needs to reschedule
cpu or cgroup events, so it does the job for all events.

But I think cgroup + multiplexing is broken already
because it cannot guarantee it sees the same cgroup
when the timer interrupt happens.

Thanks,
Namhyung

>
> >
> >> -                        */
> >> -                       cpuctx->cgrp = NULL;
> >> -               }
> >> +               cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> >> +               /*
> >> +                * must not be done before ctxswout due
> >> +                * to event_filter_match() in event_sched_out()
> >> +                */
> >> +               cpuctx->cgrp = cgrp;
> >
> > Maybe we can check cpuctx->cgrp is the same as task's
> > cgroup before accessing the pmu.  As in the commit message
> > it can call perf_cgroup_switch() after the context switch so
> > the cgroup events might be scheduled already.
>
> Good point, will do.
>
> Thanks.
>
> >
> > Thanks,
> > Namhyung
> >
> >
> >> +
> >> +               cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
> >>
> >> -               if (mode & PERF_CGROUP_SWIN) {
> >> -                       WARN_ON_ONCE(cpuctx->cgrp);
> >> -                       /*
> >> -                        * set cgrp before ctxsw in to allow
> >> -                        * event_filter_match() to not have to pass
> >> -                        * task around
> >> -                        * we pass the cpuctx->ctx to perf_cgroup_from_task()
> >> -                        * because cgorup events are only per-cpu
> >> -                        */
> >> -                       cpuctx->cgrp = perf_cgroup_from_task(task,
> >> -                                                            &cpuctx->ctx);
> >> -                       cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
> >> -               }
> >>                 perf_pmu_enable(cpuctx->ctx.pmu);
> >>                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> >>         }

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [Phishing Risk] Re: [Phishing Risk] [External] Re: [RFC PATCH] perf/core: fix cpuctx cgrp warning
  2022-03-10 18:02     ` Namhyung Kim
@ 2022-03-11 12:24       ` Chengming Zhou
  0 siblings, 0 replies; 5+ messages in thread
From: Chengming Zhou @ 2022-03-11 12:24 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Mark Rutland, Alexander Shishkin, Jiri Olsa, Stephane Eranian,
	linux-perf-users, linux-kernel, songmuchun

On 2022/3/11 2:02 AM, Namhyung Kim wrote:
> On Thu, Mar 10, 2022 at 4:01 AM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
>>
>> Hello,
>>
>> On 2022/3/10 5:25 PM, Namhyung Kim wrote:
>>> Hello,
>>>
>>> On Tue, Mar 8, 2022 at 6:00 AM Chengming Zhou
>>> <zhouchengming@bytedance.com> wrote:
>>>>
>>>> There is a race problem that can trigger WARN_ON_ONCE(cpuctx->cgrp)
>>>> in perf_cgroup_switch().
>>>>
>>>> CPU1                                    CPU2
>>>> (in context_switch)                     (attach running task)
>>>> perf_cgroup_sched_out(task, next)
>>>>         if (cgrp1 != cgrp2) True
>>>>                                         task->cgroups = xxx
>>>>                                         perf_cgroup_attach()
>>>> perf_cgroup_sched_in(prev, task)
>>>>         if (cgrp1 != cgrp2) False
>>>
>>> But perf_cgroup_switch will be synchronized as the context switch
>>> disables the interrupt.  And right, it still can see the task->cgroups
>>> is changing in the middle.
>>>
>>>>
>>>> The commit a8d757ef076f ("perf events: Fix slow and broken cgroup
>>>> context switch code") would save cpuctx switch in/out when the
>>>> perf_cgroup of "prev" and "next" are the same.
>>>>
>>>> But perf_cgroup of task can change in concurrent with context_switch.
>>>> If cgrp1 == cgrp2 in sched_out(), cpuctx won't do switch out, then
>>>> task perf_cgroup changed cause cgrp1 != cgrp2 in sched_in(), cpuctx
>>>> will do switch in, and trigger WARN_ON_ONCE(cpuctx->cgrp).
>>>>
>>>> The perf_cgroup of "prev" and "next" can be changed at any time, so we
>>>> first have to combine perf_cgroup_sched_in() into perf_cgroup_sched_out(),
>>>> so we can get a consistent value of condition (cgrp1 == cgrp2).
>>>>
>>>> And we introduce a percpu "cpu_perf_cgroups" to track the current used
>>>> perf_cgroup, instead of using the unstable perf_cgroup of "prev", which
>>>> maybe not the cpuctx->cgrp we used to schedule cgroup events on cpu.
>>>
>>> Is this really needed?  I think the warning comes because the two
>>> cgroups were the same when in sched-out, but they became
>>> different when in sched-in.  So just combining sched-in/out should
>>> be ok, isn't it?
>>
>> If we get perf_cgroup from prev->cgroups that can be changed in the
>> context_switch(), make the condition (cgrp1 == cgrp2) is true, then
>> we won't do sched_out/in. So the events of prev's previous cgrp will
>> still be on the CPU.
>>
>> Even that CPU would receive IPI from perf_cgroup_attach() after
>> context_switch(), remote_function() will do nothing because prev task
>> is not current running anymore.
> 
> Right, so I don't care about changing prev->cgroups.  I can see these
> two scenarios.
> 
> 1. (cgrp1 == cgrp2) --> (cgrp1 != cgrp2)
>   This means the next task's cgroup (cgrp2) is the same as the
>   previous and it doesn't need to reschedule events even if the
>   cgrp1 is changed.
> 
> 2. (cgrp1 != cgrp2) --> (cgrp1 == cgrp2)
>   This will trigger rescheduling anyway, and we are fine.

Yes, these two scenarios are fine, but only if perf_cgroup_switch()
sees the old condition instead of the new one.

(cgrp1 != cgrp2) --> (cgrp1 == cgrp2)

If perf_cgroup_switch() sees the new condition (cgrp1 == cgrp2), it
won't sched_out/in, leaving the events of the old cgrp1 (maybe cgrp3)
on the CPU. But we should sched_in the events of cgrp2 instead.

Maybe I missed something ;-)

> 
>>
>>>
>>>>
>>>> Fixes: a8d757ef076f ("perf events: Fix slow and broken cgroup context
>>>> switch code")
>>>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>>>> ---
>>>>  kernel/events/core.c | 95 +++++++++++---------------------------------
>>>>  1 file changed, 23 insertions(+), 72 deletions(-)
>>>>
>>>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>>>> index 6859229497b1..f3bc2841141f 100644
>>>> --- a/kernel/events/core.c
>>>> +++ b/kernel/events/core.c
>>>> @@ -826,6 +826,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
>>>>         }
>>>>  }
>>>>
>>>> +static DEFINE_PER_CPU(struct perf_cgroup *, cpu_perf_cgroups);
>>>>  static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
>>>>
>>>>  #define PERF_CGROUP_SWOUT      0x1 /* cgroup switch out every event */
>>>> @@ -837,8 +838,9 @@ static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
>>>>   * mode SWOUT : schedule out everything
>>>>   * mode SWIN : schedule in based on cgroup for next
>>>
>>> You can remove this comment now.
>>
>> Ok, will do.
>>
>>>
>>>>   */
>>>> -static void perf_cgroup_switch(struct task_struct *task, int mode)
>>>> +static void perf_cgroup_switch(struct task_struct *task)
>>>>  {
>>>> +       struct perf_cgroup *cgrp;
>>>>         struct perf_cpu_context *cpuctx, *tmp;
>>>>         struct list_head *list;
>>>>         unsigned long flags;
>>>> @@ -849,6 +851,9 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>>>>          */
>>>>         local_irq_save(flags);
>>>>
>>>> +       cgrp = perf_cgroup_from_task(task, NULL);
>>>> +       __this_cpu_write(cpu_perf_cgroups, cgrp);
>>>> +
>>>>         list = this_cpu_ptr(&cgrp_cpuctx_list);
>>>>         list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
>>>>                 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
>>>> @@ -856,28 +861,15 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>>>>                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>>>>                 perf_pmu_disable(cpuctx->ctx.pmu);
>>>>
>>>> -               if (mode & PERF_CGROUP_SWOUT) {
>>>> -                       cpu_ctx_sched_out(cpuctx, EVENT_ALL);
>>>> -                       /*
>>>> -                        * must not be done before ctxswout due
>>>> -                        * to event_filter_match() in event_sched_out()
>>>
>>> Unrelated, but I don't see the event_filter_match() in
>>> event_sched_out() anymore.  Does it sched-out all
>>> non-cgroup cpu events here?
>>
>> Yes, I review the code and don't find event_filter_match(),
>> so cpu_ctx_sched_out() will sched-out all cpu events.
>>
>> And I find event_filter_match() won't work here too,
>> because perf_cgroup_match() return matched for any
>> non-cgroup event. Maybe we can add another function
>> like perf_cgroup_match_sched_out() to use when sched-out.
> 
> And for sched-in too.
> 
> But we should consider multiplexing in the timer as well.
> In that case it cannot know whether it needs to reschedule
> cpu or cgroup events, so it does the job for all events.
> 
> But I think cgroup + multiplexing is broken already
> because it cannot guarantee it sees the same cgroup
> when the timer interrupt happens.

Right, I'm still trying to figure these things out.

Thanks.

> 
> Thanks,
> Namhyung
> 
>>
>>>
>>>> -                        */
>>>> -                       cpuctx->cgrp = NULL;
>>>> -               }
>>>> +               cpu_ctx_sched_out(cpuctx, EVENT_ALL);
>>>> +               /*
>>>> +                * must not be done before ctxswout due
>>>> +                * to event_filter_match() in event_sched_out()
>>>> +                */
>>>> +               cpuctx->cgrp = cgrp;
>>>
>>> Maybe we can check cpuctx->cgrp is the same as task's
>>> cgroup before accessing the pmu.  As in the commit message
>>> it can call perf_cgroup_switch() after the context switch so
>>> the cgroup events might be scheduled already.
>>
>> Good point, will do.
>>
>> Thanks.
>>
>>>
>>> Thanks,
>>> Namhyung
>>>
>>>
>>>> +
>>>> +               cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
>>>>
>>>> -               if (mode & PERF_CGROUP_SWIN) {
>>>> -                       WARN_ON_ONCE(cpuctx->cgrp);
>>>> -                       /*
>>>> -                        * set cgrp before ctxsw in to allow
>>>> -                        * event_filter_match() to not have to pass
>>>> -                        * task around
>>>> -                        * we pass the cpuctx->ctx to perf_cgroup_from_task()
>>>> -                        * because cgorup events are only per-cpu
>>>> -                        */
>>>> -                       cpuctx->cgrp = perf_cgroup_from_task(task,
>>>> -                                                            &cpuctx->ctx);
>>>> -                       cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
>>>> -               }
>>>>                 perf_pmu_enable(cpuctx->ctx.pmu);
>>>>                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>>>>         }

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2022-03-11 12:24 UTC | newest]

Thread overview: 5+ messages
2022-03-08 13:59 [RFC PATCH] perf/core: fix cpuctx cgrp warning Chengming Zhou
2022-03-10  9:25 ` Namhyung Kim
2022-03-10 12:01   ` [Phishing Risk] [External] " Chengming Zhou
2022-03-10 18:02     ` Namhyung Kim
2022-03-11 12:24       ` [Phishing Risk] " Chengming Zhou
