* [PATCH v3 0/5] perf/core: Fixes and cleanup for cgroup events
From: Chengming Zhou @ 2022-03-25  3:53 UTC
  To: peterz, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	namhyung, eranian
  Cc: linux-perf-users, linux-kernel, duanxiongchun, songmuchun,
	Chengming Zhou

Hi,

This patchset mainly fixes a race between cgroup_migrate_execute()
and perf_cgroup_sched_out/in().

v3:
 - put the two fix patches together in patch 4
 - delete the percpu cpu_perf_cgroup, just use cpuctx->cgrp to compare
 - reorganize the order of the patches

v2:
 - split the race fix into two patches for easier review
 - use cpuctx->cgrp when starting ctx time and delete the unused task argument
 - use cpuctx->cgrp when updating ctx time from a cgroup perf_event
 - always set cpuctx->cgrp when the first cgroup event is enabled
 - remove obsolete comments

v1:
 - https://lore.kernel.org/lkml/20220308135948.55336-1-zhouchengming@bytedance.com/

Chengming Zhou (5):
  perf/core: Don't pass task around when ctx sched in
  perf/core: Use perf_cgroup_info->active to check if cgroup is active
  perf/core: Don't need event_filter_match in merge_sched_in()
  perf/core: Fix perf_cgroup_switch()
  perf/core: Always set cpuctx cgrp when enable cgroup event

 kernel/events/core.c | 212 +++++++++++--------------------------------
 1 file changed, 52 insertions(+), 160 deletions(-)

-- 
2.20.1



* [PATCH v3 1/5] perf/core: Don't pass task around when ctx sched in
From: Chengming Zhou @ 2022-03-25  3:53 UTC
  To: peterz, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	namhyung, eranian
  Cc: linux-perf-users, linux-kernel, duanxiongchun, songmuchun,
	Chengming Zhou

The current code passes the task around for ctx_sched_in(), only to
get the task's perf_cgroup, then update the timestamps of it and its
ancestors and mark them active.

But we can get the active perf_cgroup and its ancestors from
cpuctx->cgrp, since cpuctx->cgrp has already been set before
ctx_sched_in() is called.

This patch removes the task argument from ctx_sched_in() and cleans
up the related code.
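
For reference, the resulting helper, assembled from the hunks below
(the loop body is reconstructed from the surrounding code, so treat
it as a sketch rather than as authoritative):

static inline void
perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
	struct perf_event_context *ctx = &cpuctx->ctx;
	struct perf_cgroup *cgrp = cpuctx->cgrp;
	struct perf_cgroup_info *info;
	struct cgroup_subsys_state *css;

	/* cpuctx->cgrp is only set while cgroup events are active on this CPU */
	if (!cgrp)
		return;

	WARN_ON_ONCE(!ctx->nr_cgroups);

	/* start cgroup time for the cgroup and all of its ancestors */
	for (css = &cgrp->css; css; css = css->parent) {
		cgrp = container_of(css, struct perf_cgroup, css);
		info = this_cpu_ptr(cgrp->info);
		__update_cgrp_time(info, ctx->timestamp, false);
		__store_release(&info->active, 1);
	}
}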

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
 kernel/events/core.c | 58 ++++++++++++++++++++------------------------
 1 file changed, 26 insertions(+), 32 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index cfde994ce61c..d50f45012c05 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -574,8 +574,7 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 			      enum event_type_t event_type);
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type,
-			     struct task_struct *task);
+			     enum event_type_t event_type);
 
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
@@ -801,10 +800,10 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
 }
 
 static inline void
-perf_cgroup_set_timestamp(struct task_struct *task,
-			  struct perf_event_context *ctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
 {
-	struct perf_cgroup *cgrp;
+	struct perf_event_context *ctx = &cpuctx->ctx;
+	struct perf_cgroup *cgrp = cpuctx->cgrp;
 	struct perf_cgroup_info *info;
 	struct cgroup_subsys_state *css;
 
@@ -813,10 +812,10 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 	 * ensure we do not access cgroup data
 	 * unless we have the cgroup pinned (css_get)
 	 */
-	if (!task || !ctx->nr_cgroups)
+	if (!cgrp)
 		return;
 
-	cgrp = perf_cgroup_from_task(task, ctx);
+	WARN_ON_ONCE(!ctx->nr_cgroups);
 
 	for (css = &cgrp->css; css; css = css->parent) {
 		cgrp = container_of(css, struct perf_cgroup, css);
@@ -869,14 +868,14 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 			WARN_ON_ONCE(cpuctx->cgrp);
 			/*
 			 * set cgrp before ctxsw in to allow
-			 * event_filter_match() to not have to pass
-			 * task around
+			 * perf_cgroup_set_timestamp() in ctx_sched_in()
+			 * to not have to pass task around
 			 * we pass the cpuctx->ctx to perf_cgroup_from_task()
 			 * because cgorup events are only per-cpu
 			 */
 			cpuctx->cgrp = perf_cgroup_from_task(task,
 							     &cpuctx->ctx);
-			cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+			cpu_ctx_sched_in(cpuctx, EVENT_ALL);
 		}
 		perf_pmu_enable(cpuctx->ctx.pmu);
 		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -1118,8 +1117,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
 }
 
 static inline void
-perf_cgroup_set_timestamp(struct task_struct *task,
-			  struct perf_event_context *ctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
 {
 }
 
@@ -2713,8 +2711,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
-	     enum event_type_t event_type,
-	     struct task_struct *task);
+	     enum event_type_t event_type);
 
 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
 			       struct perf_event_context *ctx,
@@ -2730,15 +2727,14 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
 }
 
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
-				struct perf_event_context *ctx,
-				struct task_struct *task)
+				struct perf_event_context *ctx)
 {
-	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
+	cpu_ctx_sched_in(cpuctx, EVENT_PINNED);
 	if (ctx)
-		ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+		ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
-		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
 }
 
 /*
@@ -2788,7 +2784,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 	else if (ctx_event_type & EVENT_PINNED)
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 
-	perf_event_sched_in(cpuctx, task_ctx, current);
+	perf_event_sched_in(cpuctx, task_ctx);
 	perf_pmu_enable(cpuctx->ctx.pmu);
 }
 
@@ -3011,7 +3007,7 @@ static void __perf_event_enable(struct perf_event *event,
 		return;
 
 	if (!event_filter_match(event)) {
-		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME);
 		return;
 	}
 
@@ -3020,7 +3016,7 @@ static void __perf_event_enable(struct perf_event *event,
 	 * then don't put it on unless the group is on.
 	 */
 	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
-		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME);
 		return;
 	}
 
@@ -3865,8 +3861,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
-	     enum event_type_t event_type,
-	     struct task_struct *task)
+	     enum event_type_t event_type)
 {
 	int is_active = ctx->is_active;
 
@@ -3878,7 +3873,7 @@ ctx_sched_in(struct perf_event_context *ctx,
 	if (is_active ^ EVENT_TIME) {
 		/* start ctx time */
 		__update_context_time(ctx, false);
-		perf_cgroup_set_timestamp(task, ctx);
+		perf_cgroup_set_timestamp(cpuctx);
 		/*
 		 * CPU-release for the below ->is_active store,
 		 * see __load_acquire() in perf_event_time_now()
@@ -3909,12 +3904,11 @@ ctx_sched_in(struct perf_event_context *ctx,
 }
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type,
-			     struct task_struct *task)
+			     enum event_type_t event_type)
 {
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
-	ctx_sched_in(ctx, cpuctx, event_type, task);
+	ctx_sched_in(ctx, cpuctx, event_type);
 }
 
 static void perf_event_context_sched_in(struct perf_event_context *ctx,
@@ -3956,7 +3950,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 	 */
 	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
-	perf_event_sched_in(cpuctx, ctx, task);
+	perf_event_sched_in(cpuctx, ctx);
 
 	if (cpuctx->sched_cb_usage && pmu->sched_task)
 		pmu->sched_task(cpuctx->task_ctx, true);
@@ -4267,7 +4261,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
 	if (cpu_event)
 		rotate_ctx(&cpuctx->ctx, cpu_event);
 
-	perf_event_sched_in(cpuctx, task_ctx, current);
+	perf_event_sched_in(cpuctx, task_ctx);
 
 	perf_pmu_enable(cpuctx->ctx.pmu);
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -4339,7 +4333,7 @@ static void perf_event_enable_on_exec(int ctxn)
 		clone_ctx = unclone_ctx(ctx);
 		ctx_resched(cpuctx, ctx, event_type);
 	} else {
-		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME);
 	}
 	perf_ctx_unlock(cpuctx, ctx);
 
-- 
2.20.1



* [PATCH v3 2/5] perf/core: Use perf_cgroup_info->active to check if cgroup is active
From: Chengming Zhou @ 2022-03-25  3:53 UTC
  To: peterz, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	namhyung, eranian
  Cc: linux-perf-users, linux-kernel, duanxiongchun, songmuchun,
	Chengming Zhou

perf_cgroup_set_timestamp() starts cgroup time and sets active to 1,
and update_cgrp_time_from_cpuctx() stops cgroup time and sets active
back to 0.

So we can use info->active directly to check whether the cgroup is
active, with no need to walk the cgroup hierarchy.
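
For context, info->active is only flipped in those two places (a rough
sketch from the surrounding code, not part of this diff):

	/* perf_cgroup_set_timestamp(): start cgroup time */
	__update_cgrp_time(info, ctx->timestamp, false);
	__store_release(&info->active, 1);

	/* update_cgrp_time_from_cpuctx(): stop cgroup time */
	__update_cgrp_time(info, now, true);
	if (final)
		__store_release(&info->active, 0);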

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
 kernel/events/core.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d50f45012c05..dd985c77bc37 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -780,7 +780,6 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
 static inline void update_cgrp_time_from_event(struct perf_event *event)
 {
 	struct perf_cgroup_info *info;
-	struct perf_cgroup *cgrp;
 
 	/*
 	 * ensure we access cgroup data only when needed and
@@ -789,14 +788,12 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
 	if (!is_cgroup_event(event))
 		return;
 
-	cgrp = perf_cgroup_from_task(current, event->ctx);
+	info = this_cpu_ptr(event->cgrp->info);
 	/*
 	 * Do not update time when cgroup is not active
 	 */
-	if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) {
-		info = this_cpu_ptr(event->cgrp->info);
+	if (info->active)
 		__update_cgrp_time(info, perf_clock(), true);
-	}
 }
 
 static inline void
-- 
2.20.1



* [PATCH v3 3/5] perf/core: Don't need event_filter_match in merge_sched_in()
From: Chengming Zhou @ 2022-03-25  3:53 UTC
  To: peterz, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	namhyung, eranian
  Cc: linux-perf-users, linux-kernel, duanxiongchun, songmuchun,
	Chengming Zhou

There is one obsolete comment in perf_cgroup_switch(), since we no
longer use event_filter_match() in event_sched_out().

It also turns out we don't need event_filter_match() in
merge_sched_in(): the perf_event groups RB-tree already gives us
exactly the matching events, so there is no need to run them through
event_filter_match() again.
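
For reference, event_filter_match() currently reads roughly as follows
(quoted from the surrounding code for context, not part of this diff):

static inline int
event_filter_match(struct perf_event *event)
{
	return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
	       perf_cgroup_match(event) && pmu_filter_match(event);
}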

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
 kernel/events/core.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index dd985c77bc37..225d408deb1a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -856,7 +856,8 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
 			/*
 			 * must not be done before ctxswout due
-			 * to event_filter_match() in event_sched_out()
+			 * to update_cgrp_time_from_cpuctx() in
+			 * ctx_sched_out()
 			 */
 			cpuctx->cgrp = NULL;
 		}
@@ -3804,9 +3805,6 @@ static int merge_sched_in(struct perf_event *event, void *data)
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
 
-	if (!event_filter_match(event))
-		return 0;
-
 	if (group_can_go_on(event, cpuctx, *can_add_hw)) {
 		if (!group_sched_in(event, cpuctx, ctx))
 			list_add_tail(&event->active_list, get_event_list(event));
-- 
2.20.1



* [PATCH v3 4/5] perf/core: Fix perf_cgroup_switch()
From: Chengming Zhou @ 2022-03-25  3:53 UTC
  To: peterz, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	namhyung, eranian
  Cc: linux-perf-users, linux-kernel, duanxiongchun, songmuchun,
	Chengming Zhou

There is a race that can trigger WARN_ON_ONCE(cpuctx->cgrp) in
perf_cgroup_switch():

CPU1						CPU2
perf_cgroup_sched_out(prev, next)
  cgrp1 = perf_cgroup_from_task(prev)
  cgrp2 = perf_cgroup_from_task(next)
  if (cgrp1 != cgrp2)
    perf_cgroup_switch(prev, PERF_CGROUP_SWOUT)
						cgroup_migrate_execute()
						  task->cgroups = ?
						  perf_cgroup_attach()
						    task_function_call(task, __perf_cgroup_move)
perf_cgroup_sched_in(prev, next)
  cgrp1 = perf_cgroup_from_task(prev)
  cgrp2 = perf_cgroup_from_task(next)
  if (cgrp1 != cgrp2)
    perf_cgroup_switch(next, PERF_CGROUP_SWIN)
						__perf_cgroup_move()
						  perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN)

Commit a8d757ef076f ("perf events: Fix slow and broken cgroup
context switch code") wants to skip perf_cgroup_switch() when the
perf_cgroups of "prev" and "next" are the same.

But task->cgroups can change concurrently with context_switch(), via
cgroup_migrate_execute(). If cgrp1 == cgrp2 at sched_out() time, the
cpuctx is not scheduled out; if task->cgroups then changes so that
cgrp1 != cgrp2 at sched_in() time, the cpuctx is scheduled in, which
triggers WARN_ON_ONCE(cpuctx->cgrp).

Even though __perf_cgroup_move() is serialized against the context
switch by the disabled interrupts, context_switch() can still observe
task->cgroups changing in the middle, because task->cgroups is updated
before the IPI is sent.

So we have to merge perf_cgroup_sched_out() and perf_cgroup_sched_in()
into a single perf_cgroup_switch(), to fix the inconsistency between
the two.

But we can't just compare prev->cgroups with next->cgroups to decide
whether to skip the cpuctx sched_out/in, since prev->cgroups may be
changing too. For example:

CPU1					CPU2
					cgroup_migrate_execute()
					  prev->cgroups = ?
					  perf_cgroup_attach()
					    task_function_call(task, __perf_cgroup_move)
perf_cgroup_switch(task)
  cgrp1 = perf_cgroup_from_task(prev)
  cgrp2 = perf_cgroup_from_task(next)
  if (cgrp1 != cgrp2)
    cpuctx sched_out/in ...
					task_function_call() will return -ESRCH

In the above example, the change to prev->cgroups makes (cgrp1 == cgrp2)
true, so the cpuctx sched_out/in is skipped. Later, task_function_call()
returns -ESRCH because the prev task is no longer running on the CPU.
As a result, the perf_events of the old prev->cgroups are left scheduled
on the CPU, which is wrong.

The solution is to compare cpuctx->cgrp with the next task's
perf_cgroup. Since cpuctx->cgrp can only be changed on the local CPU,
and interrupts are disabled here, we can read cpuctx->cgrp for the
comparison without holding the ctx lock.
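
In pseudo-code, the reschedule then becomes (a condensed sketch; the
authoritative change is in the diff below):

	cgrp = perf_cgroup_from_task(task, NULL);

	list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
		if (READ_ONCE(cpuctx->cgrp) == cgrp)
			continue;	/* nothing to switch */
		cpu_ctx_sched_out(cpuctx, EVENT_ALL);
		cpuctx->cgrp = cgrp;	/* must follow the sched_out */
		cpu_ctx_sched_in(cpuctx, EVENT_ALL);
	}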

Fixes: a8d757ef076f ("perf events: Fix slow and broken cgroup context switch code")
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
 kernel/events/core.c | 133 ++++++++-----------------------------------
 1 file changed, 25 insertions(+), 108 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 225d408deb1a..d559fbbd3aaa 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -824,17 +824,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
 
 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
 
-#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
-#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
-
 /*
  * reschedule events based on the cgroup constraint of task.
- *
- * mode SWOUT : schedule out everything
- * mode SWIN : schedule in based on cgroup for next
  */
-static void perf_cgroup_switch(struct task_struct *task, int mode)
+static void perf_cgroup_switch(struct task_struct *task)
 {
+	struct perf_cgroup *cgrp;
 	struct perf_cpu_context *cpuctx, *tmp;
 	struct list_head *list;
 	unsigned long flags;
@@ -845,36 +840,31 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 	 */
 	local_irq_save(flags);
 
+	cgrp = perf_cgroup_from_task(task, NULL);
+
 	list = this_cpu_ptr(&cgrp_cpuctx_list);
 	list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
 		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+		if (READ_ONCE(cpuctx->cgrp) == cgrp)
+			continue;
 
 		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 		perf_pmu_disable(cpuctx->ctx.pmu);
 
-		if (mode & PERF_CGROUP_SWOUT) {
-			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-			/*
-			 * must not be done before ctxswout due
-			 * to update_cgrp_time_from_cpuctx() in
-			 * ctx_sched_out()
-			 */
-			cpuctx->cgrp = NULL;
-		}
+		cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+		/*
+		 * must not be done before ctxswout due
+		 * to update_cgrp_time_from_cpuctx() in
+		 * ctx_sched_out()
+		 */
+		cpuctx->cgrp = cgrp;
+		/*
+		 * set cgrp before ctxsw in to allow
+		 * perf_cgroup_set_timestamp() in ctx_sched_in()
+		 * to not have to pass task around
+		 */
+		cpu_ctx_sched_in(cpuctx, EVENT_ALL);
 
-		if (mode & PERF_CGROUP_SWIN) {
-			WARN_ON_ONCE(cpuctx->cgrp);
-			/*
-			 * set cgrp before ctxsw in to allow
-			 * perf_cgroup_set_timestamp() in ctx_sched_in()
-			 * to not have to pass task around
-			 * we pass the cpuctx->ctx to perf_cgroup_from_task()
-			 * because cgorup events are only per-cpu
-			 */
-			cpuctx->cgrp = perf_cgroup_from_task(task,
-							     &cpuctx->ctx);
-			cpu_ctx_sched_in(cpuctx, EVENT_ALL);
-		}
 		perf_pmu_enable(cpuctx->ctx.pmu);
 		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 	}
@@ -882,58 +872,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 	local_irq_restore(flags);
 }
 
-static inline void perf_cgroup_sched_out(struct task_struct *task,
-					 struct task_struct *next)
-{
-	struct perf_cgroup *cgrp1;
-	struct perf_cgroup *cgrp2 = NULL;
-
-	rcu_read_lock();
-	/*
-	 * we come here when we know perf_cgroup_events > 0
-	 * we do not need to pass the ctx here because we know
-	 * we are holding the rcu lock
-	 */
-	cgrp1 = perf_cgroup_from_task(task, NULL);
-	cgrp2 = perf_cgroup_from_task(next, NULL);
-
-	/*
-	 * only schedule out current cgroup events if we know
-	 * that we are switching to a different cgroup. Otherwise,
-	 * do no touch the cgroup events.
-	 */
-	if (cgrp1 != cgrp2)
-		perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
-
-	rcu_read_unlock();
-}
-
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
-					struct task_struct *task)
-{
-	struct perf_cgroup *cgrp1;
-	struct perf_cgroup *cgrp2 = NULL;
-
-	rcu_read_lock();
-	/*
-	 * we come here when we know perf_cgroup_events > 0
-	 * we do not need to pass the ctx here because we know
-	 * we are holding the rcu lock
-	 */
-	cgrp1 = perf_cgroup_from_task(task, NULL);
-	cgrp2 = perf_cgroup_from_task(prev, NULL);
-
-	/*
-	 * only need to schedule in cgroup events if we are changing
-	 * cgroup during ctxsw. Cgroup events were not scheduled
-	 * out of ctxsw out if that was not the case.
-	 */
-	if (cgrp1 != cgrp2)
-		perf_cgroup_switch(task, PERF_CGROUP_SWIN);
-
-	rcu_read_unlock();
-}
-
 static int perf_cgroup_ensure_storage(struct perf_event *event,
 				struct cgroup_subsys_state *css)
 {
@@ -1097,16 +1035,6 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
 {
 }
 
-static inline void perf_cgroup_sched_out(struct task_struct *task,
-					 struct task_struct *next)
-{
-}
-
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
-					struct task_struct *task)
-{
-}
-
 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
 				      struct perf_event_attr *attr,
 				      struct perf_event *group_leader)
@@ -1119,11 +1047,6 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
 {
 }
 
-static inline void
-perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
-{
-}
-
 static inline u64 perf_cgroup_event_time(struct perf_event *event)
 {
 	return 0;
@@ -1143,6 +1066,10 @@ static inline void
 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
 {
 }
+
+static void perf_cgroup_switch(struct task_struct *task)
+{
+}
 #endif
 
 /*
@@ -3662,7 +3589,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
 	 * cgroup event are system-wide mode only
 	 */
 	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
-		perf_cgroup_sched_out(task, next);
+		perf_cgroup_switch(next);
 }
 
 /*
@@ -3973,16 +3900,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 	struct perf_event_context *ctx;
 	int ctxn;
 
-	/*
-	 * If cgroup events exist on this CPU, then we need to check if we have
-	 * to switch in PMU state; cgroup event are system-wide mode only.
-	 *
-	 * Since cgroup events are CPU events, we must schedule these in before
-	 * we schedule in the task events.
-	 */
-	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
-		perf_cgroup_sched_in(prev, task);
-
 	for_each_task_context_nr(ctxn) {
 		ctx = task->perf_event_ctxp[ctxn];
 		if (likely(!ctx))
@@ -13551,7 +13468,7 @@ static int __perf_cgroup_move(void *info)
 {
 	struct task_struct *task = info;
 	rcu_read_lock();
-	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+	perf_cgroup_switch(task);
 	rcu_read_unlock();
 	return 0;
 }
-- 
2.20.1



* [PATCH v3 5/5] perf/core: Always set cpuctx cgrp when enable cgroup event
From: Chengming Zhou @ 2022-03-25  3:53 UTC
  To: peterz, mingo, acme, mark.rutland, alexander.shishkin, jolsa,
	namhyung, eranian
  Cc: linux-perf-users, linux-kernel, duanxiongchun, songmuchun,
	Chengming Zhou

When enabling a cgroup event, setting cpuctx->cgrp is conditional on
the current task's cgrp matching the event's cgroup, so it has to be
done for every new event. This brings complexity but no advantage.

To keep it simple, this patch always sets cpuctx->cgrp when the first
cgroup event is enabled, and resets it to NULL when the last cgroup
event is disabled.
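
The enable/disable paths then pair up symmetrically (a condensed sketch
of the result; the full change is in the diff below):

	/* perf_cgroup_event_enable() */
	if (ctx->nr_cgroups++)
		return;
	cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
	list_add(&cpuctx->cgrp_cpuctx_entry,
		 per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));

	/* perf_cgroup_event_disable() */
	if (--ctx->nr_cgroups)
		return;
	cpuctx->cgrp = NULL;
	list_del(&cpuctx->cgrp_cpuctx_entry);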

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
 kernel/events/core.c | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d559fbbd3aaa..ccd4dfdf4810 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -967,22 +967,10 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
 	 */
 	cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
 
-	/*
-	 * Since setting cpuctx->cgrp is conditional on the current @cgrp
-	 * matching the event's cgroup, we must do this for every new event,
-	 * because if the first would mismatch, the second would not try again
-	 * and we would leave cpuctx->cgrp unset.
-	 */
-	if (ctx->is_active && !cpuctx->cgrp) {
-		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
-
-		if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
-			cpuctx->cgrp = cgrp;
-	}
-
 	if (ctx->nr_cgroups++)
 		return;
 
+	cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
 	list_add(&cpuctx->cgrp_cpuctx_entry,
 			per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
 }
@@ -1004,9 +992,7 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
 	if (--ctx->nr_cgroups)
 		return;
 
-	if (ctx->is_active && cpuctx->cgrp)
-		cpuctx->cgrp = NULL;
-
+	cpuctx->cgrp = NULL;
 	list_del(&cpuctx->cgrp_cpuctx_entry);
 }
 
-- 
2.20.1



* Re: [PATCH v3 3/5] perf/core: Don't need event_filter_match in merge_sched_in()
From: Liang, Kan @ 2022-03-25 15:11 UTC
  To: Chengming Zhou, peterz, mingo, acme, mark.rutland,
	alexander.shishkin, jolsa, namhyung, eranian
  Cc: linux-perf-users, linux-kernel, duanxiongchun, songmuchun



On 3/24/2022 11:53 PM, Chengming Zhou wrote:
> There is one obsolete comment in perf_cgroup_switch(), since we no
> longer use event_filter_match() in event_sched_out().
> 
> It also turns out we don't need event_filter_match() in
> merge_sched_in(): the perf_event groups RB-tree already gives us
> exactly the matching events, so there is no need to run them through
> event_filter_match() again.
> 
> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> ---
>   kernel/events/core.c | 6 ++----
>   1 file changed, 2 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index dd985c77bc37..225d408deb1a 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -856,7 +856,8 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>   			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
>   			/*
>   			 * must not be done before ctxswout due
> -			 * to event_filter_match() in event_sched_out()
> +			 * to update_cgrp_time_from_cpuctx() in
> +			 * ctx_sched_out()
>   			 */
>   			cpuctx->cgrp = NULL;
>   		}
> @@ -3804,9 +3805,6 @@ static int merge_sched_in(struct perf_event *event, void *data)
>   	if (event->state <= PERF_EVENT_STATE_OFF)
>   		return 0;
>   
> -	if (!event_filter_match(event))
> -		return 0;
> -

Both x86 and Arm implement a PMU-specific filter_match callback for
hybrid systems. I think the check is still required, at least for
hybrid systems.
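
For reference, the generic helper that ends up invoking those callbacks
reads roughly like this (a sketch from the surrounding code):

static inline int pmu_filter_match(struct perf_event *event)
{
	struct perf_event *sibling;

	/* __pmu_filter_match() calls pmu->filter_match() when it is set */
	if (!__pmu_filter_match(event))
		return 0;

	for_each_sibling_event(sibling, event) {
		if (!__pmu_filter_match(sibling))
			return 0;
	}

	return 1;
}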


Thanks,
Kan
>   	if (group_can_go_on(event, cpuctx, *can_add_hw)) {
>   		if (!group_sched_in(event, cpuctx, ctx))
>   			list_add_tail(&event->active_list, get_event_list(event));


* Re: [External] Re: [PATCH v3 3/5] perf/core: Don't need event_filter_match in merge_sched_in()
From: Chengming Zhou @ 2022-03-25 15:45 UTC
  To: Liang, Kan, peterz, mingo, acme, mark.rutland,
	alexander.shishkin, jolsa, namhyung, eranian
  Cc: linux-perf-users, linux-kernel, duanxiongchun, songmuchun

On 2022/3/25 23:11, Liang, Kan wrote:
> 
> 
> On 3/24/2022 11:53 PM, Chengming Zhou wrote:
>> There is one obsolete comment in perf_cgroup_switch(), since we no
>> longer use event_filter_match() in event_sched_out().
>>
>> It also turns out we don't need event_filter_match() in
>> merge_sched_in(): the perf_event groups RB-tree already gives us
>> exactly the matching events, so there is no need to run them through
>> event_filter_match() again.
>>
>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>> ---
>>   kernel/events/core.c | 6 ++----
>>   1 file changed, 2 insertions(+), 4 deletions(-)
>>
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index dd985c77bc37..225d408deb1a 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -856,7 +856,8 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>>               cpu_ctx_sched_out(cpuctx, EVENT_ALL);
>>               /*
>>                * must not be done before ctxswout due
>> -             * to event_filter_match() in event_sched_out()
>> +             * to update_cgrp_time_from_cpuctx() in
>> +             * ctx_sched_out()
>>                */
>>               cpuctx->cgrp = NULL;
>>           }
>> @@ -3804,9 +3805,6 @@ static int merge_sched_in(struct perf_event *event, void *data)
>>       if (event->state <= PERF_EVENT_STATE_OFF)
>>           return 0;
>>   -    if (!event_filter_match(event))
>> -        return 0;
>> -
> 
> Both x86 and Arm implement a PMU-specific filter_match callback for hybrid systems. I think the check is still required, at least for hybrid systems.
> 

Ah, I overlooked this point, will fix it.

Thanks.

> 
> Thanks,
> Kan
>>       if (group_can_go_on(event, cpuctx, *can_add_hw)) {
>>           if (!group_sched_in(event, cpuctx, ctx))
>>               list_add_tail(&event->active_list, get_event_list(event));

