* [PATCH 1/4] perf: Fix system-wide events miscounting during cgroup monitoring
2019-04-29 14:44 [PATCH 0/4] Optimize cgroup context switch kan.liang
@ 2019-04-29 14:44 ` kan.liang
2019-04-29 15:04 ` Mark Rutland
2019-04-30 8:56 ` Peter Zijlstra
2019-04-29 14:44 ` [PATCH 2/4] perf: Add filter_match() as a parameter for pinned/flexible_sched_in() kan.liang
` (2 subsequent siblings)
3 siblings, 2 replies; 18+ messages in thread
From: kan.liang @ 2019-04-29 14:44 UTC (permalink / raw)
To: peterz, tglx, mingo, linux-kernel; +Cc: eranian, tj, ak, Kan Liang
From: Kan Liang <kan.liang@linux.intel.com>
When counting system-wide events and cgroup events simultaneously, the
values of the system-wide events are miscounted. For example,
perf stat -e cycles,instructions -e cycles,instructions -G
cgroup1,cgroup1,cgroup2,cgroup2 -a -e cycles,instructions -I 1000
1.096265502 12,375,398,872 cycles cgroup1
1.096265502 8,396,184,503 instructions cgroup1
# 0.10 insn per cycle
1.096265502 109,609,027,112 cycles cgroup2
1.096265502 11,533,690,148 instructions cgroup2
# 0.14 insn per cycle
1.096265502 121,672,937,058 cycles
1.096265502 19,331,496,727 instructions #
0.24 insn per cycle
The system-wide and cgroup events are identical. However, the
value of the system-wide events is less than the sum of the cgroup
events, which is wrong.
Both system-wide and cgroup are per-cpu. They share the same cpuctx
groups, cpuctx->flexible_groups/pinned_groups.
During a context switch, the cgroup switch tries to schedule all the
events in the cpuctx groups. The unmatched cgroup events can be filtered
by their event->cgrp. However, system-wide events, whose event->cgrp is
NULL, are unconditionally switched, which causes the miscounting.
Introduce cgrp_switch in cpuctx to indicate the cgroup context switch.
If it's system-wide event in context switch, don't try to switch it.
Fixes: e5d1367f17ba ("perf: Add cgroup support")
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
include/linux/perf_event.h | 1 +
kernel/events/core.c | 30 ++++++++++++++++++++++++++++--
2 files changed, 29 insertions(+), 2 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e47ef76..039e2f2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -795,6 +795,7 @@ struct perf_cpu_context {
#ifdef CONFIG_CGROUP_PERF
struct perf_cgroup *cgrp;
struct list_head cgrp_cpuctx_entry;
+ unsigned int cgrp_switch :1;
#endif
struct list_head sched_cb_entry;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dc7dead..388dd42 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -809,6 +809,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(cpuctx->ctx.pmu);
+ cpuctx->cgrp_switch = true;
if (mode & PERF_CGROUP_SWOUT) {
cpu_ctx_sched_out(cpuctx, EVENT_ALL);
@@ -832,6 +833,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
&cpuctx->ctx);
cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
}
+ cpuctx->cgrp_switch = false;
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
@@ -2944,13 +2946,25 @@ static void ctx_sched_out(struct perf_event_context *ctx,
perf_pmu_disable(ctx->pmu);
if (is_active & EVENT_PINNED) {
- list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
+ list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) {
+#ifdef CONFIG_CGROUP_PERF
+ /* Don't sched system-wide event when cgroup context switch */
+ if (cpuctx->cgrp_switch && !event->cgrp)
+ continue;
+#endif
group_sched_out(event, cpuctx, ctx);
+ }
}
if (is_active & EVENT_FLEXIBLE) {
- list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
+ list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) {
+#ifdef CONFIG_CGROUP_PERF
+ /* Don't sched system-wide event when cgroup context switch */
+ if (cpuctx->cgrp_switch && !event->cgrp)
+ continue;
+#endif
group_sched_out(event, cpuctx, ctx);
+ }
}
perf_pmu_enable(ctx->pmu);
}
@@ -3280,6 +3294,12 @@ static int pinned_sched_in(struct perf_event *event, void *data)
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
+#ifdef CONFIG_CGROUP_PERF
+ /* Don't sched system-wide event when cgroup context switch */
+ if (sid->cpuctx->cgrp_switch && !event->cgrp)
+ return 0;
+#endif
+
if (!event_filter_match(event))
return 0;
@@ -3305,6 +3325,12 @@ static int flexible_sched_in(struct perf_event *event, void *data)
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
+#ifdef CONFIG_CGROUP_PERF
+ /* Don't sched system-wide event when cgroup context switch */
+ if (sid->cpuctx->cgrp_switch && !event->cgrp)
+ return 0;
+#endif
+
if (!event_filter_match(event))
return 0;
--
2.7.4
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH 1/4] perf: Fix system-wide events miscounting during cgroup monitoring
2019-04-29 14:44 ` [PATCH 1/4] perf: Fix system-wide events miscounting during cgroup monitoring kan.liang
@ 2019-04-29 15:04 ` Mark Rutland
2019-04-29 15:27 ` Liang, Kan
2019-04-30 8:56 ` Peter Zijlstra
1 sibling, 1 reply; 18+ messages in thread
From: Mark Rutland @ 2019-04-29 15:04 UTC (permalink / raw)
To: kan.liang; +Cc: peterz, tglx, mingo, linux-kernel, eranian, tj, ak
On Mon, Apr 29, 2019 at 07:44:02AM -0700, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
>
> When counting system-wide events and cgroup events simultaneously, the
> value of system-wide events are miscounting. For example,
>
> perf stat -e cycles,instructions -e cycles,instructions -G
> cgroup1,cgroup1,cgroup2,cgroup2 -a -e cycles,instructions -I 1000
>
> 1.096265502 12,375,398,872 cycles cgroup1
> 1.096265502 8,396,184,503 instructions cgroup1
> # 0.10 insn per cycle
> 1.096265502 109,609,027,112 cycles cgroup2
> 1.096265502 11,533,690,148 instructions cgroup2
> # 0.14 insn per cycle
> 1.096265502 121,672,937,058 cycles
> 1.096265502 19,331,496,727 instructions #
> 0.24 insn per cycle
>
> The events are identical events for system-wide and cgroup. The
> value of system-wide events is less than the sum of cgroup events,
> which is wrong.
>
> Both system-wide and cgroup are per-cpu. They share the same cpuctx
> groups, cpuctx->flexible_groups/pinned_groups.
> In context switch, cgroup switch tries to schedule all the events in
> the cpuctx groups. The unmatched cgroup events can be filtered by its
> event->cgrp. However, system-wide events, which event->cgrp is NULL, are
> unconditionally switched, which causes miscounting.
Why exactly does that cause mis-counting?
Are the system-wide events not switched back in? Or filtered
erroneously?
> Introduce cgrp_switch in cpuctx to indicate the cgroup context switch.
> If it's system-wide event in context switch, don't try to switch it.
>
> Fixes: e5d1367f17ba ("perf: Add cgroup support")
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
> ---
> include/linux/perf_event.h | 1 +
> kernel/events/core.c | 30 ++++++++++++++++++++++++++++--
> 2 files changed, 29 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index e47ef76..039e2f2 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -795,6 +795,7 @@ struct perf_cpu_context {
> #ifdef CONFIG_CGROUP_PERF
> struct perf_cgroup *cgrp;
> struct list_head cgrp_cpuctx_entry;
> + unsigned int cgrp_switch :1;
> #endif
>
> struct list_head sched_cb_entry;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index dc7dead..388dd42 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -809,6 +809,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>
> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> perf_pmu_disable(cpuctx->ctx.pmu);
> + cpuctx->cgrp_switch = true;
>
> if (mode & PERF_CGROUP_SWOUT) {
> cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> @@ -832,6 +833,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
> &cpuctx->ctx);
> cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
> }
> + cpuctx->cgrp_switch = false;
> perf_pmu_enable(cpuctx->ctx.pmu);
> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> }
> @@ -2944,13 +2946,25 @@ static void ctx_sched_out(struct perf_event_context *ctx,
>
> perf_pmu_disable(ctx->pmu);
> if (is_active & EVENT_PINNED) {
> - list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
> + list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) {
> +#ifdef CONFIG_CGROUP_PERF
> + /* Don't sched system-wide event when cgroup context switch */
> + if (cpuctx->cgrp_switch && !event->cgrp)
> + continue;
> +#endif
This pattern is duplicated several times, and should probably be a
helper like:
static bool skip_cgroup_switch(struct perf_cpu_context *cpuctx,
struct perf_event *event);
{
return IS_ENABLED(CONFIG_CGROUP_PERF) &&
cpuctx->cgrp_switch &&
!event->cgrp;
}
... allowing the above to be an unconditional:
if (skip_cgroup_switch(cpuctx, event))
continue;
... and likewise for the other cases.
Thanks,
Mark.
> group_sched_out(event, cpuctx, ctx);
> + }
> }
>
> if (is_active & EVENT_FLEXIBLE) {
> - list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
> + list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) {
> +#ifdef CONFIG_CGROUP_PERF
> + /* Don't sched system-wide event when cgroup context switch */
> + if (cpuctx->cgrp_switch && !event->cgrp)
> + continue;
> +#endif
> group_sched_out(event, cpuctx, ctx);
> + }
> }
> perf_pmu_enable(ctx->pmu);
> }
> @@ -3280,6 +3294,12 @@ static int pinned_sched_in(struct perf_event *event, void *data)
> if (event->state <= PERF_EVENT_STATE_OFF)
> return 0;
>
> +#ifdef CONFIG_CGROUP_PERF
> + /* Don't sched system-wide event when cgroup context switch */
> + if (sid->cpuctx->cgrp_switch && !event->cgrp)
> + return 0;
> +#endif
> +
> if (!event_filter_match(event))
> return 0;
>
> @@ -3305,6 +3325,12 @@ static int flexible_sched_in(struct perf_event *event, void *data)
> if (event->state <= PERF_EVENT_STATE_OFF)
> return 0;
>
> +#ifdef CONFIG_CGROUP_PERF
> + /* Don't sched system-wide event when cgroup context switch */
> + if (sid->cpuctx->cgrp_switch && !event->cgrp)
> + return 0;
> +#endif
> +
> if (!event_filter_match(event))
> return 0;
>
> --
> 2.7.4
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/4] perf: Fix system-wide events miscounting during cgroup monitoring
2019-04-29 15:04 ` Mark Rutland
@ 2019-04-29 15:27 ` Liang, Kan
0 siblings, 0 replies; 18+ messages in thread
From: Liang, Kan @ 2019-04-29 15:27 UTC (permalink / raw)
To: Mark Rutland; +Cc: peterz, tglx, mingo, linux-kernel, eranian, tj, ak
On 4/29/2019 11:04 AM, Mark Rutland wrote:
> On Mon, Apr 29, 2019 at 07:44:02AM -0700, kan.liang@linux.intel.com wrote:
>> From: Kan Liang <kan.liang@linux.intel.com>
>>
>> When counting system-wide events and cgroup events simultaneously, the
>> value of system-wide events are miscounting. For example,
>>
>> perf stat -e cycles,instructions -e cycles,instructions -G
>> cgroup1,cgroup1,cgroup2,cgroup2 -a -e cycles,instructions -I 1000
>>
>> 1.096265502 12,375,398,872 cycles cgroup1
>> 1.096265502 8,396,184,503 instructions cgroup1
>> # 0.10 insn per cycle
>> 1.096265502 109,609,027,112 cycles cgroup2
>> 1.096265502 11,533,690,148 instructions cgroup2
>> # 0.14 insn per cycle
>> 1.096265502 121,672,937,058 cycles
>> 1.096265502 19,331,496,727 instructions #
>> 0.24 insn per cycle
>>
>> The events are identical events for system-wide and cgroup. The
>> value of system-wide events is less than the sum of cgroup events,
>> which is wrong.
>>
>> Both system-wide and cgroup are per-cpu. They share the same cpuctx
>> groups, cpuctx->flexible_groups/pinned_groups.
>> In context switch, cgroup switch tries to schedule all the events in
>> the cpuctx groups. The unmatched cgroup events can be filtered by its
>> event->cgrp. However, system-wide events, which event->cgrp is NULL, are
>> unconditionally switched, which causes miscounting.
>
> Why exactly does that cause mis-counting?
>
> Are the system-wide events not switched back in? Or filtered
> erroneously?
The small period between the previous cgroup's sched_out and the new
cgroup's sched_in is missed for system-wide events.
The current code mistakenly schedules out the system-wide events during that period.
>
>> Introduce cgrp_switch in cpuctx to indicate the cgroup context switch.
>> If it's system-wide event in context switch, don't try to switch it.
>>
>> Fixes: e5d1367f17ba ("perf: Add cgroup support")
>> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
>> ---
>> include/linux/perf_event.h | 1 +
>> kernel/events/core.c | 30 ++++++++++++++++++++++++++++--
>> 2 files changed, 29 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index e47ef76..039e2f2 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -795,6 +795,7 @@ struct perf_cpu_context {
>> #ifdef CONFIG_CGROUP_PERF
>> struct perf_cgroup *cgrp;
>> struct list_head cgrp_cpuctx_entry;
>> + unsigned int cgrp_switch :1;
>> #endif
>>
>> struct list_head sched_cb_entry;
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index dc7dead..388dd42 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -809,6 +809,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>>
>> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>> perf_pmu_disable(cpuctx->ctx.pmu);
>> + cpuctx->cgrp_switch = true;
>>
>> if (mode & PERF_CGROUP_SWOUT) {
>> cpu_ctx_sched_out(cpuctx, EVENT_ALL);
>> @@ -832,6 +833,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>> &cpuctx->ctx);
>> cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
>> }
>> + cpuctx->cgrp_switch = false;
>> perf_pmu_enable(cpuctx->ctx.pmu);
>> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> }
>> @@ -2944,13 +2946,25 @@ static void ctx_sched_out(struct perf_event_context *ctx,
>>
>> perf_pmu_disable(ctx->pmu);
>> if (is_active & EVENT_PINNED) {
>> - list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
>> + list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) {
>> +#ifdef CONFIG_CGROUP_PERF
>> + /* Don't sched system-wide event when cgroup context switch */
>> + if (cpuctx->cgrp_switch && !event->cgrp)
>> + continue;
>> +#endif
>
> This pattern is duplicated several times, and should probably be a
> helper like:
>
> static bool skip_cgroup_switch(struct perf_cpu_context *cpuctx,
> struct perf_event *event);
> {
> return IS_ENABLED(CONFIG_CGROUP_PERF) &&
> cpuctx->cgrp_switch &&
> !event->cgrp;
> }
>
> ... allowing the above to be an unconditional:
>
> if (skip_cgroup_switch(cpuctx, event))
> continue;
>
> ... and likewise for the other cases.
>
I will change it in V2.
Thanks,
Kan
> Thanks,
> Mark.
>
>> group_sched_out(event, cpuctx, ctx);
>> + }
>> }
>>
>> if (is_active & EVENT_FLEXIBLE) {
>> - list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
>> + list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) {
>> +#ifdef CONFIG_CGROUP_PERF
>> + /* Don't sched system-wide event when cgroup context switch */
>> + if (cpuctx->cgrp_switch && !event->cgrp)
>> + continue;
>> +#endif
>> group_sched_out(event, cpuctx, ctx);
>> + }
>> }
>> perf_pmu_enable(ctx->pmu);
>> }
>> @@ -3280,6 +3294,12 @@ static int pinned_sched_in(struct perf_event *event, void *data)
>> if (event->state <= PERF_EVENT_STATE_OFF)
>> return 0;
>>
>> +#ifdef CONFIG_CGROUP_PERF
>> + /* Don't sched system-wide event when cgroup context switch */
>> + if (sid->cpuctx->cgrp_switch && !event->cgrp)
>> + return 0;
>> +#endif
>> +
>> if (!event_filter_match(event))
>> return 0;
>>
>> @@ -3305,6 +3325,12 @@ static int flexible_sched_in(struct perf_event *event, void *data)
>> if (event->state <= PERF_EVENT_STATE_OFF)
>> return 0;
>>
>> +#ifdef CONFIG_CGROUP_PERF
>> + /* Don't sched system-wide event when cgroup context switch */
>> + if (sid->cpuctx->cgrp_switch && !event->cgrp)
>> + return 0;
>> +#endif
>> +
>> if (!event_filter_match(event))
>> return 0;
>>
>> --
>> 2.7.4
>>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/4] perf: Fix system-wide events miscounting during cgroup monitoring
2019-04-29 14:44 ` [PATCH 1/4] perf: Fix system-wide events miscounting during cgroup monitoring kan.liang
2019-04-29 15:04 ` Mark Rutland
@ 2019-04-30 8:56 ` Peter Zijlstra
2019-04-30 15:45 ` Liang, Kan
1 sibling, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2019-04-30 8:56 UTC (permalink / raw)
To: kan.liang; +Cc: tglx, mingo, linux-kernel, eranian, tj, ak
On Mon, Apr 29, 2019 at 07:44:02AM -0700, kan.liang@linux.intel.com wrote:
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index e47ef76..039e2f2 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -795,6 +795,7 @@ struct perf_cpu_context {
> #ifdef CONFIG_CGROUP_PERF
> struct perf_cgroup *cgrp;
> struct list_head cgrp_cpuctx_entry;
> + unsigned int cgrp_switch :1;
If you're not adding more bits, why not just keep it an int?
> #endif
>
> struct list_head sched_cb_entry;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index dc7dead..388dd42 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -809,6 +809,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>
> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> perf_pmu_disable(cpuctx->ctx.pmu);
> + cpuctx->cgrp_switch = true;
>
> if (mode & PERF_CGROUP_SWOUT) {
> cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> @@ -832,6 +833,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
> &cpuctx->ctx);
> cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
> }
> + cpuctx->cgrp_switch = false;
> perf_pmu_enable(cpuctx->ctx.pmu);
> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> }
That is a bit of a hack...
> @@ -2944,13 +2946,25 @@ static void ctx_sched_out(struct perf_event_context *ctx,
>
> perf_pmu_disable(ctx->pmu);
> if (is_active & EVENT_PINNED) {
> - list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
> + list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) {
> +#ifdef CONFIG_CGROUP_PERF
> + /* Don't sched system-wide event when cgroup context switch */
> + if (cpuctx->cgrp_switch && !event->cgrp)
> + continue;
> +#endif
> group_sched_out(event, cpuctx, ctx);
> + }
> }
This works by accident, however..
>
> if (is_active & EVENT_FLEXIBLE) {
> - list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
> + list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) {
> +#ifdef CONFIG_CGROUP_PERF
> + /* Don't sched system-wide event when cgroup context switch */
> + if (cpuctx->cgrp_switch && !event->cgrp)
> + continue;
> +#endif
> group_sched_out(event, cpuctx, ctx);
> + }
> }
> perf_pmu_enable(ctx->pmu);
> }
this one is just wrong afaict.
Suppose the new cgroup has pinned events, which we cannot schedule
because you left !cgroup flexible events on.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/4] perf: Fix system-wide events miscounting during cgroup monitoring
2019-04-30 8:56 ` Peter Zijlstra
@ 2019-04-30 15:45 ` Liang, Kan
0 siblings, 0 replies; 18+ messages in thread
From: Liang, Kan @ 2019-04-30 15:45 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: tglx, mingo, linux-kernel, eranian, tj, ak
On 4/30/2019 4:56 AM, Peter Zijlstra wrote:
> On Mon, Apr 29, 2019 at 07:44:02AM -0700, kan.liang@linux.intel.com wrote:
>
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index e47ef76..039e2f2 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -795,6 +795,7 @@ struct perf_cpu_context {
>> #ifdef CONFIG_CGROUP_PERF
>> struct perf_cgroup *cgrp;
>> struct list_head cgrp_cpuctx_entry;
>> + unsigned int cgrp_switch :1;
>
> If you're not adding more bits, why not just keep it an int?
>
Yes, there is no more bits for now.
Sure, I will change it to int.
>> #endif
>>
>> struct list_head sched_cb_entry;
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index dc7dead..388dd42 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -809,6 +809,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>>
>> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>> perf_pmu_disable(cpuctx->ctx.pmu);
>> + cpuctx->cgrp_switch = true;
>>
>> if (mode & PERF_CGROUP_SWOUT) {
>> cpu_ctx_sched_out(cpuctx, EVENT_ALL);
>> @@ -832,6 +833,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
>> &cpuctx->ctx);
>> cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
>> }
>> + cpuctx->cgrp_switch = false;
>> perf_pmu_enable(cpuctx->ctx.pmu);
>> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> }
>
> That is a bit of a hack...
We need an indicator of the cgroup context switch, so we can apply the
optimization.
I was thinking to use the existing variables, e.g. cpuctx->cgrp. But it
doesn't work. We cannot tell if it's the first sched_in or in a context
switch by checking cpuctx->cgrp.
cgrp_switch should be the simplest method.
>
>> @@ -2944,13 +2946,25 @@ static void ctx_sched_out(struct perf_event_context *ctx,
>>
>> perf_pmu_disable(ctx->pmu);
>> if (is_active & EVENT_PINNED) {
>> - list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
>> + list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) {
>> +#ifdef CONFIG_CGROUP_PERF
>> + /* Don't sched system-wide event when cgroup context switch */
>> + if (cpuctx->cgrp_switch && !event->cgrp)
>> + continue;
>> +#endif
>> group_sched_out(event, cpuctx, ctx);
>> + }
>> }
>
> This works by accident, however..
>
>>
>> if (is_active & EVENT_FLEXIBLE) {
>> - list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
>> + list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) {
>> +#ifdef CONFIG_CGROUP_PERF
>> + /* Don't sched system-wide event when cgroup context switch */
>> + if (cpuctx->cgrp_switch && !event->cgrp)
>> + continue;
>> +#endif
>> group_sched_out(event, cpuctx, ctx);
>> + }
>> }
>> perf_pmu_enable(ctx->pmu);
>> }
>
> this one is just wrong afaict.
>
> Suppose the new cgroup has pinned events, which we cannot schedule
> because you left !cgroup flexible events on.
>
Right, the priority order may be changed.
I think we can track the event type in perf_cgroup.
If the new cgroup has pinned events, we can fall back to slow path,
which will switch everything.
How about the patch as below? (Not test yet)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7eff286..f751852 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -843,6 +843,8 @@ struct perf_cgroup {
struct perf_cgroup_info __percpu *info;
/* perf cgroup ID = (CPU ID << 32) | css subsys-unique ID */
u64 __percpu *cgrp_id;
+
+ int cgrp_event_type;
};
/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2066322..557d371 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -839,6 +839,22 @@ static void perf_cgroup_switch(struct task_struct
*task, int mode)
*/
cpuctx->cgrp = perf_cgroup_from_task(task,
&cpuctx->ctx);
+
+ /*
+ * The system-wide events weren't sched out.
+ * To keep the following priority order:
+ * cpu pinned, cpu flexible
+ * We need to switch system-wide events as well,
+ * if the new cgroup has pinned events.
+ *
+ * Disable fast path and sched out the flexible events.
+ */
+ if (cpuctx->cgrp->cgrp_event_type & EVENT_PINNED) {
+ /* disable fast path */
+ cpuctx->cgrp_switch = false;
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ }
+
cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
}
cpuctx->cgrp_switch = false;
@@ -924,6 +940,11 @@ static inline int perf_cgroup_connect(int fd,
struct perf_event *event,
cgrp = container_of(css, struct perf_cgroup, css);
event->cgrp = cgrp;
+ if (event->attr.pinned)
+ cgrp->cgrp_event_type |= EVENT_PINNED;
+ else
+ cgrp->cgrp_event_type |= EVENT_FLEXIBLE;
+
cgrp_id = ((u64)smp_processor_id() << 32) | css->id;
event->cgrp_id = cgrp_id;
*per_cpu_ptr(cgrp->cgrp_id, event->cpu) = cgrp_id;
Thanks,
Kan
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH 2/4] perf: Add filter_match() as a parameter for pinned/flexible_sched_in()
2019-04-29 14:44 [PATCH 0/4] Optimize cgroup context switch kan.liang
2019-04-29 14:44 ` [PATCH 1/4] perf: Fix system-wide events miscounting during cgroup monitoring kan.liang
@ 2019-04-29 14:44 ` kan.liang
2019-04-29 15:12 ` Mark Rutland
2019-04-29 14:44 ` [PATCH 3/4] perf cgroup: Add cgroup ID as a key of RB tree kan.liang
2019-04-29 14:44 ` [PATCH 4/4] perf cgroup: Add fast path for cgroup switch kan.liang
3 siblings, 1 reply; 18+ messages in thread
From: kan.liang @ 2019-04-29 14:44 UTC (permalink / raw)
To: peterz, tglx, mingo, linux-kernel; +Cc: eranian, tj, ak, Kan Liang
From: Kan Liang <kan.liang@linux.intel.com>
A fast path will be introduced in the following patches to speed up the
cgroup events sched in, which only needs a simpler filter_match().
Add filter_match() as a parameter for pinned/flexible_sched_in().
No functional change.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
kernel/events/core.c | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 388dd42..782fd86 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3251,7 +3251,8 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
}
static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
- int (*func)(struct perf_event *, void *), void *data)
+ int (*func)(struct perf_event *, void *, int (*)(struct perf_event *)),
+ void *data)
{
struct perf_event **evt, *evt1, *evt2;
int ret;
@@ -3271,7 +3272,7 @@ static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
evt = &evt2;
}
- ret = func(*evt, data);
+ ret = func(*evt, data, event_filter_match);
if (ret)
return ret;
@@ -3287,7 +3288,8 @@ struct sched_in_data {
int can_add_hw;
};
-static int pinned_sched_in(struct perf_event *event, void *data)
+static int pinned_sched_in(struct perf_event *event, void *data,
+ int (*filter_match)(struct perf_event *))
{
struct sched_in_data *sid = data;
@@ -3300,7 +3302,7 @@ static int pinned_sched_in(struct perf_event *event, void *data)
return 0;
#endif
- if (!event_filter_match(event))
+ if (!filter_match(event))
return 0;
if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
@@ -3318,7 +3320,8 @@ static int pinned_sched_in(struct perf_event *event, void *data)
return 0;
}
-static int flexible_sched_in(struct perf_event *event, void *data)
+static int flexible_sched_in(struct perf_event *event, void *data,
+ int (*filter_match)(struct perf_event *))
{
struct sched_in_data *sid = data;
@@ -3331,7 +3334,7 @@ static int flexible_sched_in(struct perf_event *event, void *data)
return 0;
#endif
- if (!event_filter_match(event))
+ if (!filter_match(event))
return 0;
if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
--
2.7.4
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH 2/4] perf: Add filter_match() as a parameter for pinned/flexible_sched_in()
2019-04-29 14:44 ` [PATCH 2/4] perf: Add filter_match() as a parameter for pinned/flexible_sched_in() kan.liang
@ 2019-04-29 15:12 ` Mark Rutland
2019-04-29 15:31 ` Liang, Kan
0 siblings, 1 reply; 18+ messages in thread
From: Mark Rutland @ 2019-04-29 15:12 UTC (permalink / raw)
To: kan.liang; +Cc: peterz, tglx, mingo, linux-kernel, eranian, tj, ak
On Mon, Apr 29, 2019 at 07:44:03AM -0700, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
>
> A fast path will be introduced in the following patches to speed up the
> cgroup events sched in, which only needs a simpler filter_match().
>
> Add filter_match() as a parameter for pinned/flexible_sched_in().
>
> No functional change.
I suspect that the cost you're trying to avoid is pmu_filter_match()
iterating over the entire group, which arm systems rely upon for correct
behaviour on big.LITTLE systems.
Is that the case?
Thanks,
Mark.
>
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
> ---
> kernel/events/core.c | 15 +++++++++------
> 1 file changed, 9 insertions(+), 6 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 388dd42..782fd86 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -3251,7 +3251,8 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
> }
>
> static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
> - int (*func)(struct perf_event *, void *), void *data)
> + int (*func)(struct perf_event *, void *, int (*)(struct perf_event *)),
> + void *data)
> {
> struct perf_event **evt, *evt1, *evt2;
> int ret;
> @@ -3271,7 +3272,7 @@ static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
> evt = &evt2;
> }
>
> - ret = func(*evt, data);
> + ret = func(*evt, data, event_filter_match);
> if (ret)
> return ret;
>
> @@ -3287,7 +3288,8 @@ struct sched_in_data {
> int can_add_hw;
> };
>
> -static int pinned_sched_in(struct perf_event *event, void *data)
> +static int pinned_sched_in(struct perf_event *event, void *data,
> + int (*filter_match)(struct perf_event *))
> {
> struct sched_in_data *sid = data;
>
> @@ -3300,7 +3302,7 @@ static int pinned_sched_in(struct perf_event *event, void *data)
> return 0;
> #endif
>
> - if (!event_filter_match(event))
> + if (!filter_match(event))
> return 0;
>
> if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
> @@ -3318,7 +3320,8 @@ static int pinned_sched_in(struct perf_event *event, void *data)
> return 0;
> }
>
> -static int flexible_sched_in(struct perf_event *event, void *data)
> +static int flexible_sched_in(struct perf_event *event, void *data,
> + int (*filter_match)(struct perf_event *))
> {
> struct sched_in_data *sid = data;
>
> @@ -3331,7 +3334,7 @@ static int flexible_sched_in(struct perf_event *event, void *data)
> return 0;
> #endif
>
> - if (!event_filter_match(event))
> + if (!filter_match(event))
> return 0;
>
> if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
> --
> 2.7.4
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 2/4] perf: Add filter_match() as a parameter for pinned/flexible_sched_in()
2019-04-29 15:12 ` Mark Rutland
@ 2019-04-29 15:31 ` Liang, Kan
2019-04-29 16:56 ` Mark Rutland
0 siblings, 1 reply; 18+ messages in thread
From: Liang, Kan @ 2019-04-29 15:31 UTC (permalink / raw)
To: Mark Rutland; +Cc: peterz, tglx, mingo, linux-kernel, eranian, tj, ak
On 4/29/2019 11:12 AM, Mark Rutland wrote:
> On Mon, Apr 29, 2019 at 07:44:03AM -0700, kan.liang@linux.intel.com wrote:
>> From: Kan Liang <kan.liang@linux.intel.com>
>>
>> A fast path will be introduced in the following patches to speed up the
>> cgroup events sched in, which only needs a simpler filter_match().
>>
>> Add filter_match() as a parameter for pinned/flexible_sched_in().
>>
>> No functional change.
>
> I suspect that the cost you're trying to avoid is pmu_filter_match()
> iterating over the entire group, which arm systems rely upon for correct
> behaviour on big.LITTLE systems.
>
> Is that the case?
No. In X86, we don't use pmu_filter_match(). The fast path still keeps
this filter.
perf_cgroup_match() is the one I want to avoid.
Thanks,
Kan
>
> Thanks,
> Mark.
>
>>
>> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
>> ---
>> kernel/events/core.c | 15 +++++++++------
>> 1 file changed, 9 insertions(+), 6 deletions(-)
>>
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 388dd42..782fd86 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -3251,7 +3251,8 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
>> }
>>
>> static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
>> - int (*func)(struct perf_event *, void *), void *data)
>> + int (*func)(struct perf_event *, void *, int (*)(struct perf_event *)),
>> + void *data)
>> {
>> struct perf_event **evt, *evt1, *evt2;
>> int ret;
>> @@ -3271,7 +3272,7 @@ static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
>> evt = &evt2;
>> }
>>
>> - ret = func(*evt, data);
>> + ret = func(*evt, data, event_filter_match);
>> if (ret)
>> return ret;
>>
>> @@ -3287,7 +3288,8 @@ struct sched_in_data {
>> int can_add_hw;
>> };
>>
>> -static int pinned_sched_in(struct perf_event *event, void *data)
>> +static int pinned_sched_in(struct perf_event *event, void *data,
>> + int (*filter_match)(struct perf_event *))
>> {
>> struct sched_in_data *sid = data;
>>
>> @@ -3300,7 +3302,7 @@ static int pinned_sched_in(struct perf_event *event, void *data)
>> return 0;
>> #endif
>>
>> - if (!event_filter_match(event))
>> + if (!filter_match(event))
>> return 0;
>>
>> if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
>> @@ -3318,7 +3320,8 @@ static int pinned_sched_in(struct perf_event *event, void *data)
>> return 0;
>> }
>>
>> -static int flexible_sched_in(struct perf_event *event, void *data)
>> +static int flexible_sched_in(struct perf_event *event, void *data,
>> + int (*filter_match)(struct perf_event *))
>> {
>> struct sched_in_data *sid = data;
>>
>> @@ -3331,7 +3334,7 @@ static int flexible_sched_in(struct perf_event *event, void *data)
>> return 0;
>> #endif
>>
>> - if (!event_filter_match(event))
>> + if (!filter_match(event))
>> return 0;
>>
>> if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
>> --
>> 2.7.4
>>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 2/4] perf: Add filter_match() as a parameter for pinned/flexible_sched_in()
2019-04-29 15:31 ` Liang, Kan
@ 2019-04-29 16:56 ` Mark Rutland
0 siblings, 0 replies; 18+ messages in thread
From: Mark Rutland @ 2019-04-29 16:56 UTC (permalink / raw)
To: Liang, Kan; +Cc: peterz, tglx, mingo, linux-kernel, eranian, tj, ak
On Mon, Apr 29, 2019 at 11:31:26AM -0400, Liang, Kan wrote:
>
>
> On 4/29/2019 11:12 AM, Mark Rutland wrote:
> > On Mon, Apr 29, 2019 at 07:44:03AM -0700, kan.liang@linux.intel.com wrote:
> > > From: Kan Liang <kan.liang@linux.intel.com>
> > >
> > > A fast path will be introduced in the following patches to speed up the
> > > cgroup events sched in, which only needs a simpler filter_match().
> > >
> > > Add filter_match() as a parameter for pinned/flexible_sched_in().
> > >
> > > No functional change.
> >
> > I suspect that the cost you're trying to avoid is pmu_filter_match()
> > iterating over the entire group, which arm systems rely upon for correct
> > behaviour on big.LITTLE systems.
> >
> > Is that the case?
>
> No. In X86, we don't use pmu_filter_match(). The fast path still keeps this
> filter.
> perf_cgroup_match() is the one I want to avoid.
Understood; sorry for the silly question, and thanks for confirming! :)
Thanks,
Mark.
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH 3/4] perf cgroup: Add cgroup ID as a key of RB tree
2019-04-29 14:44 [PATCH 0/4] Optimize cgroup context switch kan.liang
2019-04-29 14:44 ` [PATCH 1/4] perf: Fix system-wide events miscounting during cgroup monitoring kan.liang
2019-04-29 14:44 ` [PATCH 2/4] perf: Add filter_match() as a parameter for pinned/flexible_sched_in() kan.liang
@ 2019-04-29 14:44 ` kan.liang
2019-04-29 23:02 ` Ian Rogers
` (2 more replies)
2019-04-29 14:44 ` [PATCH 4/4] perf cgroup: Add fast path for cgroup switch kan.liang
3 siblings, 3 replies; 18+ messages in thread
From: kan.liang @ 2019-04-29 14:44 UTC (permalink / raw)
To: peterz, tglx, mingo, linux-kernel; +Cc: eranian, tj, ak, Kan Liang
From: Kan Liang <kan.liang@linux.intel.com>
Current RB tree for pinned/flexible groups doesn't take cgroup into
account. All events on a given CPU will be fed to
pinned/flexible_sched_in(), which relies on perf_cgroup_match() to
filter the events for a specific cgroup. The method has high overhead,
especially in frequent context switch with several events and cgroups
involved.
Add unique cgrp_id for each cgroup, which is composed of the CPU ID and the
css subsys-unique ID. The low 32bit of cgrp_id (css subsys-unique ID) is
used as part of complex key of RB tree.
Events in the same cgroup have the same cgrp_id.
The cgrp_id is always zero for non-cgroup case. There is no functional
change for non-cgroup case.
Add perf_event_groups_first_cgroup() and
perf_event_groups_next_cgroup(), which will be used later to traverse
the events for a specific cgroup on a given CPU.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
include/linux/perf_event.h | 6 ++++
kernel/events/core.c | 84 ++++++++++++++++++++++++++++++++++++++++++----
2 files changed, 83 insertions(+), 7 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 039e2f2..7eff286 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -703,6 +703,7 @@ struct perf_event {
#ifdef CONFIG_CGROUP_PERF
struct perf_cgroup *cgrp; /* cgroup event is attach to */
+ u64 cgrp_id; /* perf cgroup ID */
#endif
struct list_head sb_list;
@@ -825,6 +826,9 @@ struct bpf_perf_event_data_kern {
#ifdef CONFIG_CGROUP_PERF
+#define PERF_CGROUP_ID_MASK 0xffffffff
+#define cgrp_id_low(id) (id & PERF_CGROUP_ID_MASK)
+
/*
* perf_cgroup_info keeps track of time_enabled for a cgroup.
* This is a per-cpu dynamically allocated data structure.
@@ -837,6 +841,8 @@ struct perf_cgroup_info {
struct perf_cgroup {
struct cgroup_subsys_state css;
struct perf_cgroup_info __percpu *info;
+ /* perf cgroup ID = (CPU ID << 32) | css subsys-unique ID */
+ u64 __percpu *cgrp_id;
};
/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 782fd86..5ecc048 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -901,6 +901,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
struct cgroup_subsys_state *css;
struct fd f = fdget(fd);
int ret = 0;
+ u64 cgrp_id;
if (!f.file)
return -EBADF;
@@ -915,6 +916,10 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
cgrp = container_of(css, struct perf_cgroup, css);
event->cgrp = cgrp;
+ cgrp_id = ((u64)smp_processor_id() << 32) | css->id;
+ event->cgrp_id = cgrp_id;
+ *per_cpu_ptr(cgrp->cgrp_id, event->cpu) = cgrp_id;
+
/*
* all events in a group must monitor
* the same cgroup because a task belongs
@@ -1494,6 +1499,9 @@ static void init_event_group(struct perf_event *event)
{
RB_CLEAR_NODE(&event->group_node);
event->group_index = 0;
+#ifdef CONFIG_CGROUP_PERF
+ event->cgrp_id = 0;
+#endif
}
/*
@@ -1521,8 +1529,8 @@ static void perf_event_groups_init(struct perf_event_groups *groups)
/*
* Compare function for event groups;
*
- * Implements complex key that first sorts by CPU and then by virtual index
- * which provides ordering when rotating groups for the same CPU.
+ * Implements complex key that first sorts by CPU and cgroup ID, then by
+ * virtual index which provides ordering when rotating groups for the same CPU.
*/
static bool
perf_event_groups_less(struct perf_event *left, struct perf_event *right)
@@ -1532,6 +1540,13 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right)
if (left->cpu > right->cpu)
return false;
+#ifdef CONFIG_CGROUP_PERF
+ if (cgrp_id_low(left->cgrp_id) < cgrp_id_low(right->cgrp_id))
+ return true;
+ if (cgrp_id_low(left->cgrp_id) > cgrp_id_low(right->cgrp_id))
+ return false;
+#endif
+
if (left->group_index < right->group_index)
return true;
if (left->group_index > right->group_index)
@@ -1541,7 +1556,8 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right)
}
/*
- * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
+ * Insert @event into @groups' tree;
+ * Using {@event->cpu, @event->cgrp_id, ++@groups->index} for
* key (see perf_event_groups_less). This places it last inside the CPU
* subtree.
*/
@@ -1650,6 +1666,50 @@ perf_event_groups_next(struct perf_event *event)
return NULL;
}
+#ifdef CONFIG_CGROUP_PERF
+
+static struct perf_event *
+perf_event_groups_first_cgroup(struct perf_event_groups *groups,
+ int cpu, u64 cgrp_id)
+{
+ struct perf_event *node_event = NULL, *match = NULL;
+ struct rb_node *node = groups->tree.rb_node;
+
+ while (node) {
+ node_event = container_of(node, struct perf_event, group_node);
+
+ if (cpu < node_event->cpu) {
+ node = node->rb_left;
+ } else if (cpu > node_event->cpu) {
+ node = node->rb_right;
+ } else {
+ if (cgrp_id_low(cgrp_id) < cgrp_id_low(node_event->cgrp_id))
+ node = node->rb_left;
+ else if (cgrp_id_low(cgrp_id) > cgrp_id_low(node_event->cgrp_id))
+ node = node->rb_right;
+ else {
+ match = node_event;
+ node = node->rb_left;
+ }
+ }
+ }
+ return match;
+}
+
+static struct perf_event *
+perf_event_groups_next_cgroup(struct perf_event *event)
+{
+ struct perf_event *next;
+
+ next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
+ if (next && (next->cpu == event->cpu) && (next->cgrp_id == event->cgrp_id))
+ return next;
+
+ return NULL;
+}
+
+#endif
+
/*
* Iterate through the whole groups tree.
*/
@@ -12127,18 +12187,28 @@ perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_PTR(-ENOMEM);
jc->info = alloc_percpu(struct perf_cgroup_info);
- if (!jc->info) {
- kfree(jc);
- return ERR_PTR(-ENOMEM);
- }
+ if (!jc->info)
+ goto free_jc;
+
+ jc->cgrp_id = alloc_percpu(u64);
+ if (!jc->cgrp_id)
+ goto free_jc_info;
return &jc->css;
+
+free_jc_info:
+ free_percpu(jc->info);
+free_jc:
+ kfree(jc);
+
+ return ERR_PTR(-ENOMEM);
}
static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
+ free_percpu(jc->cgrp_id);
free_percpu(jc->info);
kfree(jc);
}
--
2.7.4
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH 3/4] perf cgroup: Add cgroup ID as a key of RB tree
2019-04-29 14:44 ` [PATCH 3/4] perf cgroup: Add cgroup ID as a key of RB tree kan.liang
@ 2019-04-29 23:02 ` Ian Rogers
2019-04-30 9:08 ` Peter Zijlstra
2019-04-30 9:03 ` Peter Zijlstra
2019-04-30 9:03 ` Peter Zijlstra
2 siblings, 1 reply; 18+ messages in thread
From: Ian Rogers @ 2019-04-29 23:02 UTC (permalink / raw)
To: kan.liang; +Cc: peterz, tglx, mingo, linux-kernel, Stephane Eranian, tj, ak
This is very interesting. How does the code handle cgroup hierarchies?
For example, if we have:
cgroup0 is the cgroup root
cgroup1 whose parent is cgroup0
cgroup2 whose parent is cgroup1
we have task0 running in cgroup0, task1 in cgroup1, task2 in cgroup2
and then a perf command line like:
perf stat -e cycles,cycles,cycles -G cgroup0,cgroup1,cgroup2 --no-merge sleep 10
we expected 3 cycles counts:
- for cgroup0 including task2, task1 and task0
- for cgroup1 including task2 and task1
- for cgroup2 just including task2
It looks as though:
+ if (next && (next->cpu == event->cpu) && (next->cgrp_id ==
event->cgrp_id))
will mean that events will only consider cgroups that directly match
the cgroup of the event. Ie we'd get 3 cycles counts of:
- for cgroup0 just task0
- for cgroup1 just task1
- for cgroup2 just task2
Thanks,
Ian
On Mon, Apr 29, 2019 at 7:45 AM <kan.liang@linux.intel.com> wrote:
>
> From: Kan Liang <kan.liang@linux.intel.com>
>
> Current RB tree for pinned/flexible groups doesn't take cgroup into
> account. All events on a given CPU will be fed to
> pinned/flexible_sched_in(), which relies on perf_cgroup_match() to
> filter the events for a specific cgroup. The method has high overhead,
> especially in frequent context switch with several events and cgroups
> involved.
>
> Add unique cgrp_id for each cgroup, which is composed by CPU ID and css
> subsys-unique ID. The low 32bit of cgrp_id (css subsys-unique ID) is
> used as part of complex key of RB tree.
> Events in the same cgroup has the same cgrp_id.
> The cgrp_id is always zero for non-cgroup case. There is no functional
> change for non-cgroup case.
>
> Add perf_event_groups_first_cgroup() and
> perf_event_groups_next_cgroup(), which will be used later to traverse
> the events for a specific cgroup on a given CPU.
>
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
> ---
> include/linux/perf_event.h | 6 ++++
> kernel/events/core.c | 84 ++++++++++++++++++++++++++++++++++++++++++----
> 2 files changed, 83 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 039e2f2..7eff286 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -703,6 +703,7 @@ struct perf_event {
>
> #ifdef CONFIG_CGROUP_PERF
> struct perf_cgroup *cgrp; /* cgroup event is attach to */
> + u64 cgrp_id; /* perf cgroup ID */
> #endif
>
> struct list_head sb_list;
> @@ -825,6 +826,9 @@ struct bpf_perf_event_data_kern {
>
> #ifdef CONFIG_CGROUP_PERF
>
> +#define PERF_CGROUP_ID_MASK 0xffffffff
> +#define cgrp_id_low(id) (id & PERF_CGROUP_ID_MASK)
> +
> /*
> * perf_cgroup_info keeps track of time_enabled for a cgroup.
> * This is a per-cpu dynamically allocated data structure.
> @@ -837,6 +841,8 @@ struct perf_cgroup_info {
> struct perf_cgroup {
> struct cgroup_subsys_state css;
> struct perf_cgroup_info __percpu *info;
> + /* perf cgroup ID = (CPU ID << 32) | css subsys-unique ID */
> + u64 __percpu *cgrp_id;
> };
>
> /*
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 782fd86..5ecc048 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -901,6 +901,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
> struct cgroup_subsys_state *css;
> struct fd f = fdget(fd);
> int ret = 0;
> + u64 cgrp_id;
>
> if (!f.file)
> return -EBADF;
> @@ -915,6 +916,10 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
> cgrp = container_of(css, struct perf_cgroup, css);
> event->cgrp = cgrp;
>
> + cgrp_id = ((u64)smp_processor_id() << 32) | css->id;
> + event->cgrp_id = cgrp_id;
> + *per_cpu_ptr(cgrp->cgrp_id, event->cpu) = cgrp_id;
> +
> /*
> * all events in a group must monitor
> * the same cgroup because a task belongs
> @@ -1494,6 +1499,9 @@ static void init_event_group(struct perf_event *event)
> {
> RB_CLEAR_NODE(&event->group_node);
> event->group_index = 0;
> +#ifdef CONFIG_CGROUP_PERF
> + event->cgrp_id = 0;
> +#endif
> }
>
> /*
> @@ -1521,8 +1529,8 @@ static void perf_event_groups_init(struct perf_event_groups *groups)
> /*
> * Compare function for event groups;
> *
> - * Implements complex key that first sorts by CPU and then by virtual index
> - * which provides ordering when rotating groups for the same CPU.
> + * Implements complex key that first sorts by CPU and cgroup ID, then by
> + * virtual index which provides ordering when rotating groups for the same CPU.
> */
> static bool
> perf_event_groups_less(struct perf_event *left, struct perf_event *right)
> @@ -1532,6 +1540,13 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right)
> if (left->cpu > right->cpu)
> return false;
>
> +#ifdef CONFIG_CGROUP_PERF
> + if (cgrp_id_low(left->cgrp_id) < cgrp_id_low(right->cgrp_id))
> + return true;
> + if (cgrp_id_low(left->cgrp_id) > cgrp_id_low(right->cgrp_id))
> + return false;
> +#endif
> +
> if (left->group_index < right->group_index)
> return true;
> if (left->group_index > right->group_index)
> @@ -1541,7 +1556,8 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right)
> }
>
> /*
> - * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
> + * Insert @event into @groups' tree;
> + * Using {@event->cpu, @event->cgrp_id, ++@groups->index} for
> * key (see perf_event_groups_less). This places it last inside the CPU
> * subtree.
> */
> @@ -1650,6 +1666,50 @@ perf_event_groups_next(struct perf_event *event)
> return NULL;
> }
>
> +#ifdef CONFIG_CGROUP_PERF
> +
> +static struct perf_event *
> +perf_event_groups_first_cgroup(struct perf_event_groups *groups,
> + int cpu, u64 cgrp_id)
> +{
> + struct perf_event *node_event = NULL, *match = NULL;
> + struct rb_node *node = groups->tree.rb_node;
> +
> + while (node) {
> + node_event = container_of(node, struct perf_event, group_node);
> +
> + if (cpu < node_event->cpu) {
> + node = node->rb_left;
> + } else if (cpu > node_event->cpu) {
> + node = node->rb_right;
> + } else {
> + if (cgrp_id_low(cgrp_id) < cgrp_id_low(node_event->cgrp_id))
> + node = node->rb_left;
> + else if (cgrp_id_low(cgrp_id) > cgrp_id_low(node_event->cgrp_id))
> + node = node->rb_right;
> + else {
> + match = node_event;
> + node = node->rb_left;
> + }
> + }
> + }
> + return match;
> +}
> +
> +static struct perf_event *
> +perf_event_groups_next_cgroup(struct perf_event *event)
> +{
> + struct perf_event *next;
> +
> + next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
> + if (next && (next->cpu == event->cpu) && (next->cgrp_id == event->cgrp_id))
> + return next;
> +
> + return NULL;
> +}
> +
> +#endif
> +
> /*
> * Iterate through the whole groups tree.
> */
> @@ -12127,18 +12187,28 @@ perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
> return ERR_PTR(-ENOMEM);
>
> jc->info = alloc_percpu(struct perf_cgroup_info);
> - if (!jc->info) {
> - kfree(jc);
> - return ERR_PTR(-ENOMEM);
> - }
> + if (!jc->info)
> + goto free_jc;
> +
> + jc->cgrp_id = alloc_percpu(u64);
> + if (!jc->cgrp_id)
> + goto free_jc_info;
>
> return &jc->css;
> +
> +free_jc_info:
> + free_percpu(jc->info);
> +free_jc:
> + kfree(jc);
> +
> + return ERR_PTR(-ENOMEM);
> }
>
> static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
> {
> struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
>
> + free_percpu(jc->cgrp_id);
> free_percpu(jc->info);
> kfree(jc);
> }
> --
> 2.7.4
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 3/4] perf cgroup: Add cgroup ID as a key of RB tree
2019-04-29 23:02 ` Ian Rogers
@ 2019-04-30 9:08 ` Peter Zijlstra
2019-04-30 15:46 ` Liang, Kan
0 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2019-04-30 9:08 UTC (permalink / raw)
To: Ian Rogers; +Cc: kan.liang, tglx, mingo, linux-kernel, Stephane Eranian, tj, ak
A: Because it messes up the order in which people normally read text.
Q: Why is top-posting such a bad thing?
A: Top-posting.
Q: What is the most annoying thing in e-mail?
On Mon, Apr 29, 2019 at 04:02:33PM -0700, Ian Rogers wrote:
> This is very interesting. How does the code handle cgroup hierarchies?
> For example, if we have:
>
> cgroup0 is the cgroup root
> cgroup1 whose parent is cgroup0
> cgroup2 whose parent is cgroup1
>
> we have task0 running in cgroup0, task1 in cgroup1, task2 in cgroup2
> and then a perf command line like:
> perf stat -e cycles,cycles,cycles -G cgroup0,cgroup1,cgroup2 --no-merge sleep 10
>
> we expected 3 cycles counts:
> - for cgroup0 including task2, task1 and task0
> - for cgroup1 including task2 and task1
> - for cgroup2 just including task2
>
> It looks as though:
> + if (next && (next->cpu == event->cpu) && (next->cgrp_id ==
> event->cgrp_id))
>
> will mean that events will only consider cgroups that directly match
> the cgroup of the event. Ie we'd get 3 cycles counts of:
> - for cgroup0 just task0
> - for cgroup1 just task1
> - for cgroup2 just task2
Yeah, I think you're right; the proposed code doesn't capture the
hierarchy thing at all.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 3/4] perf cgroup: Add cgroup ID as a key of RB tree
2019-04-30 9:08 ` Peter Zijlstra
@ 2019-04-30 15:46 ` Liang, Kan
0 siblings, 0 replies; 18+ messages in thread
From: Liang, Kan @ 2019-04-30 15:46 UTC (permalink / raw)
To: Peter Zijlstra, Ian Rogers
Cc: tglx, mingo, linux-kernel, Stephane Eranian, tj, ak
On 4/30/2019 5:08 AM, Peter Zijlstra wrote:
> On Mon, Apr 29, 2019 at 04:02:33PM -0700, Ian Rogers wrote:
>> This is very interesting. How does the code handle cgroup hierarchies?
>> For example, if we have:
>>
>> cgroup0 is the cgroup root
>> cgroup1 whose parent is cgroup0
>> cgroup2 whose parent is cgroup1
>>
>> we have task0 running in cgroup0, task1 in cgroup1, task2 in cgroup2
>> and then a perf command line like:
>> perf stat -e cycles,cycles,cycles -G cgroup0,cgroup1,cgroup2 --no-merge sleep 10
>>
>> we expected 3 cycles counts:
>> - for cgroup0 including task2, task1 and task0
>> - for cgroup1 including task2 and task1
>> - for cgroup2 just including task2
>>
>> It looks as though:
>> + if (next && (next->cpu == event->cpu) && (next->cgrp_id ==
>> event->cgrp_id))
>>
>> will mean that events will only consider cgroups that directly match
>> the cgroup of the event. Ie we'd get 3 cycles counts of:
>> - for cgroup0 just task0
>> - for cgroup1 just task1
>> - for cgroup2 just task2
> Yeah, I think you're right; the proposed code doesn't capture the
> hierarchy thing at all.
The hierarchy is handled in the next patch as below.
But I originally thought we only needed to handle a direct match. So it
returns immediately once a match is found.
I believe we can fix it by simply removing the "return 0".
> +static int cgroup_visit_groups_merge(struct perf_event_groups *groups, int cpu,
> + int (*func)(struct perf_event *, void *, int (*)(struct perf_event *)),
> + void *data)
> +{
> + struct sched_in_data *sid = data;
> + struct cgroup_subsys_state *css;
> + struct perf_cgroup *cgrp;
> + struct perf_event *evt;
> + u64 cgrp_id;
> +
> + for (css = &sid->cpuctx->cgrp->css; css; css = css->parent) {
> + /* root cgroup doesn't have events */
> + if (css->id == 1)
> + return 0;
> +
> + cgrp = container_of(css, struct perf_cgroup, css);
> + cgrp_id = *this_cpu_ptr(cgrp->cgrp_id);
> + /* Only visit groups when the cgroup has events */
> + if (cgrp_id) {
> + evt = perf_event_groups_first_cgroup(groups, cpu, cgrp_id);
> + while (evt) {
> + if (func(evt, (void *)sid, pmu_filter_match))
> + break;
> + evt = perf_event_groups_next_cgroup(evt);
> + }
> + return 0; <--- need to remove for hierarchies
> + }
> + }
> +
> + return 0;
> +}
Thanks,
Kan
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 3/4] perf cgroup: Add cgroup ID as a key of RB tree
2019-04-29 14:44 ` [PATCH 3/4] perf cgroup: Add cgroup ID as a key of RB tree kan.liang
2019-04-29 23:02 ` Ian Rogers
@ 2019-04-30 9:03 ` Peter Zijlstra
2019-04-30 15:46 ` Liang, Kan
2019-04-30 9:03 ` Peter Zijlstra
2 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2019-04-30 9:03 UTC (permalink / raw)
To: kan.liang; +Cc: tglx, mingo, linux-kernel, eranian, tj, ak
On Mon, Apr 29, 2019 at 07:44:04AM -0700, kan.liang@linux.intel.com wrote:
> Add unique cgrp_id for each cgroup, which is composed by CPU ID and css
> subsys-unique ID.
*WHY* ?! that doesn't make any kind of sense.. In fact you mostly then
use the low word because most everything is already per CPU.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 3/4] perf cgroup: Add cgroup ID as a key of RB tree
2019-04-30 9:03 ` Peter Zijlstra
@ 2019-04-30 15:46 ` Liang, Kan
0 siblings, 0 replies; 18+ messages in thread
From: Liang, Kan @ 2019-04-30 15:46 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: tglx, mingo, linux-kernel, eranian, tj, ak
On 4/30/2019 5:03 AM, Peter Zijlstra wrote:
> On Mon, Apr 29, 2019 at 07:44:04AM -0700, kan.liang@linux.intel.com wrote:
>
>> Add unique cgrp_id for each cgroup, which is composed by CPU ID and css
>> subsys-unique ID.
>
> *WHY* ?! that doesn't make any kind of sense.. In fact you mostly then
> use the low word because most everything is already per CPU.
>
I tried to assign a unique ID for each cgroup event.
But you are right, it looks like it is not necessary.
I will remove it.
Thanks,
Kan
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 3/4] perf cgroup: Add cgroup ID as a key of RB tree
2019-04-29 14:44 ` [PATCH 3/4] perf cgroup: Add cgroup ID as a key of RB tree kan.liang
2019-04-29 23:02 ` Ian Rogers
2019-04-30 9:03 ` Peter Zijlstra
@ 2019-04-30 9:03 ` Peter Zijlstra
2 siblings, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2019-04-30 9:03 UTC (permalink / raw)
To: kan.liang; +Cc: tglx, mingo, linux-kernel, eranian, tj, ak
On Mon, Apr 29, 2019 at 07:44:04AM -0700, kan.liang@linux.intel.com wrote:
> +static struct perf_event *
> +perf_event_groups_first_cgroup(struct perf_event_groups *groups,
> + int cpu, u64 cgrp_id)
> +{
> + struct perf_event *node_event = NULL, *match = NULL;
> + struct rb_node *node = groups->tree.rb_node;
> +
> + while (node) {
> + node_event = container_of(node, struct perf_event, group_node);
> +
> + if (cpu < node_event->cpu) {
> + node = node->rb_left;
> + } else if (cpu > node_event->cpu) {
> + node = node->rb_right;
> + } else {
> + if (cgrp_id_low(cgrp_id) < cgrp_id_low(node_event->cgrp_id))
> + node = node->rb_left;
> + else if (cgrp_id_low(cgrp_id) > cgrp_id_low(node_event->cgrp_id))
> + node = node->rb_right;
> + else {
> + match = node_event;
> + node = node->rb_left;
> + }
> + }
> + }
> + return match;
> +}
That's whitespace challenged.
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH 4/4] perf cgroup: Add fast path for cgroup switch
2019-04-29 14:44 [PATCH 0/4] Optimize cgroup context switch kan.liang
` (2 preceding siblings ...)
2019-04-29 14:44 ` [PATCH 3/4] perf cgroup: Add cgroup ID as a key of RB tree kan.liang
@ 2019-04-29 14:44 ` kan.liang
3 siblings, 0 replies; 18+ messages in thread
From: kan.liang @ 2019-04-29 14:44 UTC (permalink / raw)
To: peterz, tglx, mingo, linux-kernel; +Cc: eranian, tj, ak, Kan Liang
From: Kan Liang <kan.liang@linux.intel.com>
Generic visit_groups_merge() is used in cgroup context switch to sched
in cgroup events, which has high overhead especially in frequent context
switch with several events and cgroups involved. Because it feeds all
events on a given CPU to pinned/flexible_sched_in() regardless the
cgroup.
Add a fast path to only feed the specific cgroup events to
pinned/flexible_sched_in() in cgroup context switch.
Don't need event_filter_match() to filter cgroup and CPU in fast path.
Only pmu_filter_match() is enough.
Don't need to specially handle system-wide event anymore.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
kernel/events/core.c | 66 ++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 48 insertions(+), 18 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5ecc048..16bb705 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3310,6 +3310,47 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
}
+struct sched_in_data {
+ struct perf_event_context *ctx;
+ struct perf_cpu_context *cpuctx;
+ int can_add_hw;
+};
+
+#ifdef CONFIG_CGROUP_PERF
+
+static int cgroup_visit_groups_merge(struct perf_event_groups *groups, int cpu,
+ int (*func)(struct perf_event *, void *, int (*)(struct perf_event *)),
+ void *data)
+{
+ struct sched_in_data *sid = data;
+ struct cgroup_subsys_state *css;
+ struct perf_cgroup *cgrp;
+ struct perf_event *evt;
+ u64 cgrp_id;
+
+ for (css = &sid->cpuctx->cgrp->css; css; css = css->parent) {
+ /* root cgroup doesn't have events */
+ if (css->id == 1)
+ return 0;
+
+ cgrp = container_of(css, struct perf_cgroup, css);
+ cgrp_id = *this_cpu_ptr(cgrp->cgrp_id);
+ /* Only visit groups when the cgroup has events */
+ if (cgrp_id) {
+ evt = perf_event_groups_first_cgroup(groups, cpu, cgrp_id);
+ while (evt) {
+ if (func(evt, (void *)sid, pmu_filter_match))
+ break;
+ evt = perf_event_groups_next_cgroup(evt);
+ }
+ return 0;
+ }
+ }
+
+ return 0;
+}
+#endif
+
static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
int (*func)(struct perf_event *, void *, int (*)(struct perf_event *)),
void *data)
@@ -3317,6 +3358,13 @@ static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
struct perf_event **evt, *evt1, *evt2;
int ret;
+#ifdef CONFIG_CGROUP_PERF
+ struct sched_in_data *sid = data;
+
+ /* fast path for cgroup switch */
+ if (sid->cpuctx->cgrp_switch)
+ return cgroup_visit_groups_merge(groups, cpu, func, data);
+#endif
evt1 = perf_event_groups_first(groups, -1);
evt2 = perf_event_groups_first(groups, cpu);
@@ -3342,12 +3390,6 @@ static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
return 0;
}
-struct sched_in_data {
- struct perf_event_context *ctx;
- struct perf_cpu_context *cpuctx;
- int can_add_hw;
-};
-
static int pinned_sched_in(struct perf_event *event, void *data,
int (*filter_match)(struct perf_event *))
{
@@ -3356,12 +3398,6 @@ static int pinned_sched_in(struct perf_event *event, void *data,
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
-#ifdef CONFIG_CGROUP_PERF
- /* Don't sched system-wide event when cgroup context switch */
- if (sid->cpuctx->cgrp_switch && !event->cgrp)
- return 0;
-#endif
-
if (!filter_match(event))
return 0;
@@ -3388,12 +3424,6 @@ static int flexible_sched_in(struct perf_event *event, void *data,
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
-#ifdef CONFIG_CGROUP_PERF
- /* Don't sched system-wide event when cgroup context switch */
- if (sid->cpuctx->cgrp_switch && !event->cgrp)
- return 0;
-#endif
-
if (!filter_match(event))
return 0;
--
2.7.4
^ permalink raw reply related [flat|nested] 18+ messages in thread