* [PATCH 0/7] Optimize cgroup context switch
@ 2019-07-02  6:59 Ian Rogers
  2019-07-02  6:59 ` [PATCH 1/7] perf: propagate perf_install_in_context errors up Ian Rogers
                   ` (7 more replies)
  0 siblings, 8 replies; 29+ messages in thread
From: Ian Rogers @ 2019-07-02  6:59 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, linux-kernel
  Cc: Kan Liang, Stephane Eranian, Ian Rogers

Organize per-CPU perf event groups by cgroup and then by group/insertion
index. To support cgroup hierarchies, a set of iterators is needed in
visit_groups_merge. To avoid a fixed bound on the number of iterators, use
a per-CPU allocated buffer. To make selecting the next event from the set
of iterators fast, use a min-heap ordered by the group index.

These patches include a caching algorithm by Kan Liang
<kan.liang@linux.intel.com> that avoids a search for the first event in a
group, and the set of patches as a whole has benefited from conversations
with him.
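
For illustration, the ordering the series moves to can be sketched as
follows (a simplified, userspace-style sketch only; struct ev and its
fields are stand-ins for struct perf_event rather than the kernel types,
and a cgrp_id of 0 is used here to mean "no cgroup"):

#include <stdbool.h>
#include <stdint.h>

/* Simplified stand-in for struct perf_event, for illustration only. */
struct ev {
	int		cpu;
	int		cgrp_id;	/* 0: event has no cgroup */
	uint64_t	group_index;	/* insertion order */
};

/* Sort key after this series: cpu, then cgroup id, then group_index. */
static bool ev_less(const struct ev *l, const struct ev *r)
{
	if (l->cpu != r->cpu)
		return l->cpu < r->cpu;
	if (l->cgrp_id != r->cgrp_id)
		return l->cgrp_id < r->cgrp_id;
	return l->group_index < r->group_index;
}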

Ian Rogers (7):
  perf: propagate perf_install_in_context errors up
  perf/cgroup: order events in RB tree by cgroup id
  perf: order iterators for visit_groups_merge into a min-heap
  perf: avoid a bounded set of visit_groups_merge iterators
  perf: cache perf_event_groups_first for cgroups
  perf: avoid double checking CPU and cgroup
  perf: rename visit_groups_merge to ctx_groups_sched_in

 include/linux/perf_event.h |   8 +
 kernel/events/core.c       | 511 +++++++++++++++++++++++++++++--------
 2 files changed, 414 insertions(+), 105 deletions(-)

-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 1/7] perf: propagate perf_install_in_context errors up
  2019-07-02  6:59 [PATCH 0/7] Optimize cgroup context switch Ian Rogers
@ 2019-07-02  6:59 ` Ian Rogers
  2019-07-05 14:13   ` Jiri Olsa
  2019-07-02  6:59 ` [PATCH 2/7] perf/cgroup: order events in RB tree by cgroup id Ian Rogers
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 29+ messages in thread
From: Ian Rogers @ 2019-07-02  6:59 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, linux-kernel
  Cc: Kan Liang, Stephane Eranian, Ian Rogers

The current __perf_install_in_context can fail and the error is ignored.
Changing __perf_install_in_context can add new failure modes that need
their errors propagated up to callers. This change prepares for that.

Signed-off-by: Ian Rogers <irogers@google.com>
---
 kernel/events/core.c | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 785d708f8553..4faa90f5a934 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2558,11 +2558,12 @@ static int  __perf_install_in_context(void *info)
  *
  * Very similar to event_function_call, see comment there.
  */
-static void
+static int
 perf_install_in_context(struct perf_event_context *ctx,
 			struct perf_event *event,
 			int cpu)
 {
+	int err;
 	struct task_struct *task = READ_ONCE(ctx->task);
 
 	lockdep_assert_held(&ctx->mutex);
@@ -2577,15 +2578,15 @@ perf_install_in_context(struct perf_event_context *ctx,
 	smp_store_release(&event->ctx, ctx);
 
 	if (!task) {
-		cpu_function_call(cpu, __perf_install_in_context, event);
-		return;
+		err = cpu_function_call(cpu, __perf_install_in_context, event);
+		return err;
 	}
 
 	/*
 	 * Should not happen, we validate the ctx is still alive before calling.
 	 */
 	if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
-		return;
+		return 0;
 
 	/*
 	 * Installing events is tricky because we cannot rely on ctx->is_active
@@ -2619,8 +2620,9 @@ perf_install_in_context(struct perf_event_context *ctx,
 	 */
 	smp_mb();
 again:
-	if (!task_function_call(task, __perf_install_in_context, event))
-		return;
+	err = task_function_call(task, __perf_install_in_context, event);
+	if (err)
+		return err;
 
 	raw_spin_lock_irq(&ctx->lock);
 	task = ctx->task;
@@ -2631,7 +2633,7 @@ perf_install_in_context(struct perf_event_context *ctx,
 		 * against perf_event_exit_task_context().
 		 */
 		raw_spin_unlock_irq(&ctx->lock);
-		return;
+		return 0;
 	}
 	/*
 	 * If the task is not running, ctx->lock will avoid it becoming so,
@@ -2643,6 +2645,7 @@ perf_install_in_context(struct perf_event_context *ctx,
 	}
 	add_event_to_ctx(event, ctx);
 	raw_spin_unlock_irq(&ctx->lock);
+	return 0;
 }
 
 /*
@@ -11103,7 +11106,9 @@ SYSCALL_DEFINE5(perf_event_open,
 		 */
 		for_each_sibling_event(sibling, group_leader) {
 			perf_event__state_init(sibling);
-			perf_install_in_context(ctx, sibling, sibling->cpu);
+			err = perf_install_in_context(ctx, sibling,
+						      sibling->cpu);
+			WARN_ON_ONCE(err);
 			get_ctx(ctx);
 		}
 
@@ -11113,7 +11118,9 @@ SYSCALL_DEFINE5(perf_event_open,
 		 * startup state, ready to be add into new context.
 		 */
 		perf_event__state_init(group_leader);
-		perf_install_in_context(ctx, group_leader, group_leader->cpu);
+		err = perf_install_in_context(ctx, group_leader,
+					      group_leader->cpu);
+		WARN_ON_ONCE(err);
 		get_ctx(ctx);
 	}
 
@@ -11128,7 +11135,8 @@ SYSCALL_DEFINE5(perf_event_open,
 
 	event->owner = current;
 
-	perf_install_in_context(ctx, event, event->cpu);
+	err = perf_install_in_context(ctx, event, event->cpu);
+	WARN_ON_ONCE(err);
 	perf_unpin_context(ctx);
 
 	if (move_group)
@@ -11247,7 +11255,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 		goto err_unlock;
 	}
 
-	perf_install_in_context(ctx, event, cpu);
+	err = perf_install_in_context(ctx, event, cpu);
+	WARN_ON_ONCE(err);
 	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
 
@@ -11270,6 +11279,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
 	struct perf_event_context *dst_ctx;
 	struct perf_event *event, *tmp;
 	LIST_HEAD(events);
+	int err;
 
 	src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
 	dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
@@ -11308,7 +11318,8 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
 		if (event->state >= PERF_EVENT_STATE_OFF)
 			event->state = PERF_EVENT_STATE_INACTIVE;
 		account_event_cpu(event, dst_cpu);
-		perf_install_in_context(dst_ctx, event, dst_cpu);
+		err = perf_install_in_context(dst_ctx, event, dst_cpu);
+		WARN_ON_ONCE(err);
 		get_ctx(dst_ctx);
 	}
 
@@ -11321,7 +11332,8 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
 		if (event->state >= PERF_EVENT_STATE_OFF)
 			event->state = PERF_EVENT_STATE_INACTIVE;
 		account_event_cpu(event, dst_cpu);
-		perf_install_in_context(dst_ctx, event, dst_cpu);
+		err = perf_install_in_context(dst_ctx, event, dst_cpu);
+		WARN_ON_ONCE(err);
 		get_ctx(dst_ctx);
 	}
 	mutex_unlock(&dst_ctx->mutex);
-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 2/7] perf/cgroup: order events in RB tree by cgroup id
  2019-07-02  6:59 [PATCH 0/7] Optimize cgroup context switch Ian Rogers
  2019-07-02  6:59 ` [PATCH 1/7] perf: propagate perf_install_in_context errors up Ian Rogers
@ 2019-07-02  6:59 ` Ian Rogers
  2019-07-08 15:38   ` Peter Zijlstra
                     ` (3 more replies)
  2019-07-02  6:59 ` [PATCH 3/7] perf: order iterators for visit_groups_merge into a min-heap Ian Rogers
                   ` (5 subsequent siblings)
  7 siblings, 4 replies; 29+ messages in thread
From: Ian Rogers @ 2019-07-02  6:59 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, linux-kernel
  Cc: Kan Liang, Stephane Eranian, Ian Rogers

If one is monitoring 6 events on 20 cgroups, the per-CPU RB tree will
hold 120 events. Scheduling in currently iterates over all of these
events to see which ones match the task's cgroup or its cgroup
hierarchy. If a task is in 1 cgroup with 6 events, then 114 events are
considered unnecessarily.

This change orders events in the RB tree by cgroup id, if one is present.
Scheduling in can then go directly to the events associated with the
task's cgroup. The patch extends the 2 iterators in visit_groups_merge to
a set of iterators, with separate iterators for the task's cgroup and
each parent cgroup. By selecting the lowest group_index event across the
set of iterators at each visit, the insertion-order (group_index)
property is maintained. This also keeps event rotation working correctly:
although events are now grouped by cgroup, rotation still selects the
lowest group_index event to rotate (delete/insert into the tree), and the
set of iterators preserves the group_index visiting order.

Cgroups form a hierarchy, so events may be monitored in a task's cgroup,
its parent's cgroup, that cgroup's parent, and so on, with no bound on
the number of levels. This change limits the depth of cgroups that can
have monitored events to a constant (16). It also performs a linear scan
over the set of iterators when selecting the lowest group_index event.
Later patches improve the data structure to remove the constant limit and
avoid the linear scan.
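
For illustration, the "first event for (cpu, cgroup)" lookup that this
ordering enables can be sketched over a sorted array (a simplified
sketch only; struct ev and its fields are stand-ins rather than the
kernel types, and the real code walks the RB tree in
perf_event_groups_first):

#include <stdint.h>

/* Simplified stand-in for struct perf_event, for illustration only. */
struct ev {
	int cpu;
	int cgrp_id;		/* 0: event has no cgroup */
	uint64_t group_index;
};

/*
 * Find the leftmost element matching (cpu, cgrp_id) in an array sorted
 * by (cpu, cgrp_id, group_index). Returns -1 if there is no match.
 */
static int groups_first(const struct ev *evs, int n, int cpu, int cgrp_id)
{
	int lo = 0, hi = n;

	while (lo < hi) {
		int mid = lo + (hi - lo) / 2;

		if (evs[mid].cpu < cpu ||
		    (evs[mid].cpu == cpu && evs[mid].cgrp_id < cgrp_id))
			lo = mid + 1;	/* mid sorts before the key */
		else
			hi = mid;	/* leftmost match is at or left of mid */
	}
	if (lo < n && evs[lo].cpu == cpu && evs[lo].cgrp_id == cgrp_id)
		return lo;
	return -1;
}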

Signed-off-by: Ian Rogers <irogers@google.com>
---
 kernel/events/core.c | 248 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 185 insertions(+), 63 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4faa90f5a934..9a2ad34184b8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1530,6 +1530,32 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right)
 	if (left->cpu > right->cpu)
 		return false;
 
+#ifdef CONFIG_CGROUP_PERF
+	if (left->cgrp != right->cgrp) {
+		if (!left->cgrp)
+			/*
+			 * Left has no cgroup but right does, no cgroups come
+			 * first.
+			 */
+			return true;
+		if (!right->cgrp)
+			/*
+			 * Right has no cgroup but left does, no cgroups come
+			 * first.
+			 */
+			return false;
+		if (left->cgrp->css.cgroup != right->cgrp->css.cgroup) {
+			if (!left->cgrp->css.cgroup)
+				return true;
+			if (!right->cgrp->css.cgroup)
+				return false;
+			/* Two dissimilar cgroups, order by id. */
+			return left->cgrp->css.cgroup->id <
+					right->cgrp->css.cgroup->id;
+		}
+	}
+#endif
+
 	if (left->group_index < right->group_index)
 		return true;
 	if (left->group_index > right->group_index)
@@ -1609,25 +1635,48 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
 }
 
 /*
- * Get the leftmost event in the @cpu subtree.
+ * Get the leftmost event in the cpu/cgroup subtree.
  */
 static struct perf_event *
-perf_event_groups_first(struct perf_event_groups *groups, int cpu)
+perf_event_groups_first(struct perf_event_groups *groups, int cpu,
+			struct cgroup *cgrp)
 {
 	struct perf_event *node_event = NULL, *match = NULL;
 	struct rb_node *node = groups->tree.rb_node;
+#ifdef CONFIG_CGROUP_PERF
+	int node_cgrp_id, cgrp_id = 0;
+
+	if (cgrp)
+		cgrp_id = cgrp->id;
+#endif
 
 	while (node) {
 		node_event = container_of(node, struct perf_event, group_node);
 
 		if (cpu < node_event->cpu) {
 			node = node->rb_left;
-		} else if (cpu > node_event->cpu) {
+			continue;
+		}
+		if (cpu > node_event->cpu) {
 			node = node->rb_right;
-		} else {
-			match = node_event;
+			continue;
+		}
+#ifdef CONFIG_CGROUP_PERF
+		node_cgrp_id = 0;
+		if (node_event->cgrp && node_event->cgrp->css.cgroup)
+			node_cgrp_id = node_event->cgrp->css.cgroup->id;
+
+		if (cgrp_id < node_cgrp_id) {
 			node = node->rb_left;
+			continue;
 		}
+		if (cgrp_id > node_cgrp_id) {
+			node = node->rb_right;
+			continue;
+		}
+#endif
+		match = node_event;
+		node = node->rb_left;
 	}
 
 	return match;
@@ -1640,12 +1689,26 @@ static struct perf_event *
 perf_event_groups_next(struct perf_event *event)
 {
 	struct perf_event *next;
+#ifdef CONFIG_CGROUP_PERF
+	int curr_cgrp_id = 0;
+	int next_cgrp_id = 0;
+#endif
 
 	next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
-	if (next && next->cpu == event->cpu)
-		return next;
+	if (next == NULL || next->cpu != event->cpu)
+		return NULL;
 
-	return NULL;
+#ifdef CONFIG_CGROUP_PERF
+	if (event->cgrp && event->cgrp->css.cgroup)
+		curr_cgrp_id = event->cgrp->css.cgroup->id;
+
+	if (next->cgrp && next->cgrp->css.cgroup)
+		next_cgrp_id = next->cgrp->css.cgroup->id;
+
+	if (curr_cgrp_id != next_cgrp_id)
+		return NULL;
+#endif
+	return next;
 }
 
 /*
@@ -3255,28 +3318,97 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
 }
 
-static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
-			      int (*func)(struct perf_event *, void *), void *data)
+static int visit_groups_merge(struct perf_event_context *ctx,
+			      struct perf_cpu_context *cpuctx,
+			      struct perf_event_groups *groups,
+			      int (*func)(struct perf_event_context *,
+					  struct perf_cpu_context *,
+					  struct perf_event *,
+					  int *),
+			      int *data)
 {
-	struct perf_event **evt, *evt1, *evt2;
-	int ret;
+#ifndef CONFIG_CGROUP_PERF
+	/*
+	 * Without cgroups, with a task context, iterate over per-CPU and any
+	 * CPU events.
+	 */
+	const int max_itrs = 2;
+#else
+	/*
+	 * The depth of cgroups is limited by MAX_PATH. It is unlikely that this
+	 * many parent-child related cgroups will have perf events
+	 * monitored. Limit the number of cgroup iterators to 16.
+	 */
+	const int max_cgroups_with_events_depth = 16;
+	/*
+	 * With cgroups we either iterate for a task context (per-CPU or any CPU
+	 * events) or for per CPU the global and per cgroup events.
+	 */
+	const int max_itrs = max(2, 1 + max_cgroups_with_events_depth);
+#endif
+	/* The number of iterators in use. */
+	int num_itrs;
+	/*
+	 * A set of iterators, the iterator for the visit is chosen by the
+	 * group_index.
+	 */
+	struct perf_event *itrs[max_itrs];
+	/* A reference to the selected iterator. */
+	struct perf_event **evt;
+	int ret, i, cpu = smp_processor_id();
 
-	evt1 = perf_event_groups_first(groups, -1);
-	evt2 = perf_event_groups_first(groups, cpu);
-
-	while (evt1 || evt2) {
-		if (evt1 && evt2) {
-			if (evt1->group_index < evt2->group_index)
-				evt = &evt1;
-			else
-				evt = &evt2;
-		} else if (evt1) {
-			evt = &evt1;
-		} else {
-			evt = &evt2;
+	itrs[0] = perf_event_groups_first(groups, cpu, NULL);
+
+	if (ctx != &cpuctx->ctx) {
+		/*
+		 * A task context only ever has an iterator for CPU or any CPU
+		 * events.
+		 */
+		itrs[1] = perf_event_groups_first(groups, -1, NULL);
+		num_itrs = 2;
+	} else {
+		/* Per-CPU events are by definition not on any CPU. */
+		num_itrs = 1;
+#ifdef CONFIG_CGROUP_PERF
+		/*
+		 * For itrs[1..MAX] add an iterator for each cgroup parent that
+		 * has events.
+		 */
+		if (cpuctx->cgrp) {
+			struct cgroup_subsys_state *css;
+
+			for (css = &cpuctx->cgrp->css; css; css = css->parent) {
+				itrs[num_itrs] = perf_event_groups_first(groups,
+								   cpu,
+								   css->cgroup);
+				if (itrs[num_itrs]) {
+					num_itrs++;
+					if (num_itrs == max_itrs) {
+						WARN_ONCE(
+				     max_cgroups_with_events_depth,
+				     "Insufficient iterators for cgroup depth");
+						break;
+					}
+				}
+			}
 		}
+#endif
+	}
 
-		ret = func(*evt, data);
+	while (true) {
+		/* Find lowest group_index event. */
+		evt = NULL;
+		for (i = 0; i < num_itrs; ++i) {
+			if (itrs[i] == NULL)
+				continue;
+			if (evt == NULL ||
+			    itrs[i]->group_index < (*evt)->group_index)
+				evt = &itrs[i];
+		}
+		if (evt == NULL)
+			break;
+
+		ret = func(ctx, cpuctx, *evt, data);
 		if (ret)
 			return ret;
 
@@ -3286,25 +3418,20 @@ static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
 	return 0;
 }
 
-struct sched_in_data {
-	struct perf_event_context *ctx;
-	struct perf_cpu_context *cpuctx;
-	int can_add_hw;
-};
-
-static int pinned_sched_in(struct perf_event *event, void *data)
+static int pinned_sched_in(struct perf_event_context *ctx,
+			   struct perf_cpu_context *cpuctx,
+			   struct perf_event *event,
+			   int *unused)
 {
-	struct sched_in_data *sid = data;
-
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
 
 	if (!event_filter_match(event))
 		return 0;
 
-	if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
-		if (!group_sched_in(event, sid->cpuctx, sid->ctx))
-			list_add_tail(&event->active_list, &sid->ctx->pinned_active);
+	if (group_can_go_on(event, cpuctx, 1)) {
+		if (!group_sched_in(event, cpuctx, ctx))
+			list_add_tail(&event->active_list, &ctx->pinned_active);
 	}
 
 	/*
@@ -3317,24 +3444,25 @@ static int pinned_sched_in(struct perf_event *event, void *data)
 	return 0;
 }
 
-static int flexible_sched_in(struct perf_event *event, void *data)
+static int flexible_sched_in(struct perf_event_context *ctx,
+			     struct perf_cpu_context *cpuctx,
+			     struct perf_event *event,
+			     int *can_add_hw)
 {
-	struct sched_in_data *sid = data;
-
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
 
 	if (!event_filter_match(event))
 		return 0;
 
-	if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
-		int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
+	if (group_can_go_on(event, cpuctx, *can_add_hw)) {
+		int ret = group_sched_in(event, cpuctx, ctx);
 		if (ret) {
-			sid->can_add_hw = 0;
-			sid->ctx->rotate_necessary = 1;
+			*can_add_hw = 0;
+			ctx->rotate_necessary = 1;
 			return 0;
 		}
-		list_add_tail(&event->active_list, &sid->ctx->flexible_active);
+		list_add_tail(&event->active_list, &ctx->flexible_active);
 	}
 
 	return 0;
@@ -3344,30 +3472,24 @@ static void
 ctx_pinned_sched_in(struct perf_event_context *ctx,
 		    struct perf_cpu_context *cpuctx)
 {
-	struct sched_in_data sid = {
-		.ctx = ctx,
-		.cpuctx = cpuctx,
-		.can_add_hw = 1,
-	};
-
-	visit_groups_merge(&ctx->pinned_groups,
-			   smp_processor_id(),
-			   pinned_sched_in, &sid);
+	visit_groups_merge(ctx,
+			   cpuctx,
+			   &ctx->pinned_groups,
+			   pinned_sched_in,
+			   NULL);
 }
 
 static void
 ctx_flexible_sched_in(struct perf_event_context *ctx,
 		      struct perf_cpu_context *cpuctx)
 {
-	struct sched_in_data sid = {
-		.ctx = ctx,
-		.cpuctx = cpuctx,
-		.can_add_hw = 1,
-	};
+	int can_add_hw = 1;
 
-	visit_groups_merge(&ctx->flexible_groups,
-			   smp_processor_id(),
-			   flexible_sched_in, &sid);
+	visit_groups_merge(ctx,
+			   cpuctx,
+			   &ctx->flexible_groups,
+			   flexible_sched_in,
+			   &can_add_hw);
 }
 
 static void
-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 3/7] perf: order iterators for visit_groups_merge into a min-heap
  2019-07-02  6:59 [PATCH 0/7] Optimize cgroup context switch Ian Rogers
  2019-07-02  6:59 ` [PATCH 1/7] perf: propagate perf_install_in_context errors up Ian Rogers
  2019-07-02  6:59 ` [PATCH 2/7] perf/cgroup: order events in RB tree by cgroup id Ian Rogers
@ 2019-07-02  6:59 ` Ian Rogers
  2019-07-08 16:30   ` Peter Zijlstra
  2019-07-08 16:36   ` Peter Zijlstra
  2019-07-02  6:59 ` [PATCH 4/7] perf: avoid a bounded set of visit_groups_merge iterators Ian Rogers
                   ` (4 subsequent siblings)
  7 siblings, 2 replies; 29+ messages in thread
From: Ian Rogers @ 2019-07-02  6:59 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, linux-kernel
  Cc: Kan Liang, Stephane Eranian, Ian Rogers

The groups rbtree holding perf events, either for a CPU or a task, needs
multiple iterators that together visit events in group_index (insertion)
order. Rather than linearly searching the iterators for the next event,
use a min-heap, reducing the cost per visited event from an O(#iterators)
search to an O(log2(#iterators)) heap update.
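
For reference, the heap update described above can be sketched as a
plain 0-indexed sift-down over the iterator array (a simplified sketch
only, using a stand-in struct ev with a group_index field rather than
the kernel structures):

#include <stdint.h>

/* Simplified stand-in for struct perf_event, for illustration only. */
struct ev {
	uint64_t group_index;
};

/*
 * Restore the min-heap property (ordered by group_index) after the
 * element at 'pos' has been replaced, e.g. by the iterator's next event.
 */
static void sift_down(struct ev **heap, int num, int pos)
{
	for (;;) {
		int left = 2 * pos + 1;
		int right = left + 1;
		int smallest = pos;
		struct ev *tmp;

		if (left < num &&
		    heap[left]->group_index < heap[smallest]->group_index)
			smallest = left;
		if (right < num &&
		    heap[right]->group_index < heap[smallest]->group_index)
			smallest = right;
		if (smallest == pos)
			break;

		tmp = heap[pos];
		heap[pos] = heap[smallest];
		heap[smallest] = tmp;
		pos = smallest;
	}
}

Replacing the root with the popped iterator's next event and sifting
down again is what gives the O(log2(#iterators)) cost per visited event.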

Signed-off-by: Ian Rogers <irogers@google.com>
---
 kernel/events/core.c | 123 +++++++++++++++++++++++++++++++++----------
 1 file changed, 95 insertions(+), 28 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9a2ad34184b8..396b5ac6dcd4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3318,6 +3318,77 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
 }
 
+/* Data structure used to hold a min-heap, ordered by group_index, of a fixed
+ * maximum size.
+ */
+struct perf_event_heap {
+	struct perf_event **storage;
+	int num_elements;
+	int max_elements;
+};
+
+static void min_heap_swap(struct perf_event_heap *heap,
+			  int pos1, int pos2)
+{
+	struct perf_event *tmp = heap->storage[pos1];
+
+	heap->storage[pos1] = heap->storage[pos2];
+	heap->storage[pos2] = tmp;
+}
+
+/* Sift the perf_event at pos down the heap. */
+static void min_heapify(struct perf_event_heap *heap, int pos)
+{
+	int left_child, right_child;
+
+	while (pos > heap->num_elements / 2) {
+		left_child = pos * 2;
+		right_child = pos * 2 + 1;
+		if (heap->storage[pos]->group_index >
+		    heap->storage[left_child]->group_index) {
+			min_heap_swap(heap, pos, left_child);
+			pos = left_child;
+		} else if (heap->storage[pos]->group_index >
+			   heap->storage[right_child]->group_index) {
+			min_heap_swap(heap, pos, right_child);
+			pos = right_child;
+		} else {
+			break;
+		}
+	}
+}
+
+/* Floyd's approach to heapification that is O(n). */
+static void min_heapify_all(struct perf_event_heap *heap)
+{
+	int i;
+
+	for (i = heap->num_elements / 2; i > 0; i--)
+		min_heapify(heap, i);
+}
+
+/* Remove minimum element from the heap. */
+static void min_heap_pop(struct perf_event_heap *heap)
+{
+	WARN_ONCE(heap->num_elements <= 0, "Popping an empty heap");
+	heap->num_elements--;
+	heap->storage[0] = heap->storage[heap->num_elements];
+	min_heapify(heap, 0);
+}
+
+/* Remove the minimum element and then push the given event. */
+static void min_heap_pop_push(struct perf_event_heap *heap,
+			      struct perf_event *event)
+{
+	WARN_ONCE(heap->num_elements <= 0, "Popping an empty heap");
+	if (event == NULL) {
+		min_heap_pop(heap);
+	} else {
+		heap->storage[0] = event;
+		min_heapify(heap, 0);
+	}
+}
+
 static int visit_groups_merge(struct perf_event_context *ctx,
 			      struct perf_cpu_context *cpuctx,
 			      struct perf_event_groups *groups,
@@ -3346,29 +3417,33 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 	 */
 	const int max_itrs = max(2, 1 + max_cgroups_with_events_depth);
 #endif
-	/* The number of iterators in use. */
-	int num_itrs;
 	/*
 	 * A set of iterators, the iterator for the visit is chosen by the
 	 * group_index.
 	 */
 	struct perf_event *itrs[max_itrs];
-	/* A reference to the selected iterator. */
-	struct perf_event **evt;
-	int ret, i, cpu = smp_processor_id();
+	struct perf_event_heap heap = {
+		.storage = itrs,
+		.num_elements = 0,
+		.max_elements = max_itrs
+	};
+	int ret, cpu = smp_processor_id();
 
-	itrs[0] = perf_event_groups_first(groups, cpu, NULL);
+	heap.storage[0] = perf_event_groups_first(groups, cpu, NULL);
+	if (heap.storage[0])
+		heap.num_elements++;
 
 	if (ctx != &cpuctx->ctx) {
 		/*
 		 * A task context only ever has an iterator for CPU or any CPU
 		 * events.
 		 */
-		itrs[1] = perf_event_groups_first(groups, -1, NULL);
-		num_itrs = 2;
+		heap.storage[heap.num_elements] =
+				perf_event_groups_first(groups, -1, NULL);
+		if (heap.storage[heap.num_elements])
+			heap.num_elements++;
 	} else {
 		/* Per-CPU events are by definition not on any CPU. */
-		num_itrs = 1;
 #ifdef CONFIG_CGROUP_PERF
 		/*
 		 * For itrs[1..MAX] add an iterator for each cgroup parent that
@@ -3378,12 +3453,14 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 			struct cgroup_subsys_state *css;
 
 			for (css = &cpuctx->cgrp->css; css; css = css->parent) {
-				itrs[num_itrs] = perf_event_groups_first(groups,
+				heap.storage[heap.num_elements] =
+						perf_event_groups_first(groups,
 								   cpu,
 								   css->cgroup);
-				if (itrs[num_itrs]) {
-					num_itrs++;
-					if (num_itrs == max_itrs) {
+				if (heap.storage[heap.num_elements]) {
+					heap.num_elements++;
+					if (heap.num_elements ==
+					    heap.max_elements) {
 						WARN_ONCE(
 				     max_cgroups_with_events_depth,
 				     "Insufficient iterators for cgroup depth");
@@ -3394,25 +3471,15 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 		}
 #endif
 	}
+	min_heapify_all(&heap);
 
-	while (true) {
-		/* Find lowest group_index event. */
-		evt = NULL;
-		for (i = 0; i < num_itrs; ++i) {
-			if (itrs[i] == NULL)
-				continue;
-			if (evt == NULL ||
-			    itrs[i]->group_index < (*evt)->group_index)
-				evt = &itrs[i];
-		}
-		if (evt == NULL)
-			break;
-
-		ret = func(ctx, cpuctx, *evt, data);
+	while (heap.num_elements > 0) {
+		ret = func(ctx, cpuctx, heap.storage[0], data);
 		if (ret)
 			return ret;
 
-		*evt = perf_event_groups_next(*evt);
+		min_heap_pop_push(&heap,
+				  perf_event_groups_next(heap.storage[0]));
 	}
 
 	return 0;
-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 4/7] perf: avoid a bounded set of visit_groups_merge iterators
  2019-07-02  6:59 [PATCH 0/7] Optimize cgroup context switch Ian Rogers
                   ` (2 preceding siblings ...)
  2019-07-02  6:59 ` [PATCH 3/7] perf: order iterators for visit_groups_merge into a min-heap Ian Rogers
@ 2019-07-02  6:59 ` Ian Rogers
  2019-07-02  6:59 ` [PATCH 5/7] perf: cache perf_event_groups_first for cgroups Ian Rogers
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 29+ messages in thread
From: Ian Rogers @ 2019-07-02  6:59 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, linux-kernel
  Cc: Kan Liang, Stephane Eranian, Ian Rogers

Create a per-CPU array of iterators that is resized when cgroup events
are added. The size of the array reflects the maximum depth of the
cgroups that have had events installed, although not every cgroup along
those paths will have monitored events. This approach avoids adding
storage cost to struct perf_event.
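
The growth step is essentially the krealloc() pattern below (condensed
from the __perf_install_in_context hunk in this patch; "needed" stands
for perf_event_cgroup_depth(event) + 1, and on allocation failure the
old, smaller buffer is kept):

	if (needed > cpuctx->visit_groups_merge_iterator_storage_size) {
		struct perf_event **storage;

		storage = krealloc(cpuctx->visit_groups_merge_iterator_storage,
				   needed * sizeof(*storage), GFP_KERNEL);
		if (storage) {
			cpuctx->visit_groups_merge_iterator_storage = storage;
			cpuctx->visit_groups_merge_iterator_storage_size = needed;
		} else {
			/* The patch warns and fails the install with -ENOMEM. */
		}
	}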

Signed-off-by: Ian Rogers <irogers@google.com>
---
 include/linux/perf_event.h |  2 +
 kernel/events/core.c       | 94 ++++++++++++++++++++++++++++----------
 2 files changed, 71 insertions(+), 25 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 16e38c286d46..5c479f61622c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -802,6 +802,8 @@ struct perf_cpu_context {
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp;
 	struct list_head		cgrp_cpuctx_entry;
+	struct perf_event		**visit_groups_merge_iterator_storage;
+	int			       visit_groups_merge_iterator_storage_size;
 #endif
 
 	struct list_head		sched_cb_entry;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 396b5ac6dcd4..a2c5ea868de9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1711,6 +1711,20 @@ perf_event_groups_next(struct perf_event *event)
 	return next;
 }
 
+#ifdef CONFIG_CGROUP_PERF
+int perf_event_cgroup_depth(struct perf_event *event)
+{
+	struct cgroup_subsys_state *css;
+	struct perf_cgroup *cgrp = event->cgrp;
+	int depth = 0;
+
+	if (cgrp)
+		for (css = &cgrp->css; css; css = css->parent)
+			depth++;
+	return depth;
+}
+#endif
+
 /*
  * Iterate through the whole groups tree.
  */
@@ -2592,6 +2606,7 @@ static int  __perf_install_in_context(void *info)
 
 #ifdef CONFIG_CGROUP_PERF
 	if (is_cgroup_event(event)) {
+		int max_iterators;
 		/*
 		 * If the current cgroup doesn't match the event's
 		 * cgroup, we should not try to schedule it.
@@ -2599,6 +2614,30 @@ static int  __perf_install_in_context(void *info)
 		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
 		reprogram = cgroup_is_descendant(cgrp->css.cgroup,
 					event->cgrp->css.cgroup);
+
+		/*
+		 * Ensure space for visit_groups_merge iterator storage. With
+		 * cgroup profiling we may have an event at each depth plus
+		 * system wide events.
+		 */
+		max_iterators = perf_event_cgroup_depth(event) + 1;
+		if (max_iterators >
+		    cpuctx->visit_groups_merge_iterator_storage_size) {
+			struct perf_event **storage =
+			   krealloc(cpuctx->visit_groups_merge_iterator_storage,
+				    sizeof(struct perf_event *) * max_iterators,
+				    GFP_KERNEL);
+			if (storage) {
+				cpuctx->visit_groups_merge_iterator_storage
+						= storage;
+				cpuctx->visit_groups_merge_iterator_storage_size
+						= max_iterators;
+			} else {
+				WARN_ONCE(1, "Unable to increase iterator "
+					"storage for perf events with cgroups");
+				ret = -ENOMEM;
+			}
+		}
 	}
 #endif
 
@@ -3389,6 +3428,13 @@ static void min_heap_pop_push(struct perf_event_heap *heap,
 	}
 }
 
+
+/*
+ * Without cgroups, with a task context, there may be per-CPU and any
+ * CPU events.
+ */
+#define MIN_VISIT_GROUP_MERGE_ITERATORS 2
+
 static int visit_groups_merge(struct perf_event_context *ctx,
 			      struct perf_cpu_context *cpuctx,
 			      struct perf_event_groups *groups,
@@ -3398,35 +3444,27 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 					  int *),
 			      int *data)
 {
-#ifndef CONFIG_CGROUP_PERF
-	/*
-	 * Without cgroups, with a task context, iterate over per-CPU and any
-	 * CPU events.
-	 */
-	const int max_itrs = 2;
-#else
-	/*
-	 * The depth of cgroups is limited by MAX_PATH. It is unlikely that this
-	 * many parent-child related cgroups will have perf events
-	 * monitored. Limit the number of cgroup iterators to 16.
-	 */
-	const int max_cgroups_with_events_depth = 16;
-	/*
-	 * With cgroups we either iterate for a task context (per-CPU or any CPU
-	 * events) or for per CPU the global and per cgroup events.
-	 */
-	const int max_itrs = max(2, 1 + max_cgroups_with_events_depth);
-#endif
 	/*
 	 * A set of iterators, the iterator for the visit is chosen by the
 	 * group_index.
 	 */
-	struct perf_event *itrs[max_itrs];
+#ifndef CONFIG_CGROUP_PERF
+	struct perf_event *itrs[MIN_VISIT_GROUP_MERGE_ITERATORS];
 	struct perf_event_heap heap = {
 		.storage = itrs,
 		.num_elements = 0,
-		.max_elements = max_itrs
+		.max_elements = MIN_VISIT_GROUP_MERGE_ITERATORS
 	};
+#else
+	/*
+	 * With cgroups usage space in the CPU context reserved for iterators.
+	 */
+	struct perf_event_heap heap = {
+		.storage = cpuctx->visit_groups_merge_iterator_storage,
+		.num_elements = 0,
+		.max_elements = cpuctx->visit_groups_merge_iterator_storage_size
+	};
+#endif
 	int ret, cpu = smp_processor_id();
 
 	heap.storage[0] = perf_event_groups_first(groups, cpu, NULL);
@@ -3461,9 +3499,8 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 					heap.num_elements++;
 					if (heap.num_elements ==
 					    heap.max_elements) {
-						WARN_ONCE(
-				     max_cgroups_with_events_depth,
-				     "Insufficient iterators for cgroup depth");
+						WARN_ONCE(1,
+						"per-CPU min-heap under sized");
 						break;
 					}
 				}
@@ -10155,7 +10192,14 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
 		cpuctx->ctx.pmu = pmu;
 		cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
-
+#ifdef CONFIG_CGROUP_PERF
+		cpuctx->visit_groups_merge_iterator_storage =
+				kmalloc_array(MIN_VISIT_GROUP_MERGE_ITERATORS,
+					      sizeof(struct perf_event *),
+					      GFP_KERNEL);
+		cpuctx->visit_groups_merge_iterator_storage_size =
+				MIN_VISIT_GROUP_MERGE_ITERATORS;
+#endif
 		__perf_mux_hrtimer_init(cpuctx, cpu);
 	}
 
-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 5/7] perf: cache perf_event_groups_first for cgroups
  2019-07-02  6:59 [PATCH 0/7] Optimize cgroup context switch Ian Rogers
                   ` (3 preceding siblings ...)
  2019-07-02  6:59 ` [PATCH 4/7] perf: avoid a bounded set of visit_groups_merge iterators Ian Rogers
@ 2019-07-02  6:59 ` Ian Rogers
  2019-07-08 16:50   ` Peter Zijlstra
  2019-07-02  6:59 ` [PATCH 6/7] perf: avoid double checking CPU and cgroup Ian Rogers
                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 29+ messages in thread
From: Ian Rogers @ 2019-07-02  6:59 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, linux-kernel
  Cc: Kan Liang, Stephane Eranian, Ian Rogers

Add a per-CPU cache of the pinned and flexible perf_event_groups_first
values for each cgroup, avoiding O(log(#perf events)) searches during
sched_in.

This patch is derived from an original patch by Kan Liang
<kan.liang@linux.intel.com>.
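
For illustration, the cache turns the per-cgroup, per-CPU lookup into a
single pointer read; a hypothetical helper (not part of the patch, which
open-codes this in visit_groups_merge) could look like:

static struct perf_event *
perf_cgroup_groups_first(struct perf_cgroup *cgrp, int cpu, bool pinned)
{
	struct perf_event **slot;

	slot = pinned ? per_cpu_ptr(cgrp->pinned_event, cpu)
		      : per_cpu_ptr(cgrp->flexible_event, cpu);

	/* Kept up to date by perf_event_groups_insert()/_delete(). */
	return *slot;
}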

Signed-off-by: Ian Rogers <irogers@google.com>
---
 include/linux/perf_event.h |   6 ++
 kernel/events/core.c       | 142 ++++++++++++++++++++++++++-----------
 2 files changed, 106 insertions(+), 42 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5c479f61622c..86fb379296cb 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -845,6 +845,12 @@ struct perf_cgroup_info {
 struct perf_cgroup {
 	struct cgroup_subsys_state	css;
 	struct perf_cgroup_info	__percpu *info;
+	/* A cache of the first event with the perf_cpu_context's
+	 * perf_event_context for the first event in pinned_groups or
+	 * flexible_groups. Avoids an rbtree search during sched_in.
+	 */
+	struct perf_event * __percpu    *pinned_event;
+	struct perf_event * __percpu    *flexible_event;
 };
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a2c5ea868de9..7608bd562dac 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1594,6 +1594,25 @@ perf_event_groups_insert(struct perf_event_groups *groups,
 
 	rb_link_node(&event->group_node, parent, node);
 	rb_insert_color(&event->group_node, &groups->tree);
+#ifdef CONFIG_CGROUP_PERF
+	if (is_cgroup_event(event)) {
+		struct perf_event **cgrp_event;
+
+		if (event->attr.pinned)
+			cgrp_event = per_cpu_ptr(event->cgrp->pinned_event,
+						 event->cpu);
+		else
+			cgrp_event = per_cpu_ptr(event->cgrp->flexible_event,
+						 event->cpu);
+		/*
+		 * Cgroup events for the same cgroup on the same CPU will
+		 * always be inserted at the right because of bigger
+		 * @groups->index. Only need to set *cgrp_event when it's NULL.
+		 */
+		if (!*cgrp_event)
+			*cgrp_event = event;
+	}
+#endif
 }
 
 /*
@@ -1608,6 +1627,9 @@ add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
 	perf_event_groups_insert(groups, event);
 }
 
+static struct perf_event *
+perf_event_groups_next(struct perf_event *event);
+
 /*
  * Delete a group from a tree.
  */
@@ -1618,6 +1640,22 @@ perf_event_groups_delete(struct perf_event_groups *groups,
 	WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
 		     RB_EMPTY_ROOT(&groups->tree));
 
+#ifdef CONFIG_CGROUP_PERF
+	if (is_cgroup_event(event)) {
+		struct perf_event **cgrp_event;
+
+		if (event->attr.pinned)
+			cgrp_event = per_cpu_ptr(event->cgrp->pinned_event,
+						 event->cpu);
+		else
+			cgrp_event = per_cpu_ptr(event->cgrp->flexible_event,
+						 event->cpu);
+
+		if (*cgrp_event == event)
+			*cgrp_event = perf_event_groups_next(event);
+	}
+#endif
+
 	rb_erase(&event->group_node, &groups->tree);
 	init_event_group(event);
 }
@@ -1635,20 +1673,14 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
 }
 
 /*
- * Get the leftmost event in the cpu/cgroup subtree.
+ * Get the leftmost event in the cpu subtree without a cgroup (ie task or
+ * system-wide).
  */
 static struct perf_event *
-perf_event_groups_first(struct perf_event_groups *groups, int cpu,
-			struct cgroup *cgrp)
+perf_event_groups_first_no_cgroup(struct perf_event_groups *groups, int cpu)
 {
 	struct perf_event *node_event = NULL, *match = NULL;
 	struct rb_node *node = groups->tree.rb_node;
-#ifdef CONFIG_CGROUP_PERF
-	int node_cgrp_id, cgrp_id = 0;
-
-	if (cgrp)
-		cgrp_id = cgrp->id;
-#endif
 
 	while (node) {
 		node_event = container_of(node, struct perf_event, group_node);
@@ -1662,18 +1694,10 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu,
 			continue;
 		}
 #ifdef CONFIG_CGROUP_PERF
-		node_cgrp_id = 0;
-		if (node_event->cgrp && node_event->cgrp->css.cgroup)
-			node_cgrp_id = node_event->cgrp->css.cgroup->id;
-
-		if (cgrp_id < node_cgrp_id) {
+		if (node_event->cgrp) {
 			node = node->rb_left;
 			continue;
 		}
-		if (cgrp_id > node_cgrp_id) {
-			node = node->rb_right;
-			continue;
-		}
 #endif
 		match = node_event;
 		node = node->rb_left;
@@ -3428,6 +3452,13 @@ static void min_heap_pop_push(struct perf_event_heap *heap,
 	}
 }
 
+static int pinned_sched_in(struct perf_event_context *ctx,
+			   struct perf_cpu_context *cpuctx,
+			   struct perf_event *event);
+static int flexible_sched_in(struct perf_event_context *ctx,
+			     struct perf_cpu_context *cpuctx,
+			     struct perf_event *event,
+			     int *can_add_hw);
 
 /*
  * Without cgroups, with a task context, there may be per-CPU and any
@@ -3437,11 +3468,7 @@ static void min_heap_pop_push(struct perf_event_heap *heap,
 
 static int visit_groups_merge(struct perf_event_context *ctx,
 			      struct perf_cpu_context *cpuctx,
-			      struct perf_event_groups *groups,
-			      int (*func)(struct perf_event_context *,
-					  struct perf_cpu_context *,
-					  struct perf_event *,
-					  int *),
+			      bool is_pinned,
 			      int *data)
 {
 	/*
@@ -3467,7 +3494,11 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 #endif
 	int ret, cpu = smp_processor_id();
 
-	heap.storage[0] = perf_event_groups_first(groups, cpu, NULL);
+	struct perf_event_groups *groups = is_pinned
+					   ? &ctx->pinned_groups
+					   : &ctx->flexible_groups;
+
+	heap.storage[0] = perf_event_groups_first_no_cgroup(groups, cpu);
 	if (heap.storage[0])
 		heap.num_elements++;
 
@@ -3477,7 +3508,7 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 		 * events.
 		 */
 		heap.storage[heap.num_elements] =
-				perf_event_groups_first(groups, -1, NULL);
+				perf_event_groups_first_no_cgroup(groups, -1);
 		if (heap.storage[heap.num_elements])
 			heap.num_elements++;
 	} else {
@@ -3487,14 +3518,22 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 		 * For itrs[1..MAX] add an iterator for each cgroup parent that
 		 * has events.
 		 */
-		if (cpuctx->cgrp) {
+		struct perf_cgroup *cgrp = cpuctx->cgrp;
+
+		if (cgrp) {
 			struct cgroup_subsys_state *css;
 
-			for (css = &cpuctx->cgrp->css; css; css = css->parent) {
-				heap.storage[heap.num_elements] =
-						perf_event_groups_first(groups,
-								   cpu,
-								   css->cgroup);
+			for (css = &cgrp->css; css; css = css->parent) {
+				/* root cgroup doesn't have events */
+				if (css->id == 1)
+					break;
+
+				cgrp = container_of(css, struct perf_cgroup,
+						    css);
+				heap.storage[heap.num_elements] = is_pinned
+					? *per_cpu_ptr(cgrp->pinned_event, cpu)
+					: *per_cpu_ptr(cgrp->flexible_event,
+						       cpu);
 				if (heap.storage[heap.num_elements]) {
 					heap.num_elements++;
 					if (heap.num_elements ==
@@ -3511,7 +3550,12 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 	min_heapify_all(&heap);
 
 	while (heap.num_elements > 0) {
-		ret = func(ctx, cpuctx, heap.storage[0], data);
+		if (is_pinned)
+			ret = pinned_sched_in(ctx, cpuctx, heap.storage[0]);
+		else
+			ret = flexible_sched_in(ctx, cpuctx, heap.storage[0],
+						data);
+
 		if (ret)
 			return ret;
 
@@ -3524,8 +3568,7 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 
 static int pinned_sched_in(struct perf_event_context *ctx,
 			   struct perf_cpu_context *cpuctx,
-			   struct perf_event *event,
-			   int *unused)
+			   struct perf_event *event)
 {
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
@@ -3578,8 +3621,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 {
 	visit_groups_merge(ctx,
 			   cpuctx,
-			   &ctx->pinned_groups,
-			   pinned_sched_in,
+			   /*is_pinned=*/true,
 			   NULL);
 }
 
@@ -3591,8 +3633,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 
 	visit_groups_merge(ctx,
 			   cpuctx,
-			   &ctx->flexible_groups,
-			   flexible_sched_in,
+			   /*is_pinned=*/false,
 			   &can_add_hw);
 }
 
@@ -12374,18 +12415,35 @@ perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		return ERR_PTR(-ENOMEM);
 
 	jc->info = alloc_percpu(struct perf_cgroup_info);
-	if (!jc->info) {
-		kfree(jc);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!jc->info)
+		goto free_jc;
+
+	jc->pinned_event = alloc_percpu(struct perf_event *);
+	if (!jc->pinned_event)
+		goto free_jc_info;
+
+	jc->flexible_event = alloc_percpu(struct perf_event *);
+	if (!jc->flexible_event)
+		goto free_jc_pinned;
 
 	return &jc->css;
+
+free_jc_pinned:
+	free_percpu(jc->pinned_event);
+free_jc_info:
+	free_percpu(jc->info);
+free_jc:
+	kfree(jc);
+
+	return ERR_PTR(-ENOMEM);
 }
 
 static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
 {
 	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
 
+	free_percpu(jc->pinned_event);
+	free_percpu(jc->flexible_event);
 	free_percpu(jc->info);
 	kfree(jc);
 }
-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 6/7] perf: avoid double checking CPU and cgroup
  2019-07-02  6:59 [PATCH 0/7] Optimize cgroup context switch Ian Rogers
                   ` (4 preceding siblings ...)
  2019-07-02  6:59 ` [PATCH 5/7] perf: cache perf_event_groups_first for cgroups Ian Rogers
@ 2019-07-02  6:59 ` Ian Rogers
  2019-07-02  6:59 ` [PATCH 7/7] perf: rename visit_groups_merge to ctx_groups_sched_in Ian Rogers
  2019-07-24 22:37 ` [PATCH v2 0/7] Optimize cgroup context switch Ian Rogers
  7 siblings, 0 replies; 29+ messages in thread
From: Ian Rogers @ 2019-07-02  6:59 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, linux-kernel
  Cc: Kan Liang, Stephane Eranian, Ian Rogers

When ctx_groups_sched_in iterates events, their CPU and cgroup are
already known to match the current CPU and task. Avoid checking this
again in event_filter_match by passing in an additional argument.
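
The resulting convention at the call sites, taken from the shape of this
patch, is roughly:

	/* Generic iteration paths still check everything: */
	if (!event_filter_match(event, /*check_cpu_and_cgroup=*/true))
		continue;

	/* The pinned/flexible sched_in callbacks only reach events whose
	 * CPU and cgroup already match, so they skip those checks: */
	if (!event_filter_match(event, /*check_cpu_and_cgroup=*/false))
		return 0;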

Signed-off-by: Ian Rogers <irogers@google.com>
---
 kernel/events/core.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7608bd562dac..a66477ee196a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2079,10 +2079,12 @@ static inline int pmu_filter_match(struct perf_event *event)
 }
 
 static inline int
-event_filter_match(struct perf_event *event)
+event_filter_match(struct perf_event *event, bool check_cgroup_and_cpu)
 {
-	return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
-	       perf_cgroup_match(event) && pmu_filter_match(event);
+	return (!check_cgroup_and_cpu ||
+		((event->cpu == -1 || event->cpu == smp_processor_id()) &&
+		 perf_cgroup_match(event))) &&
+			pmu_filter_match(event);
 }
 
 static void
@@ -2797,7 +2799,7 @@ static void __perf_event_enable(struct perf_event *event,
 	if (!ctx->is_active)
 		return;
 
-	if (!event_filter_match(event)) {
+	if (!event_filter_match(event, /*check_cpu_and_cgroup=*/true)) {
 		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
 		return;
 	}
@@ -3573,7 +3575,10 @@ static int pinned_sched_in(struct perf_event_context *ctx,
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
 
-	if (!event_filter_match(event))
+	/* The caller already checked the CPU and cgroup before calling
+	 * pinned_sched_in.
+	 */
+	if (!event_filter_match(event, /*check_cpu_and_cgroup=*/false))
 		return 0;
 
 	if (group_can_go_on(event, cpuctx, 1)) {
@@ -3599,7 +3604,10 @@ static int flexible_sched_in(struct perf_event_context *ctx,
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
 
-	if (!event_filter_match(event))
+	/* The caller already checked the CPU and cgroup before calling
+	 * flexible_sched_in.
+	 */
+	if (!event_filter_match(event, /*check_cpu_and_cgroup=*/false))
 		return 0;
 
 	if (group_can_go_on(event, cpuctx, *can_add_hw)) {
@@ -3899,7 +3907,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
 		if (event->state != PERF_EVENT_STATE_ACTIVE)
 			continue;
 
-		if (!event_filter_match(event))
+		if (!event_filter_match(event, /*check_cpu_and_cgroup=*/true))
 			continue;
 
 		perf_pmu_disable(event->pmu);
@@ -6929,7 +6937,8 @@ perf_iterate_ctx(struct perf_event_context *ctx,
 		if (!all) {
 			if (event->state < PERF_EVENT_STATE_INACTIVE)
 				continue;
-			if (!event_filter_match(event))
+			if (!event_filter_match(event,
+						/*check_cpu_and_cgroup=*/true))
 				continue;
 		}
 
@@ -6953,7 +6962,7 @@ static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
 
 		if (event->state < PERF_EVENT_STATE_INACTIVE)
 			continue;
-		if (!event_filter_match(event))
+		if (!event_filter_match(event, /*check_cpu_and_cgroup=*/true))
 			continue;
 		output(event, data);
 	}
-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 7/7] perf: rename visit_groups_merge to ctx_groups_sched_in
  2019-07-02  6:59 [PATCH 0/7] Optimize cgroup context switch Ian Rogers
                   ` (5 preceding siblings ...)
  2019-07-02  6:59 ` [PATCH 6/7] perf: avoid double checking CPU and cgroup Ian Rogers
@ 2019-07-02  6:59 ` Ian Rogers
  2019-07-24 22:37 ` [PATCH v2 0/7] Optimize cgroup context switch Ian Rogers
  7 siblings, 0 replies; 29+ messages in thread
From: Ian Rogers @ 2019-07-02  6:59 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, linux-kernel
  Cc: Kan Liang, Stephane Eranian, Ian Rogers

The visit_groups_merge function no longer takes a function pointer, so
rename it to be similar to the other sched_in functions. Following Kan
Liang's <kan.liang@linux.intel.com> suggestion, also remove the
single-caller functions ctx_pinned_sched_in and ctx_flexible_sched_in,
moving their functionality into their caller.

Signed-off-by: Ian Rogers <irogers@google.com>
---
 include/linux/perf_event.h |  4 +-
 kernel/events/core.c       | 77 ++++++++++++++++----------------------
 2 files changed, 35 insertions(+), 46 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 86fb379296cb..1dd0250d72bf 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -802,8 +802,8 @@ struct perf_cpu_context {
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp;
 	struct list_head		cgrp_cpuctx_entry;
-	struct perf_event		**visit_groups_merge_iterator_storage;
-	int			       visit_groups_merge_iterator_storage_size;
+	struct perf_event		**ctx_groups_sched_in_iterator_storage;
+	int			      ctx_groups_sched_in_iterator_storage_size;
 #endif
 
 	struct list_head		sched_cb_entry;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a66477ee196a..e714c2f9ea0d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2642,22 +2642,22 @@ static int  __perf_install_in_context(void *info)
 					event->cgrp->css.cgroup);
 
 		/*
-		 * Ensure space for visit_groups_merge iterator storage. With
+		 * Ensure space for ctx_groups_sched_in iterator storage. With
 		 * cgroup profiling we may have an event at each depth plus
 		 * system wide events.
 		 */
 		max_iterators = perf_event_cgroup_depth(event) + 1;
 		if (max_iterators >
-		    cpuctx->visit_groups_merge_iterator_storage_size) {
+		    cpuctx->ctx_groups_sched_in_iterator_storage_size) {
 			struct perf_event **storage =
-			   krealloc(cpuctx->visit_groups_merge_iterator_storage,
+			  krealloc(cpuctx->ctx_groups_sched_in_iterator_storage,
 				    sizeof(struct perf_event *) * max_iterators,
 				    GFP_KERNEL);
 			if (storage) {
-				cpuctx->visit_groups_merge_iterator_storage
-						= storage;
-				cpuctx->visit_groups_merge_iterator_storage_size
-						= max_iterators;
+				cpuctx->ctx_groups_sched_in_iterator_storage
+				    = storage;
+			       cpuctx->ctx_groups_sched_in_iterator_storage_size
+				    = max_iterators;
 			} else {
 				WARN_ONCE(1, "Unable to increase iterator "
 					"storage for perf events with cgroups");
@@ -3466,32 +3466,33 @@ static int flexible_sched_in(struct perf_event_context *ctx,
  * Without cgroups, with a task context, there may be per-CPU and any
  * CPU events.
  */
-#define MIN_VISIT_GROUP_MERGE_ITERATORS 2
+#define MIN_CTX_GROUPS_SCHED_IN_ITERATORS 2
 
-static int visit_groups_merge(struct perf_event_context *ctx,
-			      struct perf_cpu_context *cpuctx,
-			      bool is_pinned,
-			      int *data)
+static int ctx_groups_sched_in(struct perf_event_context *ctx,
+			       struct perf_cpu_context *cpuctx,
+			       bool is_pinned,
+			       int *data)
 {
 	/*
 	 * A set of iterators, the iterator for the visit is chosen by the
 	 * group_index.
 	 */
 #ifndef CONFIG_CGROUP_PERF
-	struct perf_event *itrs[MIN_VISIT_GROUP_MERGE_ITERATORS];
+	struct perf_event *itrs[MIN_CTX_GROUPS_SCHED_IN_ITERATORS];
 	struct perf_event_heap heap = {
 		.storage = itrs,
 		.num_elements = 0,
-		.max_elements = MIN_VISIT_GROUP_MERGE_ITERATORS
+		.max_elements = MIN_CTX_GROUPS_SCHED_IN_ITERATORS
 	};
 #else
 	/*
 	 * With cgroups usage space in the CPU context reserved for iterators.
 	 */
 	struct perf_event_heap heap = {
-		.storage = cpuctx->visit_groups_merge_iterator_storage,
+		.storage = cpuctx->ctx_groups_sched_in_iterator_storage,
 		.num_elements = 0,
-		.max_elements = cpuctx->visit_groups_merge_iterator_storage_size
+		.max_elements =
+			cpuctx->ctx_groups_sched_in_iterator_storage_size
 	};
 #endif
 	int ret, cpu = smp_processor_id();
@@ -3623,27 +3624,6 @@ static int flexible_sched_in(struct perf_event_context *ctx,
 	return 0;
 }
 
-static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
-		    struct perf_cpu_context *cpuctx)
-{
-	visit_groups_merge(ctx,
-			   cpuctx,
-			   /*is_pinned=*/true,
-			   NULL);
-}
-
-static void
-ctx_flexible_sched_in(struct perf_event_context *ctx,
-		      struct perf_cpu_context *cpuctx)
-{
-	int can_add_hw = 1;
-
-	visit_groups_merge(ctx,
-			   cpuctx,
-			   /*is_pinned=*/false,
-			   &can_add_hw);
-}
 
 static void
 ctx_sched_in(struct perf_event_context *ctx,
@@ -3681,11 +3661,20 @@ ctx_sched_in(struct perf_event_context *ctx,
 	 * in order to give them the best chance of going on.
 	 */
 	if (is_active & EVENT_PINNED)
-		ctx_pinned_sched_in(ctx, cpuctx);
+		ctx_groups_sched_in(ctx,
+				    cpuctx,
+				    /*is_pinned=*/true,
+				    NULL);
 
 	/* Then walk through the lower prio flexible groups */
-	if (is_active & EVENT_FLEXIBLE)
-		ctx_flexible_sched_in(ctx, cpuctx);
+	if (is_active & EVENT_FLEXIBLE) {
+		int can_add_hw = 1;
+
+		ctx_groups_sched_in(ctx,
+				    cpuctx,
+				    /*is_pinned=*/false,
+				    &can_add_hw);
+	}
 }
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
@@ -10243,12 +10232,12 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 		cpuctx->ctx.pmu = pmu;
 		cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
 #ifdef CONFIG_CGROUP_PERF
-		cpuctx->visit_groups_merge_iterator_storage =
-				kmalloc_array(MIN_VISIT_GROUP_MERGE_ITERATORS,
+		cpuctx->ctx_groups_sched_in_iterator_storage =
+				kmalloc_array(MIN_CTX_GROUPS_SCHED_IN_ITERATORS,
 					      sizeof(struct perf_event *),
 					      GFP_KERNEL);
-		cpuctx->visit_groups_merge_iterator_storage_size =
-				MIN_VISIT_GROUP_MERGE_ITERATORS;
+		cpuctx->ctx_groups_sched_in_iterator_storage_size =
+				MIN_CTX_GROUPS_SCHED_IN_ITERATORS;
 #endif
 		__perf_mux_hrtimer_init(cpuctx, cpu);
 	}
-- 
2.22.0.410.gd8fdbe21b5-goog


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/7] perf: propagate perf_install_in_context errors up
  2019-07-02  6:59 ` [PATCH 1/7] perf: propagate perf_install_in_context errors up Ian Rogers
@ 2019-07-05 14:13   ` Jiri Olsa
  0 siblings, 0 replies; 29+ messages in thread
From: Jiri Olsa @ 2019-07-05 14:13 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Namhyung Kim, linux-kernel, Kan Liang,
	Stephane Eranian

On Mon, Jul 01, 2019 at 11:59:49PM -0700, Ian Rogers wrote:
> The current __perf_install_in_context can fail and the error is ignored.
> Changing __perf_install_in_context can add new failure modes that need
> errors propagating up. This change prepares for this.
> 
> Signed-off-by: Ian Rogers <irogers@google.com>
> ---
>  kernel/events/core.c | 38 +++++++++++++++++++++++++-------------
>  1 file changed, 25 insertions(+), 13 deletions(-)
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 785d708f8553..4faa90f5a934 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2558,11 +2558,12 @@ static int  __perf_install_in_context(void *info)
>   *
>   * Very similar to event_function_call, see comment there.
>   */
> -static void
> +static int
>  perf_install_in_context(struct perf_event_context *ctx,
>  			struct perf_event *event,
>  			int cpu)
>  {
> +	int err;
>  	struct task_struct *task = READ_ONCE(ctx->task);
>  
>  	lockdep_assert_held(&ctx->mutex);
> @@ -2577,15 +2578,15 @@ perf_install_in_context(struct perf_event_context *ctx,
>  	smp_store_release(&event->ctx, ctx);
>  
>  	if (!task) {
> -		cpu_function_call(cpu, __perf_install_in_context, event);
> -		return;
> +		err = cpu_function_call(cpu, __perf_install_in_context, event);
> +		return err;
>  	}
>  
>  	/*
>  	 * Should not happen, we validate the ctx is still alive before calling.
>  	 */
>  	if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
> -		return;
> +		return 0;
>  
>  	/*
>  	 * Installing events is tricky because we cannot rely on ctx->is_active
> @@ -2619,8 +2620,9 @@ perf_install_in_context(struct perf_event_context *ctx,
>  	 */
>  	smp_mb();
>  again:
> -	if (!task_function_call(task, __perf_install_in_context, event))
> -		return;
> +	err = task_function_call(task, __perf_install_in_context, event);
> +	if (err)
> +		return err;

you need to return in here if task_function_call succeeds and
continue in case of error, not the other way round, otherwise
bad things will happen ;-)
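
That is, the success path is the one that should return immediately; a
minimal sketch of the corrected flow (telling a genuine
__perf_install_in_context error apart from "task is not running" would
need extra care):

again:
	if (!task_function_call(task, __perf_install_in_context, event))
		return 0;	/* ran on the task and succeeded: done */

	/*
	 * Otherwise the task was not running (or the install failed):
	 * fall through, take ctx->lock and retry as the original code did.
	 */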

jirka

>  
>  	raw_spin_lock_irq(&ctx->lock);
>  	task = ctx->task;
> @@ -2631,7 +2633,7 @@ perf_install_in_context(struct perf_event_context *ctx,
>  		 * against perf_event_exit_task_context().
>  		 */
>  		raw_spin_unlock_irq(&ctx->lock);
> -		return;
> +		return 0;
>  	}
>  	/*
>  	 * If the task is not running, ctx->lock will avoid it becoming so,
> @@ -2643,6 +2645,7 @@ perf_install_in_context(struct perf_event_context *ctx,
>  	}
>  	add_event_to_ctx(event, ctx);
>  	raw_spin_unlock_irq(&ctx->lock);
> +	return 0;
>  }
>  
>  /*

SNIP

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/7] perf/cgroup: order events in RB tree by cgroup id
  2019-07-02  6:59 ` [PATCH 2/7] perf/cgroup: order events in RB tree by cgroup id Ian Rogers
@ 2019-07-08 15:38   ` Peter Zijlstra
  2019-07-08 15:45   ` Peter Zijlstra
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 29+ messages in thread
From: Peter Zijlstra @ 2019-07-08 15:38 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, linux-kernel, Kan Liang,
	Stephane Eranian

On Mon, Jul 01, 2019 at 11:59:50PM -0700, Ian Rogers wrote:
> @@ -1530,6 +1530,32 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right)
>  	if (left->cpu > right->cpu)
>  		return false;
>  
> +#ifdef CONFIG_CGROUP_PERF
> +	if (left->cgrp != right->cgrp) {
> +		if (!left->cgrp)
> +			/*
> +			 * Left has no cgroup but right does, no cgroups come
> +			 * first.
> +			 */
> +			return true;
> +		if (!right->cgrp)
> +			/*
> +			 * Right has no cgroup but left does, no cgroups come
> +			 * first.
> +			 */
> +			return false;

Per CodingStyle any multi-line statement (here due to comments) needs {
}.

> +		if (left->cgrp->css.cgroup != right->cgrp->css.cgroup) {
> +			if (!left->cgrp->css.cgroup)
> +				return true;
> +			if (!right->cgrp->css.cgroup)
> +				return false;
> +			/* Two dissimilar cgroups, order by id. */
> +			return left->cgrp->css.cgroup->id <
> +					right->cgrp->css.cgroup->id;
> +		}

This is dissimilar to the existing style (< true, > false, ==
fall-through).

Can all this be written like:


	if (!left->cgrp || !left->cgrp->css.cgroup)
		return true;

	if (!right->cgrp || !right->cgrp->css.cgroup)
		return false;

	if (left->cgrp->css.cgroup->id < right->cgrp->css.cgroup->id)
		return true;

	if (left->cgrp->css.cgroup->id > right->cgrp->css.cgroup->id)
		return false;


?

> +	}
> +#endif
> +
>  	if (left->group_index < right->group_index)
>  		return true;
>  	if (left->group_index > right->group_index)

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/7] perf/cgroup: order events in RB tree by cgroup id
  2019-07-02  6:59 ` [PATCH 2/7] perf/cgroup: order events in RB tree by cgroup id Ian Rogers
  2019-07-08 15:38   ` Peter Zijlstra
@ 2019-07-08 15:45   ` Peter Zijlstra
  2019-07-08 16:16   ` Peter Zijlstra
  2019-07-08 16:21   ` Peter Zijlstra
  3 siblings, 0 replies; 29+ messages in thread
From: Peter Zijlstra @ 2019-07-08 15:45 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, linux-kernel, Kan Liang,
	Stephane Eranian

On Mon, Jul 01, 2019 at 11:59:50PM -0700, Ian Rogers wrote:
> +perf_event_groups_first(struct perf_event_groups *groups, int cpu,
> +			struct cgroup *cgrp)
>  {
>  	struct perf_event *node_event = NULL, *match = NULL;
>  	struct rb_node *node = groups->tree.rb_node;
> +#ifdef CONFIG_CGROUP_PERF
> +	int node_cgrp_id, cgrp_id = 0;
> +
> +	if (cgrp)
> +		cgrp_id = cgrp->id;
> +#endif

Is 0 ever a valid cgroup.id ? If so, should we perhaps use -1 to denote
'none' ? Either way around a little comment here couldn't hurt, saves one
from digging into the cgroup code.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/7] perf/cgroup: order events in RB tree by cgroup id
  2019-07-02  6:59 ` [PATCH 2/7] perf/cgroup: order events in RB tree by cgroup id Ian Rogers
  2019-07-08 15:38   ` Peter Zijlstra
  2019-07-08 15:45   ` Peter Zijlstra
@ 2019-07-08 16:16   ` Peter Zijlstra
  2019-07-24 22:34     ` Ian Rogers
  2019-07-08 16:21   ` Peter Zijlstra
  3 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2019-07-08 16:16 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, linux-kernel, Kan Liang,
	Stephane Eranian

On Mon, Jul 01, 2019 at 11:59:50PM -0700, Ian Rogers wrote:
> +static int visit_groups_merge(struct perf_event_context *ctx,
> +			      struct perf_cpu_context *cpuctx,
> +			      struct perf_event_groups *groups,
> +			      int (*func)(struct perf_event_context *,
> +					  struct perf_cpu_context *,
> +					  struct perf_event *,
> +					  int *),
> +			      int *data)

> -struct sched_in_data {
> -	struct perf_event_context *ctx;
> -	struct perf_cpu_context *cpuctx;
> -	int can_add_hw;
> -};
> -
> -static int pinned_sched_in(struct perf_event *event, void *data)
> +static int pinned_sched_in(struct perf_event_context *ctx,
> +			   struct perf_cpu_context *cpuctx,
> +			   struct perf_event *event,
> +			   int *unused)
>  {
> -	struct sched_in_data *sid = data;
> -
>  	if (event->state <= PERF_EVENT_STATE_OFF)
>  		return 0;
>  
>  	if (!event_filter_match(event))
>  		return 0;
>  
> -	if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
> -		if (!group_sched_in(event, sid->cpuctx, sid->ctx))
> -			list_add_tail(&event->active_list, &sid->ctx->pinned_active);
> +	if (group_can_go_on(event, cpuctx, 1)) {
> +		if (!group_sched_in(event, cpuctx, ctx))
> +			list_add_tail(&event->active_list, &ctx->pinned_active);
>  	}
>  
>  	/*
> @@ -3317,24 +3444,25 @@ static int pinned_sched_in(struct perf_event *event, void *data)
>  	return 0;
>  }
>  
> -static int flexible_sched_in(struct perf_event *event, void *data)
> +static int flexible_sched_in(struct perf_event_context *ctx,
> +			     struct perf_cpu_context *cpuctx,
> +			     struct perf_event *event,
> +			     int *can_add_hw)
>  {
> -	struct sched_in_data *sid = data;
> -
>  	if (event->state <= PERF_EVENT_STATE_OFF)
>  		return 0;
>  
>  	if (!event_filter_match(event))
>  		return 0;
>  
> -	if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
> -		int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
> +	if (group_can_go_on(event, cpuctx, *can_add_hw)) {
> +		int ret = group_sched_in(event, cpuctx, ctx);
>  		if (ret) {
> -			sid->can_add_hw = 0;
> -			sid->ctx->rotate_necessary = 1;
> +			*can_add_hw = 0;
> +			ctx->rotate_necessary = 1;
>  			return 0;
>  		}
> -		list_add_tail(&event->active_list, &sid->ctx->flexible_active);
> +		list_add_tail(&event->active_list, &ctx->flexible_active);
>  	}
>  
>  	return 0;
> @@ -3344,30 +3472,24 @@ static void
>  ctx_pinned_sched_in(struct perf_event_context *ctx,
>  		    struct perf_cpu_context *cpuctx)
>  {
> -	struct sched_in_data sid = {
> -		.ctx = ctx,
> -		.cpuctx = cpuctx,
> -		.can_add_hw = 1,
> -	};
> -
> -	visit_groups_merge(&ctx->pinned_groups,
> -			   smp_processor_id(),
> -			   pinned_sched_in, &sid);
> +	visit_groups_merge(ctx,
> +			   cpuctx,
> +			   &ctx->pinned_groups,
> +			   pinned_sched_in,
> +			   NULL);
>  }
>  
>  static void
>  ctx_flexible_sched_in(struct perf_event_context *ctx,
>  		      struct perf_cpu_context *cpuctx)
>  {
> -	struct sched_in_data sid = {
> -		.ctx = ctx,
> -		.cpuctx = cpuctx,
> -		.can_add_hw = 1,
> -	};
> +	int can_add_hw = 1;
>  
> -	visit_groups_merge(&ctx->flexible_groups,
> -			   smp_processor_id(),
> -			   flexible_sched_in, &sid);
> +	visit_groups_merge(ctx,
> +			   cpuctx,
> +			   &ctx->flexible_groups,
> +			   flexible_sched_in,
> +			   &can_add_hw);
>  }

It is not clear to me why you're doing away with that sched_in_data
abstraction. AFAICT that has no material impact on this patch.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/7] perf/cgroup: order events in RB tree by cgroup id
  2019-07-02  6:59 ` [PATCH 2/7] perf/cgroup: order events in RB tree by cgroup id Ian Rogers
                     ` (2 preceding siblings ...)
  2019-07-08 16:16   ` Peter Zijlstra
@ 2019-07-08 16:21   ` Peter Zijlstra
  3 siblings, 0 replies; 29+ messages in thread
From: Peter Zijlstra @ 2019-07-08 16:21 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, linux-kernel, Kan Liang,
	Stephane Eranian

On Mon, Jul 01, 2019 at 11:59:50PM -0700, Ian Rogers wrote:
> +#ifndef CONFIG_CGROUP_PERF
> +	/*
> +	 * Without cgroups, with a task context, iterate over per-CPU and any
> +	 * CPU events.
> +	 */
> +	const int max_itrs = 2;
> +#else
> +	/*
> +	 * The depth of cgroups is limited by MAX_PATH. It is unlikely that this
> +	 * many parent-child related cgroups will have perf events
> +	 * monitored. Limit the number of cgroup iterators to 16.
> +	 */
> +	const int max_cgroups_with_events_depth = 16;
> +	/*
> +	 * With cgroups we either iterate for a task context (per-CPU or any CPU
> +	 * events) or for per CPU the global and per cgroup events.
> +	 */
> +	const int max_itrs = max(2, 1 + max_cgroups_with_events_depth);

That seems like a very complicated way to write 17.

> +#endif
> +	/* The number of iterators in use. */
> +	int num_itrs;
> +	/*
> +	 * A set of iterators, the iterator for the visit is chosen by the
> +	 * group_index.
> +	 */
> +	struct perf_event *itrs[max_itrs];

And that is 136 bytes of stack, which I suppose is just about
acceptable.

But why not something like:

	struct perf_event *iters[IS_ENABLED(CONFIG_CGROUP_PERF) ? 17 : 2];

	iters[i++] = foo;
	WARN_ON_ONCE(i >= ARRAY_SIZE(iters));

?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/7] perf: order iterators for visit_groups_merge into a min-heap
  2019-07-02  6:59 ` [PATCH 3/7] perf: order iterators for visit_groups_merge into a min-heap Ian Rogers
@ 2019-07-08 16:30   ` Peter Zijlstra
  2019-07-24 22:34     ` Ian Rogers
  2019-07-08 16:36   ` Peter Zijlstra
  1 sibling, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2019-07-08 16:30 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, linux-kernel, Kan Liang,
	Stephane Eranian

On Mon, Jul 01, 2019 at 11:59:51PM -0700, Ian Rogers wrote:
> The groups rbtree holding perf events, either for a CPU or a task, needs
> to have multiple iterators that visit events in group_index (insertion)
> order. Rather than linearly searching the iterators, use a min-heap to go
> from a O(#iterators) search to a O(log2(#iterators)) insert cost per event
> visited.

Is this actually faster for the common (very small n) case?

ISTR 'stupid' sorting algorithms are actually faster when the data fits
into L1

> Signed-off-by: Ian Rogers <irogers@google.com>
> ---
>  kernel/events/core.c | 123 +++++++++++++++++++++++++++++++++----------
>  1 file changed, 95 insertions(+), 28 deletions(-)
> 
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 9a2ad34184b8..396b5ac6dcd4 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -3318,6 +3318,77 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
>  	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
>  }
>  
> +/* Data structure used to hold a min-heap, ordered by group_index, of a fixed
> + * maximum size.
> + */

Broken comment style.

> +struct perf_event_heap {
> +	struct perf_event **storage;
> +	int num_elements;
> +	int max_elements;
> +};
> +
> +static void min_heap_swap(struct perf_event_heap *heap,
> +			  int pos1, int pos2)
> +{
> +	struct perf_event *tmp = heap->storage[pos1];
> +
> +	heap->storage[pos1] = heap->storage[pos2];
> +	heap->storage[pos2] = tmp;
> +}
> +
> +/* Sift the perf_event at pos down the heap. */
> +static void min_heapify(struct perf_event_heap *heap, int pos)
> +{
> +	int left_child, right_child;
> +
> +	while (pos > heap->num_elements / 2) {
> +		left_child = pos * 2;
> +		right_child = pos * 2 + 1;
> +		if (heap->storage[pos]->group_index >
> +		    heap->storage[left_child]->group_index) {
> +			min_heap_swap(heap, pos, left_child);
> +			pos = left_child;
> +		} else if (heap->storage[pos]->group_index >
> +			   heap->storage[right_child]->group_index) {
> +			min_heap_swap(heap, pos, right_child);
> +			pos = right_child;
> +		} else {
> +			break;
> +		}
> +	}
> +}
> +
> +/* Floyd's approach to heapification that is O(n). */
> +static void min_heapify_all(struct perf_event_heap *heap)
> +{
> +	int i;
> +
> +	for (i = heap->num_elements / 2; i > 0; i--)
> +		min_heapify(heap, i);
> +}
> +
> +/* Remove minimum element from the heap. */
> +static void min_heap_pop(struct perf_event_heap *heap)
> +{
> +	WARN_ONCE(heap->num_elements <= 0, "Popping an empty heap");
> +	heap->num_elements--;
> +	heap->storage[0] = heap->storage[heap->num_elements];
> +	min_heapify(heap, 0);
> +}

Is this really the first heap implementation in the kernel?

> @@ -3378,12 +3453,14 @@ static int visit_groups_merge(struct perf_event_context *ctx,
>  			struct cgroup_subsys_state *css;
>  
>  			for (css = &cpuctx->cgrp->css; css; css = css->parent) {
> -				itrs[num_itrs] = perf_event_groups_first(groups,
> +				heap.storage[heap.num_elements] =
> +						perf_event_groups_first(groups,
>  								   cpu,
>  								   css->cgroup);
> -				if (itrs[num_itrs]) {
> -					num_itrs++;
> -					if (num_itrs == max_itrs) {
> +				if (heap.storage[heap.num_elements]) {
> +					heap.num_elements++;
> +					if (heap.num_elements ==
> +					    heap.max_elements) {
>  						WARN_ONCE(
>  				     max_cgroups_with_events_depth,
>  				     "Insufficient iterators for cgroup depth");

That's turning into unreadable garbage due to indentation; surely
there's a solution for that.
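
One way out is a small helper (the name here is made up) that hides the
NULL check and the overflow warning, so the call site stays flat:

	static void perf_event_heap_add(struct perf_event_heap *heap,
					struct perf_event *event)
	{
		if (!event)
			return;
		if (WARN_ONCE(heap->num_elements >= heap->max_elements,
			      "Insufficient iterators for cgroup depth"))
			return;
		heap->storage[heap->num_elements++] = event;
	}

and then:

	for (css = &cpuctx->cgrp->css; css; css = css->parent)
		perf_event_heap_add(&heap,
				    perf_event_groups_first(groups, cpu,
							    css->cgroup));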

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/7] perf: order iterators for visit_groups_merge into a min-heap
  2019-07-02  6:59 ` [PATCH 3/7] perf: order iterators for visit_groups_merge into a min-heap Ian Rogers
  2019-07-08 16:30   ` Peter Zijlstra
@ 2019-07-08 16:36   ` Peter Zijlstra
  1 sibling, 0 replies; 29+ messages in thread
From: Peter Zijlstra @ 2019-07-08 16:36 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, linux-kernel, Kan Liang,
	Stephane Eranian

On Mon, Jul 01, 2019 at 11:59:51PM -0700, Ian Rogers wrote:

> +/* Sift the perf_event at pos down the heap. */
> +static void min_heapify(struct perf_event_heap *heap, int pos)
> +{
> +	int left_child, right_child;
> +
> +	while (pos > heap->num_elements / 2) {

Did that want to be 'pos < heap->num_elements / 2' ?

> +		left_child = pos * 2;
> +		right_child = pos * 2 + 1;
> +		if (heap->storage[pos]->group_index >
> +		    heap->storage[left_child]->group_index) {
> +			min_heap_swap(heap, pos, left_child);
> +			pos = left_child;
> +		} else if (heap->storage[pos]->group_index >
> +			   heap->storage[right_child]->group_index) {
> +			min_heap_swap(heap, pos, right_child);
> +			pos = right_child;
> +		} else {
> +			break;
> +		}
> +	}
> +}
> +
> +/* Floyd's approach to heapification that is O(n). */
> +static void min_heapify_all(struct perf_event_heap *heap)
> +{
> +	int i;
> +
> +	for (i = heap->num_elements / 2; i > 0; i--)
> +		min_heapify(heap, i);
> +}

Otherwise I don't actually see it do anything at all. Also, when pos >
nr/2, then pos * 2 is silly.
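
For reference, with storage[0] as the root (which is what min_heap_pop()
above assumes, i.e. 0-based indexing), a correct sift-down would look
roughly like the below; note the children of pos are 2*pos+1 and 2*pos+2,
and the right child needs a bounds check:

	while (pos * 2 + 1 < heap->num_elements) {
		int left = pos * 2 + 1;
		int right = pos * 2 + 2;
		int smallest = pos;

		if (heap->storage[left]->group_index <
		    heap->storage[smallest]->group_index)
			smallest = left;
		if (right < heap->num_elements &&
		    heap->storage[right]->group_index <
		    heap->storage[smallest]->group_index)
			smallest = right;
		if (smallest == pos)
			break;
		min_heap_swap(heap, pos, smallest);
		pos = smallest;
	}

min_heapify_all() would then need to run i from num_elements/2 - 1 down to
and including 0.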



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 5/7] perf: cache perf_event_groups_first for cgroups
  2019-07-02  6:59 ` [PATCH 5/7] perf: cache perf_event_groups_first for cgroups Ian Rogers
@ 2019-07-08 16:50   ` Peter Zijlstra
  2019-07-08 16:51     ` Peter Zijlstra
  0 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2019-07-08 16:50 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, linux-kernel, Kan Liang,
	Stephane Eranian

On Mon, Jul 01, 2019 at 11:59:53PM -0700, Ian Rogers wrote:
> @@ -3511,7 +3550,12 @@ static int visit_groups_merge(struct perf_event_context *ctx,
>  	min_heapify_all(&heap);
>  
>  	while (heap.num_elements > 0) {
> -		ret = func(ctx, cpuctx, heap.storage[0], data);
> +		if (is_pinned)
> +			ret = pinned_sched_in(ctx, cpuctx, heap.storage[0]);
> +		else
> +			ret = flexible_sched_in(ctx, cpuctx, heap.storage[0],
> +						data);
> +
>  		if (ret)
>  			return ret;
>  

You can actually merge those two functions; see merge_sched_in() in the
below patch:

https://lkml.kernel.org/r/20181010104559.GO5728@hirez.programming.kicks-ass.net
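
Roughly, and only as a sketch (the pinned-group error handling that
pinned_sched_in() does is elided, and ctx_pinned_sched_in() would pass its
own can_add_hw initialised to 1; the linked patch has the real thing):

	static int merge_sched_in(struct perf_event_context *ctx,
				  struct perf_cpu_context *cpuctx,
				  struct perf_event *event,
				  int *can_add_hw)
	{
		if (event->state <= PERF_EVENT_STATE_OFF)
			return 0;

		if (!event_filter_match(event))
			return 0;

		if (group_can_go_on(event, cpuctx, *can_add_hw)) {
			if (!group_sched_in(event, cpuctx, ctx)) {
				list_add_tail(&event->active_list,
					      event->attr.pinned ?
					      &ctx->pinned_active :
					      &ctx->flexible_active);
			} else if (!event->attr.pinned) {
				*can_add_hw = 0;
				ctx->rotate_necessary = 1;
			}
		}

		return 0;
	}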

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 5/7] perf: cache perf_event_groups_first for cgroups
  2019-07-08 16:50   ` Peter Zijlstra
@ 2019-07-08 16:51     ` Peter Zijlstra
  0 siblings, 0 replies; 29+ messages in thread
From: Peter Zijlstra @ 2019-07-08 16:51 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, linux-kernel, Kan Liang,
	Stephane Eranian

On Mon, Jul 08, 2019 at 06:50:34PM +0200, Peter Zijlstra wrote:
> On Mon, Jul 01, 2019 at 11:59:53PM -0700, Ian Rogers wrote:
> > @@ -3511,7 +3550,12 @@ static int visit_groups_merge(struct perf_event_context *ctx,
> >  	min_heapify_all(&heap);
> >  
> >  	while (heap.num_elements > 0) {
> > -		ret = func(ctx, cpuctx, heap.storage[0], data);
> > +		if (is_pinned)
> > +			ret = pinned_sched_in(ctx, cpuctx, heap.storage[0]);
> > +		else
> > +			ret = flexible_sched_in(ctx, cpuctx, heap.storage[0],
> > +						data);
> > +
> >  		if (ret)
> >  			return ret;
> >  
> 
> You can actually merge those two functions; see merge_sched_in() in the
> below patch:
> 
> https://lkml.kernel.org/r/20181010104559.GO5728@hirez.programming.kicks-ass.net

And this change is again unrelated to the one the Changelog mentions.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/7] perf/cgroup: order events in RB tree by cgroup id
  2019-07-08 16:16   ` Peter Zijlstra
@ 2019-07-24 22:34     ` Ian Rogers
  0 siblings, 0 replies; 29+ messages in thread
From: Ian Rogers @ 2019-07-24 22:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, LKML, Kan Liang, Stephane Eranian

On Mon, Jul 8, 2019 at 9:16 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Jul 01, 2019 at 11:59:50PM -0700, Ian Rogers wrote:
> > +static int visit_groups_merge(struct perf_event_context *ctx,
> > +                           struct perf_cpu_context *cpuctx,
> > +                           struct perf_event_groups *groups,
> > +                           int (*func)(struct perf_event_context *,
> > +                                       struct perf_cpu_context *,
> > +                                       struct perf_event *,
> > +                                       int *),
> > +                           int *data)
>
> > -struct sched_in_data {
> > -     struct perf_event_context *ctx;
> > -     struct perf_cpu_context *cpuctx;
> > -     int can_add_hw;
> > -};
> > -
> > -static int pinned_sched_in(struct perf_event *event, void *data)
> > +static int pinned_sched_in(struct perf_event_context *ctx,
> > +                        struct perf_cpu_context *cpuctx,
> > +                        struct perf_event *event,
> > +                        int *unused)
> >  {
> > -     struct sched_in_data *sid = data;
> > -
> >       if (event->state <= PERF_EVENT_STATE_OFF)
> >               return 0;
> >
> >       if (!event_filter_match(event))
> >               return 0;
> >
> > -     if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
> > -             if (!group_sched_in(event, sid->cpuctx, sid->ctx))
> > -                     list_add_tail(&event->active_list, &sid->ctx->pinned_active);
> > +     if (group_can_go_on(event, cpuctx, 1)) {
> > +             if (!group_sched_in(event, cpuctx, ctx))
> > +                     list_add_tail(&event->active_list, &ctx->pinned_active);
> >       }
> >
> >       /*
> > @@ -3317,24 +3444,25 @@ static int pinned_sched_in(struct perf_event *event, void *data)
> >       return 0;
> >  }
> >
> > -static int flexible_sched_in(struct perf_event *event, void *data)
> > +static int flexible_sched_in(struct perf_event_context *ctx,
> > +                          struct perf_cpu_context *cpuctx,
> > +                          struct perf_event *event,
> > +                          int *can_add_hw)
> >  {
> > -     struct sched_in_data *sid = data;
> > -
> >       if (event->state <= PERF_EVENT_STATE_OFF)
> >               return 0;
> >
> >       if (!event_filter_match(event))
> >               return 0;
> >
> > -     if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
> > -             int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
> > +     if (group_can_go_on(event, cpuctx, *can_add_hw)) {
> > +             int ret = group_sched_in(event, cpuctx, ctx);
> >               if (ret) {
> > -                     sid->can_add_hw = 0;
> > -                     sid->ctx->rotate_necessary = 1;
> > +                     *can_add_hw = 0;
> > +                     ctx->rotate_necessary = 1;
> >                       return 0;
> >               }
> > -             list_add_tail(&event->active_list, &sid->ctx->flexible_active);
> > +             list_add_tail(&event->active_list, &ctx->flexible_active);
> >       }
> >
> >       return 0;
> > @@ -3344,30 +3472,24 @@ static void
> >  ctx_pinned_sched_in(struct perf_event_context *ctx,
> >                   struct perf_cpu_context *cpuctx)
> >  {
> > -     struct sched_in_data sid = {
> > -             .ctx = ctx,
> > -             .cpuctx = cpuctx,
> > -             .can_add_hw = 1,
> > -     };
> > -
> > -     visit_groups_merge(&ctx->pinned_groups,
> > -                        smp_processor_id(),
> > -                        pinned_sched_in, &sid);
> > +     visit_groups_merge(ctx,
> > +                        cpuctx,
> > +                        &ctx->pinned_groups,
> > +                        pinned_sched_in,
> > +                        NULL);
> >  }
> >
> >  static void
> >  ctx_flexible_sched_in(struct perf_event_context *ctx,
> >                     struct perf_cpu_context *cpuctx)
> >  {
> > -     struct sched_in_data sid = {
> > -             .ctx = ctx,
> > -             .cpuctx = cpuctx,
> > -             .can_add_hw = 1,
> > -     };
> > +     int can_add_hw = 1;
> >
> > -     visit_groups_merge(&ctx->flexible_groups,
> > -                        smp_processor_id(),
> > -                        flexible_sched_in, &sid);
> > +     visit_groups_merge(ctx,
> > +                        cpuctx,
> > +                        &ctx->flexible_groups,
> > +                        flexible_sched_in,
> > +                        &can_add_hw);
> >  }
>
> It is not clear to me why you're doing away with that sched_in_data
> abstraction. AFAICT that has no material impact on this patch.

The change alters visit_groups_merge parameters to take ctx and cpuctx
in order to compute the number of iterators. ctx and cpuctx were taken
out of sched_in_data to avoid passing them twice. can_add_hw remains,
but is not wrapped inside sched_in_data, as it seemed unnecessary to
wrap a single boolean.

Thanks,
Ian

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/7] perf: order iterators for visit_groups_merge into a min-heap
  2019-07-08 16:30   ` Peter Zijlstra
@ 2019-07-24 22:34     ` Ian Rogers
  0 siblings, 0 replies; 29+ messages in thread
From: Ian Rogers @ 2019-07-24 22:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, LKML, Kan Liang, Stephane Eranian

On Mon, Jul 8, 2019 at 9:30 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Jul 01, 2019 at 11:59:51PM -0700, Ian Rogers wrote:
> > The groups rbtree holding perf events, either for a CPU or a task, needs
> > to have multiple iterators that visit events in group_index (insertion)
> > order. Rather than linearly searching the iterators, use a min-heap to go
> > from a O(#iterators) search to a O(log2(#iterators)) insert cost per event
> > visited.
>
> Is this actually faster for the common (very small n) case?
>
> ISTR 'stupid' sorting algorithms are actually faster when the data fits
> into L1

A common case is for there to be 1 cgroup iterator, for which all the
min_heapify calls will do no work as the event is always a leaf. A
min-heap would be expected to be optimal for a large number of
iterators. I'm not sure it is worth optimizing the space between a
small number of iterators and a large one.

Thanks,
Ian


> > Signed-off-by: Ian Rogers <irogers@google.com>
> > ---
> >  kernel/events/core.c | 123 +++++++++++++++++++++++++++++++++----------
> >  1 file changed, 95 insertions(+), 28 deletions(-)
> >
> > diff --git a/kernel/events/core.c b/kernel/events/core.c
> > index 9a2ad34184b8..396b5ac6dcd4 100644
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -3318,6 +3318,77 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
> >       ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
> >  }
> >
> > +/* Data structure used to hold a min-heap, ordered by group_index, of a fixed
> > + * maximum size.
> > + */
>
> Broken comment style.
>
> > +struct perf_event_heap {
> > +     struct perf_event **storage;
> > +     int num_elements;
> > +     int max_elements;
> > +};
> > +
> > +static void min_heap_swap(struct perf_event_heap *heap,
> > +                       int pos1, int pos2)
> > +{
> > +     struct perf_event *tmp = heap->storage[pos1];
> > +
> > +     heap->storage[pos1] = heap->storage[pos2];
> > +     heap->storage[pos2] = tmp;
> > +}
> > +
> > +/* Sift the perf_event at pos down the heap. */
> > +static void min_heapify(struct perf_event_heap *heap, int pos)
> > +{
> > +     int left_child, right_child;
> > +
> > +     while (pos > heap->num_elements / 2) {
> > +             left_child = pos * 2;
> > +             right_child = pos * 2 + 1;
> > +             if (heap->storage[pos]->group_index >
> > +                 heap->storage[left_child]->group_index) {
> > +                     min_heap_swap(heap, pos, left_child);
> > +                     pos = left_child;
> > +             } else if (heap->storage[pos]->group_index >
> > +                        heap->storage[right_child]->group_index) {
> > +                     min_heap_swap(heap, pos, right_child);
> > +                     pos = right_child;
> > +             } else {
> > +                     break;
> > +             }
> > +     }
> > +}
> > +
> > +/* Floyd's approach to heapification that is O(n). */
> > +static void min_heapify_all(struct perf_event_heap *heap)
> > +{
> > +     int i;
> > +
> > +     for (i = heap->num_elements / 2; i > 0; i--)
> > +             min_heapify(heap, i);
> > +}
> > +
> > +/* Remove minimum element from the heap. */
> > +static void min_heap_pop(struct perf_event_heap *heap)
> > +{
> > +     WARN_ONCE(heap->num_elements <= 0, "Popping an empty heap");
> > +     heap->num_elements--;
> > +     heap->storage[0] = heap->storage[heap->num_elements];
> > +     min_heapify(heap, 0);
> > +}
>
> Is this really the first heap implementation in the kernel?
>
> > @@ -3378,12 +3453,14 @@ static int visit_groups_merge(struct perf_event_context *ctx,
> >                       struct cgroup_subsys_state *css;
> >
> >                       for (css = &cpuctx->cgrp->css; css; css = css->parent) {
> > -                             itrs[num_itrs] = perf_event_groups_first(groups,
> > +                             heap.storage[heap.num_elements] =
> > +                                             perf_event_groups_first(groups,
> >                                                                  cpu,
> >                                                                  css->cgroup);
> > -                             if (itrs[num_itrs]) {
> > -                                     num_itrs++;
> > -                                     if (num_itrs == max_itrs) {
> > +                             if (heap.storage[heap.num_elements]) {
> > +                                     heap.num_elements++;
> > +                                     if (heap.num_elements ==
> > +                                         heap.max_elements) {
> >                                               WARN_ONCE(
> >                                    max_cgroups_with_events_depth,
> >                                    "Insufficient iterators for cgroup depth");
>
> That's turning into unreadable garbage due to indentation; surely
> there's a solution for that.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v2 0/7] Optimize cgroup context switch
  2019-07-02  6:59 [PATCH 0/7] Optimize cgroup context switch Ian Rogers
                   ` (6 preceding siblings ...)
  2019-07-02  6:59 ` [PATCH 7/7] perf: rename visit_groups_merge to ctx_groups_sched_in Ian Rogers
@ 2019-07-24 22:37 ` Ian Rogers
  2019-07-24 22:37   ` [PATCH v2 1/7] perf: propagate perf_install_in_context errors up Ian Rogers
                     ` (6 more replies)
  7 siblings, 7 replies; 29+ messages in thread
From: Ian Rogers @ 2019-07-24 22:37 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, linux-kernel
  Cc: Kan Liang, Stephane Eranian, Ian Rogers

Organize per-CPU perf event groups by cgroup then by group/insertion index.
To support cgroup hierarchies, a set of iterators is needed in
visit_groups_merge. To make this unbounded, use a per-CPU allocated
buffer. To make the set of iterators fast, use a min-heap ordered by
the group index.

These patches include a caching algorithm that avoids a search for the
first event in a group by Kan Liang <kan.liang@linux.intel.com> and the
set of patches as a whole have benefitted from conversation with him.

Version 2 of these patches addresses review comments and fixes bugs
found by Jiri Olsa and Peter Zijlstra.

Ian Rogers (7):
  perf: propagate perf_install_in_context errors up
  perf/cgroup: order events in RB tree by cgroup id
  perf: order iterators for visit_groups_merge into a min-heap
  perf: avoid a bounded set of visit_groups_merge iterators
  perf: cache perf_event_groups_first for cgroups
  perf: avoid double checking CPU and cgroup
  perf: rename visit_groups_merge to ctx_groups_sched_in

 include/linux/perf_event.h |   8 +
 kernel/events/core.c       | 511 +++++++++++++++++++++++++++++--------
 2 files changed, 414 insertions(+), 105 deletions(-)

-- 
2.22.0.709.g102302147b-goog


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v2 1/7] perf: propagate perf_install_in_context errors up
  2019-07-24 22:37 ` [PATCH v2 0/7] Optimize cgroup context switch Ian Rogers
@ 2019-07-24 22:37   ` Ian Rogers
  2019-07-24 22:37   ` [PATCH v2 2/7] perf/cgroup: order events in RB tree by cgroup id Ian Rogers
                     ` (5 subsequent siblings)
  6 siblings, 0 replies; 29+ messages in thread
From: Ian Rogers @ 2019-07-24 22:37 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, linux-kernel
  Cc: Kan Liang, Stephane Eranian, Ian Rogers

The current __perf_install_in_context can fail and the error is ignored.
Changing __perf_install_in_context can add new failure modes that need
errors propagating up. This change prepares for this.

Signed-off-by: Ian Rogers <irogers@google.com>
---
 kernel/events/core.c | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index eea9d52b010c..84a22a5c88b0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2561,11 +2561,12 @@ static bool exclusive_event_installable(struct perf_event *event,
  *
  * Very similar to event_function_call, see comment there.
  */
-static void
+static int
 perf_install_in_context(struct perf_event_context *ctx,
 			struct perf_event *event,
 			int cpu)
 {
+	int err;
 	struct task_struct *task = READ_ONCE(ctx->task);
 
 	lockdep_assert_held(&ctx->mutex);
@@ -2582,15 +2583,15 @@ perf_install_in_context(struct perf_event_context *ctx,
 	smp_store_release(&event->ctx, ctx);
 
 	if (!task) {
-		cpu_function_call(cpu, __perf_install_in_context, event);
-		return;
+		err = cpu_function_call(cpu, __perf_install_in_context, event);
+		return err;
 	}
 
 	/*
 	 * Should not happen, we validate the ctx is still alive before calling.
 	 */
 	if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
-		return;
+		return 0;
 
 	/*
 	 * Installing events is tricky because we cannot rely on ctx->is_active
@@ -2624,9 +2625,11 @@ perf_install_in_context(struct perf_event_context *ctx,
 	 */
 	smp_mb();
 again:
-	if (!task_function_call(task, __perf_install_in_context, event))
-		return;
+	err = task_function_call(task, __perf_install_in_context, event);
+	if (!err)
+		return 0;
 
+	WARN_ON_ONCE(err != -ESRCH);
 	raw_spin_lock_irq(&ctx->lock);
 	task = ctx->task;
 	if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
@@ -2636,7 +2639,7 @@ perf_install_in_context(struct perf_event_context *ctx,
 		 * against perf_event_exit_task_context().
 		 */
 		raw_spin_unlock_irq(&ctx->lock);
-		return;
+		return 0;
 	}
 	/*
 	 * If the task is not running, ctx->lock will avoid it becoming so,
@@ -2648,6 +2651,7 @@ perf_install_in_context(struct perf_event_context *ctx,
 	}
 	add_event_to_ctx(event, ctx);
 	raw_spin_unlock_irq(&ctx->lock);
+	return 0;
 }
 
 /*
@@ -11130,7 +11134,9 @@ SYSCALL_DEFINE5(perf_event_open,
 		 */
 		for_each_sibling_event(sibling, group_leader) {
 			perf_event__state_init(sibling);
-			perf_install_in_context(ctx, sibling, sibling->cpu);
+			err = perf_install_in_context(ctx, sibling,
+						      sibling->cpu);
+			WARN_ON_ONCE(err);
 			get_ctx(ctx);
 		}
 
@@ -11140,7 +11146,9 @@ SYSCALL_DEFINE5(perf_event_open,
 		 * startup state, ready to be add into new context.
 		 */
 		perf_event__state_init(group_leader);
-		perf_install_in_context(ctx, group_leader, group_leader->cpu);
+		err = perf_install_in_context(ctx, group_leader,
+					      group_leader->cpu);
+		WARN_ON_ONCE(err);
 		get_ctx(ctx);
 	}
 
@@ -11155,7 +11163,8 @@ SYSCALL_DEFINE5(perf_event_open,
 
 	event->owner = current;
 
-	perf_install_in_context(ctx, event, event->cpu);
+	err = perf_install_in_context(ctx, event, event->cpu);
+	WARN_ON_ONCE(err);
 	perf_unpin_context(ctx);
 
 	if (move_group)
@@ -11274,7 +11283,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 		goto err_unlock;
 	}
 
-	perf_install_in_context(ctx, event, cpu);
+	err = perf_install_in_context(ctx, event, cpu);
+	WARN_ON_ONCE(err);
 	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
 
@@ -11297,6 +11307,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
 	struct perf_event_context *dst_ctx;
 	struct perf_event *event, *tmp;
 	LIST_HEAD(events);
+	int err;
 
 	src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
 	dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
@@ -11335,7 +11346,8 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
 		if (event->state >= PERF_EVENT_STATE_OFF)
 			event->state = PERF_EVENT_STATE_INACTIVE;
 		account_event_cpu(event, dst_cpu);
-		perf_install_in_context(dst_ctx, event, dst_cpu);
+		err = perf_install_in_context(dst_ctx, event, dst_cpu);
+		WARN_ON_ONCE(err);
 		get_ctx(dst_ctx);
 	}
 
@@ -11348,7 +11360,8 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
 		if (event->state >= PERF_EVENT_STATE_OFF)
 			event->state = PERF_EVENT_STATE_INACTIVE;
 		account_event_cpu(event, dst_cpu);
-		perf_install_in_context(dst_ctx, event, dst_cpu);
+		err = perf_install_in_context(dst_ctx, event, dst_cpu);
+		WARN_ON_ONCE(err);
 		get_ctx(dst_ctx);
 	}
 	mutex_unlock(&dst_ctx->mutex);
-- 
2.22.0.709.g102302147b-goog


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH v2 2/7] perf/cgroup: order events in RB tree by cgroup id
  2019-07-24 22:37 ` [PATCH v2 0/7] Optimize cgroup context switch Ian Rogers
  2019-07-24 22:37   ` [PATCH v2 1/7] perf: propagate perf_install_in_context errors up Ian Rogers
@ 2019-07-24 22:37   ` Ian Rogers
  2020-03-06 14:42     ` [tip: perf/core] perf/cgroup: Order " tip-bot2 for Ian Rogers
  2019-07-24 22:37   ` [PATCH v2 3/7] perf: order iterators for visit_groups_merge into a min-heap Ian Rogers
                     ` (4 subsequent siblings)
  6 siblings, 1 reply; 29+ messages in thread
From: Ian Rogers @ 2019-07-24 22:37 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, linux-kernel
  Cc: Kan Liang, Stephane Eranian, Ian Rogers

If one is monitoring 6 events on 20 cgroups the per-CPU RB tree will
hold 120 events. The scheduling in of the events currently iterates
over all events looking to see which events match the task's cgroup or
its cgroup hierarchy. If a task is in 1 cgroup with 6 events, then 114
events are considered unnecessarily.

This change orders events in the RB tree by cgroup id if it is present.
This means scheduling in may go directly to events associated with the
task's cgroup if one is present. The patch extends the 2 iterators in
visit_groups_merge to a set of iterators, where different iterators are
needed for the task's cgroup and parent cgroups. By considering the set
of iterators when visiting, the lowest group_index event may be selected
and the insertion order group_index property is maintained. This also
allows event rotation to function correctly, as although events are
grouped into a cgroup, rotation always selects the lowest group_index
event to rotate (delete/insert into the tree) and the set of iterators
makes it so that the group_index order is maintained.

Cgroups can form a hierarchy with an unbounded number of events being
monitored in a child's parent's cgroup and their parent's parent's
cgroup and so on. This change limits the depth of cgroups that can have
monitored events to a constant (16). The change also scans the list of
events when considering the lowest group_index. Later patches will
improve the data structure to remove the constant limit and avoid a
linear search.

Signed-off-by: Ian Rogers <irogers@google.com>
---
 kernel/events/core.c | 232 +++++++++++++++++++++++++++++++------------
 1 file changed, 168 insertions(+), 64 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 84a22a5c88b0..47df9935de0b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1530,6 +1530,30 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right)
 	if (left->cpu > right->cpu)
 		return false;
 
+#ifdef CONFIG_CGROUP_PERF
+	if (left->cgrp != right->cgrp) {
+		if (!left->cgrp || !left->cgrp->css.cgroup) {
+			/*
+			 * Left has no cgroup but right does, no cgroups come
+			 * first.
+			 */
+			return true;
+		}
+		if (!right->cgrp || right->cgrp->css.cgroup) {
+			/*
+			 * Right has no cgroup but left does, no cgroups come
+			 * first.
+			 */
+			return false;
+		}
+		/* Two dissimilar cgroups, order by id. */
+		if (left->cgrp->css.cgroup->id < right->cgrp->css.cgroup->id)
+			return true;
+
+		return false;
+	}
+#endif
+
 	if (left->group_index < right->group_index)
 		return true;
 	if (left->group_index > right->group_index)
@@ -1609,25 +1633,48 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
 }
 
 /*
- * Get the leftmost event in the @cpu subtree.
+ * Get the leftmost event in the cpu/cgroup subtree.
  */
 static struct perf_event *
-perf_event_groups_first(struct perf_event_groups *groups, int cpu)
+perf_event_groups_first(struct perf_event_groups *groups, int cpu,
+			struct cgroup *cgrp)
 {
 	struct perf_event *node_event = NULL, *match = NULL;
 	struct rb_node *node = groups->tree.rb_node;
+#ifdef CONFIG_CGROUP_PERF
+	int node_cgrp_id, cgrp_id = 0;
+
+	if (cgrp)
+		cgrp_id = cgrp->id;
+#endif
 
 	while (node) {
 		node_event = container_of(node, struct perf_event, group_node);
 
 		if (cpu < node_event->cpu) {
 			node = node->rb_left;
-		} else if (cpu > node_event->cpu) {
+			continue;
+		}
+		if (cpu > node_event->cpu) {
 			node = node->rb_right;
-		} else {
-			match = node_event;
+			continue;
+		}
+#ifdef CONFIG_CGROUP_PERF
+		node_cgrp_id = 0;
+		if (node_event->cgrp && node_event->cgrp->css.cgroup)
+			node_cgrp_id = node_event->cgrp->css.cgroup->id;
+
+		if (cgrp_id < node_cgrp_id) {
 			node = node->rb_left;
+			continue;
 		}
+		if (cgrp_id > node_cgrp_id) {
+			node = node->rb_right;
+			continue;
+		}
+#endif
+		match = node_event;
+		node = node->rb_left;
 	}
 
 	return match;
@@ -1640,12 +1687,26 @@ static struct perf_event *
 perf_event_groups_next(struct perf_event *event)
 {
 	struct perf_event *next;
+#ifdef CONFIG_CGROUP_PERF
+	int curr_cgrp_id = 0;
+	int next_cgrp_id = 0;
+#endif
 
 	next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
-	if (next && next->cpu == event->cpu)
-		return next;
+	if (next == NULL || next->cpu != event->cpu)
+		return NULL;
 
-	return NULL;
+#ifdef CONFIG_CGROUP_PERF
+	if (event->cgrp && event->cgrp->css.cgroup)
+		curr_cgrp_id = event->cgrp->css.cgroup->id;
+
+	if (next->cgrp && next->cgrp->css.cgroup)
+		next_cgrp_id = next->cgrp->css.cgroup->id;
+
+	if (curr_cgrp_id != next_cgrp_id)
+		return NULL;
+#endif
+	return next;
 }
 
 /*
@@ -3261,28 +3322,81 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
 }
 
-static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
-			      int (*func)(struct perf_event *, void *), void *data)
-{
-	struct perf_event **evt, *evt1, *evt2;
-	int ret;
+static int visit_groups_merge(struct perf_event_context *ctx,
+			      struct perf_cpu_context *cpuctx,
+			      struct perf_event_groups *groups,
+			      int (*func)(struct perf_event_context *,
+					  struct perf_cpu_context *,
+					  struct perf_event *,
+					  int *),
+			      int *data)
+{
+	/* The number of iterators in use. */
+	int num_itrs;
+	/*
+	 * A set of iterators, the iterator for the visit is chosen by the
+	 * group_index. The size of the array is sized such that there is space:
+	 * - for task contexts per-CPU and any-CPU events can be iterated.
+	 * - for CPU contexts:
+	 *   - without cgroups, global events can be iterated.
+	 *   - with cgroups, global events can be iterated and 16 sets of cgroup
+	 *     events. Cgroup events may monitor a cgroup at an arbitrary
+	 *     location within the cgroup hierarchy. An iterator is needed for
+	 *     each cgroup with events in the hierarchy. Potentially this is
+	 *     only limited by MAX_PATH.
+	 */
+	struct perf_event *itrs[IS_ENABLED(CONFIG_CGROUP_PERF) ? 17 : 2];
+	/* A reference to the selected iterator. */
+	struct perf_event **evt;
+	int ret, i, cpu = smp_processor_id();
+
+	itrs[0] = perf_event_groups_first(groups, cpu, NULL);
+
+	if (ctx != &cpuctx->ctx) {
+		/*
+		 * A task context only ever has an iterator for CPU or any CPU
+		 * events.
+		 */
+		itrs[1] = perf_event_groups_first(groups, -1, NULL);
+		num_itrs = 2;
+	} else {
+		/* Per-CPU events are by definition not on any CPU. */
+		num_itrs = 1;
+#ifdef CONFIG_CGROUP_PERF
+		/*
+		 * For itrs[1..MAX] add an iterator for each cgroup parent that
+		 * has events.
+		 */
+		if (cpuctx->cgrp) {
+			struct cgroup_subsys_state *css;
+
+			for (css = &cpuctx->cgrp->css; css; css = css->parent) {
+				itrs[num_itrs] = perf_event_groups_first(groups,
+								   cpu,
+								   css->cgroup);
+				if (itrs[num_itrs] &&
+				    WARN_ONCE(++num_itrs == ARRAY_SIZE(iters)
+				     "Insufficient iterators for cgroup depth"))
+					break;
+			}
+		}
+#endif
+	}
 
-	evt1 = perf_event_groups_first(groups, -1);
-	evt2 = perf_event_groups_first(groups, cpu);
-
-	while (evt1 || evt2) {
-		if (evt1 && evt2) {
-			if (evt1->group_index < evt2->group_index)
-				evt = &evt1;
-			else
-				evt = &evt2;
-		} else if (evt1) {
-			evt = &evt1;
-		} else {
-			evt = &evt2;
+	while (true) {
+		/* Find lowest group_index event. */
+		evt = NULL;
+		for (i = 0; i < num_itrs; ++i) {
+			if (itrs[i] == NULL)
+				continue;
+			if (evt == NULL ||
+			    itrs[i]->group_index < (*evt)->group_index)
+				evt = &itrs[i];
 		}
+		if (evt == NULL)
+			break;
 
-		ret = func(*evt, data);
+		ret = func(ctx, cpuctx, *evt, data);
 		if (ret)
 			return ret;
 
@@ -3292,25 +3406,20 @@ static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
 	return 0;
 }
 
-struct sched_in_data {
-	struct perf_event_context *ctx;
-	struct perf_cpu_context *cpuctx;
-	int can_add_hw;
-};
-
-static int pinned_sched_in(struct perf_event *event, void *data)
+static int pinned_sched_in(struct perf_event_context *ctx,
+			   struct perf_cpu_context *cpuctx,
+			   struct perf_event *event,
+			   int *unused)
 {
-	struct sched_in_data *sid = data;
-
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
 
 	if (!event_filter_match(event))
 		return 0;
 
-	if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
-		if (!group_sched_in(event, sid->cpuctx, sid->ctx))
-			list_add_tail(&event->active_list, &sid->ctx->pinned_active);
+	if (group_can_go_on(event, cpuctx, 1)) {
+		if (!group_sched_in(event, cpuctx, ctx))
+			list_add_tail(&event->active_list, &ctx->pinned_active);
 	}
 
 	/*
@@ -3323,24 +3432,25 @@ static int pinned_sched_in(struct perf_event *event, void *data)
 	return 0;
 }
 
-static int flexible_sched_in(struct perf_event *event, void *data)
+static int flexible_sched_in(struct perf_event_context *ctx,
+			     struct perf_cpu_context *cpuctx,
+			     struct perf_event *event,
+			     int *can_add_hw)
 {
-	struct sched_in_data *sid = data;
-
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
 
 	if (!event_filter_match(event))
 		return 0;
 
-	if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
-		int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
+	if (group_can_go_on(event, cpuctx, *can_add_hw)) {
+		int ret = group_sched_in(event, cpuctx, ctx);
 		if (ret) {
-			sid->can_add_hw = 0;
-			sid->ctx->rotate_necessary = 1;
+			*can_add_hw = 0;
+			ctx->rotate_necessary = 1;
 			return 0;
 		}
-		list_add_tail(&event->active_list, &sid->ctx->flexible_active);
+		list_add_tail(&event->active_list, &ctx->flexible_active);
 	}
 
 	return 0;
@@ -3350,30 +3460,24 @@ static void
 ctx_pinned_sched_in(struct perf_event_context *ctx,
 		    struct perf_cpu_context *cpuctx)
 {
-	struct sched_in_data sid = {
-		.ctx = ctx,
-		.cpuctx = cpuctx,
-		.can_add_hw = 1,
-	};
-
-	visit_groups_merge(&ctx->pinned_groups,
-			   smp_processor_id(),
-			   pinned_sched_in, &sid);
+	visit_groups_merge(ctx,
+			   cpuctx,
+			   &ctx->pinned_groups,
+			   pinned_sched_in,
+			   NULL);
 }
 
 static void
 ctx_flexible_sched_in(struct perf_event_context *ctx,
 		      struct perf_cpu_context *cpuctx)
 {
-	struct sched_in_data sid = {
-		.ctx = ctx,
-		.cpuctx = cpuctx,
-		.can_add_hw = 1,
-	};
+	int can_add_hw = 1;
 
-	visit_groups_merge(&ctx->flexible_groups,
-			   smp_processor_id(),
-			   flexible_sched_in, &sid);
+	visit_groups_merge(ctx,
+			   cpuctx,
+			   &ctx->flexible_groups,
+			   flexible_sched_in,
+			   &can_add_hw);
 }
 
 static void
-- 
2.22.0.709.g102302147b-goog


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH v2 3/7] perf: order iterators for visit_groups_merge into a min-heap
  2019-07-24 22:37 ` [PATCH v2 0/7] Optimize cgroup context switch Ian Rogers
  2019-07-24 22:37   ` [PATCH v2 1/7] perf: propagate perf_install_in_context errors up Ian Rogers
  2019-07-24 22:37   ` [PATCH v2 2/7] perf/cgroup: order events in RB tree by cgroup id Ian Rogers
@ 2019-07-24 22:37   ` Ian Rogers
  2019-07-24 22:37   ` [PATCH v2 4/7] perf: avoid a bounded set of visit_groups_merge iterators Ian Rogers
                     ` (3 subsequent siblings)
  6 siblings, 0 replies; 29+ messages in thread
From: Ian Rogers @ 2019-07-24 22:37 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, linux-kernel
  Cc: Kan Liang, Stephane Eranian, Ian Rogers

The groups rbtree holding perf events, either for a CPU or a task, needs
to have multiple iterators that visit events in group_index (insertion)
order. Rather than linearly searching the iterators, use a min-heap to go
from a O(#iterators) search to a O(log2(#iterators)) insert cost per event
visited.

Signed-off-by: Ian Rogers <irogers@google.com>
---
 kernel/events/core.c | 131 +++++++++++++++++++++++++++++++++----------
 1 file changed, 102 insertions(+), 29 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 47df9935de0b..4d70df0415b9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3322,6 +3322,78 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
 }
 
+/*
+ * Data structure used to hold a min-heap, ordered by group_index, of a fixed
+ * maximum size.
+ */
+struct perf_event_heap {
+	struct perf_event **storage;
+	int num_elements;
+	int max_elements;
+};
+
+static void min_heap_swap(struct perf_event_heap *heap,
+			  int pos1, int pos2)
+{
+	struct perf_event *tmp = heap->storage[pos1];
+
+	heap->storage[pos1] = heap->storage[pos2];
+	heap->storage[pos2] = tmp;
+}
+
+/* Sift the perf_event at pos down the heap. */
+static void min_heapify(struct perf_event_heap *heap, int pos)
+{
+	int left_child, right_child;
+
+	while (pos < heap->num_elements / 2) {
+		left_child = pos * 2;
+		right_child = pos * 2 + 1;
+		if (heap->storage[pos]->group_index >
+		    heap->storage[left_child]->group_index) {
+			min_heap_swap(heap, pos, left_child);
+			pos = left_child;
+		} else if (heap->storage[pos]->group_index >
+			   heap->storage[right_child]->group_index) {
+			min_heap_swap(heap, pos, right_child);
+			pos = right_child;
+		} else {
+			break;
+		}
+	}
+}
+
+/* Floyd's approach to heapification that is O(n). */
+static void min_heapify_all(struct perf_event_heap *heap)
+{
+	int i;
+
+	for (i = heap->num_elements / 2; i > 0; i--)
+		min_heapify(heap, i);
+}
+
+/* Remove minimum element from the heap. */
+static void min_heap_pop(struct perf_event_heap *heap)
+{
+	WARN_ONCE(heap->num_elements <= 0, "Popping an empty heap");
+	heap->num_elements--;
+	heap->storage[0] = heap->storage[heap->num_elements];
+	min_heapify(heap, 0);
+}
+
+/* Remove the minimum element and then push the given event. */
+static void min_heap_pop_push(struct perf_event_heap *heap,
+			      struct perf_event *event)
+{
+	WARN_ONCE(heap->num_elements <= 0, "Popping an empty heap");
+	if (event == NULL) {
+		min_heap_pop(heap);
+	} else {
+		heap->storage[0] = event;
+		min_heapify(heap, 0);
+	}
+}
+
 static int visit_groups_merge(struct perf_event_context *ctx,
 			      struct perf_cpu_context *cpuctx,
 			      struct perf_event_groups *groups,
@@ -3331,8 +3403,6 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 					  int *),
 			      int *data)
 {
-	/* The number of iterators in use. */
-	int num_itrs;
 	/*
 	 * A set of iterators, the iterator for the visit is chosen by the
 	 * group_index. The size of the array is sized such that there is space:
@@ -3346,22 +3416,28 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 	 *     only limited by MAX_PATH.
 	 */
 	struct perf_event *itrs[IS_ENABLED(CONFIG_CGROUP_PERF) ? 17 : 2];
-	/* A reference to the selected iterator. */
-	struct perf_event **evt;
-	int ret, i, cpu = smp_processor_id();
+	struct perf_event_heap heap = {
+		.storage = itrs,
+		.num_elements = 0,
+		.max_elements = ARRAYSIZE(itrs)
+	};
+	int ret, cpu = smp_processor_id();
 
-	itrs[0] = perf_event_groups_first(groups, cpu, NULL);
+	heap.storage[0] = perf_event_groups_first(groups, cpu, NULL);
+	if (heap.storage[0])
+		heap.num_elements++;
 
 	if (ctx != &cpuctx->ctx) {
 		/*
 		 * A task context only ever has an iterator for CPU or any CPU
 		 * events.
 		 */
-		itrs[1] = perf_event_groups_first(groups, -1, NULL);
-		num_itrs = 2;
+		heap.storage[heap.num_elements] =
+				perf_event_groups_first(groups, -1, NULL);
+		if (heap.storage[heap.num_elements])
+			heap.num_elements++;
 	} else {
 		/* Per-CPU events are by definition not on any CPU. */
-		num_itrs = 1;
 #ifdef CONFIG_CGROUP_PERF
 		/*
 		 * For itrs[1..MAX] add an iterator for each cgroup parent that
@@ -3371,36 +3447,33 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 			struct cgroup_subsys_state *css;
 
 			for (css = &cpuctx->cgrp->css; css; css = css->parent) {
-				itrs[num_itrs] = perf_event_groups_first(groups,
+				heap.storage[heap.num_elements] =
+						perf_event_groups_first(groups,
 								   cpu,
 								   css->cgroup);
-				if (itrs[num_itrs] &&
-				    WARN_ONCE(++num_itrs == ARRAY_SIZE(iters)
-				     "Insufficient iterators for cgroup depth"))
-					break;
+				if (heap.storage[heap.num_elements]) {
+					heap.num_elements++;
+					if (heap.num_elements ==
+					    heap.max_elements) {
+						WARN_ONCE(
+				     max_cgroups_with_events_depth,
+				     "Insufficient iterators for cgroup depth");
+						break;
+					}
+				}
 			}
 		}
 #endif
 	}
+	min_heapify_all(&heap);
 
-	while (true) {
-		/* Find lowest group_index event. */
-		evt = NULL;
-		for (i = 0; i < num_itrs; ++i) {
-			if (itrs[i] == NULL)
-				continue;
-			if (evt == NULL ||
-			    itrs[i]->group_index < (*evt)->group_index)
-				evt = &itrs[i];
-		}
-		if (evt == NULL)
-			break;
-
-		ret = func(ctx, cpuctx, *evt, data);
+	while (heap.num_elements > 0) {
+		ret = func(ctx, cpuctx, heap.storage[0], data);
 		if (ret)
 			return ret;
 
-		*evt = perf_event_groups_next(*evt);
+		min_heap_pop_push(&heap,
+				  perf_event_groups_next(heap.storage[0]));
 	}
 
 	return 0;
-- 
2.22.0.709.g102302147b-goog


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH v2 4/7] perf: avoid a bounded set of visit_groups_merge iterators
  2019-07-24 22:37 ` [PATCH v2 0/7] Optimize cgroup context switch Ian Rogers
                     ` (2 preceding siblings ...)
  2019-07-24 22:37   ` [PATCH v2 3/7] perf: order iterators for visit_groups_merge into a min-heap Ian Rogers
@ 2019-07-24 22:37   ` Ian Rogers
  2019-08-07 21:11     ` Peter Zijlstra
  2019-07-24 22:37   ` [PATCH v2 5/7] perf: cache perf_event_groups_first for cgroups Ian Rogers
                     ` (2 subsequent siblings)
  6 siblings, 1 reply; 29+ messages in thread
From: Ian Rogers @ 2019-07-24 22:37 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, linux-kernel
  Cc: Kan Liang, Stephane Eranian, Ian Rogers

Create a per-cpu array of iterators that gets resized when cgroup events
are added. The size of the array reflects the maximum depth of cgroups,
although not all cgroups will have events monitored within them. This
approach avoids added storage cost to perf_event.

Signed-off-by: Ian Rogers <irogers@google.com>
---
 include/linux/perf_event.h |  2 +
 kernel/events/core.c       | 87 +++++++++++++++++++++++++++++++-------
 2 files changed, 73 insertions(+), 16 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e8ad3c590a23..43f90cfa2c39 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -802,6 +802,8 @@ struct perf_cpu_context {
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp;
 	struct list_head		cgrp_cpuctx_entry;
+	struct perf_event		**visit_groups_merge_iterator_storage;
+	int			       visit_groups_merge_iterator_storage_size;
 #endif
 
 	struct list_head		sched_cb_entry;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4d70df0415b9..2a2188908bed 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1709,6 +1709,20 @@ perf_event_groups_next(struct perf_event *event)
 	return next;
 }
 
+#ifdef CONFIG_CGROUP_PERF
+int perf_event_cgroup_depth(struct perf_event *event)
+{
+	struct cgroup_subsys_state *css;
+	struct perf_cgroup *cgrp = event->cgrp;
+	int depth = 0;
+
+	if (cgrp)
+		for (css = &cgrp->css; css; css = css->parent)
+			depth++;
+	return depth;
+}
+#endif
+
 /*
  * Iterate through the whole groups tree.
  */
@@ -2590,6 +2604,7 @@ static int  __perf_install_in_context(void *info)
 
 #ifdef CONFIG_CGROUP_PERF
 	if (is_cgroup_event(event)) {
+		int max_iterators;
 		/*
 		 * If the current cgroup doesn't match the event's
 		 * cgroup, we should not try to schedule it.
@@ -2597,6 +2612,30 @@ static int  __perf_install_in_context(void *info)
 		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
 		reprogram = cgroup_is_descendant(cgrp->css.cgroup,
 					event->cgrp->css.cgroup);
+
+		/*
+		 * Ensure space for visit_groups_merge iterator storage. With
+		 * cgroup profiling we may have an event at each depth plus
+		 * system wide events.
+		 */
+		max_iterators = perf_event_cgroup_depth(event) + 1;
+		if (max_iterators >
+		    cpuctx->visit_groups_merge_iterator_storage_size) {
+			struct perf_event **storage =
+			   krealloc(cpuctx->visit_groups_merge_iterator_storage,
+				    sizeof(struct perf_event *) * max_iterators,
+				    GFP_KERNEL);
+			if (storage) {
+				cpuctx->visit_groups_merge_iterator_storage
+						= storage;
+				cpuctx->visit_groups_merge_iterator_storage_size
+						= max_iterators;
+			} else {
+				WARN_ONCE(1, "Unable to increase iterator "
+					"storage for perf events with cgroups");
+				ret = -ENOMEM;
+			}
+		}
 	}
 #endif
 
@@ -3394,6 +3433,13 @@ static void min_heap_pop_push(struct perf_event_heap *heap,
 	}
 }
 
+
+/*
+ * Without cgroups, with a task context, there may be per-CPU and any
+ * CPU events.
+ */
+#define MIN_VISIT_GROUP_MERGE_ITERATORS 2
+
 static int visit_groups_merge(struct perf_event_context *ctx,
 			      struct perf_cpu_context *cpuctx,
 			      struct perf_event_groups *groups,
@@ -3405,22 +3451,25 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 {
 	/*
 	 * A set of iterators, the iterator for the visit is chosen by the
-	 * group_index. The size of the array is sized such that there is space:
-	 * - for task contexts per-CPU and any-CPU events can be iterated.
-	 * - for CPU contexts:
-	 *   - without cgroups, global events can be iterated.
-	 *   - with cgroups, global events can be iterated and 16 sets of cgroup
-	 *     events. Cgroup events may monitor a cgroup at an arbitrary
-	 *     location within the cgroup hierarchy. An iterator is needed for
-	 *     each cgroup with events in the hierarchy. Potentially this is
-	 *     only limited by MAX_PATH.
-	 */
-	struct perf_event *itrs[IS_ENABLED(CONFIG_CGROUP_PERF) ? 17 : 2];
+	 * group_index.
+	 */
+#ifndef CONFIG_CGROUP_PERF
+	struct perf_event *itrs[MIN_VISIT_GROUP_MERGE_ITERATORS];
 	struct perf_event_heap heap = {
 		.storage = itrs,
 		.num_elements = 0,
-		.max_elements = ARRAYSIZE(itrs)
+		.max_elements = MIN_VISIT_GROUP_MERGE_ITERATORS
+	};
+#else
+	/*
+	 * With cgroups usage space in the CPU context reserved for iterators.
+	 */
+	struct perf_event_heap heap = {
+		.storage = cpuctx->visit_groups_merge_iterator_storage,
+		.num_elements = 0,
+		.max_elements = cpuctx->visit_groups_merge_iterator_storage_size
 	};
+#endif
 	int ret, cpu = smp_processor_id();
 
 	heap.storage[0] = perf_event_groups_first(groups, cpu, NULL);
@@ -3455,9 +3504,8 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 					heap.num_elements++;
 					if (heap.num_elements ==
 					    heap.max_elements) {
-						WARN_ONCE(
-				     max_cgroups_with_events_depth,
-				     "Insufficient iterators for cgroup depth");
+						WARN_ONCE(1,
+						"per-CPU min-heap undersized");
 						break;
 					}
 				}
@@ -10167,7 +10215,14 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
 		cpuctx->ctx.pmu = pmu;
 		cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
-
+#ifdef CONFIG_CGROUP_PERF
+		cpuctx->visit_groups_merge_iterator_storage =
+				kmalloc_array(MIN_VISIT_GROUP_MERGE_ITERATORS,
+					      sizeof(struct perf_event *),
+					      GFP_KERNEL);
+		cpuctx->visit_groups_merge_iterator_storage_size =
+				MIN_VISIT_GROUP_MERGE_ITERATORS;
+#endif
 		__perf_mux_hrtimer_init(cpuctx, cpu);
 	}
 
-- 
2.22.0.709.g102302147b-goog


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH v2 5/7] perf: cache perf_event_groups_first for cgroups
  2019-07-24 22:37 ` [PATCH v2 0/7] Optimize cgroup context switch Ian Rogers
                     ` (3 preceding siblings ...)
  2019-07-24 22:37   ` [PATCH v2 4/7] perf: avoid a bounded set of visit_groups_merge iterators Ian Rogers
@ 2019-07-24 22:37   ` Ian Rogers
  2019-07-24 22:37   ` [PATCH v2 6/7] perf: avoid double checking CPU and cgroup Ian Rogers
  2019-07-24 22:37   ` [PATCH v2 7/7] perf: rename visit_groups_merge to ctx_groups_sched_in Ian Rogers
  6 siblings, 0 replies; 29+ messages in thread
From: Ian Rogers @ 2019-07-24 22:37 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, linux-kernel
  Cc: Kan Liang, Stephane Eranian, Ian Rogers

Add a per-CPU cache of the pinned and flexible perf_event_groups_first
value for a cgroup, avoiding O(log(#perf events)) searches during
sched_in.
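
The invariant that keeps this cache cheap to maintain can be shown with
a small standalone sketch (made-up types and names, not the kernel
code): a newly inserted event always carries the largest group_index and
so lands to the right of the cached leftmost event, so insertion only
fills an empty cache; deletion only moves the cache when the cached
event itself goes away, and then it advances to that event's in-order
successor.

#include <assert.h>

/* Userspace analog of the cached leftmost event for one (cgroup, cpu). */
struct fake_event {
	unsigned long group_index;
	struct fake_event *next;	/* in-order successor, or NULL */
};

struct first_cache {
	struct fake_event *first;	/* cached leftmost event */
};

/* A new event gets the largest group_index so far and lands rightmost,
 * so insertion only needs to fill an empty cache. */
static void cache_insert(struct first_cache *c, struct fake_event *e)
{
	if (!c->first)
		c->first = e;
}

/* Deletion only moves the cache when the cached (leftmost) event itself
 * is removed; it then advances to that event's in-order successor. */
static void cache_delete(struct first_cache *c, struct fake_event *e)
{
	if (c->first == e)
		c->first = e->next;
}

int main(void)
{
	struct fake_event b = { .group_index = 2, .next = 0 };
	struct fake_event a = { .group_index = 1, .next = &b };
	struct first_cache c = { .first = 0 };

	cache_insert(&c, &a);
	cache_insert(&c, &b);	/* no change: a is still leftmost */
	assert(c.first == &a);
	cache_delete(&c, &a);	/* cache advances to a's successor */
	assert(c.first == &b);
	return 0;
}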

This patch is derived from an original patch by Kan Liang
<kan.liang@linux.intel.com>:
https://lkml.org/lkml/2019/5/15/1594

Signed-off-by: Ian Rogers <irogers@google.com>
---
 include/linux/perf_event.h |   6 ++
 kernel/events/core.c       | 142 ++++++++++++++++++++++++++-----------
 2 files changed, 106 insertions(+), 42 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 43f90cfa2c39..2d411786ab87 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -845,6 +845,12 @@ struct perf_cgroup_info {
 struct perf_cgroup {
 	struct cgroup_subsys_state	css;
 	struct perf_cgroup_info	__percpu *info;
+	/* A per-CPU cache of the first event in the perf_cpu_context's
+	 * perf_event_context for this cgroup, in pinned_groups or
+	 * flexible_groups. Avoids an rbtree search during sched_in.
+	 */
+	struct perf_event * __percpu    *pinned_event;
+	struct perf_event * __percpu    *flexible_event;
 };
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2a2188908bed..c8b9c8611533 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1592,6 +1592,25 @@ perf_event_groups_insert(struct perf_event_groups *groups,
 
 	rb_link_node(&event->group_node, parent, node);
 	rb_insert_color(&event->group_node, &groups->tree);
+#ifdef CONFIG_CGROUP_PERF
+	if (is_cgroup_event(event)) {
+		struct perf_event **cgrp_event;
+
+		if (event->attr.pinned)
+			cgrp_event = per_cpu_ptr(event->cgrp->pinned_event,
+						 event->cpu);
+		else
+			cgrp_event = per_cpu_ptr(event->cgrp->flexible_event,
+						 event->cpu);
+		/*
+		 * Cgroup events for the same cgroup on the same CPU will
+		 * always be inserted at the right because of bigger
+		 * @groups->index. Only need to set *cgrp_event when it's NULL.
+		 */
+		if (!*cgrp_event)
+			*cgrp_event = event;
+	}
+#endif
 }
 
 /*
@@ -1606,6 +1625,9 @@ add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
 	perf_event_groups_insert(groups, event);
 }
 
+static struct perf_event *
+perf_event_groups_next(struct perf_event *event);
+
 /*
  * Delete a group from a tree.
  */
@@ -1616,6 +1638,22 @@ perf_event_groups_delete(struct perf_event_groups *groups,
 	WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
 		     RB_EMPTY_ROOT(&groups->tree));
 
+#ifdef CONFIG_CGROUP_PERF
+	if (is_cgroup_event(event)) {
+		struct perf_event **cgrp_event;
+
+		if (event->attr.pinned)
+			cgrp_event = per_cpu_ptr(event->cgrp->pinned_event,
+						 event->cpu);
+		else
+			cgrp_event = per_cpu_ptr(event->cgrp->flexible_event,
+						 event->cpu);
+
+		if (*cgrp_event == event)
+			*cgrp_event = perf_event_groups_next(event);
+	}
+#endif
+
 	rb_erase(&event->group_node, &groups->tree);
 	init_event_group(event);
 }
@@ -1633,20 +1671,14 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
 }
 
 /*
- * Get the leftmost event in the cpu/cgroup subtree.
+ * Get the leftmost event in the cpu subtree without a cgroup (ie task or
+ * system-wide).
  */
 static struct perf_event *
-perf_event_groups_first(struct perf_event_groups *groups, int cpu,
-			struct cgroup *cgrp)
+perf_event_groups_first_no_cgroup(struct perf_event_groups *groups, int cpu)
 {
 	struct perf_event *node_event = NULL, *match = NULL;
 	struct rb_node *node = groups->tree.rb_node;
-#ifdef CONFIG_CGROUP_PERF
-	int node_cgrp_id, cgrp_id = 0;
-
-	if (cgrp)
-		cgrp_id = cgrp->id;
-#endif
 
 	while (node) {
 		node_event = container_of(node, struct perf_event, group_node);
@@ -1660,18 +1692,10 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu,
 			continue;
 		}
 #ifdef CONFIG_CGROUP_PERF
-		node_cgrp_id = 0;
-		if (node_event->cgrp && node_event->cgrp->css.cgroup)
-			node_cgrp_id = node_event->cgrp->css.cgroup->id;
-
-		if (cgrp_id < node_cgrp_id) {
+		if (node_event->cgrp) {
 			node = node->rb_left;
 			continue;
 		}
-		if (cgrp_id > node_cgrp_id) {
-			node = node->rb_right;
-			continue;
-		}
 #endif
 		match = node_event;
 		node = node->rb_left;
@@ -3433,6 +3457,13 @@ static void min_heap_pop_push(struct perf_event_heap *heap,
 	}
 }
 
+static int pinned_sched_in(struct perf_event_context *ctx,
+			   struct perf_cpu_context *cpuctx,
+			   struct perf_event *event);
+static int flexible_sched_in(struct perf_event_context *ctx,
+			     struct perf_cpu_context *cpuctx,
+			     struct perf_event *event,
+			     int *can_add_hw);
 
 /*
  * Without cgroups, with a task context, there may be per-CPU and any
@@ -3442,11 +3473,7 @@ static void min_heap_pop_push(struct perf_event_heap *heap,
 
 static int visit_groups_merge(struct perf_event_context *ctx,
 			      struct perf_cpu_context *cpuctx,
-			      struct perf_event_groups *groups,
-			      int (*func)(struct perf_event_context *,
-					  struct perf_cpu_context *,
-					  struct perf_event *,
-					  int *),
+			      bool is_pinned,
 			      int *data)
 {
 	/*
@@ -3472,7 +3499,11 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 #endif
 	int ret, cpu = smp_processor_id();
 
-	heap.storage[0] = perf_event_groups_first(groups, cpu, NULL);
+	struct perf_event_groups *groups = is_pinned
+					   ? &ctx->pinned_groups
+					   : &ctx->flexible_groups;
+
+	heap.storage[0] = perf_event_groups_first_no_cgroup(groups, cpu);
 	if (heap.storage[0])
 		heap.num_elements++;
 
@@ -3482,7 +3513,7 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 		 * events.
 		 */
 		heap.storage[heap.num_elements] =
-				perf_event_groups_first(groups, -1, NULL);
+				perf_event_groups_first_no_cgroup(groups, -1);
 		if (heap.storage[heap.num_elements])
 			heap.num_elements++;
 	} else {
@@ -3492,14 +3523,22 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 		 * For itrs[1..MAX] add an iterator for each cgroup parent that
 		 * has events.
 		 */
-		if (cpuctx->cgrp) {
+		struct perf_cgroup *cgrp = cpuctx->cgrp;
+
+		if (cgrp) {
 			struct cgroup_subsys_state *css;
 
-			for (css = &cpuctx->cgrp->css; css; css = css->parent) {
-				heap.storage[heap.num_elements] =
-						perf_event_groups_first(groups,
-								   cpu,
-								   css->cgroup);
+			for (css = &cgrp->css; css; css = css->parent) {
+				/* root cgroup doesn't have events */
+				if (css->id == 1)
+					break;
+
+				cgrp = container_of(css, struct perf_cgroup,
+						    css);
+				heap.storage[heap.num_elements] = is_pinned
+					? *per_cpu_ptr(cgrp->pinned_event, cpu)
+					: *per_cpu_ptr(cgrp->flexible_event,
+						       cpu);
 				if (heap.storage[heap.num_elements]) {
 					heap.num_elements++;
 					if (heap.num_elements ==
@@ -3516,7 +3555,12 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 	min_heapify_all(&heap);
 
 	while (heap.num_elements > 0) {
-		ret = func(ctx, cpuctx, heap.storage[0], data);
+		if (is_pinned)
+			ret = pinned_sched_in(ctx, cpuctx, heap.storage[0]);
+		else
+			ret = flexible_sched_in(ctx, cpuctx, heap.storage[0],
+						data);
+
 		if (ret)
 			return ret;
 
@@ -3529,8 +3573,7 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 
 static int pinned_sched_in(struct perf_event_context *ctx,
 			   struct perf_cpu_context *cpuctx,
-			   struct perf_event *event,
-			   int *unused)
+			   struct perf_event *event)
 {
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
@@ -3583,8 +3626,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 {
 	visit_groups_merge(ctx,
 			   cpuctx,
-			   &ctx->pinned_groups,
-			   pinned_sched_in,
+			   /*is_pinned=*/true,
 			   NULL);
 }
 
@@ -3596,8 +3638,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 
 	visit_groups_merge(ctx,
 			   cpuctx,
-			   &ctx->flexible_groups,
-			   flexible_sched_in,
+			   /*is_pinned=*/false,
 			   &can_add_hw);
 }
 
@@ -12417,18 +12458,35 @@ perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		return ERR_PTR(-ENOMEM);
 
 	jc->info = alloc_percpu(struct perf_cgroup_info);
-	if (!jc->info) {
-		kfree(jc);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!jc->info)
+		goto free_jc;
+
+	jc->pinned_event = alloc_percpu(struct perf_event *);
+	if (!jc->pinned_event)
+		goto free_jc_info;
+
+	jc->flexible_event = alloc_percpu(struct perf_event *);
+	if (!jc->flexible_event)
+		goto free_jc_pinned;
 
 	return &jc->css;
+
+free_jc_pinned:
+	free_percpu(jc->pinned_event);
+free_jc_info:
+	free_percpu(jc->info);
+free_jc:
+	kfree(jc);
+
+	return ERR_PTR(-ENOMEM);
 }
 
 static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
 {
 	struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
 
+	free_percpu(jc->pinned_event);
+	free_percpu(jc->flexible_event);
 	free_percpu(jc->info);
 	kfree(jc);
 }
-- 
2.22.0.709.g102302147b-goog


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH v2 6/7] perf: avoid double checking CPU and cgroup
  2019-07-24 22:37 ` [PATCH v2 0/7] Optimize cgroup context switch Ian Rogers
                     ` (4 preceding siblings ...)
  2019-07-24 22:37   ` [PATCH v2 5/7] perf: cache perf_event_groups_first for cgroups Ian Rogers
@ 2019-07-24 22:37   ` Ian Rogers
  2019-07-24 22:37   ` [PATCH v2 7/7] perf: rename visit_groups_merge to ctx_groups_sched_in Ian Rogers
  6 siblings, 0 replies; 29+ messages in thread
From: Ian Rogers @ 2019-07-24 22:37 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, linux-kernel
  Cc: Kan Liang, Stephane Eranian, Ian Rogers

When ctx_groups_sched_in iterates events, their CPU and cgroup are
already known to match the current task. Avoid double checking this in
event_filter_match by passing in an additional argument.
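
Stripped of the perf specifics, the change is a filter that can skip
checks the caller has already performed; a standalone sketch with
assumed, simplified fields (not the real struct perf_event):

#include <assert.h>
#include <stdbool.h>

/* Simplified stand-in for an event; the fields here are assumptions. */
struct ev {
	int cpu;		/* -1 means "any CPU" */
	bool cgroup_match;
	bool pmu_match;
};

/* When the caller has already selected by CPU and cgroup (as the RB tree
 * walk in ctx_groups_sched_in does), it passes false and only the PMU
 * check remains. */
static bool ev_filter_match(const struct ev *e, int this_cpu,
			    bool check_cpu_and_cgroup)
{
	if (check_cpu_and_cgroup &&
	    !((e->cpu == -1 || e->cpu == this_cpu) && e->cgroup_match))
		return false;

	return e->pmu_match;
}

int main(void)
{
	struct ev e = { .cpu = 3, .cgroup_match = false, .pmu_match = true };

	assert(!ev_filter_match(&e, 3, true));	/* cgroup check fails */
	assert(ev_filter_match(&e, 3, false));	/* caller vouched for it */
	return 0;
}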

Signed-off-by: Ian Rogers <irogers@google.com>
---
 kernel/events/core.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index c8b9c8611533..fb1027387e8e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2077,10 +2077,12 @@ static inline int pmu_filter_match(struct perf_event *event)
 }
 
 static inline int
-event_filter_match(struct perf_event *event)
+event_filter_match(struct perf_event *event, bool check_cpu_and_cgroup)
 {
-	return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
-	       perf_cgroup_match(event) && pmu_filter_match(event);
+	return (!check_cpu_and_cgroup ||
+		((event->cpu == -1 || event->cpu == smp_processor_id()) &&
+		 perf_cgroup_match(event))) &&
+			pmu_filter_match(event);
 }
 
 static void
@@ -2801,7 +2803,7 @@ static void __perf_event_enable(struct perf_event *event,
 	if (!ctx->is_active)
 		return;
 
-	if (!event_filter_match(event)) {
+	if (!event_filter_match(event, /*check_cpu_and_cgroup=*/true)) {
 		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
 		return;
 	}
@@ -3578,7 +3580,10 @@ static int pinned_sched_in(struct perf_event_context *ctx,
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
 
-	if (!event_filter_match(event))
+	/* The caller already checked the CPU and cgroup before calling
+	 * pinned_sched_in.
+	 */
+	if (!event_filter_match(event, /*check_cpu_and_cgroup=*/false))
 		return 0;
 
 	if (group_can_go_on(event, cpuctx, 1)) {
@@ -3604,7 +3609,10 @@ static int flexible_sched_in(struct perf_event_context *ctx,
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
 
-	if (!event_filter_match(event))
+	/* The caller already checked the CPU and cgroup before calling
+	 * flexible_sched_in.
+	 */
+	if (!event_filter_match(event, /*check_cpu_and_cgroup=*/false))
 		return 0;
 
 	if (group_can_go_on(event, cpuctx, *can_add_hw)) {
@@ -3904,7 +3912,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
 		if (event->state != PERF_EVENT_STATE_ACTIVE)
 			continue;
 
-		if (!event_filter_match(event))
+		if (!event_filter_match(event, /*check_cpu_and_cgroup=*/true))
 			continue;
 
 		perf_pmu_disable(event->pmu);
@@ -6952,7 +6960,8 @@ perf_iterate_ctx(struct perf_event_context *ctx,
 		if (!all) {
 			if (event->state < PERF_EVENT_STATE_INACTIVE)
 				continue;
-			if (!event_filter_match(event))
+			if (!event_filter_match(event,
+						/*check_cpu_and_cgroup=*/true))
 				continue;
 		}
 
@@ -6976,7 +6985,7 @@ static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
 
 		if (event->state < PERF_EVENT_STATE_INACTIVE)
 			continue;
-		if (!event_filter_match(event))
+		if (!event_filter_match(event, /*check_cpu_and_cgroup=*/true))
 			continue;
 		output(event, data);
 	}
-- 
2.22.0.709.g102302147b-goog


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH v2 7/7] perf: rename visit_groups_merge to ctx_groups_sched_in
  2019-07-24 22:37 ` [PATCH v2 0/7] Optimize cgroup context switch Ian Rogers
                     ` (5 preceding siblings ...)
  2019-07-24 22:37   ` [PATCH v2 6/7] perf: avoid double checking CPU and cgroup Ian Rogers
@ 2019-07-24 22:37   ` Ian Rogers
  6 siblings, 0 replies; 29+ messages in thread
From: Ian Rogers @ 2019-07-24 22:37 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, linux-kernel
  Cc: Kan Liang, Stephane Eranian, Ian Rogers

The visit_groups_merge function no longer takes a function pointer, so
rename it to be consistent with the other sched_in functions. Following
Kan Liang's <kan.liang@linux.intel.com> suggestion, also remove the
single-caller wrappers ctx_pinned_sched_in and ctx_flexible_sched_in,
moving their functionality into the caller.

Signed-off-by: Ian Rogers <irogers@google.com>
---
 include/linux/perf_event.h |  4 +-
 kernel/events/core.c       | 77 ++++++++++++++++----------------------
 2 files changed, 35 insertions(+), 46 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2d411786ab87..4ef3e954fa44 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -802,8 +802,8 @@ struct perf_cpu_context {
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp;
 	struct list_head		cgrp_cpuctx_entry;
-	struct perf_event		**visit_groups_merge_iterator_storage;
-	int			       visit_groups_merge_iterator_storage_size;
+	struct perf_event		**ctx_groups_sched_in_iterator_storage;
+	int			      ctx_groups_sched_in_iterator_storage_size;
 #endif
 
 	struct list_head		sched_cb_entry;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fb1027387e8e..8447cb07e90a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2640,22 +2640,22 @@ static int  __perf_install_in_context(void *info)
 					event->cgrp->css.cgroup);
 
 		/*
-		 * Ensure space for visit_groups_merge iterator storage. With
+		 * Ensure space for ctx_groups_sched_in iterator storage. With
 		 * cgroup profiling we may have an event at each depth plus
 		 * system wide events.
 		 */
 		max_iterators = perf_event_cgroup_depth(event) + 1;
 		if (max_iterators >
-		    cpuctx->visit_groups_merge_iterator_storage_size) {
+		    cpuctx->ctx_groups_sched_in_iterator_storage_size) {
 			struct perf_event **storage =
-			   krealloc(cpuctx->visit_groups_merge_iterator_storage,
+			  krealloc(cpuctx->ctx_groups_sched_in_iterator_storage,
 				    sizeof(struct perf_event *) * max_iterators,
 				    GFP_KERNEL);
 			if (storage) {
-				cpuctx->visit_groups_merge_iterator_storage
-						= storage;
-				cpuctx->visit_groups_merge_iterator_storage_size
-						= max_iterators;
+				cpuctx->ctx_groups_sched_in_iterator_storage
+				    = storage;
+			       cpuctx->ctx_groups_sched_in_iterator_storage_size
+				    = max_iterators;
 			} else {
 				WARN_ONCE(1, "Unable to increase iterator "
 					"storage for perf events with cgroups");
@@ -3471,32 +3471,33 @@ static int flexible_sched_in(struct perf_event_context *ctx,
  * Without cgroups, with a task context, there may be per-CPU and any
  * CPU events.
  */
-#define MIN_VISIT_GROUP_MERGE_ITERATORS 2
+#define MIN_CTX_GROUPS_SCHED_IN_ITERATORS 2
 
-static int visit_groups_merge(struct perf_event_context *ctx,
-			      struct perf_cpu_context *cpuctx,
-			      bool is_pinned,
-			      int *data)
+static int ctx_groups_sched_in(struct perf_event_context *ctx,
+			       struct perf_cpu_context *cpuctx,
+			       bool is_pinned,
+			       int *data)
 {
 	/*
 	 * A set of iterators, the iterator for the visit is chosen by the
 	 * group_index.
 	 */
 #ifndef CONFIG_CGROUP_PERF
-	struct perf_event *itrs[MIN_VISIT_GROUP_MERGE_ITERATORS];
+	struct perf_event *itrs[MIN_CTX_GROUPS_SCHED_IN_ITERATORS];
 	struct perf_event_heap heap = {
 		.storage = itrs,
 		.num_elements = 0,
-		.max_elements = MIN_VISIT_GROUP_MERGE_ITERATORS
+		.max_elements = MIN_CTX_GROUPS_SCHED_IN_ITERATORS
 	};
 #else
 	/*
 	 * With cgroups, use the iterator storage reserved in the CPU context.
 	 */
 	struct perf_event_heap heap = {
-		.storage = cpuctx->visit_groups_merge_iterator_storage,
+		.storage = cpuctx->ctx_groups_sched_in_iterator_storage,
 		.num_elements = 0,
-		.max_elements = cpuctx->visit_groups_merge_iterator_storage_size
+		.max_elements =
+			cpuctx->ctx_groups_sched_in_iterator_storage_size
 	};
 #endif
 	int ret, cpu = smp_processor_id();
@@ -3628,27 +3629,6 @@ static int flexible_sched_in(struct perf_event_context *ctx,
 	return 0;
 }
 
-static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
-		    struct perf_cpu_context *cpuctx)
-{
-	visit_groups_merge(ctx,
-			   cpuctx,
-			   /*is_pinned=*/true,
-			   NULL);
-}
-
-static void
-ctx_flexible_sched_in(struct perf_event_context *ctx,
-		      struct perf_cpu_context *cpuctx)
-{
-	int can_add_hw = 1;
-
-	visit_groups_merge(ctx,
-			   cpuctx,
-			   /*is_pinned=*/false,
-			   &can_add_hw);
-}
 
 static void
 ctx_sched_in(struct perf_event_context *ctx,
@@ -3686,11 +3666,20 @@ ctx_sched_in(struct perf_event_context *ctx,
 	 * in order to give them the best chance of going on.
 	 */
 	if (is_active & EVENT_PINNED)
-		ctx_pinned_sched_in(ctx, cpuctx);
+		ctx_groups_sched_in(ctx,
+				    cpuctx,
+				    /*is_pinned=*/true,
+				    NULL);
 
 	/* Then walk through the lower prio flexible groups */
-	if (is_active & EVENT_FLEXIBLE)
-		ctx_flexible_sched_in(ctx, cpuctx);
+	if (is_active & EVENT_FLEXIBLE) {
+		int can_add_hw = 1;
+
+		ctx_groups_sched_in(ctx,
+				    cpuctx,
+				    /*is_pinned=*/false,
+				    &can_add_hw);
+	}
 }
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
@@ -10266,12 +10255,12 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 		cpuctx->ctx.pmu = pmu;
 		cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
 #ifdef CONFIG_CGROUP_PERF
-		cpuctx->visit_groups_merge_iterator_storage =
-				kmalloc_array(MIN_VISIT_GROUP_MERGE_ITERATORS,
+		cpuctx->ctx_groups_sched_in_iterator_storage =
+				kmalloc_array(MIN_CTX_GROUPS_SCHED_IN_ITERATORS,
 					      sizeof(struct perf_event *),
 					      GFP_KERNEL);
-		cpuctx->visit_groups_merge_iterator_storage_size =
-				MIN_VISIT_GROUP_MERGE_ITERATORS;
+		cpuctx->ctx_groups_sched_in_iterator_storage_size =
+				MIN_CTX_GROUPS_SCHED_IN_ITERATORS;
 #endif
 		__perf_mux_hrtimer_init(cpuctx, cpu);
 	}
-- 
2.22.0.709.g102302147b-goog


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 4/7] perf: avoid a bounded set of visit_groups_merge iterators
  2019-07-24 22:37   ` [PATCH v2 4/7] perf: avoid a bounded set of visit_groups_merge iterators Ian Rogers
@ 2019-08-07 21:11     ` Peter Zijlstra
  0 siblings, 0 replies; 29+ messages in thread
From: Peter Zijlstra @ 2019-08-07 21:11 UTC (permalink / raw)
  To: Ian Rogers
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, linux-kernel, Kan Liang,
	Stephane Eranian, Mark Rutland

On Wed, Jul 24, 2019 at 03:37:43PM -0700, Ian Rogers wrote:

> @@ -2597,6 +2612,30 @@ static int  __perf_install_in_context(void *info)
>  		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
>  		reprogram = cgroup_is_descendant(cgrp->css.cgroup,
>  					event->cgrp->css.cgroup);
> +
> +		/*
> +		 * Ensure space for visit_groups_merge iterator storage. With
> +		 * cgroup profiling we may have an event at each depth plus
> +		 * system wide events.
> +		 */
> +		max_iterators = perf_event_cgroup_depth(event) + 1;
> +		if (max_iterators >
> +		    cpuctx->visit_groups_merge_iterator_storage_size) {
> +			struct perf_event **storage =
> +			   krealloc(cpuctx->visit_groups_merge_iterator_storage,
> +				    sizeof(struct perf_event *) * max_iterators,
> +				    GFP_KERNEL);
> +			if (storage) {
> +				cpuctx->visit_groups_merge_iterator_storage
> +						= storage;
> +				cpuctx->visit_groups_merge_iterator_storage_size
> +						= max_iterators;
> +			} else {
> +				WARN_ONCE(1, "Unable to increase iterator "
> +					"storage for perf events with cgroups");
> +				ret = -ENOMEM;
> +			}
> +		}
>  	}
>  #endif

This is completely insane and broken. You do not allocate memory from
hardirq context while holding all sorts of locks.

Also, the patches are still an unreadable mess, and they do far too much
in a single patch.

Please have a look at the completely untested lot at:

  git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git perf/cgroup
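
The objection is about context: GFP_KERNEL allocations may sleep, and
the quoted hunk runs in the cross-CPU callback, i.e. hardirq context
with locks held. A general pattern for this kind of problem (a
userspace-shaped sketch of the idea only, not the code in the branch
above) is to do the sizing and allocation in the sleeping caller and
keep the atomic path allocation-free:

#include <stdlib.h>

struct iter_storage {
	void **slots;
	int nr;
};

/* Grow the buffer where sleeping is allowed (plain realloc stands in
 * for an allocation done before the cross-call is sent). */
static int reserve_slots(struct iter_storage *s, int needed)
{
	void **tmp;

	if (needed <= s->nr)
		return 0;
	tmp = realloc(s->slots, needed * sizeof(*tmp));
	if (!tmp)
		return -1;
	s->slots = tmp;
	s->nr = needed;
	return 0;
}

/* The atomic path only consumes what was reserved; it never allocates. */
static void use_slots(struct iter_storage *s)
{
	for (int i = 0; i < s->nr; i++)
		s->slots[i] = 0;	/* e.g. reset iterators before a walk */
}

int main(void)
{
	struct iter_storage s = { 0 };

	if (reserve_slots(&s, 4))	/* process context: may sleep */
		return 1;
	use_slots(&s);			/* atomic stand-in: no allocation */
	free(s.slots);
	return 0;
}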



^ permalink raw reply	[flat|nested] 29+ messages in thread

* [tip: perf/core] perf/cgroup: Order events in RB tree by cgroup id
  2019-07-24 22:37   ` [PATCH v2 2/7] perf/cgroup: order events in RB tree by cgroup id Ian Rogers
@ 2020-03-06 14:42     ` tip-bot2 for Ian Rogers
  0 siblings, 0 replies; 29+ messages in thread
From: tip-bot2 for Ian Rogers @ 2020-03-06 14:42 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Ian Rogers, Peter Zijlstra (Intel), Ingo Molnar, x86, LKML

The following commit has been merged into the perf/core branch of tip:

Commit-ID:     95ed6c707f26a727a29972b60469630ae10d579c
Gitweb:        https://git.kernel.org/tip/95ed6c707f26a727a29972b60469630ae10d579c
Author:        Ian Rogers <irogers@google.com>
AuthorDate:    Thu, 13 Feb 2020 23:51:33 -08:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Fri, 06 Mar 2020 11:57:01 +01:00

perf/cgroup: Order events in RB tree by cgroup id

If one is monitoring 6 events on 20 cgroups the per-CPU RB tree will
hold 120 events. The scheduling in of the events currently iterates
over all events looking to see which events match the task's cgroup or
its cgroup hierarchy. If a task is in 1 cgroup with 6 events, then 114
events are considered unnecessarily.

This change orders events in the RB tree by cgroup id if it is present.
This means scheduling in may go directly to events associated with the
task's cgroup if one is present. The per-CPU iterator storage in
visit_groups_merge is sized sufficiently for an iterator per cgroup depth,
where different iterators are needed for the task's cgroup and parent
cgroups. By considering the set of iterators when visiting, the lowest
group_index event may be selected and the insertion order group_index
property is maintained. This also allows event rotation to function
correctly, as although events are grouped into a cgroup, rotation always
selects the lowest group_index event to rotate (delete/insert into the
tree) and the min heap of iterators make it so that the group_index order
is maintained.
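
The visit itself is a k-way merge: one iterator per cgroup level (plus
one for events without a cgroup), each positioned at its first event,
and a min-heap keyed on group_index repeatedly yields the globally
lowest index. A standalone sketch with sorted arrays standing in for the
per-cgroup runs (made-up names, not the kernel's min_heap API):

#include <stdio.h>

struct it {
	const unsigned long *idx;	/* group_index values, ascending */
	int pos, len;
};

static int it_less(const struct it *a, const struct it *b)
{
	return a->idx[a->pos] < b->idx[b->pos];
}

static void sift_down(struct it *h, int n, int i)
{
	for (;;) {
		int l = 2 * i + 1, r = l + 1, m = i;
		struct it tmp;

		if (l < n && it_less(&h[l], &h[m]))
			m = l;
		if (r < n && it_less(&h[r], &h[m]))
			m = r;
		if (m == i)
			break;
		tmp = h[i]; h[i] = h[m]; h[m] = tmp;
		i = m;
	}
}

int main(void)
{
	/* Two "cgroup levels" worth of events, each run in insertion order. */
	const unsigned long task_cgrp[] = { 1, 4, 9 };
	const unsigned long parent_cgrp[] = { 2, 7 };
	struct it heap[] = {
		{ task_cgrp, 0, 3 },
		{ parent_cgrp, 0, 2 },
	};
	int n = 2;

	for (int i = n / 2 - 1; i >= 0; i--)
		sift_down(heap, n, i);		/* min_heapify_all analogue */

	while (n) {
		printf("%lu ", heap[0].idx[heap[0].pos]);	/* visit */
		if (++heap[0].pos == heap[0].len)
			heap[0] = heap[--n];	/* iterator exhausted: pop */
		sift_down(heap, n, 0);		/* restore heap order */
	}
	printf("\n");				/* prints: 1 2 4 7 9 */
	return 0;
}

Because each run is already in group_index order, popping the heap
minimum reproduces the global insertion order, which is what keeps
rotation picking the lowest group_index event.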

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20190724223746.153620-3-irogers@google.com
---
 kernel/events/core.c | 94 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 84 insertions(+), 10 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8065949..ccf8d4f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1577,6 +1577,30 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right)
 	if (left->cpu > right->cpu)
 		return false;
 
+#ifdef CONFIG_CGROUP_PERF
+	if (left->cgrp != right->cgrp) {
+		if (!left->cgrp || !left->cgrp->css.cgroup) {
+			/*
+			 * Left has no cgroup but right does, no cgroups come
+			 * first.
+			 */
+			return true;
+		}
+		if (!right->cgrp || !right->cgrp->css.cgroup) {
+			/*
+			 * Right has no cgroup but left does, no cgroups come
+			 * first.
+			 */
+			return false;
+		}
+		/* Two dissimilar cgroups, order by id. */
+		if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
+			return true;
+
+		return false;
+	}
+#endif
+
 	if (left->group_index < right->group_index)
 		return true;
 	if (left->group_index > right->group_index)
@@ -1656,25 +1680,48 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
 }
 
 /*
- * Get the leftmost event in the @cpu subtree.
+ * Get the leftmost event in the cpu/cgroup subtree.
  */
 static struct perf_event *
-perf_event_groups_first(struct perf_event_groups *groups, int cpu)
+perf_event_groups_first(struct perf_event_groups *groups, int cpu,
+			struct cgroup *cgrp)
 {
 	struct perf_event *node_event = NULL, *match = NULL;
 	struct rb_node *node = groups->tree.rb_node;
+#ifdef CONFIG_CGROUP_PERF
+	u64 node_cgrp_id, cgrp_id = 0;
+
+	if (cgrp)
+		cgrp_id = cgrp->kn->id;
+#endif
 
 	while (node) {
 		node_event = container_of(node, struct perf_event, group_node);
 
 		if (cpu < node_event->cpu) {
 			node = node->rb_left;
-		} else if (cpu > node_event->cpu) {
+			continue;
+		}
+		if (cpu > node_event->cpu) {
 			node = node->rb_right;
-		} else {
-			match = node_event;
+			continue;
+		}
+#ifdef CONFIG_CGROUP_PERF
+		node_cgrp_id = 0;
+		if (node_event->cgrp && node_event->cgrp->css.cgroup)
+			node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
+
+		if (cgrp_id < node_cgrp_id) {
 			node = node->rb_left;
+			continue;
+		}
+		if (cgrp_id > node_cgrp_id) {
+			node = node->rb_right;
+			continue;
 		}
+#endif
+		match = node_event;
+		node = node->rb_left;
 	}
 
 	return match;
@@ -1687,12 +1734,26 @@ static struct perf_event *
 perf_event_groups_next(struct perf_event *event)
 {
 	struct perf_event *next;
+#ifdef CONFIG_CGROUP_PERF
+	u64 curr_cgrp_id = 0;
+	u64 next_cgrp_id = 0;
+#endif
 
 	next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
-	if (next && next->cpu == event->cpu)
-		return next;
+	if (next == NULL || next->cpu != event->cpu)
+		return NULL;
 
-	return NULL;
+#ifdef CONFIG_CGROUP_PERF
+	if (event->cgrp && event->cgrp->css.cgroup)
+		curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
+
+	if (next->cgrp && next->cgrp->css.cgroup)
+		next_cgrp_id = next->cgrp->css.cgroup->kn->id;
+
+	if (curr_cgrp_id != next_cgrp_id)
+		return NULL;
+#endif
+	return next;
 }
 
 /*
@@ -3473,6 +3534,9 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
 				int (*func)(struct perf_event *, void *),
 				void *data)
 {
+#ifdef CONFIG_CGROUP_PERF
+	struct cgroup_subsys_state *css = NULL;
+#endif
 	/* Space for per CPU and/or any CPU event iterators. */
 	struct perf_event *itrs[2];
 	struct min_heap event_heap;
@@ -3487,6 +3551,11 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
 		};
 
 		lockdep_assert_held(&cpuctx->ctx.lock);
+
+#ifdef CONFIG_CGROUP_PERF
+		if (cpuctx->cgrp)
+			css = &cpuctx->cgrp->css;
+#endif
 	} else {
 		event_heap = (struct min_heap){
 			.data = itrs,
@@ -3494,11 +3563,16 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
 			.size = ARRAY_SIZE(itrs),
 		};
 		/* Events not within a CPU context may be on any CPU. */
-		__heap_add(&event_heap, perf_event_groups_first(groups, -1));
+		__heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
 	}
 	evt = event_heap.data;
 
-	__heap_add(&event_heap, perf_event_groups_first(groups, cpu));
+	__heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
+
+#ifdef CONFIG_CGROUP_PERF
+	for (; css; css = css->parent)
+		__heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
+#endif
 
 	min_heapify_all(&event_heap, &perf_min_heap);
 

^ permalink raw reply related	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2020-03-06 14:42 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-02  6:59 [PATCH 0/7] Optimize cgroup context switch Ian Rogers
2019-07-02  6:59 ` [PATCH 1/7] perf: propagate perf_install_in_context errors up Ian Rogers
2019-07-05 14:13   ` Jiri Olsa
2019-07-02  6:59 ` [PATCH 2/7] perf/cgroup: order events in RB tree by cgroup id Ian Rogers
2019-07-08 15:38   ` Peter Zijlstra
2019-07-08 15:45   ` Peter Zijlstra
2019-07-08 16:16   ` Peter Zijlstra
2019-07-24 22:34     ` Ian Rogers
2019-07-08 16:21   ` Peter Zijlstra
2019-07-02  6:59 ` [PATCH 3/7] perf: order iterators for visit_groups_merge into a min-heap Ian Rogers
2019-07-08 16:30   ` Peter Zijlstra
2019-07-24 22:34     ` Ian Rogers
2019-07-08 16:36   ` Peter Zijlstra
2019-07-02  6:59 ` [PATCH 4/7] perf: avoid a bounded set of visit_groups_merge iterators Ian Rogers
2019-07-02  6:59 ` [PATCH 5/7] perf: cache perf_event_groups_first for cgroups Ian Rogers
2019-07-08 16:50   ` Peter Zijlstra
2019-07-08 16:51     ` Peter Zijlstra
2019-07-02  6:59 ` [PATCH 6/7] perf: avoid double checking CPU and cgroup Ian Rogers
2019-07-02  6:59 ` [PATCH 7/7] perf: rename visit_groups_merge to ctx_groups_sched_in Ian Rogers
2019-07-24 22:37 ` [PATCH v2 0/7] Optimize cgroup context switch Ian Rogers
2019-07-24 22:37   ` [PATCH v2 1/7] perf: propagate perf_install_in_context errors up Ian Rogers
2019-07-24 22:37   ` [PATCH v2 2/7] perf/cgroup: order events in RB tree by cgroup id Ian Rogers
2020-03-06 14:42     ` [tip: perf/core] perf/cgroup: Order " tip-bot2 for Ian Rogers
2019-07-24 22:37   ` [PATCH v2 3/7] perf: order iterators for visit_groups_merge into a min-heap Ian Rogers
2019-07-24 22:37   ` [PATCH v2 4/7] perf: avoid a bounded set of visit_groups_merge iterators Ian Rogers
2019-08-07 21:11     ` Peter Zijlstra
2019-07-24 22:37   ` [PATCH v2 5/7] perf: cache perf_event_groups_first for cgroups Ian Rogers
2019-07-24 22:37   ` [PATCH v2 6/7] perf: avoid double checking CPU and cgroup Ian Rogers
2019-07-24 22:37   ` [PATCH v2 7/7] perf: rename visit_groups_merge to ctx_groups_sched_in Ian Rogers
