linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RESEND PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
@ 2017-06-15 23:03 Alexey Budankov
  2017-06-20 13:44 ` Mark Rutland
  0 siblings, 1 reply; 5+ messages in thread
From: Alexey Budankov @ 2017-06-15 23:03 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin
  Cc: Andi Kleen, Kan Liang, Dmitri Prokhorov, Valery Cherepennikov,
	David Carrillo-Cisneros, Stephane Eranian, Mark Rutland,
	linux-kernel

perf/core: use rb trees for pinned/flexible groups

By default, the userspace perf tool opens per-cpu task-bound events
when sampling, so for N logical events requested by the user, the tool
will open N * NR_CPUS events.

In the kernel, we mux events with a hrtimer, periodically rotating the
flexible group list and trying to schedule each group in turn. We skip 
groups whose cpu filter doesn't match. So when we get unlucky, we can 
walk N * (NR_CPUS - 1) groups pointlessly for each hrtimer invocation.

This has been observed to result in significant overhead when running
the STREAM benchmark on 272 core Xeon Phi systems.

One way to avoid this is to place our events into an rb tree sorted by
CPU filter, so that our hrtimer can skip to the current CPU's
list and ignore everything else.

As a step towards that, this patch replaces event group lists with rb
trees.

Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
---
  include/linux/perf_event.h |  18 ++-
  kernel/events/core.c       | 393 
+++++++++++++++++++++++++++++++++------------
  2 files changed, 307 insertions(+), 104 deletions(-)

Addressed Mark Rutland's comments from the previous patch version.

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 24a6358..c9203d7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -574,6 +574,20 @@ struct perf_event {
  	struct list_head		sibling_list;

  	/*
+	 * Node on the pinned or flexible tree located at the event context;
+	 * the node may be empty in case its event is not directly attached
+	 * to the tree but to group_list list of the event directly
+	 * attached to the tree;
+	 */
+	struct rb_node			group_node;
+	/*
+	 * List keeps groups allocated for the same cpu;
+	 * the list may be empty in case its event is not directly
+	 * attached to the tree but to group_list list of the event directly
+	 * attached to the tree;
+	 */
+	struct list_head		group_list;
+	/*
  	 * We need storage to track the entries in perf_pmu_migrate_context; we
  	 * cannot use the event_entry because of RCU and we want to keep the
  	 * group in tact which avoids us using the other two entries.
@@ -741,8 +755,8 @@ struct perf_event_context {
  	struct mutex			mutex;

  	struct list_head		active_ctx_list;
-	struct list_head		pinned_groups;
-	struct list_head		flexible_groups;
+	struct rb_root			pinned_groups;
+	struct rb_root			flexible_groups;
  	struct list_head		event_list;
  	int				nr_events;
  	int				nr_active;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bc63f8d..2ba055d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1458,8 +1458,8 @@ static enum event_type_t get_event_type(struct 
perf_event *event)
  	return event_type;
  }

-static struct list_head *
-ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+static struct rb_root *
+ctx_group_cpu_tree(struct perf_event *event, struct perf_event_context 
*ctx)
  {
  	if (event->attr.pinned)
  		return &ctx->pinned_groups;
@@ -1468,6 +1468,144 @@ ctx_group_list(struct perf_event *event, struct 
perf_event_context *ctx)
  }

  /*
+ * Insert a group into a tree using event->cpu as a key. If event->cpu node
+ * is already attached to the tree then the event is added to the attached
+ * group's group_list list.
+ */
+static void
+perf_cpu_tree_insert(struct rb_root *tree, struct perf_event *event)
+{
+	struct rb_node **node;
+	struct rb_node *parent;
+
+	WARN_ON_ONCE(!tree || !event);
+
+	node = &tree->rb_node;
+	parent = *node;
+
+	while (*node) {
+		struct perf_event *node_event =	container_of(*node,
+				struct perf_event, group_node);
+
+		parent = *node;
+
+		if (event->cpu < node_event->cpu) {
+			node = &parent->rb_left;
+		} else if (event->cpu > node_event->cpu) {
+			node = &parent->rb_right;
+		} else {
+			list_add_tail(&event->group_entry,
+					&node_event->group_list);
+			return;
+		}
+	}
+
+	list_add_tail(&event->group_entry, &event->group_list);
+
+	rb_link_node(&event->group_node, parent, node);
+	rb_insert_color(&event->group_node, tree);
+}
+
+/*
+ * Delete a group from a tree. If the group is directly attached to the 
tree
+ * it also detaches all groups on the group's group_list list.
+ */
+static void
+perf_cpu_tree_delete(struct rb_root *tree, struct perf_event *event)
+{
+	WARN_ON_ONCE(!tree || !event);
+
+	if (RB_EMPTY_NODE(&event->group_node)) {
+		list_del_init(&event->group_entry);
+	} else {
+		struct perf_event *list_event, *list_next;
+
+		rb_erase(&event->group_node, tree);
+		list_for_each_entry_safe(list_event, list_next,
+				&event->group_list, group_entry)
+			list_del_init(&list_event->group_entry);
+	}
+}
+
+/*
+ * Find group_list list by a cpu key and call provided callback for every
+ * group on the list. Iteration stops if the callback returns non zero.
+ */
+
+typedef int (*perf_cpu_tree_callback_t)(struct perf_event *, void *);
+
+static int
+perf_cpu_tree_iterate_cpu(struct rb_root *tree, int cpu,
+		perf_cpu_tree_callback_t callback, void *data)
+{
+	int ret = 0;
+	struct rb_node *node;
+	struct perf_event *event;
+
+	WARN_ON_ONCE(!tree);
+
+	node = tree->rb_node;
+
+	while (node) {
+		struct perf_event *node_event = container_of(node,
+				struct perf_event, group_node);
+
+		if (cpu < node_event->cpu) {
+			node = node->rb_left;
+		} else if (cpu > node_event->cpu) {
+			node = node->rb_right;
+		} else {
+			list_for_each_entry(event, &node_event->group_list,
+					group_entry) {
+				ret = callback(event, data);
+				if (ret)
+					return ret;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Iterate a tree and call provided callback for every group in the tree.
+ * Iteration stops if the callback returns non zero.
+ */
+static int
+perf_cpu_tree_iterate(struct rb_root *tree,
+		perf_cpu_tree_callback_t callback, void *data)
+{
+	int ret = 0;
+	struct rb_node *node;
+	struct perf_event *event;
+
+	WARN_ON_ONCE(!tree);
+
+	for (node = rb_first(tree); node; node = rb_next(node)) {
+		struct perf_event *node_event = container_of(node,
+				struct perf_event, group_node);
+
+		list_for_each_entry(event, &node_event->group_list,
+				group_entry) {
+			ret = callback(event, data);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static void
+add_to_groups(struct perf_event *event, struct perf_event_context *ctx)
+{
+	struct rb_root *tree;
+
+	tree = ctx_group_cpu_tree(event, ctx);
+	perf_cpu_tree_insert(tree, event);
+}
+
+/*
   * Add a event from the lists for its context.
   * Must be called with ctx->mutex and ctx->lock held.
   */
@@ -1485,12 +1623,8 @@ list_add_event(struct perf_event *event, struct 
perf_event_context *ctx)
  	 * perf_group_detach can, at all times, locate all siblings.
  	 */
  	if (event->group_leader == event) {
-		struct list_head *list;
-
  		event->group_caps = event->event_caps;
-
-		list = ctx_group_list(event, ctx);
-		list_add_tail(&event->group_entry, list);
+		add_to_groups(event, ctx);
  	}

  	list_update_cgroup_event(event, ctx, true);
@@ -1654,6 +1788,15 @@ static void perf_group_attach(struct perf_event 
*event)
  		perf_event__header_size(pos);
  }

+static void
+del_from_groups(struct perf_event *event, struct perf_event_context *ctx)
+{
+	struct rb_root *tree;
+
+	tree = ctx_group_cpu_tree(event, ctx);
+	perf_cpu_tree_delete(tree, event);
+}
+
  /*
   * Remove a event from the lists for its context.
   * Must be called with ctx->mutex and ctx->lock held.
@@ -1681,7 +1824,7 @@ list_del_event(struct perf_event *event, struct 
perf_event_context *ctx)
  	list_del_rcu(&event->event_entry);

  	if (event->group_leader == event)
-		list_del_init(&event->group_entry);
+		del_from_groups(event, ctx);

  	update_group_times(event);

@@ -2699,12 +2842,80 @@ int perf_event_refresh(struct perf_event *event, 
int refresh)
  }
  EXPORT_SYMBOL_GPL(perf_event_refresh);

+static void
+sched_in_group(struct perf_event *event,
+		struct perf_event_context *ctx,
+		struct perf_cpu_context *cpuctx,
+		int *can_add_hw)
+{
+	/* Ignore events in OFF or ERROR state and
+	 * listen to the 'cpu' scheduling filter constraint
+	 * of events:
+	 */
+	if (event->state <= PERF_EVENT_STATE_OFF || !event_filter_match(event))
+		return;
+
+	/* may need to reset tstamp_enabled */
+	if (is_cgroup_event(event))
+		perf_cgroup_mark_enabled(event, ctx);
+
+	if (group_can_go_on(event, cpuctx, *can_add_hw)) {
+		if (group_sched_in(event, cpuctx, ctx))
+			*can_add_hw = 0;
+	}
+}
+
+struct group_sched_params {
+	struct perf_cpu_context *cpuctx;
+	struct perf_event_context *ctx;
+	int can_add_hw;
+};
+
+static int
+group_sched_in_pinned_callback(struct perf_event *event, void *data)
+{
+	struct group_sched_params *params = data;
+	int can_add_hw = 1;
+
+	sched_in_group(event, params->ctx, params->cpuctx, &can_add_hw);
+
+	if (!can_add_hw) {
+		update_group_times(event);
+		event->state = PERF_EVENT_STATE_ERROR;
+	}
+
+	return 0;
+}
+
+static int
+group_sched_in_flexible_callback(struct perf_event *event, void *data)
+{
+	struct group_sched_params *params = data;
+
+	sched_in_group(event, params->ctx, params->cpuctx,
+			&params->can_add_hw);
+
+	return 0;
+}
+
+static int group_sched_out_callback(struct perf_event *event, void *data)
+{
+	struct group_sched_params *params = data;
+
+	group_sched_out(event, params->cpuctx, params->ctx);
+
+	return 0;
+}
+
  static void ctx_sched_out(struct perf_event_context *ctx,
  			  struct perf_cpu_context *cpuctx,
  			  enum event_type_t event_type)
  {
  	int is_active = ctx->is_active;
-	struct perf_event *event;
+	struct group_sched_params params = {
+			.cpuctx = cpuctx,
+			.ctx = ctx
+	};

  	lockdep_assert_held(&ctx->lock);

@@ -2750,15 +2961,15 @@ static void ctx_sched_out(struct 
perf_event_context *ctx,
  		return;

  	perf_pmu_disable(ctx->pmu);
-	if (is_active & EVENT_PINNED) {
-		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
-			group_sched_out(event, cpuctx, ctx);
-	}

-	if (is_active & EVENT_FLEXIBLE) {
-		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
-			group_sched_out(event, cpuctx, ctx);
-	}
+	if (is_active & EVENT_PINNED)
+		perf_cpu_tree_iterate(&ctx->pinned_groups,
+				group_sched_out_callback, &params);
+
+	if (is_active & EVENT_FLEXIBLE)
+		perf_cpu_tree_iterate(&ctx->flexible_groups,
+				group_sched_out_callback, &params);
+
  	perf_pmu_enable(ctx->pmu);
  }

@@ -3052,72 +3263,18 @@ static void cpu_ctx_sched_out(struct 
perf_cpu_context *cpuctx,
  }

  static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
-		    struct perf_cpu_context *cpuctx)
-{
-	struct perf_event *event;
-
-	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
-		if (event->state <= PERF_EVENT_STATE_OFF)
-			continue;
-		if (!event_filter_match(event))
-			continue;
-
-		/* may need to reset tstamp_enabled */
-		if (is_cgroup_event(event))
-			perf_cgroup_mark_enabled(event, ctx);
-
-		if (group_can_go_on(event, cpuctx, 1))
-			group_sched_in(event, cpuctx, ctx);
-
-		/*
-		 * If this pinned group hasn't been scheduled,
-		 * put it in error state.
-		 */
-		if (event->state == PERF_EVENT_STATE_INACTIVE) {
-			update_group_times(event);
-			event->state = PERF_EVENT_STATE_ERROR;
-		}
-	}
-}
-
-static void
-ctx_flexible_sched_in(struct perf_event_context *ctx,
-		      struct perf_cpu_context *cpuctx)
-{
-	struct perf_event *event;
-	int can_add_hw = 1;
-
-	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
-		/* Ignore events in OFF or ERROR state */
-		if (event->state <= PERF_EVENT_STATE_OFF)
-			continue;
-		/*
-		 * Listen to the 'cpu' scheduling filter constraint
-		 * of events:
-		 */
-		if (!event_filter_match(event))
-			continue;
-
-		/* may need to reset tstamp_enabled */
-		if (is_cgroup_event(event))
-			perf_cgroup_mark_enabled(event, ctx);
-
-		if (group_can_go_on(event, cpuctx, can_add_hw)) {
-			if (group_sched_in(event, cpuctx, ctx))
-				can_add_hw = 0;
-		}
-	}
-}
-
-static void
  ctx_sched_in(struct perf_event_context *ctx,
  	     struct perf_cpu_context *cpuctx,
  	     enum event_type_t event_type,
  	     struct task_struct *task)
  {
  	int is_active = ctx->is_active;
-	u64 now;
+	struct group_sched_params params = {
+			.cpuctx = cpuctx,
+			.ctx = ctx,
+			.can_add_hw = 1
+
+	};

  	lockdep_assert_held(&ctx->lock);

@@ -3136,8 +3293,7 @@ ctx_sched_in(struct perf_event_context *ctx,

  	if (is_active & EVENT_TIME) {
  		/* start ctx time */
-		now = perf_clock();
-		ctx->timestamp = now;
+		ctx->timestamp = perf_clock();
  		perf_cgroup_set_timestamp(task, ctx);
  	}

@@ -3146,11 +3302,13 @@ ctx_sched_in(struct perf_event_context *ctx,
  	 * in order to give them the best chance of going on.
  	 */
  	if (is_active & EVENT_PINNED)
-		ctx_pinned_sched_in(ctx, cpuctx);
+		perf_cpu_tree_iterate(&ctx->pinned_groups,
+				group_sched_in_pinned_callback, &params);

  	/* Then walk through the lower prio flexible groups */
  	if (is_active & EVENT_FLEXIBLE)
-		ctx_flexible_sched_in(ctx, cpuctx);
+		perf_cpu_tree_iterate(&ctx->flexible_groups,
+				group_sched_in_flexible_callback, &params);
  }

  static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
@@ -3181,7 +3339,7 @@ static void perf_event_context_sched_in(struct 
perf_event_context *ctx,
  	 * However, if task's ctx is not carrying any pinned
  	 * events, no need to flip the cpuctx's events around.
  	 */
-	if (!list_empty(&ctx->pinned_groups))
+	if (!RB_EMPTY_ROOT(&ctx->pinned_groups))
  		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
  	perf_event_sched_in(cpuctx, ctx, task);
  	perf_pmu_enable(ctx->pmu);
@@ -3410,14 +3568,25 @@ static void 
perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
  /*
   * Round-robin a context's events:
   */
+static int rotate_ctx_callback(struct perf_event *event, void *data)
+{
+	list_rotate_left(&event->group_list);
+
+	return 1;
+}
+
  static void rotate_ctx(struct perf_event_context *ctx)
  {
  	/*
  	 * Rotate the first entry last of non-pinned groups. Rotation might be
  	 * disabled by the inheritance code.
  	 */
-	if (!ctx->rotate_disable)
-		list_rotate_left(&ctx->flexible_groups);
+	if (!ctx->rotate_disable) {
+		perf_cpu_tree_iterate_cpu(&ctx->flexible_groups,
+				-1, rotate_ctx_callback, NULL);
+		perf_cpu_tree_iterate_cpu(&ctx->flexible_groups,
+				smp_processor_id(), rotate_ctx_callback, NULL);
+	}
  }

  static int perf_rotate_context(struct perf_cpu_context *cpuctx)
@@ -3743,8 +3912,8 @@ static void __perf_event_init_context(struct 
perf_event_context *ctx)
  	raw_spin_lock_init(&ctx->lock);
  	mutex_init(&ctx->mutex);
  	INIT_LIST_HEAD(&ctx->active_ctx_list);
-	INIT_LIST_HEAD(&ctx->pinned_groups);
-	INIT_LIST_HEAD(&ctx->flexible_groups);
+	ctx->pinned_groups = RB_ROOT;
+	ctx->flexible_groups = RB_ROOT;
  	INIT_LIST_HEAD(&ctx->event_list);
  	atomic_set(&ctx->refcount, 1);
  }
@@ -9377,6 +9546,8 @@ perf_event_alloc(struct perf_event_attr *attr, int 
cpu,
  	INIT_LIST_HEAD(&event->child_list);

  	INIT_LIST_HEAD(&event->group_entry);
+	RB_CLEAR_NODE(&event->group_node);
+	INIT_LIST_HEAD(&event->group_list);
  	INIT_LIST_HEAD(&event->event_entry);
  	INIT_LIST_HEAD(&event->sibling_list);
  	INIT_LIST_HEAD(&event->rb_entry);
@@ -10816,6 +10987,23 @@ inherit_task_group(struct perf_event *event, 
struct task_struct *parent,
  	return ret;
  }

+struct inherit_task_group_params {
+	struct task_struct *parent;
+	struct perf_event_context *parent_ctx;
+	struct task_struct *child;
+	int ctxn;
+	int inherited_all;
+};
+
+static int
+inherit_task_group_callback(struct perf_event *event, void *data)
+{
+	struct inherit_task_group_params *params = data;
+
+	return inherit_task_group(event, params->parent, params->parent_ctx,
+			 params->child, params->ctxn, &params->inherited_all);
+}
+
  /*
   * Initialize the perf_event context in task_struct
   */
@@ -10823,11 +11011,15 @@ static int perf_event_init_context(struct 
task_struct *child, int ctxn)
  {
  	struct perf_event_context *child_ctx, *parent_ctx;
  	struct perf_event_context *cloned_ctx;
-	struct perf_event *event;
  	struct task_struct *parent = current;
-	int inherited_all = 1;
  	unsigned long flags;
  	int ret = 0;
+	struct inherit_task_group_params params = {
+			.parent = parent,
+			.child = child,
+			.ctxn = ctxn,
+			.inherited_all = 1
+	};

  	if (likely(!parent->perf_event_ctxp[ctxn]))
  		return 0;
@@ -10839,7 +11031,8 @@ static int perf_event_init_context(struct 
task_struct *child, int ctxn)
  	parent_ctx = perf_pin_task_context(parent, ctxn);
  	if (!parent_ctx)
  		return 0;
-
+	else
+		params.parent_ctx = parent_ctx;
  	/*
  	 * No need to check if parent_ctx != NULL here; since we saw
  	 * it non-NULL earlier, the only reason for it to become NULL
@@ -10857,12 +11050,10 @@ static int perf_event_init_context(struct 
task_struct *child, int ctxn)
  	 * We dont have to disable NMIs - we are only looking at
  	 * the list, not manipulating it:
  	 */
-	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
-		ret = inherit_task_group(event, parent, parent_ctx,
-					 child, ctxn, &inherited_all);
-		if (ret)
-			goto out_unlock;
-	}
+	ret = perf_cpu_tree_iterate(&parent_ctx->pinned_groups,
+			inherit_task_group_callback, &params);
+	if (ret)
+		goto out_unlock;

  	/*
  	 * We can't hold ctx->lock when iterating the ->flexible_group list due
@@ -10873,19 +11064,17 @@ static int perf_event_init_context(struct 
task_struct *child, int ctxn)
  	parent_ctx->rotate_disable = 1;
  	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

-	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
-		ret = inherit_task_group(event, parent, parent_ctx,
-					 child, ctxn, &inherited_all);
-		if (ret)
-			goto out_unlock;
-	}
+	ret = perf_cpu_tree_iterate(&parent_ctx->flexible_groups,
+				inherit_task_group_callback, &params);
+	if (ret)
+		goto out_unlock;

  	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
  	parent_ctx->rotate_disable = 0;

  	child_ctx = child->perf_event_ctxp[ctxn];

-	if (child_ctx && inherited_all) {
+	if (child_ctx && params.inherited_all) {
  		/*
  		 * Mark the child context as a clone of the parent
  		 * context, or of whatever the parent is a clone of.

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [RESEND PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-15 23:03 [RESEND PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi Alexey Budankov
@ 2017-06-20 13:44 ` Mark Rutland
  2017-06-20 15:12   ` Alexey Budankov
  0 siblings, 1 reply; 5+ messages in thread
From: Mark Rutland @ 2017-06-20 13:44 UTC (permalink / raw)
  To: Alexey Budankov
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On Fri, Jun 16, 2017 at 02:03:58AM +0300, Alexey Budankov wrote:
> perf/core: use rb trees for pinned/flexible groups
> 
> By default, the userspace perf tool opens per-cpu task-bound events
> when sampling, so for N logical events requested by the user, the tool
> will open N * NR_CPUS events.
> 
> In the kernel, we mux events with a hrtimer, periodically rotating the
> flexible group list and trying to schedule each group in turn. We
> skip groups whose cpu filter doesn't match. So when we get unlucky,
> we can walk N * (NR_CPUS - 1) groups pointlessly for each hrtimer
> invocation.
> 
> This has been observed to result in significant overhead when running
> the STREAM benchmark on 272 core Xeon Phi systems.
> 
> One way to avoid this is to place our events into an rb tree sorted by
> CPU filter, so that our hrtimer can skip to the current CPU's
> list and ignore everything else.
> 
> As a step towards that, this patch replaces event group lists with rb
> trees.
> 
> Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
> ---
>  include/linux/perf_event.h |  18 ++-
>  kernel/events/core.c       | 393
> +++++++++++++++++++++++++++++++++------------
>  2 files changed, 307 insertions(+), 104 deletions(-)
> 
> Addressed Mark Rutland's comments from the previous patch version.

... then this should be v4, no?

Which comments? Could you pelase write a changelog in future?

In future, please send patches as a series, with a known upper-bound
rather than N. It's really painful to find them when they're sent
separately, without a known upper bound.

[...]

> +/*
> + * Delete a group from a tree. If the group is directly attached to
> the tree
> + * it also detaches all groups on the group's group_list list.
> + */
> +static void
> +perf_cpu_tree_delete(struct rb_root *tree, struct perf_event *event)
> +{
> +	WARN_ON_ONCE(!tree || !event);
> +
> +	if (RB_EMPTY_NODE(&event->group_node)) {
> +		list_del_init(&event->group_entry);
> +	} else {
> +		struct perf_event *list_event, *list_next;
> +
> +		rb_erase(&event->group_node, tree);
> +		list_for_each_entry_safe(list_event, list_next,
> +				&event->group_list, group_entry)
> +			list_del_init(&list_event->group_entry);
> +	}
> +}

As I commented on the last version, this means that all groups which
were (incidentally) hanging off of this one are removed, and can
no longer be reached via the tree.

Surely one of the remaining groups should be added to the tree?

I don't beleive that is correct.

I beleive it would be simpler to reason about a threaded rb-tree here,
since that special case wouldn't exist.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RESEND PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-20 13:44 ` Mark Rutland
@ 2017-06-20 15:12   ` Alexey Budankov
  2017-06-20 16:38     ` Mark Rutland
  2017-06-21  9:25     ` Alexey Budankov
  0 siblings, 2 replies; 5+ messages in thread
From: Alexey Budankov @ 2017-06-20 15:12 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On 20.06.2017 16:44, Mark Rutland wrote:
> On Fri, Jun 16, 2017 at 02:03:58AM +0300, Alexey Budankov wrote:
>> perf/core: use rb trees for pinned/flexible groups
>>
>> By default, the userspace perf tool opens per-cpu task-bound events
>> when sampling, so for N logical events requested by the user, the tool
>> will open N * NR_CPUS events.
>>
>> In the kernel, we mux events with a hrtimer, periodically rotating the
>> flexible group list and trying to schedule each group in turn. We
>> skip groups whose cpu filter doesn't match. So when we get unlucky,
>> we can walk N * (NR_CPUS - 1) groups pointlessly for each hrtimer
>> invocation.
>>
>> This has been observed to result in significant overhead when running
>> the STREAM benchmark on 272 core Xeon Phi systems.
>>
>> One way to avoid this is to place our events into an rb tree sorted by
>> CPU filter, so that our hrtimer can skip to the current CPU's
>> list and ignore everything else.
>>
>> As a step towards that, this patch replaces event group lists with rb
>> trees.
>>
>> Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
>> ---
>>   include/linux/perf_event.h |  18 ++-
>>   kernel/events/core.c       | 393
>> +++++++++++++++++++++++++++++++++------------
>>   2 files changed, 307 insertions(+), 104 deletions(-)
>>
>> Addressed Mark Rutland's comments from the previous patch version.
> 
> ... then this should be v4, no?
> 
> Which comments? Could you pelase write a changelog in future?

Changes are:

1. changed type of pinned_groups/flexible_groups to rb_tree;
2. removed group_list_entry and reused group_entry for that purposes;
3. added add_to_groups()/del_from_groups() helper functions;

> 
> In future, please send patches as a series, with a known upper-bound
> rather than N. It's really painful to find them when they're sent
> separately, without a known upper bound.

Accepted.

> 
> [...]
> 
>> +/*
>> + * Delete a group from a tree. If the group is directly attached to
>> the tree
>> + * it also detaches all groups on the group's group_list list.
>> + */
>> +static void
>> +perf_cpu_tree_delete(struct rb_root *tree, struct perf_event *event)
>> +{
>> +	WARN_ON_ONCE(!tree || !event);
>> +
>> +	if (RB_EMPTY_NODE(&event->group_node)) {
>> +		list_del_init(&event->group_entry);
>> +	} else {
>> +		struct perf_event *list_event, *list_next;
>> +
>> +		rb_erase(&event->group_node, tree);
>> +		list_for_each_entry_safe(list_event, list_next,
>> +				&event->group_list, group_entry)
>> +			list_del_init(&list_event->group_entry);
>> +	}
>> +}
> 
> As I commented on the last version, this means that all groups which
> were (incidentally) hanging off of this one are removed, and can
> no longer be reached via the tree.
> 
> Surely one of the remaining groups should be added to the tree?

Aww, I see. That needs to implemented. Thanks.

> 
> I don't beleive that is correct.
> 
> I beleive it would be simpler to reason about a threaded rb-tree here,
> since that special case wouldn't exist.
> 
> Thanks,
> Mark.
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RESEND PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-20 15:12   ` Alexey Budankov
@ 2017-06-20 16:38     ` Mark Rutland
  2017-06-21  9:25     ` Alexey Budankov
  1 sibling, 0 replies; 5+ messages in thread
From: Mark Rutland @ 2017-06-20 16:38 UTC (permalink / raw)
  To: Alexey Budankov
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On Tue, Jun 20, 2017 at 06:12:08PM +0300, Alexey Budankov wrote:
> On 20.06.2017 16:44, Mark Rutland wrote:
> >On Fri, Jun 16, 2017 at 02:03:58AM +0300, Alexey Budankov wrote:
> >>perf/core: use rb trees for pinned/flexible groups
> >>
> >>By default, the userspace perf tool opens per-cpu task-bound events
> >>when sampling, so for N logical events requested by the user, the tool
> >>will open N * NR_CPUS events.
> >>
> >>In the kernel, we mux events with a hrtimer, periodically rotating the
> >>flexible group list and trying to schedule each group in turn. We
> >>skip groups whose cpu filter doesn't match. So when we get unlucky,
> >>we can walk N * (NR_CPUS - 1) groups pointlessly for each hrtimer
> >>invocation.
> >>
> >>This has been observed to result in significant overhead when running
> >>the STREAM benchmark on 272 core Xeon Phi systems.
> >>
> >>One way to avoid this is to place our events into an rb tree sorted by
> >>CPU filter, so that our hrtimer can skip to the current CPU's
> >>list and ignore everything else.
> >>
> >>As a step towards that, this patch replaces event group lists with rb
> >>trees.
> >>
> >>Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
> >>---
> >>  include/linux/perf_event.h |  18 ++-
> >>  kernel/events/core.c       | 393
> >>+++++++++++++++++++++++++++++++++------------
> >>  2 files changed, 307 insertions(+), 104 deletions(-)
> >>
> >>Addressed Mark Rutland's comments from the previous patch version.
> >
> >... then this should be v4, no?
> >
> >Which comments? Could you pelase write a changelog in future?
> 
> Changes are:
> 
> 1. changed type of pinned_groups/flexible_groups to rb_tree;
> 2. removed group_list_entry and reused group_entry for that purposes;
> 3. added add_to_groups()/del_from_groups() helper functions;

Thanks; the new changelog is much appreciated.

Mark.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RESEND PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-20 15:12   ` Alexey Budankov
  2017-06-20 16:38     ` Mark Rutland
@ 2017-06-21  9:25     ` Alexey Budankov
  1 sibling, 0 replies; 5+ messages in thread
From: Alexey Budankov @ 2017-06-21  9:25 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On 20.06.2017 18:12, Alexey Budankov wrote:
> On 20.06.2017 16:44, Mark Rutland wrote:
>> On Fri, Jun 16, 2017 at 02:03:58AM +0300, Alexey Budankov wrote:
>>> perf/core: use rb trees for pinned/flexible groups
>>>
>>> By default, the userspace perf tool opens per-cpu task-bound events
>>> when sampling, so for N logical events requested by the user, the tool
>>> will open N * NR_CPUS events.
>>>
>>> In the kernel, we mux events with a hrtimer, periodically rotating the
>>> flexible group list and trying to schedule each group in turn. We
>>> skip groups whose cpu filter doesn't match. So when we get unlucky,
>>> we can walk N * (NR_CPUS - 1) groups pointlessly for each hrtimer
>>> invocation.
>>>
>>> This has been observed to result in significant overhead when running
>>> the STREAM benchmark on 272 core Xeon Phi systems.
>>>
>>> One way to avoid this is to place our events into an rb tree sorted by
>>> CPU filter, so that our hrtimer can skip to the current CPU's
>>> list and ignore everything else.
>>>
>>> As a step towards that, this patch replaces event group lists with rb
>>> trees.
>>>
>>> Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
>>> ---
>>>   include/linux/perf_event.h |  18 ++-
>>>   kernel/events/core.c       | 393
>>> +++++++++++++++++++++++++++++++++------------
>>>   2 files changed, 307 insertions(+), 104 deletions(-)
>>>
>>> Addressed Mark Rutland's comments from the previous patch version.
>>
>> ... then this should be v4, no?
>>
>> Which comments? Could you pelase write a changelog in future?
> 
> Changes are:
> 
> 1. changed type of pinned_groups/flexible_groups to rb_tree;
> 2. removed group_list_entry and reused group_entry for that purposes;
> 3. added add_to_groups()/del_from_groups() helper functions;
> 
>>
>> In future, please send patches as a series, with a known upper-bound
>> rather than N. It's really painful to find them when they're sent
>> separately, without a known upper bound.
> 
> Accepted.
> 
>>
>> [...]
>>
>>> +/*
>>> + * Delete a group from a tree. If the group is directly attached to
>>> the tree
>>> + * it also detaches all groups on the group's group_list list.
>>> + */
>>> +static void
>>> +perf_cpu_tree_delete(struct rb_root *tree, struct perf_event *event)
>>> +{
>>> +    WARN_ON_ONCE(!tree || !event);
>>> +
>>> +    if (RB_EMPTY_NODE(&event->group_node)) {
>>> +        list_del_init(&event->group_entry);
>>> +    } else {
>>> +        struct perf_event *list_event, *list_next;
>>> +
>>> +        rb_erase(&event->group_node, tree);
>>> +        list_for_each_entry_safe(list_event, list_next,
>>> +                &event->group_list, group_entry)
>>> +            list_del_init(&list_event->group_entry);
>>> +    }
>>> +}
>>
>> As I commented on the last version, this means that all groups which
>> were (incidentally) hanging off of this one are removed, and can
>> no longer be reached via the tree.
>>
>> Surely one of the remaining groups should be added to the tree?
> 
> Aww, I see. That needs to implemented. Thanks.

Addressing that inconsistency like this:

static void
perf_cpu_tree_delete(struct rb_root *tree, struct perf_event *event)
{
     struct perf_event *next;

     WARN_ON_ONCE(!tree || !event);

     list_del_init(&event->group_entry);

     if (!RB_EMPTY_NODE(&event->group_node)) {
         if (!list_empty(&event->group_list)) {
             next = list_first_entry(&event->group_list,
                     struct perf_event, group_entry);
             list_replace_init(&event->group_list,
                     &next->group_list);
             rb_replace_node(&event->group_node,
                     &next->group_node, tree);
         } else {
             rb_erase(&event->group_node, tree);
         }
         RB_CLEAR_NODE(&event->group_node);
     }
}

solves list_del corruption:

list_del corruption. prev->next should be ffff88c2c4654010, but was
ffff88c31eb0c020
[  607.632813] ------------[ cut here ]------------
[  607.632816] kernel BUG at lib/list_debug.c:53!

[  607.635531] Call Trace:
[  607.635583]  list_del_event+0x1d7/0x210

and x86_pmu_start warning:

[  484.804737] WARNING: CPU: 15 PID: 31168 at 
arch/x86/events/core.c:1257 x86_pmu_start+0x8f/0xb0

[  484.804938] RIP: 0010:x86_pmu_start+0x8f/0xb0
[  484.804971] Call Trace:
[  484.804976]  <IRQ>
[  484.804984]  x86_pmu_enable+0x27f/0x2f0

> 
>>
>> I don't beleive that is correct.
>>
>> I beleive it would be simpler to reason about a threaded rb-tree here,
>> since that special case wouldn't exist.
>>
>> Thanks,
>> Mark.
>>
> 
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2017-06-21  9:25 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-06-15 23:03 [RESEND PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi Alexey Budankov
2017-06-20 13:44 ` Mark Rutland
2017-06-20 15:12   ` Alexey Budankov
2017-06-20 16:38     ` Mark Rutland
2017-06-21  9:25     ` Alexey Budankov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).