* [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
@ 2017-06-15 17:41 Alexey Budankov
  2017-06-15 19:56 ` Mark Rutland
  0 siblings, 1 reply; 31+ messages in thread
From: Alexey Budankov @ 2017-06-15 17:41 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin
  Cc: Andi Kleen, Kan Liang, Dmitri Prokhorov, Valery Cherepennikov,
	David Carrillo-Cisneros, Stephane Eranian, Mark Rutland,
	linux-kernel

This series of patches continues v2 and addresses the comments received on v2.

Specifically, this patch replaces the pinned_groups and flexible_groups
lists of perf_event_context with CPU-indexed red-black trees, avoiding
duplication of data structures and introducing the possibility to iterate
event groups for a specific CPU only.
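
For illustration only (not part of the change itself), the new helpers can be
used to visit just the groups of interest for one CPU, mirroring what
rotate_ctx() does below; visit_group() here is a made-up callback:

	/* illustrative callback: visit one group leader */
	static int visit_group(struct perf_event *event, void *data)
	{
		/* ... inspect or reschedule the group ... */
		return 0;	/* returning non-zero stops the iteration */
	}

	/* walk the cpu == -1 groups, then the groups bound to this CPU */
	perf_cpu_tree_iterate_cpu(&ctx->flexible_tree, -1, visit_group, NULL);
	perf_cpu_tree_iterate_cpu(&ctx->flexible_tree, smp_processor_id(),
			visit_group, NULL);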

Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
---
  include/linux/perf_event.h |  34 +++-
  kernel/events/core.c       | 386 +++++++++++++++++++++++++++++++++------------
  2 files changed, 315 insertions(+), 105 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 24a6358..2c1dcf1 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -574,6 +574,27 @@ struct perf_event {
  	struct list_head		sibling_list;

  	/*
+	 * Node on the pinned or flexible tree located at the event context;
+	 * the node may be empty in case its event is not directly attached
+	 * to the tree but to group_list list of the event directly
+	 * attached to the tree;
+	 */
+	struct rb_node			group_node;
+	/*
+	 * List keeps groups allocated for the same cpu;
+	 * the list may be empty in case its event is not directly
+	 * attached to the tree but to group_list list of the event directly
+	 * attached to the tree;
+	 */
+	struct list_head		group_list;
+	/*
+	 * Entry into the group_list list above;
+	 * the entry may be attached to the self group_list list above
+	 * in case the event is directly attached to the pinned or
+	 * flexible tree;
+	 */
+	struct list_head		group_list_entry;
+	/*
  	 * We need storage to track the entries in perf_pmu_migrate_context; we
  	 * cannot use the event_entry because of RCU and we want to keep the
  	 * group in tact which avoids us using the other two entries.
@@ -741,8 +762,17 @@ struct perf_event_context {
  	struct mutex			mutex;

  	struct list_head		active_ctx_list;
-	struct list_head		pinned_groups;
-	struct list_head		flexible_groups;
+	/*
+	 * RB tree for pinned groups; keeps event's group_node
+	 * nodes of attached pinned groups;
+	 */
+	struct rb_root			pinned_tree;
+	/*
+	 * RB tree for flexible groups; keeps event's group_node
+	 * nodes of attached flexible groups;
+	 */
+	struct rb_root			flexible_tree;
+
  	struct list_head		event_list;
  	int				nr_events;
  	int				nr_active;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bc63f8d..a3531f9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1458,13 +1458,142 @@ static enum event_type_t get_event_type(struct perf_event *event)
  	return event_type;
  }

-static struct list_head *
-ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+static struct rb_root *
+ctx_group_cpu_tree(struct perf_event *event, struct perf_event_context *ctx)
  {
  	if (event->attr.pinned)
-		return &ctx->pinned_groups;
+		return &ctx->pinned_tree;
  	else
-		return &ctx->flexible_groups;
+		return &ctx->flexible_tree;
+}
+
+/*
+ * Insert a group into a tree using event->cpu as a key. If event->cpu node
+ * is already attached to the tree then the event is added to the attached
+ * group's group_list list.
+ */
+static void
+perf_cpu_tree_insert(struct rb_root *tree, struct perf_event *event)
+{
+	struct rb_node **node;
+	struct rb_node *parent;
+
+	WARN_ON_ONCE(!tree || !event);
+
+	node = &tree->rb_node;
+	parent = *node;
+
+	while (*node) {
+		struct perf_event *node_event =	container_of(*node,
+				struct perf_event, group_node);
+
+		parent = *node;
+
+		if (event->cpu < node_event->cpu) {
+			node = &parent->rb_left;
+		} else if (event->cpu > node_event->cpu) {
+			node = &parent->rb_right;
+		} else {
+			list_add_tail(&event->group_list_entry,
+					&node_event->group_list);
+			return;
+		}
+	}
+
+	list_add_tail(&event->group_list_entry, &event->group_list);
+
+	rb_link_node(&event->group_node, parent, node);
+	rb_insert_color(&event->group_node, tree);
+}
+
+/*
+ * Delete a group from a tree. If the group is directly attached to the tree
+ * it also detaches all groups on the group's group_list list.
+ */
+static void
+perf_cpu_tree_delete(struct rb_root *tree, struct perf_event *event)
+{
+	WARN_ON_ONCE(!tree || !event);
+
+	if (RB_EMPTY_NODE(&event->group_node)) {
+		list_del_init(&event->group_list_entry);
+	} else {
+		struct perf_event *list_event, *list_next;
+
+		rb_erase(&event->group_node, tree);
+		list_for_each_entry_safe(list_event, list_next,
+				&event->group_list, group_list_entry)
+			list_del_init(&list_event->group_list_entry);
+	}
+}
+
+/*
+ * Find group_list list by a cpu key and call provided callback for every
+ * group on the list. Iteration stops if the callback returns non zero.
+ */
+
+typedef int (*perf_cpu_tree_callback_t)(struct perf_event *, void *);
+
+static int
+perf_cpu_tree_iterate_cpu(struct rb_root *tree, int cpu,
+		perf_cpu_tree_callback_t callback, void *data)
+{
+	int ret = 0;
+	struct rb_node *node;
+	struct perf_event *event;
+
+	WARN_ON_ONCE(!tree);
+
+	node = tree->rb_node;
+
+	while (node) {
+		struct perf_event *node_event = container_of(node,
+				struct perf_event, group_node);
+
+		if (cpu < node_event->cpu) {
+			node = node->rb_left;
+		} else if (cpu > node_event->cpu) {
+			node = node->rb_right;
+		} else {
+			list_for_each_entry(event, &node_event->group_list,
+					group_list_entry) {
+				ret = callback(event, data);
+				if (ret)
+					return ret;
+			}
+			break;	/* matching sub-list handled; stop the walk */
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Iterate a tree and call provided callback for every group in the tree.
+ * Iteration stops if the callback returns non zero.
+ */
+static int
+perf_cpu_tree_iterate(struct rb_root *tree,
+		perf_cpu_tree_callback_t callback, void *data)
+{
+	int ret = 0;
+	struct rb_node *node;
+	struct perf_event *event;
+
+	WARN_ON_ONCE(!tree);
+
+	for (node = rb_first(tree); node; node = rb_next(node)) {
+		struct perf_event *node_event = container_of(node,
+				struct perf_event, group_node);
+
+		list_for_each_entry(event, &node_event->group_list,
+				group_list_entry) {
+			ret = callback(event, data);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
  }

  /*
@@ -1485,12 +1614,12 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
  	 * perf_group_detach can, at all times, locate all siblings.
  	 */
  	if (event->group_leader == event) {
-		struct list_head *list;
+		struct rb_root *tree;

  		event->group_caps = event->event_caps;

-		list = ctx_group_list(event, ctx);
-		list_add_tail(&event->group_entry, list);
+		tree = ctx_group_cpu_tree(event, ctx);
+		perf_cpu_tree_insert(tree, event);
  	}

  	list_update_cgroup_event(event, ctx, true);
@@ -1680,8 +1809,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)

  	list_del_rcu(&event->event_entry);

-	if (event->group_leader == event)
-		list_del_init(&event->group_entry);
+	if (event->group_leader == event) {
+		struct rb_root *tree;
+
+		tree = ctx_group_cpu_tree(event, ctx);
+		perf_cpu_tree_delete(tree, event);
+	}

  	update_group_times(event);

@@ -2699,12 +2832,80 @@ int perf_event_refresh(struct perf_event *event, int refresh)
  }
  EXPORT_SYMBOL_GPL(perf_event_refresh);

+static void
+sched_in_group(struct perf_event *event,
+		struct perf_event_context *ctx,
+		struct perf_cpu_context *cpuctx,
+		int *can_add_hw)
+{
+	/* Ignore events in OFF or ERROR state and
+	 * listen to the 'cpu' scheduling filter constraint
+	 * of events:
+	 */
+	if (event->state <= PERF_EVENT_STATE_OFF || !event_filter_match(event))
+		return;
+
+	/* may need to reset tstamp_enabled */
+	if (is_cgroup_event(event))
+		perf_cgroup_mark_enabled(event, ctx);
+
+	if (group_can_go_on(event, cpuctx, *can_add_hw)) {
+		if (group_sched_in(event, cpuctx, ctx))
+			*can_add_hw = 0;
+	}
+}
+
+struct group_sched_params {
+	struct perf_cpu_context *cpuctx;
+	struct perf_event_context *ctx;
+	int can_add_hw;
+};
+
+static int
+group_sched_in_pinned_callback(struct perf_event *event, void *data)
+{
+	struct group_sched_params *params = data;
+	int can_add_hw = 1;
+
+	sched_in_group(event, params->ctx, params->cpuctx, &can_add_hw);
+
+	if (!can_add_hw) {
+		update_group_times(event);
+		event->state = PERF_EVENT_STATE_ERROR;
+	}
+
+	return 0;
+}
+
+static int
+group_sched_in_flexible_callback(struct perf_event *event, void *data)
+{
+	struct group_sched_params *params = data;
+
+	sched_in_group(event, params->ctx, params->cpuctx,
+			&params->can_add_hw);
+
+	return 0;
+}
+
+static int group_sched_out_callback(struct perf_event *event, void *data)
+{
+	struct group_sched_params *params = data;
+
+	group_sched_out(event, params->cpuctx, params->ctx);
+
+	return 0;
+}
+
  static void ctx_sched_out(struct perf_event_context *ctx,
  			  struct perf_cpu_context *cpuctx,
  			  enum event_type_t event_type)
  {
  	int is_active = ctx->is_active;
-	struct perf_event *event;
+	struct group_sched_params params = {
+			.cpuctx = cpuctx,
+			.ctx = ctx
+	};

  	lockdep_assert_held(&ctx->lock);

@@ -2750,15 +2951,15 @@ static void ctx_sched_out(struct perf_event_context *ctx,
  		return;

  	perf_pmu_disable(ctx->pmu);
-	if (is_active & EVENT_PINNED) {
-		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
-			group_sched_out(event, cpuctx, ctx);
-	}

-	if (is_active & EVENT_FLEXIBLE) {
-		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
-			group_sched_out(event, cpuctx, ctx);
-	}
+	if (is_active & EVENT_PINNED)
+		perf_cpu_tree_iterate(&ctx->pinned_tree,
+				group_sched_out_callback, &params);
+
+	if (is_active & EVENT_FLEXIBLE)
+		perf_cpu_tree_iterate(&ctx->flexible_tree,
+				group_sched_out_callback, &params);
+
  	perf_pmu_enable(ctx->pmu);
  }

@@ -3052,72 +3253,18 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
  }

  static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
-		    struct perf_cpu_context *cpuctx)
-{
-	struct perf_event *event;
-
-	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
-		if (event->state <= PERF_EVENT_STATE_OFF)
-			continue;
-		if (!event_filter_match(event))
-			continue;
-
-		/* may need to reset tstamp_enabled */
-		if (is_cgroup_event(event))
-			perf_cgroup_mark_enabled(event, ctx);
-
-		if (group_can_go_on(event, cpuctx, 1))
-			group_sched_in(event, cpuctx, ctx);
-
-		/*
-		 * If this pinned group hasn't been scheduled,
-		 * put it in error state.
-		 */
-		if (event->state == PERF_EVENT_STATE_INACTIVE) {
-			update_group_times(event);
-			event->state = PERF_EVENT_STATE_ERROR;
-		}
-	}
-}
-
-static void
-ctx_flexible_sched_in(struct perf_event_context *ctx,
-		      struct perf_cpu_context *cpuctx)
-{
-	struct perf_event *event;
-	int can_add_hw = 1;
-
-	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
-		/* Ignore events in OFF or ERROR state */
-		if (event->state <= PERF_EVENT_STATE_OFF)
-			continue;
-		/*
-		 * Listen to the 'cpu' scheduling filter constraint
-		 * of events:
-		 */
-		if (!event_filter_match(event))
-			continue;
-
-		/* may need to reset tstamp_enabled */
-		if (is_cgroup_event(event))
-			perf_cgroup_mark_enabled(event, ctx);
-
-		if (group_can_go_on(event, cpuctx, can_add_hw)) {
-			if (group_sched_in(event, cpuctx, ctx))
-				can_add_hw = 0;
-		}
-	}
-}
-
-static void
  ctx_sched_in(struct perf_event_context *ctx,
  	     struct perf_cpu_context *cpuctx,
  	     enum event_type_t event_type,
  	     struct task_struct *task)
  {
  	int is_active = ctx->is_active;
-	u64 now;
+	struct group_sched_params params = {
+			.cpuctx = cpuctx,
+			.ctx = ctx,
+			.can_add_hw = 1
+
+	};

  	lockdep_assert_held(&ctx->lock);

@@ -3136,8 +3283,7 @@ ctx_sched_in(struct perf_event_context *ctx,

  	if (is_active & EVENT_TIME) {
  		/* start ctx time */
-		now = perf_clock();
-		ctx->timestamp = now;
+		ctx->timestamp = perf_clock();
  		perf_cgroup_set_timestamp(task, ctx);
  	}

@@ -3146,11 +3292,13 @@ ctx_sched_in(struct perf_event_context *ctx,
  	 * in order to give them the best chance of going on.
  	 */
  	if (is_active & EVENT_PINNED)
-		ctx_pinned_sched_in(ctx, cpuctx);
+		perf_cpu_tree_iterate(&ctx->pinned_tree,
+				group_sched_in_pinned_callback, &params);

  	/* Then walk through the lower prio flexible groups */
  	if (is_active & EVENT_FLEXIBLE)
-		ctx_flexible_sched_in(ctx, cpuctx);
+		perf_cpu_tree_iterate(&ctx->flexible_tree,
+				group_sched_in_flexible_callback, &params);
  }

  static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
@@ -3181,7 +3329,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
  	 * However, if task's ctx is not carrying any pinned
  	 * events, no need to flip the cpuctx's events around.
  	 */
-	if (!list_empty(&ctx->pinned_groups))
+	if (!RB_EMPTY_ROOT(&ctx->pinned_tree))
  		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
  	perf_event_sched_in(cpuctx, ctx, task);
  	perf_pmu_enable(ctx->pmu);
@@ -3410,14 +3558,25 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
  /*
   * Round-robin a context's events:
   */
+static int rotate_ctx_callback(struct perf_event *event, void *data)
+{
+	list_rotate_left(&event->group_list);
+
+	return 1;
+}
+
  static void rotate_ctx(struct perf_event_context *ctx)
  {
  	/*
  	 * Rotate the first entry last of non-pinned groups. Rotation might be
  	 * disabled by the inheritance code.
  	 */
-	if (!ctx->rotate_disable)
-		list_rotate_left(&ctx->flexible_groups);
+	if (!ctx->rotate_disable) {
+		perf_cpu_tree_iterate_cpu(&ctx->flexible_tree,
+				-1, rotate_ctx_callback, NULL);
+		perf_cpu_tree_iterate_cpu(&ctx->flexible_tree,
+				smp_processor_id(), rotate_ctx_callback, NULL);
+	}
  }

  static int perf_rotate_context(struct perf_cpu_context *cpuctx)
@@ -3743,8 +3902,8 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
  	raw_spin_lock_init(&ctx->lock);
  	mutex_init(&ctx->mutex);
  	INIT_LIST_HEAD(&ctx->active_ctx_list);
-	INIT_LIST_HEAD(&ctx->pinned_groups);
-	INIT_LIST_HEAD(&ctx->flexible_groups);
+	ctx->pinned_tree = RB_ROOT;
+	ctx->flexible_tree = RB_ROOT;
  	INIT_LIST_HEAD(&ctx->event_list);
  	atomic_set(&ctx->refcount, 1);
  }
@@ -9377,6 +9536,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
  	INIT_LIST_HEAD(&event->child_list);

  	INIT_LIST_HEAD(&event->group_entry);
+	RB_CLEAR_NODE(&event->group_node);
+	INIT_LIST_HEAD(&event->group_list);
+	INIT_LIST_HEAD(&event->group_list_entry);
  	INIT_LIST_HEAD(&event->event_entry);
  	INIT_LIST_HEAD(&event->sibling_list);
  	INIT_LIST_HEAD(&event->rb_entry);
@@ -10816,6 +10978,23 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
  	return ret;
  }

+struct inherit_task_group_params {
+	struct task_struct *parent;
+	struct perf_event_context *parent_ctx;
+	struct task_struct *child;
+	int ctxn;
+	int inherited_all;
+};
+
+static int
+inherit_task_group_callback(struct perf_event *event, void *data)
+{
+	struct inherit_task_group_params *params = data;
+
+	return inherit_task_group(event, params->parent, params->parent_ctx,
+			 params->child, params->ctxn, &params->inherited_all);
+}
+
  /*
   * Initialize the perf_event context in task_struct
   */
@@ -10823,11 +11002,15 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
  {
  	struct perf_event_context *child_ctx, *parent_ctx;
  	struct perf_event_context *cloned_ctx;
-	struct perf_event *event;
  	struct task_struct *parent = current;
-	int inherited_all = 1;
  	unsigned long flags;
  	int ret = 0;
+	struct inherit_task_group_params params = {
+			.parent = parent,
+			.child = child,
+			.ctxn = ctxn,
+			.inherited_all = 1
+	};

  	if (likely(!parent->perf_event_ctxp[ctxn]))
  		return 0;
@@ -10839,7 +11022,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
  	parent_ctx = perf_pin_task_context(parent, ctxn);
  	if (!parent_ctx)
  		return 0;
-
+	else
+		params.parent_ctx = parent_ctx;
  	/*
  	 * No need to check if parent_ctx != NULL here; since we saw
  	 * it non-NULL earlier, the only reason for it to become NULL
@@ -10857,12 +11041,10 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
  	 * We dont have to disable NMIs - we are only looking at
  	 * the list, not manipulating it:
  	 */
-	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
-		ret = inherit_task_group(event, parent, parent_ctx,
-					 child, ctxn, &inherited_all);
-		if (ret)
-			goto out_unlock;
-	}
+	ret = perf_cpu_tree_iterate(&parent_ctx->pinned_tree,
+			inherit_task_group_callback, &params);
+	if (ret)
+		goto out_unlock;

  	/*
  	 * We can't hold ctx->lock when iterating the ->flexible_group list due
@@ -10873,19 +11055,17 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
  	parent_ctx->rotate_disable = 1;
  	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

-	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
-		ret = inherit_task_group(event, parent, parent_ctx,
-					 child, ctxn, &inherited_all);
-		if (ret)
-			goto out_unlock;
-	}
+	ret = perf_cpu_tree_iterate(&parent_ctx->flexible_tree,
+				inherit_task_group_callback, &params);
+	if (ret)
+		goto out_unlock;

  	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
  	parent_ctx->rotate_disable = 0;

  	child_ctx = child->perf_event_ctxp[ctxn];

-	if (child_ctx && inherited_all) {
+	if (child_ctx && params.inherited_all) {
  		/*
  		 * Mark the child context as a clone of the parent
  		 * context, or of whatever the parent is a clone of.


* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-15 17:41 [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi Alexey Budankov
@ 2017-06-15 19:56 ` Mark Rutland
  2017-06-15 22:10   ` Alexey Budankov
  2017-06-19 20:31   ` Alexey Budankov
  0 siblings, 2 replies; 31+ messages in thread
From: Mark Rutland @ 2017-06-15 19:56 UTC (permalink / raw)
  To: Alexey Budankov
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

Hi,

On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
> This series of patches continues v2 and addresses captured comments.

As a general note, please keep any change log stuff out of the commit
message, and move that just before the diff. It generally doesn't make
sense for that to go in the commit message.

> Specifically this patch replaces pinned_groups and flexible_groups
> lists of perf_event_context by red-black cpu indexed trees avoiding
> data structures duplication and introducing possibility to iterate
> event groups for a specific CPU only.

If you use --per-thread, I take it the overhead is significantly
lowered?

As another general thing, please aim to write the commit message so that
it'll make sense in N years' time, in the git history. Describe the
whole problem, and the intended solution.

I had to go diving into mail archives to understand the problem you're
trying to solve, and I shouldn't have to.

For example, this commit message could be something like:

    perf/core: use rb trees for pinned/flexible groups

    By default, the userspace perf tool opens per-cpu task-bound events
    when sampling, so for N logical events requested by the user, the tool
    will open N * NR_CPUS events.

    In the kernel, we mux events with a hrtimer, periodically rotating the
    event list and trying to schedule each in turn. We skip events whose
    cpu filter doesn't match. So when we get unlucky, we can walk N *
    (NR_CPUS - 1) events pointlessly for each hrtimer invocation.

    This has been observed to result in significant overhead when running
    the STREAM benchmark on 272 core Xeon Phi systems.

    One way to avoid this is to place our events into an rb tree sorted by
    CPU filter, so that our hrtimer can skip to the current CPU's
    sub-tree, and ignore everything else.

    As a step towards that, this patch replaces our event lists with rb
    trees.
 
... which hopefully makes sense now, and will in 2, 5, or 10 years' time.
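
(For a rough sense of scale, purely as an illustration: with, say, 10 logical
events requested on a 272-CPU Xeon Phi, the task context holds 10 * 272 =
2720 events, and an unlucky hrtimer invocation walks 10 * 271 = 2710 of them
only for the cpu filter to reject each one.)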

> 
> Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>

Have you thrown Vince's perf fuzzer at this?

If you haven't, please do. It can be found in the fuzzer directory of:

https://github.com/deater/perf_event_tests

> ---
>  include/linux/perf_event.h |  34 +++-
>  kernel/events/core.c       | 386
> +++++++++++++++++++++++++++++++++------------
>  2 files changed, 315 insertions(+), 105 deletions(-)

<changelog goes here>

> 
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 24a6358..2c1dcf1 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -574,6 +574,27 @@ struct perf_event {
>  	struct list_head		sibling_list;
> 
>  	/*
> +	 * Node on the pinned or flexible tree located at the event context;
> +	 * the node may be empty in case its event is not directly attached
> +	 * to the tree but to group_list list of the event directly
> +	 * attached to the tree;
> +	 */
> +	struct rb_node			group_node;
> +	/*
> +	 * List keeps groups allocated for the same cpu;
> +	 * the list may be empty in case its event is not directly
> +	 * attached to the tree but to group_list list of the event directly
> +	 * attached to the tree;
> +	 */
> +	struct list_head		group_list;
> +	/*
> +	 * Entry into the group_list list above;
> +	 * the entry may be attached to the self group_list list above
> +	 * in case the event is directly attached to the pinned or
> +	 * flexible tree;
> +	 */
> +	struct list_head		group_list_entry;

We already have group_entry for threading the RB tree. I don't see why
we need two new list heads.

> +	/*
>  	 * We need storage to track the entries in perf_pmu_migrate_context; we
>  	 * cannot use the event_entry because of RCU and we want to keep the
>  	 * group in tact which avoids us using the other two entries.
> @@ -741,8 +762,17 @@ struct perf_event_context {
>  	struct mutex			mutex;
> 
>  	struct list_head		active_ctx_list;
> -	struct list_head		pinned_groups;
> -	struct list_head		flexible_groups;
> +	/*
> +	 * RB tree for pinned groups; keeps event's group_node
> +	 * nodes of attached pinned groups;
> +	 */
> +	struct rb_root			pinned_tree;
> +	/*
> +	 * RB tree for flexible groups; keeps event's group_node
> +	 * nodes of attached flexible groups;
> +	 */
> +	struct rb_root			flexible_tree;

We didn't need these comments before, and I don't think we need them
now.

Any reason not to stick with the existing names?

The existing {pinned,flexible}_groups names are fine regardless of
whether a list or tree is used, and make it clear that the elements are
the group leaders, not all events.

> +
>  	struct list_head		event_list;
>  	int				nr_events;
>  	int				nr_active;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index bc63f8d..a3531f9 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1458,13 +1458,142 @@ static enum event_type_t
> get_event_type(struct perf_event *event)
>  	return event_type;
>  }
> 
> -static struct list_head *
> -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
> +static struct rb_root *
> +ctx_group_cpu_tree(struct perf_event *event, struct
> perf_event_context *ctx)
>  {
>  	if (event->attr.pinned)
> -		return &ctx->pinned_groups;
> +		return &ctx->pinned_tree;
>  	else
> -		return &ctx->flexible_groups;
> +		return &ctx->flexible_tree;
> +}
> +
> +/*
> + * Insert a group into a tree using event->cpu as a key. If event->cpu node
> + * is already attached to the tree then the event is added to the attached
> + * group's group_list list.
> + */
> +static void
> +perf_cpu_tree_insert(struct rb_root *tree, struct perf_event *event)
> +{
> +	struct rb_node **node;
> +	struct rb_node *parent;
> +
> +	WARN_ON_ONCE(!tree || !event);
> +
> +	node = &tree->rb_node;
> +	parent = *node;

The first iteration of the loop handles this, so it can go.

> +
> +	while (*node) {
> +		struct perf_event *node_event =	container_of(*node,
> +				struct perf_event, group_node);
> +
> +		parent = *node;
> +
> +		if (event->cpu < node_event->cpu) {
> +			node = &parent->rb_left;
> +		} else if (event->cpu > node_event->cpu) {
> +			node = &parent->rb_right;
> +		} else {
> +			list_add_tail(&event->group_list_entry,
> +					&node_event->group_list);
> +			return;

Why aren't we putting all group leaders into the tree?

I don't think this is what Peter meant when he suggested a threaded rb
tree.

My understanding was that every group leader should be in the rb tree,
and every group leader should be in the same list that threads the whole
tree. Both the rb tree and list have to be maintained with the same
order.

Thus, you can walk the rb tree to find the first event in the sub-list
that you care about, then walk that sub-list.

Note that this means we need to shuffle the list and rb tree to rotate
events in each sub-tree.
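
A rough sketch of the insertion I mean, reusing the existing group_entry to
thread the leaders in tree order ('all_groups' stands for a per-context list
head that doesn't exist in this patch, so this is purely illustrative):

	static void group_tree_insert(struct rb_root *tree,
				      struct list_head *all_groups,
				      struct perf_event *event)
	{
		struct rb_node **node = &tree->rb_node, *parent = NULL;
		struct rb_node *prev;

		while (*node) {
			struct perf_event *e = container_of(*node,
					struct perf_event, group_node);

			parent = *node;
			if (event->cpu < e->cpu)
				node = &parent->rb_left;
			else
				node = &parent->rb_right;	/* ties go right */
		}
		rb_link_node(&event->group_node, parent, node);
		rb_insert_color(&event->group_node, tree);

		/* thread the leader into the list at its in-order position */
		prev = rb_prev(&event->group_node);
		if (prev)
			list_add(&event->group_entry,
				 &container_of(prev, struct perf_event,
					       group_node)->group_entry);
		else
			list_add(&event->group_entry, all_groups);
	}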

> +		}
> +	}
> +
> +	list_add_tail(&event->group_list_entry, &event->group_list);
> +
> +	rb_link_node(&event->group_node, parent, node);
> +	rb_insert_color(&event->group_node, tree);
> +}
> +
> +/*
> + * Delete a group from a tree. If the group is directly attached to
> the tree
> + * it also detaches all groups on the group's group_list list.
> + */
> +static void
> +perf_cpu_tree_delete(struct rb_root *tree, struct perf_event *event)
> +{
> +	WARN_ON_ONCE(!tree || !event);
> +
> +	if (RB_EMPTY_NODE(&event->group_node)) {
> +		list_del_init(&event->group_list_entry);
> +	} else {
> +		struct perf_event *list_event, *list_next;
> +
> +		rb_erase(&event->group_node, tree);
> +		list_for_each_entry_safe(list_event, list_next,
> +				&event->group_list, group_list_entry)
> +			list_del_init(&list_event->group_list_entry);

At this point, all the other group leaders with the same filter are gone
from the rb tree, since we didn't insert any of them into the rb tree in
place of the leader we deleted.

> +	}
> +}
> +
> +/*
> + * Find group_list list by a cpu key and call provided callback for every
> + * group on the list. Iteration stops if the callback returns non zero.
> + */
> +
> +typedef int (*perf_cpu_tree_callback_t)(struct perf_event *, void *);
> +
> +static int
> +perf_cpu_tree_iterate_cpu(struct rb_root *tree, int cpu,
> +		perf_cpu_tree_callback_t callback, void *data)
> +{
> +	int ret = 0;
> +	struct rb_node *node;
> +	struct perf_event *event;
> +
> +	WARN_ON_ONCE(!tree);
> +
> +	node = tree->rb_node;
> +
> +	while (node) {
> +		struct perf_event *node_event = container_of(node,
> +				struct perf_event, group_node);
> +
> +		if (cpu < node_event->cpu) {
> +			node = node->rb_left;
> +		} else if (cpu > node_event->cpu) {
> +			node = node->rb_right;
> +		} else {
> +			list_for_each_entry(event, &node_event->group_list,
> +					group_list_entry) {
> +				ret = callback(event, data);
> +				if (ret)
> +					return ret;
> +			}
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Iterate a tree and call provided callback for every group in the tree.
> + * Iteration stops if the callback returns non zero.
> + */
> +static int
> +perf_cpu_tree_iterate(struct rb_root *tree,
> +		perf_cpu_tree_callback_t callback, void *data)
> +{
> +	int ret = 0;
> +	struct rb_node *node;
> +	struct perf_event *event;
> +
> +	WARN_ON_ONCE(!tree);
> +
> +	for (node = rb_first(tree); node; node = rb_next(node)) {
> +		struct perf_event *node_event = container_of(node,
> +				struct perf_event, group_node);
> +
> +		list_for_each_entry(event, &node_event->group_list,
> +				group_list_entry) {
> +			ret = callback(event, data);
> +			if (ret)
> +				return ret;
> +		}
> +	}
> +
> +	return 0;
>  }

If you need to iterate over every event, you can use the list that
threads the whole tree.
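
With that scheme, iterating every group is a plain walk of the threading
list, and a single-CPU walk can look roughly like this (again only a sketch,
using the same made-up 'all_groups' list head as above):

	static int group_tree_iterate_cpu(struct rb_root *tree,
					  struct list_head *all_groups, int cpu,
					  perf_cpu_tree_callback_t callback,
					  void *data)
	{
		struct rb_node *node = tree->rb_node;
		struct perf_event *first = NULL, *event;
		int ret;

		/* find the leftmost leader whose filter matches @cpu */
		while (node) {
			struct perf_event *e = container_of(node,
					struct perf_event, group_node);

			if (cpu < e->cpu) {
				node = node->rb_left;
			} else if (cpu > e->cpu) {
				node = node->rb_right;
			} else {
				first = e;
				node = node->rb_left;
			}
		}
		if (!first)
			return 0;

		/* walk the threading list while the key still matches */
		event = first;
		list_for_each_entry_from(event, all_groups, group_entry) {
			if (event->cpu != cpu)
				break;
			ret = callback(event, data);
			if (ret)
				return ret;
		}
		return 0;
	}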

> 
>  /*
> @@ -1485,12 +1614,12 @@ list_add_event(struct perf_event *event,
> struct perf_event_context *ctx)
>  	 * perf_group_detach can, at all times, locate all siblings.
>  	 */
>  	if (event->group_leader == event) {
> -		struct list_head *list;
> +		struct rb_root *tree;
> 
>  		event->group_caps = event->event_caps;
> 
> -		list = ctx_group_list(event, ctx);
> -		list_add_tail(&event->group_entry, list);
> +		tree = ctx_group_cpu_tree(event, ctx);
> +		perf_cpu_tree_insert(tree, event);

Can we wrap this in a helper that finds the sub-tree and performs
the insert?

>  	}
> 
>  	list_update_cgroup_event(event, ctx, true);
> @@ -1680,8 +1809,12 @@ list_del_event(struct perf_event *event,
> struct perf_event_context *ctx)
> 
>  	list_del_rcu(&event->event_entry);
> 
> -	if (event->group_leader == event)
> -		list_del_init(&event->group_entry);
> +	if (event->group_leader == event) {
> +		struct rb_root *tree;
> +
> +		tree = ctx_group_cpu_tree(event, ctx);
> +		perf_cpu_tree_delete(tree, event);

... likewise?

Thanks,
Mark.


* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-15 19:56 ` Mark Rutland
@ 2017-06-15 22:10   ` Alexey Budankov
  2017-06-16  9:09     ` Mark Rutland
  2017-06-19 13:08     ` Alexey Budankov
  2017-06-19 20:31   ` Alexey Budankov
  1 sibling, 2 replies; 31+ messages in thread
From: Alexey Budankov @ 2017-06-15 22:10 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

Hi,

On 15.06.2017 22:56, Mark Rutland wrote:
> Hi,
> 
> On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
>> This series of patches continues v2 and addresses captured comments.
> 
> As a general note, please keep any change log stuff out of the commit
> message, and move that just before the diff. It generally doesn't make
> sense for that to go in the commit message.

Accepted.

> 
>> Specifically this patch replaces pinned_groups and flexible_groups
>> lists of perf_event_context by red-black cpu indexed trees avoiding
>> data structures duplication and introducing possibility to iterate
>> event groups for a specific CPU only.
> 
> If you use --per-thread, I take it the overhead is significantly
> lowered?

Please ask more.

> 
> As another general thing, please aim to write the commit message so that
> it'll make sense in N years' time, in the git history. Describe the
> whole problem, and the intended solution.
> 
> I had to go diving into mail archives to understand the problem you're
> trying to solve, and I shouldn't have to.
> 
> For example, this commit message could be something like:
> 
>      perf/core: use rb trees for pinned/flexible groups
> 
>      By default, the userspace perf tool opens per-cpu task-bound events
>      when sampling, so for N logical events requested by the user, the tool
>      will open N * NR_CPUS events.
> 
>      In the kernel, we mux events with a hrtimer, periodically rotating the
>      event list and trying to schedule each in turn. We skip events whose
>      cpu filter doesn't match. So when we get unlucky, we can walk N *
>      (NR_CPUS - 1) events pointlessly for each hrtimer invocation.
> 
>      This has been observed to result in significant overhead when running
>      the STREAM benchmark on 272 core Xeon Phi systems.
> 
>      One way to avoid this is to place our events into an rb tree sorted by
>      CPU filter, so that our hrtimer can skip to the current CPU's
>      sub-tree, and ignore everything else.
> 
>      As a step towards that, this patch replaces our event lists with rb
>      trees.
>   
> ... which hopefully makes sense now, and will in 2, 5, 10 years' time,
> 

Accepted. Thanks.

>>
>> Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
> 
> Have you thrown Vince's perf fuzzer at this?
> 
> If you haven't, please do. It can be found in the fuzzer directory of:
> 
> https://github.com/deater/perf_event_tests
> 

Accepted.

>> ---
>>   include/linux/perf_event.h |  34 +++-
>>   kernel/events/core.c       | 386
>> +++++++++++++++++++++++++++++++++------------
>>   2 files changed, 315 insertions(+), 105 deletions(-)
> 
> <changelog goes here>
> 
>>
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index 24a6358..2c1dcf1 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -574,6 +574,27 @@ struct perf_event {
>>   	struct list_head		sibling_list;
>>
>>   	/*
>> +	 * Node on the pinned or flexible tree located at the event context;
>> +	 * the node may be empty in case its event is not directly attached
>> +	 * to the tree but to group_list list of the event directly
>> +	 * attached to the tree;
>> +	 */
>> +	struct rb_node			group_node;
>> +	/*
>> +	 * List keeps groups allocated for the same cpu;
>> +	 * the list may be empty in case its event is not directly
>> +	 * attached to the tree but to group_list list of the event directly
>> +	 * attached to the tree;
>> +	 */
>> +	struct list_head		group_list;
>> +	/*
>> +	 * Entry into the group_list list above;
>> +	 * the entry may be attached to the self group_list list above
>> +	 * in case the event is directly attached to the pinned or
>> +	 * flexible tree;
>> +	 */
>> +	struct list_head		group_list_entry;
> 
> We already have group_entry for threading the RB tree. I don't see why
> we need two new list heads.

OK. For a group leader event, its group_entry may be used instead of the
new group_list_entry.

> 
>> +	/*
>>   	 * We need storage to track the entries in perf_pmu_migrate_context; we
>>   	 * cannot use the event_entry because of RCU and we want to keep the
>>   	 * group in tact which avoids us using the other two entries.
>> @@ -741,8 +762,17 @@ struct perf_event_context {
>>   	struct mutex			mutex;
>>
>>   	struct list_head		active_ctx_list;
>> -	struct list_head		pinned_groups;
>> -	struct list_head		flexible_groups;
>> +	/*
>> +	 * RB tree for pinned groups; keeps event's group_node
>> +	 * nodes of attached pinned groups;
>> +	 */
>> +	struct rb_root			pinned_tree;
>> +	/*
>> +	 * RB tree for flexible groups; keeps event's group_node
>> +	 * nodes of attached flexible groups;
>> +	 */
>> +	struct rb_root			flexible_tree;
> 
> We didn't need these comments before, and I don't think we need them
> now.
> 
> Any reason not to stick with the existing names?
> 
> The existing {pinned,flexible}_groups names are fine regardless of
> whether a list or tree is used, and makes it clear that the elements are
> the group leaders, not all events.

Yes, makes sense. Just changing the type of data structures.

> 
>> +
>>   	struct list_head		event_list;
>>   	int				nr_events;
>>   	int				nr_active;
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index bc63f8d..a3531f9 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -1458,13 +1458,142 @@ static enum event_type_t
>> get_event_type(struct perf_event *event)
>>   	return event_type;
>>   }
>>
>> -static struct list_head *
>> -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
>> +static struct rb_root *
>> +ctx_group_cpu_tree(struct perf_event *event, struct
>> perf_event_context *ctx)
>>   {
>>   	if (event->attr.pinned)
>> -		return &ctx->pinned_groups;
>> +		return &ctx->pinned_tree;
>>   	else
>> -		return &ctx->flexible_groups;
>> +		return &ctx->flexible_tree;
>> +}
>> +
>> +/*
>> + * Insert a group into a tree using event->cpu as a key. If event->cpu node
>> + * is already attached to the tree then the event is added to the attached
>> + * group's group_list list.
>> + */
>> +static void
>> +perf_cpu_tree_insert(struct rb_root *tree, struct perf_event *event)
>> +{
>> +	struct rb_node **node;
>> +	struct rb_node *parent;
>> +
>> +	WARN_ON_ONCE(!tree || !event);
>> +
>> +	node = &tree->rb_node;
>> +	parent = *node;
> 
> The first iteration of the loop handles this, so it can go.

If the tree is empty, parent will be uninitialized, which is harmful.

> 
>> +
>> +	while (*node) {
>> +		struct perf_event *node_event =	container_of(*node,
>> +				struct perf_event, group_node);
>> +
>> +		parent = *node;
>> +
>> +		if (event->cpu < node_event->cpu) {
>> +			node = &parent->rb_left;
>> +		} else if (event->cpu > node_event->cpu) {
>> +			node = &parent->rb_right;
>> +		} else {
>> +			list_add_tail(&event->group_list_entry,
>> +					&node_event->group_list);
>> +			return;
> 
> Why aren't we putting all group leaders into the tree?

Because the tree is indexed by cpu only; if two groups share a cpu, we link
the second one into the group_list of the group that is directly attached to
the tree via its group_node.

> 
> I don't think this is what Peter meant when he suggested a threaded rb
> tree.

Yes, that's right. This implements cpu-indexed rb trees with a linked list
attached to every tree node. Each list contains only the event groups
allocated for the same cpu.

> 
> My understanding was that every group leader should be in the rb tree,
> and every group leader should be in the same list that threads the whole
> tree. Both the rb tree and list have to be maintained with the same
> order.
> 
> Thus, you can walk the rb tree to find the first event in the sub-list
> that you care about, then walk that sub-list.
> 
> Note that this means we need to shuffle the list and rb tree to rotate
> events in each sub-tree.

Now we still rotate a list, but one that contains only the groups for the
appropriate cpu, and we don't touch the tree.

> 
>> +		}
>> +	}
>> +
>> +	list_add_tail(&event->group_list_entry, &event->group_list);
>> +
>> +	rb_link_node(&event->group_node, parent, node);
>> +	rb_insert_color(&event->group_node, tree);
>> +}
>> +
>> +/*
>> + * Delete a group from a tree. If the group is directly attached to
>> the tree
>> + * it also detaches all groups on the group's group_list list.
>> + */
>> +static void
>> +perf_cpu_tree_delete(struct rb_root *tree, struct perf_event *event)
>> +{
>> +	WARN_ON_ONCE(!tree || !event);
>> +
>> +	if (RB_EMPTY_NODE(&event->group_node)) {
>> +		list_del_init(&event->group_list_entry);
>> +	} else {
>> +		struct perf_event *list_event, *list_next;
>> +
>> +		rb_erase(&event->group_node, tree);
>> +		list_for_each_entry_safe(list_event, list_next,
>> +				&event->group_list, group_list_entry)
>> +			list_del_init(&list_event->group_list_entry);
> 
> At this point, all the other group leaders with the same filter are gone
> from the rb tree, since we didn't insert any of them into the rb tree in
> place of the leader we deleted.
>  >> +	}
>> +}
>> +
>> +/*
>> + * Find group_list list by a cpu key and call provided callback for every
>> + * group on the list. Iteration stops if the callback returns non zero.
>> + */
>> +
>> +typedef int (*perf_cpu_tree_callback_t)(struct perf_event *, void *);
>> +
>> +static int
>> +perf_cpu_tree_iterate_cpu(struct rb_root *tree, int cpu,
>> +		perf_cpu_tree_callback_t callback, void *data)
>> +{
>> +	int ret = 0;
>> +	struct rb_node *node;
>> +	struct perf_event *event;
>> +
>> +	WARN_ON_ONCE(!tree);
>> +
>> +	node = tree->rb_node;
>> +
>> +	while (node) {
>> +		struct perf_event *node_event = container_of(node,
>> +				struct perf_event, group_node);
>> +
>> +		if (cpu < node_event->cpu) {
>> +			node = node->rb_left;
>> +		} else if (cpu > node_event->cpu) {
>> +			node = node->rb_right;
>> +		} else {
>> +			list_for_each_entry(event, &node_event->group_list,
>> +					group_list_entry) {
>> +				ret = callback(event, data);
>> +				if (ret)
>> +					return ret;
>> +			}
>> +		}
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +/*
>> + * Iterate a tree and call provided callback for every group in the tree.
>> + * Iteration stops if the callback returns non zero.
>> + */
>> +static int
>> +perf_cpu_tree_iterate(struct rb_root *tree,
>> +		perf_cpu_tree_callback_t callback, void *data)
>> +{
>> +	int ret = 0;
>> +	struct rb_node *node;
>> +	struct perf_event *event;
>> +
>> +	WARN_ON_ONCE(!tree);
>> +
>> +	for (node = rb_first(tree); node; node = rb_next(node)) {
>> +		struct perf_event *node_event = container_of(node,
>> +				struct perf_event, group_node);
>> +
>> +		list_for_each_entry(event, &node_event->group_list,
>> +				group_list_entry) {
>> +			ret = callback(event, data);
>> +			if (ret)
>> +				return ret;
>> +		}
>> +	}
>> +
>> +	return 0;
>>   }
> 
> If you need to iterate over every event, you can use the list that
> threads the whole tree.
> 
>>
>>   /*
>> @@ -1485,12 +1614,12 @@ list_add_event(struct perf_event *event,
>> struct perf_event_context *ctx)
>>   	 * perf_group_detach can, at all times, locate all siblings.
>>   	 */
>>   	if (event->group_leader == event) {
>> -		struct list_head *list;
>> +		struct rb_root *tree;
>>
>>   		event->group_caps = event->event_caps;
>>
>> -		list = ctx_group_list(event, ctx);
>> -		list_add_tail(&event->group_entry, list);
>> +		tree = ctx_group_cpu_tree(event, ctx);
>> +		perf_cpu_tree_insert(tree, event);
> 
> Can we wrap this in a helper so that finds the sub-tree and performs
> the insert?

Ok. add_to_groups().

> 
>>   	}
>>
>>   	list_update_cgroup_event(event, ctx, true);
>> @@ -1680,8 +1809,12 @@ list_del_event(struct perf_event *event,
>> struct perf_event_context *ctx)
>>
>>   	list_del_rcu(&event->event_entry);
>>
>> -	if (event->group_leader == event)
>> -		list_del_init(&event->group_entry);
>> +	if (event->group_leader == event) {
>> +		struct rb_root *tree;
>> +
>> +		tree = ctx_group_cpu_tree(event, ctx);
>> +		perf_cpu_tree_delete(tree, event);
> 
> ... likewise?

Ok. del_from_groups().

> 
> Thanks,
> Mark.
> 

Thanks,
Alexey


* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-15 22:10   ` Alexey Budankov
@ 2017-06-16  9:09     ` Mark Rutland
  2017-06-16 14:08       ` Alexey Budankov
  2017-06-19 13:08     ` Alexey Budankov
  1 sibling, 1 reply; 31+ messages in thread
From: Mark Rutland @ 2017-06-16  9:09 UTC (permalink / raw)
  To: Alexey Budankov
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On Fri, Jun 16, 2017 at 01:10:10AM +0300, Alexey Budankov wrote:
> On 15.06.2017 22:56, Mark Rutland wrote:
> >On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
> >>This series of patches continues v2 and addresses captured comments.

> >>Specifically this patch replaces pinned_groups and flexible_groups
> >>lists of perf_event_context by red-black cpu indexed trees avoiding
> >>data structures duplication and introducing possibility to iterate
> >>event groups for a specific CPU only.
> >
> >If you use --per-thread, I take it the overhead is significantly
> >lowered?
> 
> Please ask more.

IIUC, you're seeing the slowdown when using perf record, correct?

There's a --per-thread option to ask perf record to not duplicate the
event per-cpu.

If you use that, what amount of slowdown do you see?

It might be preferable to not open task-bound per-cpu events on systems
with large cpu counts, and it would be good to know what the trade-off
looks like for this case.

> >>+static void
> >>+perf_cpu_tree_insert(struct rb_root *tree, struct perf_event *event)
> >>+{
> >>+	struct rb_node **node;
> >>+	struct rb_node *parent;
> >>+
> >>+	WARN_ON_ONCE(!tree || !event);
> >>+
> >>+	node = &tree->rb_node;
> >>+	parent = *node;
> >
> >The first iteration of the loop handles this, so it can go.
> 
> If tree is empty parent will be uninitialized what is harmful.

Sorry; my bad.

Thanks,
Mark.


* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-16  9:09     ` Mark Rutland
@ 2017-06-16 14:08       ` Alexey Budankov
  2017-06-16 14:22         ` Alexey Budankov
  0 siblings, 1 reply; 31+ messages in thread
From: Alexey Budankov @ 2017-06-16 14:08 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On 16.06.2017 12:09, Mark Rutland wrote:
> On Fri, Jun 16, 2017 at 01:10:10AM +0300, Alexey Budankov wrote:
>> On 15.06.2017 22:56, Mark Rutland wrote:
>>> On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
>>>> This series of patches continues v2 and addresses captured comments.
> 
>>>> Specifically this patch replaces pinned_groups and flexible_groups
>>>> lists of perf_event_context by red-black cpu indexed trees avoiding
>>>> data structures duplication and introducing possibility to iterate
>>>> event groups for a specific CPU only.
>>>
>>> If you use --per-thread, I take it the overhead is significantly
>>> lowered?
>>
>> Please ask more.
> 
> IIUC, you're seeing the slowdown when using perf record, correct?

Correct. Specifically in per-process mode, i.e. without the -a option.

> 
> There's a --per-thread option to ask perf record to not duplicate the
> event per-cpu.
> 
> If you use that, what amount of slowdown do you see?
> 
> It might be preferable to not open task-bound per-cpu events on systems
> with large cpu counts, and it would be good to know what the trade-off
> looks like for this case.
> 
>>>> +static void
>>>> +perf_cpu_tree_insert(struct rb_root *tree, struct perf_event *event)
>>>> +{
>>>> +	struct rb_node **node;
>>>> +	struct rb_node *parent;
>>>> +
>>>> +	WARN_ON_ONCE(!tree || !event);
>>>> +
>>>> +	node = &tree->rb_node;
>>>> +	parent = *node;
>>>
>>> The first iteration of the loop handles this, so it can go.
>>
>> If tree is empty parent will be uninitialized what is harmful.
> 
> Sorry; my bad.
> 
> Thanks,
> Mark.
> 


* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-16 14:08       ` Alexey Budankov
@ 2017-06-16 14:22         ` Alexey Budankov
  2017-06-19 12:46           ` Mark Rutland
  0 siblings, 1 reply; 31+ messages in thread
From: Alexey Budankov @ 2017-06-16 14:22 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On 16.06.2017 17:08, Alexey Budankov wrote:
> On 16.06.2017 12:09, Mark Rutland wrote:
>> On Fri, Jun 16, 2017 at 01:10:10AM +0300, Alexey Budankov wrote:
>>> On 15.06.2017 22:56, Mark Rutland wrote:
>>>> On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
>>>>> This series of patches continues v2 and addresses captured comments.
>>
>>>>> Specifically this patch replaces pinned_groups and flexible_groups
>>>>> lists of perf_event_context by red-black cpu indexed trees avoiding
>>>>> data structures duplication and introducing possibility to iterate
>>>>> event groups for a specific CPU only.
>>>>
>>>> If you use --per-thread, I take it the overhead is significantly
>>>> lowered?
>>>
>>> Please ask more.
>>
>> IIUC, you're seeing the slowdown when using perf record, correct?
> 
> Correct. Specifically in per-process mode - without -a option.
> 
>>
>> There's a --per-thread option to ask perf record to not duplicate the
>> event per-cpu.
>>
>> If you use that, what amount of slowdown do you see?

After applying all three patches:

- system-wide collection:

[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 303.795 MB perf.data (~13272985 samples) ]
2162.08user 176.24system 0:12.97elapsed 18021%CPU (0avgtext+0avgdata 
1187208maxresident)k
0inputs+622624outputs (0major+1360285minor)pagefaults 0swaps

- per-process collection:

[ perf record: Woken up 5 times to write data ]
[ perf record: Captured and wrote 1.079 MB perf.data (~47134 samples) ]
2102.39user 153.88system 0:12.78elapsed 17645%CPU (0avgtext+0avgdata 
1187156maxresident)k
0inputs+2272outputs (0major+1181660minor)pagefaults 0swaps

Elapsed times look similar. Data file sizes differ significantly.

Test script:

#!/bin/bash

echo 0 > /proc/sys/kernel/watchdog
echo 0 > /proc/sys/kernel/perf_event_paranoid
/usr/bin/time 
"/root/abudanko/vtune_amplifier_2018_zip/bin64/amplxe-perf" record 
--per-thread [-a] -N -B -T -R -d -e 
cpu/period=0x155cc0,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0x3c,in_tx=0x0,ldlat=0x0,umask=0x0,in_tx_cp=0x0,offcore_rsp=0x0/Duk,cpu/period=0x155cc0,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0x0,in_tx=0x0,ldlat=0x0,umask=0x3,in_tx_cp=0x0,offcore_rsp=0x0/Duk,cpu/period=0x155cc0,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0xc0,in_tx=0x0,ldlat=0x0,umask=0x0,in_tx_cp=0x0,offcore_rsp=0x0/Duk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0x3,in_tx=0x0,ldlat=0x0,umask=0x8,in_tx_cp=0x0,offcore_rsp=0x0/ukpp,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0x3,in_tx=0x0,ldlat=0x0,umask=0x1,in_tx_cp=0x0,offcore_rsp=0x0/ukpp,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0x4,in_tx=0x0,ldlat=0x0,umask=0x2,in_tx_cp=0x0,offcore_rsp=0x0/ukpp,cpu/period=0x186a7,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0x4,in_tx=0x0,ldlat=0x0,umask=0x4,in_tx_cp=0x0,offcore_rsp=0x0/ukpp,cpu/period=0x1e8483,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0x3c,in_tx=0x0,ldlat=0x0,umask=0x0,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x1e8483,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0xc2,in_tx=0x0,ldlat=0x0,umask=0x10,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0xca,in_tx=0x0,ldlat=0x0,umask=0x4,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0xca,in_tx=0x0,ldlat=0x0,umask=0x90,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x1e8483,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0xc2,in_tx=0x0,ldlat=0x0,umask=0x1,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0xc3,in_tx=0x0,ldlat=0x0,umask=0x4,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0x4,in_tx=0x0,ldlat=0x0,umask=0x20,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0x5,in_tx=0x0,ldlat=0x0,umask=0x3,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x1e8483,pc=0x
0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0xcd,in_tx=0x0,ldlat=0x0,umask=0x1,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0x3,in_tx=0x0,ldlat=0x0,umask=0x4,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0x86,in_tx=0x0,ldlat=0x0,umask=0x4,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0x4,in_tx=0x0,ldlat=0x0,umask=0x10,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0x4,in_tx=0x0,ldlat=0x0,umask=0x40,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0x4,in_tx=0x0,ldlat=0x0,umask=0x80,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0xc2,in_tx=0x0,ldlat=0x0,umask=0x40,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0xc2,in_tx=0x0,ldlat=0x0,umask=0x20,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0x5,in_tx=0x0,ldlat=0x0,umask=0x2,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0xe6,in_tx=0x0,ldlat=0x0,umask=0x1,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0xe7,in_tx=0x0,ldlat=0x0,umask=0x1,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0xc3,in_tx=0x0,ldlat=0x0,umask=0x1,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0xc3,in_tx=0x0,ldlat=0x0,umask=0x2,in_tx_cp=0x0,offcore_rsp=0x0/uk,cpu/period=0x30d43,pc=0x0,any=0x0,inv=0x0,edge=0x0,cmask=0x0,event=0x4,in_tx=0x0,ldlat=0x0,umask=0x1,in_tx_cp=0x0,offcore_rsp=0x0/uk 
-- ./stream

>>
>> It might be preferable to not open task-bound per-cpu events on systems
>> with large cpu counts, and it would be good to know what the trade-off
>> looks like for this case.
>>
>>>>> +static void
>>>>> +perf_cpu_tree_insert(struct rb_root *tree, struct perf_event *event)
>>>>> +{
>>>>> +    struct rb_node **node;
>>>>> +    struct rb_node *parent;
>>>>> +
>>>>> +    WARN_ON_ONCE(!tree || !event);
>>>>> +
>>>>> +    node = &tree->rb_node;
>>>>> +    parent = *node;
>>>>
>>>> The first iteration of the loop handles this, so it can go.
>>>
>>> If tree is empty parent will be uninitialized what is harmful.
>>
>> Sorry; my bad.
>>
>> Thanks,
>> Mark.
>>
> 
> 


* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-16 14:22         ` Alexey Budankov
@ 2017-06-19 12:46           ` Mark Rutland
  2017-06-19 13:38             ` Mark Rutland
  0 siblings, 1 reply; 31+ messages in thread
From: Mark Rutland @ 2017-06-19 12:46 UTC (permalink / raw)
  To: Alexey Budankov
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On Fri, Jun 16, 2017 at 05:22:29PM +0300, Alexey Budankov wrote:
> On 16.06.2017 17:08, Alexey Budankov wrote:
> >On 16.06.2017 12:09, Mark Rutland wrote:
> >>On Fri, Jun 16, 2017 at 01:10:10AM +0300, Alexey Budankov wrote:
> >>>On 15.06.2017 22:56, Mark Rutland wrote:
> >>>>On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
> >>>>>This series of patches continues v2 and addresses captured comments.
> >>
> >>>>>Specifically this patch replaces pinned_groups and flexible_groups
> >>>>>lists of perf_event_context by red-black cpu indexed trees avoiding
> >>>>>data structures duplication and introducing possibility to iterate
> >>>>>event groups for a specific CPU only.
> >>>>
> >>>>If you use --per-thread, I take it the overhead is significantly
> >>>>lowered?
> >>>
> >>>Please ask more.
> >>
> >>IIUC, you're seeing the slowdown when using perf record, correct?
> >
> >Correct. Specifically in per-process mode - without -a option.
> >
> >>
> >>There's a --per-thread option to ask perf record to not duplicate the
> >>event per-cpu.
> >>
> >>If you use that, what amount of slowdown do you see?
> 
> After applying all three patches:
> 
> - system-wide collection:
> 
> [ perf record: Woken up 1 times to write data ]
> [ perf record: Captured and wrote 303.795 MB perf.data (~13272985 samples) ]
> 2162.08user 176.24system 0:12.97elapsed 18021%CPU (0avgtext+0avgdata
> 1187208maxresident)k
> 0inputs+622624outputs (0major+1360285minor)pagefaults 0swaps
> 
> - per-process collection:
> 
> [ perf record: Woken up 5 times to write data ]
> [ perf record: Captured and wrote 1.079 MB perf.data (~47134 samples) ]
> 2102.39user 153.88system 0:12.78elapsed 17645%CPU (0avgtext+0avgdata
> 1187156maxresident)k
> 0inputs+2272outputs (0major+1181660minor)pagefaults 0swaps
> 
> Elapsed times look similar. Data file sizes differ significantly.

Interesting. I wonder if that's because we're losing samples due to
hammering the rb, or if that's a side-effect of this patch.

Does perf report describe any lost chunks?

For comparison, can you give --per-thread a go prior to these patches
being applied?

Thanks,
Mark.


* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-15 22:10   ` Alexey Budankov
  2017-06-16  9:09     ` Mark Rutland
@ 2017-06-19 13:08     ` Alexey Budankov
  2017-06-19 13:26       ` Mark Rutland
  1 sibling, 1 reply; 31+ messages in thread
From: Alexey Budankov @ 2017-06-19 13:08 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On 16.06.2017 1:10, Alexey Budankov wrote:
> Hi,
> 
> On 15.06.2017 22:56, Mark Rutland wrote:
>> Hi,
>>
>> On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
>>> This series of patches continues v2 and addresses captured comments.
>>
>> As a general note, please keep any change log stuff out of the commit
>> message, and move that just before the diff. It generally doesn't make
>> sense for that to go in the commit message.
> 
> Accepted.
> 
>>
>>> Specifically this patch replaces pinned_groups and flexible_groups
>>> lists of perf_event_context by red-black cpu indexed trees avoiding
>>> data structures duplication and introducing possibility to iterate
>>> event groups for a specific CPU only.
>>
>> If you use --per-thread, I take it the overhead is significantly
>> lowered?
> 
> Please ask more.
> 
>>
>> As another general thing, please aim to write the commit message so that
>> it'll make sense in N years' time, in the git history. Describe the
>> whole problem, and the intended solution.
>>
>> I had to go diving into mail archives to understand the problem you're
>> trying to solve, and I shouldn't have to.
>>
>> For example, this commit message could be something like:
>>
>>      perf/core: use rb trees for pinned/flexible groups
>>
>>      By default, the userspace perf tool opens per-cpu task-bound events
>>      when sampling, so for N logical events requested by the user, the 
>> tool
>>      will open N * NR_CPUS events.
>>
>>      In the kernel, we mux events with a hrtimer, periodically 
>> rotating the
>>      event list and trying to schedule each in turn. We skip events whose
>>      cpu filter doesn't match. So when we get unlucky, we can walk N *
>>      (NR_CPUS - 1) events pointlessly for each hrtimer invocation.
>>
>>      This has been observed to result in significant overhead when 
>> running
>>      the STREAM benchmark on 272 core Xeon Phi systems.
>>
>>      One way to avoid this is to place our events into an rb tree 
>> sorted by
>>      CPU filter, so that our hrtimer can skip to the current CPU's
>>      sub-tree, and ignore everything else.
>>
>>      As a step towards that, this patch replaces our event lists with rb
>>      trees.
>> ... which hopefully makes sense now, and will in 2, 5, 10 years' time,
>>
> 
> Accepted. Thanks.
> 
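
To put rough numbers on the description above (an illustrative estimate
only): with NR_CPUS = 272 on the Xeon Phi machine in question, each logical
event requested by the user becomes 272 task-bound kernel events, so an
unlucky hrtimer rotation on any one CPU can walk N * 271 group leaders
whose CPU filter can never match there before reaching the N that can.
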
>>>
>>> Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
>>
>> Have you thrown Vince's perf fuzzer at this?
>>
>> If you haven't, please do. It can be found in the fuzzer directory of:
>>
>> https://github.com/deater/perf_event_tests
>>
> 
> Accepted.

I ran the test suite and it revealed no additional regressions in 
comparison to what I have on the clean kernel.

However the fuzzer constantly reports some strange stacks that are not 
seen on the clean kernel and I have no idea how that might be caused by 
the patches.

> 
>>> ---
>>>   include/linux/perf_event.h |  34 +++-
>>>   kernel/events/core.c       | 386
>>> +++++++++++++++++++++++++++++++++------------
>>>   2 files changed, 315 insertions(+), 105 deletions(-)
>>
>> <changelog goes here>
>>
>>>
>>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>>> index 24a6358..2c1dcf1 100644
>>> --- a/include/linux/perf_event.h
>>> +++ b/include/linux/perf_event.h
>>> @@ -574,6 +574,27 @@ struct perf_event {
>>>       struct list_head        sibling_list;
>>>
>>>       /*
>>> +     * Node on the pinned or flexible tree located at the event 
>>> context;
>>> +     * the node may be empty in case its event is not directly attached
>>> +     * to the tree but to group_list list of the event directly
>>> +     * attached to the tree;
>>> +     */
>>> +    struct rb_node            group_node;
>>> +    /*
>>> +     * List keeps groups allocated for the same cpu;
>>> +     * the list may be empty in case its event is not directly
>>> +     * attached to the tree but to group_list list of the event 
>>> directly
>>> +     * attached to the tree;
>>> +     */
>>> +    struct list_head        group_list;
>>> +    /*
>>> +     * Entry into the group_list list above;
>>> +     * the entry may be attached to the self group_list list above
>>> +     * in case the event is directly attached to the pinned or
>>> +     * flexible tree;
>>> +     */
>>> +    struct list_head        group_list_entry;
>>
>> We already have group_entry for threading the RB tree. I don't see why
>> we need two new lists heads.
> 
> Ok. For a group leader event its group_entry may be used instead of new 
> group_list_entry.
> 
>>
>>> +    /*
>>>        * We need storage to track the entries in 
>>> perf_pmu_migrate_context; we
>>>        * cannot use the event_entry because of RCU and we want to 
>>> keep the
>>>        * group in tact which avoids us using the other two entries.
>>> @@ -741,8 +762,17 @@ struct perf_event_context {
>>>       struct mutex            mutex;
>>>
>>>       struct list_head        active_ctx_list;
>>> -    struct list_head        pinned_groups;
>>> -    struct list_head        flexible_groups;
>>> +    /*
>>> +     * RB tree for pinned groups; keeps event's group_node
>>> +     * nodes of attached pinned groups;
>>> +     */
>>> +    struct rb_root            pinned_tree;
>>> +    /*
>>> +     * RB tree for flexible groups; keeps event's group_node
>>> +     * nodes of attached flexible groups;
>>> +     */
>>> +    struct rb_root            flexible_tree;
>>
>> We didn't need these comments before, and I don't think we need them
>> now.
>>
>> Any reason not to stick with the existing names?
>>
>> The existing {pinned,flexible}_groups names are fine regardless of
>> whether a list or tree is used, and makes it clear that the elements are
>> the group leaders, not all events.
> 
> Yes, makes sense. Just changing the type of data structures.
> 
>>
>>> +
>>>       struct list_head        event_list;
>>>       int                nr_events;
>>>       int                nr_active;
>>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>>> index bc63f8d..a3531f9 100644
>>> --- a/kernel/events/core.c
>>> +++ b/kernel/events/core.c
>>> @@ -1458,13 +1458,142 @@ static enum event_type_t
>>> get_event_type(struct perf_event *event)
>>>       return event_type;
>>>   }
>>>
>>> -static struct list_head *
>>> -ctx_group_list(struct perf_event *event, struct perf_event_context 
>>> *ctx)
>>> +static struct rb_root *
>>> +ctx_group_cpu_tree(struct perf_event *event, struct
>>> perf_event_context *ctx)
>>>   {
>>>       if (event->attr.pinned)
>>> -        return &ctx->pinned_groups;
>>> +        return &ctx->pinned_tree;
>>>       else
>>> -        return &ctx->flexible_groups;
>>> +        return &ctx->flexible_tree;
>>> +}
>>> +
>>> +/*
>>> + * Insert a group into a tree using event->cpu as a key. If 
>>> event->cpu node
>>> + * is already attached to the tree then the event is added to the 
>>> attached
>>> + * group's group_list list.
>>> + */
>>> +static void
>>> +perf_cpu_tree_insert(struct rb_root *tree, struct perf_event *event)
>>> +{
>>> +    struct rb_node **node;
>>> +    struct rb_node *parent;
>>> +
>>> +    WARN_ON_ONCE(!tree || !event);
>>> +
>>> +    node = &tree->rb_node;
>>> +    parent = *node;
>>
>> The first iteration of the loop handles this, so it can go.
> 
> If the tree is empty, parent will be uninitialized, which is harmful.
> 
>>
>>> +
>>> +    while (*node) {
>>> +        struct perf_event *node_event =    container_of(*node,
>>> +                struct perf_event, group_node);
>>> +
>>> +        parent = *node;
>>> +
>>> +        if (event->cpu < node_event->cpu) {
>>> +            node = &parent->rb_left;
>>> +        } else if (event->cpu > node_event->cpu) {
>>> +            node = &parent->rb_right;
>>> +        } else {
>>> +            list_add_tail(&event->group_list_entry,
>>> +                    &node_event->group_list);
>>> +            return;
>>
>> Why aren't we putting all group leaders into the tree?
> 
> Because we index a tree by cpu only and if two groups share cpu we link 
> them into the group_list of the group directly attached to the tree via 
> group_node.
> 
>>
>> I don't think this is what Peter meant when he suggested a threaded rb
>> tree.
> 
> Yes, that's right. This implements cpu indexed rb trees with linked 
> lists attached to every tree's node. The lists contain event groups 
> allocated for the same cpu only.
> 
>>
>> My understanding was that every group leader should be in the rb tree,
>> and every group leader should be in the same list that threads the whole
>> tree. Both the rb tree and list have to be maintained with the same
>> order.
>>
>> Thus, you can walk the rb tree to find the first event in the sub-list
>> that you care about, then walk that sub-list.
>>
>> Note that this means we need to shuffle the list and rb tree to rotate
>> events in each sub-tree.
> 
> Now we also rotate a list but only containing groups for the appropriate 
> cpu and we don't touch the tree.
> 
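
As an illustration of that rotation scheme (a sketch only, with a made-up
helper name and no locking shown): rotating the groups for one CPU comes
down to finding that CPU's node and rotating its list, leaving the tree
itself untouched.

static void rotate_cpu_groups_sketch(struct rb_root *tree, int cpu)
{
    struct rb_node *node = tree->rb_node;

    while (node) {
        struct perf_event *entry = container_of(node,
                struct perf_event, group_node);

        if (cpu < entry->cpu)
            node = node->rb_left;
        else if (cpu > entry->cpu)
            node = node->rb_right;
        else {
            /* move the first group to the tail; the tree is not touched */
            list_rotate_left(&entry->group_list);
            return;
        }
    }
}
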
>>
>>> +        }
>>> +    }
>>> +
>>> +    list_add_tail(&event->group_list_entry, &event->group_list);
>>> +
>>> +    rb_link_node(&event->group_node, parent, node);
>>> +    rb_insert_color(&event->group_node, tree);
>>> +}
>>> +
>>> +/*
>>> + * Delete a group from a tree. If the group is directly attached to
>>> the tree
>>> + * it also detaches all groups on the group's group_list list.
>>> + */
>>> +static void
>>> +perf_cpu_tree_delete(struct rb_root *tree, struct perf_event *event)
>>> +{
>>> +    WARN_ON_ONCE(!tree || !event);
>>> +
>>> +    if (RB_EMPTY_NODE(&event->group_node)) {
>>> +        list_del_init(&event->group_list_entry);
>>> +    } else {
>>> +        struct perf_event *list_event, *list_next;
>>> +
>>> +        rb_erase(&event->group_node, tree);
>>> +        list_for_each_entry_safe(list_event, list_next,
>>> +                &event->group_list, group_list_entry)
>>> +            list_del_init(&list_event->group_list_entry);
>>
>> At this point, all the other group leaders with the same filter are gone
>> from the rb tree, since we didn't insert any of them into the rb tree in
>> place of the leader we deleted.
>>
>>> +    }
>>> +}
>>> +
>>> +/*
>>> + * Find group_list list by a cpu key and call provided callback for 
>>> every
>>> + * group on the list. Iteration stops if the callback returns non zero.
>>> + */
>>> +
>>> +typedef int (*perf_cpu_tree_callback_t)(struct perf_event *, void *);
>>> +
>>> +static int
>>> +perf_cpu_tree_iterate_cpu(struct rb_root *tree, int cpu,
>>> +        perf_cpu_tree_callback_t callback, void *data)
>>> +{
>>> +    int ret = 0;
>>> +    struct rb_node *node;
>>> +    struct perf_event *event;
>>> +
>>> +    WARN_ON_ONCE(!tree);
>>> +
>>> +    node = tree->rb_node;
>>> +
>>> +    while (node) {
>>> +        struct perf_event *node_event = container_of(node,
>>> +                struct perf_event, group_node);
>>> +
>>> +        if (cpu < node_event->cpu) {
>>> +            node = node->rb_left;
>>> +        } else if (cpu > node_event->cpu) {
>>> +            node = node->rb_right;
>>> +        } else {
>>> +            list_for_each_entry(event, &node_event->group_list,
>>> +                    group_list_entry) {
>>> +                ret = callback(event, data);
>>> +                if (ret)
>>> +                    return ret;
>>> +            }
>>> +        }
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +/*
>>> + * Iterate a tree and call provided callback for every group in the 
>>> tree.
>>> + * Iteration stops if the callback returns non zero.
>>> + */
>>> +static int
>>> +perf_cpu_tree_iterate(struct rb_root *tree,
>>> +        perf_cpu_tree_callback_t callback, void *data)
>>> +{
>>> +    int ret = 0;
>>> +    struct rb_node *node;
>>> +    struct perf_event *event;
>>> +
>>> +    WARN_ON_ONCE(!tree);
>>> +
>>> +    for (node = rb_first(tree); node; node = rb_next(node)) {
>>> +        struct perf_event *node_event = container_of(node,
>>> +                struct perf_event, group_node);
>>> +
>>> +        list_for_each_entry(event, &node_event->group_list,
>>> +                group_list_entry) {
>>> +            ret = callback(event, data);
>>> +            if (ret)
>>> +                return ret;
>>> +        }
>>> +    }
>>> +
>>> +    return 0;
>>>   }
>>
>> If you need to iterate over every event, you can use the list that
>> threads the whole tree.
>>
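
For comparison, a sketch of the iteration described here (it assumes a
hypothetical list that threads every group leader of the context in the
same order as the tree, e.g. by reusing event->group_entry; this is not
what the patch currently implements):

static int perf_groups_iterate_sketch(struct list_head *groups,
        perf_cpu_tree_callback_t callback, void *data)
{
    struct perf_event *event;
    int ret;

    /* 'groups' threads all group leaders in tree order */
    list_for_each_entry(event, groups, group_entry) {
        ret = callback(event, data);
        if (ret)
            return ret;
    }

    return 0;
}
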
>>>
>>>   /*
>>> @@ -1485,12 +1614,12 @@ list_add_event(struct perf_event *event,
>>> struct perf_event_context *ctx)
>>>        * perf_group_detach can, at all times, locate all siblings.
>>>        */
>>>       if (event->group_leader == event) {
>>> -        struct list_head *list;
>>> +        struct rb_root *tree;
>>>
>>>           event->group_caps = event->event_caps;
>>>
>>> -        list = ctx_group_list(event, ctx);
>>> -        list_add_tail(&event->group_entry, list);
>>> +        tree = ctx_group_cpu_tree(event, ctx);
>>> +        perf_cpu_tree_insert(tree, event);
>>
>> Can we wrap this in a helper so that finds the sub-tree and performs
>> the insert?
> 
> Ok. add_to_groups().
> 
>>
>>>       }
>>>
>>>       list_update_cgroup_event(event, ctx, true);
>>> @@ -1680,8 +1809,12 @@ list_del_event(struct perf_event *event,
>>> struct perf_event_context *ctx)
>>>
>>>       list_del_rcu(&event->event_entry);
>>>
>>> -    if (event->group_leader == event)
>>> -        list_del_init(&event->group_entry);
>>> +    if (event->group_leader == event) {
>>> +        struct rb_root *tree;
>>> +
>>> +        tree = ctx_group_cpu_tree(event, ctx);
>>> +        perf_cpu_tree_delete(tree, event);
>>
>> ... likewise?
> 
> Ok. del_from_groups().
> 
>>
>> Thanks,
>> Mark.
>>
> 
> Thanks,
> Alexey
> 

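Regarding the perf_cpu_tree_delete() concern above (other same-cpu leaders
vanishing from the tree when the carrier is removed), one possible shape is
to promote the next leader on the list into the tree instead of dropping
the whole list. A rough, untested sketch under this patch's data layout,
assuming every event's group_list is initialized:

static void cpu_tree_delete_sketch(struct rb_root *tree,
        struct perf_event *event)
{
    if (RB_EMPTY_NODE(&event->group_node)) {
        /* not the carrier of the tree node: just leave the cpu list */
        list_del_init(&event->group_list_entry);
        return;
    }

    list_del_init(&event->group_list_entry);

    if (list_empty(&event->group_list)) {
        /* last leader for this cpu */
        rb_erase(&event->group_node, tree);
    } else {
        /* promote the next same-cpu leader into the tree */
        struct perf_event *next = list_first_entry(&event->group_list,
                struct perf_event, group_list_entry);

        list_splice_init(&event->group_list, &next->group_list);
        rb_replace_node(&event->group_node, &next->group_node, tree);
    }

    RB_CLEAR_NODE(&event->group_node);
}
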
^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 13:08     ` Alexey Budankov
@ 2017-06-19 13:26       ` Mark Rutland
  2017-06-19 13:37         ` Alexey Budankov
  0 siblings, 1 reply; 31+ messages in thread
From: Mark Rutland @ 2017-06-19 13:26 UTC (permalink / raw)
  To: Alexey Budankov
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On Mon, Jun 19, 2017 at 04:08:32PM +0300, Alexey Budankov wrote:
> On 16.06.2017 1:10, Alexey Budankov wrote:
> >On 15.06.2017 22:56, Mark Rutland wrote:
> >>On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
> >>>This series of patches continues v2 and addresses captured comments.

> >>>Specifically this patch replaces pinned_groups and flexible_groups
> >>>lists of perf_event_context by red-black cpu indexed trees avoiding
> >>>data structures duplication and introducing possibility to iterate
> >>>event groups for a specific CPU only.

> >>Have you thrown Vince's perf fuzzer at this?
> >>
> >>If you haven't, please do. It can be found in the fuzzer directory of:
> >>
> >>https://github.com/deater/perf_event_tests
> >
> >Accepted.
> 
> I ran the test suite and it revealed no additional regressions in
> comparison to what I have on the clean kernel.
> 
> However the fuzzer constantly reports some strange stacks that are
> not seen on the clean kernel and I have no idea how that might be
> caused by the patches.

Ok; that was the kind of thing I was concerned about.

When you say "strange stacks", what do you mean exactly?

I take it the kernel spewing backtraces in dmesg?

Can you dump those?

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 13:26       ` Mark Rutland
@ 2017-06-19 13:37         ` Alexey Budankov
  2017-06-19 15:00           ` Alexey Budankov
                             ` (2 more replies)
  0 siblings, 3 replies; 31+ messages in thread
From: Alexey Budankov @ 2017-06-19 13:37 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On 19.06.2017 16:26, Mark Rutland wrote:
> On Mon, Jun 19, 2017 at 04:08:32PM +0300, Alexey Budankov wrote:
>> On 16.06.2017 1:10, Alexey Budankov wrote:
>>> On 15.06.2017 22:56, Mark Rutland wrote:
>>>> On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
>>>>> This series of patches continues v2 and addresses captured comments.
> 
>>>>> Specifically this patch replaces pinned_groups and flexible_groups
>>>>> lists of perf_event_context by red-black cpu indexed trees avoiding
>>>>> data structures duplication and introducing possibility to iterate
>>>>> event groups for a specific CPU only.
> 
>>>> Have you thrown Vince's perf fuzzer at this?
>>>>
>>>> If you haven't, please do. It can be found in the fuzzer directory of:
>>>>
>>>> https://github.com/deater/perf_event_tests
>>>
>>> Accepted.
>>
>> I ran the test suite and it revealed no additional regressions in
>> comparison to what I have on the clean kernel.
>>
>> However the fuzzer constantly reports some strange stacks that are
>> not seen on the clean kernel and I have no idea how that might be
>> caused by the patches.
> 
> Ok; that was the kind of thing I was concerned about.
> 
> When you say "strange stacks", what do you mean exactly?
> 
> I take it the kernel spewing backtraces in dmesg?
> 
> Can you dump those?

Here it is:

list_del corruption. prev->next should be ffff88c2c4654010, but was 
ffff88c31eb0c020
[  607.632813] ------------[ cut here ]------------
[  607.632816] kernel BUG at lib/list_debug.c:53!
[  607.632825] invalid opcode: 0000 [#1] SMP
[  607.632898] Modules linked in: fuse xt_CHECKSUM iptable_mangle 
ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat 
nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack libcrc32c tun 
bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables nfsv3 
rpcsec_gss_krb5 nfsv4 arc4 md4 nls_utf8 cifs nfs ccm dns_resolver 
fscache rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi 
scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp 
ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm 
intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp hfi1 coretemp 
crct10dif_pclmul crc32_pclmul ghash_clmulni_intel intel_cstate iTCO_wdt 
intel_uncore iTCO_vendor_support joydev rdmavt intel_rapl_perf i2c_i801 
ib_core ipmi_ssif mei_me mei ipmi_si ipmi_devintf tpm_tis
[  607.633954]  lpc_ich pcspkr ipmi_msghandler acpi_pad tpm_tis_core 
shpchp tpm wmi acpi_power_meter nfsd auth_rpcgss nfs_acl lockd grace 
sunrpc mgag200 drm_kms_helper ttm drm igb crc32c_intel ptp pps_core dca 
i2c_algo_bit
[  607.634262] CPU: 271 PID: 28944 Comm: perf_fuzzer Not tainted 
4.12.0-rc4+ #22
[  607.634363] Hardware name: Intel Corporation S7200AP/S7200AP, BIOS 
S72C610.86B.01.01.0190.080520162104 08/05/2016
[  607.634505] task: ffff88c2d5714000 task.stack: ffffa6f9352c8000
[  607.634597] RIP: 0010:__list_del_entry_valid+0x7b/0x90
[  607.634670] RSP: 0000:ffffa6f9352cbad0 EFLAGS: 00010046
[  607.634746] RAX: 0000000000000054 RBX: ffff88c2c4654000 RCX: 
0000000000000000
[  607.634845] RDX: 0000000000000000 RSI: ffff88c33fdce168 RDI: 
ffff88c33fdce168
[  607.634944] RBP: ffffa6f9352cbad0 R08: 00000000fffffffe R09: 
0000000000000600
[  607.635042] R10: 0000000000000005 R11: 0000000000000000 R12: 
ffff88c2e71ab200
[  607.635140] R13: ffff88c2c4654010 R14: 0000000000000001 R15: 
0000000000000001
[  607.635240] FS:  0000000000000000(0000) GS:ffff88c33fdc0000(0000) 
knlGS:0000000000000000
[  607.635351] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  607.635431] CR2: 00000000026be1e4 CR3: 0000002488e09000 CR4: 
00000000001407e0
[  607.635531] Call Trace:
[  607.635583]  list_del_event+0x1d7/0x210
[  607.635646]  ? perf_cgroup_attach+0x70/0x70
[  607.635711]  __perf_remove_from_context+0x3e/0x90
[  607.635783]  ? event_sched_out.isra.90+0x300/0x300
[  607.635854]  event_function_call+0xbf/0xf0
[  607.635918]  ? event_sched_out.isra.90+0x300/0x300
[  607.635991]  perf_remove_from_context+0x25/0x70
[  607.636060]  perf_event_release_kernel+0xda/0x250
[  607.636132]  ? __dentry_kill+0x10e/0x160
[  607.636192]  perf_release+0x10/0x20
[  607.636249]  __fput+0xdf/0x1e0
[  607.636299]  ____fput+0xe/0x10
[  607.636350]  task_work_run+0x83/0xb0
[  607.636408]  do_exit+0x2bc/0xbc0
[  607.636460]  ? page_add_file_rmap+0xaf/0x200
[  607.636526]  ? alloc_set_pte+0x115/0x4f0
[  607.636587]  do_group_exit+0x3f/0xb0
[  607.636644]  get_signal+0x1cc/0x5c0
[  607.636703]  do_signal+0x37/0x6a0
[  607.636758]  ? __perf_sw_event+0x4f/0x80
[  607.636821]  ? __do_page_fault+0x2e1/0x4d0
[  607.636885]  exit_to_usermode_loop+0x4c/0x92
[  607.636953]  prepare_exit_to_usermode+0x40/0x50
[  607.637023]  retint_user+0x8/0x13
[  607.640312] RIP: 0033:0x40f4a9
[  607.643500] RSP: 002b:00007ffc62d00668 EFLAGS: 00000206 ORIG_RAX: 
ffffffffffffff02
[  607.646678] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 
000000000001100a
[  607.649777] RDX: 00007fd5dd25bae0 RSI: 00007fd5dd259760 RDI: 
00007fd5dd25a640
[  607.652791] RBP: 00007ffc62d00680 R08: 00007fd5dd45e740 R09: 
0000000000000000
[  607.655709] R10: 00007fd5dd45ea10 R11: 0000000000000246 R12: 
0000000000401980
[  607.658530] R13: 00007ffc62d02a80 R14: 0000000000000000 R15: 
0000000000000000
[  607.661249] Code: e8 3a f7 d8 ff 0f 0b 48 89 fe 31 c0 48 c7 c7 20 66 
cb ac e8 27 f7 d8 ff 0f 0b 48 89 fe 31 c0 48 c7 c7 e0 65 cb ac e8 14 f7 
d8 ff <0f> 0b 48 89 fe 31 c0 48 c7 c7 a8 65 cb ac e8 01 f7 d8 ff 0f 0b
[  607.666819] RIP: __list_del_entry_valid+0x7b/0x90 RSP: ffffa6f9352cbad0
[  607.683316] ---[ end trace 34244c35550e0713 ]---
[  607.691830] Fixing recursive fault but reboot is needed!

2:

[  467.942059] unchecked MSR access error: WRMSR to 0x711 (tried to 
write 0x00000000e8cc0055) at rIP: 0xffffffffac05fbd4 
(native_write_msr+0x4/0x30)
[  467.942068] Call Trace:
[  467.942073]  <IRQ>
[  467.942094]  ? snbep_uncore_msr_enable_event+0x54/0x60 [intel_uncore]
[  467.942107]  uncore_pmu_event_start+0x9b/0x100 [intel_uncore]
[  467.942119]  uncore_pmu_event_add+0x235/0x3a0 [intel_uncore]
[  467.942126]  ? sched_clock+0xb/0x10
[  467.942132]  ? sched_clock_cpu+0x11/0xb0
[  467.942140]  event_sched_in.isra.100+0xdf/0x250
[  467.942145]  sched_in_group+0x210/0x390
[  467.942150]  ? sched_in_group+0x390/0x390
[  467.942155]  group_sched_in_flexible_callback+0x17/0x20
[  467.942160]  perf_cpu_tree_iterate+0x45/0x75
[  467.942165]  ctx_sched_in+0x97/0x110
[  467.942169]  perf_event_sched_in+0x77/0x80
[  467.942174]  ctx_resched+0x69/0xb0
[  467.942179]  __perf_event_enable+0x208/0x250
[  467.942184]  event_function+0x93/0xe0
[  467.942188]  remote_function+0x3b/0x50
[  467.942194]  flush_smp_call_function_queue+0x71/0x120
[  467.942200]  generic_smp_call_function_single_interrupt+0x13/0x30
[  467.942206]  smp_call_function_single_interrupt+0x27/0x40
[  467.942212]  call_function_single_interrupt+0x93/0xa0
[  467.942217] RIP: 0010:native_restore_fl+0x6/0x10
[  467.942220] RSP: 0000:ffff88c33ba03e00 EFLAGS: 00000206 ORIG_RAX: 
ffffffffffffff04
[  467.942224] RAX: ffff88c33c8dca08 RBX: ffff88c33c8dc928 RCX: 
0000000000000017
[  467.942227] RDX: 0000000000000000 RSI: 0000000000000206 RDI: 
0000000000000206
[  467.942229] RBP: ffff88c33ba03e00 R08: 0000000000000001 R09: 
0000000000007151
[  467.942232] R10: 0000000000000000 R11: 000000000000005d R12: 
ffff88c33c8dca08
[  467.942234] R13: ffff88c33c8dc140 R14: 0000000000000001 R15: 
00000000000001d8
[  467.942241]  _raw_spin_unlock_irqrestore+0x16/0x20
[  467.942245]  update_blocked_averages+0x2cf/0x4a0
[  467.942251]  rebalance_domains+0x4b/0x2b0
[  467.942256]  run_rebalance_domains+0x1d7/0x210
[  467.942260]  __do_softirq+0xd1/0x27f
[  467.942267]  irq_exit+0xe9/0x100
[  467.942271]  scheduler_ipi+0x8f/0x140
[  467.942275]  smp_reschedule_interrupt+0x29/0x30
[  467.942280]  reschedule_interrupt+0x93/0xa0
[  467.942284] RIP: 0010:native_safe_halt+0x6/0x10
[  467.942286] RSP: 0000:fffffffface03de0 EFLAGS: 00000246 ORIG_RAX: 
ffffffffffffff02
[  467.942291] RAX: 0000000000000000 RBX: fffffffface104c0 RCX: 
0000000000000000
[  467.942293] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 
0000000000000000
[  467.942295] RBP: fffffffface03de0 R08: 0000006d2c5efae3 R09: 
0000000000000000
[  467.942298] R10: 0000000000000201 R11: 0000000000000930 R12: 
0000000000000000
[  467.942300] R13: fffffffface104c0 R14: 0000000000000000 R15: 
0000000000000000
[  467.942302]  </IRQ>
[  467.942308]  default_idle+0x20/0x100
[  467.942313]  arch_cpu_idle+0xf/0x20
[  467.942317]  default_idle_call+0x2c/0x40
[  467.942321]  do_idle+0x158/0x1e0
[  467.942325]  cpu_startup_entry+0x71/0x80
[  467.942330]  rest_init+0x77/0x80
[  467.942338]  start_kernel+0x4a7/0x4c8
[  467.942342]  ? set_init_arg+0x5a/0x5a
[  467.942348]  ? early_idt_handler_array+0x120/0x120
[  467.942352]  x86_64_start_reservations+0x29/0x2b
[  467.942357]  x86_64_start_kernel+0x151/0x174
[  467.942363]  secondary_startup_64+0x9f/0x9f


> 
> Thanks,
> Mark.
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 12:46           ` Mark Rutland
@ 2017-06-19 13:38             ` Mark Rutland
  2017-06-19 14:09               ` Alexey Budankov
  2017-06-19 14:59               ` Andi Kleen
  0 siblings, 2 replies; 31+ messages in thread
From: Mark Rutland @ 2017-06-19 13:38 UTC (permalink / raw)
  To: Alexey Budankov
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On Mon, Jun 19, 2017 at 01:46:39PM +0100, Mark Rutland wrote:
> On Fri, Jun 16, 2017 at 05:22:29PM +0300, Alexey Budankov wrote:
> > On 16.06.2017 17:08, Alexey Budankov wrote:
> > >On 16.06.2017 12:09, Mark Rutland wrote:
> > >>There's a --per-thread option to ask perf record to not duplicate the
> > >>event per-cpu.
> > >>
> > >>If you use that, what amount of slowdown do you see?
> > 
> > After applying all three patches:
> > 
> > - system-wide collection:
> > 
> > [ perf record: Woken up 1 times to write data ]
> > [ perf record: Captured and wrote 303.795 MB perf.data (~13272985 samples) ]
> > 2162.08user 176.24system 0:12.97elapsed 18021%CPU (0avgtext+0avgdata
> > 1187208maxresident)k
> > 0inputs+622624outputs (0major+1360285minor)pagefaults 0swaps
> > 
> > - per-process collection:
> > 
> > [ perf record: Woken up 5 times to write data ]
> > [ perf record: Captured and wrote 1.079 MB perf.data (~47134 samples) ]
> > 2102.39user 153.88system 0:12.78elapsed 17645%CPU (0avgtext+0avgdata
> > 1187156maxresident)k
> > 0inputs+2272outputs (0major+1181660minor)pagefaults 0swaps
> > 
> > Elapsed times look similar. Data file sizes differ significantly.
> 
> Interesting. I wonder if that's because we're losing samples due to
> hammering the rb, or if that's a side-effect of this patch.
> 
> Does perf report describe any lost chunks?
> 
> For comparison, can you give --per-thread a go prior to these patches
> being applied?

FWIW, I had a go with (an old) perf record on an arm64 system using
--per-thread, and I see that no samples are recorded, which seems like a
bug.

With --per-thread, the slowdown was ~20%, whereas with the defaults it
was > 400%.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 13:38             ` Mark Rutland
@ 2017-06-19 14:09               ` Alexey Budankov
  2017-06-19 14:59               ` Andi Kleen
  1 sibling, 0 replies; 31+ messages in thread
From: Alexey Budankov @ 2017-06-19 14:09 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On 19.06.2017 16:38, Mark Rutland wrote:
> On Mon, Jun 19, 2017 at 01:46:39PM +0100, Mark Rutland wrote:
>> On Fri, Jun 16, 2017 at 05:22:29PM +0300, Alexey Budankov wrote:
>>> On 16.06.2017 17:08, Alexey Budankov wrote:
>>>> On 16.06.2017 12:09, Mark Rutland wrote:
>>>>> There's a --per-thread option to ask perf record to not duplicate the
>>>>> event per-cpu.
>>>>>
>>>>> If you use that, what amount of slowdown do you see?
>>>
>>> After applying all three patches:
>>>
>>> - system-wide collection:
>>>
>>> [ perf record: Woken up 1 times to write data ]
>>> [ perf record: Captured and wrote 303.795 MB perf.data (~13272985 samples) ]
>>> 2162.08user 176.24system 0:12.97elapsed 18021%CPU (0avgtext+0avgdata
>>> 1187208maxresident)k
>>> 0inputs+622624outputs (0major+1360285minor)pagefaults 0swaps
>>>
>>> - per-process collection:
>>>
>>> [ perf record: Woken up 5 times to write data ]
>>> [ perf record: Captured and wrote 1.079 MB perf.data (~47134 samples) ]
>>> 2102.39user 153.88system 0:12.78elapsed 17645%CPU (0avgtext+0avgdata
>>> 1187156maxresident)k
>>> 0inputs+2272outputs (0major+1181660minor)pagefaults 0swaps
>>>
>>> Elapsed times look similar. Data file sizes differ significantly.
>>
>> Interesting. I wonder if that's because we're losing samples due to
>> hammering the rb, or if that's a side-effect of this patch.
>>
>> Does perf report describe any lost chunks?
>>
>> For comparison, can you give --per-thread a go prior to these patches
>> being applied?
> 
> FWIW, I had a go with (an old) perf record on an arm64 system using
> --per-thread, and I see that no samples are recorded, which seems like a
> bug.
> 
> With --per-thread, the slowdown was ~20%, whereas with the defaults it
> was > 400%.

That looks similar to what I am observing in per-process single-thread 
profiling: >4x slowdown.

> 
> Thanks,
> Mark.
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 13:38             ` Mark Rutland
  2017-06-19 14:09               ` Alexey Budankov
@ 2017-06-19 14:59               ` Andi Kleen
  2017-06-19 15:09                 ` Mark Rutland
  1 sibling, 1 reply; 31+ messages in thread
From: Andi Kleen @ 2017-06-19 14:59 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Alexey Budankov, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Kan Liang,
	Dmitri Prokhorov, Valery Cherepennikov, David Carrillo-Cisneros,
	Stephane Eranian, linux-kernel

> > For comparison, can you give --per-thread a go prior to these patches
> > being applied?
> 
> FWIW, I had a go with (an old) perf record on an arm64 system using
> --per-thread, and I see that no samples are recorded, which seems like a
> bug.
> 
> > With --per-thread, the slowdown was ~20%, whereas with the defaults it
> was > 400%.

I'm not sure what the point of the experiment is? It has to work
with reasonable overhead even without --per-thread.

FWIW Alexey already root caused the problem, so there's no need
to restart the debugging.

-Andi

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 13:37         ` Alexey Budankov
@ 2017-06-19 15:00           ` Alexey Budankov
  2017-06-19 15:24             ` Andi Kleen
  2017-06-30 10:21             ` Alexey Budankov
  2017-06-19 15:14           ` Mark Rutland
  2017-06-30 10:21           ` Alexey Budankov
  2 siblings, 2 replies; 31+ messages in thread
From: Alexey Budankov @ 2017-06-19 15:00 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On 19.06.2017 16:37, Alexey Budankov wrote:
> On 19.06.2017 16:26, Mark Rutland wrote:
>> On Mon, Jun 19, 2017 at 04:08:32PM +0300, Alexey Budankov wrote:
>>> On 16.06.2017 1:10, Alexey Budankov wrote:
>>>> On 15.06.2017 22:56, Mark Rutland wrote:
>>>>> On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
>>>>>> This series of patches continues v2 and addresses captured comments.
>>
>>>>>> Specifically this patch replaces pinned_groups and flexible_groups
>>>>>> lists of perf_event_context by red-black cpu indexed trees avoiding
>>>>>> data structures duplication and introducing possibility to iterate
>>>>>> event groups for a specific CPU only.
>>
>>>>> Have you thrown Vince's perf fuzzer at this?
>>>>>
>>>>> If you haven't, please do. It can be found in the fuzzer directory of:
>>>>>
>>>>> https://github.com/deater/perf_event_tests
>>>>
>>>> Accepted.
>>>
>>> I ran the test suite and it revealed no additional regressions in
>>> comparison to what I have on the clean kernel.
>>>
>>> However the fuzzer constantly reports some strange stacks that are
>>> not seen on the clean kernel and I have no idea how that might be
>>> caused by the patches.
>>
>> Ok; that was the kind of thing I was concerned about.
>>
>> When you say "strange stacks", what do you mean exactly?
>>
>> I take it the kernel spewing backtraces in dmesg?
>>
>> Can you dump those?
> 
> Here it is:
> 
> list_del corruption. prev->next should be ffff88c2c4654010, but was 
> ffff88c31eb0c020
> [  607.632813] ------------[ cut here ]------------
> [  607.632816] kernel BUG at lib/list_debug.c:53!
> 

One more:

[  484.804717] ------------[ cut here ]------------
[  484.804737] WARNING: CPU: 15 PID: 31168 at 
arch/x86/events/core.c:1257 x86_pmu_start+0x8f/0xb0
[  484.804739] Modules linked in: btrfs xor raid6_pq ufs hfsplus hfs 
minix vfat msdos fat jfs xfs reiserfs binfmt_misc xt_CHECKSUM 
iptable_mangle fuse ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat 
nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack 
nf_conntrack libcrc32c tun bridge stp llc ebtable_filter ebtables 
ip6table_filter nfsv3 ip6_tables arc4 md4 nls_utf8 rpcsec_gss_krb5 cifs 
nfsv4 nfs ccm dns_resolver fscache rpcrdma ib_isert iscsi_target_mod 
ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ib_srp 
scsi_transport_srp ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm 
ib_cm iw_cm intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp 
hfi1 coretemp crct10dif_pclmul crc32_pclmul rdmavt ghash_clmulni_intel 
intel_cstate joydev iTCO_wdt intel_uncore ib_core iTCO_vendor_support
[  484.804868]  tpm_tis intel_rapl_perf ipmi_ssif tpm_tis_core ipmi_si 
mei_me tpm ipmi_devintf shpchp acpi_pad lpc_ich ipmi_msghandler pcspkr 
mei i2c_i801 nfsd acpi_power_meter wmi auth_rpcgss nfs_acl lockd grace 
sunrpc mgag200 drm_kms_helper ttm drm igb crc32c_intel ptp pps_core dca 
i2c_algo_bit
[  484.804927] CPU: 15 PID: 31168 Comm: perf_fuzzer Tainted: G        W 
      4.12.0-rc4+ #24
[  484.804930] Hardware name: Intel Corporation S7200AP/S7200AP, BIOS 
S72C610.86B.01.01.0190.080520162104 08/05/2016
[  484.804933] task: ffff8c4699c10000 task.stack: ffff9ad8b5900000
[  484.804938] RIP: 0010:x86_pmu_start+0x8f/0xb0
[  484.804941] RSP: 0000:ffff8c46fbdc3e58 EFLAGS: 00010046
[  484.804945] RAX: 0000000000000000 RBX: ffff8c468068f000 RCX: 
0000000000000002
[  484.804948] RDX: 000000000000000e RSI: 0000000000000002 RDI: 
ffff8c468068f000
[  484.804950] RBP: ffff8c46fbdc3e70 R08: 0000000000000000 R09: 
00000000000004c1
[  484.804952] R10: ffff8c46fbdcaaa0 R11: ffff8c46fbdc3b58 R12: 
ffff8c46fbdca380
[  484.804955] R13: 0000000000000000 R14: ffff8c46fbdca5a4 R15: 
0000000000000001
[  484.804958] FS:  00007f1625c4c740(0000) GS:ffff8c46fbdc0000(0000) 
knlGS:0000000000000000
[  484.804961] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  484.804964] CR2: 000000000061b218 CR3: 0000002f2184f000 CR4: 
00000000001407e0
[  484.804967] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
0000000000000000
[  484.804969] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 
0000000000000600
[  484.804971] Call Trace:
[  484.804976]  <IRQ>
[  484.804984]  x86_pmu_enable+0x27f/0x2f0
[  484.804992]  perf_pmu_enable+0x22/0x30
[  484.804997]  ctx_resched+0x72/0xb0
[  484.805003]  __perf_event_enable+0x208/0x250
[  484.805008]  event_function+0x93/0xe0
[  484.805012]  remote_function+0x3b/0x50
[  484.805018]  flush_smp_call_function_queue+0x71/0x120
[  484.805024]  generic_smp_call_function_single_interrupt+0x13/0x30
[  484.805030]  smp_trace_call_function_single_interrupt+0x2d/0xd0
[  484.805036]  trace_call_function_single_interrupt+0x93/0xa0
[  484.805040] RIP: 0033:0x40f4a9
[  484.805042] RSP: 002b:00007ffe26fde668 EFLAGS: 00000202 ORIG_RAX: 
ffffffffffffff04
[  484.805047] RAX: 00000000000028a1 RBX: 0000000000000000 RCX: 
0000000000067f43
[  484.805049] RDX: 00007f1625a49ae0 RSI: 00007f1625a47760 RDI: 
00007f1625a48640
[  484.805052] RBP: 00007ffe26fde680 R08: 00007f1625c4c740 R09: 
0000000000000000
[  484.805054] R10: 00007ffe26fde3f0 R11: 0000000000000246 R12: 
0000000000401980
[  484.805056] R13: 00007ffe26fe0a80 R14: 0000000000000000 R15: 
0000000000000000
[  484.805058]  </IRQ>
[  484.805062] Code: 0f ab 02 49 81 c4 08 02 00 00 49 0f ab 04 24 48 89 
df ff 15 9c ca 03 01 48 89 df e8 0c 05 1b 00 5b 41 5c 41 5d 5d c3 0f ff 
eb f5 <0f> ff eb f1 0f ff 66 66 2e 0f 1f 84 00 00 00 00 00 eb a0 66 66
[  484.805161] ---[ end trace dcc4ce8c0a8781e1 ]---
[  508.895507] NMI watchdog: enabled on all CPUs, permanently consumes 
one hw-PMU counter.


>>
>> Thanks,
>> Mark.
>>
> 
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 14:59               ` Andi Kleen
@ 2017-06-19 15:09                 ` Mark Rutland
  2017-06-19 15:21                   ` Andi Kleen
  0 siblings, 1 reply; 31+ messages in thread
From: Mark Rutland @ 2017-06-19 15:09 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Alexey Budankov, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Kan Liang,
	Dmitri Prokhorov, Valery Cherepennikov, David Carrillo-Cisneros,
	Stephane Eranian, linux-kernel

On Mon, Jun 19, 2017 at 07:59:08AM -0700, Andi Kleen wrote:
> > > For comparison, can you give --per-thread a go prior to these patches
> > > being applied?
> > 
> > FWIW, I had a go with (an old) perf record on an arm64 system using
> > --per-thread, and I see that no samples are recorded, which seems like a
> > bug.
> > 
> > > With --per-thread, the slowdown was ~20%, whereas with the defaults it
> > was > 400%.
> 
> I'm not sure what the point of the experiment is? It has to work
> with reasonable overhead even without --per-thread.
> 
> FWIW Alexey already root caused the problem, so there's no need
> to restart the debugging.

Sure; we understand where that overhead is coming from, we have an idea
as to how to mitigate that, and we should try to make that work if we
can.

I was trying to get a feel for how that compares to what we can do
today. For other reasons (e.g. fd exhaustion), opening NR_CPUS * n
events might not be a great idea on systems with a huge number of CPUs.
We might want a heuristic in the perf tool regardless.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 13:37         ` Alexey Budankov
  2017-06-19 15:00           ` Alexey Budankov
@ 2017-06-19 15:14           ` Mark Rutland
  2017-06-19 15:27             ` Alexey Budankov
  2017-06-30 10:21           ` Alexey Budankov
  2 siblings, 1 reply; 31+ messages in thread
From: Mark Rutland @ 2017-06-19 15:14 UTC (permalink / raw)
  To: Alexey Budankov
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On Mon, Jun 19, 2017 at 04:37:41PM +0300, Alexey Budankov wrote:
> On 19.06.2017 16:26, Mark Rutland wrote:
> >On Mon, Jun 19, 2017 at 04:08:32PM +0300, Alexey Budankov wrote:
> >>On 16.06.2017 1:10, Alexey Budankov wrote:
> >>>On 15.06.2017 22:56, Mark Rutland wrote:
> >>>>On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
> >>>>>This series of patches continues v2 and addresses captured comments.
> >
> >>>>>Specifically this patch replaces pinned_groups and flexible_groups
> >>>>>lists of perf_event_context by red-black cpu indexed trees avoiding
> >>>>>data structures duplication and introducing possibility to iterate
> >>>>>event groups for a specific CPU only.
> >
> >>>>Have you thrown Vince's perf fuzzer at this?
> >>>>
> >>>>If you haven't, please do. It can be found in the fuzzer directory of:
> >>>>
> >>>>https://github.com/deater/perf_event_tests
> >>>
> >>>Accepted.
> >>
> >>I ran the test suite and it revealed no additional regressions in
> >>comparison to what I have on the clean kernel.
> >>
> >>However the fuzzer constantly reports some strange stacks that are
> >>not seen on the clean kernel and I have no idea how that might be
> >>caused by the patches.
> >
> >Ok; that was the kind of thing I was concerned about.
> >
> >When you say "strange stacks", what do you mean exactly?
> >
> >I take it the kernel spewing backtraces in dmesg?
> >
> >Can you dump those?
> 
> Here it is:
> 
> list_del corruption. prev->next should be ffff88c2c4654010, but was
> ffff88c31eb0c020
> [  607.632813] ------------[ cut here ]------------
> [  607.632816] kernel BUG at lib/list_debug.c:53!

> [  607.635531] Call Trace:
> [  607.635583]  list_del_event+0x1d7/0x210

Given this patch changes how list_{del,add}_event() works, it's possible
that this is a new bug.

I was going to try to test this on arm64, but I couldn't get the patch
to apply. I had a go with v4.12-rc5, tip/perf/core, and tip/perf/urgent.

Which branch should I be using as the base?

Thanks,
Mark

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 15:09                 ` Mark Rutland
@ 2017-06-19 15:21                   ` Andi Kleen
  2017-06-19 15:24                     ` Mark Rutland
  0 siblings, 1 reply; 31+ messages in thread
From: Andi Kleen @ 2017-06-19 15:21 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Alexey Budankov, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Kan Liang,
	Dmitri Prokhorov, Valery Cherepennikov, David Carrillo-Cisneros,
	Stephane Eranian, linux-kernel

> I was trying to get a feel for how that compares to what we can do
> today. For other reasons (e.g. fd exhaustion), opening NR_CPUS * n

You just have to increase the fd limit. The 1024 fd default is just
archaic for larger systems and doesn't really make any sense because
it only controls very small amounts of kernel memory.

> events might not be a great idea on systems with a huge number of CPUs.
> We might want a heuristic in the perf tool regardless.

But there's no alternative: we have to measure all CPUs with all events.

-andi

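For instance (a userspace sketch; the numbers are arbitrary), a tool that
knows it is about to open many events can raise its own soft limit up to
the hard limit with setrlimit() instead of relying on the 1024 default:

#include <sys/resource.h>

static int raise_fd_limit(rlim_t wanted)
{
    struct rlimit rl;

    if (getrlimit(RLIMIT_NOFILE, &rl))
        return -1;

    if (rl.rlim_cur < wanted) {
        /* raise the soft limit, capped at the hard limit */
        rl.rlim_cur = wanted < rl.rlim_max ? wanted : rl.rlim_max;
        if (setrlimit(RLIMIT_NOFILE, &rl))
            return -1;
    }

    return 0;
}
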
^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 15:21                   ` Andi Kleen
@ 2017-06-19 15:24                     ` Mark Rutland
  2017-06-19 15:39                       ` Andi Kleen
  0 siblings, 1 reply; 31+ messages in thread
From: Mark Rutland @ 2017-06-19 15:24 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Alexey Budankov, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Kan Liang,
	Dmitri Prokhorov, Valery Cherepennikov, David Carrillo-Cisneros,
	Stephane Eranian, linux-kernel

On Mon, Jun 19, 2017 at 08:21:51AM -0700, Andi Kleen wrote:
> > I was trying to get a feel for how that compares to what we can do
> > today. For other reasons (e.g. fd exhaustion), opening NR_CPUS * n
> 
> You just have to increase the fd limit. The 1024 fd default is just
> archaic for larger systems and doesn't really make any sense because
> it only controls very small amounts of kernel memory.
> 
> > events might not be a great idea on systems with a huge number of CPUs.
> > We might want a heuristic in the perf tool regardless.
> 
> But there's no alternative: we have to measure all CPUs with all events.

You can measure the process on all CPUs by using 1 event without a CPU
filter, rather than NR_CPUS events.

Thanks,
Mark.

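To make the two attachment modes concrete, a userspace sketch (error
handling omitted): pid with cpu == -1 gives one event that follows the
task on any CPU, roughly what perf record --per-thread asks for, while the
default is one task-bound event per CPU.

#include <unistd.h>
#include <sys/types.h>
#include <asm/unistd.h>
#include <linux/perf_event.h>

static int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
        int cpu, int group_fd, unsigned long flags)
{
    return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

/* one event following the task across all CPUs */
static int open_task_any_cpu(struct perf_event_attr *attr, pid_t pid)
{
    return sys_perf_event_open(attr, pid, -1, -1, 0);
}

/* the tool's default: one task-bound event per CPU */
static void open_task_per_cpu(struct perf_event_attr *attr, pid_t pid,
        int nr_cpus, int *fds)
{
    int cpu;

    for (cpu = 0; cpu < nr_cpus; cpu++)
        fds[cpu] = sys_perf_event_open(attr, pid, cpu, -1, 0);
}
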
^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 15:00           ` Alexey Budankov
@ 2017-06-19 15:24             ` Andi Kleen
  2017-06-19 15:34               ` Alexey Budankov
  2017-06-30 10:21             ` Alexey Budankov
  1 sibling, 1 reply; 31+ messages in thread
From: Andi Kleen @ 2017-06-19 15:24 UTC (permalink / raw)
  To: Alexey Budankov
  Cc: Mark Rutland, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Kan Liang,
	Dmitri Prokhorov, Valery Cherepennikov, David Carrillo-Cisneros,
	Stephane Eranian, linux-kernel

> One more:

It may help if you run with KASAN enabled. That often
finds problems earlier. 

Also could use ftrace and use ftrace_dump_on_oops to 
see the sequence (but that has a lot of overhead
and sometimes hides problems)

-Andi

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 15:14           ` Mark Rutland
@ 2017-06-19 15:27             ` Alexey Budankov
  0 siblings, 0 replies; 31+ messages in thread
From: Alexey Budankov @ 2017-06-19 15:27 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On 19.06.2017 18:14, Mark Rutland wrote:
> On Mon, Jun 19, 2017 at 04:37:41PM +0300, Alexey Budankov wrote:
>> On 19.06.2017 16:26, Mark Rutland wrote:
>>> On Mon, Jun 19, 2017 at 04:08:32PM +0300, Alexey Budankov wrote:
>>>> On 16.06.2017 1:10, Alexey Budankov wrote:
>>>>> On 15.06.2017 22:56, Mark Rutland wrote:
>>>>>> On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
>>>>>>> This series of patches continues v2 and addresses captured comments.
>>>
>>>>>>> Specifically this patch replaces pinned_groups and flexible_groups
>>>>>>> lists of perf_event_context by red-black cpu indexed trees avoiding
>>>>>>> data structures duplication and introducing possibility to iterate
>>>>>>> event groups for a specific CPU only.
>>>
>>>>>> Have you thrown Vince's perf fuzzer at this?
>>>>>>
>>>>>> If you haven't, please do. It can be found in the fuzzer directory of:
>>>>>>
>>>>>> https://github.com/deater/perf_event_tests
>>>>>
>>>>> Accepted.
>>>>
>>>> I ran the test suite and it revealed no additional regressions in
>>>> comparison to what I have on the clean kernel.
>>>>
>>>> However the fuzzer constantly reports some strange stacks that are
>>>> not seen on the clean kernel and I have no idea how that might be
>>>> caused by the patches.
>>>
>>> Ok; that was the kind of thing I was concerned about.
>>>
>>> When you say "strange stacks", what do you mean exactly?
>>>
>>> I take it the kernel spewing backtraces in dmesg?
>>>
>>> Can you dump those?
>>
>> Here it is:
>>
>> list_del corruption. prev->next should be ffff88c2c4654010, but was
>> ffff88c31eb0c020
>> [  607.632813] ------------[ cut here ]------------
>> [  607.632816] kernel BUG at lib/list_debug.c:53!
> 
>> [  607.635531] Call Trace:
>> [  607.635583]  list_del_event+0x1d7/0x210
> 
> Given this patch changes how list_{del,add}_event() works, it's possible
> that this is a new bug.
> 
> I was going to try to test this on arm64, but I couldn't get the patch
> to apply. I had a go with v4.12-rc5, tip/perf/core, and tip/perf/urgent.
> 
> Which branch should I be using as the base?

v4.12-rc4+,
perf/core d0fabd1 [origin/perf/core] perf/core: Remove unused 
perf_cgroup_event_cgrp_time() function

> 
> Thanks,
> Mark
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 15:24             ` Andi Kleen
@ 2017-06-19 15:34               ` Alexey Budankov
  2017-06-30 10:23                 ` Alexey Budankov
  0 siblings, 1 reply; 31+ messages in thread
From: Alexey Budankov @ 2017-06-19 15:34 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Mark Rutland, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Kan Liang,
	Dmitri Prokhorov, Valery Cherepennikov, David Carrillo-Cisneros,
	Stephane Eranian, linux-kernel

On 19.06.2017 18:24, Andi Kleen wrote:
>> One more:
> 
> It may help if you run with KASAN enabled. That often
> finds problems earlier.
> 
> You could also use ftrace with ftrace_dump_on_oops to
> see the sequence (but that has a lot of overhead
> and sometimes hides problems).

Ok. Will try. Thanks.

> 
> -Andi
> 
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 15:24                     ` Mark Rutland
@ 2017-06-19 15:39                       ` Andi Kleen
  2017-06-19 15:52                         ` Mark Rutland
  0 siblings, 1 reply; 31+ messages in thread
From: Andi Kleen @ 2017-06-19 15:39 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Alexey Budankov, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Kan Liang,
	Dmitri Prokhorov, Valery Cherepennikov, David Carrillo-Cisneros,
	Stephane Eranian, linux-kernel

On Mon, Jun 19, 2017 at 04:24:01PM +0100, Mark Rutland wrote:
> On Mon, Jun 19, 2017 at 08:21:51AM -0700, Andi Kleen wrote:
> > > I was trying to get a feel for how that compares to what we can do
> > > today. For other reasons (e.g. fd exhaustion), opening NR_CPUS * n
> > 
> > You just have to increase the fd limit. The 1024 fd default is just
> > archaic for larger systems and doesn't really make any sense because
> > it only controls very small amounts of kernel memory.
> > 
> > > events might not be a great idea on systems with a huge number of CPUs.
> > > We might want a heuristic in the perf tool regardless.
> > 
> > But there's no alternative: we have to measure all CPUs with all events.
> 
> You can measure the process on all CPUs by using 1 event without a CPU
> filter, rather than NR_CPUS events.

That wouldn't measure all threads, at least not with current perf core.

-Andi
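
For readers following the thread, the two attachment modes being contrasted here map onto perf_event_open() arguments roughly as in the sketch below. This is only an illustrative userspace sketch (attr setup and error handling trimmed); open_counter() and open_per_cpu() are helper names invented for the illustration, not code from the thread or from the perf tool.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <string.h>

/* glibc provides no wrapper for perf_event_open(). */
static int open_counter(pid_t pid, int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	return syscall(__NR_perf_event_open, &attr, pid, cpu, -1, 0);
}

/*
 * What perf does today for per-process sampling: one event per CPU,
 * so N logical events become N * NR_CPUS file descriptors.
 */
static void open_per_cpu(pid_t pid, int *fds, int nr_cpus)
{
	int cpu;

	for (cpu = 0; cpu < nr_cpus; cpu++)
		fds[cpu] = open_counter(pid, cpu);
}

/*
 * The alternative mentioned above: a single task-bound event with
 * cpu == -1 follows the task on any CPU and avoids the fd blow-up,
 * but, as noted in the reply, it does not cover all threads with the
 * current perf core:
 *
 *	fd = open_counter(pid, -1);
 */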

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 15:39                       ` Andi Kleen
@ 2017-06-19 15:52                         ` Mark Rutland
  0 siblings, 0 replies; 31+ messages in thread
From: Mark Rutland @ 2017-06-19 15:52 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Alexey Budankov, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Kan Liang,
	Dmitri Prokhorov, Valery Cherepennikov, David Carrillo-Cisneros,
	Stephane Eranian, linux-kernel

On Mon, Jun 19, 2017 at 08:39:18AM -0700, Andi Kleen wrote:
> On Mon, Jun 19, 2017 at 04:24:01PM +0100, Mark Rutland wrote:
> > On Mon, Jun 19, 2017 at 08:21:51AM -0700, Andi Kleen wrote:
> > > > I was trying to get a feel for how that compares to what we can do
> > > > today. For other reasons (e.g. fd exhaustion), opening NR_CPUS * n
> > > 
> > > You just have to increase the fd limit. The 1024 fd default is just
> > > archaic for larger systems and doesn't really make any sense because
> > > it only controls very small amounts of kernel memory.
> > > 
> > > > events might not be a great idea on systems with a huge number of CPUs.
> > > > We might want a heuristic in the perf tool regardless.
> > > 
> > > But there's no alternative: we have to measure all CPUs with all events.
> > 
> > You can measure the process on all CPUs by using 1 event without a CPU
> > filter, rather than NR_CPUS events.
> 
> That wouldn't measure all threads, at least not with current perf core.

Ah; I missed the constraint imposed by perf_mmap().

For some reason I thought that was enforced by userspace only.

Sorry for the noise.

Mark.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-15 19:56 ` Mark Rutland
  2017-06-15 22:10   ` Alexey Budankov
@ 2017-06-19 20:31   ` Alexey Budankov
  2017-06-20 13:36     ` Mark Rutland
  1 sibling, 1 reply; 31+ messages in thread
From: Alexey Budankov @ 2017-06-19 20:31 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On 15.06.2017 22:56, Mark Rutland wrote:
> Hi,
> 
> On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
>> This series of patches continues v2 and addresses captured comments.
> 
> As a general note, please keep any change log stuff out of the commit
> message, and move that just before the diff. It generally doesn't make
> sense for that to go in the commit message.
> 
>> Specifically this patch replaces pinned_groups and flexible_groups
>> lists of perf_event_context by red-black cpu indexed trees avoiding
>> data structures duplication and introducing possibility to iterate
>> event groups for a specific CPU only.
> 
> If you use --per-thread, I take it the overhead is significantly
> lowered?
> 
> As another general thing, please aim to write the commit message so that
> it'll make sense in N years' time, in the git history. Describe the
> whole problem, and the intended solution.
> 
> I had to go diving into mail archives to understand the problem you're
> trying to solve, and I shouldn't have to.
> 
> For example, this commit message could be something like:
> 
>      perf/core: use rb trees for pinned/flexible groups
> 
>      By default, the userspace perf tool opens per-cpu task-bound events
>      when sampling, so for N logical events requested by the user, the tool
>      will open N * NR_CPUS events.
> 
>      In the kernel, we mux events with a hrtimer, periodically rotating the
>      event list and trying to schedule each in turn. We skip events whose
>      cpu filter doesn't match. So when we get unlucky, we can walk N *
>      (NR_CPUS - 1) events pointlessly for each hrtimer invocation.
> 
>      This has been observed to result in significant overhead when running
>      the STREAM benchmark on 272 core Xeon Phi systems.
> 
>      One way to avoid this is to place our events into an rb tree sorted by
>      CPU filter, so that our hrtimer can skip to the current CPU's
>      sub-tree, and ignore everything else.
> 
>      As a step towards that, this patch replaces our event lists with rb
>      trees.
>   
> ... which hopefully makes sense now, and will in 2, 5, 10 years' time.
> 
>>
>> Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
> 
> Have you thrown Vince's perf fuzzer at this?
> 
> If you haven't, please do. It can be found in the fuzzer directory of:
> 
> https://github.com/deater/perf_event_tests
> 
>> ---
>>   include/linux/perf_event.h |  34 +++-
>>   kernel/events/core.c       | 386
>> +++++++++++++++++++++++++++++++++------------
>>   2 files changed, 315 insertions(+), 105 deletions(-)
> 
> <changelog goes here>
> 
>>
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index 24a6358..2c1dcf1 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -574,6 +574,27 @@ struct perf_event {
>>   	struct list_head		sibling_list;
>>
>>   	/*
>> +	 * Node on the pinned or flexible tree located at the event context;
>> +	 * the node may be empty in case its event is not directly attached
>> +	 * to the tree but to group_list list of the event directly
>> +	 * attached to the tree;
>> +	 */
>> +	struct rb_node			group_node;
>> +	/*
>> +	 * List keeps groups allocated for the same cpu;
>> +	 * the list may be empty in case its event is not directly
>> +	 * attached to the tree but to group_list list of the event directly
>> +	 * attached to the tree;
>> +	 */
>> +	struct list_head		group_list;
>> +	/*
>> +	 * Entry into the group_list list above;
>> +	 * the entry may be attached to the self group_list list above
>> +	 * in case the event is directly attached to the pinned or
>> +	 * flexible tree;
>> +	 */
>> +	struct list_head		group_list_entry;
> 
> We already have group_entry for threading the RB tree. I don't see why
> we need two new list heads.
> 
>> +	/*
>>   	 * We need storage to track the entries in perf_pmu_migrate_context; we
>>   	 * cannot use the event_entry because of RCU and we want to keep the
>>   	 * group in tact which avoids us using the other two entries.
>> @@ -741,8 +762,17 @@ struct perf_event_context {
>>   	struct mutex			mutex;
>>
>>   	struct list_head		active_ctx_list;
>> -	struct list_head		pinned_groups;
>> -	struct list_head		flexible_groups;
>> +	/*
>> +	 * RB tree for pinned groups; keeps event's group_node
>> +	 * nodes of attached pinned groups;
>> +	 */
>> +	struct rb_root			pinned_tree;
>> +	/*
>> +	 * RB tree for flexible groups; keeps event's group_node
>> +	 * nodes of attached flexible groups;
>> +	 */
>> +	struct rb_root			flexible_tree;
> 
> We didn't need these comments before, and I don't think we need them
> now.
> 
> Any reason not to stick with the existing names?
> 
> The existing {pinned,flexible}_groups names are fine regardless of
> whether a list or tree is used, and makes it clear that the elements are
> the group leaders, not all events.
> 
>> +
>>   	struct list_head		event_list;
>>   	int				nr_events;
>>   	int				nr_active;
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index bc63f8d..a3531f9 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -1458,13 +1458,142 @@ static enum event_type_t
>> get_event_type(struct perf_event *event)
>>   	return event_type;
>>   }
>>
>> -static struct list_head *
>> -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
>> +static struct rb_root *
>> +ctx_group_cpu_tree(struct perf_event *event, struct
>> perf_event_context *ctx)
>>   {
>>   	if (event->attr.pinned)
>> -		return &ctx->pinned_groups;
>> +		return &ctx->pinned_tree;
>>   	else
>> -		return &ctx->flexible_groups;
>> +		return &ctx->flexible_tree;
>> +}
>> +
>> +/*
>> + * Insert a group into a tree using event->cpu as a key. If event->cpu node
>> + * is already attached to the tree then the event is added to the attached
>> + * group's group_list list.
>> + */
>> +static void
>> +perf_cpu_tree_insert(struct rb_root *tree, struct perf_event *event)
>> +{
>> +	struct rb_node **node;
>> +	struct rb_node *parent;
>> +
>> +	WARN_ON_ONCE(!tree || !event);
>> +
>> +	node = &tree->rb_node;
>> +	parent = *node;
> 
> The first iteration of the loop handles this, so it can go.
> 
>> +
>> +	while (*node) {
>> +		struct perf_event *node_event =	container_of(*node,
>> +				struct perf_event, group_node);
>> +
>> +		parent = *node;
>> +
>> +		if (event->cpu < node_event->cpu) {
>> +			node = &parent->rb_left;
>> +		} else if (event->cpu > node_event->cpu) {
>> +			node = &parent->rb_right;
>> +		} else {
>> +			list_add_tail(&event->group_list_entry,
>> +					&node_event->group_list);
>> +			return;
> 
> Why aren't we putting all group leaders into the tree?
> 
> I don't think this is what Peter meant when he suggested a threaded rb
> tree.
> 
> My understanding was that every group leader should be in the rb tree,
> and every group leader should be in the same list that threads the whole
> tree. Both the rb tree and list have to be maintained with the same
> order.
> 
> Thus, you can walk the rb tree to find the first event in the sub-list
> that you care about, then walk that sub-list.
> 
> Note that this means we need to shuffle the list and rb tree to rotate
> events in each sub-tree.
> 
>> +		}
>> +	}
>> +
>> +	list_add_tail(&event->group_list_entry, &event->group_list);
>> +
>> +	rb_link_node(&event->group_node, parent, node);
>> +	rb_insert_color(&event->group_node, tree);
>> +}
>> +
>> +/*
>> + * Delete a group from a tree. If the group is directly attached to
>> the tree
>> + * it also detaches all groups on the group's group_list list.
>> + */
>> +static void
>> +perf_cpu_tree_delete(struct rb_root *tree, struct perf_event *event)
>> +{
>> +	WARN_ON_ONCE(!tree || !event);
>> +
>> +	if (RB_EMPTY_NODE(&event->group_node)) {
>> +		list_del_init(&event->group_list_entry);
>> +	} else {
>> +		struct perf_event *list_event, *list_next;
>> +
>> +		rb_erase(&event->group_node, tree);
>> +		list_for_each_entry_safe(list_event, list_next,
>> +				&event->group_list, group_list_entry)
>> +			list_del_init(&list_event->group_list_entry);
> 
> At this point, all the other group leaders with the same filter are gone
> from the rb tree, since we didn't insert any of them into the rb tree in
> place of the leader we deleted.
> 
>> +	}
>> +}
>> +
>> +/*
>> + * Find group_list list by a cpu key and call provided callback for every
>> + * group on the list. Iteration stops if the callback returns non zero.
>> + */
>> +
>> +typedef int (*perf_cpu_tree_callback_t)(struct perf_event *, void *);
>> +
>> +static int
>> +perf_cpu_tree_iterate_cpu(struct rb_root *tree, int cpu,
>> +		perf_cpu_tree_callback_t callback, void *data)
>> +{
>> +	int ret = 0;
>> +	struct rb_node *node;
>> +	struct perf_event *event;
>> +
>> +	WARN_ON_ONCE(!tree);
>> +
>> +	node = tree->rb_node;
>> +
>> +	while (node) {
>> +		struct perf_event *node_event = container_of(node,
>> +				struct perf_event, group_node);
>> +
>> +		if (cpu < node_event->cpu) {
>> +			node = node->rb_left;
>> +		} else if (cpu > node_event->cpu) {
>> +			node = node->rb_right;
>> +		} else {
>> +			list_for_each_entry(event, &node_event->group_list,
>> +					group_list_entry) {
>> +				ret = callback(event, data);
>> +				if (ret)
>> +					return ret;
>> +			}
>> +		}
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +/*
>> + * Iterate a tree and call provided callback for every group in the tree.
>> + * Iteration stops if the callback returns non zero.
>> + */
>> +static int
>> +perf_cpu_tree_iterate(struct rb_root *tree,
>> +		perf_cpu_tree_callback_t callback, void *data)
>> +{
>> +	int ret = 0;
>> +	struct rb_node *node;
>> +	struct perf_event *event;
>> +
>> +	WARN_ON_ONCE(!tree);
>> +
>> +	for (node = rb_first(tree); node; node = rb_next(node)) {
>> +		struct perf_event *node_event = container_of(node,
>> +				struct perf_event, group_node);
>> +
>> +		list_for_each_entry(event, &node_event->group_list,
>> +				group_list_entry) {
>> +			ret = callback(event, data);
>> +			if (ret)
>> +				return ret;
>> +		}
>> +	}
>> +
>> +	return 0;
>>   }
> 
> If you need to iterate over every event, you can use the list that
> threads the whole tree.

Could you please explain more on that?

> 
>>
>>   /*
>> @@ -1485,12 +1614,12 @@ list_add_event(struct perf_event *event,
>> struct perf_event_context *ctx)
>>   	 * perf_group_detach can, at all times, locate all siblings.
>>   	 */
>>   	if (event->group_leader == event) {
>> -		struct list_head *list;
>> +		struct rb_root *tree;
>>
>>   		event->group_caps = event->event_caps;
>>
>> -		list = ctx_group_list(event, ctx);
>> -		list_add_tail(&event->group_entry, list);
>> +		tree = ctx_group_cpu_tree(event, ctx);
>> +		perf_cpu_tree_insert(tree, event);
> 
> Can we wrap this in a helper that finds the sub-tree and performs
> the insert?
> 
>>   	}
>>
>>   	list_update_cgroup_event(event, ctx, true);
>> @@ -1680,8 +1809,12 @@ list_del_event(struct perf_event *event,
>> struct perf_event_context *ctx)
>>
>>   	list_del_rcu(&event->event_entry);
>>
>> -	if (event->group_leader == event)
>> -		list_del_init(&event->group_entry);
>> +	if (event->group_leader == event) {
>> +		struct rb_root *tree;
>> +
>> +		tree = ctx_group_cpu_tree(event, ctx);
>> +		perf_cpu_tree_delete(tree, event);
> 
> ... likewise?
> 
> Thanks,
> Mark.
> 
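
To make the multiplexing cost described in the suggested commit message above concrete, here is a much-simplified sketch of the pre-patch behaviour. It is an illustration only: the cpu check stands in for the real event_filter_match(), and schedule_group() is a hypothetical placeholder for the actual group scheduling path, not a kernel function.

static void flexible_sched_in_flat(struct perf_event_context *ctx, int cpu)
{
	struct perf_event *event;

	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
		/* cpu filter, simplified from event_filter_match() */
		if (event->cpu != -1 && event->cpu != cpu)
			continue;	/* up to N * (NR_CPUS - 1) skips per tick */

		schedule_group(event);	/* hypothetical stand-in */
	}
}

With the groups keyed by event->cpu in an rb tree, the scheduling path can instead descend straight to the current CPU's sub-tree and ignore everything else, which is the point of the patch.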

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 20:31   ` Alexey Budankov
@ 2017-06-20 13:36     ` Mark Rutland
  2017-06-20 15:22       ` Alexey Budankov
  0 siblings, 1 reply; 31+ messages in thread
From: Mark Rutland @ 2017-06-20 13:36 UTC (permalink / raw)
  To: Alexey Budankov
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On Mon, Jun 19, 2017 at 11:31:59PM +0300, Alexey Budankov wrote:
> On 15.06.2017 22:56, Mark Rutland wrote:
> >On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
> >>+static int
> >>+perf_cpu_tree_iterate(struct rb_root *tree,
> >>+		perf_cpu_tree_callback_t callback, void *data)
> >>+{
> >>+	int ret = 0;
> >>+	struct rb_node *node;
> >>+	struct perf_event *event;
> >>+
> >>+	WARN_ON_ONCE(!tree);
> >>+
> >>+	for (node = rb_first(tree); node; node = rb_next(node)) {
> >>+		struct perf_event *node_event = container_of(node,
> >>+				struct perf_event, group_node);
> >>+
> >>+		list_for_each_entry(event, &node_event->group_list,
> >>+				group_list_entry) {
> >>+			ret = callback(event, data);
> >>+			if (ret)
> >>+				return ret;
> >>+		}
> >>+	}
> >>+
> >>+	return 0;
> >>  }
> >
> >If you need to iterate over every event, you can use the list that
> >threads the whole tree.
> 
> Could you please explain more on that?

In Peter's original suggestion, we'd use a threaded tree rather than a
tree of lists.

i.e. you'd have something like:

struct threaded_rb_node {
	struct rb_node   node;
	struct list_head head;
};

... with the tree and list covering all nodes, in the same order:

Tree:

     3
    / \
   /   \
  1     5
 / \   / \
0   2 4   6

List:

0 - 1 - 2 - 3 - 4 - 5 - 6

... that way you can search using the tree, and iterate using the list,
even when you want to iterate over sub-lists.

Thanks,
Mark.
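
For reference, a minimal sketch of such a threaded insert, keyed by cpu the way the patch keys its trees. This is an illustration under the assumptions above: the cpu field in threaded_rb_node and the context-wide list head passed as @all are placeholders for this sketch, not code from the patch.

#include <linux/rbtree.h>
#include <linux/list.h>

struct threaded_rb_node {
	struct rb_node		node;
	struct list_head	head;
	int			cpu;	/* key, mirroring event->cpu */
};

/*
 * Insert @new into @tree ordered by ->cpu and keep @all, a list that
 * threads every node of the tree, in exactly the same order, so a
 * context-wide walk can use the list and a per-cpu walk can use the tree.
 */
static void threaded_rb_insert(struct rb_root *tree, struct list_head *all,
			       struct threaded_rb_node *new)
{
	struct rb_node **link = &tree->rb_node, *parent = NULL;
	struct rb_node *next;

	while (*link) {
		struct threaded_rb_node *entry;

		parent = *link;
		entry = rb_entry(parent, struct threaded_rb_node, node);
		if (new->cpu < entry->cpu)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, tree);

	/*
	 * Thread the list: place @new right before its in-order successor,
	 * or at the tail of @all if it is the rightmost node.
	 */
	next = rb_next(&new->node);
	if (next)
		list_add_tail(&new->head,
			      &rb_entry(next, struct threaded_rb_node, node)->head);
	else
		list_add_tail(&new->head, all);
}

Deletion is then rb_erase() plus list_del(); rotating the groups of one CPU has to update both the tree and the list so the two stay in the same order, which is the shuffling mentioned earlier in the thread.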

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-20 13:36     ` Mark Rutland
@ 2017-06-20 15:22       ` Alexey Budankov
  2017-06-20 16:37         ` Mark Rutland
  0 siblings, 1 reply; 31+ messages in thread
From: Alexey Budankov @ 2017-06-20 15:22 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On 20.06.2017 16:36, Mark Rutland wrote:
> On Mon, Jun 19, 2017 at 11:31:59PM +0300, Alexey Budankov wrote:
>> On 15.06.2017 22:56, Mark Rutland wrote:
>>> On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
>>>> +static int
>>>> +perf_cpu_tree_iterate(struct rb_root *tree,
>>>> +		perf_cpu_tree_callback_t callback, void *data)
>>>> +{
>>>> +	int ret = 0;
>>>> +	struct rb_node *node;
>>>> +	struct perf_event *event;
>>>> +
>>>> +	WARN_ON_ONCE(!tree);
>>>> +
>>>> +	for (node = rb_first(tree); node; node = rb_next(node)) {
>>>> +		struct perf_event *node_event = container_of(node,
>>>> +				struct perf_event, group_node);
>>>> +
>>>> +		list_for_each_entry(event, &node_event->group_list,
>>>> +				group_list_entry) {
>>>> +			ret = callback(event, data);
>>>> +			if (ret)
>>>> +				return ret;
>>>> +		}
>>>> +	}
>>>> +
>>>> +	return 0;
>>>>   }
>>>
>>> If you need to iterate over every event, you can use the list that
>>> threads the whole tree.
>>
>> Could you please explain more on that?
> 
> In Peter's original suggestion, we'd use a threaded tree rather than a
> tree of lists.
> 
> i.e. you'd have something like:
> 
> struct threaded_rb_node {
> 	struct rb_node   node;
> 	struct list_head head;
> };

Is this for every group leader? Which objects does the head keep?

> 
> ... with the tree and list covering all nodes, in the same order:
> 
> Tree:
> 
>       3
>      / \
>     /   \
>    1     5
>   / \   / \
> 0   2 4   6
> 
> List:
> 
> 0 - 1 - 2 - 3 - 4 - 5 - 6
> 
> ... that way you can search using the tree, and iterate using the list,
> even when you want to iterate over sub-lists.
> 
> Thanks,
> Mark.
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-20 15:22       ` Alexey Budankov
@ 2017-06-20 16:37         ` Mark Rutland
  2017-06-20 17:10           ` Alexey Budankov
  0 siblings, 1 reply; 31+ messages in thread
From: Mark Rutland @ 2017-06-20 16:37 UTC (permalink / raw)
  To: Alexey Budankov
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On Tue, Jun 20, 2017 at 06:22:56PM +0300, Alexey Budankov wrote:
> On 20.06.2017 16:36, Mark Rutland wrote:
> >On Mon, Jun 19, 2017 at 11:31:59PM +0300, Alexey Budankov wrote:
> >>On 15.06.2017 22:56, Mark Rutland wrote:
> >>>On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
> >>>>+static int
> >>>>+perf_cpu_tree_iterate(struct rb_root *tree,
> >>>>+		perf_cpu_tree_callback_t callback, void *data)
> >>>>+{
> >>>>+	int ret = 0;
> >>>>+	struct rb_node *node;
> >>>>+	struct perf_event *event;
> >>>>+
> >>>>+	WARN_ON_ONCE(!tree);
> >>>>+
> >>>>+	for (node = rb_first(tree); node; node = rb_next(node)) {
> >>>>+		struct perf_event *node_event = container_of(node,
> >>>>+				struct perf_event, group_node);
> >>>>+
> >>>>+		list_for_each_entry(event, &node_event->group_list,
> >>>>+				group_list_entry) {
> >>>>+			ret = callback(event, data);
> >>>>+			if (ret)
> >>>>+				return ret;
> >>>>+		}
> >>>>+	}
> >>>>+
> >>>>+	return 0;
> >>>>  }
> >>>
> >>>If you need to iterate over every event, you can use the list that
> >>>threads the whole tree.
> >>
> >>Could you please explain more on that?
> >
> >In Peter's original suggestion, we'd use a threaded tree rather than a
> >tree of lists.
> >
> >i.e. you'd have something like:
> >
> >struct threaded_rb_node {
> >	struct rb_node   node;
> >	struct list_head head;
> >};
> 
> Is this for every group leader?

Yes; *every* group leader would be directly in the threaded rb tree.

> Which objects does the head keep?

Sorry, I'm not sure how to answer that. Did the above clarify?

If not, could you rephrase the question?

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-20 16:37         ` Mark Rutland
@ 2017-06-20 17:10           ` Alexey Budankov
  0 siblings, 0 replies; 31+ messages in thread
From: Alexey Budankov @ 2017-06-20 17:10 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

On 20.06.2017 19:37, Mark Rutland wrote:
> On Tue, Jun 20, 2017 at 06:22:56PM +0300, Alexey Budankov wrote:
>> On 20.06.2017 16:36, Mark Rutland wrote:
>>> On Mon, Jun 19, 2017 at 11:31:59PM +0300, Alexey Budankov wrote:
>>>> On 15.06.2017 22:56, Mark Rutland wrote:
>>>>> On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
>>>>>> +static int
>>>>>> +perf_cpu_tree_iterate(struct rb_root *tree,
>>>>>> +		perf_cpu_tree_callback_t callback, void *data)
>>>>>> +{
>>>>>> +	int ret = 0;
>>>>>> +	struct rb_node *node;
>>>>>> +	struct perf_event *event;
>>>>>> +
>>>>>> +	WARN_ON_ONCE(!tree);
>>>>>> +
>>>>>> +	for (node = rb_first(tree); node; node = rb_next(node)) {
>>>>>> +		struct perf_event *node_event = container_of(node,
>>>>>> +				struct perf_event, group_node);
>>>>>> +
>>>>>> +		list_for_each_entry(event, &node_event->group_list,
>>>>>> +				group_list_entry) {
>>>>>> +			ret = callback(event, data);
>>>>>> +			if (ret)
>>>>>> +				return ret;
>>>>>> +		}
>>>>>> +	}
>>>>>> +
>>>>>> +	return 0;
>>>>>>   }
>>>>>
>>>>> If you need to iterate over every event, you can use the list that
>>>>> threads the whole tree.
>>>>
>>>> Could you please explain more on that?
>>>
>>> In Peter's original suggestion, we'd use a threaded tree rather than a
>>> tree of lists.
>>>
>>> i.e. you'd have something like:
>>>
>>> struct threaded_rb_node {
>>> 	struct rb_node   node;
>>> 	struct list_head head;
>>> };
>>
>> Is this for every group leader?
> 
> Yes; *every* group leader would be directly in the threaded rb tree.

In this case the tree's key needs to be something trickier than just 
event->cpu. To avoid that complication group_list is introduced. BTW, 
addressing the perf_cpu_tree_delete issue doesn't look like a big change now:

static void
perf_cpu_tree_delete(struct rb_root *tree, struct perf_event *event)
{
	struct perf_event *next;

	WARN_ON_ONCE(!tree || !event);

	/* Unlink the event from its cpu's group_list. */
	list_del_init(&event->group_entry);

	if (!RB_EMPTY_NODE(&event->group_node)) {
		if (!list_empty(&event->group_list)) {
			/*
			 * The event is the tree node for its cpu and other
			 * groups remain on that cpu: promote the next group,
			 * handing over both the list head and the tree node.
			 */
			next = list_first_entry(&event->group_list,
					struct perf_event, group_entry);
			list_replace_init(&event->group_list,
					&next->group_list);
			rb_replace_node(&event->group_node,
					&next->group_node, tree);
		} else {
			/* Last group for this cpu: drop the tree node. */
			rb_erase(&event->group_node, tree);
		}
		RB_CLEAR_NODE(&event->group_node);
	}
}
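
As an aside on the "trickier key" point above: if every group leader were to sit directly in the tree, one option is a composite key. A hedged sketch follows; group_index is a hypothetical per-event tie-breaker assigned at insertion time, not a field in this patch.

/*
 * Compare group leaders by cpu first, then by a monotonically growing
 * insertion index, so leaders sharing a cpu still get distinct, stable
 * positions in the tree.
 */
static bool perf_event_group_less(struct perf_event *a, struct perf_event *b)
{
	if (a->cpu != b->cpu)
		return a->cpu < b->cpu;

	return a->group_index < b->group_index;	/* hypothetical field */
}

Comparing by cpu first keeps per-cpu lookups cheap; the tie-breaker only serves to keep equal-cpu leaders apart.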

> 
>> Which objects does the head keep?
> 
> Sorry, I'm not sure how to answer that. Did the above clarify?
> 
> If not, could you rephrase the question?
> 
> Thanks,
> Mark.
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 15:00           ` Alexey Budankov
  2017-06-19 15:24             ` Andi Kleen
@ 2017-06-30 10:21             ` Alexey Budankov
  1 sibling, 0 replies; 31+ messages in thread
From: Alexey Budankov @ 2017-06-30 10:21 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

Hi,

On 19.06.2017 18:00, Alexey Budankov wrote:
> On 19.06.2017 16:37, Alexey Budankov wrote:
>> On 19.06.2017 16:26, Mark Rutland wrote:
>>> On Mon, Jun 19, 2017 at 04:08:32PM +0300, Alexey Budankov wrote:
>>>> On 16.06.2017 1:10, Alexey Budankov wrote:
>>>>> On 15.06.2017 22:56, Mark Rutland wrote:
>>>>>> On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
>>>>>>> This series of patches continues v2 and addresses captured comments.
>>>
>>>>>>> Specifically this patch replaces pinned_groups and flexible_groups
>>>>>>> lists of perf_event_context by red-black cpu indexed trees avoiding
>>>>>>> data structures duplication and introducing possibility to iterate
>>>>>>> event groups for a specific CPU only.
>>>
>>>>>> Have you thrown Vince's perf fuzzer at this?
>>>>>>
>>>>>> If you haven't, please do. It can be found in the fuzzer directory of:
>>>>>>
>>>>>> https://github.com/deater/perf_event_tests
>>>>>
>>>>> Accepted.
>>>>
>>>> I ran the test suite and it revealed no additional regressions in
>>>> comparison to what I have on the clean kernel.
>>>>
>>>> However the fuzzer constantly reports some strange stacks that are
>>>> not seen on the clean kernel and I have no idea how that might be
>>>> caused by the patches.
>>>
>>> Ok; that was the kind of thing I was concerned about.
>>>
>>> When you say "strange stacks", what do you mean exactly?
>>>
>>> I take it the kernel is spewing backtraces in dmesg?
>>>
>>> Can you dump those?
>>
>> Here it is:
>>
>> list_del corruption. prev->next should be ffff88c2c4654010, but was ffff88c31eb0c020
>> [  607.632813] ------------[ cut here ]------------
>> [  607.632816] kernel BUG at lib/list_debug.c:53!
>> [  607.632825] invalid opcode: 0000 [#1] SMP
>> [  607.632898] Modules linked in: fuse xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack libcrc32c tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables nfsv3 rpcsec_gss_krb5 nfsv4 arc4 md4 nls_utf8 cifs nfs ccm dns_resolver fscache rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp hfi1 coretemp crct10dif_pclmul crc32_pclmul ghash_clmulni_intel intel_cstate iTCO_wdt intel_uncore iTCO_vendor_support joydev rdmavt intel_rapl_perf i2c_i801 ib_core ipmi_ssif mei_me mei ipmi_si ipmi_devintf tpm_tis
>> [  607.633954]  lpc_ich pcspkr ipmi_msghandler acpi_pad tpm_tis_core shpchp tpm wmi acpi_power_meter nfsd auth_rpcgss nfs_acl lockd grace sunrpc mgag200 drm_kms_helper ttm drm igb crc32c_intel ptp pps_core dca i2c_algo_bit
>> [  607.634262] CPU: 271 PID: 28944 Comm: perf_fuzzer Not tainted 4.12.0-rc4+ #22
>> [  607.634363] Hardware name: Intel Corporation S7200AP/S7200AP, BIOS S72C610.86B.01.01.0190.080520162104 08/05/2016
>> [  607.634505] task: ffff88c2d5714000 task.stack: ffffa6f9352c8000
>> [  607.634597] RIP: 0010:__list_del_entry_valid+0x7b/0x90
>> [  607.634670] RSP: 0000:ffffa6f9352cbad0 EFLAGS: 00010046
>> [  607.634746] RAX: 0000000000000054 RBX: ffff88c2c4654000 RCX: 0000000000000000
>> [  607.634845] RDX: 0000000000000000 RSI: ffff88c33fdce168 RDI: ffff88c33fdce168
>> [  607.634944] RBP: ffffa6f9352cbad0 R08: 00000000fffffffe R09: 0000000000000600
>> [  607.635042] R10: 0000000000000005 R11: 0000000000000000 R12: ffff88c2e71ab200
>> [  607.635140] R13: ffff88c2c4654010 R14: 0000000000000001 R15: 0000000000000001
>> [  607.635240] FS:  0000000000000000(0000) GS:ffff88c33fdc0000(0000) knlGS:0000000000000000
>> [  607.635351] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [  607.635431] CR2: 00000000026be1e4 CR3: 0000002488e09000 CR4: 00000000001407e0
>> [  607.635531] Call Trace:
>> [  607.635583]  list_del_event+0x1d7/0x210
>> [  607.635646]  ? perf_cgroup_attach+0x70/0x70
>> [  607.635711]  __perf_remove_from_context+0x3e/0x90
>> [  607.635783]  ? event_sched_out.isra.90+0x300/0x300
>> [  607.635854]  event_function_call+0xbf/0xf0
>> [  607.635918]  ? event_sched_out.isra.90+0x300/0x300
>> [  607.635991]  perf_remove_from_context+0x25/0x70
>> [  607.636060]  perf_event_release_kernel+0xda/0x250
>> [  607.636132]  ? __dentry_kill+0x10e/0x160
>> [  607.636192]  perf_release+0x10/0x20
>> [  607.636249]  __fput+0xdf/0x1e0
>> [  607.636299]  ____fput+0xe/0x10
>> [  607.636350]  task_work_run+0x83/0xb0
>> [  607.636408]  do_exit+0x2bc/0xbc0
>> [  607.636460]  ? page_add_file_rmap+0xaf/0x200
>> [  607.636526]  ? alloc_set_pte+0x115/0x4f0
>> [  607.636587]  do_group_exit+0x3f/0xb0
>> [  607.636644]  get_signal+0x1cc/0x5c0
>> [  607.636703]  do_signal+0x37/0x6a0
>> [  607.636758]  ? __perf_sw_event+0x4f/0x80
>> [  607.636821]  ? __do_page_fault+0x2e1/0x4d0
>> [  607.636885]  exit_to_usermode_loop+0x4c/0x92
>> [  607.636953]  prepare_exit_to_usermode+0x40/0x50
>> [  607.637023]  retint_user+0x8/0x13
>> [  607.640312] RIP: 0033:0x40f4a9
>> [  607.643500] RSP: 002b:00007ffc62d00668 EFLAGS: 00000206 ORIG_RAX: ffffffffffffff02
>> [  607.646678] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 000000000001100a
>> [  607.649777] RDX: 00007fd5dd25bae0 RSI: 00007fd5dd259760 RDI: 00007fd5dd25a640
>> [  607.652791] RBP: 00007ffc62d00680 R08: 00007fd5dd45e740 R09: 0000000000000000
>> [  607.655709] R10: 00007fd5dd45ea10 R11: 0000000000000246 R12: 0000000000401980
>> [  607.658530] R13: 00007ffc62d02a80 R14: 0000000000000000 R15: 0000000000000000
>> [  607.661249] Code: e8 3a f7 d8 ff 0f 0b 48 89 fe 31 c0 48 c7 c7 20 66 cb ac e8 27 f7 d8 ff 0f 0b 48 89 fe 31 c0 48 c7 c7 e0 65 cb ac e8 14 f7 d8 ff <0f> 0b 48 89 fe 31 c0 48 c7 c7 a8 65 cb ac e8 01 f7 d8 ff 0f 0b
>> [  607.666819] RIP: __list_del_entry_valid+0x7b/0x90 RSP: ffffa6f9352cbad0
>> [  607.683316] ---[ end trace 34244c35550e0713 ]---
>> [  607.691830] Fixing recursive fault but reboot is needed!
>>
>> 2:
>>
>> [  467.942059] unchecked MSR access error: WRMSR to 0x711 (tried to write 0x00000000e8cc0055) at rIP: 0xffffffffac05fbd4 (native_write_msr+0x4/0x30)
>> [  467.942068] Call Trace:
>> [  467.942073]  <IRQ>
>> [  467.942094]  ? snbep_uncore_msr_enable_event+0x54/0x60 [intel_uncore]
>> [  467.942107]  uncore_pmu_event_start+0x9b/0x100 [intel_uncore]
>> [  467.942119]  uncore_pmu_event_add+0x235/0x3a0 [intel_uncore]
>> [  467.942126]  ? sched_clock+0xb/0x10
>> [  467.942132]  ? sched_clock_cpu+0x11/0xb0
>> [  467.942140]  event_sched_in.isra.100+0xdf/0x250
>> [  467.942145]  sched_in_group+0x210/0x390
>> [  467.942150]  ? sched_in_group+0x390/0x390
>> [  467.942155]  group_sched_in_flexible_callback+0x17/0x20
>> [  467.942160]  perf_cpu_tree_iterate+0x45/0x75
>> [  467.942165]  ctx_sched_in+0x97/0x110
>> [  467.942169]  perf_event_sched_in+0x77/0x80
>> [  467.942174]  ctx_resched+0x69/0xb0
>> [  467.942179]  __perf_event_enable+0x208/0x250
>> [  467.942184]  event_function+0x93/0xe0
>> [  467.942188]  remote_function+0x3b/0x50
>> [  467.942194]  flush_smp_call_function_queue+0x71/0x120
>> [  467.942200]  generic_smp_call_function_single_interrupt+0x13/0x30
>> [  467.942206]  smp_call_function_single_interrupt+0x27/0x40
>> [  467.942212]  call_function_single_interrupt+0x93/0xa0
>> [  467.942217] RIP: 0010:native_restore_fl+0x6/0x10
>> [  467.942220] RSP: 0000:ffff88c33ba03e00 EFLAGS: 00000206 ORIG_RAX: ffffffffffffff04
>> [  467.942224] RAX: ffff88c33c8dca08 RBX: ffff88c33c8dc928 RCX: 0000000000000017
>> [  467.942227] RDX: 0000000000000000 RSI: 0000000000000206 RDI: 0000000000000206
>> [  467.942229] RBP: ffff88c33ba03e00 R08: 0000000000000001 R09: 0000000000007151
>> [  467.942232] R10: 0000000000000000 R11: 000000000000005d R12: ffff88c33c8dca08
>> [  467.942234] R13: ffff88c33c8dc140 R14: 0000000000000001 R15: 00000000000001d8
>> [  467.942241]  _raw_spin_unlock_irqrestore+0x16/0x20
>> [  467.942245]  update_blocked_averages+0x2cf/0x4a0
>> [  467.942251]  rebalance_domains+0x4b/0x2b0
>> [  467.942256]  run_rebalance_domains+0x1d7/0x210
>> [  467.942260]  __do_softirq+0xd1/0x27f
>> [  467.942267]  irq_exit+0xe9/0x100
>> [  467.942271]  scheduler_ipi+0x8f/0x140
>> [  467.942275]  smp_reschedule_interrupt+0x29/0x30
>> [  467.942280]  reschedule_interrupt+0x93/0xa0
>> [  467.942284] RIP: 0010:native_safe_halt+0x6/0x10
>> [  467.942286] RSP: 0000:fffffffface03de0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff02
>> [  467.942291] RAX: 0000000000000000 RBX: fffffffface104c0 RCX: 0000000000000000
>> [  467.942293] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
>> [  467.942295] RBP: fffffffface03de0 R08: 0000006d2c5efae3 R09: 0000000000000000
>> [  467.942298] R10: 0000000000000201 R11: 0000000000000930 R12: 0000000000000000
>> [  467.942300] R13: fffffffface104c0 R14: 0000000000000000 R15: 0000000000000000
>> [  467.942302]  </IRQ>
>> [  467.942308]  default_idle+0x20/0x100
>> [  467.942313]  arch_cpu_idle+0xf/0x20
>> [  467.942317]  default_idle_call+0x2c/0x40
>> [  467.942321]  do_idle+0x158/0x1e0
>> [  467.942325]  cpu_startup_entry+0x71/0x80
>> [  467.942330]  rest_init+0x77/0x80
>> [  467.942338]  start_kernel+0x4a7/0x4c8
>> [  467.942342]  ? set_init_arg+0x5a/0x5a
>> [  467.942348]  ? early_idt_handler_array+0x120/0x120
>> [  467.942352]  x86_64_start_reservations+0x29/0x2b
>> [  467.942357]  x86_64_start_kernel+0x151/0x174
>> [  467.942363]  secondary_startup_64+0x9f/0x9f
>>
>>
> 
> One more:
> 
> [  484.804717] ------------[ cut here ]------------
> [  484.804737] WARNING: CPU: 15 PID: 31168 at arch/x86/events/core.c:1257 x86_pmu_start+0x8f/0xb0
> [  484.804739] Modules linked in: btrfs xor raid6_pq ufs hfsplus hfs minix vfat msdos fat jfs xfs reiserfs binfmt_misc xt_CHECKSUM iptable_mangle fuse ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack libcrc32c tun bridge stp llc ebtable_filter ebtables ip6table_filter nfsv3 ip6_tables arc4 md4 nls_utf8 rpcsec_gss_krb5 cifs nfsv4 nfs ccm dns_resolver fscache rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp hfi1 coretemp crct10dif_pclmul crc32_pclmul rdmavt ghash_clmulni_intel intel_cstate joydev iTCO_wdt intel_uncore ib_core iTCO_vendor_support
> [  484.804868]  tpm_tis intel_rapl_perf ipmi_ssif tpm_tis_core ipmi_si mei_me tpm ipmi_devintf shpchp acpi_pad lpc_ich ipmi_msghandler pcspkr mei i2c_i801 nfsd acpi_power_meter wmi auth_rpcgss nfs_acl lockd grace sunrpc mgag200 drm_kms_helper ttm drm igb crc32c_intel ptp pps_core dca i2c_algo_bit
> [  484.804927] CPU: 15 PID: 31168 Comm: perf_fuzzer Tainted: G        W      4.12.0-rc4+ #24
> [  484.804930] Hardware name: Intel Corporation S7200AP/S7200AP, BIOS S72C610.86B.01.01.0190.080520162104 08/05/2016
> [  484.804933] task: ffff8c4699c10000 task.stack: ffff9ad8b5900000
> [  484.804938] RIP: 0010:x86_pmu_start+0x8f/0xb0
> [  484.804941] RSP: 0000:ffff8c46fbdc3e58 EFLAGS: 00010046
> [  484.804945] RAX: 0000000000000000 RBX: ffff8c468068f000 RCX: 0000000000000002
> [  484.804948] RDX: 000000000000000e RSI: 0000000000000002 RDI: ffff8c468068f000
> [  484.804950] RBP: ffff8c46fbdc3e70 R08: 0000000000000000 R09: 00000000000004c1
> [  484.804952] R10: ffff8c46fbdcaaa0 R11: ffff8c46fbdc3b58 R12: ffff8c46fbdca380
> [  484.804955] R13: 0000000000000000 R14: ffff8c46fbdca5a4 R15: 0000000000000001
> [  484.804958] FS:  00007f1625c4c740(0000) GS:ffff8c46fbdc0000(0000) knlGS:0000000000000000
> [  484.804961] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  484.804964] CR2: 000000000061b218 CR3: 0000002f2184f000 CR4: 00000000001407e0
> [  484.804967] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [  484.804969] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
> [  484.804971] Call Trace:
> [  484.804976]  <IRQ>
> [  484.804984]  x86_pmu_enable+0x27f/0x2f0
> [  484.804992]  perf_pmu_enable+0x22/0x30
> [  484.804997]  ctx_resched+0x72/0xb0
> [  484.805003]  __perf_event_enable+0x208/0x250
> [  484.805008]  event_function+0x93/0xe0
> [  484.805012]  remote_function+0x3b/0x50
> [  484.805018]  flush_smp_call_function_queue+0x71/0x120
> [  484.805024]  generic_smp_call_function_single_interrupt+0x13/0x30
> [  484.805030]  smp_trace_call_function_single_interrupt+0x2d/0xd0
> [  484.805036]  trace_call_function_single_interrupt+0x93/0xa0
> [  484.805040] RIP: 0033:0x40f4a9
> [  484.805042] RSP: 002b:00007ffe26fde668 EFLAGS: 00000202 ORIG_RAX: ffffffffffffff04
> [  484.805047] RAX: 00000000000028a1 RBX: 0000000000000000 RCX: 0000000000067f43
> [  484.805049] RDX: 00007f1625a49ae0 RSI: 00007f1625a47760 RDI: 00007f1625a48640
> [  484.805052] RBP: 00007ffe26fde680 R08: 00007f1625c4c740 R09: 0000000000000000
> [  484.805054] R10: 00007ffe26fde3f0 R11: 0000000000000246 R12: 0000000000401980
> [  484.805056] R13: 00007ffe26fe0a80 R14: 0000000000000000 R15: 0000000000000000
> [  484.805058]  </IRQ>
> [  484.805062] Code: 0f ab 02 49 81 c4 08 02 00 00 49 0f ab 04 24 48 89 df ff 15 9c ca 03 01 48 89 df e8 0c 05 1b 00 5b 41 5c 41 5d 5d c3 0f ff eb f5 <0f> ff eb f1 0f ff 66 66 2e 0f 1f 84 00 00 00 00 00 eb a0 66 66
> [  484.805161] ---[ end trace dcc4ce8c0a8781e1 ]---
> [  508.895507] NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter.

addressed in:
[PATCH v5 4/4] perf/core: addressing 4x slowdown during per-process 
                          profiling of STREAM benchmark on Intel Xeon Phi
> 
> 
>>>
>>> Thanks,
>>> Mark.
>>>
>>
>>
> 
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 13:37         ` Alexey Budankov
  2017-06-19 15:00           ` Alexey Budankov
  2017-06-19 15:14           ` Mark Rutland
@ 2017-06-30 10:21           ` Alexey Budankov
  2 siblings, 0 replies; 31+ messages in thread
From: Alexey Budankov @ 2017-06-30 10:21 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Andi Kleen, Kan Liang, Dmitri Prokhorov,
	Valery Cherepennikov, David Carrillo-Cisneros, Stephane Eranian,
	linux-kernel

Hi,

On 19.06.2017 16:37, Alexey Budankov wrote:
> On 19.06.2017 16:26, Mark Rutland wrote:
>> On Mon, Jun 19, 2017 at 04:08:32PM +0300, Alexey Budankov wrote:
>>> On 16.06.2017 1:10, Alexey Budankov wrote:
>>>> On 15.06.2017 22:56, Mark Rutland wrote:
>>>>> On Thu, Jun 15, 2017 at 08:41:42PM +0300, Alexey Budankov wrote:
>>>>>> This series of patches continues v2 and addresses captured comments.
>>
>>>>>> Specifically this patch replaces pinned_groups and flexible_groups
>>>>>> lists of perf_event_context by red-black cpu indexed trees avoiding
>>>>>> data structures duplication and introducing possibility to iterate
>>>>>> event groups for a specific CPU only.
>>
>>>>> Have you thrown Vince's perf fuzzer at this?
>>>>>
>>>>> If you haven't, please do. It can be found in the fuzzer directory of:
>>>>>
>>>>> https://github.com/deater/perf_event_tests
>>>>
>>>> Accepted.
>>>
>>> I ran the test suite and it revealed no additional regressions in
>>> comparison to what I have on the clean kernel.
>>>
>>> However the fuzzer constantly reports some strange stacks that are
>>> not seen on the clean kernel and I have no idea how that might be
>>> caused by the patches.
>>
>> Ok; that was the kind of thing I was concerned about.
>>
>> When you say "strange stacks", what do you mean exactly?
>>
>> I take it the kernel is spewing backtraces in dmesg?
>>
>> Can you dump those?
> 
> Here it is:
> 
> list_del corruption. prev->next should be ffff88c2c4654010, but was ffff88c31eb0c020
> [  607.632813] ------------[ cut here ]------------
> [  607.632816] kernel BUG at lib/list_debug.c:53!
> [  607.632825] invalid opcode: 0000 [#1] SMP
> [  607.632898] Modules linked in: fuse xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack libcrc32c tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables nfsv3 rpcsec_gss_krb5 nfsv4 arc4 md4 nls_utf8 cifs nfs ccm dns_resolver fscache rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp hfi1 coretemp crct10dif_pclmul crc32_pclmul ghash_clmulni_intel intel_cstate iTCO_wdt intel_uncore iTCO_vendor_support joydev rdmavt intel_rapl_perf i2c_i801 ib_core ipmi_ssif mei_me mei ipmi_si ipmi_devintf tpm_tis
> [  607.633954]  lpc_ich pcspkr ipmi_msghandler acpi_pad tpm_tis_core shpchp tpm wmi acpi_power_meter nfsd auth_rpcgss nfs_acl lockd grace sunrpc mgag200 drm_kms_helper ttm drm igb crc32c_intel ptp pps_core dca i2c_algo_bit
> [  607.634262] CPU: 271 PID: 28944 Comm: perf_fuzzer Not tainted 4.12.0-rc4+ #22
> [  607.634363] Hardware name: Intel Corporation S7200AP/S7200AP, BIOS S72C610.86B.01.01.0190.080520162104 08/05/2016
> [  607.634505] task: ffff88c2d5714000 task.stack: ffffa6f9352c8000
> [  607.634597] RIP: 0010:__list_del_entry_valid+0x7b/0x90
> [  607.634670] RSP: 0000:ffffa6f9352cbad0 EFLAGS: 00010046
> [  607.634746] RAX: 0000000000000054 RBX: ffff88c2c4654000 RCX: 0000000000000000
> [  607.634845] RDX: 0000000000000000 RSI: ffff88c33fdce168 RDI: ffff88c33fdce168
> [  607.634944] RBP: ffffa6f9352cbad0 R08: 00000000fffffffe R09: 0000000000000600
> [  607.635042] R10: 0000000000000005 R11: 0000000000000000 R12: ffff88c2e71ab200
> [  607.635140] R13: ffff88c2c4654010 R14: 0000000000000001 R15: 0000000000000001
> [  607.635240] FS:  0000000000000000(0000) GS:ffff88c33fdc0000(0000) knlGS:0000000000000000
> [  607.635351] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  607.635431] CR2: 00000000026be1e4 CR3: 0000002488e09000 CR4: 00000000001407e0
> [  607.635531] Call Trace:
> [  607.635583]  list_del_event+0x1d7/0x210
> [  607.635646]  ? perf_cgroup_attach+0x70/0x70
> [  607.635711]  __perf_remove_from_context+0x3e/0x90
> [  607.635783]  ? event_sched_out.isra.90+0x300/0x300
> [  607.635854]  event_function_call+0xbf/0xf0
> [  607.635918]  ? event_sched_out.isra.90+0x300/0x300
> [  607.635991]  perf_remove_from_context+0x25/0x70
> [  607.636060]  perf_event_release_kernel+0xda/0x250
> [  607.636132]  ? __dentry_kill+0x10e/0x160
> [  607.636192]  perf_release+0x10/0x20
> [  607.636249]  __fput+0xdf/0x1e0
> [  607.636299]  ____fput+0xe/0x10
> [  607.636350]  task_work_run+0x83/0xb0
> [  607.636408]  do_exit+0x2bc/0xbc0
> [  607.636460]  ? page_add_file_rmap+0xaf/0x200
> [  607.636526]  ? alloc_set_pte+0x115/0x4f0
> [  607.636587]  do_group_exit+0x3f/0xb0
> [  607.636644]  get_signal+0x1cc/0x5c0
> [  607.636703]  do_signal+0x37/0x6a0
> [  607.636758]  ? __perf_sw_event+0x4f/0x80
> [  607.636821]  ? __do_page_fault+0x2e1/0x4d0
> [  607.636885]  exit_to_usermode_loop+0x4c/0x92
> [  607.636953]  prepare_exit_to_usermode+0x40/0x50
> [  607.637023]  retint_user+0x8/0x13
> [  607.640312] RIP: 0033:0x40f4a9
> [  607.643500] RSP: 002b:00007ffc62d00668 EFLAGS: 00000206 ORIG_RAX: ffffffffffffff02
> [  607.646678] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 000000000001100a
> [  607.649777] RDX: 00007fd5dd25bae0 RSI: 00007fd5dd259760 RDI: 00007fd5dd25a640
> [  607.652791] RBP: 00007ffc62d00680 R08: 00007fd5dd45e740 R09: 0000000000000000
> [  607.655709] R10: 00007fd5dd45ea10 R11: 0000000000000246 R12: 0000000000401980
> [  607.658530] R13: 00007ffc62d02a80 R14: 0000000000000000 R15: 0000000000000000
> [  607.661249] Code: e8 3a f7 d8 ff 0f 0b 48 89 fe 31 c0 48 c7 c7 20 66 cb ac e8 27 f7 d8 ff 0f 0b 48 89 fe 31 c0 48 c7 c7 e0 65 cb ac e8 14 f7 d8 ff <0f> 0b 48 89 fe 31 c0 48 c7 c7 a8 65 cb ac e8 01 f7 d8 ff 0f 0b
> [  607.666819] RIP: __list_del_entry_valid+0x7b/0x90 RSP: ffffa6f9352cbad0
> [  607.683316] ---[ end trace 34244c35550e0713 ]---
> [  607.691830] Fixing recursive fault but reboot is needed!

addressed in:
[PATCH v5 4/4] perf/core: addressing 4x slowdown during per-process 
                          profiling of STREAM benchmark on Intel Xeon Phi

> 
> 2:
> 
> [  467.942059] unchecked MSR access error: WRMSR to 0x711 (tried to write 0x00000000e8cc0055) at rIP: 0xffffffffac05fbd4 (native_write_msr+0x4/0x30)
> [  467.942068] Call Trace:
> [  467.942073]  <IRQ>
> [  467.942094]  ? snbep_uncore_msr_enable_event+0x54/0x60 [intel_uncore]
> [  467.942107]  uncore_pmu_event_start+0x9b/0x100 [intel_uncore]
> [  467.942119]  uncore_pmu_event_add+0x235/0x3a0 [intel_uncore]
> [  467.942126]  ? sched_clock+0xb/0x10
> [  467.942132]  ? sched_clock_cpu+0x11/0xb0
> [  467.942140]  event_sched_in.isra.100+0xdf/0x250
> [  467.942145]  sched_in_group+0x210/0x390
> [  467.942150]  ? sched_in_group+0x390/0x390
> [  467.942155]  group_sched_in_flexible_callback+0x17/0x20
> [  467.942160]  perf_cpu_tree_iterate+0x45/0x75
> [  467.942165]  ctx_sched_in+0x97/0x110
> [  467.942169]  perf_event_sched_in+0x77/0x80
> [  467.942174]  ctx_resched+0x69/0xb0
> [  467.942179]  __perf_event_enable+0x208/0x250
> [  467.942184]  event_function+0x93/0xe0
> [  467.942188]  remote_function+0x3b/0x50
> [  467.942194]  flush_smp_call_function_queue+0x71/0x120
> [  467.942200]  generic_smp_call_function_single_interrupt+0x13/0x30
> [  467.942206]  smp_call_function_single_interrupt+0x27/0x40
> [  467.942212]  call_function_single_interrupt+0x93/0xa0
> [  467.942217] RIP: 0010:native_restore_fl+0x6/0x10
> [  467.942220] RSP: 0000:ffff88c33ba03e00 EFLAGS: 00000206 ORIG_RAX: ffffffffffffff04
> [  467.942224] RAX: ffff88c33c8dca08 RBX: ffff88c33c8dc928 RCX: 0000000000000017
> [  467.942227] RDX: 0000000000000000 RSI: 0000000000000206 RDI: 0000000000000206
> [  467.942229] RBP: ffff88c33ba03e00 R08: 0000000000000001 R09: 0000000000007151
> [  467.942232] R10: 0000000000000000 R11: 000000000000005d R12: ffff88c33c8dca08
> [  467.942234] R13: ffff88c33c8dc140 R14: 0000000000000001 R15: 00000000000001d8
> [  467.942241]  _raw_spin_unlock_irqrestore+0x16/0x20
> [  467.942245]  update_blocked_averages+0x2cf/0x4a0
> [  467.942251]  rebalance_domains+0x4b/0x2b0
> [  467.942256]  run_rebalance_domains+0x1d7/0x210
> [  467.942260]  __do_softirq+0xd1/0x27f
> [  467.942267]  irq_exit+0xe9/0x100
> [  467.942271]  scheduler_ipi+0x8f/0x140
> [  467.942275]  smp_reschedule_interrupt+0x29/0x30
> [  467.942280]  reschedule_interrupt+0x93/0xa0
> [  467.942284] RIP: 0010:native_safe_halt+0x6/0x10
> [  467.942286] RSP: 0000:fffffffface03de0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff02
> [  467.942291] RAX: 0000000000000000 RBX: fffffffface104c0 RCX: 0000000000000000
> [  467.942293] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
> [  467.942295] RBP: fffffffface03de0 R08: 0000006d2c5efae3 R09: 0000000000000000
> [  467.942298] R10: 0000000000000201 R11: 0000000000000930 R12: 0000000000000000
> [  467.942300] R13: fffffffface104c0 R14: 0000000000000000 R15: 0000000000000000
> [  467.942302]  </IRQ>
> [  467.942308]  default_idle+0x20/0x100
> [  467.942313]  arch_cpu_idle+0xf/0x20
> [  467.942317]  default_idle_call+0x2c/0x40
> [  467.942321]  do_idle+0x158/0x1e0
> [  467.942325]  cpu_startup_entry+0x71/0x80
> [  467.942330]  rest_init+0x77/0x80
> [  467.942338]  start_kernel+0x4a7/0x4c8
> [  467.942342]  ? set_init_arg+0x5a/0x5a
> [  467.942348]  ? early_idt_handler_array+0x120/0x120
> [  467.942352]  x86_64_start_reservations+0x29/0x2b
> [  467.942357]  x86_64_start_kernel+0x151/0x174
> [  467.942363]  secondary_startup_64+0x9f/0x9f> 
> 
>>
>> Thanks,
>> Mark.
>>
> 
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi
  2017-06-19 15:34               ` Alexey Budankov
@ 2017-06-30 10:23                 ` Alexey Budankov
  0 siblings, 0 replies; 31+ messages in thread
From: Alexey Budankov @ 2017-06-30 10:23 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Mark Rutland, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Kan Liang,
	Dmitri Prokhorov, Valery Cherepennikov, David Carrillo-Cisneros,
	Stephane Eranian, linux-kernel

On 19.06.2017 18:34, Alexey Budankov wrote:
> On 19.06.2017 18:24, Andi Kleen wrote:
>>> One more:
>>
>> It may help if you run with KASAN enabled. That often
>> finds problems earlier.
>>
>> Also could use ftrace and use ftrace_dump_on_oops to
>> see the sequence (but that has a lot of overhead
>> and sometimes hides problems)
> 
> Ok. Will try. Thanks.

Tested patch v5 with KASAN enabled. No memory corruption has been revealed so far.

> 
>>
>> -Andi
>>
>>
> 
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2017-06-30 10:24 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-06-15 17:41 [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process profiling of STREAM benchmark on Intel Xeon Phi Alexey Budankov
2017-06-15 19:56 ` Mark Rutland
2017-06-15 22:10   ` Alexey Budankov
2017-06-16  9:09     ` Mark Rutland
2017-06-16 14:08       ` Alexey Budankov
2017-06-16 14:22         ` Alexey Budankov
2017-06-19 12:46           ` Mark Rutland
2017-06-19 13:38             ` Mark Rutland
2017-06-19 14:09               ` Alexey Budankov
2017-06-19 14:59               ` Andi Kleen
2017-06-19 15:09                 ` Mark Rutland
2017-06-19 15:21                   ` Andi Kleen
2017-06-19 15:24                     ` Mark Rutland
2017-06-19 15:39                       ` Andi Kleen
2017-06-19 15:52                         ` Mark Rutland
2017-06-19 13:08     ` Alexey Budankov
2017-06-19 13:26       ` Mark Rutland
2017-06-19 13:37         ` Alexey Budankov
2017-06-19 15:00           ` Alexey Budankov
2017-06-19 15:24             ` Andi Kleen
2017-06-19 15:34               ` Alexey Budankov
2017-06-30 10:23                 ` Alexey Budankov
2017-06-30 10:21             ` Alexey Budankov
2017-06-19 15:14           ` Mark Rutland
2017-06-19 15:27             ` Alexey Budankov
2017-06-30 10:21           ` Alexey Budankov
2017-06-19 20:31   ` Alexey Budankov
2017-06-20 13:36     ` Mark Rutland
2017-06-20 15:22       ` Alexey Budankov
2017-06-20 16:37         ` Mark Rutland
2017-06-20 17:10           ` Alexey Budankov
