linux-kernel.vger.kernel.org archive mirror
From: Peter Zijlstra <peterz@infradead.org>
To: Mark Rutland <mark.rutland@arm.com>
Cc: mingo@kernel.org, linux-kernel@vger.kernel.org, acme@kernel.org,
	alexander.shishkin@linux.intel.com, jolsa@redhat.com,
	songliubraving@fb.com, eranian@google.com, tglx@linutronix.de,
	alexey.budankov@linux.intel.com, megha.dey@intel.com,
	frederic@kernel.org, nd@arm.com
Subject: Re: [RFC][PATCH] perf: Rewrite core context handling
Date: Tue, 16 Oct 2018 20:07:24 +0200	[thread overview]
Message-ID: <20181016180724.GC3121@hirez.programming.kicks-ass.net> (raw)
In-Reply-To: <20181016162645.ymtuk6qqz65sg7bt@lakrids.cambridge.arm.com>

On Tue, Oct 16, 2018 at 05:26:45PM +0100, Mark Rutland wrote:
> On Wed, Oct 10, 2018 at 12:45:59PM +0200, Peter Zijlstra wrote:

> > +struct perf_event_pmu_context {
> > +	struct pmu			*pmu;
> > +	struct perf_event_context 	*ctx;
> > +
> > +	struct list_head		pmu_ctx_entry;
> > +
> > +	struct list_head		pinned_active;
> > +	struct list_head		flexible_active;
> > +
> > +	unsigned int			embedded : 1;
> 
> Is this just for lifetime management (i.e. not attempting to free the
> embedded epc)?

IIRC, yes.

> Do we need a flag? Can't we have the pmu hold a ref on its embedded epc,
> and init that at pmu init time?

IIRC, we do two things when the refcount hits 0: we remove the pmu_ctx_entry and
we free. I think we still want to do the first, but want to avoid the
second.

> > @@ -723,20 +761,21 @@ struct perf_event_context {
> >  	 */
> >  	struct mutex			mutex;
> >  
> > -	struct list_head		active_ctx_list;
> > +	struct list_head		pmu_ctx_list;
> > +
> >  	struct perf_event_groups	pinned_groups;
> >  	struct perf_event_groups	flexible_groups;
> >  	struct list_head		event_list;
> 
> I think that the groups lists and event list should be in the
> perf_event_pmu_context.
>
> That would make scheduling and rotating events a per-pmu thing, as we
> want, without complicating the RB tree logic or requiring additional
> hooks.

I didn't think that RB tree logic was particularly complicated.

> That may make the move_group case more complicated, though.
> 
> ... and maybe I've missed some other headache with that?

Not move_group, I think; the locking is per perf_event_context, and
changing perf_event::ctx is tricky, but outside of that not so much.

> > -	struct list_head		pinned_active;
> > -	struct list_head		flexible_active;
> > -
> >  	int				nr_events;
> >  	int				nr_active;
> >  	int				is_active;
> > +
> > +	int				nr_task_data;
> >  	int				nr_stat;
> >  	int				nr_freq;
> >  	int				rotate_disable;
> 
> Likewise these all seem to be PMU-specific (though I guess we care about
> them in the ctx-switch fast paths?).

nr_active and nr_events were also useful on a ctx-wide basis IIRC; the
nr_{stat,freq} things are boolean gates, so I'm not sure we win much by
breaking them up per-pmu.

The rotate_disable, yes, that should be per PMU I suppose.

> > @@ -1528,6 +1498,11 @@ perf_event_groups_less(struct perf_event
> >  	if (left->cpu > right->cpu)
> >  		return false;
> >  
> > +	if (left->pmu_ctx->pmu < right->pmu_ctx->pmu)
> > +		return true;
> > +	if (left->pmu_ctx->pmu > right->pmu_ctx->pmu)
> > +		return false;
> > +
> >  	if (left->group_index < right->group_index)
> >  		return true;
> >  	if (left->group_index > right->group_index)
> > @@ -1610,7 +1585,7 @@ del_event_from_groups(struct perf_event
> >   * Get the leftmost event in the @cpu subtree.
> >   */
> >  static struct perf_event *
> > -perf_event_groups_first(struct perf_event_groups *groups, int cpu)
> > +perf_event_groups_first(struct perf_event_groups *groups, int cpu, struct pmu *pmu)
> >  {
> >  	struct perf_event *node_event = NULL, *match = NULL;
> >  	struct rb_node *node = groups->tree.rb_node;
> > @@ -1623,8 +1598,19 @@ perf_event_groups_first(struct perf_even
> >  		} else if (cpu > node_event->cpu) {
> >  			node = node->rb_right;
> >  		} else {
> > -			match = node_event;
> > -			node = node->rb_left;
> > +			if (pmu) {
> > +				if (pmu < node_event->pmu_ctx->pmu) {
> > +					node = node->rb_left;
> > +				} else if (pmu > node_event->pmu_ctx->pmu) {
> > +					node = node->rb_right;
> > +				} else  {
> > +					match = node_event;
> > +					node = node->rb_left;
> > +				}
> > +			} else {
> > +				match = node_event;
> > +				node = node->rb_left;
> > +			}
> >  		}
> >  	}
> >  
> > @@ -1635,13 +1621,17 @@ perf_event_groups_first(struct perf_even
> >   * Like rb_entry_next_safe() for the @cpu subtree.
> >   */
> >  static struct perf_event *
> > -perf_event_groups_next(struct perf_event *event)
> > +perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
> >  {
> >  	struct perf_event *next;
> >  
> >  	next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
> > -	if (next && next->cpu == event->cpu)
> > +	if (next && next->cpu == event->cpu) {
> > +		if (pmu && next->pmu_ctx->pmu != pmu)
> > +			return NULL;
> > +
> >  		return next;
> > +	}
> >  
> >  	return NULL;
> >  }
> 
> This would be much nicer with a per-pmu event_list.

So I was thinking we'd want to easily find the events for a particular
PMU, stuffing them into the perf_event_pmu_context makes that much
harder.

(also, there's an XXX in perf_tp_event() where this capability makes
sense)

In fact, I was planning on making it more complicated still :-) so that
we can optimize the case where merge_sched_in() has saturated a PMU, and
then quickly find the next PMU without having to iterate all the
intermediate events.

See the below patch..

Also, a threaded RB tree might speed up the whole iteration. We could
finally bite the bullet and implement that in the generic RB tree code,
or just fake it (again) by adding an additional list.


> > +	// XXX premature; what if this is allowed, but we get moved to a PMU
> > +	// that doesn't have this.
> >  	if (is_sampling_event(event)) {
> >  		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
> >  			err = -EOPNOTSUPP;
> 
> Ugh, could that happen for SW events moved into a HW context?

I _think_ all SW events support sampling. And since we do not allow
changing anything but SW events, this might just work.


---
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1585,7 +1585,7 @@ del_event_from_groups(struct perf_event
  * Get the leftmost event in the @cpu subtree.
  */
 static struct perf_event *
-perf_event_groups_first(struct perf_event_groups *groups, int cpu, struct pmu *pmu)
+perf_event_groups_first(struct perf_event_groups *groups, int cpu, struct pmu *pmu, bool next)
 {
 	struct perf_event *node_event = NULL, *match = NULL;
 	struct rb_node *node = groups->tree.rb_node;
@@ -1601,7 +1601,8 @@ perf_event_groups_first(struct perf_even
 			if (pmu) {
 				if (pmu < node_event->pmu_ctx->pmu) {
 					node = node->rb_left;
-				} else if (pmu > node_event->pmu_ctx->pmu) {
+				} else if (pmu > node_event->pmu_ctx->pmu ||
+					   (next && pmu == node_event->pmu_ctx->pmu)) {
 					node = node->rb_right;
 				} else  {
 					match = node_event;
@@ -3274,10 +3275,12 @@ visit_groups_merge(struct perf_event_gro
 		   int (*func)(struct perf_event *, void *), void *data)
 {
 	struct perf_event **evt, *evt1, *evt2;
+	bool next = false;
 	int ret;
 
-	evt1 = perf_event_groups_first(groups, -1, pmu);
-	evt2 = perf_event_groups_first(groups, cpu, pmu);
+again:
+	evt1 = perf_event_groups_first(groups, -1, pmu, next);
+	evt2 = perf_event_groups_first(groups, cpu, pmu, next);
 
 	while (evt1 || evt2) {
 		if (evt1 && evt2) {
@@ -3292,9 +3295,15 @@ visit_groups_merge(struct perf_event_gro
 		}
 
 		ret = func(*evt, data);
-		if (ret)
+		if (ret < 0)
 			return ret;
 
+		if (ret > 0) {
+			pmu = (*evt)->pmu_ctx->pmu;
+			next = true;
+			goto again;
+		}
+
 		*evt = perf_event_groups_next(*evt, pmu);
 	}
 
@@ -3386,7 +3395,7 @@ ctx_flexible_sched_in(struct perf_event_
 {
 	struct sched_in_data sid = {
 		.ctx = ctx,
-		.busy = pmu ? -EBUSY : 0,
+		.busy = pmu ? -EBUSY : 1,
 	};
 
 	visit_groups_merge(&ctx->flexible_groups, smp_processor_id(), pmu,
