Re: [RFC][PATCH] perf: Rewrite core context handling

From: Mark Rutland <mark.rutland@arm.com>
To: Peter Zijlstra <peterz@infradead.org>
Cc: mingo@kernel.org, linux-kernel@vger.kernel.org, acme@kernel.org,
	alexander.shishkin@linux.intel.com, jolsa@redhat.com,
	songliubraving@fb.com, eranian@google.com, tglx@linutronix.de,
	alexey.budankov@linux.intel.com, megha.dey@intel.com,
	frederic@kernel.org, nd@arm.com
Subject: Re: [RFC][PATCH] perf: Rewrite core context handling
Date: Tue, 16 Oct 2018 17:26:45 +0100	[thread overview]
Message-ID: <20181016162645.ymtuk6qqz65sg7bt@lakrids.cambridge.arm.com> (raw)
In-Reply-To: <20181010104559.GO5728@hirez.programming.kicks-ass.net>

On Wed, Oct 10, 2018 at 12:45:59PM +0200, Peter Zijlstra wrote:
> Hi all,
> 
> There have been various issues and limitations with the way perf uses
> (task) contexts to track events. Most notable is the single hardware PMU
> task context, which has resulted in a number of yucky things (both
> proposed and merged).
> 
> Notably:
> 
>  - HW breakpoint PMU
>  - ARM big.little PMU
>  - Intel Branch Monitoring PMU
> 
> Since we now track the events in RB trees, we can 'simply' add a pmu
> order to them and have them grouped that way, reducing to a single
> context. Of course, reality never quite works out that simple, and below
> ends up adding an intermediate data structure to bridge the context ->
> pmu mapping.
> 
> Something a little like:
> 
>               ,------------------------[1:n]---------------------.
>               V                                                  V
>     perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event
>               ^                      ^     |                     |
>               `--------[1:n]---------'     `-[n:1]-> pmu <-[1:n]-'
> 
> This patch builds (provided you disable CGROUP_PERF), boots and survives
> perf-top without the machine catching fire.
> 
> There's still a fair bit of loose ends (look for XXX), but I think this
> is the direction we should be going.

I think this is the right direction, as this is roughly what I suggested
before the RB-tree stuff. ;)

> Comments?

Vague things inline below.

> +/*
> + *           ,------------------------[1:n]---------------------.
> + *           V                                                  V
> + * perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event
> + *           ^                      ^     |                     |
> + *           `--------[1:n]---------'     `-[n:1]-> pmu <-[1:n]-'
> + *
> + *
> + * XXX destroy epc when empty
> + *   refcount, !rcu
> + *
> + * XXX epc locking
> + *
> + *   event->pmu_ctx		ctx->mutex && inactive
> + *   ctx->pmu_ctx_list		ctx->mutex && ctx->lock
> + *
> + */
> +struct perf_event_pmu_context {
> +	struct pmu			*pmu;
> +	struct perf_event_context 	*ctx;
> +
> +	struct list_head		pmu_ctx_entry;
> +
> +	struct list_head		pinned_active;
> +	struct list_head		flexible_active;
> +
> +	unsigned int			embedded : 1;

Is this just for lifetime management (i.e. not attempting to free the
embedded epc)?

Do we need a flag? Can't we have the pmu hold a ref on its embedded epc,
and init that at pmu init time?

> +
> +	unsigned int			nr_events;
> +	unsigned int			nr_active;
> +
> +	atomic_t			refcount; /* event <-> epc */
> +
> +	void				*task_ctx_data; /* pmu specific data */
> +};
>  
>  struct perf_event_groups {
>  	struct rb_root	tree;
> @@ -710,7 +749,6 @@ struct perf_event_groups {
>   * Used as a container for task events and CPU events as well:
>   */
>  struct perf_event_context {
> -	struct pmu			*pmu;
>  	/*
>  	 * Protect the states of the events in the list,
>  	 * nr_active, and the list:
> @@ -723,20 +761,21 @@ struct perf_event_context {
>  	 */
>  	struct mutex			mutex;
>  
> -	struct list_head		active_ctx_list;
> +	struct list_head		pmu_ctx_list;
> +
>  	struct perf_event_groups	pinned_groups;
>  	struct perf_event_groups	flexible_groups;
>  	struct list_head		event_list;

I think that the groups lists and event list should be in the
perf_event_pmu_context.

That would make scheduling and rotating events a per-pmu thing, as we
want, without complicating the RB tree logic or requiring additional
hooks.

That may make the move_group case more complicated, though.

... and maybe I've missed some other headache with that?

>  
> -	struct list_head		pinned_active;
> -	struct list_head		flexible_active;
> -
>  	int				nr_events;
>  	int				nr_active;
>  	int				is_active;
> +
> +	int				nr_task_data;
>  	int				nr_stat;
>  	int				nr_freq;
>  	int				rotate_disable;

Likewise these all seem to be PMU-specific (though I guess we care about
them in the ctx-switch fast paths?).

> +
>  	atomic_t			refcount;
>  	struct task_struct		*task;
>  
> @@ -757,7 +796,6 @@ struct perf_event_context {
>  #ifdef CONFIG_CGROUP_PERF
>  	int				nr_cgroups;	 /* cgroup evts */
>  #endif
> -	void				*task_ctx_data; /* pmu specific data */
>  	struct rcu_head			rcu_head;
>  };

[...]

> @@ -1528,6 +1498,11 @@ perf_event_groups_less(struct perf_event
>  	if (left->cpu > right->cpu)
>  		return false;
>  
> +	if (left->pmu_ctx->pmu < right->pmu_ctx->pmu)
> +		return true;
> +	if (left->pmu_ctx->pmu > right->pmu_ctx->pmu)
> +		return false;
> +
>  	if (left->group_index < right->group_index)
>  		return true;
>  	if (left->group_index > right->group_index)
> @@ -1610,7 +1585,7 @@ del_event_from_groups(struct perf_event
>   * Get the leftmost event in the @cpu subtree.
>   */
>  static struct perf_event *
> -perf_event_groups_first(struct perf_event_groups *groups, int cpu)
> +perf_event_groups_first(struct perf_event_groups *groups, int cpu, struct pmu *pmu)
>  {
>  	struct perf_event *node_event = NULL, *match = NULL;
>  	struct rb_node *node = groups->tree.rb_node;
> @@ -1623,8 +1598,19 @@ perf_event_groups_first(struct perf_even
>  		} else if (cpu > node_event->cpu) {
>  			node = node->rb_right;
>  		} else {
> -			match = node_event;
> -			node = node->rb_left;
> +			if (pmu) {
> +				if (pmu < node_event->pmu_ctx->pmu) {
> +					node = node->rb_left;
> +				} else if (pmu > node_event->pmu_ctx->pmu) {
> +					node = node->rb_right;
> +				} else  {
> +					match = node_event;
> +					node = node->rb_left;
> +				}
> +			} else {
> +				match = node_event;
> +				node = node->rb_left;
> +			}
>  		}
>  	}
>  
> @@ -1635,13 +1621,17 @@ perf_event_groups_first(struct perf_even
>   * Like rb_entry_next_safe() for the @cpu subtree.
>   */
>  static struct perf_event *
> -perf_event_groups_next(struct perf_event *event)
> +perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
>  {
>  	struct perf_event *next;
>  
>  	next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
> -	if (next && next->cpu == event->cpu)
> +	if (next && next->cpu == event->cpu) {
> +		if (pmu && next->pmu_ctx->pmu != pmu)
> +			return NULL;
> +
>  		return next;
> +	}
>  
>  	return NULL;
>  }

This would be much nicer with a per-pmu event_list.

[...]

> +	// XXX premature; what if this is allowed, but we get moved to a PMU
> +	// that doesn't have this.
>  	if (is_sampling_event(event)) {
>  		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
>  			err = -EOPNOTSUPP;

Ugh, could that happen for SW events moved into a HW context?

Thanks,
Mark.