From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path:
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1764679AbdAJO0E (ORCPT ); Tue, 10 Jan 2017 09:26:04 -0500
Received: from foss.arm.com ([217.140.101.70]:53352 "EHLO foss.arm.com"
	rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
	id S939261AbdAJNua (ORCPT ); Tue, 10 Jan 2017 08:50:30 -0500
Date: Tue, 10 Jan 2017 13:49:25 +0000
From: Mark Rutland
To: David Carrillo-Cisneros
Cc: linux-kernel@vger.kernel.org, "x86@kernel.org" ,
	Ingo Molnar , Thomas Gleixner , Andi Kleen , Kan Liang ,
	Peter Zijlstra , Borislav Petkov , Srinivas Pandruvada ,
	Dave Hansen , Vikas Shivappa , Arnaldo Carvalho de Melo ,
	Vince Weaver , Paul Turner , Stephane Eranian
Subject: Re: [RFC 1/6] perf/core: create active and inactive event groups
Message-ID: <20170110134925.GB19704@leverpostej>
References: <20170110102502.106187-1-davidcc@google.com>
	<20170110102502.106187-2-davidcc@google.com>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <20170110102502.106187-2-davidcc@google.com>
User-Agent: Mutt/1.5.21 (2010-09-15)
Sender: linux-kernel-owner@vger.kernel.org
List-ID:
X-Mailing-List: linux-kernel@vger.kernel.org

Hi,

On Tue, Jan 10, 2017 at 02:24:57AM -0800, David Carrillo-Cisneros wrote:
> Currently, perf uses pinned_groups and flexible_groups for sched in/out.
> We can do better because:
> - sched out only cares about the ACTIVE events; this is usually a small
>   set of events.
> - There can be many events in these lists that are not relevant to
>   the scheduler (e.g. other CPU/cgroups, events in OFF and ERROR state).
>
> Reduce the set of events to iterate over on each context switch by adding
> three new lists: active_pinned_groups, active_flexible_groups and
> inactive_groups. All events in each list are in the same state, so we
> avoid checking state. It also saves the iteration over events in OFF and
> ERROR state during sched in/out.
>
> The main impact of this patch is that ctx_sched_out can use the "small"
> active_{pinned,flexible}_groups instead of the potentially much larger
> {pinned,flexible}_groups.
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 4741ecdb9817..3fa18f05c9b0 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -573,6 +573,7 @@ struct perf_event {
>
>  	struct hlist_node		hlist_entry;
>  	struct list_head		active_entry;
> +	struct list_head		ctx_active_entry;

I think we should be able to kill off active_entry as part of this
series; it's there to do the same thing (optimize iteration over active
events).

If we expose a for_each_ctx_active_event() helper which iterates over
the pinned and flexible lists, I think we may be able to migrate
existing users over and kill off perf_event::active_entry, and the
redundant list manipulation in drivers.

... there might be some fun and games ordering list manipulation
against PMI handlers, though, so it may turn out that we need both.

>  	int				nr_siblings;
>
>  	/* Not serialized. Only written during event initialization.
>  	 */
> @@ -734,6 +735,11 @@ struct perf_event_context {
>  	struct list_head		active_ctx_list;
>  	struct list_head		pinned_groups;
>  	struct list_head		flexible_groups;
> +
> +	struct list_head		active_pinned_groups;
> +	struct list_head		active_flexible_groups;
> +	struct list_head		inactive_groups;
> +
>  	struct list_head		event_list;
>  	int				nr_events;
>  	int				nr_active;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index faf073d0287f..b744b5a8dbd0 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1462,6 +1462,21 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
>  		return &ctx->flexible_groups;
>  }
>
> +static void
> +ctx_sched_groups_to_inactive(struct perf_event *event,
> +			     struct perf_event_context *ctx)
> +{
> +	WARN_ON(event->state != PERF_EVENT_STATE_INACTIVE);
> +	list_move_tail(&event->ctx_active_entry, &ctx->inactive_groups);
> +};

> @@ -1851,6 +1877,11 @@ group_sched_out(struct perf_event *group_event,
>
>  	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
>  		cpuctx->exclusive = 0;
> +
> +	if (group_event->state <= PERF_EVENT_STATE_INACTIVE)
> +		ctx_sched_groups_to_inactive(group_event, ctx);

Was this intended to be '=='? As-is, this looks inconsistent with the
WARN_ON() in ctx_sched_groups_to_inactive() ...

> +	if (group_event->state < PERF_EVENT_STATE_INACTIVE)
> +		ctx_sched_groups_del(group_event, ctx);

... and here we'll subsequently delete most events from the inactive
list, rather than never adding them to the inactive list in the first
place.
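To spell out the mismatch, the state values make it concrete. A userspace sketch (the enum mirrors perf_event_active_state from perf_event.h, and trips_warn() is just a name made up for this demo):

```c
/* Values mirror enum perf_event_active_state in include/linux/perf_event.h. */
enum state {
	STATE_EXIT	= -3,
	STATE_ERROR	= -2,
	STATE_OFF	= -1,
	STATE_INACTIVE	=  0,
	STATE_ACTIVE	=  1,
};

/*
 * A state that passes the '<=' check in group_sched_out() but is not
 * INACTIVE still reaches ctx_sched_groups_to_inactive(), where it trips
 * the WARN_ON(event->state != PERF_EVENT_STATE_INACTIVE).
 */
static int trips_warn(enum state s)
{
	return s <= STATE_INACTIVE && s != STATE_INACTIVE;
}
```

i.e. OFF, ERROR and EXIT all satisfy the '<=' check but immediately trip the WARN_ON.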
> }
>
>  #define DETACH_GROUP	0x01UL
> @@ -1918,6 +1949,8 @@ static void __perf_event_disable(struct perf_event *event,
>  		group_sched_out(event, cpuctx, ctx);
>  	else
>  		event_sched_out(event, cpuctx, ctx);
> +	if (event->state == PERF_EVENT_STATE_INACTIVE)
> +		ctx_sched_groups_del(event, ctx);
>  	event->state = PERF_EVENT_STATE_OFF;
>  }
>
> @@ -2014,6 +2047,17 @@ static void perf_set_shadow_time(struct perf_event *event,
>  static void perf_log_throttle(struct perf_event *event, int enable);
>  static void perf_log_itrace_start(struct perf_event *event);
>
> +static void
> +ctx_sched_groups_to_active(struct perf_event *event, struct perf_event_context *ctx)
> +{
> +	struct list_head *h = event->attr.pinned ? &ctx->active_pinned_groups :
> +						   &ctx->active_flexible_groups;

It would be nicer to split the definition from the initialisation. That
way the lines can be shorter and more legible, and we can s/h/head/ ...

> +	WARN_ON(!event);

... and we can move the dereference of event after the check here.

That said, is there ever a risk of this being NULL? Won't the event
have to be the container of a list element we walked? Or is there a
path where that is not the case?

We didn't add a similar check to ctx_sched_groups_to_inactive(), so if
nothing else it seems inconsistent.

> +	WARN_ON(list_empty(&event->ctx_active_entry));

I take it this is because we always expect the event to be in the
inactive list first?
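For illustration, the shape I have in mind is something like the following userspace sketch (the stub types are minimal stand-ins for the kernel ones, and ctx_active_list() is a made-up name for the split-out selection):

```c
#include <stddef.h>

/* Minimal stand-ins for the kernel types, just enough to show the shape. */
struct list_head { struct list_head *next, *prev; };
struct perf_event_attr { unsigned int pinned : 1; };

struct perf_event_context {
	struct list_head active_pinned_groups;
	struct list_head active_flexible_groups;
};

struct perf_event {
	struct perf_event_attr attr;
	struct list_head ctx_active_entry;
};

/*
 * Declaration split from initialisation: the lines stay short, the
 * variable can be named 'head', and nothing dereferences 'event'
 * before a NULL check would run.
 */
static struct list_head *
ctx_active_list(struct perf_event *event, struct perf_event_context *ctx)
{
	struct list_head *head;

	if (event->attr.pinned)
		head = &ctx->active_pinned_groups;
	else
		head = &ctx->active_flexible_groups;

	return head;
}
```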
> +	WARN_ON(event->state != PERF_EVENT_STATE_ACTIVE);
> +	list_move_tail(&event->ctx_active_entry, h);
> +}
> +
>  static int
>  event_sched_in(struct perf_event *event,
>  	       struct perf_cpu_context *cpuctx,
> @@ -2091,9 +2135,7 @@ group_sched_in(struct perf_event *group_event,
>  	u64 now = ctx->time;
>  	bool simulate = false;
>
> -	if (group_event->state == PERF_EVENT_STATE_OFF)
> -		return 0;
> -
> +	WARN_ON(group_event->state != PERF_EVENT_STATE_INACTIVE);
>  	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
>
>  	if (event_sched_in(group_event, cpuctx, ctx)) {
> @@ -2112,9 +2154,10 @@ group_sched_in(struct perf_event *group_event,
>  		}
>  	}
>
> -	if (!pmu->commit_txn(pmu))
> +	if (!pmu->commit_txn(pmu)) {
> +		ctx_sched_groups_to_active(group_event, ctx);
>  		return 0;

I think IRQs are disabled in this path (though I'll need to
double-check), but I don't think the PMU is disabled, so I believe a
PMI can come in between the commit_txn() and the addition of events to
their active list.

I'm not immediately sure if that matters -- we'll need to consider what
list manipulation might happen in a PMI handler.

If it does matter, we could always add the events to an active list
first, then try the commit, then remove them if the commit failed. It
means we might see some not-actually-active events in the active lists
occasionally, but the lists would still be shorter than the full event
list.
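A userspace sketch of that ordering (toy list helpers rather than the kernel API, and the commit outcome simulated by a flag, so none of this is the real group_sched_in()):

```c
/* Toy intrusive list, standing in for the kernel's list_head helpers. */
struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h)
{
	h->next = h->prev = h;
}

static void list_move_tail(struct list_head *e, struct list_head *head)
{
	/* unlink from the current list */
	e->prev->next = e->next;
	e->next->prev = e->prev;
	/* link at the tail of the new list */
	e->prev = head->prev;
	e->next = head;
	head->prev->next = e;
	head->prev = e;
}

struct ctx { struct list_head inactive, active; };
struct group { struct list_head entry; };

/*
 * Move the group onto the active list *before* attempting the commit,
 * so a PMI arriving right after commit_txn() always finds it there;
 * roll back to the inactive list if the commit fails.
 */
static int group_sched_in(struct ctx *ctx, struct group *grp, int commit_ok)
{
	list_move_tail(&grp->entry, &ctx->active);

	if (commit_ok)
		return 0;

	/* Commit failed: the group goes back to the inactive list. */
	list_move_tail(&grp->entry, &ctx->inactive);
	return -1;
}
```

The cost is that a walker may briefly see a not-yet-committed group on the active list, which is the trade-off described above.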
> -
> +	}
>  group_error:
>  	/*
>  	 * Groups can be scheduled in as one unit only, so undo any
> @@ -2396,6 +2439,7 @@ static void __perf_event_enable(struct perf_event *event,
>  		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
>
>  	__perf_event_mark_enabled(event);
> +	ctx_sched_groups_add(event, ctx);
>
>  	if (!ctx->is_active)
>  		return;
> @@ -2611,7 +2655,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
>  			  enum event_type_t event_type)
>  {
>  	int is_active = ctx->is_active;
> -	struct perf_event *event;
> +	struct perf_event *event, *tmp;
>
>  	lockdep_assert_held(&ctx->lock);
>
> @@ -2658,13 +2702,17 @@ static void ctx_sched_out(struct perf_event_context *ctx,
>
>  	perf_pmu_disable(ctx->pmu);
>  	if (is_active & EVENT_PINNED) {
> -		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
> +		list_for_each_entry_safe(event, tmp, &ctx->active_pinned_groups, ctx_active_entry) {
> +			WARN_ON(event->state != PERF_EVENT_STATE_ACTIVE);
>  			group_sched_out(event, cpuctx, ctx);
> +		}
>  	}
>
>  	if (is_active & EVENT_FLEXIBLE) {
> -		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
> +		list_for_each_entry_safe(event, tmp, &ctx->active_flexible_groups, ctx_active_entry) {
> +			WARN_ON(event->state != PERF_EVENT_STATE_ACTIVE);
>  			group_sched_out(event, cpuctx, ctx);
> +		}
>  	}
>  	perf_pmu_enable(ctx->pmu);
>  }
> @@ -2962,10 +3010,11 @@ static void
>  ctx_pinned_sched_in(struct perf_event_context *ctx,
>  		    struct perf_cpu_context *cpuctx)
>  {
> -	struct perf_event *event;
> +	struct perf_event *event = NULL, *tmp;

I don't believe we need to initialise event here;
list_for_each_entry_safe() should do that as required.

>
> -	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
> -		if (event->state <= PERF_EVENT_STATE_OFF)
> +	list_for_each_entry_safe(
> +		event, tmp, &ctx->inactive_groups, ctx_active_entry) {
> +		if (WARN_ON(event->state != PERF_EVENT_STATE_INACTIVE)) /* debug only */
>  			continue;

Given the comment, is this still needed?
>  		if (!event_filter_match(event))
>  			continue;
> @@ -2983,6 +3032,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
>  		 */
>  		if (event->state == PERF_EVENT_STATE_INACTIVE) {
>  			update_group_times(event);
> +			ctx_sched_groups_del(event, ctx);
>  			event->state = PERF_EVENT_STATE_ERROR;
>  		}
>  	}
> @@ -2992,12 +3042,12 @@ static void
>  ctx_flexible_sched_in(struct perf_event_context *ctx,
>  		      struct perf_cpu_context *cpuctx)
>  {
> -	struct perf_event *event;
> +	struct perf_event *event = NULL, *tmp;
>  	int can_add_hw = 1;
>
> -	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
> -		/* Ignore events in OFF or ERROR state */
> -		if (event->state <= PERF_EVENT_STATE_OFF)
> +	list_for_each_entry_safe(
> +		event, tmp, &ctx->inactive_groups, ctx_active_entry) {
> +		if (WARN_ON(event->state != PERF_EVENT_STATE_INACTIVE)) /* debug only */
>  			continue;

Likewise, is this still needed?

Thanks,
Mark.