From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1760374AbdAJKZn (ORCPT );
	Tue, 10 Jan 2017 05:25:43 -0500
Received: from mail-pf0-f173.google.com ([209.85.192.173]:34407 "EHLO
	mail-pf0-f173.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1750924AbdAJKZl (ORCPT );
	Tue, 10 Jan 2017 05:25:41 -0500
From: David Carrillo-Cisneros
To: linux-kernel@vger.kernel.org
Cc: "x86@kernel.org", Ingo Molnar, Thomas Gleixner, Andi Kleen, Kan Liang,
	Peter Zijlstra, Borislav Petkov, Srinivas Pandruvada, Dave Hansen,
	Vikas Shivappa, Mark Rutland, Arnaldo Carvalho de Melo, Vince Weaver,
	Paul Turner, Stephane Eranian, David Carrillo-Cisneros
Subject: [RFC 1/6] perf/core: create active and inactive event groups
Date: Tue, 10 Jan 2017 02:24:57 -0800
Message-Id: <20170110102502.106187-2-davidcc@google.com>
X-Mailer: git-send-email 2.11.0.390.gc69c2f50cf-goog
In-Reply-To: <20170110102502.106187-1-davidcc@google.com>
References: <20170110102502.106187-1-davidcc@google.com>
Sender: linux-kernel-owner@vger.kernel.org
List-ID:
X-Mailing-List: linux-kernel@vger.kernel.org

Currently, perf uses pinned_groups and flexible_groups for sched in/out.
We can do better because:
 - sched out only cares about ACTIVE events, which are usually a small
   set of events.
 - There can be many events in these lists that are not relevant to the
   scheduler (e.g. events for other CPUs/cgroups, or events in OFF and
   ERROR state).

Reduce the set of events to iterate over on each context switch by adding
three new lists: active_pinned_groups, active_flexible_groups and
inactive_groups. All events in a given list are in the same state, so we
avoid checking the state of each event. It also saves the iteration over
events in OFF and ERROR state during sched in/out.

The main impact of this patch is that ctx_sched_out can use the "small"
active_{pinned,flexible}_groups instead of the potentially much larger
{pinned,flexible}_groups.

There is no pinned/flexible version of the inactive list because the next
patches in this series will create an index on it.

Bookkeeping of the new lists is more involved, but it can provide a
potentially large speed up. The new lists are intended to eventually
replace {pinned,flexible}_groups, although that is not yet implemented.

The inactive list is kept in FIFO order, and events are only added to it
after group_sched_in has succeeded. This guarantees that the list is in
timestamp order.

Signed-off-by: David Carrillo-Cisneros
---
 include/linux/perf_event.h |  6 +++
 kernel/events/core.c       | 93 +++++++++++++++++++++++++++++++++++++---------
 2 files changed, 82 insertions(+), 17 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4741ecdb9817..3fa18f05c9b0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -573,6 +573,7 @@ struct perf_event {
 
 	struct hlist_node		hlist_entry;
 	struct list_head		active_entry;
+	struct list_head		ctx_active_entry;
 	int				nr_siblings;
 
 	/* Not serialized. Only written during event initialization. */
@@ -734,6 +735,11 @@ struct perf_event_context {
 	struct list_head		active_ctx_list;
 	struct list_head		pinned_groups;
 	struct list_head		flexible_groups;
+
+	struct list_head		active_pinned_groups;
+	struct list_head		active_flexible_groups;
+	struct list_head		inactive_groups;
+
 	struct list_head		event_list;
 	int				nr_events;
 	int				nr_active;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index faf073d0287f..b744b5a8dbd0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1462,6 +1462,21 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 		return &ctx->flexible_groups;
 }
 
+static void
+ctx_sched_groups_to_inactive(struct perf_event *event,
+			     struct perf_event_context *ctx)
+{
+	WARN_ON(event->state != PERF_EVENT_STATE_INACTIVE);
+	list_move_tail(&event->ctx_active_entry, &ctx->inactive_groups);
+};
+
+static void
+ctx_sched_groups_add(struct perf_event *event, struct perf_event_context *ctx)
+{
+	WARN_ON(!list_empty(&event->ctx_active_entry));
+	list_add_tail(&event->ctx_active_entry, &ctx->inactive_groups);
+}
+
 /*
  * Add a event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
@@ -1487,10 +1502,11 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 
 		list = ctx_group_list(event, ctx);
 		list_add_tail(&event->group_entry, list);
+		if (event->state == PERF_EVENT_STATE_INACTIVE)
+			ctx_sched_groups_add(event, ctx);
 	}
 
 	list_update_cgroup_event(event, ctx, true);
-
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
@@ -1648,6 +1664,13 @@ static void perf_group_attach(struct perf_event *event)
 		perf_event__header_size(pos);
 }
 
+static void ctx_sched_groups_del(struct perf_event *group,
+				 struct perf_event_context *ctx)
+{
+	WARN_ON(group->state != PERF_EVENT_STATE_INACTIVE);
+	list_del_init(&group->ctx_active_entry);
+}
+
 /*
  * Remove a event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
@@ -1674,8 +1697,11 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	list_del_rcu(&event->event_entry);
 
-	if (event->group_leader == event)
+	if (event->group_leader == event) {
+		if (event->state == PERF_EVENT_STATE_INACTIVE)
+			ctx_sched_groups_del(event, ctx);
 		list_del_init(&event->group_entry);
+	}
 
 	update_group_times(event);
 
@@ -1851,6 +1877,11 @@ group_sched_out(struct perf_event *group_event,
 
 	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
 		cpuctx->exclusive = 0;
+
+	if (group_event->state <= PERF_EVENT_STATE_INACTIVE)
+		ctx_sched_groups_to_inactive(group_event, ctx);
+	if (group_event->state < PERF_EVENT_STATE_INACTIVE)
+		ctx_sched_groups_del(group_event, ctx);
 }
 
 #define DETACH_GROUP	0x01UL
@@ -1918,6 +1949,8 @@ static void __perf_event_disable(struct perf_event *event,
 		group_sched_out(event, cpuctx, ctx);
 	else
 		event_sched_out(event, cpuctx, ctx);
+	if (event->state == PERF_EVENT_STATE_INACTIVE)
+		ctx_sched_groups_del(event, ctx);
 	event->state = PERF_EVENT_STATE_OFF;
 }
 
@@ -2014,6 +2047,17 @@ static void perf_set_shadow_time(struct perf_event *event,
 static void perf_log_throttle(struct perf_event *event, int enable);
 static void perf_log_itrace_start(struct perf_event *event);
 
+static void
+ctx_sched_groups_to_active(struct perf_event *event, struct perf_event_context *ctx)
+{
+	struct list_head *h = event->attr.pinned ? &ctx->active_pinned_groups :
+						   &ctx->active_flexible_groups;
+	WARN_ON(!event);
+	WARN_ON(list_empty(&event->ctx_active_entry));
+	WARN_ON(event->state != PERF_EVENT_STATE_ACTIVE);
+	list_move_tail(&event->ctx_active_entry, h);
+}
+
 static int
 event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
@@ -2091,9 +2135,7 @@ group_sched_in(struct perf_event *group_event,
 	u64 now = ctx->time;
 	bool simulate = false;
 
-	if (group_event->state == PERF_EVENT_STATE_OFF)
-		return 0;
-
+	WARN_ON(group_event->state != PERF_EVENT_STATE_INACTIVE);
 	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
 
 	if (event_sched_in(group_event, cpuctx, ctx)) {
@@ -2112,9 +2154,10 @@ group_sched_in(struct perf_event *group_event,
 		}
 	}
 
-	if (!pmu->commit_txn(pmu))
+	if (!pmu->commit_txn(pmu)) {
+		ctx_sched_groups_to_active(group_event, ctx);
 		return 0;
-
+	}
 group_error:
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
@@ -2396,6 +2439,7 @@ static void __perf_event_enable(struct perf_event *event,
 		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 
 	__perf_event_mark_enabled(event);
+	ctx_sched_groups_add(event, ctx);
 
 	if (!ctx->is_active)
 		return;
@@ -2611,7 +2655,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 			  enum event_type_t event_type)
 {
 	int is_active = ctx->is_active;
-	struct perf_event *event;
+	struct perf_event *event, *tmp;
 
 	lockdep_assert_held(&ctx->lock);
 
@@ -2658,13 +2702,17 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 
 	perf_pmu_disable(ctx->pmu);
 	if (is_active & EVENT_PINNED) {
-		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
+		list_for_each_entry_safe(event, tmp, &ctx->active_pinned_groups, ctx_active_entry) {
+			WARN_ON(event->state != PERF_EVENT_STATE_ACTIVE);
 			group_sched_out(event, cpuctx, ctx);
+		}
 	}
 
 	if (is_active & EVENT_FLEXIBLE) {
-		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
+		list_for_each_entry_safe(event, tmp, &ctx->active_flexible_groups, ctx_active_entry) {
+			WARN_ON(event->state != PERF_EVENT_STATE_ACTIVE);
 			group_sched_out(event, cpuctx, ctx);
+		}
 	}
 	perf_pmu_enable(ctx->pmu);
 }
@@ -2962,10 +3010,11 @@ static void
 ctx_pinned_sched_in(struct perf_event_context *ctx,
 		    struct perf_cpu_context *cpuctx)
 {
-	struct perf_event *event;
+	struct perf_event *event = NULL, *tmp;
 
-	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
-		if (event->state <= PERF_EVENT_STATE_OFF)
+	list_for_each_entry_safe(
+			event, tmp, &ctx->inactive_groups, ctx_active_entry) {
+		if (WARN_ON(event->state != PERF_EVENT_STATE_INACTIVE)) /* debug only */
 			continue;
 		if (!event_filter_match(event))
 			continue;
@@ -2983,6 +3032,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 		 */
 		if (event->state == PERF_EVENT_STATE_INACTIVE) {
 			update_group_times(event);
+			ctx_sched_groups_del(event, ctx);
 			event->state = PERF_EVENT_STATE_ERROR;
 		}
 	}
@@ -2992,12 +3042,12 @@ static void
 ctx_flexible_sched_in(struct perf_event_context *ctx,
 		      struct perf_cpu_context *cpuctx)
 {
-	struct perf_event *event;
+	struct perf_event *event = NULL, *tmp;
 	int can_add_hw = 1;
 
-	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
-		/* Ignore events in OFF or ERROR state */
-		if (event->state <= PERF_EVENT_STATE_OFF)
+	list_for_each_entry_safe(
+			event, tmp, &ctx->inactive_groups, ctx_active_entry) {
+		if (WARN_ON(event->state != PERF_EVENT_STATE_INACTIVE)) /* debug only */
 			continue;
 		/*
 		 * Listen to the 'cpu' scheduling filter constraint
@@ -3389,6 +3439,7 @@ static int event_enable_on_exec(struct perf_event *event,
 		return 0;
 
 	__perf_event_mark_enabled(event);
+	ctx_sched_groups_add(event, ctx);
 
 	return 1;
 }
@@ -3639,6 +3690,9 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
 	INIT_LIST_HEAD(&ctx->pinned_groups);
 	INIT_LIST_HEAD(&ctx->flexible_groups);
 	INIT_LIST_HEAD(&ctx->event_list);
+	INIT_LIST_HEAD(&ctx->active_pinned_groups);
+	INIT_LIST_HEAD(&ctx->active_flexible_groups);
+	INIT_LIST_HEAD(&ctx->inactive_groups);
 	atomic_set(&ctx->refcount, 1);
 }
 
@@ -9109,6 +9163,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	INIT_LIST_HEAD(&event->sibling_list);
 	INIT_LIST_HEAD(&event->rb_entry);
 	INIT_LIST_HEAD(&event->active_entry);
+	INIT_LIST_HEAD(&event->ctx_active_entry);
 	INIT_LIST_HEAD(&event->addr_filters.list);
 	INIT_HLIST_NODE(&event->hlist_entry);
 
@@ -10085,6 +10140,10 @@ perf_event_exit_event(struct perf_event *child_event,
 	if (parent_event)
 		perf_group_detach(child_event);
 	list_del_event(child_event, child_ctx);
+
+	if (!parent_event && child_event->state == PERF_EVENT_STATE_INACTIVE)
+		ctx_sched_groups_del(parent_event, child_ctx);
+
 	child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
 	raw_spin_unlock_irq(&child_ctx->lock);
 
-- 
2.11.0.390.gc69c2f50cf-goog
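
To make the bookkeeping described in the commit message easier to follow, here is a small,
self-contained user-space sketch of the intended life cycle: a group leader is appended to
the tail of inactive_groups when it is enabled, moved to active_pinned_groups or
active_flexible_groups once group_sched_in succeeds, moved back to the tail of
inactive_groups on sched out, and dropped from the lists when it is disabled or goes to
ERROR/EXIT. This is an illustration only: the list helpers, structures and state values
below are simplified stand-ins rather than the kernel's <linux/list.h> and perf internals,
and the function names merely mirror the helpers introduced by the patch.

/*
 * Illustrative sketch, not kernel code: minimal circular doubly-linked list
 * plus a toy event/context to show the active/inactive list transitions.
 */
#include <assert.h>
#include <stdbool.h>

struct list_head { struct list_head *prev, *next; };

static void INIT_LIST_HEAD(struct list_head *h) { h->prev = h->next = h; }
static bool list_empty(const struct list_head *h) { return h->next == h; }

static void list_del_init(struct list_head *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
	INIT_LIST_HEAD(e);
}

static void list_add_tail(struct list_head *e, struct list_head *h)
{
	e->prev = h->prev;
	e->next = h;
	h->prev->next = e;
	h->prev = e;
}

static void list_move_tail(struct list_head *e, struct list_head *h)
{
	list_del_init(e);
	list_add_tail(e, h);
}

enum state { STATE_OFF, STATE_INACTIVE, STATE_ACTIVE };

struct event {
	enum state		state;
	bool			pinned;
	struct list_head	ctx_active_entry;
};

struct context {
	struct list_head	active_pinned_groups;
	struct list_head	active_flexible_groups;
	struct list_head	inactive_groups;
};

/* A newly enabled group leader goes to the tail of the inactive list. */
static void sched_groups_add(struct event *e, struct context *ctx)
{
	assert(list_empty(&e->ctx_active_entry));
	e->state = STATE_INACTIVE;
	list_add_tail(&e->ctx_active_entry, &ctx->inactive_groups);
}

/* After a successful group_sched_in, move it to the matching active list. */
static void sched_groups_to_active(struct event *e, struct context *ctx)
{
	e->state = STATE_ACTIVE;
	list_move_tail(&e->ctx_active_entry,
		       e->pinned ? &ctx->active_pinned_groups
				 : &ctx->active_flexible_groups);
}

/* On sched out, the group returns to the tail of the inactive list (FIFO). */
static void sched_groups_to_inactive(struct event *e, struct context *ctx)
{
	e->state = STATE_INACTIVE;
	list_move_tail(&e->ctx_active_entry, &ctx->inactive_groups);
}

/* Groups that go to OFF/ERROR/EXIT drop off the scheduling lists entirely. */
static void sched_groups_del(struct event *e)
{
	e->state = STATE_OFF;
	list_del_init(&e->ctx_active_entry);
}

int main(void)
{
	struct context ctx;
	struct event ev = { .state = STATE_OFF, .pinned = true };

	INIT_LIST_HEAD(&ctx.active_pinned_groups);
	INIT_LIST_HEAD(&ctx.active_flexible_groups);
	INIT_LIST_HEAD(&ctx.inactive_groups);
	INIT_LIST_HEAD(&ev.ctx_active_entry);

	sched_groups_add(&ev, &ctx);		/* enable: tail of inactive_groups  */
	sched_groups_to_active(&ev, &ctx);	/* sched in: active_pinned_groups   */
	assert(!list_empty(&ctx.active_pinned_groups));
	sched_groups_to_inactive(&ev, &ctx);	/* sched out: back to inactive      */
	sched_groups_del(&ev);			/* disable: off all scheduling lists */
	assert(list_empty(&ctx.inactive_groups));
	return 0;
}

Appending at the tail on every transition is what keeps inactive_groups in FIFO (and
hence timestamp) order, which the commit message says later patches in the series will
build an index on.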