From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755124AbbKDEMY (ORCPT ); Tue, 3 Nov 2015 23:12:24 -0500 Received: from mail-pa0-f43.google.com ([209.85.220.43]:36484 "EHLO mail-pa0-f43.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754507AbbKDEMX (ORCPT ); Tue, 3 Nov 2015 23:12:23 -0500 Date: Wed, 4 Nov 2015 05:12:19 +0100 From: Stephane Eranian To: linux-kernel@vger.kernel.org Cc: peterz@infradead.org, mingo@elte.hu, ak@linux.intel.com, edumazet@google.com, acme@redhat.com Subject: [PATCH v3] perf: fix RCU issues with cgroup monitoring mode Message-ID: <20151104041219.GA12969@thinkpad> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline User-Agent: Mutt/1.5.21 (2010-09-15) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org This patch eliminates RCU violations detected by the RCU checker (PROVE_RCU). The impact code paths were all related to cgroup mode monitoring and involved access a task's cgrp. V2 is updated to include comments from PeterZ to eliminate some of the warnings without grabbing the rcu_read lock because we know we are already holding th ctx->lock which prevents the cgroup from disappearing while we are accessing it. The trick, as suggested by Peter, is to modify the perf_cgroup_from_task() to take an extra boolean parameter to allow bypassing the lockdep test in the task_subsys_cstate() macros. This patch uses this approach to update all calls the perf_cgroup_from_task(). In V3, we change the boolean parameter for a pointer to a perf_event_context so we can check the ctx->lock explicitely. This is more robust, than passing the boolean to express that we know the lock is held. The code can change, and thus the locking assumption, checking lockdep_is_held() ensures, the proper locking is in place. Patch relative to tip.git at commit 57ef9fc. Signed-off-by: Stephane Eranian --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 2 +- include/linux/perf_event.h | 5 +++-- kernel/events/core.c | 25 +++++++++++++++---------- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 377e8f8..a316ca9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -298,7 +298,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) { if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target); + return perf_cgroup_from_task(event->hw.target, event->ctx); return event->cgrp; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d841d33..94107e4 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -697,9 +697,10 @@ struct perf_cgroup { * if there is no cgroup event for the current CPU context. */ static inline struct perf_cgroup * -perf_cgroup_from_task(struct task_struct *task) +perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) { - return container_of(task_css(task, perf_event_cgrp_id), + bool safe = ctx ? lockdep_is_held(&ctx->lock) : true; + return container_of(task_css_check(task, perf_event_cgrp_id, safe), struct perf_cgroup, css); } #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/core.c b/kernel/events/core.c index ea02109..f611246 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -435,7 +435,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) if (!is_cgroup_event(event)) return; - cgrp = perf_cgroup_from_task(current); + cgrp = perf_cgroup_from_task(current, event->ctx); /* * Do not update time when cgroup is not active */ @@ -458,7 +458,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, if (!task || !ctx->nr_cgroups) return; - cgrp = perf_cgroup_from_task(task); + cgrp = perf_cgroup_from_task(task, ctx); info = this_cpu_ptr(cgrp->info); info->timestamp = ctx->timestamp; } @@ -489,7 +489,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) * we reschedule only in the presence of cgroup * constrained events. */ - rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); @@ -523,7 +522,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) * event_filter_match() to not have to pass * task around */ - cpuctx->cgrp = perf_cgroup_from_task(task); + cpuctx->cgrp = perf_cgroup_from_task(task, NULL); cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); } perf_pmu_enable(cpuctx->ctx.pmu); @@ -531,8 +530,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) } } - rcu_read_unlock(); - local_irq_restore(flags); } @@ -542,17 +539,18 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, struct perf_cgroup *cgrp1; struct perf_cgroup *cgrp2 = NULL; + rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 */ - cgrp1 = perf_cgroup_from_task(task); + cgrp1 = perf_cgroup_from_task(task, NULL); /* * next is NULL when called from perf_event_enable_on_exec() * that will systematically cause a cgroup_switch() */ if (next) - cgrp2 = perf_cgroup_from_task(next); + cgrp2 = perf_cgroup_from_task(next, NULL); /* * only schedule out current cgroup events if we know @@ -561,6 +559,8 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, */ if (cgrp1 != cgrp2) perf_cgroup_switch(task, PERF_CGROUP_SWOUT); + + rcu_read_unlock(); } static inline void perf_cgroup_sched_in(struct task_struct *prev, @@ -569,13 +569,14 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, struct perf_cgroup *cgrp1; struct perf_cgroup *cgrp2 = NULL; + rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 */ - cgrp1 = perf_cgroup_from_task(task); + cgrp1 = perf_cgroup_from_task(task, NULL); /* prev can never be NULL */ - cgrp2 = perf_cgroup_from_task(prev); + cgrp2 = perf_cgroup_from_task(prev, NULL); /* * only need to schedule in cgroup events if we are changing @@ -584,6 +585,8 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, */ if (cgrp1 != cgrp2) perf_cgroup_switch(task, PERF_CGROUP_SWIN); + + rcu_read_unlock(); } static inline int perf_cgroup_connect(int fd, struct perf_event *event, @@ -9442,7 +9445,9 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css) static int __perf_cgroup_move(void *info) { struct task_struct *task = info; + rcu_read_lock(); perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); + rcu_read_unlock(); return 0; } -- 2.5.0