Date: Fri, 28 Apr 2017 16:24:56 +0200
From: Sebastian Siewior
To: Thomas Gleixner, Peter Zijlstra
Cc: LKML, Ingo Molnar, Steven Rostedt
Subject: [RFC PATCH] trace/perf: cure locking issue in perf_event_open() error path
Message-ID: <20170428142456.5xh44ef3fv7w2kkh@linutronix.de>
In-Reply-To: <20170418170442.665445272@linutronix.de>
References: <20170418170442.665445272@linutronix.de>

As trinity figured out, there is a recursive get_online_cpus() in
perf_event_open()'s error path:

| Call Trace:
|  dump_stack+0x86/0xce
|  __lock_acquire+0x2520/0x2cd0
|  lock_acquire+0x27c/0x2f0
|  get_online_cpus+0x3d/0x80
|  static_key_slow_dec+0x5a/0x70
|  sw_perf_event_destroy+0x8e/0x100
|  _free_event+0x61b/0x800
|  free_event+0x68/0x70
|  SyS_perf_event_open+0x19db/0x1d80

To cure this, I am moving free_event() after the put_online_cpus()
block. Besides that one, there is also the error path in
perf_event_alloc(), which likewise invokes event->destroy. There I
defer the destroy work via schedule_work().

Signed-off-by: Sebastian Andrzej Siewior
---
I am not quite happy with the schedule_work() part.
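The deferral uses the usual embedded-work_struct pattern. A minimal,
self-contained sketch of it (the my_event names are hypothetical
stand-ins, not the actual perf code): embed a work_struct in the
object, and let the workqueue callback recover the object via
container_of() and finish the teardown there, where no hotplug lock
is held.

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/* Hypothetical stand-in for struct perf_event. */
struct my_event {
        void (*destroy)(struct my_event *ev);
        struct work_struct destroy_work;
};

static void my_event_destroy_fn(struct work_struct *work)
{
        struct my_event *ev = container_of(work, struct my_event,
                                           destroy_work);

        /*
         * Workqueue context: no hotplug lock is held here, so
         * ev->destroy() may call get_online_cpus() without recursing.
         */
        ev->destroy(ev);
        kfree(ev);
}

/* Called from an error path that may still hold get_online_cpus(). */
static void my_event_teardown(struct my_event *ev)
{
        INIT_WORK(&ev->destroy_work, my_event_destroy_fn);
        schedule_work(&ev->destroy_work);
}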
 include/linux/perf_event.h |  1 +
 kernel/events/core.c       | 52 +++++++++++++++++++++++++++++++++-------------
 2 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 24a635887f28..d6a874dbbd21 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -718,6 +718,7 @@ struct perf_event {
 #endif
 
         struct list_head sb_list;
+        struct work_struct destroy_work;
 #endif /* CONFIG_PERF_EVENTS */
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7aed78b516fc..3358889609f8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9320,6 +9320,16 @@ static void account_event(struct perf_event *event)
         account_pmu_sb_event(event);
 }
 
+static void perf_alloc_destroy_ev(struct work_struct *work)
+{
+        struct perf_event *event;
+
+        event = container_of(work, struct perf_event, destroy_work);
+        event->destroy(event);
+        module_put(event->pmu->module);
+        kfree(event);
+}
+
 /*
  * Allocate and initialize a event structure
  */
@@ -9334,6 +9344,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
         struct pmu *pmu;
         struct perf_event *event;
         struct hw_perf_event *hwc;
+        bool delay_destroy = false;
         long err = -EINVAL;
 
         if ((unsigned)cpu >= nr_cpu_ids) {
@@ -9497,15 +9508,22 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
                 exclusive_event_destroy(event);
 
 err_pmu:
-        if (event->destroy)
-                event->destroy(event);
-        module_put(pmu->module);
+        if (event->destroy) {
+                /* delay ->destroy due to nested get_online_cpus() */
+                INIT_WORK(&event->destroy_work, perf_alloc_destroy_ev);
+                delay_destroy = true;
+        } else {
+                module_put(pmu->module);
+        }
 err_ns:
         if (is_cgroup_event(event))
                 perf_detach_cgroup(event);
         if (event->ns)
                 put_pid_ns(event->ns);
-        kfree(event);
+        if (delay_destroy)
+                schedule_work(&event->destroy_work);
+        else
+                kfree(event);
 
         return ERR_PTR(err);
 }
 
@@ -9798,7 +9816,7 @@ SYSCALL_DEFINE5(perf_event_open,
                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
 {
         struct perf_event *group_leader = NULL, *output_event = NULL;
-        struct perf_event *event, *sibling;
+        struct perf_event *event = NULL, *sibling;
         struct perf_event_attr attr;
         struct perf_event_context *ctx, *uninitialized_var(gctx);
         struct file *event_file = NULL;
@@ -9908,13 +9926,14 @@ SYSCALL_DEFINE5(perf_event_open,
                                  NULL, NULL, cgroup_fd);
         if (IS_ERR(event)) {
                 err = PTR_ERR(event);
+                event = NULL;
                 goto err_cred;
         }
 
         if (is_sampling_event(event)) {
                 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
                         err = -EOPNOTSUPP;
-                        goto err_alloc;
+                        goto err_cred;
                 }
         }
 
@@ -9927,7 +9946,7 @@ SYSCALL_DEFINE5(perf_event_open,
         if (attr.use_clockid) {
                 err = perf_event_set_clock(event, attr.clockid);
                 if (err)
-                        goto err_alloc;
+                        goto err_cred;
         }
 
         if (pmu->task_ctx_nr == perf_sw_context)
@@ -9962,7 +9981,7 @@ SYSCALL_DEFINE5(perf_event_open,
         ctx = find_get_context(pmu, task, event);
         if (IS_ERR(ctx)) {
                 err = PTR_ERR(ctx);
-                goto err_alloc;
+                goto err_cred;
         }
 
         if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
@@ -10186,18 +10205,21 @@ SYSCALL_DEFINE5(perf_event_open,
 err_context:
         perf_unpin_context(ctx);
         put_ctx(ctx);
-err_alloc:
-        /*
-         * If event_file is set, the fput() above will have called ->release()
-         * and that will take care of freeing the event.
-         */
-        if (!event_file)
-                free_event(event);
 err_cred:
         if (task)
                 mutex_unlock(&task->signal->cred_guard_mutex);
 err_cpus:
         put_online_cpus();
+        /*
+         * The event cleanup should happen earlier (as per cleanup in reverse
+         * allocation order). It is delayed after the put_online_cpus() section
+         * so we don't invoke event->destroy in it and risk recursive invocation
+         * of it via static_key_slow_dec().
+         * If event_file is set, the fput() above will have called ->release()
+         * and that will take care of freeing the event.
+         */
+        if (event && !event_file)
+                free_event(event);
 err_task:
         if (task)
                 put_task_struct(task);
-- 
2.11.0
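For completeness, the ordering the syscall hunk establishes, condensed
into a self-contained sketch (my_event_alloc()/my_event_free() are
hypothetical stand-ins, not the kernel code): the object is freed only
after put_online_cpus(), and the pointer is NULLed when the allocation
itself failed so the late free is skipped.

#include <linux/cpu.h>
#include <linux/err.h>

struct my_event;                          /* hypothetical */
struct my_event *my_event_alloc(void);    /* returns ERR_PTR() on error */
void my_event_free(struct my_event *ev);  /* may take get_online_cpus() */

static long open_sketch(void)
{
        struct my_event *event = NULL;
        long err = 0;

        get_online_cpus();

        event = my_event_alloc();
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                event = NULL;   /* failed alloc cleaned up after itself */
        }

        /* ... further setup may set err and fall through ... */

        put_online_cpus();

        /* Outside the hotplug section: ->destroy may retake it safely. */
        if (err && event)
                my_event_free(event);

        return err;
}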