From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S936122AbcIGLOW (ORCPT ); Wed, 7 Sep 2016 07:14:22 -0400 Received: from mail-pf0-f175.google.com ([209.85.192.175]:35842 "EHLO mail-pf0-f175.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S932778AbcIGLOR (ORCPT ); Wed, 7 Sep 2016 07:14:17 -0400 From: Binoy Jayan To: "Steven Rostedt (Red Hat)" Cc: Ingo Molnar , Daniel Wagner , Arnd Bergmann , linux-kernel@vger.kernel.org, Masami , Thomas Gleixner , Binoy Jayan Subject: [PATCH v6 1/4] tracing: Dereference pointers without RCU checks Date: Wed, 7 Sep 2016 16:43:52 +0530 Message-Id: <1473246835-30075-2-git-send-email-binoy.jayan@linaro.org> X-Mailer: git-send-email 1.8.2.1 In-Reply-To: <1473246835-30075-1-git-send-email-binoy.jayan@linaro.org> References: <1473246835-30075-1-git-send-email-binoy.jayan@linaro.org> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org From: Daniel Wagner The tracepoint can't be used in a code section where we are in the middle of a state transition. For example if we place a tracepoint inside start/stop_critical_section(), lockdep complains with [ 0.035589] WARNING: CPU: 0 PID: 3 at kernel/locking/lockdep.c:3560 \ check_flags.part.36+0x1bc/0x210() [ 0.036000] \ DEBUG_LOCKS_WARN_ON(current->softirqs_enabled) [ 0.036000] Kernel panic - not \ syncing: panic_on_warn set ... 
[ 0.036000] [ 0.036000] CPU: 0 PID: 3 Comm: ksoftirqd/0 Not tainted 4.0.0-rc7+ #460 [ 0.036000] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS \ 1.7.5-20140709_153950- 04/01/2014 [ 0.036000] ffffffff81f2463a ffff88007c93bb98 \ ffffffff81afb317 0000000000000001 [ 0.036000] ffffffff81f212b3 ffff88007c93bc18 \ ffffffff81af7bc2 ffff88007c93bbb8 [ 0.036000] ffffffff00000008 ffff88007c93bc28 \ ffff88007c93bbc8 000000000093bbd8 [ 0.036000] Call Trace: [ 0.036000] [] dump_stack+0x4f/0x7b [ 0.036000] [] panic+0xc0/0x1e9 [ 0.036000] [] ? _raw_spin_unlock_irqrestore+0x38/0x80 [ 0.036000] [] warn_slowpath_common+0xc0/0xc0 [ 0.036000] [] ? __local_bh_enable+0x36/0x70 [ 0.036000] [] warn_slowpath_fmt+0x46/0x50 [ 0.036000] [] check_flags.part.36+0x1bc/0x210 [ 0.036000] [] lock_is_held+0x78/0x90 [ 0.036000] [] ? __local_bh_enable+0x36/0x70 [ 0.036000] [] ? __do_softirq+0x3db/0x500 [ 0.036000] [] trace_preempt_on+0x255/0x260 [ 0.036000] [] preempt_count_sub+0xab/0xf0 [ 0.036000] [] __local_bh_enable+0x36/0x70 [ 0.036000] [] __do_softirq+0x3db/0x500 [ 0.036000] [] run_ksoftirqd+0x1f/0x60 [ 0.036000] [] smpboot_thread_fn+0x193/0x2a0 [ 0.036000] [] ? SyS_setgroups+0x150/0x150 [ 0.036000] [] kthread+0xf2/0x110 [ 0.036000] [] ? wait_for_completion+0xc3/0x120 [ 0.036000] [] ? preempt_count_sub+0xab/0xf0 [ 0.036000] [] ? kthread_create_on_node+0x240/0x240 [ 0.036000] [] ret_from_fork+0x58/0x90 [ 0.036000] [] ? kthread_create_on_node+0x240/0x240 [ 0.036000] ---[ end Kernel panic - not syncing: panic_on_warn set ... PeterZ was so kind to explain it to me what is happening: "__local_bh_enable() tests if this is the last SOFTIRQ_OFFSET, if so it tells lockdep softirqs are enabled with trace_softirqs_on() after that we go an actually modify the preempt_count with preempt_count_sub(). 
Then in preempt_count_sub() you call into trace_preempt_on() if this was the last preempt_count increment but you do that _before_ you actually change the preempt_count with __preempt_count_sub() at this point lockdep and preempt_count think the world differs and *boom*" So the simplest way to avoid this is by disabling the consistency checks. We also need to take care of the iterating in trace_events_trigger.c to avoid a splatter in conjunction with the hist trigger. Signed-off-by: Daniel Wagner Signed-off-by: Binoy Jayan --- include/linux/rculist.h | 36 ++++++++++++++++++++++++++++++++++++ include/linux/tracepoint.h | 4 ++-- kernel/trace/trace_events_filter.c | 4 ++-- kernel/trace/trace_events_trigger.c | 6 +++--- 4 files changed, 43 insertions(+), 7 deletions(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 8beb98d..bee836b 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -279,6 +279,24 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, container_of(lockless_dereference(ptr), type, member) /** + * list_entry_rcu_notrace - get the struct for this entry (for tracing) + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. + * + * This primitive may safely run concurrently with the _rcu list-mutation + * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). + * + * This is the same as list_entry_rcu() except that it does + * not do any RCU debugging or tracing. + */ +#define list_entry_rcu_notrace(ptr, type, member) \ +({ \ + typeof(*ptr) __rcu *__ptr = (typeof(*ptr) __rcu __force *)ptr; \ + container_of((typeof(ptr))rcu_dereference_raw_notrace(__ptr), type, member); \ +}) + +/** * Where are list_empty_rcu() and list_first_entry_rcu()? 
* * Implementing those functions following their counterparts list_empty() and @@ -391,6 +409,24 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, pos = list_entry_lockless(pos->member.next, typeof(*pos), member)) /** + * list_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing) + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as list_add_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + * + * This is the same as list_for_each_entry_rcu() except that it does + * not do any RCU debugging or tracing. + */ +#define list_for_each_entry_rcu_notrace(pos, head, member) \ + for (pos = list_entry_rcu_notrace((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry_rcu_notrace(pos->member.next, typeof(*pos), member)) + +/** * list_for_each_entry_continue_rcu - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. 
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index be586c6..d0e2a82 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -138,7 +138,7 @@ extern void syscall_unregfunc(void); return; \ prercu; \ rcu_read_lock_sched_notrace(); \ - it_func_ptr = rcu_dereference_sched((tp)->funcs); \ + it_func_ptr = rcu_dereference_raw_notrace((tp)->funcs); \ if (it_func_ptr) { \ do { \ it_func = (it_func_ptr)->func; \ @@ -189,7 +189,7 @@ extern void syscall_unregfunc(void); TP_CONDITION(cond),,); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ rcu_read_lock_sched_notrace(); \ - rcu_dereference_sched(__tracepoint_##name.funcs);\ + rcu_dereference_raw_notrace(__tracepoint_##name.funcs);\ rcu_read_unlock_sched_notrace(); \ } \ } \ diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9daa9b3..a00a8d5 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -608,11 +608,11 @@ int filter_match_preds(struct event_filter *filter, void *rec) /* * n_preds, root and filter->preds are protect with preemption disabled. 
*/ - root = rcu_dereference_sched(filter->root); + root = rcu_dereference_raw_notrace(filter->root); if (!root) return 1; - data.preds = preds = rcu_dereference_sched(filter->preds); + data.preds = preds = rcu_dereference_raw_notrace(filter->preds); ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data); WARN_ON(ret); return data.match; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index a975571..4489fb4 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -71,14 +71,14 @@ event_triggers_call(struct trace_event_file *file, void *rec) if (list_empty(&file->triggers)) return tt; - list_for_each_entry_rcu(data, &file->triggers, list) { + list_for_each_entry_rcu_notrace(data, &file->triggers, list) { if (data->paused) continue; if (!rec) { data->ops->func(data, rec); continue; } - filter = rcu_dereference_sched(data->filter); + filter = rcu_dereference_raw_notrace(data->filter); if (filter && !filter_match_preds(filter, rec)) continue; if (event_command_post_trigger(data->cmd_ops)) { @@ -111,7 +111,7 @@ event_triggers_post_call(struct trace_event_file *file, { struct event_trigger_data *data; - list_for_each_entry_rcu(data, &file->triggers, list) { + list_for_each_entry_rcu_notrace(data, &file->triggers, list) { if (data->paused) continue; if (data->cmd_ops->trigger_type & tt) -- The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project