[RFC,6/6] tracing: use sched-RCU instead of SRCU for rcuidle tracepoints
diff mbox series

Message ID 20201023195352.26269-7-mjeanson@efficios.com
State New, archived
Headers show
Series
  • Sleepable tracepoints
Related show

Commit Message

Michael Jeanson Oct. 23, 2020, 7:53 p.m. UTC
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

Considering that tracer callbacks expect RCU to be watching (for
instance, perf uses rcu_read_lock), we need rcuidle tracepoints to issue
rcu_irq_{enter,exit}_irqson around calls to the callbacks. So there is
no point in using SRCU anymore given that rcuidle tracepoints need to
ensure RCU is watching. Therefore, simply use sched-RCU like normal
tracepoints for rcuidle tracepoints.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: bpf@vger.kernel.org
---
 include/linux/tracepoint.h | 33 +++++++--------------------------
 kernel/tracepoint.c        | 25 +++++++++----------------
 2 files changed, 16 insertions(+), 42 deletions(-)

Comments

Joel Fernandes Oct. 23, 2020, 9:13 p.m. UTC | #1
On Fri, Oct 23, 2020 at 03:53:52PM -0400, Michael Jeanson wrote:
> From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> 
> Considering that tracer callbacks expect RCU to be watching (for
> instance, perf uses rcu_read_lock), we need rcuidle tracepoints to issue
> rcu_irq_{enter,exit}_irqson around calls to the callbacks. So there is
> no point in using SRCU anymore given that rcuidle tracepoints need to
> ensure RCU is watching. Therefore, simply use sched-RCU like normal
> tracepoints for rcuidle tracepoints.

High level question:

IIRC, doing this increases overhead for general tracing that does not use
perf, for 'rcuidle' tracepoints such as the preempt/irq enable/disable
tracepoints. I remember adding SRCU because of this reason.

Can the 'rcuidle' information not be pushed down further, such that perf does
it because it requires RCU to be watching, so that it does not effect, say,
trace events?

thanks,

 - Joel

> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> Cc: Michael Jeanson <mjeanson@efficios.com>
> Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Alexei Starovoitov <ast@kernel.org>
> Cc: Yonghong Song <yhs@fb.com>
> Cc: Paul E. McKenney <paulmck@kernel.org>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
> Cc: Jiri Olsa <jolsa@redhat.com>
> Cc: Namhyung Kim <namhyung@kernel.org>
> Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
> Cc: bpf@vger.kernel.org
> ---
>  include/linux/tracepoint.h | 33 +++++++--------------------------
>  kernel/tracepoint.c        | 25 +++++++++----------------
>  2 files changed, 16 insertions(+), 42 deletions(-)
> 
> diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
> index 0386b54cbcbb..1414b11f864b 100644
> --- a/include/linux/tracepoint.h
> +++ b/include/linux/tracepoint.h
> @@ -13,7 +13,6 @@
>   */
>  
>  #include <linux/smp.h>
> -#include <linux/srcu.h>
>  #include <linux/errno.h>
>  #include <linux/types.h>
>  #include <linux/cpumask.h>
> @@ -33,8 +32,6 @@ struct trace_eval_map {
>  
>  #define TRACEPOINT_DEFAULT_PRIO	10
>  
> -extern struct srcu_struct tracepoint_srcu;
> -
>  extern int
>  tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data);
>  extern int
> @@ -86,7 +83,6 @@ int unregister_tracepoint_module_notifier(struct notifier_block *nb)
>  static inline void tracepoint_synchronize_unregister(void)
>  {
>  	synchronize_rcu_tasks_trace();
> -	synchronize_srcu(&tracepoint_srcu);
>  	synchronize_rcu();
>  }
>  #else
> @@ -175,25 +171,13 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
>  		if (!(cond))						\
>  			return;						\
>  									\
> -		/* srcu can't be used from NMI */			\
> -		WARN_ON_ONCE(rcuidle && in_nmi());			\
> -									\
> -		if (maysleep) {						\
> -			might_sleep();					\
> +		might_sleep_if(maysleep);				\
> +		if (rcuidle)						\
> +			rcu_irq_enter_irqson();				\
> +		if (maysleep)						\
>  			rcu_read_lock_trace();				\
> -		} else {						\
> -			/* keep srcu and sched-rcu usage consistent */	\
> +		else							\
>  			preempt_disable_notrace();			\
> -		}							\
> -									\
> -		/*							\
> -		 * For rcuidle callers, use srcu since sched-rcu	\
> -		 * doesn't work from the idle path.			\
> -		 */							\
> -		if (rcuidle) {						\
> -			__idx = srcu_read_lock_notrace(&tracepoint_srcu);\
> -			rcu_irq_enter_irqson();				\
> -		}							\
>  									\
>  		it_func_ptr = rcu_dereference_raw((tp)->funcs);		\
>  									\
> @@ -205,15 +189,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
>  			} while ((++it_func_ptr)->func);		\
>  		}							\
>  									\
> -		if (rcuidle) {						\
> -			rcu_irq_exit_irqson();				\
> -			srcu_read_unlock_notrace(&tracepoint_srcu, __idx);\
> -		}							\
> -									\
>  		if (maysleep)						\
>  			rcu_read_unlock_trace();			\
>  		else							\
>  			preempt_enable_notrace();			\
> +		if (rcuidle)						\
> +			rcu_irq_exit_irqson();				\
>  	} while (0)
>  
>  #ifndef MODULE
> diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
> index 8d8e41c5d8a5..68b4e50798b1 100644
> --- a/kernel/tracepoint.c
> +++ b/kernel/tracepoint.c
> @@ -18,9 +18,6 @@
>  extern tracepoint_ptr_t __start___tracepoints_ptrs[];
>  extern tracepoint_ptr_t __stop___tracepoints_ptrs[];
>  
> -DEFINE_SRCU(tracepoint_srcu);
> -EXPORT_SYMBOL_GPL(tracepoint_srcu);
> -
>  /* Set to 1 to enable tracepoint debug output */
>  static const int tracepoint_debug;
>  
> @@ -65,14 +62,9 @@ static void rcu_tasks_trace_free_old_probes(struct rcu_head *head)
>  	kfree(container_of(head, struct tp_probes, rcu));
>  }
>  
> -static void srcu_free_old_probes(struct rcu_head *head)
> -{
> -	call_rcu_tasks_trace(head, rcu_tasks_trace_free_old_probes);
> -}
> -
>  static void rcu_free_old_probes(struct rcu_head *head)
>  {
> -	call_srcu(&tracepoint_srcu, head, srcu_free_old_probes);
> +	call_rcu_tasks_trace(head, rcu_tasks_trace_free_old_probes);
>  }
>  
>  static __init int release_early_probes(void)
> @@ -90,7 +82,7 @@ static __init int release_early_probes(void)
>  	return 0;
>  }
>  
> -/* SRCU and Tasks Trace RCU are initialized at core_initcall */
> +/* Tasks Trace RCU is initialized at core_initcall */
>  postcore_initcall(release_early_probes);
>  
>  static inline void release_probes(struct tracepoint_func *old)
> @@ -100,9 +92,8 @@ static inline void release_probes(struct tracepoint_func *old)
>  			struct tp_probes, probes[0]);
>  
>  		/*
> -		 * We can't free probes if SRCU and Tasks Trace RCU are not
> -		 * initialized yet. Postpone the freeing till after both are
> -		 * initialized.
> +		 * We can't free probes if Tasks Trace RCU is not initialized yet.
> +		 * Postpone the freeing till after Tasks Trace RCU is initialized.
>  		 */
>  		if (unlikely(!ok_to_free_tracepoints)) {
>  			tp_probes->rcu.next = early_probes;
> @@ -111,9 +102,11 @@ static inline void release_probes(struct tracepoint_func *old)
>  		}
>  
>  		/*
> -		 * Tracepoint probes are protected by sched RCU, SRCU and
> -		 * Tasks Trace RCU by chaining the callbacks we cover all three
> -		 * cases and wait for all three grace periods.
> +		 * Tracepoint probes are protected by both sched RCU and
> +		 * Tasks Trace RCU, by calling the Tasks Trace RCU callback in
> +		 * the sched RCU callback we cover both cases. So let us chain
> +		 * the Tasks Trace RCU and sched RCU callbacks to wait for both
> +		 * grace periods.
>  		 */
>  		call_rcu(&tp_probes->rcu, rcu_free_old_probes);
>  	}
> -- 
> 2.25.1
>
Peter Zijlstra Oct. 26, 2020, 8:20 a.m. UTC | #2
On Fri, Oct 23, 2020 at 05:13:59PM -0400, Joel Fernandes wrote:
> On Fri, Oct 23, 2020 at 03:53:52PM -0400, Michael Jeanson wrote:
> > From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> > 
> > Considering that tracer callbacks expect RCU to be watching (for
> > instance, perf uses rcu_read_lock), we need rcuidle tracepoints to issue
> > rcu_irq_{enter,exit}_irqson around calls to the callbacks. So there is
> > no point in using SRCU anymore given that rcuidle tracepoints need to
> > ensure RCU is watching. Therefore, simply use sched-RCU like normal
> > tracepoints for rcuidle tracepoints.
> 
> High level question:
> 
> IIRC, doing this increases overhead for general tracing that does not use
> perf, for 'rcuidle' tracepoints such as the preempt/irq enable/disable
> tracepoints. I remember adding SRCU because of this reason.
> 
> Can the 'rcuidle' information not be pushed down further, such that perf does
> it because it requires RCU to be watching, so that it does not effect, say,
> trace events?

There's very few trace_.*_rcuidle() users left. We should eradicate them
and remove the option. It's bugs to begin with.
Mathieu Desnoyers Oct. 26, 2020, 2:28 p.m. UTC | #3
----- On Oct 26, 2020, at 4:20 AM, Peter Zijlstra peterz@infradead.org wrote:

> On Fri, Oct 23, 2020 at 05:13:59PM -0400, Joel Fernandes wrote:
>> On Fri, Oct 23, 2020 at 03:53:52PM -0400, Michael Jeanson wrote:
>> > From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
>> > 
>> > Considering that tracer callbacks expect RCU to be watching (for
>> > instance, perf uses rcu_read_lock), we need rcuidle tracepoints to issue
>> > rcu_irq_{enter,exit}_irqson around calls to the callbacks. So there is
>> > no point in using SRCU anymore given that rcuidle tracepoints need to
>> > ensure RCU is watching. Therefore, simply use sched-RCU like normal
>> > tracepoints for rcuidle tracepoints.
>> 
>> High level question:
>> 
>> IIRC, doing this increases overhead for general tracing that does not use
>> perf, for 'rcuidle' tracepoints such as the preempt/irq enable/disable
>> tracepoints. I remember adding SRCU because of this reason.
>> 
>> Can the 'rcuidle' information not be pushed down further, such that perf does
>> it because it requires RCU to be watching, so that it does not effect, say,
>> trace events?
> 
> There's very few trace_.*_rcuidle() users left. We should eradicate them
> and remove the option. It's bugs to begin with.

I agree with Peter. Removing the trace_.*_rcuidle weirdness from the tracepoint
API and fixing all callers to ensure they trace from a context where RCU is
watching would simplify instrumentation of the Linux kernel, thus making it harder
for subtle bugs to hide and be unearthed only when tracing is enabled. This is
AFAIU the general approach Thomas Gleixner has been aiming for recently, and I
think it is a good thing.

So if we consider this our target, and that the current state of things is that
we need to have RCU watching around callback invocation, then removing the
dependency on SRCU seems like an overall simplification which does not regress
feature-wise nor speed-wise compared with what we have upstream today. The next
steps would then be to audit all rcuidle tracepoints and make sure the context
where they are placed has RCU watching already, so we can remove the tracepoint
rcuidle API. That would effectively remove the calls to rcu_irq_{enter,exit}_irqson
from the tracepoint code.

This is however beyond the scope of the proposed patch set.

Thanks,

Mathieu
Steven Rostedt Oct. 26, 2020, 8:44 p.m. UTC | #4
On Mon, 26 Oct 2020 10:28:07 -0400 (EDT)
Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:

> I agree with Peter. Removing the trace_.*_rcuidle weirdness from the tracepoint
> API and fixing all callers to ensure they trace from a context where RCU is
> watching would simplify instrumentation of the Linux kernel, thus making it harder
> for subtle bugs to hide and be unearthed only when tracing is enabled. This is

Note, the lockdep RCU checking of a tracepoint is outside of it being
enabled or disable. So if a non rcuidle() tracepoint is in a location that
RCU is not watching, it will complain loudly, even if you don't enable that
tracepoint.

> AFAIU the general approach Thomas Gleixner has been aiming for recently, and I
> think it is a good thing.
> 
> So if we consider this our target, and that the current state of things is that
> we need to have RCU watching around callback invocation, then removing the
> dependency on SRCU seems like an overall simplification which does not regress
> feature-wise nor speed-wise compared with what we have upstream today. The next
> steps would then be to audit all rcuidle tracepoints and make sure the context
> where they are placed has RCU watching already, so we can remove the tracepoint

Just remove the _rcuidle() from them, and lockdep will complain if they are
being called without RCU watching.

-- Steve


> rcuidle API. That would effectively remove the calls to rcu_irq_{enter,exit}_irqson
> from the tracepoint code.
> 
> This is however beyond the scope of the proposed patch set.
> 
> Thanks,
> 
> Mathieu
>
Mathieu Desnoyers Oct. 27, 2020, 1:57 p.m. UTC | #5
----- On Oct 26, 2020, at 4:44 PM, rostedt rostedt@goodmis.org wrote:

> On Mon, 26 Oct 2020 10:28:07 -0400 (EDT)
> Mathieu Desnoyers <mathieu.desnoyers@efficios.com> wrote:
> 
>> I agree with Peter. Removing the trace_.*_rcuidle weirdness from the tracepoint
>> API and fixing all callers to ensure they trace from a context where RCU is
>> watching would simplify instrumentation of the Linux kernel, thus making it
>> harder
>> for subtle bugs to hide and be unearthed only when tracing is enabled. This is
> 
> Note, the lockdep RCU checking of a tracepoint is outside of it being
> enabled or disable. So if a non rcuidle() tracepoint is in a location that
> RCU is not watching, it will complain loudly, even if you don't enable that
> tracepoint.
> 
>> AFAIU the general approach Thomas Gleixner has been aiming for recently, and I
>> think it is a good thing.
>> 
>> So if we consider this our target, and that the current state of things is that
>> we need to have RCU watching around callback invocation, then removing the
>> dependency on SRCU seems like an overall simplification which does not regress
>> feature-wise nor speed-wise compared with what we have upstream today. The next
>> steps would then be to audit all rcuidle tracepoints and make sure the context
>> where they are placed has RCU watching already, so we can remove the tracepoint
> 
> Just remove the _rcuidle() from them, and lockdep will complain if they are
> being called without RCU watching.

That's the easy part. The patch removing rcuidle is attached to this email,
and here are the splats I get just when booting the machine (see below). The
part which takes more time is fixing those splats and figuring out how to
restructure the code. For instance, what should we do about the rcuidle
tracepoint within printk() ?

Also, how do we test rcuidle tracepoints triggered only when printk is called
from printk warnings ? We'd also need to test on arm32 boards, because some arm
architecture code uses rcuidle tracepoints as well.

As this is beyond the scope of this patch set, I can either drop this patch
entirely (it's not required for sleepable tracepoints), or keep it as an
intermediate cleanup step. Removing rcuidle tracepoints entirely is beyond
the effort Michael and I can undertake now.

=============================
WARNING: suspicious RCU usage

5.9.1+ #72 Not tainted
-----------------------------
=============================
./include/trace/events/preemptirq.h:42 suspicious rcu_dereference_check() usage!
WARNING: suspicious RCU usage

other info that might help us debug this:


rcu_scheduler_active = 1, debug_locks = 1
5.9.1+ #72 Not tainted
RCU used illegally from extended quiescent state!
no locks held by swapper/0/0.
-----------------------------
./include/trace/events/preemptirq.h:38 suspicious rcu_dereference_check() usage!

stack backtrace:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.9.1+ #72

other info that might help us debug this:


rcu_scheduler_active = 1, debug_locks = 1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Call Trace:
RCU used illegally from extended quiescent state!
no locks held by swapper/1/0.
 dump_stack+0x8d/0xbb

stack backtrace:
 ? default_idle+0xa/0x20
 trace_hardirqs_on+0xed/0xf0
 default_idle+0xa/0x20
 default_idle_call+0x4f/0x1e0
 do_idle+0x1ef/0x2c0
 cpu_startup_entry+0x19/0x20
 start_kernel+0x578/0x59d
 secondary_startup_64+0xa4/0xb0
CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.9.1+ #72

=============================
WARNING: suspicious RCU usage
5.9.1+ #72 Not tainted
-----------------------------
./include/trace/events/lock.h:37 suspicious rcu_dereference_check() usage!

other info that might help us debug this:


rcu_scheduler_active = 1, debug_locks = 1
RCU used illegally from extended quiescent state!
no locks held by swapper/0/0.

stack backtrace:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.9.1+ #72
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Call Trace:
 dump_stack+0x8d/0xbb
 lock_acquire+0x346/0x3b0
 ? __lock_acquire+0x2e7/0x1da0
 _raw_spin_lock+0x2c/0x40
 ? vprintk_emit+0x9f/0x410
 vprintk_emit+0x9f/0x410
 printk+0x52/0x6e
 lockdep_rcu_suspicious+0x1b/0xe0
 ? default_idle+0xa/0x20
 trace_hardirqs_on+0xed/0xf0
 default_idle+0xa/0x20
 default_idle_call+0x4f/0x1e0
 do_idle+0x1ef/0x2c0
 cpu_startup_entry+0x19/0x20
 start_kernel+0x578/0x59d
 secondary_startup_64+0xa4/0xb0

=============================
WARNING: suspicious RCU usage
5.9.1+ #72 Not tainted
-----------------------------
./include/trace/events/lock.h:63 suspicious rcu_dereference_check() usage!

other info that might help us debug this:


rcu_scheduler_active = 1, debug_locks = 1
RCU used illegally from extended quiescent state!
1 lock held by swapper/0/0:
 #0: ffffffff97a80158 (logbuf_lock){-...}-{2:2}, at: vprintk_emit+0x9f/0x410

stack backtrace:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.9.1+ #72
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Call Trace:
 dump_stack+0x8d/0xbb
 lock_release+0x25a/0x280
 _raw_spin_unlock+0x17/0x30
 vprintk_emit+0xdf/0x410
 printk+0x52/0x6e
 lockdep_rcu_suspicious+0x1b/0xe0
 ? default_idle+0xa/0x20
 trace_hardirqs_on+0xed/0xf0
 default_idle+0xa/0x20
 default_idle_call+0x4f/0x1e0
 do_idle+0x1ef/0x2c0
 cpu_startup_entry+0x19/0x20
 start_kernel+0x578/0x59d
 secondary_startup_64+0xa4/0xb0

=============================
WARNING: suspicious RCU usage
5.9.1+ #72 Not tainted
-----------------------------
./include/trace/events/printk.h:33 suspicious rcu_dereference_check() usage!

other info that might help us debug this:


rcu_scheduler_active = 1, debug_locks = 1
RCU used illegally from extended quiescent state!
2 locks held by swapper/0/0:
 #0: ffffffff97a801a0 (console_lock){+.+.}-{0:0}, at: vprintk_emit+0x126/0x410
 #1: ffffffff97a7fec0 (console_owner){....}-{0:0}, at: console_unlock+0x190/0x5d0

stack backtrace:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.9.1+ #72
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Call Trace:
 dump_stack+0x8d/0xbb
 console_unlock+0x5ad/0x5d0
 vprintk_emit+0x133/0x410
 printk+0x52/0x6e
 lockdep_rcu_suspicious+0x1b/0xe0
 ? default_idle+0xa/0x20
 trace_hardirqs_on+0xed/0xf0
 default_idle+0xa/0x20
 default_idle_call+0x4f/0x1e0
 do_idle+0x1ef/0x2c0
 cpu_startup_entry+0x19/0x20
 start_kernel+0x578/0x59d
 secondary_startup_64+0xa4/0xb0

=============================
WARNING: suspicious RCU usage
5.9.1+ #72 Not tainted
-----------------------------
./include/linux/rcupdate.h:636 rcu_read_lock() used illegally while idle!

other info that might help us debug this:


rcu_scheduler_active = 1, debug_locks = 1
RCU used illegally from extended quiescent state!
4 locks held by swapper/0/0:
 #0: ffffffff97a801a0 (console_lock){+.+.}-{0:0}, at: vprintk_emit+0x126/0x410
 #1: ffffffff97a7fec0 (console_owner){....}-{0:0}, at: console_unlock+0x190/0x5d0
 #2: ffffffff97b7d598 (printing_lock){....}-{2:2}, at: vt_console_print+0x7d/0x3e0
 #3: ffffffff97a82d80 (rcu_read_lock){....}-{1:2}, at: __atomic_notifier_call_chain+0x5/0x110

stack backtrace:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.9.1+ #72
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Call Trace:
 dump_stack+0x8d/0xbb
 __atomic_notifier_call_chain+0xd7/0x110
 vt_console_print+0x19e/0x3e0
 console_unlock+0x417/0x5d0
 vprintk_emit+0x133/0x410
 printk+0x52/0x6e
 lockdep_rcu_suspicious+0x1b/0xe0
 ? default_idle+0xa/0x20
 trace_hardirqs_on+0xed/0xf0
 default_idle+0xa/0x20
 default_idle_call+0x4f/0x1e0
 do_idle+0x1ef/0x2c0
 cpu_startup_entry+0x19/0x20
 start_kernel+0x578/0x59d
 secondary_startup_64+0xa4/0xb0

=============================
WARNING: suspicious RCU usage
5.9.1+ #72 Not tainted
-----------------------------
./include/linux/rcupdate.h:685 rcu_read_unlock() used illegally while idle!

other info that might help us debug this:


rcu_scheduler_active = 1, debug_locks = 1
RCU used illegally from extended quiescent state!
4 locks held by swapper/0/0:
 #0: ffffffff97a801a0 (console_lock){+.+.}-{0:0}, at: vprintk_emit+0x126/0x410
 #1: ffffffff97a7fec0 (console_owner){....}-{0:0}, at: console_unlock+0x190/0x5d0
 #2: ffffffff97b7d598 (printing_lock){....}-{2:2}, at: vt_console_print+0x7d/0x3e0
 #3: ffffffff97a82d80 (rcu_read_lock){....}-{1:2}, at: __atomic_notifier_call_chain+0x5/0x110

stack backtrace:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.9.1+ #72
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Call Trace:
 dump_stack+0x8d/0xbb
 __atomic_notifier_call_chain+0x101/0x110
 vt_console_print+0x19e/0x3e0
 console_unlock+0x417/0x5d0
 vprintk_emit+0x133/0x410
 printk+0x52/0x6e
 lockdep_rcu_suspicious+0x1b/0xe0
 ? default_idle+0xa/0x20
 trace_hardirqs_on+0xed/0xf0
 default_idle+0xa/0x20
 default_idle_call+0x4f/0x1e0
 do_idle+0x1ef/0x2c0
 cpu_startup_entry+0x19/0x20
 start_kernel+0x578/0x59d
 secondary_startup_64+0xa4/0xb0
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Call Trace:
 dump_stack+0x8d/0xbb
 ? rcu_idle_exit+0x32/0x40
 trace_hardirqs_off+0xe3/0xf0
 rcu_idle_exit+0x32/0x40
 default_idle_call+0x54/0x1e0
 do_idle+0x1ef/0x2c0
 ? lockdep_hardirqs_on_prepare+0xec/0x180
 cpu_startup_entry+0x19/0x20
 start_secondary+0x11c/0x160
 secondary_startup_64+0xa4/0xb0

Thanks,

Mathieu


> 
> -- Steve
> 
> 
>> rcuidle API. That would effectively remove the calls to
>> rcu_irq_{enter,exit}_irqson
>> from the tracepoint code.
>> 
>> This is however beyond the scope of the proposed patch set.
>> 
>> Thanks,
>> 
>> Mathieu
Joel Fernandes Nov. 2, 2020, 6:43 p.m. UTC | #6
On Mon, Oct 26, 2020 at 10:28:07AM -0400, Mathieu Desnoyers wrote:
> ----- On Oct 26, 2020, at 4:20 AM, Peter Zijlstra peterz@infradead.org wrote:
> 
> > On Fri, Oct 23, 2020 at 05:13:59PM -0400, Joel Fernandes wrote:
> >> On Fri, Oct 23, 2020 at 03:53:52PM -0400, Michael Jeanson wrote:
> >> > From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> >> > 
> >> > Considering that tracer callbacks expect RCU to be watching (for
> >> > instance, perf uses rcu_read_lock), we need rcuidle tracepoints to issue
> >> > rcu_irq_{enter,exit}_irqson around calls to the callbacks. So there is
> >> > no point in using SRCU anymore given that rcuidle tracepoints need to
> >> > ensure RCU is watching. Therefore, simply use sched-RCU like normal
> >> > tracepoints for rcuidle tracepoints.
> >> 
> >> High level question:
> >> 
> >> IIRC, doing this increases overhead for general tracing that does not use
> >> perf, for 'rcuidle' tracepoints such as the preempt/irq enable/disable
> >> tracepoints. I remember adding SRCU because of this reason.
> >> 
> >> Can the 'rcuidle' information not be pushed down further, such that perf does
> >> it because it requires RCU to be watching, so that it does not effect, say,
> >> trace events?
> > 
> > There's very few trace_.*_rcuidle() users left. We should eradicate them
> > and remove the option. It's bugs to begin with.
> 
> I agree with Peter. Removing the trace_.*_rcuidle weirdness from the tracepoint
> API and fixing all callers to ensure they trace from a context where RCU is
> watching would simplify instrumentation of the Linux kernel, thus making it harder
> for subtle bugs to hide and be unearthed only when tracing is enabled. This is
> AFAIU the general approach Thomas Gleixner has been aiming for recently, and I
> think it is a good thing.
> 
> So if we consider this our target, and that the current state of things is that
> we need to have RCU watching around callback invocation, then removing the
> dependency on SRCU seems like an overall simplification which does not regress
> feature-wise nor speed-wise compared with what we have upstream today. The next
> steps would then be to audit all rcuidle tracepoints and make sure the context
> where they are placed has RCU watching already, so we can remove the tracepoint
> rcuidle API. That would effectively remove the calls to rcu_irq_{enter,exit}_irqson
> from the tracepoint code.
> 
> This is however beyond the scope of the proposed patch set.

You are right, it doesn't regress speedwise - I got confused since the code
was modified to call rcu_enter_irqson() even for the rcuidle case (which I
had avoided when I added SRCU). So in current code, SRCU is kind of
pointless. I think keep the patch in the series.

thanks,

 - Joel

Patch
diff mbox series

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 0386b54cbcbb..1414b11f864b 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -13,7 +13,6 @@ 
  */
 
 #include <linux/smp.h>
-#include <linux/srcu.h>
 #include <linux/errno.h>
 #include <linux/types.h>
 #include <linux/cpumask.h>
@@ -33,8 +32,6 @@  struct trace_eval_map {
 
 #define TRACEPOINT_DEFAULT_PRIO	10
 
-extern struct srcu_struct tracepoint_srcu;
-
 extern int
 tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data);
 extern int
@@ -86,7 +83,6 @@  int unregister_tracepoint_module_notifier(struct notifier_block *nb)
 static inline void tracepoint_synchronize_unregister(void)
 {
 	synchronize_rcu_tasks_trace();
-	synchronize_srcu(&tracepoint_srcu);
 	synchronize_rcu();
 }
 #else
@@ -175,25 +171,13 @@  static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 		if (!(cond))						\
 			return;						\
 									\
-		/* srcu can't be used from NMI */			\
-		WARN_ON_ONCE(rcuidle && in_nmi());			\
-									\
-		if (maysleep) {						\
-			might_sleep();					\
+		might_sleep_if(maysleep);				\
+		if (rcuidle)						\
+			rcu_irq_enter_irqson();				\
+		if (maysleep)						\
 			rcu_read_lock_trace();				\
-		} else {						\
-			/* keep srcu and sched-rcu usage consistent */	\
+		else							\
 			preempt_disable_notrace();			\
-		}							\
-									\
-		/*							\
-		 * For rcuidle callers, use srcu since sched-rcu	\
-		 * doesn't work from the idle path.			\
-		 */							\
-		if (rcuidle) {						\
-			__idx = srcu_read_lock_notrace(&tracepoint_srcu);\
-			rcu_irq_enter_irqson();				\
-		}							\
 									\
 		it_func_ptr = rcu_dereference_raw((tp)->funcs);		\
 									\
@@ -205,15 +189,12 @@  static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 			} while ((++it_func_ptr)->func);		\
 		}							\
 									\
-		if (rcuidle) {						\
-			rcu_irq_exit_irqson();				\
-			srcu_read_unlock_notrace(&tracepoint_srcu, __idx);\
-		}							\
-									\
 		if (maysleep)						\
 			rcu_read_unlock_trace();			\
 		else							\
 			preempt_enable_notrace();			\
+		if (rcuidle)						\
+			rcu_irq_exit_irqson();				\
 	} while (0)
 
 #ifndef MODULE
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 8d8e41c5d8a5..68b4e50798b1 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -18,9 +18,6 @@ 
 extern tracepoint_ptr_t __start___tracepoints_ptrs[];
 extern tracepoint_ptr_t __stop___tracepoints_ptrs[];
 
-DEFINE_SRCU(tracepoint_srcu);
-EXPORT_SYMBOL_GPL(tracepoint_srcu);
-
 /* Set to 1 to enable tracepoint debug output */
 static const int tracepoint_debug;
 
@@ -65,14 +62,9 @@  static void rcu_tasks_trace_free_old_probes(struct rcu_head *head)
 	kfree(container_of(head, struct tp_probes, rcu));
 }
 
-static void srcu_free_old_probes(struct rcu_head *head)
-{
-	call_rcu_tasks_trace(head, rcu_tasks_trace_free_old_probes);
-}
-
 static void rcu_free_old_probes(struct rcu_head *head)
 {
-	call_srcu(&tracepoint_srcu, head, srcu_free_old_probes);
+	call_rcu_tasks_trace(head, rcu_tasks_trace_free_old_probes);
 }
 
 static __init int release_early_probes(void)
@@ -90,7 +82,7 @@  static __init int release_early_probes(void)
 	return 0;
 }
 
-/* SRCU and Tasks Trace RCU are initialized at core_initcall */
+/* Tasks Trace RCU is initialized at core_initcall */
 postcore_initcall(release_early_probes);
 
 static inline void release_probes(struct tracepoint_func *old)
@@ -100,9 +92,8 @@  static inline void release_probes(struct tracepoint_func *old)
 			struct tp_probes, probes[0]);
 
 		/*
-		 * We can't free probes if SRCU and Tasks Trace RCU are not
-		 * initialized yet. Postpone the freeing till after both are
-		 * initialized.
+		 * We can't free probes if Tasks Trace RCU is not initialized yet.
+		 * Postpone the freeing till after Tasks Trace RCU is initialized.
 		 */
 		if (unlikely(!ok_to_free_tracepoints)) {
 			tp_probes->rcu.next = early_probes;
@@ -111,9 +102,11 @@  static inline void release_probes(struct tracepoint_func *old)
 		}
 
 		/*
-		 * Tracepoint probes are protected by sched RCU, SRCU and
-		 * Tasks Trace RCU by chaining the callbacks we cover all three
-		 * cases and wait for all three grace periods.
+		 * Tracepoint probes are protected by both sched RCU and
+		 * Tasks Trace RCU, by calling the Tasks Trace RCU callback in
+		 * the sched RCU callback we cover both cases. So let us chain
+		 * the Tasks Trace RCU and sched RCU callbacks to wait for both
+		 * grace periods.
 		 */
 		call_rcu(&tp_probes->rcu, rcu_free_old_probes);
 	}