linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: Steven Rostedt <rostedt@goodmis.org>
Cc: LKML <linux-kernel@vger.kernel.org>,
	Ingo Molnar <mingo@kernel.org>,
	"Joel Fernandes (Google)" <joel@joelfernandes.org>,
	Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	"Gustavo A. R. Silva" <gustavo@embeddedor.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	"Paul E. McKenney" <paulmck@kernel.org>,
	Josh Triplett <josh@joshtriplett.org>,
	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
	Lai Jiangshan <jiangshanlai@gmail.com>
Subject: Re: [PATCH v2] tracing/perf: Move rcu_irq_enter/exit_irqson() to perf trace point hook
Date: Tue, 11 Feb 2020 18:29:52 +0100	[thread overview]
Message-ID: <20200211172952.GY14914@hirez.programming.kicks-ass.net> (raw)
In-Reply-To: <20200211111828.48058768@gandalf.local.home>

On Tue, Feb 11, 2020 at 11:18:28AM -0500, Steven Rostedt wrote:
> On Tue, 11 Feb 2020 16:34:52 +0100
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > > +		if (unlikely(in_nmi()))
> > > +			goto out;  
> > 
> > unless I'm mistaken, we can simply do rcu_nmi_enter() in this case, and
> > rcu_nmi_exit() on the other end.
> > 
> > > +		rcu_irq_enter_irqson();
> 
> The thing is, I don't think this can ever happen. 

Per your own argument it should be true in the trace_preempt_off()
tracepoint from nmi_enter():

  <idle>
    // rcu_is_watching() := false
    <NMI>
      nmi_enter()
        preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET)
	  __preempt_count_add()
	  // in_nmi() := true
	  preempt_latency_start()
	    // .oO -- we'll never hit this tracepoint because idle has
	    // preempt_count() == 1, so we'll have:
	    //   preempt_count() != val

	    // .oO-2 we should be able to this this when the NMI
	    // hits userspace and we have NOHZ_FULL on.
	rcu_nmi_enter() // function tracer


But the point is, if we ever do hit it, rcu_nmi_enter() is the right
thing to call when '!rcu_is_watching() && in_nmi()', because, as per the
rcu_nmi_enter() comments, that function fully supports nested NMIs.

How about something like so? And then you get to use the same in
__trace_stack() or something, and anywhere else you're doing this dance.

---

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index da0af631ded5..e987236abf95 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -89,4 +89,52 @@ extern void irq_exit(void);
 		arch_nmi_exit();				\
 	} while (0)
 
+/*
+ * Tracing vs rcu_is_watching()
+ * ----------------------------
+ *
+ * tracepoints and function-tracing can happen when RCU isn't watching (idle,
+ * or early IRQ/NMI entry).
+ *
+ * When it happens during idle or early during IRQ entry, tracing will have
+ * to inform RCU that it ought to pay attention, this is done by calling
+ * rcu_irq_enter_irqon().
+ *
+ * On NMI entry, we must be very careful that tracing only happens after we've
+ * incremented preempt_count(), otherwise we cannot tell we're in NMI and take
+ * the special path.
+ */
+
+#define __TR_IRQ	1
+#define __TR_NMI	2
+
+#define trace_rcu_enter()					\
+({								\
+	unsigned long state = 0;				\
+	if (!rcu_is_watching())	{				\
+		if (in_nmi()) {					\
+			state = __TR_NMI;			\
+			rcu_nmi_enter();			\
+		} else {					\
+			state = __TR_IRQ;			\
+			rcu_irq_enter_irqson();			\
+		}						\
+	}							\
+	state;							\
+})
+
+#define trace_rcu_exit(state)					\
+do {								\
+	switch (state) {					\
+	case __TR_IRQ:						\
+		rcu_irq_exit_irqson();				\
+		break;						\
+	case __IRQ_NMI:						\
+		rcu_nmi_exit();					\
+		break;						\
+	default:						\
+		break;						\
+	}							\
+} while (0)
+
 #endif /* LINUX_HARDIRQ_H */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e453589da97c..8f34592a1355 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8950,6 +8950,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 {
 	struct perf_sample_data data;
 	struct perf_event *event;
+	unsigned long rcu_flags;
 
 	struct perf_raw_record raw = {
 		.frag = {
@@ -8961,6 +8962,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 	perf_sample_data_init(&data, 0, 0);
 	data.raw = &raw;
 
+	rcu_flags = trace_rcu_enter();
+
 	perf_trace_buf_update(record, event_type);
 
 	hlist_for_each_entry_rcu(event, head, hlist_entry) {
@@ -8996,6 +8999,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 	}
 
 	perf_swevent_put_recursion_context(rctx);
+
+	trace_rcu_exit(rcu_flags);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 45f79bcc3146..62f87807bbe6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3781,7 +3781,7 @@ static inline void preempt_latency_start(int val)
 	}
 }
 
-void preempt_count_add(int val)
+void notrace preempt_count_add(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
 	/*
@@ -3813,7 +3813,7 @@ static inline void preempt_latency_stop(int val)
 		trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
 }
 
-void preempt_count_sub(int val)
+void notrace preempt_count_sub(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
 	/*

  parent reply	other threads:[~2020-02-11 17:30 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-02-11 14:50 [PATCH v2] tracing/perf: Move rcu_irq_enter/exit_irqson() to perf trace point hook Steven Rostedt
2020-02-11 15:34 ` Mathieu Desnoyers
2020-02-11 15:46   ` Peter Zijlstra
2020-02-11 16:02     ` Mathieu Desnoyers
2020-02-11 15:34 ` Peter Zijlstra
2020-02-11 16:18   ` Steven Rostedt
2020-02-11 16:27     ` Mathieu Desnoyers
2020-02-11 16:35       ` Steven Rostedt
2020-02-11 17:29     ` Peter Zijlstra [this message]
2020-02-11 17:32       ` Peter Zijlstra
2020-02-11 18:54         ` Paul E. McKenney
2020-02-12  8:05           ` Peter Zijlstra
2020-02-12  9:05             ` Paul E. McKenney
2020-02-11 17:35       ` Mathieu Desnoyers
2020-02-12  8:02         ` Peter Zijlstra
2020-02-12 15:14           ` Mathieu Desnoyers

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200211172952.GY14914@hirez.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=gregkh@linuxfoundation.org \
    --cc=gustavo@embeddedor.com \
    --cc=jiangshanlai@gmail.com \
    --cc=joel@joelfernandes.org \
    --cc=josh@joshtriplett.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mathieu.desnoyers@efficios.com \
    --cc=mingo@kernel.org \
    --cc=paulmck@kernel.org \
    --cc=rostedt@goodmis.org \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).