* [PATCH 0/6] A few cpuidle vs rcu fixes
@ 2023-01-23 20:50 ` Peter Zijlstra
  0 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-23 20:50 UTC (permalink / raw)
  To: mingo
  Cc: will, peterz, boqun.feng, mark.rutland, tglx, bp, dave.hansen,
	x86, hpa, seanjc, pbonzini, jgross, srivatsa, amakhalov,
	pv-drivers, rostedt, mhiramat, wanpengli, vkuznets,
	boris.ostrovsky, rafael, daniel.lezcano, juri.lelli,
	vincent.guittot, dietmar.eggemann, bsegall, mgorman, bristot,
	vschneid, linux-kernel, kvm, virtualization, linux-trace-kernel,
	linux-pm

The 0-day robot reported that graph-tracing made the cpuidle-vs-rcu rework go splat.

These patches appear to cure it; the ftrace selftest now runs to completion
without spamming scary messages to dmesg.

---
 arch/x86/include/asm/atomic64_32.h | 44 +++++++++++++++++++-------------------
 arch/x86/include/asm/atomic64_64.h | 36 +++++++++++++++----------------
 arch/x86/include/asm/kvmclock.h    |  2 +-
 arch/x86/include/asm/paravirt.h    |  2 +-
 arch/x86/include/asm/pvclock.h     |  3 ++-
 arch/x86/kernel/cpu/vmware.c       |  2 +-
 arch/x86/kernel/ftrace.c           |  3 +++
 arch/x86/kernel/kvmclock.c         |  6 +++---
 arch/x86/kernel/pvclock.c          | 22 +++++++++++++------
 arch/x86/kernel/tsc.c              |  7 +++---
 arch/x86/xen/time.c                | 12 +++++++++--
 drivers/cpuidle/cpuidle.c          |  2 +-
 drivers/cpuidle/poll_state.c       |  2 --
 include/linux/math64.h             |  4 ++--
 include/linux/sched/clock.h        |  8 +++----
 kernel/sched/clock.c               | 27 +++++++++++++++++------
 16 files changed, 107 insertions(+), 75 deletions(-)



^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH 1/6] x86: Always inline arch_atomic64
  2023-01-23 20:50 ` Peter Zijlstra
@ 2023-01-23 20:50   ` Peter Zijlstra
  -1 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-23 20:50 UTC (permalink / raw)
  To: mingo
  Cc: will, peterz, boqun.feng, mark.rutland, tglx, bp, dave.hansen,
	x86, hpa, seanjc, pbonzini, jgross, srivatsa, amakhalov,
	pv-drivers, rostedt, mhiramat, wanpengli, vkuznets,
	boris.ostrovsky, rafael, daniel.lezcano, juri.lelli,
	vincent.guittot, dietmar.eggemann, bsegall, mgorman, bristot,
	vschneid, linux-kernel, kvm, virtualization, linux-trace-kernel,
	linux-pm

As already done for regular arch_atomic, always inline arch_atomic64.
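
For illustration only (nothing below is part of the patch, and the function
names are made up): a plain "static inline" helper may still be emitted out of
line, and that out-of-line copy is fair game for ftrace and other
instrumentation, which a noinstr caller must never reach; __always_inline
removes that possibility.

  #include <linux/atomic.h>
  #include <linux/compiler_types.h>

  /* May be emitted out of line and then instrumented. */
  static inline s64 read_last_plain(atomic64_t *v)
  {
  	return arch_atomic64_read(v);
  }

  /* Always folded into the caller; leaves no traceable call behind. */
  static __always_inline s64 read_last_forced(atomic64_t *v)
  {
  	return arch_atomic64_read(v);
  }

  noinstr s64 example_noinstr_read(atomic64_t *v)
  {
  	/* Safe only because the helper cannot end up out of line. */
  	return read_last_forced(v);
  }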

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/atomic64_32.h |   44 ++++++++++++++++++-------------------
 arch/x86/include/asm/atomic64_64.h |   36 +++++++++++++++---------------
 2 files changed, 40 insertions(+), 40 deletions(-)

--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -71,7 +71,7 @@ ATOMIC64_DECL(add_unless);
  * the old value.
  */
 
-static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
+static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
 {
 	return arch_cmpxchg64(&v->counter, o, n);
 }
@@ -85,7 +85,7 @@ static inline s64 arch_atomic64_cmpxchg(
  * Atomically xchgs the value of @v to @n and returns
  * the old value.
  */
-static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
+static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
 {
 	s64 o;
 	unsigned high = (unsigned)(n >> 32);
@@ -104,7 +104,7 @@ static inline s64 arch_atomic64_xchg(ato
  *
  * Atomically sets the value of @v to @n.
  */
-static inline void arch_atomic64_set(atomic64_t *v, s64 i)
+static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i)
 {
 	unsigned high = (unsigned)(i >> 32);
 	unsigned low = (unsigned)i;
@@ -119,7 +119,7 @@ static inline void arch_atomic64_set(ato
  *
  * Atomically reads the value of @v and returns it.
  */
-static inline s64 arch_atomic64_read(const atomic64_t *v)
+static __always_inline s64 arch_atomic64_read(const atomic64_t *v)
 {
 	s64 r;
 	alternative_atomic64(read, "=&A" (r), "c" (v) : "memory");
@@ -133,7 +133,7 @@ static inline s64 arch_atomic64_read(con
  *
  * Atomically adds @i to @v and returns @i + *@v
  */
-static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
 {
 	alternative_atomic64(add_return,
 			     ASM_OUTPUT2("+A" (i), "+c" (v)),
@@ -145,7 +145,7 @@ static inline s64 arch_atomic64_add_retu
 /*
  * Other variants with different arithmetic operators:
  */
-static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
 {
 	alternative_atomic64(sub_return,
 			     ASM_OUTPUT2("+A" (i), "+c" (v)),
@@ -154,7 +154,7 @@ static inline s64 arch_atomic64_sub_retu
 }
 #define arch_atomic64_sub_return arch_atomic64_sub_return
 
-static inline s64 arch_atomic64_inc_return(atomic64_t *v)
+static __always_inline s64 arch_atomic64_inc_return(atomic64_t *v)
 {
 	s64 a;
 	alternative_atomic64(inc_return, "=&A" (a),
@@ -163,7 +163,7 @@ static inline s64 arch_atomic64_inc_retu
 }
 #define arch_atomic64_inc_return arch_atomic64_inc_return
 
-static inline s64 arch_atomic64_dec_return(atomic64_t *v)
+static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v)
 {
 	s64 a;
 	alternative_atomic64(dec_return, "=&A" (a),
@@ -179,7 +179,7 @@ static inline s64 arch_atomic64_dec_retu
  *
  * Atomically adds @i to @v.
  */
-static inline s64 arch_atomic64_add(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_add(s64 i, atomic64_t *v)
 {
 	__alternative_atomic64(add, add_return,
 			       ASM_OUTPUT2("+A" (i), "+c" (v)),
@@ -194,7 +194,7 @@ static inline s64 arch_atomic64_add(s64
  *
  * Atomically subtracts @i from @v.
  */
-static inline s64 arch_atomic64_sub(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_sub(s64 i, atomic64_t *v)
 {
 	__alternative_atomic64(sub, sub_return,
 			       ASM_OUTPUT2("+A" (i), "+c" (v)),
@@ -208,7 +208,7 @@ static inline s64 arch_atomic64_sub(s64
  *
  * Atomically increments @v by 1.
  */
-static inline void arch_atomic64_inc(atomic64_t *v)
+static __always_inline void arch_atomic64_inc(atomic64_t *v)
 {
 	__alternative_atomic64(inc, inc_return, /* no output */,
 			       "S" (v) : "memory", "eax", "ecx", "edx");
@@ -221,7 +221,7 @@ static inline void arch_atomic64_inc(ato
  *
  * Atomically decrements @v by 1.
  */
-static inline void arch_atomic64_dec(atomic64_t *v)
+static __always_inline void arch_atomic64_dec(atomic64_t *v)
 {
 	__alternative_atomic64(dec, dec_return, /* no output */,
 			       "S" (v) : "memory", "eax", "ecx", "edx");
@@ -237,7 +237,7 @@ static inline void arch_atomic64_dec(ato
  * Atomically adds @a to @v, so long as it was not @u.
  * Returns non-zero if the add was done, zero otherwise.
  */
-static inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
+static __always_inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
 {
 	unsigned low = (unsigned)u;
 	unsigned high = (unsigned)(u >> 32);
@@ -248,7 +248,7 @@ static inline int arch_atomic64_add_unle
 }
 #define arch_atomic64_add_unless arch_atomic64_add_unless
 
-static inline int arch_atomic64_inc_not_zero(atomic64_t *v)
+static __always_inline int arch_atomic64_inc_not_zero(atomic64_t *v)
 {
 	int r;
 	alternative_atomic64(inc_not_zero, "=&a" (r),
@@ -257,7 +257,7 @@ static inline int arch_atomic64_inc_not_
 }
 #define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero
 
-static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
+static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
 {
 	s64 r;
 	alternative_atomic64(dec_if_positive, "=&A" (r),
@@ -269,7 +269,7 @@ static inline s64 arch_atomic64_dec_if_p
 #undef alternative_atomic64
 #undef __alternative_atomic64
 
-static inline void arch_atomic64_and(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
 {
 	s64 old, c = 0;
 
@@ -277,7 +277,7 @@ static inline void arch_atomic64_and(s64
 		c = old;
 }
 
-static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
 {
 	s64 old, c = 0;
 
@@ -288,7 +288,7 @@ static inline s64 arch_atomic64_fetch_an
 }
 #define arch_atomic64_fetch_and arch_atomic64_fetch_and
 
-static inline void arch_atomic64_or(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
 {
 	s64 old, c = 0;
 
@@ -296,7 +296,7 @@ static inline void arch_atomic64_or(s64
 		c = old;
 }
 
-static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
 {
 	s64 old, c = 0;
 
@@ -307,7 +307,7 @@ static inline s64 arch_atomic64_fetch_or
 }
 #define arch_atomic64_fetch_or arch_atomic64_fetch_or
 
-static inline void arch_atomic64_xor(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
 {
 	s64 old, c = 0;
 
@@ -315,7 +315,7 @@ static inline void arch_atomic64_xor(s64
 		c = old;
 }
 
-static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
 {
 	s64 old, c = 0;
 
@@ -326,7 +326,7 @@ static inline s64 arch_atomic64_fetch_xo
 }
 #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
 
-static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
 {
 	s64 old, c = 0;
 
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -17,7 +17,7 @@
  * Atomically reads the value of @v.
  * Doesn't imply a read memory barrier.
  */
-static inline s64 arch_atomic64_read(const atomic64_t *v)
+static __always_inline s64 arch_atomic64_read(const atomic64_t *v)
 {
 	return __READ_ONCE((v)->counter);
 }
@@ -29,7 +29,7 @@ static inline s64 arch_atomic64_read(con
  *
  * Atomically sets the value of @v to @i.
  */
-static inline void arch_atomic64_set(atomic64_t *v, s64 i)
+static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i)
 {
 	__WRITE_ONCE(v->counter, i);
 }
@@ -55,7 +55,7 @@ static __always_inline void arch_atomic6
  *
  * Atomically subtracts @i from @v.
  */
-static inline void arch_atomic64_sub(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v)
 {
 	asm volatile(LOCK_PREFIX "subq %1,%0"
 		     : "=m" (v->counter)
@@ -71,7 +71,7 @@ static inline void arch_atomic64_sub(s64
  * true if the result is zero, or false for all
  * other cases.
  */
-static inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
+static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
 {
 	return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i);
 }
@@ -113,7 +113,7 @@ static __always_inline void arch_atomic6
  * returns true if the result is 0, or false for all other
  * cases.
  */
-static inline bool arch_atomic64_dec_and_test(atomic64_t *v)
+static __always_inline bool arch_atomic64_dec_and_test(atomic64_t *v)
 {
 	return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e);
 }
@@ -127,7 +127,7 @@ static inline bool arch_atomic64_dec_and
  * and returns true if the result is zero, or false for all
  * other cases.
  */
-static inline bool arch_atomic64_inc_and_test(atomic64_t *v)
+static __always_inline bool arch_atomic64_inc_and_test(atomic64_t *v)
 {
 	return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e);
 }
@@ -142,7 +142,7 @@ static inline bool arch_atomic64_inc_and
  * if the result is negative, or false when
  * result is greater than or equal to zero.
  */
-static inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v)
+static __always_inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v)
 {
 	return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i);
 }
@@ -161,25 +161,25 @@ static __always_inline s64 arch_atomic64
 }
 #define arch_atomic64_add_return arch_atomic64_add_return
 
-static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
 {
 	return arch_atomic64_add_return(-i, v);
 }
 #define arch_atomic64_sub_return arch_atomic64_sub_return
 
-static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
 {
 	return xadd(&v->counter, i);
 }
 #define arch_atomic64_fetch_add arch_atomic64_fetch_add
 
-static inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v)
 {
 	return xadd(&v->counter, -i);
 }
 #define arch_atomic64_fetch_sub arch_atomic64_fetch_sub
 
-static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
+static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
 {
 	return arch_cmpxchg(&v->counter, old, new);
 }
@@ -191,13 +191,13 @@ static __always_inline bool arch_atomic6
 }
 #define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
 
-static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new)
+static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new)
 {
 	return arch_xchg(&v->counter, new);
 }
 #define arch_atomic64_xchg arch_atomic64_xchg
 
-static inline void arch_atomic64_and(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
 {
 	asm volatile(LOCK_PREFIX "andq %1,%0"
 			: "+m" (v->counter)
@@ -205,7 +205,7 @@ static inline void arch_atomic64_and(s64
 			: "memory");
 }
 
-static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
 {
 	s64 val = arch_atomic64_read(v);
 
@@ -215,7 +215,7 @@ static inline s64 arch_atomic64_fetch_an
 }
 #define arch_atomic64_fetch_and arch_atomic64_fetch_and
 
-static inline void arch_atomic64_or(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
 {
 	asm volatile(LOCK_PREFIX "orq %1,%0"
 			: "+m" (v->counter)
@@ -223,7 +223,7 @@ static inline void arch_atomic64_or(s64
 			: "memory");
 }
 
-static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
 {
 	s64 val = arch_atomic64_read(v);
 
@@ -233,7 +233,7 @@ static inline s64 arch_atomic64_fetch_or
 }
 #define arch_atomic64_fetch_or arch_atomic64_fetch_or
 
-static inline void arch_atomic64_xor(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
 {
 	asm volatile(LOCK_PREFIX "xorq %1,%0"
 			: "+m" (v->counter)
@@ -241,7 +241,7 @@ static inline void arch_atomic64_xor(s64
 			: "memory");
 }
 
-static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
 {
 	s64 val = arch_atomic64_read(v);
 



^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH 2/6] x86/pvclock: improve atomic update of last_value in pvclock_clocksource_read
  2023-01-23 20:50 ` Peter Zijlstra
@ 2023-01-23 20:50   ` Peter Zijlstra
  -1 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-23 20:50 UTC (permalink / raw)
  To: mingo
  Cc: will, peterz, boqun.feng, mark.rutland, tglx, bp, dave.hansen,
	x86, hpa, seanjc, pbonzini, jgross, srivatsa, amakhalov,
	pv-drivers, rostedt, mhiramat, wanpengli, vkuznets,
	boris.ostrovsky, rafael, daniel.lezcano, juri.lelli,
	vincent.guittot, dietmar.eggemann, bsegall, mgorman, bristot,
	vschneid, linux-kernel, kvm, virtualization, linux-trace-kernel,
	linux-pm, Uros Bizjak

From: Uros Bizjak <ubizjak@gmail.com>

Improve atomic update of last_value in pvclock_clocksource_read:

- Atomic update can be skipped if the "last_value" is already
  equal to "ret".

- The detection of atomic update failure is not correct. The value
  returned by atomic64_cmpxchg() should be compared to the old value
  from the location to be updated. If these two are the same, the
  atomic update succeeded and the "last_value" location was updated
  to "ret" atomically. Otherwise, the atomic update failed and it
  should be retried with the current value from "last_value" - which
  is exactly what atomic64_try_cmpxchg() does, correctly and more
  efficiently (see the sketch below).
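
A minimal sketch of the resulting idiom, outside the patch context (the helper
name record_forward_value() is invented): atomic64_try_cmpxchg() returns a bool
and, on failure, refreshes the expected value with whatever is currently in the
location, so the loop retries against fresh data without an extra read.

  #include <linux/atomic.h>

  static s64 record_forward_value(atomic64_t *last, s64 ret)
  {
  	s64 old = atomic64_read(last);

  	do {
  		/* Someone already recorded something at least as new; keep it. */
  		if (ret <= old)
  			return old;
  	} while (!atomic64_try_cmpxchg(last, &old, ret));

  	return ret;
  }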

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230118202330.3740-1-ubizjak@gmail.com
---
 arch/x86/kernel/pvclock.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index eda37df016f0..5a2a517dd61b 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -102,10 +102,9 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 	 */
 	last = atomic64_read(&last_value);
 	do {
-		if (ret < last)
+		if (ret <= last)
 			return last;
-		last = atomic64_cmpxchg(&last_value, last, ret);
-	} while (unlikely(last != ret));
+	} while (!atomic64_try_cmpxchg(&last_value, &last, ret));
 
 	return ret;
 }
-- 
2.39.0




^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [PATCH 3/6] ftrace/x86: Warn and ignore graph tracing when RCU is disabled
  2023-01-23 20:50 ` Peter Zijlstra
@ 2023-01-23 20:50   ` Peter Zijlstra
  -1 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-23 20:50 UTC (permalink / raw)
  To: mingo
  Cc: will, peterz, boqun.feng, mark.rutland, tglx, bp, dave.hansen,
	x86, hpa, seanjc, pbonzini, jgross, srivatsa, amakhalov,
	pv-drivers, rostedt, mhiramat, wanpengli, vkuznets,
	boris.ostrovsky, rafael, daniel.lezcano, juri.lelli,
	vincent.guittot, dietmar.eggemann, bsegall, mgorman, bristot,
	vschneid, linux-kernel, kvm, virtualization, linux-trace-kernel,
	linux-pm

All RCU-disabled code should be noinstr and hence we should never get
here -- when we do, WARN about it and make sure not to actually do any
tracing.
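
A side note on the pattern used in the hunk below (a sketch with an invented
function name, not part of the patch): WARN_ONCE() evaluates to the tested
condition, so a single statement can both report the offending caller, via the
%pS-formatted instruction pointer, and gate the early return.

  #include <linux/bug.h>
  #include <linux/rcupdate.h>

  void example_trace_hook(unsigned long ip)
  {
  	/* Tracing relies on RCU; bail out, loudly but only once, if it is off. */
  	if (WARN_ONCE(!rcu_is_watching(), "RCU not on for: %pS\n", (void *)ip))
  		return;

  	/* ... the actual tracing work would go here ... */
  }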

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/ftrace.c |    3 +++
 1 file changed, 3 insertions(+)

--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -646,6 +646,9 @@ void prepare_ftrace_return(unsigned long
 	if (unlikely(atomic_read(&current->tracing_graph_pause)))
 		return;
 
+	if (WARN_ONCE(!rcu_is_watching(), "RCU not on for: %pS\n", (void *)ip))
+		return;
+
 	bit = ftrace_test_recursion_trylock(ip, *parent);
 	if (bit < 0)
 		return;



^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH 4/6] x86: Mark sched_clock() noinstr
  2023-01-23 20:50 ` Peter Zijlstra
@ 2023-01-23 20:50   ` Peter Zijlstra
  -1 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-23 20:50 UTC (permalink / raw)
  To: mingo
  Cc: will, peterz, boqun.feng, mark.rutland, tglx, bp, dave.hansen,
	x86, hpa, seanjc, pbonzini, jgross, srivatsa, amakhalov,
	pv-drivers, rostedt, mhiramat, wanpengli, vkuznets,
	boris.ostrovsky, rafael, daniel.lezcano, juri.lelli,
	vincent.guittot, dietmar.eggemann, bsegall, mgorman, bristot,
	vschneid, linux-kernel, kvm, virtualization, linux-trace-kernel,
	linux-pm

In order to use sched_clock() from noinstr code, mark it and all its
implementations noinstr.

The whole pvclock thing (used by KVM/Xen) is a bit of a pain, since it
calls out to the watchdogs; create a pvclock_clocksource_read_nowd()
variant that doesn't do that and can be noinstr.
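
As a reminder of what the annotation buys (a generic sketch, not taken from
this patch, with invented function names): noinstr places the function in
.noinstr.text and disables compiler instrumentation for it, and objtool then
warns about any call that leaves that section, which is why every helper in
such a call chain has to be noinstr itself or __always_inline.

  #include <linux/compiler_types.h>
  #include <linux/math64.h>
  #include <asm/msr.h>

  /* Must be __always_inline, or objtool flags the call out of .noinstr.text. */
  static __always_inline u64 example_scale(u64 delta)
  {
  	return mul_u64_u32_shr(delta, 1000, 10);
  }

  noinstr u64 example_sched_clock(void)
  {
  	return example_scale(rdtsc());
  }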

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/kvmclock.h |    2 +-
 arch/x86/include/asm/paravirt.h |    2 +-
 arch/x86/include/asm/pvclock.h  |    3 ++-
 arch/x86/kernel/cpu/vmware.c    |    2 +-
 arch/x86/kernel/kvmclock.c      |    6 +++---
 arch/x86/kernel/pvclock.c       |   19 +++++++++++++++----
 arch/x86/kernel/tsc.c           |    7 +++----
 arch/x86/xen/time.c             |   12 ++++++++++--
 include/linux/math64.h          |    4 ++--
 9 files changed, 38 insertions(+), 19 deletions(-)

--- a/arch/x86/include/asm/kvmclock.h
+++ b/arch/x86/include/asm/kvmclock.h
@@ -8,7 +8,7 @@ extern struct clocksource kvm_clock;
 
 DECLARE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
 
-static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
+static __always_inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
 {
 	return &this_cpu_read(hv_clock_per_cpu)->pvti;
 }
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -26,7 +26,7 @@ DECLARE_STATIC_CALL(pv_sched_clock, dumm
 
 void paravirt_set_sched_clock(u64 (*func)(void));
 
-static inline u64 paravirt_sched_clock(void)
+static __always_inline u64 paravirt_sched_clock(void)
 {
 	return static_call(pv_sched_clock)();
 }
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -7,6 +7,7 @@
 
 /* some helper functions for xen and kvm pv clock sources */
 u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src);
 u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
 void pvclock_set_flags(u8 flags);
 unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
@@ -39,7 +40,7 @@ bool pvclock_read_retry(const struct pvc
  * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
  * yielding a 64-bit result.
  */
-static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
+static __always_inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 {
 	u64 product;
 #ifdef __i386__
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -143,7 +143,7 @@ static __init int parse_no_stealacc(char
 }
 early_param("no-steal-acc", parse_no_stealacc);
 
-static unsigned long long notrace vmware_sched_clock(void)
+static noinstr u64 vmware_sched_clock(void)
 {
 	unsigned long long ns;
 
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -71,12 +71,12 @@ static int kvm_set_wallclock(const struc
 	return -ENODEV;
 }
 
-static u64 kvm_clock_read(void)
+static noinstr u64 kvm_clock_read(void)
 {
 	u64 ret;
 
 	preempt_disable_notrace();
-	ret = pvclock_clocksource_read(this_cpu_pvti());
+	ret = pvclock_clocksource_read_nowd(this_cpu_pvti());
 	preempt_enable_notrace();
 	return ret;
 }
@@ -86,7 +86,7 @@ static u64 kvm_clock_get_cycles(struct c
 	return kvm_clock_read();
 }
 
-static u64 kvm_sched_clock_read(void)
+static noinstr u64 kvm_sched_clock_read(void)
 {
 	return kvm_clock_read() - kvm_sched_clock_offset;
 }
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -64,7 +64,8 @@ u8 pvclock_read_flags(struct pvclock_vcp
 	return flags & valid_flags;
 }
 
-u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+static __always_inline
+u64 __pvclock_clocksource_read(struct pvclock_vcpu_time_info *src, bool dowd)
 {
 	unsigned version;
 	u64 ret;
@@ -77,7 +78,7 @@ u64 pvclock_clocksource_read(struct pvcl
 		flags = src->flags;
 	} while (pvclock_read_retry(src, version));
 
-	if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
+	if (dowd && unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
 		src->flags &= ~PVCLOCK_GUEST_STOPPED;
 		pvclock_touch_watchdogs();
 	}
@@ -100,15 +101,25 @@ u64 pvclock_clocksource_read(struct pvcl
 	 * updating at the same time, and one of them could be slightly behind,
 	 * making the assumption that last_value always go forward fail to hold.
 	 */
-	last = atomic64_read(&last_value);
+	last = arch_atomic64_read(&last_value);
 	do {
 		if (ret <= last)
 			return last;
-	} while (!atomic64_try_cmpxchg(&last_value, &last, ret));
+	} while (!arch_atomic64_try_cmpxchg(&last_value, &last, ret));
 
 	return ret;
 }
 
+u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+{
+	return __pvclock_clocksource_read(src, true);
+}
+
+noinstr u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src)
+{
+	return __pvclock_clocksource_read(src, false);
+}
+
 void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
 			    struct pvclock_vcpu_time_info *vcpu_time,
 			    struct timespec64 *ts)
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -215,7 +215,7 @@ static void __init cyc2ns_init_secondary
 /*
  * Scheduler clock - returns current time in nanosec units.
  */
-u64 native_sched_clock(void)
+noinstr u64 native_sched_clock(void)
 {
 	if (static_branch_likely(&__use_tsc)) {
 		u64 tsc_now = rdtsc();
@@ -248,7 +248,7 @@ u64 native_sched_clock_from_tsc(u64 tsc)
 /* We need to define a real function for sched_clock, to override the
    weak default version */
 #ifdef CONFIG_PARAVIRT
-unsigned long long sched_clock(void)
+noinstr u64 sched_clock(void)
 {
 	return paravirt_sched_clock();
 }
@@ -258,8 +258,7 @@ bool using_native_sched_clock(void)
 	return static_call_query(pv_sched_clock) == native_sched_clock;
 }
 #else
-unsigned long long
-sched_clock(void) __attribute__((alias("native_sched_clock")));
+u64 sched_clock(void) __attribute__((alias("native_sched_clock")));
 
 bool using_native_sched_clock(void) { return true; }
 #endif
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -60,9 +60,17 @@ static u64 xen_clocksource_get_cycles(st
 	return xen_clocksource_read();
 }
 
-static u64 xen_sched_clock(void)
+static noinstr u64 xen_sched_clock(void)
 {
-	return xen_clocksource_read() - xen_sched_clock_offset;
+        struct pvclock_vcpu_time_info *src;
+	u64 ret;
+
+	preempt_disable_notrace();
+	src = &__this_cpu_read(xen_vcpu)->time;
+	ret = pvclock_clocksource_read_nowd(src);
+	ret -= xen_sched_clock_offset;
+	preempt_enable_notrace();
+	return ret;
 }
 
 static void xen_read_wallclock(struct timespec64 *ts)
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -161,7 +161,7 @@ static inline u64 mul_u32_u32(u32 a, u32
 #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
 
 #ifndef mul_u64_u32_shr
-static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
+static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
 {
 	return (u64)(((unsigned __int128)a * mul) >> shift);
 }
@@ -177,7 +177,7 @@ static inline u64 mul_u64_u64_shr(u64 a,
 #else
 
 #ifndef mul_u64_u32_shr
-static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
+static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
 {
 	u32 ah, al;
 	u64 ret;



^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH 5/6] sched/clock: Make local_clock() noinstr
  2023-01-23 20:50 ` Peter Zijlstra
@ 2023-01-23 20:50   ` Peter Zijlstra
  -1 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-23 20:50 UTC (permalink / raw)
  To: mingo
  Cc: will, peterz, boqun.feng, mark.rutland, tglx, bp, dave.hansen,
	x86, hpa, seanjc, pbonzini, jgross, srivatsa, amakhalov,
	pv-drivers, rostedt, mhiramat, wanpengli, vkuznets,
	boris.ostrovsky, rafael, daniel.lezcano, juri.lelli,
	vincent.guittot, dietmar.eggemann, bsegall, mgorman, bristot,
	vschneid, linux-kernel, kvm, virtualization, linux-trace-kernel,
	linux-pm

With sched_clock() noinstr, provide a noinstr implementation of
local_clock().
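
One detail worth calling out (a sketch with an invented per-CPU variable, not
part of the patch): in a noinstr function the ordinary
preempt_disable()/preempt_enable() pair is best avoided because, depending on
the config, it can call traceable code, so the _notrace variants bracket the
per-CPU access instead.

  #include <linux/percpu.h>
  #include <linux/preempt.h>
  #include <linux/compiler_types.h>

  static DEFINE_PER_CPU(u64, example_clock_data);

  noinstr u64 example_local_clock(void)
  {
  	u64 clock;

  	preempt_disable_notrace();	/* no tracepoints or ftrace hooks here */
  	clock = this_cpu_read(example_clock_data);
  	preempt_enable_notrace();

  	return clock;
  }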

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched/clock.h |    8 +++-----
 kernel/sched/clock.c        |   27 +++++++++++++++++++++------
 2 files changed, 24 insertions(+), 11 deletions(-)

--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -45,7 +45,7 @@ static inline u64 cpu_clock(int cpu)
 	return sched_clock();
 }
 
-static inline u64 local_clock(void)
+static __always_inline u64 local_clock(void)
 {
 	return sched_clock();
 }
@@ -79,10 +79,8 @@ static inline u64 cpu_clock(int cpu)
 	return sched_clock_cpu(cpu);
 }
 
-static inline u64 local_clock(void)
-{
-	return sched_clock_cpu(raw_smp_processor_id());
-}
+extern u64 local_clock(void);
+
 #endif
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -93,7 +93,7 @@ struct sched_clock_data {
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
 
-notrace static inline struct sched_clock_data *this_scd(void)
+static __always_inline struct sched_clock_data *this_scd(void)
 {
 	return this_cpu_ptr(&sched_clock_data);
 }
@@ -244,12 +244,12 @@ late_initcall(sched_clock_init_late);
  * min, max except they take wrapping into account
  */
 
-notrace static inline u64 wrap_min(u64 x, u64 y)
+static __always_inline u64 wrap_min(u64 x, u64 y)
 {
 	return (s64)(x - y) < 0 ? x : y;
 }
 
-notrace static inline u64 wrap_max(u64 x, u64 y)
+static __always_inline u64 wrap_max(u64 x, u64 y)
 {
 	return (s64)(x - y) > 0 ? x : y;
 }
@@ -260,7 +260,7 @@ notrace static inline u64 wrap_max(u64 x
  *  - filter out backward motion
  *  - use the GTOD tick value to create a window to filter crazy TSC values
  */
-notrace static u64 sched_clock_local(struct sched_clock_data *scd)
+static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)
 {
 	u64 now, clock, old_clock, min_clock, max_clock, gtod;
 	s64 delta;
@@ -287,13 +287,28 @@ notrace static u64 sched_clock_local(str
 	clock = wrap_max(clock, min_clock);
 	clock = wrap_min(clock, max_clock);
 
-	if (!try_cmpxchg64(&scd->clock, &old_clock, clock))
+	if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock))
 		goto again;
 
 	return clock;
 }
 
-notrace static u64 sched_clock_remote(struct sched_clock_data *scd)
+noinstr u64 local_clock(void)
+{
+	u64 clock;
+
+	if (static_branch_likely(&__sched_clock_stable))
+		return sched_clock() + __sched_clock_offset;
+
+	preempt_disable_notrace();
+	clock = sched_clock_local(this_scd());
+	preempt_enable_notrace();
+
+	return clock;
+}
+EXPORT_SYMBOL_GPL(local_clock);
+
+static notrace u64 sched_clock_remote(struct sched_clock_data *scd)
 {
 	struct sched_clock_data *my_scd = this_scd();
 	u64 this_clock, remote_clock;



^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH 6/6] cpuidle: Fix poll_idle() noinstr annotation
  2023-01-23 20:50 ` Peter Zijlstra
@ 2023-01-23 20:50   ` Peter Zijlstra
  -1 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-23 20:50 UTC (permalink / raw)
  To: mingo
  Cc: will, peterz, boqun.feng, mark.rutland, tglx, bp, dave.hansen,
	x86, hpa, seanjc, pbonzini, jgross, srivatsa, amakhalov,
	pv-drivers, rostedt, mhiramat, wanpengli, vkuznets,
	boris.ostrovsky, rafael, daniel.lezcano, juri.lelli,
	vincent.guittot, dietmar.eggemann, bsegall, mgorman, bristot,
	vschneid, linux-kernel, kvm, virtualization, linux-trace-kernel,
	linux-pm, kernel test robot

The instrumentation_begin()/end() annotations in poll_idle() were
complete nonsense. Specifically they caused tracing to happen in the
middle of noinstr code, resulting in RCU splats.

Now that local_clock() is noinstr, mark up the rest and let it rip.

Fixes: 00717eb8c955 ("cpuidle: Annotate poll_idle()")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reported-by: kernel test robot <oliver.sang@intel.com>
Link: https://lore.kernel.org/oe-lkp/202301192148.58ece903-oliver.sang@intel.com
---
 drivers/cpuidle/cpuidle.c    |    2 +-
 drivers/cpuidle/poll_state.c |    2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -426,7 +426,7 @@ void cpuidle_reflect(struct cpuidle_devi
  * @dev:   the cpuidle device
  *
  */
-u64 cpuidle_poll_time(struct cpuidle_driver *drv,
+__cpuidle u64 cpuidle_poll_time(struct cpuidle_driver *drv,
 		      struct cpuidle_device *dev)
 {
 	int i;
--- a/drivers/cpuidle/poll_state.c
+++ b/drivers/cpuidle/poll_state.c
@@ -15,7 +15,6 @@ static int __cpuidle poll_idle(struct cp
 {
 	u64 time_start;
 
-	instrumentation_begin();
 	time_start = local_clock();
 
 	dev->poll_time_limit = false;
@@ -42,7 +41,6 @@ static int __cpuidle poll_idle(struct cp
 	raw_local_irq_disable();
 
 	current_clr_polling();
-	instrumentation_end();
 
 	return index;
 }
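
Background on the cpuidle_poll_time() hunk: with the cpuidle-vs-rcu rework
these patches build on, __cpuidle is treated like noinstr (the function is
placed in the .cpuidle.text section and validated accordingly), conceptually
along the lines of:

  /* Simplified sketch of the annotation; see the rework for the real definition. */
  #define __cpuidle       __noinstr_section(".cpuidle.text")

so marking cpuidle_poll_time() __cpuidle keeps the whole polling path free of
instrumentation, matching the removal of the begin/end markers above.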



^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 3/6] ftrace/x86: Warn and ignore graph tracing when RCU is disabled
  2023-01-23 20:50   ` Peter Zijlstra
@ 2023-01-23 21:53     ` Steven Rostedt
  -1 siblings, 0 replies; 53+ messages in thread
From: Steven Rostedt @ 2023-01-23 21:53 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, will, boqun.feng, mark.rutland, tglx, bp, dave.hansen,
	x86, hpa, seanjc, pbonzini, jgross, srivatsa, amakhalov,
	pv-drivers, mhiramat, wanpengli, vkuznets, boris.ostrovsky,
	rafael, daniel.lezcano, juri.lelli, vincent.guittot,
	dietmar.eggemann, bsegall, mgorman, bristot, vschneid,
	linux-kernel, kvm, virtualization, linux-trace-kernel, linux-pm

On Mon, 23 Jan 2023 21:50:12 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> All RCU disabled code should be noinstr and hence we should never get
> here -- when we do, WARN about it and make sure to not actually do
> tracing.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/x86/kernel/ftrace.c |    3 +++
>  1 file changed, 3 insertions(+)
> 
> --- a/arch/x86/kernel/ftrace.c
> +++ b/arch/x86/kernel/ftrace.c
> @@ -646,6 +646,9 @@ void prepare_ftrace_return(unsigned long
>  	if (unlikely(atomic_read(&current->tracing_graph_pause)))
>  		return;
>  
> +	if (WARN_ONCE(!rcu_is_watching(), "RCU not on for: %pS\n", (void *)ip))
> +		return;
> +

Please add this after the recursion trylock below. Although WARN_ONCE()
should not have recursion issues, function tracing can do weird things, so
I'd rather be safe than sorry and not have the system triple fault due to
some path that might get added in the future.

If rcu_is_watching() is false, it will still get past the recursion check
below and warn. That is, the recursion check below should be done before
this function calls any other function.

>  	bit = ftrace_test_recursion_trylock(ip, *parent);
>  	if (bit < 0)
>  		return;
> 

-- Steve
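
Concretely, the ordering being asked for makes the recursion trylock the
first call in prepare_ftrace_return(), with the RCU check after it; a rough
sketch (untested, names taken from the existing function):

  	bit = ftrace_test_recursion_trylock(ip, *parent);
  	if (bit < 0)
  		return;

  	if (WARN_ONCE(!rcu_is_watching(), "RCU not on for: %pS\n", (void *)ip))
  		goto out;

  	if (!function_graph_enter(ip, *parent, frame_pointer, parent))
  		*parent = return_hooker;
  out:
  	ftrace_test_recursion_unlock(bit);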

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 3/6] ftrace/x86: Warn and ignore graph tracing when RCU is disabled
  2023-01-23 21:53     ` Steven Rostedt
@ 2023-01-23 22:07       ` Steven Rostedt
  -1 siblings, 0 replies; 53+ messages in thread
From: Steven Rostedt @ 2023-01-23 22:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mark.rutland, juri.lelli, daniel.lezcano, wanpengli, kvm, rafael,
	pv-drivers, dave.hansen, virtualization, bsegall, amakhalov,
	will, vschneid, hpa, x86, mingo, mgorman, linux-trace-kernel,
	linux-pm, boqun.feng, bp, vincent.guittot, boris.ostrovsky,
	dietmar.eggemann, jgross, seanjc, linux-kernel, tglx, mhiramat,
	pbonzini, bristot

On Mon, 23 Jan 2023 16:53:04 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Mon, 23 Jan 2023 21:50:12 +0100
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > All RCU disabled code should be noinstr and hence we should never get
> > here -- when we do, WARN about it and make sure to not actually do
> > tracing.
> > 
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> >  arch/x86/kernel/ftrace.c |    3 +++
> >  1 file changed, 3 insertions(+)
> > 
> > --- a/arch/x86/kernel/ftrace.c
> > +++ b/arch/x86/kernel/ftrace.c
> > @@ -646,6 +646,9 @@ void prepare_ftrace_return(unsigned long
> >  	if (unlikely(atomic_read(&current->tracing_graph_pause)))
> >  		return;
> >  
> > +	if (WARN_ONCE(!rcu_is_watching(), "RCU not on for: %pS\n", (void *)ip))
> > +		return;
> > +  
> 
> Please add this after the recursion trylock below. Although WARN_ONCE()
> should not have recursion issues, function tracing can do weird things, so
> I'd rather be safe than sorry and not have the system triple fault due to
> some path that might get added in the future.
> 
> If rcu_is_watching() is false, it will still get past the recursion check
> below and warn. That is, the recursion check below should be done before
> this function calls any other function.
> 
> >  	bit = ftrace_test_recursion_trylock(ip, *parent);
> >  	if (bit < 0)
> >  		return;
> >   
> 

Actually, perhaps we can just add this, and all you need to do is create
and set CONFIG_NO_RCU_TRACING (or some other name).

This should cover all ftrace locations. (Uncompiled).

-- Steve

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index c303f7a114e9..10ee3fbb9113 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -135,6 +135,22 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip);
 # define do_ftrace_record_recursion(ip, pip)	do { } while (0)
 #endif
 
+#ifdef CONFIG_NO_RCU_TRACING
+# define trace_warn_on_no_rcu(ip)					\
+	({								\
+		bool __ret = false;					\
+		if (!trace_recursion_test(TRACE_RECORD_RECURSION_BIT)) { \
+			trace_recursion_set(TRACE_RECORD_RECURSION_BIT); \
+			__ret = WARN_ONCE(!rcu_is_watching(),		\
+					  "RCU not on for: %pS\n", (void *)ip); \
+			trace_recursion_clear(TRACE_RECORD_RECURSION_BIT); \
+		}							\
+		__ret;							\
+	})
+#else
+# define trace_warn_on_no_rcu(ip)	false
+#endif
+
 /*
  * Preemption is promised to be disabled when return bit >= 0.
  */
@@ -144,6 +160,9 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign
 	unsigned int val = READ_ONCE(current->trace_recursion);
 	int bit;
 
+	if (trace_warn_on_no_rcu(ip))
+		return -1;
+
 	bit = trace_get_context_bit() + start;
 	if (unlikely(val & (1 << bit))) {
 		/*
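
Since prepare_ftrace_return() already goes through
ftrace_test_recursion_trylock(), a check placed here would make the explicit
WARN in the x86 patch unnecessary; the graph-tracer entry would simply bail
via the existing call (unchanged code, shown for reference):

  	bit = ftrace_test_recursion_trylock(ip, *parent);
  	if (bit < 0)
  		return;		/* would now also bail and warn when RCU is not watching */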

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH 6/6] cpuidle: Fix poll_idle() noinstr annotation
  2023-01-23 20:50   ` Peter Zijlstra
@ 2023-01-24 14:24     ` Rafael J. Wysocki
  -1 siblings, 0 replies; 53+ messages in thread
From: Rafael J. Wysocki @ 2023-01-24 14:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, will, boqun.feng, mark.rutland, tglx, bp, dave.hansen,
	x86, hpa, seanjc, pbonzini, jgross, srivatsa, amakhalov,
	pv-drivers, rostedt, mhiramat, wanpengli, vkuznets,
	boris.ostrovsky, rafael, daniel.lezcano, juri.lelli,
	vincent.guittot, dietmar.eggemann, bsegall, mgorman, bristot,
	vschneid, linux-kernel, kvm, virtualization, linux-trace-kernel,
	linux-pm, kernel test robot

On Mon, Jan 23, 2023 at 9:58 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> The instrumentation_begin()/end() annotations in poll_idle() were
> complete nonsense. Specifically they caused tracing to happen in the
> middle of noinstr code, resulting in RCU splats.
>
> Now that local_clock() is noinstr, mark up the rest and let it rip.
>
> Fixes: 00717eb8c955 ("cpuidle: Annotate poll_idle()")
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Reported-by: kernel test robot <oliver.sang@intel.com>
> Link: https://lore.kernel.org/oe-lkp/202301192148.58ece903-oliver.sang@intel.com

Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

> ---
>  drivers/cpuidle/cpuidle.c    |    2 +-
>  drivers/cpuidle/poll_state.c |    2 --
>  2 files changed, 1 insertion(+), 3 deletions(-)
>
> --- a/drivers/cpuidle/cpuidle.c
> +++ b/drivers/cpuidle/cpuidle.c
> @@ -426,7 +426,7 @@ void cpuidle_reflect(struct cpuidle_devi
>   * @dev:   the cpuidle device
>   *
>   */
> -u64 cpuidle_poll_time(struct cpuidle_driver *drv,
> +__cpuidle u64 cpuidle_poll_time(struct cpuidle_driver *drv,
>                       struct cpuidle_device *dev)
>  {
>         int i;
> --- a/drivers/cpuidle/poll_state.c
> +++ b/drivers/cpuidle/poll_state.c
> @@ -15,7 +15,6 @@ static int __cpuidle poll_idle(struct cp
>  {
>         u64 time_start;
>
> -       instrumentation_begin();
>         time_start = local_clock();
>
>         dev->poll_time_limit = false;
> @@ -42,7 +41,6 @@ static int __cpuidle poll_idle(struct cp
>         raw_local_irq_disable();
>
>         current_clr_polling();
> -       instrumentation_end();
>
>         return index;
>  }
>
>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 3/6] ftrace/x86: Warn and ignore graph tracing when RCU is disabled
  2023-01-23 22:07       ` Steven Rostedt
@ 2023-01-24 14:44         ` Peter Zijlstra
  -1 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-24 14:44 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: mingo, will, boqun.feng, mark.rutland, tglx, bp, dave.hansen,
	x86, hpa, seanjc, pbonzini, jgross, srivatsa, amakhalov,
	pv-drivers, mhiramat, wanpengli, vkuznets, boris.ostrovsky,
	rafael, daniel.lezcano, juri.lelli, vincent.guittot,
	dietmar.eggemann, bsegall, mgorman, bristot, vschneid,
	linux-kernel, kvm, virtualization, linux-trace-kernel, linux-pm

On Mon, Jan 23, 2023 at 05:07:53PM -0500, Steven Rostedt wrote:

> Actually, perhaps we can just add this, and all you need to do is create
> and set CONFIG_NO_RCU_TRACING (or some other name).

Elsewhere I've used CONFIG_ARCH_WANTS_NO_INSTR for this.

Anyway, I took it for a spin and it ... doesn't seem to do the job.

With my patch the first splat is

  "RCU not on for: cpuidle_poll_time+0x0/0x70"

While with yours I seem to get the endless:

  "WARNING: suspicious RCU usage"

thing. Let me see if I can figure out where it goes side-ways.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/6] A few cpuidle vs rcu fixes
  2023-01-23 20:50 ` Peter Zijlstra
@ 2023-01-24 16:34   ` Mark Rutland
  -1 siblings, 0 replies; 53+ messages in thread
From: Mark Rutland @ 2023-01-24 16:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, will, boqun.feng, tglx, bp, dave.hansen, x86, hpa, seanjc,
	pbonzini, jgross, srivatsa, amakhalov, pv-drivers, rostedt,
	mhiramat, wanpengli, vkuznets, boris.ostrovsky, rafael,
	daniel.lezcano, juri.lelli, vincent.guittot, dietmar.eggemann,
	bsegall, mgorman, bristot, vschneid, linux-kernel, kvm,
	virtualization, linux-trace-kernel, linux-pm

Hi Peter,

On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.

Do you have a link to the splat somewhere?

I'm assuming that this is partially generic, and I'd like to make sure I test
the right thing on arm64. I'll throw my usual lockdep options at the ftrace
selftests...

Thanks,
Mark.

> 
> These patches appear to cure this, the ftrace selftest now runs to completion
> without spamming scary messages to dmesg.
> 
> ---
>  arch/x86/include/asm/atomic64_32.h | 44 +++++++++++++++++++-------------------
>  arch/x86/include/asm/atomic64_64.h | 36 +++++++++++++++----------------
>  arch/x86/include/asm/kvmclock.h    |  2 +-
>  arch/x86/include/asm/paravirt.h    |  2 +-
>  arch/x86/include/asm/pvclock.h     |  3 ++-
>  arch/x86/kernel/cpu/vmware.c       |  2 +-
>  arch/x86/kernel/ftrace.c           |  3 +++
>  arch/x86/kernel/kvmclock.c         |  6 +++---
>  arch/x86/kernel/pvclock.c          | 22 +++++++++++++------
>  arch/x86/kernel/tsc.c              |  7 +++---
>  arch/x86/xen/time.c                | 12 +++++++++--
>  drivers/cpuidle/cpuidle.c          |  2 +-
>  drivers/cpuidle/poll_state.c       |  2 --
>  include/linux/math64.h             |  4 ++--
>  include/linux/sched/clock.h        |  8 +++----
>  kernel/sched/clock.c               | 27 +++++++++++++++++------
>  16 files changed, 107 insertions(+), 75 deletions(-)
> 
> 

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 3/6] ftrace/x86: Warn and ignore graph tracing when RCU is disabled
  2023-01-24 14:44         ` Peter Zijlstra
@ 2023-01-24 17:12           ` Mark Rutland
  -1 siblings, 0 replies; 53+ messages in thread
From: Mark Rutland @ 2023-01-24 17:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Steven Rostedt, mingo, will, boqun.feng, tglx, bp, dave.hansen,
	x86, hpa, seanjc, pbonzini, jgross, srivatsa, amakhalov,
	pv-drivers, mhiramat, wanpengli, vkuznets, boris.ostrovsky,
	rafael, daniel.lezcano, juri.lelli, vincent.guittot,
	dietmar.eggemann, bsegall, mgorman, bristot, vschneid,
	linux-kernel, kvm, virtualization, linux-trace-kernel, linux-pm

On Tue, Jan 24, 2023 at 03:44:35PM +0100, Peter Zijlstra wrote:
> On Mon, Jan 23, 2023 at 05:07:53PM -0500, Steven Rostedt wrote:
> 
> > Actually, perhaps we can just add this, and all you need to do is create
> > and set CONFIG_NO_RCU_TRACING (or some other name).
> 
> Elsewhere I've used CONFIG_ARCH_WANTS_NO_INSTR for this.

Yes please; if we use CONFIG_ARCH_WANTS_NO_INSTR then arm64 will get this "for
free" once we add the missing checks (which I assume we need) in our ftrace_prepare_return().

> Anyway, I took it for a spin and it ... doesn't seem to do the job.
> 
> With my patch the first splat is
> 
>   "RCU not on for: cpuidle_poll_time+0x0/0x70"
> 
> While with yours I seem to get the endless:
> 
>   "WARNING: suspicious RCU usage"
> 
> thing. Let me see if I can figure out where it goes side-ways.

Hmmm... for WARN_ONCE() don't we need to wake RCU first also? I thought we
needed that at least for the printk machinery?

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/6] A few cpuidle vs rcu fixes
  2023-01-24 16:34   ` Mark Rutland
@ 2023-01-24 17:30     ` Mark Rutland
  -1 siblings, 0 replies; 53+ messages in thread
From: Mark Rutland @ 2023-01-24 17:30 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, will, boqun.feng, tglx, bp, dave.hansen, x86, hpa, seanjc,
	pbonzini, jgross, srivatsa, amakhalov, pv-drivers, rostedt,
	mhiramat, wanpengli, vkuznets, boris.ostrovsky, rafael,
	daniel.lezcano, juri.lelli, vincent.guittot, dietmar.eggemann,
	bsegall, mgorman, bristot, vschneid, linux-kernel, kvm,
	virtualization, linux-trace-kernel, linux-pm

On Tue, Jan 24, 2023 at 04:34:23PM +0000, Mark Rutland wrote:
> Hi Peter,
> 
> On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> > 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.
> 
> Do you have a link to the splat somewhere?
> 
> I'm assuming that this is partially generic, and I'd like to make sure I test
> the right thing on arm64. I'll throw my usual lockdep options at the ftrace
> selftests...

Hmm... with the tip sched/core branch, with or without this series applied atop
I see a couple of splats which I don't see with v6.2-rc1 (which seems to be
entirely clean). I'm not seeing any other splats.

I can trigger those reliably with the 'toplevel-enable.tc' ftrace test:

  ./ftracetest test.d/event/toplevel-enable.tc

Splats below; I'll dig into this a bit more tomorrow.

[   65.729252] ------------[ cut here ]------------
[   65.730397] WARNING: CPU: 3 PID: 1162 at include/trace/events/preemptirq.h:55 trace_preempt_on+0x68/0x70
[   65.732450] Modules linked in:
[   65.733204] CPU: 3 PID: 1162 Comm: ftracetest Not tainted 6.2.0-rc1-00100-g1066815869f5 #2
[   65.735165] Hardware name: linux,dummy-virt (DT)
[   65.736278] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[   65.737929] pc : trace_preempt_on+0x68/0x70
[   65.738962] lr : preempt_count_sub+0xb4/0xf0
[   65.739998] sp : ffff80000e03ba70
[   65.740818] x29: ffff80000e03ba70 x28: ffff80000add07e8 x27: ffff800009d0b548
[   65.742531] x26: ffff00000742dd10 x25: ffff00000742dd00 x24: ffff80000ade11d0
[   65.744246] x23: ffff80000e03bb80 x22: ffff80000a99abb0 x21: ffff8000080a5cf4
[   65.745957] x20: ffff8000080a5cf4 x19: 0000000000000001 x18: 0000000000000000
[   65.747677] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
[   65.749388] x14: 0000000000000028 x13: 00000000000042d7 x12: 000000000000035f
[   65.751105] x11: 000000000000035f x10: 000000000004035f x9 : ffff8000080a5cf4
[   65.752820] x8 : ffff80000ae31a18 x7 : 0000000000000000 x6 : 0000000000000001
[   65.754526] x5 : ffff80000a8e14e8 x4 : 0000000000000003 x3 : 0000000000000000
[   65.756244] x2 : 0000000000000001 x1 : ffff8000080a5cf4 x0 : ffff8000080a5cf4
[   65.757957] Call trace:
[   65.758572]  trace_preempt_on+0x68/0x70
[   65.759520]  preempt_count_sub+0xb4/0xf0
[   65.760477]  percpu_up_read.constprop.0+0xc4/0x180
[   65.761639]  cpus_read_unlock+0x18/0x24
[   65.762579]  static_key_enable+0x2c/0x40
[   65.763572]  tracepoint_add_func+0x330/0x3dc
[   65.764611]  tracepoint_probe_register+0x74/0xc0
[   65.765725]  trace_event_reg+0x8c/0xa0
[   65.766642]  __ftrace_event_enable_disable+0x174/0x4d0
[   65.767884]  __ftrace_set_clr_event_nolock+0xe0/0x150
[   65.769109]  ftrace_set_clr_event+0x90/0x13c
[   65.770143]  ftrace_event_write+0xd4/0x120
[   65.771145]  vfs_write+0xcc/0x2f0
[   65.771964]  ksys_write+0x78/0x110
[   65.772803]  __arm64_sys_write+0x24/0x30
[   65.773763]  invoke_syscall+0x50/0x120
[   65.774681]  el0_svc_common.constprop.0+0x68/0x124
[   65.775848]  do_el0_svc+0x40/0xbc
[   65.776669]  el0_svc+0x48/0xc0
[   65.777426]  el0t_64_sync_handler+0xf4/0x120
[   65.778459]  el0t_64_sync+0x190/0x194
[   65.779365] irq event stamp: 69686
[   65.780199] hardirqs last  enabled at (69685): [<ffff8000092d5664>] _raw_spin_unlock_irqrestore+0x80/0xa0
[   65.782457] hardirqs last disabled at (69686): [<ffff8000092c3fd4>] el1_dbg+0x24/0x90
[   65.784315] softirqs last  enabled at (69622): [<ffff800008010b08>] __do_softirq+0x448/0x5bc
[   65.786309] softirqs last disabled at (69613): [<ffff800008017288>] ____do_softirq+0x18/0x24
[   65.788332] ---[ end trace 0000000000000000 ]---
[   65.789588] ------------[ cut here ]------------
[   65.790622] WARNING: CPU: 3 PID: 1162 at include/trace/events/preemptirq.h:51 trace_preempt_off+0x68/0xb0
[   65.792698] Modules linked in:
[   65.793465] CPU: 3 PID: 1162 Comm: ftracetest Tainted: G        W          6.2.0-rc1-00100-g1066815869f5 #2
[   65.795780] Hardware name: linux,dummy-virt (DT)
[   65.796898] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[   65.798555] pc : trace_preempt_off+0x68/0xb0
[   65.799602] lr : preempt_count_add+0xa0/0xc0
[   65.800646] sp : ffff80000e03ba80
[   65.801465] x29: ffff80000e03ba80 x28: ffff80000add07e8 x27: ffff800009d0b558
[   65.803185] x26: ffff00000742dd90 x25: ffff00000742dd80 x24: ffff80000ade1188
[   65.804900] x23: ffff80000e03bb80 x22: ffff80000a99abb0 x21: ffff80000b8b7d18
[   65.806612] x20: ffff8000080a5c68 x19: ffff8000080a5c68 x18: 0000000000000000
[   65.808334] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
[   65.810041] x14: 0000000000000028 x13: 00000000000042d7 x12: 000000000000035f
[   65.811755] x11: 000000000000035f x10: 000000000004035f x9 : ffff8000080a5c68
[   65.813460] x8 : ffff80000ae31a18 x7 : 0000000000000000 x6 : 0000000000000003
[   65.815174] x5 : 0000000030b5c3ca x4 : 0000000000000003 x3 : 0000000000000000
[   65.816886] x2 : 0000000000000001 x1 : ffff8000080a5c68 x0 : ffff8000080a5c68
[   65.818592] Call trace:
[   65.819216]  trace_preempt_off+0x68/0xb0
[   65.820171]  preempt_count_add+0xa0/0xc0
[   65.821131]  percpu_up_read.constprop.0+0x38/0x180
[   65.822288]  cpus_read_unlock+0x18/0x24
[   65.823236]  static_key_enable+0x2c/0x40
[   65.824194]  tracepoint_add_func+0x330/0x3dc
[   65.825236]  tracepoint_probe_register+0x74/0xc0
[   65.826351]  trace_event_reg+0x8c/0xa0
[   65.827276]  __ftrace_event_enable_disable+0x174/0x4d0
[   65.828506]  __ftrace_set_clr_event_nolock+0xe0/0x150
[   65.829721]  ftrace_set_clr_event+0x90/0x13c
[   65.830769]  ftrace_event_write+0xd4/0x120
[   65.831766]  vfs_write+0xcc/0x2f0
[   65.832581]  ksys_write+0x78/0x110
[   65.833422]  __arm64_sys_write+0x24/0x30
[   65.834376]  invoke_syscall+0x50/0x120
[   65.835300]  el0_svc_common.constprop.0+0x68/0x124
[   65.836451]  do_el0_svc+0x40/0xbc
[   65.837290]  el0_svc+0x48/0xc0
[   65.838054]  el0t_64_sync_handler+0xf4/0x120
[   65.839102]  el0t_64_sync+0x190/0x194
[   65.840006] irq event stamp: 69710
[   65.840845] hardirqs last  enabled at (69709): [<ffff8000092c4028>] el1_dbg+0x78/0x90
[   65.842699] hardirqs last disabled at (69710): [<ffff8000092c3fd4>] el1_dbg+0x24/0x90
[   65.844568] softirqs last  enabled at (69694): [<ffff800008010b08>] __do_softirq+0x448/0x5bc
[   65.846573] softirqs last disabled at (69689): [<ffff800008017288>] ____do_softirq+0x18/0x24
[   65.848578] ---[ end trace 0000000000000000 ]---

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/6] A few cpuidle vs rcu fixes
  2023-01-24 17:30     ` Mark Rutland
@ 2023-01-24 18:39       ` Mark Rutland
  -1 siblings, 0 replies; 53+ messages in thread
From: Mark Rutland @ 2023-01-24 18:39 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: juri.lelli, daniel.lezcano, wanpengli, kvm, rafael, pv-drivers,
	dave.hansen, virtualization, bsegall, amakhalov, will, vschneid,
	hpa, x86, mingo, mgorman, linux-trace-kernel, linux-pm,
	boqun.feng, rostedt, bp, vincent.guittot, boris.ostrovsky,
	dietmar.eggemann, jgross, seanjc, linux-kernel, tglx, mhiramat,
	pbonzini, bristot

On Tue, Jan 24, 2023 at 05:30:29PM +0000, Mark Rutland wrote:
> On Tue, Jan 24, 2023 at 04:34:23PM +0000, Mark Rutland wrote:
> > Hi Peter,
> > 
> > On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> > > 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.
> > 
> > Do you have a link to the splat somewhere?
> > 
> > I'm assuming that this is partially generic, and I'd like to make sure I test
> > the right thing on arm64. I'll throw my usual lockdep options at the ftrace
> > selftests...
> 
> Hmm... with the tip sched/core branch, with or without this series applied atop
> I see a couple of splats which I don't see with v6.2-rc1 (which seems to be
> entirely clean). I'm not seeing any other splats.
> 
> I can trigger those reliably with the 'toplevel-enable.tc' ftrace test:
> 
>   ./ftracetest test.d/event/toplevel-enable.tc
> 
> Splats below; I'll dig into this a bit more tomorrow.
> 
> [   65.729252] ------------[ cut here ]------------
> [   65.730397] WARNING: CPU: 3 PID: 1162 at include/trace/events/preemptirq.h:55 trace_preempt_on+0x68/0x70

The line number here is a bit inscrutable, but a bisect led me down to commit

  408b961146be4c1a ("tracing: WARN on rcuidle")

... and it appears this must be the RCUIDLE_COND() warning that commit adds, and that
seems to be because trace_preempt_on() calls trace_preempt_enable_rcuidle():

| void trace_preempt_on(unsigned long a0, unsigned long a1)
| {
|         if (!in_nmi())
|                 trace_preempt_enable_rcuidle(a0, a1);
|         tracer_preempt_on(a0, a1);
| }

It looks like that tracing is dependent upon CONFIG_TRACE_PREEMPT_TOGGLE, and I
have that because I enabled CONFIG_PREEMPT_TRACER. I reckon the same should
happen on x86 with CONFIG_PREEMPT_TRACER=y.

IIUC we'll need to clean up that trace_.*_rcuidle() usage too, but I'm not
entirely sure how to do that.
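
One possible shape for that cleanup (a sketch only, not something this series
does) is to drop the _rcuidle variant now that these paths are expected to
run with RCU watching:

  void trace_preempt_on(unsigned long a0, unsigned long a1)
  {
          if (!in_nmi())
                  trace_preempt_enable(a0, a1);	/* was trace_preempt_enable_rcuidle() */
          tracer_preempt_on(a0, a1);
  }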

Thanks,
Mark.

> [   65.732450] Modules linked in:
> [   65.733204] CPU: 3 PID: 1162 Comm: ftracetest Not tainted 6.2.0-rc1-00100-g1066815869f5 #2
> [   65.735165] Hardware name: linux,dummy-virt (DT)
> [   65.736278] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> [   65.737929] pc : trace_preempt_on+0x68/0x70
> [   65.738962] lr : preempt_count_sub+0xb4/0xf0
> [   65.739998] sp : ffff80000e03ba70
> [   65.740818] x29: ffff80000e03ba70 x28: ffff80000add07e8 x27: ffff800009d0b548
> [   65.742531] x26: ffff00000742dd10 x25: ffff00000742dd00 x24: ffff80000ade11d0
> [   65.744246] x23: ffff80000e03bb80 x22: ffff80000a99abb0 x21: ffff8000080a5cf4
> [   65.745957] x20: ffff8000080a5cf4 x19: 0000000000000001 x18: 0000000000000000
> [   65.747677] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
> [   65.749388] x14: 0000000000000028 x13: 00000000000042d7 x12: 000000000000035f
> [   65.751105] x11: 000000000000035f x10: 000000000004035f x9 : ffff8000080a5cf4
> [   65.752820] x8 : ffff80000ae31a18 x7 : 0000000000000000 x6 : 0000000000000001
> [   65.754526] x5 : ffff80000a8e14e8 x4 : 0000000000000003 x3 : 0000000000000000
> [   65.756244] x2 : 0000000000000001 x1 : ffff8000080a5cf4 x0 : ffff8000080a5cf4
> [   65.757957] Call trace:
> [   65.758572]  trace_preempt_on+0x68/0x70
> [   65.759520]  preempt_count_sub+0xb4/0xf0
> [   65.760477]  percpu_up_read.constprop.0+0xc4/0x180
> [   65.761639]  cpus_read_unlock+0x18/0x24
> [   65.762579]  static_key_enable+0x2c/0x40
> [   65.763572]  tracepoint_add_func+0x330/0x3dc
> [   65.764611]  tracepoint_probe_register+0x74/0xc0
> [   65.765725]  trace_event_reg+0x8c/0xa0
> [   65.766642]  __ftrace_event_enable_disable+0x174/0x4d0
> [   65.767884]  __ftrace_set_clr_event_nolock+0xe0/0x150
> [   65.769109]  ftrace_set_clr_event+0x90/0x13c
> [   65.770143]  ftrace_event_write+0xd4/0x120
> [   65.771145]  vfs_write+0xcc/0x2f0
> [   65.771964]  ksys_write+0x78/0x110
> [   65.772803]  __arm64_sys_write+0x24/0x30
> [   65.773763]  invoke_syscall+0x50/0x120
> [   65.774681]  el0_svc_common.constprop.0+0x68/0x124
> [   65.775848]  do_el0_svc+0x40/0xbc
> [   65.776669]  el0_svc+0x48/0xc0
> [   65.777426]  el0t_64_sync_handler+0xf4/0x120
> [   65.778459]  el0t_64_sync+0x190/0x194
> [   65.779365] irq event stamp: 69686
> [   65.780199] hardirqs last  enabled at (69685): [<ffff8000092d5664>] _raw_spin_unlock_irqrestore+0x80/0xa0
> [   65.782457] hardirqs last disabled at (69686): [<ffff8000092c3fd4>] el1_dbg+0x24/0x90
> [   65.784315] softirqs last  enabled at (69622): [<ffff800008010b08>] __do_softirq+0x448/0x5bc
> [   65.786309] softirqs last disabled at (69613): [<ffff800008017288>] ____do_softirq+0x18/0x24
> [   65.788332] ---[ end trace 0000000000000000 ]---
> [   65.789588] ------------[ cut here ]------------
> [   65.790622] WARNING: CPU: 3 PID: 1162 at include/trace/events/preemptirq.h:51 trace_preempt_off+0x68/0xb0
> [   65.792698] Modules linked in:
> [   65.793465] CPU: 3 PID: 1162 Comm: ftracetest Tainted: G        W          6.2.0-rc1-00100-g1066815869f5 #2
> [   65.795780] Hardware name: linux,dummy-virt (DT)
> [   65.796898] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> [   65.798555] pc : trace_preempt_off+0x68/0xb0
> [   65.799602] lr : preempt_count_add+0xa0/0xc0
> [   65.800646] sp : ffff80000e03ba80
> [   65.801465] x29: ffff80000e03ba80 x28: ffff80000add07e8 x27: ffff800009d0b558
> [   65.803185] x26: ffff00000742dd90 x25: ffff00000742dd80 x24: ffff80000ade1188
> [   65.804900] x23: ffff80000e03bb80 x22: ffff80000a99abb0 x21: ffff80000b8b7d18
> [   65.806612] x20: ffff8000080a5c68 x19: ffff8000080a5c68 x18: 0000000000000000
> [   65.808334] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
> [   65.810041] x14: 0000000000000028 x13: 00000000000042d7 x12: 000000000000035f
> [   65.811755] x11: 000000000000035f x10: 000000000004035f x9 : ffff8000080a5c68
> [   65.813460] x8 : ffff80000ae31a18 x7 : 0000000000000000 x6 : 0000000000000003
> [   65.815174] x5 : 0000000030b5c3ca x4 : 0000000000000003 x3 : 0000000000000000
> [   65.816886] x2 : 0000000000000001 x1 : ffff8000080a5c68 x0 : ffff8000080a5c68
> [   65.818592] Call trace:
> [   65.819216]  trace_preempt_off+0x68/0xb0
> [   65.820171]  preempt_count_add+0xa0/0xc0
> [   65.821131]  percpu_up_read.constprop.0+0x38/0x180
> [   65.822288]  cpus_read_unlock+0x18/0x24
> [   65.823236]  static_key_enable+0x2c/0x40
> [   65.824194]  tracepoint_add_func+0x330/0x3dc
> [   65.825236]  tracepoint_probe_register+0x74/0xc0
> [   65.826351]  trace_event_reg+0x8c/0xa0
> [   65.827276]  __ftrace_event_enable_disable+0x174/0x4d0
> [   65.828506]  __ftrace_set_clr_event_nolock+0xe0/0x150
> [   65.829721]  ftrace_set_clr_event+0x90/0x13c
> [   65.830769]  ftrace_event_write+0xd4/0x120
> [   65.831766]  vfs_write+0xcc/0x2f0
> [   65.832581]  ksys_write+0x78/0x110
> [   65.833422]  __arm64_sys_write+0x24/0x30
> [   65.834376]  invoke_syscall+0x50/0x120
> [   65.835300]  el0_svc_common.constprop.0+0x68/0x124
> [   65.836451]  do_el0_svc+0x40/0xbc
> [   65.837290]  el0_svc+0x48/0xc0
> [   65.838054]  el0t_64_sync_handler+0xf4/0x120
> [   65.839102]  el0t_64_sync+0x190/0x194
> [   65.840006] irq event stamp: 69710
> [   65.840845] hardirqs last  enabled at (69709): [<ffff8000092c4028>] el1_dbg+0x78/0x90
> [   65.842699] hardirqs last disabled at (69710): [<ffff8000092c3fd4>] el1_dbg+0x24/0x90
> [   65.844568] softirqs last  enabled at (69694): [<ffff800008010b08>] __do_softirq+0x448/0x5bc
> [   65.846573] softirqs last disabled at (69689): [<ffff800008017288>] ____do_softirq+0x18/0x24
> [   65.848578] ---[ end trace 0000000000000000 ]---
> 
> Thanks,
> Mark.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/6] A few cpuidle vs rcu fixes
@ 2023-01-24 18:39       ` Mark Rutland
  0 siblings, 0 replies; 53+ messages in thread
From: Mark Rutland @ 2023-01-24 18:39 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, will, boqun.feng, tglx, bp, dave.hansen, x86, hpa, seanjc,
	pbonzini, jgross, srivatsa, amakhalov, pv-drivers, rostedt,
	mhiramat, wanpengli, vkuznets, boris.ostrovsky, rafael,
	daniel.lezcano, juri.lelli, vincent.guittot, dietmar.eggemann,
	bsegall, mgorman, bristot, vschneid, linux-kernel, kvm,
	virtualization, linux-trace-kernel, linux-pm

On Tue, Jan 24, 2023 at 05:30:29PM +0000, Mark Rutland wrote:
> On Tue, Jan 24, 2023 at 04:34:23PM +0000, Mark Rutland wrote:
> > Hi Peter,
> > 
> > On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> > > 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.
> > 
> > Do you have a link to the splat somewhere?
> > 
> > I'm assuming that this is partially generic, and I'd like to make sure I test
> > the right thing on arm64. I'll throw my usual lockdep options at the ftrace
> > selftests...
> 
> Hmm... with the tip sched/core branch, with or without this series applied atop
> I see a couple of splats which I don't see with v6.2-rc1 (which seems to be
> entirely clean). I'm not seeing any other splats.
> 
> I can trigger those reliably with the 'toplevel-enable.tc' ftrace test:
> 
>   ./ftracetest test.d/event/toplevel-enable.tc
> 
> Splats below; I'll dig into this a bit more tomorrow.
> 
> [   65.729252] ------------[ cut here ]------------
> [   65.730397] WARNING: CPU: 3 PID: 1162 at include/trace/events/preemptirq.h:55 trace_preempt_on+0x68/0x70

The line number here is a bit inscrutable, but a bisect led me down to commit

  408b961146be4c1a ("tracing: WARN on rcuidle")

... and it appears this must be the RCUIDLE_COND() warning that commit adds, and that
seems to be because trace_preempt_on() calls trace_preempt_enable_rcuidle():

| void trace_preempt_on(unsigned long a0, unsigned long a1)
| {
|         if (!in_nmi())
|                 trace_preempt_enable_rcuidle(a0, a1);
|         tracer_preempt_on(a0, a1);
| }

It looks like that tracing is dependent upon CONFIG_TRACE_PREEMPT_TOGGLE, and I
have that because I enabled CONFIG_PREEMPT_TRACER. I reckon the same should
happen on x86 with CONFIG_PREEMPT_TRACER=y.

IIUC we'll need to clean up that trace_.*_rcuidle() usage too, but I'm not
entirely sure how to do that.
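
For anyone following along, here is a minimal userspace sketch of what that
check appears to boil down to. The RCUIDLE_COND() shape below is paraphrased
from memory of that commit, and ARCH_WANTS_NO_INSTR / in_nmi() are stand-ins,
so treat the exact condition as an assumption rather than a quote of
include/linux/tracepoint.h:

/* rcuidle_cond_sketch.c - illustration only, not kernel code */
#include <stdio.h>
#include <stdbool.h>

static bool in_nmi(void) { return false; }	/* stand-in for the kernel helper */

#ifdef ARCH_WANTS_NO_INSTR
/* noinstr architectures: RCU is always watching here, so any _rcuidle use warns */
#define RCUIDLE_COND(rcuidle)	(rcuidle)
#else
/* older architectures: only the _rcuidle path from NMI context warns */
#define RCUIDLE_COND(rcuidle)	((rcuidle) && in_nmi())
#endif

static void do_trace(bool rcuidle)
{
	if (RCUIDLE_COND(rcuidle)) {
		/* the kernel WARN_ON_ONCE()s here -> the preemptirq.h splats above */
		fprintf(stderr, "WARN: _rcuidle tracepoint used\n");
		return;
	}
	printf("tracepoint fires normally\n");
}

int main(void)
{
	do_trace(false);	/* regular tracepoint */
	do_trace(true);		/* _rcuidle variant, as trace_preempt_enable_rcuidle() */
	return 0;
}

Built with -DARCH_WANTS_NO_INSTR the second call warns unconditionally, which
would explain why trace_preempt_on()'s _rcuidle call now splats here.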

Thanks,
Mark.

> [   65.732450] Modules linked in:
> [   65.733204] CPU: 3 PID: 1162 Comm: ftracetest Not tainted 6.2.0-rc1-00100-g1066815869f5 #2
> [   65.735165] Hardware name: linux,dummy-virt (DT)
> [   65.736278] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> [   65.737929] pc : trace_preempt_on+0x68/0x70
> [   65.738962] lr : preempt_count_sub+0xb4/0xf0
> [   65.739998] sp : ffff80000e03ba70
> [   65.740818] x29: ffff80000e03ba70 x28: ffff80000add07e8 x27: ffff800009d0b548
> [   65.742531] x26: ffff00000742dd10 x25: ffff00000742dd00 x24: ffff80000ade11d0
> [   65.744246] x23: ffff80000e03bb80 x22: ffff80000a99abb0 x21: ffff8000080a5cf4
> [   65.745957] x20: ffff8000080a5cf4 x19: 0000000000000001 x18: 0000000000000000
> [   65.747677] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
> [   65.749388] x14: 0000000000000028 x13: 00000000000042d7 x12: 000000000000035f
> [   65.751105] x11: 000000000000035f x10: 000000000004035f x9 : ffff8000080a5cf4
> [   65.752820] x8 : ffff80000ae31a18 x7 : 0000000000000000 x6 : 0000000000000001
> [   65.754526] x5 : ffff80000a8e14e8 x4 : 0000000000000003 x3 : 0000000000000000
> [   65.756244] x2 : 0000000000000001 x1 : ffff8000080a5cf4 x0 : ffff8000080a5cf4
> [   65.757957] Call trace:
> [   65.758572]  trace_preempt_on+0x68/0x70
> [   65.759520]  preempt_count_sub+0xb4/0xf0
> [   65.760477]  percpu_up_read.constprop.0+0xc4/0x180
> [   65.761639]  cpus_read_unlock+0x18/0x24
> [   65.762579]  static_key_enable+0x2c/0x40
> [   65.763572]  tracepoint_add_func+0x330/0x3dc
> [   65.764611]  tracepoint_probe_register+0x74/0xc0
> [   65.765725]  trace_event_reg+0x8c/0xa0
> [   65.766642]  __ftrace_event_enable_disable+0x174/0x4d0
> [   65.767884]  __ftrace_set_clr_event_nolock+0xe0/0x150
> [   65.769109]  ftrace_set_clr_event+0x90/0x13c
> [   65.770143]  ftrace_event_write+0xd4/0x120
> [   65.771145]  vfs_write+0xcc/0x2f0
> [   65.771964]  ksys_write+0x78/0x110
> [   65.772803]  __arm64_sys_write+0x24/0x30
> [   65.773763]  invoke_syscall+0x50/0x120
> [   65.774681]  el0_svc_common.constprop.0+0x68/0x124
> [   65.775848]  do_el0_svc+0x40/0xbc
> [   65.776669]  el0_svc+0x48/0xc0
> [   65.777426]  el0t_64_sync_handler+0xf4/0x120
> [   65.778459]  el0t_64_sync+0x190/0x194
> [   65.779365] irq event stamp: 69686
> [   65.780199] hardirqs last  enabled at (69685): [<ffff8000092d5664>] _raw_spin_unlock_irqrestore+0x80/0xa0
> [   65.782457] hardirqs last disabled at (69686): [<ffff8000092c3fd4>] el1_dbg+0x24/0x90
> [   65.784315] softirqs last  enabled at (69622): [<ffff800008010b08>] __do_softirq+0x448/0x5bc
> [   65.786309] softirqs last disabled at (69613): [<ffff800008017288>] ____do_softirq+0x18/0x24
> [   65.788332] ---[ end trace 0000000000000000 ]---
> [   65.789588] ------------[ cut here ]------------
> [   65.790622] WARNING: CPU: 3 PID: 1162 at include/trace/events/preemptirq.h:51 trace_preempt_off+0x68/0xb0
> [   65.792698] Modules linked in:
> [   65.793465] CPU: 3 PID: 1162 Comm: ftracetest Tainted: G        W          6.2.0-rc1-00100-g1066815869f5 #2
> [   65.795780] Hardware name: linux,dummy-virt (DT)
> [   65.796898] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> [   65.798555] pc : trace_preempt_off+0x68/0xb0
> [   65.799602] lr : preempt_count_add+0xa0/0xc0
> [   65.800646] sp : ffff80000e03ba80
> [   65.801465] x29: ffff80000e03ba80 x28: ffff80000add07e8 x27: ffff800009d0b558
> [   65.803185] x26: ffff00000742dd90 x25: ffff00000742dd80 x24: ffff80000ade1188
> [   65.804900] x23: ffff80000e03bb80 x22: ffff80000a99abb0 x21: ffff80000b8b7d18
> [   65.806612] x20: ffff8000080a5c68 x19: ffff8000080a5c68 x18: 0000000000000000
> [   65.808334] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
> [   65.810041] x14: 0000000000000028 x13: 00000000000042d7 x12: 000000000000035f
> [   65.811755] x11: 000000000000035f x10: 000000000004035f x9 : ffff8000080a5c68
> [   65.813460] x8 : ffff80000ae31a18 x7 : 0000000000000000 x6 : 0000000000000003
> [   65.815174] x5 : 0000000030b5c3ca x4 : 0000000000000003 x3 : 0000000000000000
> [   65.816886] x2 : 0000000000000001 x1 : ffff8000080a5c68 x0 : ffff8000080a5c68
> [   65.818592] Call trace:
> [   65.819216]  trace_preempt_off+0x68/0xb0
> [   65.820171]  preempt_count_add+0xa0/0xc0
> [   65.821131]  percpu_up_read.constprop.0+0x38/0x180
> [   65.822288]  cpus_read_unlock+0x18/0x24
> [   65.823236]  static_key_enable+0x2c/0x40
> [   65.824194]  tracepoint_add_func+0x330/0x3dc
> [   65.825236]  tracepoint_probe_register+0x74/0xc0
> [   65.826351]  trace_event_reg+0x8c/0xa0
> [   65.827276]  __ftrace_event_enable_disable+0x174/0x4d0
> [   65.828506]  __ftrace_set_clr_event_nolock+0xe0/0x150
> [   65.829721]  ftrace_set_clr_event+0x90/0x13c
> [   65.830769]  ftrace_event_write+0xd4/0x120
> [   65.831766]  vfs_write+0xcc/0x2f0
> [   65.832581]  ksys_write+0x78/0x110
> [   65.833422]  __arm64_sys_write+0x24/0x30
> [   65.834376]  invoke_syscall+0x50/0x120
> [   65.835300]  el0_svc_common.constprop.0+0x68/0x124
> [   65.836451]  do_el0_svc+0x40/0xbc
> [   65.837290]  el0_svc+0x48/0xc0
> [   65.838054]  el0t_64_sync_handler+0xf4/0x120
> [   65.839102]  el0t_64_sync+0x190/0x194
> [   65.840006] irq event stamp: 69710
> [   65.840845] hardirqs last  enabled at (69709): [<ffff8000092c4028>] el1_dbg+0x78/0x90
> [   65.842699] hardirqs last disabled at (69710): [<ffff8000092c3fd4>] el1_dbg+0x24/0x90
> [   65.844568] softirqs last  enabled at (69694): [<ffff800008010b08>] __do_softirq+0x448/0x5bc
> [   65.846573] softirqs last disabled at (69689): [<ffff800008017288>] ____do_softirq+0x18/0x24
> [   65.848578] ---[ end trace 0000000000000000 ]---
> 
> Thanks,
> Mark.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/6] A few cpuidle vs rcu fixes
  2023-01-24 16:34   ` Mark Rutland
@ 2023-01-25  9:31     ` Peter Zijlstra
  -1 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-25  9:31 UTC (permalink / raw)
  To: Mark Rutland
  Cc: mingo, will, boqun.feng, tglx, bp, dave.hansen, x86, hpa, seanjc,
	pbonzini, jgross, srivatsa, amakhalov, pv-drivers, rostedt,
	mhiramat, wanpengli, vkuznets, boris.ostrovsky, rafael,
	daniel.lezcano, juri.lelli, vincent.guittot, dietmar.eggemann,
	bsegall, mgorman, bristot, vschneid, linux-kernel, kvm,
	virtualization, linux-trace-kernel, linux-pm

On Tue, Jan 24, 2023 at 04:34:23PM +0000, Mark Rutland wrote:
> Hi Peter,
> 
> On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> > 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.
> 
> Do you have a link to the splat somewhere?
> 
> I'm assuming that this is partially generic, and I'd like to make sure I test
> the right thing on arm64. I'll throw my usual lockdep options at the ftrace
> selftests...

0-day triggered this by running tools/testing/selftests/ftrace/ftracetest,
which is what I've been using to reproduce.

If that doesn't work for you I can try and dig out the 0day email to see
if it has more details.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/6] A few cpuidle vs rcu fixes
@ 2023-01-25  9:31     ` Peter Zijlstra
  0 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-25  9:31 UTC (permalink / raw)
  To: Mark Rutland
  Cc: juri.lelli, daniel.lezcano, wanpengli, kvm, rafael, pv-drivers,
	dave.hansen, virtualization, bsegall, amakhalov, will, vschneid,
	hpa, x86, mingo, mgorman, linux-trace-kernel, linux-pm,
	boqun.feng, rostedt, bp, vincent.guittot, boris.ostrovsky,
	dietmar.eggemann, jgross, seanjc, linux-kernel, tglx, mhiramat,
	pbonzini, bristot

On Tue, Jan 24, 2023 at 04:34:23PM +0000, Mark Rutland wrote:
> Hi Peter,
> 
> On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> > 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.
> 
> Do you have a link to the splat somewhere?
> 
> I'm assuming that this is partially generic, and I'd like to make sure I test
> the right thing on arm64. I'll throw my usual lockdep options at the ftrace
> selftests...

0-day triggered this by running tools/testing/selftests/ftrace/ftracetest,
which is what I've been using to reproduce.

If that doesn't work for you I can try and dig out the 0day email to see
if it has more details.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/6] A few cpuidle vs rcu fixes
  2023-01-24 18:39       ` Mark Rutland
@ 2023-01-25  9:35         ` Peter Zijlstra
  -1 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-25  9:35 UTC (permalink / raw)
  To: Mark Rutland
  Cc: mingo, will, boqun.feng, tglx, bp, dave.hansen, x86, hpa, seanjc,
	pbonzini, jgross, srivatsa, amakhalov, pv-drivers, rostedt,
	mhiramat, wanpengli, vkuznets, boris.ostrovsky, rafael,
	daniel.lezcano, juri.lelli, vincent.guittot, dietmar.eggemann,
	bsegall, mgorman, bristot, vschneid, linux-kernel, kvm,
	virtualization, linux-trace-kernel, linux-pm

On Tue, Jan 24, 2023 at 06:39:12PM +0000, Mark Rutland wrote:
> On Tue, Jan 24, 2023 at 05:30:29PM +0000, Mark Rutland wrote:
> > On Tue, Jan 24, 2023 at 04:34:23PM +0000, Mark Rutland wrote:
> > > Hi Peter,
> > > 
> > > On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> > > > 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.
> > > 
> > > Do you have a link to the splat somewhere?
> > > 
> > > I'm assuming that this is partially generic, and I'd like to make sure I test
> > > the right thing on arm64. I'll throw my usual lockdep options at the ftrace
> > > selftests...
> > 
> > Hmm... with the tip sched/core branch, with or without this series applied atop
> > I see a couple of splats which I don't see with v6.2-rc1 (which seems to be
> > entirely clean). I'm not seeing any other splats.
> > 
> > I can trigger those reliably with the 'toplevel-enable.tc' ftrace test:
> > 
> >   ./ftracetest test.d/event/toplevel-enable.tc
> > 
> > Splats below; I'll dig into this a bit more tomorrow.
> > 
> > [   65.729252] ------------[ cut here ]------------
> > [   65.730397] WARNING: CPU: 3 PID: 1162 at include/trace/events/preemptirq.h:55 trace_preempt_on+0x68/0x70
> 
> The line number here is a bit inscrutable, but a bisect led me down to commit
> 
>   408b961146be4c1a ("tracing: WARN on rcuidle")
> 
> ... and it appears this must be the RCUIDLE_COND() warning that commit adds, and that
> seems to be because trace_preempt_on() calls trace_preempt_enable_rcuidle():
> 
> | void trace_preempt_on(unsigned long a0, unsigned long a1)
> | {
> |         if (!in_nmi())
> |                 trace_preempt_enable_rcuidle(a0, a1);
> |         tracer_preempt_on(a0, a1);
> | }
> 
> It looks like that tracing is dependent upon CONFIG_TRACE_PREEMPT_TOGGLE, and I
> have that because I enabled CONFIG_PREEMPT_TRACER. I reckon the same should
> happen on x86 with CONFIG_PREEMPT_TRACER=y.
> 
> IIUC we'll need to clean up that trace_.*_rcuidle() usage too, but I'm not
> entirely sure how to do that.

tip/sched/core contains the following patch addressing this:

---
commit 9aedeaed6fc6fe8452b9b8225e95cc2b8631ff91
Author: Peter Zijlstra <peterz@infradead.org>
Date:   Thu Jan 12 20:43:49 2023 +0100

    tracing, hardirq: No moar _rcuidle() tracing
    
    Robot reported that trace_hardirqs_{on,off}() tickle the forbidden
    _rcuidle() tracepoint through local_irq_{en,dis}able().
    
    For 'sane' configs, these calls will only happen with RCU enabled and
    as such can use the regular tracepoint. This also means it's possible
    to trace them from NMI context again.
    
    Reported-by: kernel test robot <lkp@intel.com>
    Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
    Signed-off-by: Ingo Molnar <mingo@kernel.org>
    Link: https://lore.kernel.org/r/20230112195541.477416709@infradead.org

diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index 629f2854e12b..f992444a0b1f 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -19,6 +19,20 @@
 /* Per-cpu variable to prevent redundant calls when IRQs already off */
 static DEFINE_PER_CPU(int, tracing_irq_cpu);
 
+/*
+ * Use regular trace points on architectures that implement noinstr
+ * tooling: these calls will only happen with RCU enabled, which can
+ * use a regular tracepoint.
+ *
+ * On older architectures, use the rcuidle tracing methods (which
+ * aren't NMI-safe - so exclude NMI contexts):
+ */
+#ifdef CONFIG_ARCH_WANTS_NO_INSTR
+#define trace(point)	trace_##point
+#else
+#define trace(point)	if (!in_nmi()) trace_##point##_rcuidle
+#endif
+
 /*
  * Like trace_hardirqs_on() but without the lockdep invocation. This is
  * used in the low level entry code where the ordering vs. RCU is important
@@ -28,8 +42,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu);
 void trace_hardirqs_on_prepare(void)
 {
 	if (this_cpu_read(tracing_irq_cpu)) {
-		if (!in_nmi())
-			trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1);
+		trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
 		tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
 		this_cpu_write(tracing_irq_cpu, 0);
 	}
@@ -40,8 +53,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);
 void trace_hardirqs_on(void)
 {
 	if (this_cpu_read(tracing_irq_cpu)) {
-		if (!in_nmi())
-			trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+		trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
 		tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
 		this_cpu_write(tracing_irq_cpu, 0);
 	}
@@ -63,8 +75,7 @@ void trace_hardirqs_off_finish(void)
 	if (!this_cpu_read(tracing_irq_cpu)) {
 		this_cpu_write(tracing_irq_cpu, 1);
 		tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
-		if (!in_nmi())
-			trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1);
+		trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
 	}
 
 }
@@ -78,8 +89,7 @@ void trace_hardirqs_off(void)
 	if (!this_cpu_read(tracing_irq_cpu)) {
 		this_cpu_write(tracing_irq_cpu, 1);
 		tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
-		if (!in_nmi())
-			trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+		trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
 	}
 }
 EXPORT_SYMBOL(trace_hardirqs_off);
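
To make the macro's effect concrete, here is a tiny standalone demo of the
same token-pasting pattern. The helper names (in_nmi(), the trace_irq_enable*
stubs) are stand-ins for illustration only, not the kernel symbols:

/* trace_macro_demo.c - userspace sketch of the trace(point) pattern above */
#include <stdio.h>
#include <stdbool.h>

static bool in_nmi(void) { return false; }	/* stand-in */

static void trace_irq_enable(unsigned long a0, unsigned long a1)
{
	printf("trace_irq_enable(%#lx, %#lx)\n", a0, a1);
}

static void trace_irq_enable_rcuidle(unsigned long a0, unsigned long a1)
{
	printf("trace_irq_enable_rcuidle(%#lx, %#lx)\n", a0, a1);
}

#ifdef ARCH_WANTS_NO_INSTR
#define trace(point)	trace_##point
#else
#define trace(point)	if (!in_nmi()) trace_##point##_rcuidle
#endif

int main(void)
{
	/*
	 * Mirrors the call sites in the patch: one spelling, two expansions.
	 * With ARCH_WANTS_NO_INSTR this is a plain trace_irq_enable() call;
	 * without it, it becomes "if (!in_nmi()) trace_irq_enable_rcuidle(...)".
	 */
	trace(irq_enable)(0x1234UL, 0x5678UL);
	return 0;
}

Note the dangling-if shape of the non-noinstr expansion: the macro swallows
the argument list, so each call site has to stay a single standalone
statement, exactly as in the patch.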

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/6] A few cpuidle vs rcu fixes
@ 2023-01-25  9:35         ` Peter Zijlstra
  0 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-25  9:35 UTC (permalink / raw)
  To: Mark Rutland
  Cc: juri.lelli, daniel.lezcano, wanpengli, kvm, rafael, pv-drivers,
	dave.hansen, virtualization, bsegall, amakhalov, will, vschneid,
	hpa, x86, mingo, mgorman, linux-trace-kernel, linux-pm,
	boqun.feng, rostedt, bp, vincent.guittot, boris.ostrovsky,
	dietmar.eggemann, jgross, seanjc, linux-kernel, tglx, mhiramat,
	pbonzini, bristot

On Tue, Jan 24, 2023 at 06:39:12PM +0000, Mark Rutland wrote:
> On Tue, Jan 24, 2023 at 05:30:29PM +0000, Mark Rutland wrote:
> > On Tue, Jan 24, 2023 at 04:34:23PM +0000, Mark Rutland wrote:
> > > Hi Peter,
> > > 
> > > On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> > > > 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.
> > > 
> > > Do you have a link to the splat somewhere?
> > > 
> > > I'm assuming that this is partially generic, and I'd like to make sure I test
> > > the right thing on arm64. I'll throw my usual lockdep options at the ftrace
> > > selftests...
> > 
> > Hmm... with the tip sched/core branch, with or without this series applied atop
> > I see a couple of splats which I don't see with v6.2-rc1 (which seems to be
> > entirely clean). I'm not seeing any other splats.
> > 
> > I can trigger those reliably with the 'toplevel-enable.tc' ftrace test:
> > 
> >   ./ftracetest test.d/event/toplevel-enable.tc
> > 
> > Splats below; I'll dig into this a bit more tomorrow.
> > 
> > [   65.729252] ------------[ cut here ]------------
> > [   65.730397] WARNING: CPU: 3 PID: 1162 at include/trace/events/preemptirq.h:55 trace_preempt_on+0x68/0x70
> 
> The line number here is a bit inscrutable, but a bisect led me down to commit
> 
>   408b961146be4c1a ("tracing: WARN on rcuidle")
> 
> ... and it appears this must be the RCUIDLE_COND() warning that commit adds, and that
> seems to be because trace_preempt_on() calls trace_preempt_enable_rcuidle():
> 
> | void trace_preempt_on(unsigned long a0, unsigned long a1)
> | {
> |         if (!in_nmi())
> |                 trace_preempt_enable_rcuidle(a0, a1);
> |         tracer_preempt_on(a0, a1);
> | }
> 
> It looks like that tracing is dependent upon CONFIG_TRACE_PREEMPT_TOGGLE, and I
> have that because I enabled CONFIG_PREEMPT_TRACER. I reckon the same should
> happen on x86 with CONFIG_PREEMPT_TRACER=y.
> 
> IIUC we'll need to clean up that trace_.*_rcuidle() usage too, but I'm not
> entirely sure how to do that.

tip/sched/core contains the following patch addressing this:

---
commit 9aedeaed6fc6fe8452b9b8225e95cc2b8631ff91
Author: Peter Zijlstra <peterz@infradead.org>
Date:   Thu Jan 12 20:43:49 2023 +0100

    tracing, hardirq: No moar _rcuidle() tracing
    
    Robot reported that trace_hardirqs_{on,off}() tickle the forbidden
    _rcuidle() tracepoint through local_irq_{en,dis}able().
    
    For 'sane' configs, these calls will only happen with RCU enabled and
    as such can use the regular tracepoint. This also means it's possible
    to trace them from NMI context again.
    
    Reported-by: kernel test robot <lkp@intel.com>
    Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
    Signed-off-by: Ingo Molnar <mingo@kernel.org>
    Link: https://lore.kernel.org/r/20230112195541.477416709@infradead.org

diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index 629f2854e12b..f992444a0b1f 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -19,6 +19,20 @@
 /* Per-cpu variable to prevent redundant calls when IRQs already off */
 static DEFINE_PER_CPU(int, tracing_irq_cpu);
 
+/*
+ * Use regular trace points on architectures that implement noinstr
+ * tooling: these calls will only happen with RCU enabled, which can
+ * use a regular tracepoint.
+ *
+ * On older architectures, use the rcuidle tracing methods (which
+ * aren't NMI-safe - so exclude NMI contexts):
+ */
+#ifdef CONFIG_ARCH_WANTS_NO_INSTR
+#define trace(point)	trace_##point
+#else
+#define trace(point)	if (!in_nmi()) trace_##point##_rcuidle
+#endif
+
 /*
  * Like trace_hardirqs_on() but without the lockdep invocation. This is
  * used in the low level entry code where the ordering vs. RCU is important
@@ -28,8 +42,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu);
 void trace_hardirqs_on_prepare(void)
 {
 	if (this_cpu_read(tracing_irq_cpu)) {
-		if (!in_nmi())
-			trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1);
+		trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
 		tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
 		this_cpu_write(tracing_irq_cpu, 0);
 	}
@@ -40,8 +53,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);
 void trace_hardirqs_on(void)
 {
 	if (this_cpu_read(tracing_irq_cpu)) {
-		if (!in_nmi())
-			trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+		trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
 		tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
 		this_cpu_write(tracing_irq_cpu, 0);
 	}
@@ -63,8 +75,7 @@ void trace_hardirqs_off_finish(void)
 	if (!this_cpu_read(tracing_irq_cpu)) {
 		this_cpu_write(tracing_irq_cpu, 1);
 		tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
-		if (!in_nmi())
-			trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1);
+		trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
 	}
 
 }
@@ -78,8 +89,7 @@ void trace_hardirqs_off(void)
 	if (!this_cpu_read(tracing_irq_cpu)) {
 		this_cpu_write(tracing_irq_cpu, 1);
 		tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
-		if (!in_nmi())
-			trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+		trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
 	}
 }
 EXPORT_SYMBOL(trace_hardirqs_off);

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/6] A few cpuidle vs rcu fixes
  2023-01-25  9:31     ` Peter Zijlstra
@ 2023-01-25  9:36       ` Mark Rutland
  -1 siblings, 0 replies; 53+ messages in thread
From: Mark Rutland @ 2023-01-25  9:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, will, boqun.feng, tglx, bp, dave.hansen, x86, hpa, seanjc,
	pbonzini, jgross, srivatsa, amakhalov, pv-drivers, rostedt,
	mhiramat, wanpengli, vkuznets, boris.ostrovsky, rafael,
	daniel.lezcano, juri.lelli, vincent.guittot, dietmar.eggemann,
	bsegall, mgorman, bristot, vschneid, linux-kernel, kvm,
	virtualization, linux-trace-kernel, linux-pm

On Wed, Jan 25, 2023 at 10:31:41AM +0100, Peter Zijlstra wrote:
> On Tue, Jan 24, 2023 at 04:34:23PM +0000, Mark Rutland wrote:
> > Hi Peter,
> > 
> > On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> > > 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.
> > 
> > Do you have a link to the splat somewhere?
> > 
> > I'm assuming that this is partially generic, and I'd like to make sure I test
> > the right thing on arm64. I'll throw my usual lockdep options at the ftrace
> > selftests...
> 
> 0-day triggered this by running tools/testing/selftests/ftrace/ftracetest,
> which is what I've been using to reproduce.
> 
> If that doesn't work for you I can try and dig out the 0day email to see
> if it has more details.

I had a go running those on arm64, but got different splats (as per my other
replies), so I just wanted to see the backtraces and/or config to check I
wasn't missing something due to CONFIG_* or arch differences.

If you have the email to hand, that'd be great, but don't worry too much about
it!

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/6] A few cpuidle vs rcu fixes
@ 2023-01-25  9:36       ` Mark Rutland
  0 siblings, 0 replies; 53+ messages in thread
From: Mark Rutland @ 2023-01-25  9:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: juri.lelli, daniel.lezcano, wanpengli, kvm, rafael, pv-drivers,
	dave.hansen, virtualization, bsegall, amakhalov, will, vschneid,
	hpa, x86, mingo, mgorman, linux-trace-kernel, linux-pm,
	boqun.feng, rostedt, bp, vincent.guittot, boris.ostrovsky,
	dietmar.eggemann, jgross, seanjc, linux-kernel, tglx, mhiramat,
	pbonzini, bristot

On Wed, Jan 25, 2023 at 10:31:41AM +0100, Peter Zijlstra wrote:
> On Tue, Jan 24, 2023 at 04:34:23PM +0000, Mark Rutland wrote:
> > Hi Peter,
> > 
> > On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> > > 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.
> > 
> > Do you have a link to the splat somewhere?
> > 
> > I'm assuming that this is partially generic, and I'd like to make sure I test
> > the right thing on arm64. I'll throw my usual lockdep options at the ftrace
> > selftests...
> 
> 0-day triggered this by running tools/testing/selftests/ftrace/ftracetest,
> which is what I've been using to reproduce.
> 
> If that doesn't work for you I can try and dig out the 0day email to see
> if it has more details.

I had a go running those on arm64, but got different splats (as per my other
replies), so I just wanted to see the backtraces and/or config to check I
wasn't missing something due to CONFIG_* or arch differences.

If you have the email to hand, that'd be great, but don't worry too much about
it!

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 3/6] ftrace/x86: Warn and ignore graph tracing when RCU is disabled
  2023-01-24 17:12           ` Mark Rutland
@ 2023-01-25  9:37             ` Peter Zijlstra
  -1 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-25  9:37 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Steven Rostedt, mingo, will, boqun.feng, tglx, bp, dave.hansen,
	x86, hpa, seanjc, pbonzini, jgross, srivatsa, amakhalov,
	pv-drivers, mhiramat, wanpengli, vkuznets, boris.ostrovsky,
	rafael, daniel.lezcano, juri.lelli, vincent.guittot,
	dietmar.eggemann, bsegall, mgorman, bristot, vschneid,
	linux-kernel, kvm, virtualization, linux-trace-kernel, linux-pm

On Tue, Jan 24, 2023 at 05:12:14PM +0000, Mark Rutland wrote:
> On Tue, Jan 24, 2023 at 03:44:35PM +0100, Peter Zijlstra wrote:
> > On Mon, Jan 23, 2023 at 05:07:53PM -0500, Steven Rostedt wrote:
> > 
> > > Actually, perhaps we can just add this, and all you need to do is create
> > > and set CONFIG_NO_RCU_TRACING (or some other name).
> > 
> > Elsewhere I've used CONFIG_ARCH_WANTS_NO_INSTR for this.
> 
> Yes please; if we use CONFIG_ARCH_WANTS_NO_INSTR then arm64 will get this "for
> free" once we add the missing checks (which I assume we need) in our ftrace_prepare_return().

Aye.

> > Anyway, I took it for a spin and it .... doesn't seem to do the job.
> > 
> > With my patch the first splat is
> > 
> >   "RCU not on for: cpuidle_poll_time+0x0/0x70"
> > 
> > While with yours I seem to get the endless:
> > 
> >   "WARNING: suspicious RCU usage"
> > 
> > thing. Let me see if I can figure out where it goes side-ways.
> 
> Hmmm... for WARN_ONCE() don't we need to wake RCU first also? I thought we
> needed that at least for the printk machinery?

Yeah, I'm currently running with a hacked-up printk that redirects
everything into early_printk(), but it still trips up a lot.

I was just about to go stick some RCU magic into WARN itself; this isn't
going to be the only site triggering this fail-cascade.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 3/6] ftrace/x86: Warn and ignore graph tracing when RCU is disabled
@ 2023-01-25  9:37             ` Peter Zijlstra
  0 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-25  9:37 UTC (permalink / raw)
  To: Mark Rutland
  Cc: juri.lelli, daniel.lezcano, wanpengli, kvm, rafael, pv-drivers,
	dave.hansen, virtualization, bsegall, amakhalov, will, vschneid,
	hpa, x86, mingo, mgorman, linux-trace-kernel, linux-pm,
	boqun.feng, Steven Rostedt, bp, vincent.guittot, boris.ostrovsky,
	dietmar.eggemann, jgross, seanjc, linux-kernel, tglx, mhiramat,
	pbonzini, bristot

On Tue, Jan 24, 2023 at 05:12:14PM +0000, Mark Rutland wrote:
> On Tue, Jan 24, 2023 at 03:44:35PM +0100, Peter Zijlstra wrote:
> > On Mon, Jan 23, 2023 at 05:07:53PM -0500, Steven Rostedt wrote:
> > 
> > > Actually, perhaps we can just add this, and all you need to do is create
> > > and set CONFIG_NO_RCU_TRACING (or some other name).
> > 
> > Elsewhere I've used CONFIG_ARCH_WANTS_NO_INSTR for this.
> 
> Yes please; if we use CONFIG_ARCH_WANTS_NO_INSTR then arm64 will get this "for
> free" once we add the missing checks (which I assume we need) in our ftrace_prepare_return().

Aye.

> > Anyway, I took it for a spin and it .... doesn't seem to do the job.
> > 
> > With my patch the first splat is
> > 
> >   "RCU not on for: cpuidle_poll_time+0x0/0x70"
> > 
> > While with yours I seem to get the endless:
> > 
> >   "WARNING: suspicious RCU usage"
> > 
> > thing. Let me see if I can figure out where it goes side-ways.
> 
> Hmmm... for WARN_ONCE() don't we need to wake RCU first also? I thought we
> needed that at least for the printk machinery?

Yeah, I'm currently running with a hacked-up printk that redirects
everything into early_printk(), but it still trips up a lot.

I was just about to go stick some RCU magic into WARN itself; this isn't
going to be the only site triggering this fail-cascade.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/6] A few cpuidle vs rcu fixes
  2023-01-25  9:35         ` Peter Zijlstra
@ 2023-01-25  9:40           ` Peter Zijlstra
  -1 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-25  9:40 UTC (permalink / raw)
  To: Mark Rutland
  Cc: mingo, will, boqun.feng, tglx, bp, dave.hansen, x86, hpa, seanjc,
	pbonzini, jgross, srivatsa, amakhalov, pv-drivers, rostedt,
	mhiramat, wanpengli, vkuznets, boris.ostrovsky, rafael,
	daniel.lezcano, juri.lelli, vincent.guittot, dietmar.eggemann,
	bsegall, mgorman, bristot, vschneid, linux-kernel, kvm,
	virtualization, linux-trace-kernel, linux-pm

On Wed, Jan 25, 2023 at 10:35:16AM +0100, Peter Zijlstra wrote:
> tip/sched/core contains the following patch addressing this:
> 
> ---
> commit 9aedeaed6fc6fe8452b9b8225e95cc2b8631ff91
> Author: Peter Zijlstra <peterz@infradead.org>
> Date:   Thu Jan 12 20:43:49 2023 +0100
> 
>     tracing, hardirq: No moar _rcuidle() tracing
>     
>     Robot reported that trace_hardirqs_{on,off}() tickle the forbidden
>     _rcuidle() tracepoint through local_irq_{en,dis}able().
>     
>     For 'sane' configs, these calls will only happen with RCU enabled and
>     as such can use the regular tracepoint. This also means it's possible
>     to trace them from NMI context again.
>     
>     Reported-by: kernel test robot <lkp@intel.com>
>     Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>     Signed-off-by: Ingo Molnar <mingo@kernel.org>
>     Link: https://lore.kernel.org/r/20230112195541.477416709@infradead.org
> 
> diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
> index 629f2854e12b..f992444a0b1f 100644
> --- a/kernel/trace/trace_preemptirq.c
> +++ b/kernel/trace/trace_preemptirq.c
> @@ -19,6 +19,20 @@
>  /* Per-cpu variable to prevent redundant calls when IRQs already off */
>  static DEFINE_PER_CPU(int, tracing_irq_cpu);
>  
> +/*
> + * Use regular trace points on architectures that implement noinstr
> + * tooling: these calls will only happen with RCU enabled, which can
> + * use a regular tracepoint.
> + *
> + * On older architectures, use the rcuidle tracing methods (which
> + * aren't NMI-safe - so exclude NMI contexts):
> + */
> +#ifdef CONFIG_ARCH_WANTS_NO_INSTR
> +#define trace(point)	trace_##point
> +#else
> +#define trace(point)	if (!in_nmi()) trace_##point##_rcuidle
> +#endif
> +
>  /*
>   * Like trace_hardirqs_on() but without the lockdep invocation. This is
>   * used in the low level entry code where the ordering vs. RCU is important

For some reason I missed the trace_preempt_{on,off} things, so that then
gets the below on top or so.

diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index f992444a0b1f..ea96b41c8838 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -100,15 +100,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off);
 
 void trace_preempt_on(unsigned long a0, unsigned long a1)
 {
-	if (!in_nmi())
-		trace_preempt_enable_rcuidle(a0, a1);
+	trace(preempt_enable)(a0, a1);
 	tracer_preempt_on(a0, a1);
 }
 
 void trace_preempt_off(unsigned long a0, unsigned long a1)
 {
-	if (!in_nmi())
-		trace_preempt_disable_rcuidle(a0, a1);
+	trace(preempt_disable)(a0, a1);
 	tracer_preempt_off(a0, a1);
 }
 #endif

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/6] A few cpuidle vs rcu fixes
@ 2023-01-25  9:40           ` Peter Zijlstra
  0 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-25  9:40 UTC (permalink / raw)
  To: Mark Rutland
  Cc: juri.lelli, daniel.lezcano, wanpengli, kvm, rafael, pv-drivers,
	dave.hansen, virtualization, bsegall, amakhalov, will, vschneid,
	hpa, x86, mingo, mgorman, linux-trace-kernel, linux-pm,
	boqun.feng, rostedt, bp, vincent.guittot, boris.ostrovsky,
	dietmar.eggemann, jgross, seanjc, linux-kernel, tglx, mhiramat,
	pbonzini, bristot

On Wed, Jan 25, 2023 at 10:35:16AM +0100, Peter Zijlstra wrote:
> tip/sched/core contains the following patch addressing this:
> 
> ---
> commit 9aedeaed6fc6fe8452b9b8225e95cc2b8631ff91
> Author: Peter Zijlstra <peterz@infradead.org>
> Date:   Thu Jan 12 20:43:49 2023 +0100
> 
>     tracing, hardirq: No moar _rcuidle() tracing
>     
>     Robot reported that trace_hardirqs_{on,off}() tickle the forbidden
>     _rcuidle() tracepoint through local_irq_{en,dis}able().
>     
>     For 'sane' configs, these calls will only happen with RCU enabled and
>     as such can use the regular tracepoint. This also means it's possible
>     to trace them from NMI context again.
>     
>     Reported-by: kernel test robot <lkp@intel.com>
>     Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>     Signed-off-by: Ingo Molnar <mingo@kernel.org>
>     Link: https://lore.kernel.org/r/20230112195541.477416709@infradead.org
> 
> diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
> index 629f2854e12b..f992444a0b1f 100644
> --- a/kernel/trace/trace_preemptirq.c
> +++ b/kernel/trace/trace_preemptirq.c
> @@ -19,6 +19,20 @@
>  /* Per-cpu variable to prevent redundant calls when IRQs already off */
>  static DEFINE_PER_CPU(int, tracing_irq_cpu);
>  
> +/*
> + * Use regular trace points on architectures that implement noinstr
> + * tooling: these calls will only happen with RCU enabled, which can
> + * use a regular tracepoint.
> + *
> + * On older architectures, use the rcuidle tracing methods (which
> + * aren't NMI-safe - so exclude NMI contexts):
> + */
> +#ifdef CONFIG_ARCH_WANTS_NO_INSTR
> +#define trace(point)	trace_##point
> +#else
> +#define trace(point)	if (!in_nmi()) trace_##point##_rcuidle
> +#endif
> +
>  /*
>   * Like trace_hardirqs_on() but without the lockdep invocation. This is
>   * used in the low level entry code where the ordering vs. RCU is important

For some reason I missed the trace_preempt_{on,off} things, so that then
gets the below on top or so.

diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index f992444a0b1f..ea96b41c8838 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -100,15 +100,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off);
 
 void trace_preempt_on(unsigned long a0, unsigned long a1)
 {
-	if (!in_nmi())
-		trace_preempt_enable_rcuidle(a0, a1);
+	trace(preempt_enable)(a0, a1);
 	tracer_preempt_on(a0, a1);
 }
 
 void trace_preempt_off(unsigned long a0, unsigned long a1)
 {
-	if (!in_nmi())
-		trace_preempt_disable_rcuidle(a0, a1);
+	trace(preempt_disable)(a0, a1);
 	tracer_preempt_off(a0, a1);
 }
 #endif

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/6] A few cpuidle vs rcu fixes
  2023-01-25  9:40           ` Peter Zijlstra
@ 2023-01-25 10:23             ` Mark Rutland
  -1 siblings, 0 replies; 53+ messages in thread
From: Mark Rutland @ 2023-01-25 10:23 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, will, boqun.feng, tglx, bp, dave.hansen, x86, hpa, seanjc,
	pbonzini, jgross, srivatsa, amakhalov, pv-drivers, rostedt,
	mhiramat, wanpengli, vkuznets, boris.ostrovsky, rafael,
	daniel.lezcano, juri.lelli, vincent.guittot, dietmar.eggemann,
	bsegall, mgorman, bristot, vschneid, linux-kernel, kvm,
	virtualization, linux-trace-kernel, linux-pm

On Wed, Jan 25, 2023 at 10:40:17AM +0100, Peter Zijlstra wrote:
> On Wed, Jan 25, 2023 at 10:35:16AM +0100, Peter Zijlstra wrote:
> > tip/sched/core contains the following patch addressing this:
> > 
> > ---
> > commit 9aedeaed6fc6fe8452b9b8225e95cc2b8631ff91
> > Author: Peter Zijlstra <peterz@infradead.org>
> > Date:   Thu Jan 12 20:43:49 2023 +0100
> > 
> >     tracing, hardirq: No moar _rcuidle() tracing
> >     
> >     Robot reported that trace_hardirqs_{on,off}() tickle the forbidden
> >     _rcuidle() tracepoint through local_irq_{en,dis}able().
> >     
> >     For 'sane' configs, these calls will only happen with RCU enabled and
> >     as such can use the regular tracepoint. This also means it's possible
> >     to trace them from NMI context again.
> >     
> >     Reported-by: kernel test robot <lkp@intel.com>
> >     Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> >     Signed-off-by: Ingo Molnar <mingo@kernel.org>
> >     Link: https://lore.kernel.org/r/20230112195541.477416709@infradead.org
> > 
> > diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
> > index 629f2854e12b..f992444a0b1f 100644
> > --- a/kernel/trace/trace_preemptirq.c
> > +++ b/kernel/trace/trace_preemptirq.c
> > @@ -19,6 +19,20 @@
> >  /* Per-cpu variable to prevent redundant calls when IRQs already off */
> >  static DEFINE_PER_CPU(int, tracing_irq_cpu);
> >  
> > +/*
> > + * Use regular trace points on architectures that implement noinstr
> > + * tooling: these calls will only happen with RCU enabled, which can
> > + * use a regular tracepoint.
> > + *
> > + * On older architectures, use the rcuidle tracing methods (which
> > + * aren't NMI-safe - so exclude NMI contexts):
> > + */
> > +#ifdef CONFIG_ARCH_WANTS_NO_INSTR
> > +#define trace(point)	trace_##point
> > +#else
> > +#define trace(point)	if (!in_nmi()) trace_##point##_rcuidle
> > +#endif
> > +
> >  /*
> >   * Like trace_hardirqs_on() but without the lockdep invocation. This is
> >   * used in the low level entry code where the ordering vs. RCU is important
> 
> For some reason I missed the trace_preempt_{on,off} things, so that then
> gets the below on top or so.
> 
> diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
> index f992444a0b1f..ea96b41c8838 100644
> --- a/kernel/trace/trace_preemptirq.c
> +++ b/kernel/trace/trace_preemptirq.c
> @@ -100,15 +100,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off);
>  
>  void trace_preempt_on(unsigned long a0, unsigned long a1)
>  {
> -	if (!in_nmi())
> -		trace_preempt_enable_rcuidle(a0, a1);
> +	trace(preempt_enable)(a0, a1);
>  	tracer_preempt_on(a0, a1);
>  }
>  
>  void trace_preempt_off(unsigned long a0, unsigned long a1)
>  {
> -	if (!in_nmi())
> -		trace_preempt_disable_rcuidle(a0, a1);
> +	trace(preempt_disable)(a0, a1);
>  	tracer_preempt_off(a0, a1);
>  }
>  #endif

I've tested this fixlet atop this series (itself atop tip/sched/core) with a
full-fat ftrace config and the ftrace selftests, and that all runs cleanly.

FWIW, if you spin this as a patch:

Tested-by: Mark Rutland <mark.rutland@arm.com>

Without the fixlet I get splats on both arm64 and x86_64, e.g.

On arm64:

| ------------[ cut here ]------------
| WARNING: CPU: 1 PID: 1162 at include/trace/events/preemptirq.h:55 trace_preempt_on+0x68/0x70
| Modules linked in:
| CPU: 1 PID: 1162 Comm: ftracetest Not tainted 6.2.0-rc1-00100-g1066815869f5 #1
| Hardware name: linux,dummy-virt (DT)
| pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
| pc : trace_preempt_on+0x68/0x70
| lr : preempt_count_sub+0xb4/0xf0
| sp : ffff80000e04ba70
| x29: ffff80000e04ba70 x28: ffff80000ade09e8 x27: ffff800009d0c960
| x26: ffff000007c97e10 x25: ffff000007c97e00 x24: ffff80000adf1410
| x23: ffff80000e04bb80 x22: ffff80000a9aabb0 x21: ffff8000080a5cf4
| x20: ffff8000080a5cf4 x19: 0000000000000001 x18: 0000000000000000
| x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
| x14: 0000000000000028 x13: 0000000000004320 x12: 0000000000000361
| x11: 0000000000000361 x10: 0000000000040361 x9 : ffff8000080a5cf4
| x8 : ffff80000ae42a18 x7 : 0000000000000000 x6 : 0000000000000001
| x5 : ffff80000a8f14e8 x4 : 0000000000000001 x3 : 0000000000000000
| x2 : 0000000000000007 x1 : ffff8000080a5cf4 x0 : ffff8000080a5cf4
| Call trace:
|  trace_preempt_on+0x68/0x70
|  preempt_count_sub+0xb4/0xf0
|  percpu_up_read.constprop.0+0xc4/0x180
|  cpus_read_unlock+0x18/0x24
|  static_key_enable+0x2c/0x40
|  tracepoint_add_func+0x330/0x3dc
|  tracepoint_probe_register+0x74/0xc0
|  trace_event_reg+0x8c/0xa0
|  __ftrace_event_enable_disable+0x174/0x4d0
|  __ftrace_set_clr_event_nolock+0xe0/0x150
|  ftrace_set_clr_event+0x90/0x13c
|  ftrace_event_write+0xd4/0x120
|  vfs_write+0xcc/0x2f0
|  ksys_write+0x78/0x110
|  __arm64_sys_write+0x24/0x30
|  invoke_syscall+0x50/0x120
|  el0_svc_common.constprop.0+0x68/0x124
|  do_el0_svc+0x40/0xbc
|  el0_svc+0x48/0xc0
|  el0t_64_sync_handler+0xf4/0x120
|  el0t_64_sync+0x190/0x194
| irq event stamp: 69662
| hardirqs last  enabled at (69661): [<ffff8000092d63f4>] _raw_spin_unlock_irqrestore+0x80/0xa0
| hardirqs last disabled at (69662): [<ffff8000092c4d64>] el1_dbg+0x24/0x90
| softirqs last  enabled at (69564): [<ffff800008010b08>] __do_softirq+0x448/0x5bc
| softirqs last disabled at (69555): [<ffff800008017288>] ____do_softirq+0x18/0x24
| ---[ end trace 0000000000000000 ]---
| ------------[ cut here ]------------
| WARNING: CPU: 1 PID: 1162 at include/trace/events/preemptirq.h:51 trace_preempt_off+0x68/0xb0
| Modules linked in:
| CPU: 1 PID: 1162 Comm: ftracetest Tainted: G        W          6.2.0-rc1-00100-g1066815869f5 #1
| Hardware name: linux,dummy-virt (DT)
| pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
| pc : trace_preempt_off+0x68/0xb0
| lr : preempt_count_add+0xa0/0xc0
| sp : ffff80000e04ba80
| x29: ffff80000e04ba80 x28: ffff80000ade09e8 x27: ffff800009d0c970
| x26: ffff000007c97e90 x25: ffff000007c97e80 x24: ffff80000adf13c8
| x23: ffff80000e04bb80 x22: ffff80000a9aabb0 x21: ffff80000b8c8d18
| x20: ffff8000080a5c68 x19: ffff8000080a5c68 x18: 0000000000000000
| x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
| x14: 0000000000000028 x13: 0000000000004320 x12: 0000000000000361
| x11: 0000000000000361 x10: 0000000000040361 x9 : ffff8000080a5c68
| x8 : ffff80000ae42a18 x7 : 0000000000000000 x6 : 0000000000000003
| x5 : 000000003a9e6115 x4 : 0000000000000001 x3 : 0000000000000000
| x2 : 0000000000000007 x1 : ffff8000080a5c68 x0 : ffff8000080a5c68
| Call trace:
|  trace_preempt_off+0x68/0xb0
|  preempt_count_add+0xa0/0xc0
|  percpu_up_read.constprop.0+0x38/0x180
|  cpus_read_unlock+0x18/0x24
|  static_key_enable+0x2c/0x40
|  tracepoint_add_func+0x330/0x3dc
|  tracepoint_probe_register+0x74/0xc0
|  trace_event_reg+0x8c/0xa0
|  __ftrace_event_enable_disable+0x174/0x4d0
|  __ftrace_set_clr_event_nolock+0xe0/0x150
|  ftrace_set_clr_event+0x90/0x13c
|  ftrace_event_write+0xd4/0x120
|  vfs_write+0xcc/0x2f0
|  ksys_write+0x78/0x110
|  __arm64_sys_write+0x24/0x30
|  invoke_syscall+0x50/0x120
|  el0_svc_common.constprop.0+0x68/0x124
|  do_el0_svc+0x40/0xbc
|  el0_svc+0x48/0xc0
|  el0t_64_sync_handler+0xf4/0x120
|  el0t_64_sync+0x190/0x194
| irq event stamp: 69686
| hardirqs last  enabled at (69685): [<ffff8000092c4db8>] el1_dbg+0x78/0x90
| hardirqs last disabled at (69686): [<ffff8000092c4d64>] el1_dbg+0x24/0x90
| softirqs last  enabled at (69670): [<ffff800008010b08>] __do_softirq+0x448/0x5bc
| softirqs last disabled at (69665): [<ffff800008017288>] ____do_softirq+0x18/0x24
| ---[ end trace 0000000000000000 ]---

On x86_64:

| ------------[ cut here ]------------
| WARNING: CPU: 0 PID: 1083 at include/trace/events/preemptirq.h:55 trace_preempt_on+0x31/0x40
| Modules linked in:
| CPU: 0 PID: 1083 Comm: ftracetest Not tainted 6.2.0-rc1-00100-g1066815869f5 #3
| Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
| RIP: 0010:trace_preempt_on+0x31/0x40
| Code: 1d d6 45 4c a9 00 00 f0 00 74 05 e9 29 11 00 00 cc 90 e9 22 11 00 00 65 8b 05 07 d6 45 4c 89 c0 48 0f a3 05 c1 af b1 01 73 e1 <0f> 0b eb dd 66 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 0
| RSP: 0018:ffffb374c0ce7ba8 EFLAGS: 00010247
| RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000
| RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffffffffb49e9c1d
| RBP: 0000000000000000 R08: 000000004ea10906 R09: 000000005e02388f
| R10: 00000000f5e02388 R11: 00000000d269a5f5 R12: ffff98620005c780
| R13: ffffffffb3a363d0 R14: 0000000000000a37 R15: 0000000000000a38
| FS:  00007fc812d836a0(0000) GS:ffff98623bc00000(0000) knlGS:0000000000000000
| CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
| CR2: 0000000000642e58 CR3: 00000001039c6005 CR4: 0000000000370ef0
| DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
| DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
| Call Trace:
|  <TASK>
|  preempt_count_sub+0xa3/0xe0
|  _raw_spin_unlock+0x2d/0x50
|  ? trace_preempt_on+0x17/0x40
|  __text_poke+0x326/0x4c0
|  ? trace_preempt_on+0x17/0x40
|  text_poke_bp_batch+0x7f/0x360
|  text_poke_finish+0x1f/0x30
|  arch_jump_label_transform_apply+0x1c/0x30
|  static_key_enable_cpuslocked+0x65/0xa0
|  static_key_enable+0x1a/0x20
|  tracepoint_add_func+0x32c/0x430
|  ? __pfx_trace_event_raw_event_preemptirq_template+0x10/0x10
|  tracepoint_probe_register+0x78/0xb0
|  ? __pfx_trace_event_raw_event_preemptirq_template+0x10/0x10
|  __ftrace_event_enable_disable+0x180/0x250
|  __ftrace_set_clr_event_nolock+0xe3/0x130
|  ftrace_set_clr_event+0x74/0xf0
|  ftrace_event_write+0xdd/0x110
|  vfs_write+0xee/0x510
|  ksys_write+0x75/0x100
|  do_syscall_64+0x3e/0x90
|  entry_SYSCALL_64_after_hwframe+0x72/0xdc
| RIP: 0033:0x7fc812d09103
| Code: 8b 7c 24 08 89 c5 e8 c5 ff ff ff 89 ef 89 44 24 08 e8 81 bc 02 00 8b 44 24 08 48 83 c4 10 5d c3 48 63 ff b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 76 10 48 8b 15 4e fd 05 00 f7 d8 64 3
| RSP: 002b:00007ffcb12972f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
| RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fc812d09103
| RDX: 0000000000000004 RSI: 0000000000645050 RDI: 0000000000000001
| RBP: 0000000000645050 R08: fefefefefefefeff R09: ffffffff00000000
| R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000004
| R13: 00007fc812d83690 R14: 0000000000000001 R15: 0000000000000000
|  </TASK>
| irq event stamp: 57597
| hardirqs last  enabled at (57609): [<ffffffffb3b1384e>] __up_console_sem+0x5e/0x80
| hardirqs last disabled at (57620): [<ffffffffb3b13833>] __up_console_sem+0x43/0x80
| softirqs last  enabled at (57556): [<ffffffffb49eae94>] __do_softirq+0x354/0x4d7
| softirqs last disabled at (57543): [<ffffffffb3a92ad7>] irq_exit_rcu+0xc7/0x140
| ---[ end trace 0000000000000000 ]---
| ------------[ cut here ]------------
| WARNING: CPU: 0 PID: 1083 at include/trace/events/preemptirq.h:51 trace_preempt_off+0x31/0x40
| Modules linked in:
| CPU: 0 PID: 1083 Comm: ftracetest Tainted: G        W          6.2.0-rc1-00100-g1066815869f5 #3
| Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
| RIP: 0010:trace_preempt_off+0x31/0x40
| Code: cd d5 45 4c a9 00 00 f0 00 74 05 e9 f9 11 00 00 cc 90 e9 f2 11 00 00 65 8b 05 b7 d5 45 4c 89 c0 48 0f a3 05 71 af b1 01 73 e1 <0f> 0b eb dd 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 0
| RSP: 0018:ffffb374c0ce7c18 EFLAGS: 00010247
| RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000001
| RDX: 0000000000000000 RSI: ffffffffb3b74639 RDI: ffffffffb3b74639
| RBP: ffffffffb56faa10 R08: ffffffffb56faa10 R09: 000000005e02388f
| R10: 00000000f5e02388 R11: 00000000d269a5f5 R12: ffffffffb3a36d20
| R13: 0000000000000000 R14: 0000000000000000 R15: ffff9862038a0780
| FS:  00007fc812d836a0(0000) GS:ffff98623bc00000(0000) knlGS:0000000000000000
| CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
| CR2: 0000000000642e58 CR3: 00000001039c6005 CR4: 0000000000370ef0
| DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
| DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
| Call Trace:
|  <TASK>
|  on_each_cpu_cond_mask+0x29/0x80
|  text_poke_bp_batch+0xe7/0x360
|  text_poke_finish+0x1f/0x30
|  arch_jump_label_transform_apply+0x1c/0x30
|  static_key_enable_cpuslocked+0x65/0xa0
|  static_key_enable+0x1a/0x20
|  tracepoint_add_func+0x32c/0x430
|  ? __pfx_trace_event_raw_event_preemptirq_template+0x10/0x10
|  tracepoint_probe_register+0x78/0xb0
|  ? __pfx_trace_event_raw_event_preemptirq_template+0x10/0x10
|  __ftrace_event_enable_disable+0x180/0x250
|  __ftrace_set_clr_event_nolock+0xe3/0x130
|  ftrace_set_clr_event+0x74/0xf0
|  ftrace_event_write+0xdd/0x110
|  vfs_write+0xee/0x510
|  ksys_write+0x75/0x100
|  do_syscall_64+0x3e/0x90
|  entry_SYSCALL_64_after_hwframe+0x72/0xdc
| RIP: 0033:0x7fc812d09103
| Code: 8b 7c 24 08 89 c5 e8 c5 ff ff ff 89 ef 89 44 24 08 e8 81 bc 02 00 8b 44 24 08 48 83 c4 10 5d c3 48 63 ff b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 76 10 48 8b 15 4e fd 05 00 f7 d8 64 3
| RSP: 002b:00007ffcb12972f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
| RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fc812d09103
| RDX: 0000000000000004 RSI: 0000000000645050 RDI: 0000000000000001
| RBP: 0000000000645050 R08: fefefefefefefeff R09: ffffffff00000000
| R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000004
| R13: 00007fc812d83690 R14: 0000000000000001 R15: 0000000000000000
|  </TASK>
| irq event stamp: 58293
| hardirqs last  enabled at (58303): [<ffffffffb3b1384e>] __up_console_sem+0x5e/0x80
| hardirqs last disabled at (58314): [<ffffffffb3b13833>] __up_console_sem+0x43/0x80
| softirqs last  enabled at (57820): [<ffffffffb49eae94>] __do_softirq+0x354/0x4d7
| softirqs last disabled at (57811): [<ffffffffb3a92ad7>] irq_exit_rcu+0xc7/0x140
| ---[ end trace 0000000000000000 ]---

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/6] A few cpuidle vs rcu fixes
@ 2023-01-25 10:23             ` Mark Rutland
  0 siblings, 0 replies; 53+ messages in thread
From: Mark Rutland @ 2023-01-25 10:23 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: juri.lelli, daniel.lezcano, wanpengli, kvm, rafael, pv-drivers,
	dave.hansen, virtualization, bsegall, amakhalov, will, vschneid,
	hpa, x86, mingo, mgorman, linux-trace-kernel, linux-pm,
	boqun.feng, rostedt, bp, vincent.guittot, boris.ostrovsky,
	dietmar.eggemann, jgross, seanjc, linux-kernel, tglx, mhiramat,
	pbonzini, bristot

On Wed, Jan 25, 2023 at 10:40:17AM +0100, Peter Zijlstra wrote:
> On Wed, Jan 25, 2023 at 10:35:16AM +0100, Peter Zijlstra wrote:
> > tip/sched/core contains the following patch addressing this:
> > 
> > ---
> > commit 9aedeaed6fc6fe8452b9b8225e95cc2b8631ff91
> > Author: Peter Zijlstra <peterz@infradead.org>
> > Date:   Thu Jan 12 20:43:49 2023 +0100
> > 
> >     tracing, hardirq: No moar _rcuidle() tracing
> >     
> >     Robot reported that trace_hardirqs_{on,off}() tickle the forbidden
> >     _rcuidle() tracepoint through local_irq_{en,dis}able().
> >     
> >     For 'sane' configs, these calls will only happen with RCU enabled and
> >     as such can use the regular tracepoint. This also means it's possible
> >     to trace them from NMI context again.
> >     
> >     Reported-by: kernel test robot <lkp@intel.com>
> >     Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> >     Signed-off-by: Ingo Molnar <mingo@kernel.org>
> >     Link: https://lore.kernel.org/r/20230112195541.477416709@infradead.org
> > 
> > diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
> > index 629f2854e12b..f992444a0b1f 100644
> > --- a/kernel/trace/trace_preemptirq.c
> > +++ b/kernel/trace/trace_preemptirq.c
> > @@ -19,6 +19,20 @@
> >  /* Per-cpu variable to prevent redundant calls when IRQs already off */
> >  static DEFINE_PER_CPU(int, tracing_irq_cpu);
> >  
> > +/*
> > + * Use regular trace points on architectures that implement noinstr
> > + * tooling: these calls will only happen with RCU enabled, which can
> > + * use a regular tracepoint.
> > + *
> > + * On older architectures, use the rcuidle tracing methods (which
> > + * aren't NMI-safe - so exclude NMI contexts):
> > + */
> > +#ifdef CONFIG_ARCH_WANTS_NO_INSTR
> > +#define trace(point)	trace_##point
> > +#else
> > +#define trace(point)	if (!in_nmi()) trace_##point##_rcuidle
> > +#endif
> > +
> >  /*
> >   * Like trace_hardirqs_on() but without the lockdep invocation. This is
> >   * used in the low level entry code where the ordering vs. RCU is important
> 
> For some reason I missed the trace_preempt_{on,off} things, so that then
> gets the below on top or so.
> 
> diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
> index f992444a0b1f..ea96b41c8838 100644
> --- a/kernel/trace/trace_preemptirq.c
> +++ b/kernel/trace/trace_preemptirq.c
> @@ -100,15 +100,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off);
>  
>  void trace_preempt_on(unsigned long a0, unsigned long a1)
>  {
> -	if (!in_nmi())
> -		trace_preempt_enable_rcuidle(a0, a1);
> +	trace(preempt_enable)(a0, a1);
>  	tracer_preempt_on(a0, a1);
>  }
>  
>  void trace_preempt_off(unsigned long a0, unsigned long a1)
>  {
> -	if (!in_nmi())
> -		trace_preempt_disable_rcuidle(a0, a1);
> +	trace(preempt_disable)(a0, a1);
>  	tracer_preempt_off(a0, a1);
>  }
>  #endif

I've tested this fixlet atop this series (itself atop tip/sched/core) with a
full-fat ftrace config and the ftrace selftests, and that all runs cleanly.

FWIW, if you spin this as a patch:

Tested-by: Mark Rutland <mark.rutland@arm.com>

Without the fixlet I get splats on both arm64 and x86_64, e.g.

On arm64:

| ------------[ cut here ]------------
| WARNING: CPU: 1 PID: 1162 at include/trace/events/preemptirq.h:55 trace_preempt_on+0x68/0x70
| Modules linked in:
| CPU: 1 PID: 1162 Comm: ftracetest Not tainted 6.2.0-rc1-00100-g1066815869f5 #1
| Hardware name: linux,dummy-virt (DT)
| pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
| pc : trace_preempt_on+0x68/0x70
| lr : preempt_count_sub+0xb4/0xf0
| sp : ffff80000e04ba70
| x29: ffff80000e04ba70 x28: ffff80000ade09e8 x27: ffff800009d0c960
| x26: ffff000007c97e10 x25: ffff000007c97e00 x24: ffff80000adf1410
| x23: ffff80000e04bb80 x22: ffff80000a9aabb0 x21: ffff8000080a5cf4
| x20: ffff8000080a5cf4 x19: 0000000000000001 x18: 0000000000000000
| x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
| x14: 0000000000000028 x13: 0000000000004320 x12: 0000000000000361
| x11: 0000000000000361 x10: 0000000000040361 x9 : ffff8000080a5cf4
| x8 : ffff80000ae42a18 x7 : 0000000000000000 x6 : 0000000000000001
| x5 : ffff80000a8f14e8 x4 : 0000000000000001 x3 : 0000000000000000
| x2 : 0000000000000007 x1 : ffff8000080a5cf4 x0 : ffff8000080a5cf4
| Call trace:
|  trace_preempt_on+0x68/0x70
|  preempt_count_sub+0xb4/0xf0
|  percpu_up_read.constprop.0+0xc4/0x180
|  cpus_read_unlock+0x18/0x24
|  static_key_enable+0x2c/0x40
|  tracepoint_add_func+0x330/0x3dc
|  tracepoint_probe_register+0x74/0xc0
|  trace_event_reg+0x8c/0xa0
|  __ftrace_event_enable_disable+0x174/0x4d0
|  __ftrace_set_clr_event_nolock+0xe0/0x150
|  ftrace_set_clr_event+0x90/0x13c
|  ftrace_event_write+0xd4/0x120
|  vfs_write+0xcc/0x2f0
|  ksys_write+0x78/0x110
|  __arm64_sys_write+0x24/0x30
|  invoke_syscall+0x50/0x120
|  el0_svc_common.constprop.0+0x68/0x124
|  do_el0_svc+0x40/0xbc
|  el0_svc+0x48/0xc0
|  el0t_64_sync_handler+0xf4/0x120
|  el0t_64_sync+0x190/0x194
| irq event stamp: 69662
| hardirqs last  enabled at (69661): [<ffff8000092d63f4>] _raw_spin_unlock_irqrestore+0x80/0xa0
| hardirqs last disabled at (69662): [<ffff8000092c4d64>] el1_dbg+0x24/0x90
| softirqs last  enabled at (69564): [<ffff800008010b08>] __do_softirq+0x448/0x5bc
| softirqs last disabled at (69555): [<ffff800008017288>] ____do_softirq+0x18/0x24
| ---[ end trace 0000000000000000 ]---
| ------------[ cut here ]------------
| WARNING: CPU: 1 PID: 1162 at include/trace/events/preemptirq.h:51 trace_preempt_off+0x68/0xb0
| Modules linked in:
| CPU: 1 PID: 1162 Comm: ftracetest Tainted: G        W          6.2.0-rc1-00100-g1066815869f5 #1
| Hardware name: linux,dummy-virt (DT)
| pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
| pc : trace_preempt_off+0x68/0xb0
| lr : preempt_count_add+0xa0/0xc0
| sp : ffff80000e04ba80
| x29: ffff80000e04ba80 x28: ffff80000ade09e8 x27: ffff800009d0c970
| x26: ffff000007c97e90 x25: ffff000007c97e80 x24: ffff80000adf13c8
| x23: ffff80000e04bb80 x22: ffff80000a9aabb0 x21: ffff80000b8c8d18
| x20: ffff8000080a5c68 x19: ffff8000080a5c68 x18: 0000000000000000
| x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
| x14: 0000000000000028 x13: 0000000000004320 x12: 0000000000000361
| x11: 0000000000000361 x10: 0000000000040361 x9 : ffff8000080a5c68
| x8 : ffff80000ae42a18 x7 : 0000000000000000 x6 : 0000000000000003
| x5 : 000000003a9e6115 x4 : 0000000000000001 x3 : 0000000000000000
| x2 : 0000000000000007 x1 : ffff8000080a5c68 x0 : ffff8000080a5c68
| Call trace:
|  trace_preempt_off+0x68/0xb0
|  preempt_count_add+0xa0/0xc0
|  percpu_up_read.constprop.0+0x38/0x180
|  cpus_read_unlock+0x18/0x24
|  static_key_enable+0x2c/0x40
|  tracepoint_add_func+0x330/0x3dc
|  tracepoint_probe_register+0x74/0xc0
|  trace_event_reg+0x8c/0xa0
|  __ftrace_event_enable_disable+0x174/0x4d0
|  __ftrace_set_clr_event_nolock+0xe0/0x150
|  ftrace_set_clr_event+0x90/0x13c
|  ftrace_event_write+0xd4/0x120
|  vfs_write+0xcc/0x2f0
|  ksys_write+0x78/0x110
|  __arm64_sys_write+0x24/0x30
|  invoke_syscall+0x50/0x120
|  el0_svc_common.constprop.0+0x68/0x124
|  do_el0_svc+0x40/0xbc
|  el0_svc+0x48/0xc0
|  el0t_64_sync_handler+0xf4/0x120
|  el0t_64_sync+0x190/0x194
| irq event stamp: 69686
| hardirqs last  enabled at (69685): [<ffff8000092c4db8>] el1_dbg+0x78/0x90
| hardirqs last disabled at (69686): [<ffff8000092c4d64>] el1_dbg+0x24/0x90
| softirqs last  enabled at (69670): [<ffff800008010b08>] __do_softirq+0x448/0x5bc
| softirqs last disabled at (69665): [<ffff800008017288>] ____do_softirq+0x18/0x24
| ---[ end trace 0000000000000000 ]---

On x86_64:

| ------------[ cut here ]------------
| WARNING: CPU: 0 PID: 1083 at include/trace/events/preemptirq.h:55 trace_preempt_on+0x31/0x40
| Modules linked in:
| CPU: 0 PID: 1083 Comm: ftracetest Not tainted 6.2.0-rc1-00100-g1066815869f5 #3
| Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
| RIP: 0010:trace_preempt_on+0x31/0x40
| Code: 1d d6 45 4c a9 00 00 f0 00 74 05 e9 29 11 00 00 cc 90 e9 22 11 00 00 65 8b 05 07 d6 45 4c 89 c0 48 0f a3 05 c1 af b1 01 73 e1 <0f> 0b eb dd 66 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 0
| RSP: 0018:ffffb374c0ce7ba8 EFLAGS: 00010247
| RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000
| RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffffffffb49e9c1d
| RBP: 0000000000000000 R08: 000000004ea10906 R09: 000000005e02388f
| R10: 00000000f5e02388 R11: 00000000d269a5f5 R12: ffff98620005c780
| R13: ffffffffb3a363d0 R14: 0000000000000a37 R15: 0000000000000a38
| FS:  00007fc812d836a0(0000) GS:ffff98623bc00000(0000) knlGS:0000000000000000
| CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
| CR2: 0000000000642e58 CR3: 00000001039c6005 CR4: 0000000000370ef0
| DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
| DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
| Call Trace:
|  <TASK>
|  preempt_count_sub+0xa3/0xe0
|  _raw_spin_unlock+0x2d/0x50
|  ? trace_preempt_on+0x17/0x40
|  __text_poke+0x326/0x4c0
|  ? trace_preempt_on+0x17/0x40
|  text_poke_bp_batch+0x7f/0x360
|  text_poke_finish+0x1f/0x30
|  arch_jump_label_transform_apply+0x1c/0x30
|  static_key_enable_cpuslocked+0x65/0xa0
|  static_key_enable+0x1a/0x20
|  tracepoint_add_func+0x32c/0x430
|  ? __pfx_trace_event_raw_event_preemptirq_template+0x10/0x10
|  tracepoint_probe_register+0x78/0xb0
|  ? __pfx_trace_event_raw_event_preemptirq_template+0x10/0x10
|  __ftrace_event_enable_disable+0x180/0x250
|  __ftrace_set_clr_event_nolock+0xe3/0x130
|  ftrace_set_clr_event+0x74/0xf0
|  ftrace_event_write+0xdd/0x110
|  vfs_write+0xee/0x510
|  ksys_write+0x75/0x100
|  do_syscall_64+0x3e/0x90
|  entry_SYSCALL_64_after_hwframe+0x72/0xdc
| RIP: 0033:0x7fc812d09103
| Code: 8b 7c 24 08 89 c5 e8 c5 ff ff ff 89 ef 89 44 24 08 e8 81 bc 02 00 8b 44 24 08 48 83 c4 10 5d c3 48 63 ff b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 76 10 48 8b 15 4e fd 05 00 f7 d8 64 3
| RSP: 002b:00007ffcb12972f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
| RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fc812d09103
| RDX: 0000000000000004 RSI: 0000000000645050 RDI: 0000000000000001
| RBP: 0000000000645050 R08: fefefefefefefeff R09: ffffffff00000000
| R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000004
| R13: 00007fc812d83690 R14: 0000000000000001 R15: 0000000000000000
|  </TASK>
| irq event stamp: 57597
| hardirqs last  enabled at (57609): [<ffffffffb3b1384e>] __up_console_sem+0x5e/0x80
| hardirqs last disabled at (57620): [<ffffffffb3b13833>] __up_console_sem+0x43/0x80
| softirqs last  enabled at (57556): [<ffffffffb49eae94>] __do_softirq+0x354/0x4d7
| softirqs last disabled at (57543): [<ffffffffb3a92ad7>] irq_exit_rcu+0xc7/0x140
| ---[ end trace 0000000000000000 ]---
| ------------[ cut here ]------------
| WARNING: CPU: 0 PID: 1083 at include/trace/events/preemptirq.h:51 trace_preempt_off+0x31/0x40
| Modules linked in:
| CPU: 0 PID: 1083 Comm: ftracetest Tainted: G        W          6.2.0-rc1-00100-g1066815869f5 #3
| Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
| RIP: 0010:trace_preempt_off+0x31/0x40
| Code: cd d5 45 4c a9 00 00 f0 00 74 05 e9 f9 11 00 00 cc 90 e9 f2 11 00 00 65 8b 05 b7 d5 45 4c 89 c0 48 0f a3 05 71 af b1 01 73 e1 <0f> 0b eb dd 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 0
| RSP: 0018:ffffb374c0ce7c18 EFLAGS: 00010247
| RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000001
| RDX: 0000000000000000 RSI: ffffffffb3b74639 RDI: ffffffffb3b74639
| RBP: ffffffffb56faa10 R08: ffffffffb56faa10 R09: 000000005e02388f
| R10: 00000000f5e02388 R11: 00000000d269a5f5 R12: ffffffffb3a36d20
| R13: 0000000000000000 R14: 0000000000000000 R15: ffff9862038a0780
| FS:  00007fc812d836a0(0000) GS:ffff98623bc00000(0000) knlGS:0000000000000000
| CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
| CR2: 0000000000642e58 CR3: 00000001039c6005 CR4: 0000000000370ef0
| DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
| DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
| Call Trace:
|  <TASK>
|  on_each_cpu_cond_mask+0x29/0x80
|  text_poke_bp_batch+0xe7/0x360
|  text_poke_finish+0x1f/0x30
|  arch_jump_label_transform_apply+0x1c/0x30
|  static_key_enable_cpuslocked+0x65/0xa0
|  static_key_enable+0x1a/0x20
|  tracepoint_add_func+0x32c/0x430
|  ? __pfx_trace_event_raw_event_preemptirq_template+0x10/0x10
|  tracepoint_probe_register+0x78/0xb0
|  ? __pfx_trace_event_raw_event_preemptirq_template+0x10/0x10
|  __ftrace_event_enable_disable+0x180/0x250
|  __ftrace_set_clr_event_nolock+0xe3/0x130
|  ftrace_set_clr_event+0x74/0xf0
|  ftrace_event_write+0xdd/0x110
|  vfs_write+0xee/0x510
|  ksys_write+0x75/0x100
|  do_syscall_64+0x3e/0x90
|  entry_SYSCALL_64_after_hwframe+0x72/0xdc
| RIP: 0033:0x7fc812d09103
| Code: 8b 7c 24 08 89 c5 e8 c5 ff ff ff 89 ef 89 44 24 08 e8 81 bc 02 00 8b 44 24 08 48 83 c4 10 5d c3 48 63 ff b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 76 10 48 8b 15 4e fd 05 00 f7 d8 64 3
| RSP: 002b:00007ffcb12972f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
| RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fc812d09103
| RDX: 0000000000000004 RSI: 0000000000645050 RDI: 0000000000000001
| RBP: 0000000000645050 R08: fefefefefefefeff R09: ffffffff00000000
| R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000004
| R13: 00007fc812d83690 R14: 0000000000000001 R15: 0000000000000000
|  </TASK>
| irq event stamp: 58293
| hardirqs last  enabled at (58303): [<ffffffffb3b1384e>] __up_console_sem+0x5e/0x80
| hardirqs last disabled at (58314): [<ffffffffb3b13833>] __up_console_sem+0x43/0x80
| softirqs last  enabled at (57820): [<ffffffffb49eae94>] __do_softirq+0x354/0x4d7
| softirqs last disabled at (57811): [<ffffffffb3a92ad7>] irq_exit_rcu+0xc7/0x140
| ---[ end trace 0000000000000000 ]---

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 3/6] ftrace/x86: Warn and ignore graph tracing when RCU is disabled
  2023-01-24 17:12           ` Mark Rutland
@ 2023-01-25 10:47             ` Peter Zijlstra
  -1 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-25 10:47 UTC (permalink / raw)
  To: Mark Rutland
  Cc: juri.lelli, daniel.lezcano, wanpengli, kvm, rafael, pv-drivers,
	Frederic Weisbecker, dave.hansen, virtualization, bsegall,
	amakhalov, will, vschneid, hpa, x86, mingo, mgorman,
	linux-trace-kernel, Paul McKenney, linux-pm, boqun.feng,
	Steven Rostedt, bp, vincent.guittot, boris.ostrovsky,
	dietmar.eggemann, jgross, seanjc, linux-kernel, tglx, mhiramat,
	pbonzini, bristot

On Tue, Jan 24, 2023 at 05:12:14PM +0000, Mark Rutland wrote:
> On Tue, Jan 24, 2023 at 03:44:35PM +0100, Peter Zijlstra wrote:
> > On Mon, Jan 23, 2023 at 05:07:53PM -0500, Steven Rostedt wrote:
> > 
> > > Actually, perhaps we can just add this, and all you need to do is create
> > > and set CONFIG_NO_RCU_TRACING (or some other name).
> > 
> > Elsewhere I've used CONFIG_ARCH_WANTS_NO_INSTR for this.
> 
> Yes please; if we use CONFIG_ARCH_WANTS_NO_INSTR then arm64 will get this "for
> free" once we add the missing checks (which I assume we need) in our ftrace_prepare_return().
> 
> > Anyway, I took it for a spin and it .... doesn't seem to do the job.
> > 
> > With my patch the first splat is
> > 
> >   "RCU not on for: cpuidle_poll_time+0x0/0x70"
> > 
> > While with yours I seem to get the endless:
> > 
> >   "WARNING: suspicious RCU usage"
> > 
> > thing. Let me see if I can figure out where it goes side-ways.
> 
> Hmmm... for WARN_ONCE() don't we need to wake RCU first also? I thought we
> needed that at least for the printk machinery?

OK, the below seems to work nicely for me -- I'm still on a hacked-up
printk, but the recursive 'RCU not watching' fail seems to be tamed.

Ofc. Paul might have an opinion on this glorious bodge ;-)

---

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index c303f7a114e9..d48cd92d2364 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -135,6 +135,21 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip);
 # define do_ftrace_record_recursion(ip, pip)	do { } while (0)
 #endif
 
+#ifdef CONFIG_ARCH_WANTS_NO_INSTR
+# define trace_warn_on_no_rcu(ip)					\
+	({								\
+		bool __ret = !rcu_is_watching();			\
+		if (__ret && !trace_recursion_test(TRACE_RECORD_RECURSION_BIT)) { \
+			trace_recursion_set(TRACE_RECORD_RECURSION_BIT); \
+			WARN_ONCE(true, "RCU not on for: %pS\n", (void *)ip); \
+			trace_recursion_clear(TRACE_RECORD_RECURSION_BIT); \
+		}							\
+		__ret;							\
+	})
+#else
+# define trace_warn_on_no_rcu(ip)	false
+#endif
+
 /*
  * Preemption is promised to be disabled when return bit >= 0.
  */
@@ -144,6 +159,9 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign
 	unsigned int val = READ_ONCE(current->trace_recursion);
 	int bit;
 
+	if (trace_warn_on_no_rcu(ip))
+		return -1;
+
 	bit = trace_get_context_bit() + start;
 	if (unlikely(val & (1 << bit))) {
 		/*
diff --git a/lib/bug.c b/lib/bug.c
index c223a2575b72..0a10643ea168 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -47,6 +47,7 @@
 #include <linux/sched.h>
 #include <linux/rculist.h>
 #include <linux/ftrace.h>
+#include <linux/context_tracking.h>
 
 extern struct bug_entry __start___bug_table[], __stop___bug_table[];
 
@@ -153,7 +154,7 @@ struct bug_entry *find_bug(unsigned long bugaddr)
 	return module_find_bug(bugaddr);
 }
 
-enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
+static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *regs)
 {
 	struct bug_entry *bug;
 	const char *file;
@@ -209,6 +210,30 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 	return BUG_TRAP_TYPE_BUG;
 }
 
+enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
+{
+	enum bug_trap_type ret;
+	bool rcu = false;
+
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+	/*
+	 * Horrible hack to shut up recursive RCU isn't watching fail since
+	 * lots of the actual reporting also relies on RCU.
+	 */
+	if (!rcu_is_watching()) {
+		rcu = true;
+		ct_state_inc(RCU_DYNTICKS_IDX);
+	}
+#endif
+
+	ret = __report_bug(bugaddr, regs);
+
+	if (rcu)
+		ct_state_inc(RCU_DYNTICKS_IDX);
+
+	return ret;
+}
+
 static void clear_once_table(struct bug_entry *start, struct bug_entry *end)
 {
 	struct bug_entry *bug;

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH 3/6] ftrace/x86: Warn and ignore graph tracing when RCU is disabled
@ 2023-01-25 10:47             ` Peter Zijlstra
  0 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-25 10:47 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Steven Rostedt, mingo, will, boqun.feng, tglx, bp, dave.hansen,
	x86, hpa, seanjc, pbonzini, jgross, srivatsa, amakhalov,
	pv-drivers, mhiramat, wanpengli, vkuznets, boris.ostrovsky,
	rafael, daniel.lezcano, juri.lelli, vincent.guittot,
	dietmar.eggemann, bsegall, mgorman, bristot, vschneid,
	linux-kernel, kvm, virtualization, linux-trace-kernel, linux-pm,
	Paul McKenney, Frederic Weisbecker

On Tue, Jan 24, 2023 at 05:12:14PM +0000, Mark Rutland wrote:
> On Tue, Jan 24, 2023 at 03:44:35PM +0100, Peter Zijlstra wrote:
> > On Mon, Jan 23, 2023 at 05:07:53PM -0500, Steven Rostedt wrote:
> > 
> > > Actually, perhaps we can just add this, and all you need to do is create
> > > and set CONFIG_NO_RCU_TRACING (or some other name).
> > 
> > Elsewhere I've used CONFIG_ARCH_WANTS_NO_INSTR for this.
> 
> Yes please; if we use CONFIG_ARCH_WANTS_NO_INSTR then arm64 will get this "for
> free" once we add the missing checks (which I assume we need) in our ftrace_prepare_return().
> 
> > Anyway, I took it for a spin and it .... doesn't seem to do the job.
> > 
> > With my patch the first splat is
> > 
> >   "RCU not on for: cpuidle_poll_time+0x0/0x70"
> > 
> > While with yours I seem to get the endless:
> > 
> >   "WARNING: suspicious RCU usage"
> > 
> > thing. Let me see if I can figure out where it goes side-ways.
> 
> Hmmm... for WARN_ONCE() don't we need to wake RCU first also? I thought we
> needed that at least for the printk machinery?

OK, the below seems to work nicely for me -- I'm still on a hacked-up
printk, but the recursive 'RCU not watching' fail seems to be tamed.

Ofc. Paul might have an opinion on this glorious bodge ;-)

---

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index c303f7a114e9..d48cd92d2364 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -135,6 +135,21 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip);
 # define do_ftrace_record_recursion(ip, pip)	do { } while (0)
 #endif
 
+#ifdef CONFIG_ARCH_WANTS_NO_INSTR
+# define trace_warn_on_no_rcu(ip)					\
+	({								\
+		bool __ret = !rcu_is_watching();			\
+		if (__ret && !trace_recursion_test(TRACE_RECORD_RECURSION_BIT)) { \
+			trace_recursion_set(TRACE_RECORD_RECURSION_BIT); \
+			WARN_ONCE(true, "RCU not on for: %pS\n", (void *)ip); \
+			trace_recursion_clear(TRACE_RECORD_RECURSION_BIT); \
+		}							\
+		__ret;							\
+	})
+#else
+# define trace_warn_on_no_rcu(ip)	false
+#endif
+
 /*
  * Preemption is promised to be disabled when return bit >= 0.
  */
@@ -144,6 +159,9 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign
 	unsigned int val = READ_ONCE(current->trace_recursion);
 	int bit;
 
+	if (trace_warn_on_no_rcu(ip))
+		return -1;
+
 	bit = trace_get_context_bit() + start;
 	if (unlikely(val & (1 << bit))) {
 		/*
diff --git a/lib/bug.c b/lib/bug.c
index c223a2575b72..0a10643ea168 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -47,6 +47,7 @@
 #include <linux/sched.h>
 #include <linux/rculist.h>
 #include <linux/ftrace.h>
+#include <linux/context_tracking.h>
 
 extern struct bug_entry __start___bug_table[], __stop___bug_table[];
 
@@ -153,7 +154,7 @@ struct bug_entry *find_bug(unsigned long bugaddr)
 	return module_find_bug(bugaddr);
 }
 
-enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
+static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *regs)
 {
 	struct bug_entry *bug;
 	const char *file;
@@ -209,6 +210,30 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 	return BUG_TRAP_TYPE_BUG;
 }
 
+enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
+{
+	enum bug_trap_type ret;
+	bool rcu = false;
+
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+	/*
+	 * Horrible hack to shut up recursive RCU isn't watching fail since
+	 * lots of the actual reporting also relies on RCU.
+	 */
+	if (!rcu_is_watching()) {
+		rcu = true;
+		ct_state_inc(RCU_DYNTICKS_IDX);
+	}
+#endif
+
+	ret = __report_bug(bugaddr, regs);
+
+	if (rcu)
+		ct_state_inc(RCU_DYNTICKS_IDX);
+
+	return ret;
+}
+
 static void clear_once_table(struct bug_entry *start, struct bug_entry *end)
 {
 	struct bug_entry *bug;

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH 3/6] ftrace/x86: Warn and ignore graph tracing when RCU is disabled
  2023-01-25 10:47             ` Peter Zijlstra
@ 2023-01-25 11:32               ` Mark Rutland
  -1 siblings, 0 replies; 53+ messages in thread
From: Mark Rutland @ 2023-01-25 11:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: juri.lelli, daniel.lezcano, wanpengli, kvm, rafael, pv-drivers,
	Frederic Weisbecker, dave.hansen, virtualization, bsegall,
	amakhalov, will, vschneid, hpa, x86, mingo, mgorman,
	linux-trace-kernel, Paul McKenney, linux-pm, boqun.feng,
	Steven Rostedt, bp, vincent.guittot, boris.ostrovsky,
	dietmar.eggemann, jgross, seanjc, linux-kernel, tglx, mhiramat,
	pbonzini, bristot

On Wed, Jan 25, 2023 at 11:47:44AM +0100, Peter Zijlstra wrote:
> On Tue, Jan 24, 2023 at 05:12:14PM +0000, Mark Rutland wrote:
> > On Tue, Jan 24, 2023 at 03:44:35PM +0100, Peter Zijlstra wrote:
> > > On Mon, Jan 23, 2023 at 05:07:53PM -0500, Steven Rostedt wrote:
> > > 
> > > > Actually, perhaps we can just add this, and all you need to do is create
> > > > and set CONFIG_NO_RCU_TRACING (or some other name).
> > > 
> > > Elsewhere I've used CONFIG_ARCH_WANTS_NO_INSTR for this.
> > 
> > Yes please; if we use CONFIG_ARCH_WANTS_NO_INSTR then arm64 will get this "for
> > free" once we add the missing checks (which I assume we need) in our ftrace_prepare_return().
> > 
> > > Anyway, I took it for a spin and it .... doesn't seem to do the job.
> > > 
> > > With my patch the first splat is
> > > 
> > >   "RCU not on for: cpuidle_poll_time+0x0/0x70"
> > > 
> > > While with yours I seem to get the endless:
> > > 
> > >   "WARNING: suspicious RCU usage"
> > > 
> > > thing. Let me see if I can figure out where it goes side-ways.
> > 
> > Hmmm... for WARN_ONCE() don't we need to wake RCU first also? I thought we
> > needed that at least for the printk machinery?
> 
> OK, the below seems to work nicely for me -- I'm still on a hacked-up
> printk, but the recursive 'RCU not watching' fail seems to be tamed.

FWIW, I gave that a spin on arm64 with the ftrace selftests, and I see no
splats, so it looks good on that front.

Currently arm64's BUG/WARN exception handling does the usual
lockdep/rcu/whatever stuff before getting to report_bug(), so that bit should
be redundant (and any WARN() or BUG() early in the entry code is likely to lead
to a stack overflow and kill the kernel), but it shouldn't be harmful.

> Ofc. Paul might have an opinion on this glorious bodge ;-)

I'll leave that to Paul. ;)

Thanks,
Mark.

> 
> ---
> 
> diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
> index c303f7a114e9..d48cd92d2364 100644
> --- a/include/linux/trace_recursion.h
> +++ b/include/linux/trace_recursion.h
> @@ -135,6 +135,21 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip);
>  # define do_ftrace_record_recursion(ip, pip)	do { } while (0)
>  #endif
>  
> +#ifdef CONFIG_ARCH_WANTS_NO_INSTR
> +# define trace_warn_on_no_rcu(ip)					\
> +	({								\
> +		bool __ret = !rcu_is_watching();			\
> +		if (__ret && !trace_recursion_test(TRACE_RECORD_RECURSION_BIT)) { \
> +			trace_recursion_set(TRACE_RECORD_RECURSION_BIT); \
> +			WARN_ONCE(true, "RCU not on for: %pS\n", (void *)ip); \
> +			trace_recursion_clear(TRACE_RECORD_RECURSION_BIT); \
> +		}							\
> +		__ret;							\
> +	})
> +#else
> +# define trace_warn_on_no_rcu(ip)	false
> +#endif
> +
>  /*
>   * Preemption is promised to be disabled when return bit >= 0.
>   */
> @@ -144,6 +159,9 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign
>  	unsigned int val = READ_ONCE(current->trace_recursion);
>  	int bit;
>  
> +	if (trace_warn_on_no_rcu(ip))
> +		return -1;
> +
>  	bit = trace_get_context_bit() + start;
>  	if (unlikely(val & (1 << bit))) {
>  		/*
> diff --git a/lib/bug.c b/lib/bug.c
> index c223a2575b72..0a10643ea168 100644
> --- a/lib/bug.c
> +++ b/lib/bug.c
> @@ -47,6 +47,7 @@
>  #include <linux/sched.h>
>  #include <linux/rculist.h>
>  #include <linux/ftrace.h>
> +#include <linux/context_tracking.h>
>  
>  extern struct bug_entry __start___bug_table[], __stop___bug_table[];
>  
> @@ -153,7 +154,7 @@ struct bug_entry *find_bug(unsigned long bugaddr)
>  	return module_find_bug(bugaddr);
>  }
>  
> -enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
> +static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *regs)
>  {
>  	struct bug_entry *bug;
>  	const char *file;
> @@ -209,6 +210,30 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
>  	return BUG_TRAP_TYPE_BUG;
>  }
>  
> +enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
> +{
> +	enum bug_trap_type ret;
> +	bool rcu = false;
> +
> +#ifdef CONFIG_CONTEXT_TRACKING_IDLE
> +	/*
> +	 * Horrible hack to shut up recursive RCU isn't watching fail since
> +	 * lots of the actual reporting also relies on RCU.
> +	 */
> +	if (!rcu_is_watching()) {
> +		rcu = true;
> +		ct_state_inc(RCU_DYNTICKS_IDX);
> +	}
> +#endif
> +
> +	ret = __report_bug(bugaddr, regs);
> +
> +	if (rcu)
> +		ct_state_inc(RCU_DYNTICKS_IDX);
> +
> +	return ret;
> +}
> +
>  static void clear_once_table(struct bug_entry *start, struct bug_entry *end)
>  {
>  	struct bug_entry *bug;

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 3/6] ftrace/x86: Warn and ignore graph tracing when RCU is disabled
@ 2023-01-25 11:32               ` Mark Rutland
  0 siblings, 0 replies; 53+ messages in thread
From: Mark Rutland @ 2023-01-25 11:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Steven Rostedt, mingo, will, boqun.feng, tglx, bp, dave.hansen,
	x86, hpa, seanjc, pbonzini, jgross, srivatsa, amakhalov,
	pv-drivers, mhiramat, wanpengli, vkuznets, boris.ostrovsky,
	rafael, daniel.lezcano, juri.lelli, vincent.guittot,
	dietmar.eggemann, bsegall, mgorman, bristot, vschneid,
	linux-kernel, kvm, virtualization, linux-trace-kernel, linux-pm,
	Paul McKenney, Frederic Weisbecker

On Wed, Jan 25, 2023 at 11:47:44AM +0100, Peter Zijlstra wrote:
> On Tue, Jan 24, 2023 at 05:12:14PM +0000, Mark Rutland wrote:
> > On Tue, Jan 24, 2023 at 03:44:35PM +0100, Peter Zijlstra wrote:
> > > On Mon, Jan 23, 2023 at 05:07:53PM -0500, Steven Rostedt wrote:
> > > 
> > > > Actually, perhaps we can just add this, and all you need to do is create
> > > > and set CONFIG_NO_RCU_TRACING (or some other name).
> > > 
> > > Elsewhere I've used CONFIG_ARCH_WANTS_NO_INSTR for this.
> > 
> > Yes please; if we use CONFIG_ARCH_WANTS_NO_INSTR then arm64 will get this "for
> > free" once we add the missing checks (which I assume we need) in our ftrace_prepare_return().
> > 
> > > Anyway, I took it for a spin and it .... doesn't seem to do the job.
> > > 
> > > With my patch the first splat is
> > > 
> > >   "RCU not on for: cpuidle_poll_time+0x0/0x70"
> > > 
> > > While with yours I seem to get the endless:
> > > 
> > >   "WARNING: suspicious RCU usage"
> > > 
> > > thing. Let me see if I can figure out where it goes side-ways.
> > 
> > Hmmm... for WARN_ONCE() don't we need to wake RCU first also? I thought we
> > needed that at least for the printk machinery?
> 
> OK, the below seems to work nicely for me -- I'm still on a hacked-up
> printk, but the recursive 'RCU not watching' fail seems to be tamed.

FWIW, I gave that a spin on arm64 with the ftrace selftests, and I see no
splats, so it looks good on that front.

Currently arm64's BUG/WARN exception handling does the usual
lockdep/rcu/whatever stuff before getting to report_bug(), so that bit should
be redundant (and any WARN() or BUG() early in the entry code is likely to lead
to a stack overflow and kill the kernel), but it shouldn't be harmful.

> Ofc. Paul might have an opinion on this glorious bodge ;-)

I'll leave that to Paul. ;)

Thanks,
Mark.

> 
> ---
> 
> diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
> index c303f7a114e9..d48cd92d2364 100644
> --- a/include/linux/trace_recursion.h
> +++ b/include/linux/trace_recursion.h
> @@ -135,6 +135,21 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip);
>  # define do_ftrace_record_recursion(ip, pip)	do { } while (0)
>  #endif
>  
> +#ifdef CONFIG_ARCH_WANTS_NO_INSTR
> +# define trace_warn_on_no_rcu(ip)					\
> +	({								\
> +		bool __ret = !rcu_is_watching();			\
> +		if (__ret && !trace_recursion_test(TRACE_RECORD_RECURSION_BIT)) { \
> +			trace_recursion_set(TRACE_RECORD_RECURSION_BIT); \
> +			WARN_ONCE(true, "RCU not on for: %pS\n", (void *)ip); \
> +			trace_recursion_clear(TRACE_RECORD_RECURSION_BIT); \
> +		}							\
> +		__ret;							\
> +	})
> +#else
> +# define trace_warn_on_no_rcu(ip)	false
> +#endif
> +
>  /*
>   * Preemption is promised to be disabled when return bit >= 0.
>   */
> @@ -144,6 +159,9 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign
>  	unsigned int val = READ_ONCE(current->trace_recursion);
>  	int bit;
>  
> +	if (trace_warn_on_no_rcu(ip))
> +		return -1;
> +
>  	bit = trace_get_context_bit() + start;
>  	if (unlikely(val & (1 << bit))) {
>  		/*
> diff --git a/lib/bug.c b/lib/bug.c
> index c223a2575b72..0a10643ea168 100644
> --- a/lib/bug.c
> +++ b/lib/bug.c
> @@ -47,6 +47,7 @@
>  #include <linux/sched.h>
>  #include <linux/rculist.h>
>  #include <linux/ftrace.h>
> +#include <linux/context_tracking.h>
>  
>  extern struct bug_entry __start___bug_table[], __stop___bug_table[];
>  
> @@ -153,7 +154,7 @@ struct bug_entry *find_bug(unsigned long bugaddr)
>  	return module_find_bug(bugaddr);
>  }
>  
> -enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
> +static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *regs)
>  {
>  	struct bug_entry *bug;
>  	const char *file;
> @@ -209,6 +210,30 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
>  	return BUG_TRAP_TYPE_BUG;
>  }
>  
> +enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
> +{
> +	enum bug_trap_type ret;
> +	bool rcu = false;
> +
> +#ifdef CONFIG_CONTEXT_TRACKING_IDLE
> +	/*
> +	 * Horrible hack to shut up recursive RCU isn't watching fail since
> +	 * lots of the actual reporting also relies on RCU.
> +	 */
> +	if (!rcu_is_watching()) {
> +		rcu = true;
> +		ct_state_inc(RCU_DYNTICKS_IDX);
> +	}
> +#endif
> +
> +	ret = __report_bug(bugaddr, regs);
> +
> +	if (rcu)
> +		ct_state_inc(RCU_DYNTICKS_IDX);
> +
> +	return ret;
> +}
> +
>  static void clear_once_table(struct bug_entry *start, struct bug_entry *end)
>  {
>  	struct bug_entry *bug;

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/6] A few cpuidle vs rcu fixes
  2023-01-23 20:50 ` Peter Zijlstra
@ 2023-01-25 15:20   ` Mark Rutland
  -1 siblings, 0 replies; 53+ messages in thread
From: Mark Rutland @ 2023-01-25 15:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, will, boqun.feng, tglx, bp, dave.hansen, x86, hpa, seanjc,
	pbonzini, jgross, srivatsa, amakhalov, pv-drivers, rostedt,
	mhiramat, wanpengli, vkuznets, boris.ostrovsky, rafael,
	daniel.lezcano, juri.lelli, vincent.guittot, dietmar.eggemann,
	bsegall, mgorman, bristot, vschneid, linux-kernel, kvm,
	virtualization, linux-trace-kernel, linux-pm, Lorenzo Pieralisi

Hi Peter,

On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.
> 
> These patches appear to cure this, the ftrace selftest now runs to completion
> without spamming scary messages to dmesg.

In addition to the other bits for arm64, we'll need the following patch. Are
you happy to add that to the start of this series?

I've tested this on an arm64 Juno board with a full-fat ftrace config,
CONFIG_PROVE_LOCKING + CONFIG_DEBUG_LOCKDEP, and CONFIG_DEBUG_VIRTUAL=y, and
build tested for 32-bit arm.

Thanks,
Mark.

---->8----
From 30ab9eba19e952cb51c9f599d2ac9b8a302cb63d Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 25 Jan 2023 14:20:49 +0000
Subject: [PATCH] drivers: firmware: psci: don't instrument suspend code

The PSCI suspend code is currently instrumentable, which is not safe as
instrumentation (e.g. ftrace) may try to make use of RCU during idle
periods when RCU is not watching.

To fix this we need to ensure that psci_suspend_finisher() and anything
it calls are not instrumented. We can do this fairly simply by marking
psci_suspend_finisher() and the psci*_cpu_suspend() functions as
noinstr, and the underlying helper functions as __always_inline.

When CONFIG_DEBUG_VIRTUAL=y, __pa_symbol() can expand to an out-of-line
instrumented function, so we must use __pa_symbol_nodebug() within
psci_suspend_finisher().

The raw SMCCC invocation functions are written in assembly, and are not
subject to compiler instrumentation.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Lorenzo Pieralisi <lpieralisi@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 drivers/firmware/psci/psci.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c
index f3a044fa4652a..c12847b4736de 100644
--- a/drivers/firmware/psci/psci.c
+++ b/drivers/firmware/psci/psci.c
@@ -108,9 +108,10 @@ bool psci_power_state_is_valid(u32 state)
 	return !(state & ~valid_mask);
 }
 
-static unsigned long __invoke_psci_fn_hvc(unsigned long function_id,
-			unsigned long arg0, unsigned long arg1,
-			unsigned long arg2)
+static __always_inline unsigned long
+__invoke_psci_fn_hvc(unsigned long function_id,
+		     unsigned long arg0, unsigned long arg1,
+		     unsigned long arg2)
 {
 	struct arm_smccc_res res;
 
@@ -118,9 +119,10 @@ static unsigned long __invoke_psci_fn_hvc(unsigned long function_id,
 	return res.a0;
 }
 
-static unsigned long __invoke_psci_fn_smc(unsigned long function_id,
-			unsigned long arg0, unsigned long arg1,
-			unsigned long arg2)
+static __always_inline unsigned long
+__invoke_psci_fn_smc(unsigned long function_id,
+		     unsigned long arg0, unsigned long arg1,
+		     unsigned long arg2)
 {
 	struct arm_smccc_res res;
 
@@ -128,7 +130,7 @@ static unsigned long __invoke_psci_fn_smc(unsigned long function_id,
 	return res.a0;
 }
 
-static int psci_to_linux_errno(int errno)
+static __always_inline int psci_to_linux_errno(int errno)
 {
 	switch (errno) {
 	case PSCI_RET_SUCCESS:
@@ -169,7 +171,8 @@ int psci_set_osi_mode(bool enable)
 	return psci_to_linux_errno(err);
 }
 
-static int __psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point)
+static __always_inline int
+__psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point)
 {
 	int err;
 
@@ -177,13 +180,15 @@ static int __psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point)
 	return psci_to_linux_errno(err);
 }
 
-static int psci_0_1_cpu_suspend(u32 state, unsigned long entry_point)
+static __always_inline int
+psci_0_1_cpu_suspend(u32 state, unsigned long entry_point)
 {
 	return __psci_cpu_suspend(psci_0_1_function_ids.cpu_suspend,
 				  state, entry_point);
 }
 
-static int psci_0_2_cpu_suspend(u32 state, unsigned long entry_point)
+static __always_inline int
+psci_0_2_cpu_suspend(u32 state, unsigned long entry_point)
 {
 	return __psci_cpu_suspend(PSCI_FN_NATIVE(0_2, CPU_SUSPEND),
 				  state, entry_point);
@@ -447,10 +452,12 @@ late_initcall(psci_debugfs_init)
 #endif
 
 #ifdef CONFIG_CPU_IDLE
-static int psci_suspend_finisher(unsigned long state)
+static noinstr int psci_suspend_finisher(unsigned long state)
 {
 	u32 power_state = state;
-	phys_addr_t pa_cpu_resume = __pa_symbol(cpu_resume);
+	phys_addr_t pa_cpu_resume;
+
+	pa_cpu_resume = __pa_symbol_nodebug((unsigned long)cpu_resume);
 
 	return psci_ops.cpu_suspend(power_state, pa_cpu_resume);
 }
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/6] A few cpuidle vs rcu fixes
@ 2023-01-25 15:20   ` Mark Rutland
  0 siblings, 0 replies; 53+ messages in thread
From: Mark Rutland @ 2023-01-25 15:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: juri.lelli, daniel.lezcano, wanpengli, kvm, rafael, pv-drivers,
	Lorenzo Pieralisi, dave.hansen, virtualization, bsegall,
	amakhalov, will, vschneid, hpa, x86, mingo, mgorman,
	linux-trace-kernel, linux-pm, boqun.feng, rostedt, bp,
	vincent.guittot, boris.ostrovsky, dietmar.eggemann, jgross,
	seanjc, linux-kernel, tglx, mhiramat, pbonzini, bristot

Hi Peter,

On Mon, Jan 23, 2023 at 09:50:09PM +0100, Peter Zijlstra wrote:
> 0-day robot reported graph-tracing made the cpuidle-vs-rcu rework go splat.
> 
> These patches appear to cure this, the ftrace selftest now runs to completion
> without spamming scary messages to dmesg.

In addition to the other bits for arm64, we'll need the following patch. Are
you happy to add that to the start of this series?

I've tested this on an arm64 Juno board with a full-fat ftrace config,
CONFIG_PROVE_LOCKING + CONFIG_DEBUG_LOCKDEP, and CONFIG_DEBUG_VIRTUAL=y, and
build tested for 32-bit arm.

Thanks,
Mark.

---->8----
From 30ab9eba19e952cb51c9f599d2ac9b8a302cb63d Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 25 Jan 2023 14:20:49 +0000
Subject: [PATCH] drivers: firmware: psci: don't instrument suspend code

The PSCI suspend code is currently instrumentable, which is not safe as
instrumentation (e.g. ftrace) may try to make use of RCU during idle
periods when RCU is not watching.

To fix this we need to ensure that psci_suspend_finisher() and anything
it calls are not instrumented. We can do this fairly simply by marking
psci_suspend_finisher() and the psci*_cpu_suspend() functions as
noinstr, and the underlying helper functions as __always_inline.

When CONFIG_DEBUG_VIRTUAL=y, __pa_symbol() can expand to an out-of-line
instrumented function, so we must use __pa_symbol_nodebug() within
psci_suspend_finisher().

The raw SMCCC invocation functions are written in assembly, and are not
subject to compiler instrumentation.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Lorenzo Pieralisi <lpieralisi@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 drivers/firmware/psci/psci.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c
index f3a044fa4652a..c12847b4736de 100644
--- a/drivers/firmware/psci/psci.c
+++ b/drivers/firmware/psci/psci.c
@@ -108,9 +108,10 @@ bool psci_power_state_is_valid(u32 state)
 	return !(state & ~valid_mask);
 }
 
-static unsigned long __invoke_psci_fn_hvc(unsigned long function_id,
-			unsigned long arg0, unsigned long arg1,
-			unsigned long arg2)
+static __always_inline unsigned long
+__invoke_psci_fn_hvc(unsigned long function_id,
+		     unsigned long arg0, unsigned long arg1,
+		     unsigned long arg2)
 {
 	struct arm_smccc_res res;
 
@@ -118,9 +119,10 @@ static unsigned long __invoke_psci_fn_hvc(unsigned long function_id,
 	return res.a0;
 }
 
-static unsigned long __invoke_psci_fn_smc(unsigned long function_id,
-			unsigned long arg0, unsigned long arg1,
-			unsigned long arg2)
+static __always_inline unsigned long
+__invoke_psci_fn_smc(unsigned long function_id,
+		     unsigned long arg0, unsigned long arg1,
+		     unsigned long arg2)
 {
 	struct arm_smccc_res res;
 
@@ -128,7 +130,7 @@ static unsigned long __invoke_psci_fn_smc(unsigned long function_id,
 	return res.a0;
 }
 
-static int psci_to_linux_errno(int errno)
+static __always_inline int psci_to_linux_errno(int errno)
 {
 	switch (errno) {
 	case PSCI_RET_SUCCESS:
@@ -169,7 +171,8 @@ int psci_set_osi_mode(bool enable)
 	return psci_to_linux_errno(err);
 }
 
-static int __psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point)
+static __always_inline int
+__psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point)
 {
 	int err;
 
@@ -177,13 +180,15 @@ static int __psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point)
 	return psci_to_linux_errno(err);
 }
 
-static int psci_0_1_cpu_suspend(u32 state, unsigned long entry_point)
+static __always_inline int
+psci_0_1_cpu_suspend(u32 state, unsigned long entry_point)
 {
 	return __psci_cpu_suspend(psci_0_1_function_ids.cpu_suspend,
 				  state, entry_point);
 }
 
-static int psci_0_2_cpu_suspend(u32 state, unsigned long entry_point)
+static __always_inline int
+psci_0_2_cpu_suspend(u32 state, unsigned long entry_point)
 {
 	return __psci_cpu_suspend(PSCI_FN_NATIVE(0_2, CPU_SUSPEND),
 				  state, entry_point);
@@ -447,10 +452,12 @@ late_initcall(psci_debugfs_init)
 #endif
 
 #ifdef CONFIG_CPU_IDLE
-static int psci_suspend_finisher(unsigned long state)
+static noinstr int psci_suspend_finisher(unsigned long state)
 {
 	u32 power_state = state;
-	phys_addr_t pa_cpu_resume = __pa_symbol(cpu_resume);
+	phys_addr_t pa_cpu_resume;
+
+	pa_cpu_resume = __pa_symbol_nodebug((unsigned long)cpu_resume);
 
 	return psci_ops.cpu_suspend(power_state, pa_cpu_resume);
 }
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH 3/6] ftrace/x86: Warn and ignore graph tracing when RCU is disabled
  2023-01-25 10:47             ` Peter Zijlstra
  (?)
  (?)
@ 2023-01-25 18:46             ` Paul E. McKenney
  2023-01-26  9:28                 ` Peter Zijlstra
  -1 siblings, 1 reply; 53+ messages in thread
From: Paul E. McKenney @ 2023-01-25 18:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mark Rutland, Steven Rostedt, mingo, will, boqun.feng, tglx, bp,
	dave.hansen, x86, hpa, seanjc, pbonzini, jgross, srivatsa,
	amakhalov, pv-drivers, mhiramat, wanpengli, vkuznets,
	boris.ostrovsky, rafael, daniel.lezcano, juri.lelli,
	vincent.guittot, dietmar.eggemann, bsegall, mgorman, bristot,
	vschneid, linux-kernel, kvm, virtualization, linux-trace-kernel,
	linux-pm, Frederic Weisbecker

On Wed, Jan 25, 2023 at 11:47:44AM +0100, Peter Zijlstra wrote:
> On Tue, Jan 24, 2023 at 05:12:14PM +0000, Mark Rutland wrote:
> > On Tue, Jan 24, 2023 at 03:44:35PM +0100, Peter Zijlstra wrote:
> > > On Mon, Jan 23, 2023 at 05:07:53PM -0500, Steven Rostedt wrote:
> > > 
> > > > Actually, perhaps we can just add this, and all you need to do is create
> > > > and set CONFIG_NO_RCU_TRACING (or some other name).
> > > 
> > > Elsewhere I've used CONFIG_ARCH_WANTS_NO_INSTR for this.
> > 
> > Yes please; if we use CONFIG_ARCH_WANTS_NO_INSTR then arm64 will get this "for
> > free" once we add the missing checks (which I assume we need) in our ftrace_prepare_return().
> > 
> > > Anyway, I took it for a spin and it .... doesn't seem to do the job.
> > > 
> > > With my patch the first splat is
> > > 
> > >   "RCU not on for: cpuidle_poll_time+0x0/0x70"
> > > 
> > > While with yours I seem to get the endless:
> > > 
> > >   "WARNING: suspicious RCU usage"
> > > 
> > > thing. Let me see if I can figure out where it goes side-ways.
> > 
> > Hmmm... for WARN_ONCE() don't we need to wake RCU first also? I thought we
> > needed that at least for the printk machinery?
> 
> OK, the below seems to work nicely for me -- I'm still on a hacked-up
> printk, but the recursive 'RCU not watching' fail seems to be tamed.
> 
> Ofc. Paul might have an opinion on this glorious bodge ;-)

For some definition of the word "glorious", to be sure.  ;-)

Am I correct that you have two things happening here?  (1) Preventing
trace recursion and (2) forcing RCU to pay attention when needed.

I cannot resist pointing out that you have re-invented RCU_NONIDLE(),
though avoiding much of the overhead when not needed.  ;-)
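
For the archives, RCU_NONIDLE() is the existing way to run a statement with
RCU forcibly watching; if memory serves it is defined along these lines
(paraphrased from rcupdate.h, so treat the exact helper names as an
assumption on my part):

	#define RCU_NONIDLE(a) \
		do { \
			ct_irq_enter_irqson();	/* make RCU watch, with the full enter bookkeeping */ \
			do { a; } while (0); \
			ct_irq_exit_irqson();	/* and stop watching again */ \
		} while (0)

The bodge (quoted below) only flips the dynticks counter and skips that
enter/exit bookkeeping, hence the caution that follows.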

I would have objections if this ever leaks out onto a non-error code path.
There are things that need doing when RCU starts and stops watching,
and this approach omits those things.  Which again is OK in this case,
where this code is only ever executed when something is already broken,
but definitely *not* OK when things are not already broken.

							Thanx, Paul

> ---
> 
> diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
> index c303f7a114e9..d48cd92d2364 100644
> --- a/include/linux/trace_recursion.h
> +++ b/include/linux/trace_recursion.h
> @@ -135,6 +135,21 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip);
>  # define do_ftrace_record_recursion(ip, pip)	do { } while (0)
>  #endif
>  
> +#ifdef CONFIG_ARCH_WANTS_NO_INSTR
> +# define trace_warn_on_no_rcu(ip)					\
> +	({								\
> +		bool __ret = !rcu_is_watching();			\
> +		if (__ret && !trace_recursion_test(TRACE_RECORD_RECURSION_BIT)) { \
> +			trace_recursion_set(TRACE_RECORD_RECURSION_BIT); \
> +			WARN_ONCE(true, "RCU not on for: %pS\n", (void *)ip); \
> +			trace_recursion_clear(TRACE_RECORD_RECURSION_BIT); \
> +		}							\
> +		__ret;							\
> +	})
> +#else
> +# define trace_warn_on_no_rcu(ip)	false
> +#endif
> +
>  /*
>   * Preemption is promised to be disabled when return bit >= 0.
>   */
> @@ -144,6 +159,9 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign
>  	unsigned int val = READ_ONCE(current->trace_recursion);
>  	int bit;
>  
> +	if (trace_warn_on_no_rcu(ip))
> +		return -1;
> +
>  	bit = trace_get_context_bit() + start;
>  	if (unlikely(val & (1 << bit))) {
>  		/*
> diff --git a/lib/bug.c b/lib/bug.c
> index c223a2575b72..0a10643ea168 100644
> --- a/lib/bug.c
> +++ b/lib/bug.c
> @@ -47,6 +47,7 @@
>  #include <linux/sched.h>
>  #include <linux/rculist.h>
>  #include <linux/ftrace.h>
> +#include <linux/context_tracking.h>
>  
>  extern struct bug_entry __start___bug_table[], __stop___bug_table[];
>  
> @@ -153,7 +154,7 @@ struct bug_entry *find_bug(unsigned long bugaddr)
>  	return module_find_bug(bugaddr);
>  }
>  
> -enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
> +static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *regs)
>  {
>  	struct bug_entry *bug;
>  	const char *file;
> @@ -209,6 +210,30 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
>  	return BUG_TRAP_TYPE_BUG;
>  }
>  
> +enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
> +{
> +	enum bug_trap_type ret;
> +	bool rcu = false;
> +
> +#ifdef CONFIG_CONTEXT_TRACKING_IDLE
> +	/*
> +	 * Horrible hack to shut up recursive RCU isn't watching fail since
> +	 * lots of the actual reporting also relies on RCU.
> +	 */
> +	if (!rcu_is_watching()) {
> +		rcu = true;
> +		ct_state_inc(RCU_DYNTICKS_IDX);
> +	}
> +#endif
> +
> +	ret = __report_bug(bugaddr, regs);
> +
> +	if (rcu)
> +		ct_state_inc(RCU_DYNTICKS_IDX);
> +
> +	return ret;
> +}
> +
>  static void clear_once_table(struct bug_entry *start, struct bug_entry *end)
>  {
>  	struct bug_entry *bug;

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 3/6] ftrace/x86: Warn and ignore graph tracing when RCU is disabled
  2023-01-25 18:46             ` Paul E. McKenney
@ 2023-01-26  9:28                 ` Peter Zijlstra
  0 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-26  9:28 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Mark Rutland, Steven Rostedt, mingo, will, boqun.feng, tglx, bp,
	dave.hansen, x86, hpa, seanjc, pbonzini, jgross, srivatsa,
	amakhalov, pv-drivers, mhiramat, wanpengli, vkuznets,
	boris.ostrovsky, rafael, daniel.lezcano, juri.lelli,
	vincent.guittot, dietmar.eggemann, bsegall, mgorman, bristot,
	vschneid, linux-kernel, kvm, virtualization, linux-trace-kernel,
	linux-pm, Frederic Weisbecker

On Wed, Jan 25, 2023 at 10:46:58AM -0800, Paul E. McKenney wrote:

> > Ofc. Paul might have an opinion on this glorious bodge ;-)
> 
> For some definition of the word "glorious", to be sure.  ;-)
> 
> Am I correct that you have two things happening here?  (1) Preventing
> trace recursion and (2) forcing RCU to pay attention when needed.

Mostly just (1); we're in an error situation, so I'm not too worried
about (2).

> I cannot resist pointing out that you have re-invented RCU_NONIDLE(),
> though avoiding much of the overhead when not needed.  ;-)

Yeah, this was the absolute minimal bodge I could come up with that
shuts up the rcu_dereference warning thing.

> I would have objections if this ever leaks out onto a non-error code path.

Agreed.

> There are things that need doing when RCU starts and stops watching,
> and this approach omits those things.  Which again is OK in this case,
> where this code is only ever executed when something is already broken,
> but definitely *not* OK when things are not already broken.

And agreed.

Current version of the bodge looks like so (will repost the whole series
a little later today).

I managed to tickle the recursion so that it was a test-case for the
stack guard...

With this on, it prints just the one WARN and lives.
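
FWIW, the reason the paired ct_state_inc(RCU_DYNTICKS_IDX) calls work at all
(my understanding of the context-tracking state, so treat the bit layout as
an assumption): RCU considers the CPU idle while the RCU_DYNTICKS_IDX bit of
the dynticks count is clear, so toggling that bit is all it takes:

	/* Illustrative only -- what warn_rcu_enter()/warn_rcu_exit() below rely on. */
	ct_state_inc(RCU_DYNTICKS_IDX);	/* bit set: rcu_is_watching() now returns true */
	/* ... do the WARN/BUG reporting; rcu_dereference() and friends are happy ... */
	ct_state_inc(RCU_DYNTICKS_IDX);	/* bit clear again: back in EQS, counter advanced by 2 */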

---
Subject: bug: Disable rcu_is_watching() during WARN/BUG
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed Jan 25 13:57:49 CET 2023

In order to prevent WARN/BUG from generating nested or even recursive
warnings, force rcu_is_watching() to return true during
WARN/lockdep_rcu_suspicious().

Notably, things like unwinding the stack can trigger rcu_dereference()
warnings, which then trigger more unwinding, which then triggers more
warnings, etc.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/context_tracking.h |   27 +++++++++++++++++++++++++++
 kernel/locking/lockdep.c         |    3 +++
 kernel/panic.c                   |    5 +++++
 lib/bug.c                        |   15 ++++++++++++++-
 4 files changed, 49 insertions(+), 1 deletion(-)

--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -130,9 +130,36 @@ static __always_inline unsigned long ct_
 	return arch_atomic_add_return(incby, this_cpu_ptr(&context_tracking.state));
 }
 
+static __always_inline bool warn_rcu_enter(void)
+{
+	bool ret = false;
+
+	/*
+	 * Horrible hack to shut up recursive RCU isn't watching fail since
+	 * lots of the actual reporting also relies on RCU.
+	 */
+	preempt_disable_notrace();
+	if (rcu_dynticks_curr_cpu_in_eqs()) {
+		ret = true;
+		ct_state_inc(RCU_DYNTICKS_IDX);
+	}
+
+	return ret;
+}
+
+static __always_inline void warn_rcu_exit(bool rcu)
+{
+	if (rcu)
+		ct_state_inc(RCU_DYNTICKS_IDX);
+	preempt_enable_notrace();
+}
+
 #else
 static inline void ct_idle_enter(void) { }
 static inline void ct_idle_exit(void) { }
+
+static __always_inline bool warn_rcu_enter(void) { return false; }
+static __always_inline void warn_rcu_exit(bool rcu) { }
 #endif /* !CONFIG_CONTEXT_TRACKING_IDLE */
 
 #endif
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -55,6 +55,7 @@
 #include <linux/rcupdate.h>
 #include <linux/kprobes.h>
 #include <linux/lockdep.h>
+#include <linux/context_tracking.h>
 
 #include <asm/sections.h>
 
@@ -6555,6 +6556,7 @@ void lockdep_rcu_suspicious(const char *
 {
 	struct task_struct *curr = current;
 	int dl = READ_ONCE(debug_locks);
+	bool rcu = warn_rcu_enter();
 
 	/* Note: the following can be executed concurrently, so be careful. */
 	pr_warn("\n");
@@ -6595,5 +6597,6 @@ void lockdep_rcu_suspicious(const char *
 	lockdep_print_held_locks(curr);
 	pr_warn("\nstack backtrace:\n");
 	dump_stack();
+	warn_rcu_exit(rcu);
 }
 EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@
 #include <linux/ratelimit.h>
 #include <linux/debugfs.h>
 #include <linux/sysfs.h>
+#include <linux/context_tracking.h>
 #include <trace/events/error_report.h>
 #include <asm/sections.h>
 
@@ -679,6 +680,7 @@ void __warn(const char *file, int line,
 void warn_slowpath_fmt(const char *file, int line, unsigned taint,
 		       const char *fmt, ...)
 {
+	bool rcu = warn_rcu_enter();
 	struct warn_args args;
 
 	pr_warn(CUT_HERE);
@@ -693,11 +695,13 @@ void warn_slowpath_fmt(const char *file,
 	va_start(args.args, fmt);
 	__warn(file, line, __builtin_return_address(0), taint, NULL, &args);
 	va_end(args.args);
+	warn_rcu_exit(rcu);
 }
 EXPORT_SYMBOL(warn_slowpath_fmt);
 #else
 void __warn_printk(const char *fmt, ...)
 {
+	bool rcu = warn_rcu_enter();
 	va_list args;
 
 	pr_warn(CUT_HERE);
@@ -705,6 +709,7 @@ void __warn_printk(const char *fmt, ...)
 	va_start(args, fmt);
 	vprintk(fmt, args);
 	va_end(args);
+	warn_rcu_exit(rcu);
 }
 EXPORT_SYMBOL(__warn_printk);
 #endif
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -47,6 +47,7 @@
 #include <linux/sched.h>
 #include <linux/rculist.h>
 #include <linux/ftrace.h>
+#include <linux/context_tracking.h>
 
 extern struct bug_entry __start___bug_table[], __stop___bug_table[];
 
@@ -153,7 +154,7 @@ struct bug_entry *find_bug(unsigned long
 	return module_find_bug(bugaddr);
 }
 
-enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
+static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *regs)
 {
 	struct bug_entry *bug;
 	const char *file;
@@ -209,6 +210,18 @@ enum bug_trap_type report_bug(unsigned l
 	return BUG_TRAP_TYPE_BUG;
 }
 
+enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
+{
+	enum bug_trap_type ret;
+	bool rcu = false;
+
+	rcu = warn_rcu_enter();
+	ret = __report_bug(bugaddr, regs);
+	warn_rcu_exit(rcu);
+
+	return ret;
+}
+
 static void clear_once_table(struct bug_entry *start, struct bug_entry *end)
 {
 	struct bug_entry *bug;

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 3/6] ftrace/x86: Warn and ignore graph tracing when RCU is disabled
@ 2023-01-26  9:28                 ` Peter Zijlstra
  0 siblings, 0 replies; 53+ messages in thread
From: Peter Zijlstra @ 2023-01-26  9:28 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Mark Rutland, juri.lelli, daniel.lezcano, wanpengli, kvm, rafael,
	pv-drivers, Frederic Weisbecker, dave.hansen, virtualization,
	bsegall, amakhalov, will, vschneid, hpa, x86, mingo, mgorman,
	linux-trace-kernel, linux-pm, boqun.feng, Steven Rostedt, bp,
	vincent.guittot, boris.ostrovsky, dietmar.eggemann, jgross,
	seanjc, linux-kernel, tglx, mhiramat, pbonzini, bristot

On Wed, Jan 25, 2023 at 10:46:58AM -0800, Paul E. McKenney wrote:

> > Ofc. Paul might have an opinion on this glorious bodge ;-)
> 
> For some definition of the word "glorious", to be sure.  ;-)
> 
> Am I correct that you have two things happening here?  (1) Preventing
> trace recursion and (2) forcing RCU to pay attention when needed.

Mostly just (1); we're in an error situation, so I'm not too worried
about (2).

> I cannot resist pointing out that you have re-invented RCU_NONIDLE(),
> though avoiding much of the overhead when not needed.  ;-)

Yeah, this was the absolute minimal bodge I could come up with that
shuts up the rcu_dereference warning thing.

> I would have objections if this ever leaks out onto a non-error code path.

Agreed.

> There are things that need doing when RCU starts and stops watching,
> and this approach omits those things.  Which again is OK in this case,
> where this code is only ever executed when something is already broken,
> but definitely *not* OK when things are not already broken.

And agreed.

Current version of the bodge looks like so (will repost the whole series
a little later today).

I managed to tickle the recursion so that it was a test-case for the
stack guard...

With this on, it prints just the one WARN and lives.

---
Subject: bug: Disable rcu_is_watching() during WARN/BUG
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed Jan 25 13:57:49 CET 2023

In order to prevent WARN/BUG from generating nested or even recursive
warnings, force rcu_is_watching() to return true during
WARN/lockdep_rcu_suspicious().

Notably, things like unwinding the stack can trigger rcu_dereference()
warnings, which then trigger more unwinding, which then triggers more
warnings, etc.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/context_tracking.h |   27 +++++++++++++++++++++++++++
 kernel/locking/lockdep.c         |    3 +++
 kernel/panic.c                   |    5 +++++
 lib/bug.c                        |   15 ++++++++++++++-
 4 files changed, 49 insertions(+), 1 deletion(-)

--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -130,9 +130,36 @@ static __always_inline unsigned long ct_
 	return arch_atomic_add_return(incby, this_cpu_ptr(&context_tracking.state));
 }
 
+static __always_inline bool warn_rcu_enter(void)
+{
+	bool ret = false;
+
+	/*
+	 * Horrible hack to shut up recursive RCU isn't watching fail since
+	 * lots of the actual reporting also relies on RCU.
+	 */
+	preempt_disable_notrace();
+	if (rcu_dynticks_curr_cpu_in_eqs()) {
+		ret = true;
+		ct_state_inc(RCU_DYNTICKS_IDX);
+	}
+
+	return ret;
+}
+
+static __always_inline void warn_rcu_exit(bool rcu)
+{
+	if (rcu)
+		ct_state_inc(RCU_DYNTICKS_IDX);
+	preempt_enable_notrace();
+}
+
 #else
 static inline void ct_idle_enter(void) { }
 static inline void ct_idle_exit(void) { }
+
+static __always_inline bool warn_rcu_enter(void) { return false; }
+static __always_inline void warn_rcu_exit(bool rcu) { }
 #endif /* !CONFIG_CONTEXT_TRACKING_IDLE */
 
 #endif
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -55,6 +55,7 @@
 #include <linux/rcupdate.h>
 #include <linux/kprobes.h>
 #include <linux/lockdep.h>
+#include <linux/context_tracking.h>
 
 #include <asm/sections.h>
 
@@ -6555,6 +6556,7 @@ void lockdep_rcu_suspicious(const char *
 {
 	struct task_struct *curr = current;
 	int dl = READ_ONCE(debug_locks);
+	bool rcu = warn_rcu_enter();
 
 	/* Note: the following can be executed concurrently, so be careful. */
 	pr_warn("\n");
@@ -6595,5 +6597,6 @@ void lockdep_rcu_suspicious(const char *
 	lockdep_print_held_locks(curr);
 	pr_warn("\nstack backtrace:\n");
 	dump_stack();
+	warn_rcu_exit(rcu);
 }
 EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@
 #include <linux/ratelimit.h>
 #include <linux/debugfs.h>
 #include <linux/sysfs.h>
+#include <linux/context_tracking.h>
 #include <trace/events/error_report.h>
 #include <asm/sections.h>
 
@@ -679,6 +680,7 @@ void __warn(const char *file, int line,
 void warn_slowpath_fmt(const char *file, int line, unsigned taint,
 		       const char *fmt, ...)
 {
+	bool rcu = warn_rcu_enter();
 	struct warn_args args;
 
 	pr_warn(CUT_HERE);
@@ -693,11 +695,13 @@ void warn_slowpath_fmt(const char *file,
 	va_start(args.args, fmt);
 	__warn(file, line, __builtin_return_address(0), taint, NULL, &args);
 	va_end(args.args);
+	warn_rcu_exit(rcu);
 }
 EXPORT_SYMBOL(warn_slowpath_fmt);
 #else
 void __warn_printk(const char *fmt, ...)
 {
+	bool rcu = warn_rcu_enter();
 	va_list args;
 
 	pr_warn(CUT_HERE);
@@ -705,6 +709,7 @@ void __warn_printk(const char *fmt, ...)
 	va_start(args, fmt);
 	vprintk(fmt, args);
 	va_end(args);
+	warn_rcu_exit(rcu);
 }
 EXPORT_SYMBOL(__warn_printk);
 #endif
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -47,6 +47,7 @@
 #include <linux/sched.h>
 #include <linux/rculist.h>
 #include <linux/ftrace.h>
+#include <linux/context_tracking.h>
 
 extern struct bug_entry __start___bug_table[], __stop___bug_table[];
 
@@ -153,7 +154,7 @@ struct bug_entry *find_bug(unsigned long
 	return module_find_bug(bugaddr);
 }
 
-enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
+static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *regs)
 {
 	struct bug_entry *bug;
 	const char *file;
@@ -209,6 +210,18 @@ enum bug_trap_type report_bug(unsigned l
 	return BUG_TRAP_TYPE_BUG;
 }
 
+enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
+{
+	enum bug_trap_type ret;
+	bool rcu = false;
+
+	rcu = warn_rcu_enter();
+	ret = __report_bug(bugaddr, regs);
+	warn_rcu_exit(rcu);
+
+	return ret;
+}
+
 static void clear_once_table(struct bug_entry *start, struct bug_entry *end)
 {
 	struct bug_entry *bug;

* Re: [PATCH 3/6] ftrace/x86: Warn and ignore graph tracing when RCU is disabled
  2023-01-26  9:28                 ` Peter Zijlstra
@ 2023-01-28 19:12                 ` Paul E. McKenney
  -1 siblings, 0 replies; 53+ messages in thread
From: Paul E. McKenney @ 2023-01-28 19:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mark Rutland, Steven Rostedt, mingo, will, boqun.feng, tglx, bp,
	dave.hansen, x86, hpa, seanjc, pbonzini, jgross, srivatsa,
	amakhalov, pv-drivers, mhiramat, wanpengli, vkuznets,
	boris.ostrovsky, rafael, daniel.lezcano, juri.lelli,
	vincent.guittot, dietmar.eggemann, bsegall, mgorman, bristot,
	vschneid, linux-kernel, kvm, virtualization, linux-trace-kernel,
	linux-pm, Frederic Weisbecker

On Thu, Jan 26, 2023 at 10:28:51AM +0100, Peter Zijlstra wrote:
> On Wed, Jan 25, 2023 at 10:46:58AM -0800, Paul E. McKenney wrote:
> 
> > > Ofc. Paul might have an opinion on this glorious bodge ;-)
> > 
> > For some definition of the word "glorious", to be sure.  ;-)
> > 
> > Am I correct that you have two things happening here?  (1) Preventing
> > trace recursion and (2) forcing RCU to pay attention when needed.
> 
> Mostly just (1), we're in an error situation, I'm not too worried about
> (2).
> 
> > I cannot resist pointing out that you have re-invented RCU_NONIDLE(),
> > though avoiding much of the overhead when not needed.  ;-)
> 
> Yeah, this was the absolute minimal bodge I could come up with that
> shuts up the rcu_dereference() warning thing.
> 
> > I would have objections if this ever leaks out onto a non-error code path.
> 
> Agreed.
> 
> > There are things that need doing when RCU starts and stops watching,
> > and this approach omits those things.  Which again is OK in this case,
> > where this code is only ever executed when something is already broken,
> > but definitely *not* OK when things are not already broken.
> 
> And agreed.
> 
> Current version of the bodge looks like so (will repost the whole series
> a little later today).
> 
> I managed to tickle the recursion so that it was a test-case for the
> stack guard...
> 
> With this on, it prints just the one WARN and lives.
> 
> ---
> Subject: bug: Disable rcu_is_watching() during WARN/BUG
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Wed Jan 25 13:57:49 CET 2023
> 
> In order to avoid WARN/BUG from generating nested or even recursive
> warnings, force rcu_is_watching() to return true during
> WARN/lockdep_rcu_suspicious().
> 
> Notably, things like unwinding the stack can trigger rcu_dereference()
> warnings, which then trigger more unwinding, which triggers more
> warnings, and so on.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

From an RCU perspective:

Acked-by: Paul E. McKenney <paulmck@kernel.org>

> ---
>  include/linux/context_tracking.h |   27 +++++++++++++++++++++++++++
>  kernel/locking/lockdep.c         |    3 +++
>  kernel/panic.c                   |    5 +++++
>  lib/bug.c                        |   15 ++++++++++++++-
>  4 files changed, 49 insertions(+), 1 deletion(-)
> 
> --- a/include/linux/context_tracking.h
> +++ b/include/linux/context_tracking.h
> @@ -130,9 +130,36 @@ static __always_inline unsigned long ct_
>  	return arch_atomic_add_return(incby, this_cpu_ptr(&context_tracking.state));
>  }
>  
> +static __always_inline bool warn_rcu_enter(void)
> +{
> +	bool ret = false;
> +
> +	/*
> +	 * Horrible hack to shut up recursive RCU isn't watching fail since
> +	 * lots of the actual reporting also relies on RCU.
> +	 */
> +	preempt_disable_notrace();
> +	if (rcu_dynticks_curr_cpu_in_eqs()) {
> +		ret = true;
> +		ct_state_inc(RCU_DYNTICKS_IDX);
> +	}
> +
> +	return ret;
> +}
> +
> +static __always_inline void warn_rcu_exit(bool rcu)
> +{
> +	if (rcu)
> +		ct_state_inc(RCU_DYNTICKS_IDX);
> +	preempt_enable_notrace();
> +}
> +
>  #else
>  static inline void ct_idle_enter(void) { }
>  static inline void ct_idle_exit(void) { }
> +
> +static __always_inline bool warn_rcu_enter(void) { return false; }
> +static __always_inline void warn_rcu_exit(bool rcu) { }
>  #endif /* !CONFIG_CONTEXT_TRACKING_IDLE */
>  
>  #endif
> --- a/kernel/locking/lockdep.c
> +++ b/kernel/locking/lockdep.c
> @@ -55,6 +55,7 @@
>  #include <linux/rcupdate.h>
>  #include <linux/kprobes.h>
>  #include <linux/lockdep.h>
> +#include <linux/context_tracking.h>
>  
>  #include <asm/sections.h>
>  
> @@ -6555,6 +6556,7 @@ void lockdep_rcu_suspicious(const char *
>  {
>  	struct task_struct *curr = current;
>  	int dl = READ_ONCE(debug_locks);
> +	bool rcu = warn_rcu_enter();
>  
>  	/* Note: the following can be executed concurrently, so be careful. */
>  	pr_warn("\n");
> @@ -6595,5 +6597,6 @@ void lockdep_rcu_suspicious(const char *
>  	lockdep_print_held_locks(curr);
>  	pr_warn("\nstack backtrace:\n");
>  	dump_stack();
> +	warn_rcu_exit(rcu);
>  }
>  EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
> --- a/kernel/panic.c
> +++ b/kernel/panic.c
> @@ -34,6 +34,7 @@
>  #include <linux/ratelimit.h>
>  #include <linux/debugfs.h>
>  #include <linux/sysfs.h>
> +#include <linux/context_tracking.h>
>  #include <trace/events/error_report.h>
>  #include <asm/sections.h>
>  
> @@ -679,6 +680,7 @@ void __warn(const char *file, int line,
>  void warn_slowpath_fmt(const char *file, int line, unsigned taint,
>  		       const char *fmt, ...)
>  {
> +	bool rcu = warn_rcu_enter();
>  	struct warn_args args;
>  
>  	pr_warn(CUT_HERE);
> @@ -693,11 +695,13 @@ void warn_slowpath_fmt(const char *file,
>  	va_start(args.args, fmt);
>  	__warn(file, line, __builtin_return_address(0), taint, NULL, &args);
>  	va_end(args.args);
> +	warn_rcu_exit(rcu);
>  }
>  EXPORT_SYMBOL(warn_slowpath_fmt);
>  #else
>  void __warn_printk(const char *fmt, ...)
>  {
> +	bool rcu = warn_rcu_enter();
>  	va_list args;
>  
>  	pr_warn(CUT_HERE);
> @@ -705,6 +709,7 @@ void __warn_printk(const char *fmt, ...)
>  	va_start(args, fmt);
>  	vprintk(fmt, args);
>  	va_end(args);
> +	warn_rcu_exit(rcu);
>  }
>  EXPORT_SYMBOL(__warn_printk);
>  #endif
> --- a/lib/bug.c
> +++ b/lib/bug.c
> @@ -47,6 +47,7 @@
>  #include <linux/sched.h>
>  #include <linux/rculist.h>
>  #include <linux/ftrace.h>
> +#include <linux/context_tracking.h>
>  
>  extern struct bug_entry __start___bug_table[], __stop___bug_table[];
>  
> @@ -153,7 +154,7 @@ struct bug_entry *find_bug(unsigned long
>  	return module_find_bug(bugaddr);
>  }
>  
> -enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
> +static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *regs)
>  {
>  	struct bug_entry *bug;
>  	const char *file;
> @@ -209,6 +210,18 @@ enum bug_trap_type report_bug(unsigned l
>  	return BUG_TRAP_TYPE_BUG;
>  }
>  
> +enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
> +{
> +	enum bug_trap_type ret;
> +	bool rcu = false;
> +
> +	rcu = warn_rcu_enter();
> +	ret = __report_bug(bugaddr, regs);
> +	warn_rcu_exit(rcu);
> +
> +	return ret;
> +}
> +
>  static void clear_once_table(struct bug_entry *start, struct bug_entry *end)
>  {
>  	struct bug_entry *bug;


* [tip: sched/core] cpuidle: tracing, preempt: Squash _rcuidle tracing
  2023-01-25  9:40           ` Peter Zijlstra
@ 2023-01-31 14:22           ` tip-bot2 for Peter Zijlstra
  -1 siblings, 0 replies; 53+ messages in thread
From: tip-bot2 for Peter Zijlstra @ 2023-01-31 14:22 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Mark Rutland, Peter Zijlstra (Intel), Ingo Molnar, x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     3017ba4b831bc7fd67cc82e744116b6e45e259a4
Gitweb:        https://git.kernel.org/tip/3017ba4b831bc7fd67cc82e744116b6e45e259a4
Author:        Peter Zijlstra <peterz@infradead.org>
AuthorDate:    Tue, 31 Jan 2023 09:50:36 +01:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 31 Jan 2023 15:01:46 +01:00

cpuidle: tracing, preempt: Squash _rcuidle tracing

Extend/fix commit:

  9aedeaed6fc6 ("tracing, hardirq: No moar _rcuidle() tracing")

... to also cover trace_preempt_{on,off}() which were mysteriously
untouched.

Fixes: 9aedeaed6fc6 ("tracing, hardirq: No moar _rcuidle() tracing")
Reported-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lkml.kernel.org/r/Y9D5AfnOukWNOZ5q@hirez.programming.kicks-ass.net
Link: https://lore.kernel.org/r/Y9jWXKgkxY5EZVwW@hirez.programming.kicks-ass.net
---
 kernel/trace/trace_preemptirq.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index f992444..e37446f 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -15,10 +15,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/preemptirq.h>
 
-#ifdef CONFIG_TRACE_IRQFLAGS
-/* Per-cpu variable to prevent redundant calls when IRQs already off */
-static DEFINE_PER_CPU(int, tracing_irq_cpu);
-
 /*
  * Use regular trace points on architectures that implement noinstr
  * tooling: these calls will only happen with RCU enabled, which can
@@ -33,6 +29,10 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu);
 #define trace(point)	if (!in_nmi()) trace_##point##_rcuidle
 #endif
 
+#ifdef CONFIG_TRACE_IRQFLAGS
+/* Per-cpu variable to prevent redundant calls when IRQs already off */
+static DEFINE_PER_CPU(int, tracing_irq_cpu);
+
 /*
  * Like trace_hardirqs_on() but without the lockdep invocation. This is
  * used in the low level entry code where the ordering vs. RCU is important
@@ -100,15 +100,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off);
 
 void trace_preempt_on(unsigned long a0, unsigned long a1)
 {
-	if (!in_nmi())
-		trace_preempt_enable_rcuidle(a0, a1);
+	trace(preempt_enable)(a0, a1);
 	tracer_preempt_on(a0, a1);
 }
 
 void trace_preempt_off(unsigned long a0, unsigned long a1)
 {
-	if (!in_nmi())
-		trace_preempt_disable_rcuidle(a0, a1);
+	trace(preempt_disable)(a0, a1);
 	tracer_preempt_off(a0, a1);
 }
 #endif
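
For illustration, expanding the trace() wrapper by hand shows what the
two new call sites amount to. Only the non-noinstr definition is visible
in the hunk above; the other branch (guarded by CONFIG_ARCH_WANTS_NO_INSTR
in the upstream code, not shown here) resolves to the plain tracepoint:

#ifndef CONFIG_ARCH_WANTS_NO_INSTR	/* older architectures: _rcuidle, not NMI-safe */
void trace_preempt_on(unsigned long a0, unsigned long a1)
{
	if (!in_nmi())
		trace_preempt_enable_rcuidle(a0, a1);
	tracer_preempt_on(a0, a1);
}
#else					/* noinstr tooling: regular trace point */
void trace_preempt_on(unsigned long a0, unsigned long a1)
{
	trace_preempt_enable(a0, a1);
	tracer_preempt_on(a0, a1);
}
#endif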


end of thread, other threads:[~2023-01-31 14:23 UTC | newest]

Thread overview: 53+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-23 20:50 [PATCH 0/6] A few cpuidle vs rcu fixes Peter Zijlstra
2023-01-23 20:50 ` Peter Zijlstra
2023-01-23 20:50 ` [PATCH 1/6] x86: Always inline arch_atomic64 Peter Zijlstra
2023-01-23 20:50   ` Peter Zijlstra
2023-01-23 20:50 ` [PATCH 2/6] x86/pvclock: improve atomic update of last_value in pvclock_clocksource_read Peter Zijlstra
2023-01-23 20:50   ` Peter Zijlstra
2023-01-23 20:50 ` [PATCH 3/6] ftrace/x86: Warn and ignore graph tracing when RCU is disabled Peter Zijlstra
2023-01-23 20:50   ` Peter Zijlstra
2023-01-23 21:53   ` Steven Rostedt
2023-01-23 21:53     ` Steven Rostedt
2023-01-23 22:07     ` Steven Rostedt
2023-01-23 22:07       ` Steven Rostedt
2023-01-24 14:44       ` Peter Zijlstra
2023-01-24 14:44         ` Peter Zijlstra
2023-01-24 17:12         ` Mark Rutland
2023-01-24 17:12           ` Mark Rutland
2023-01-25  9:37           ` Peter Zijlstra
2023-01-25  9:37             ` Peter Zijlstra
2023-01-25 10:47           ` Peter Zijlstra
2023-01-25 10:47             ` Peter Zijlstra
2023-01-25 11:32             ` Mark Rutland
2023-01-25 11:32               ` Mark Rutland
2023-01-25 18:46             ` Paul E. McKenney
2023-01-26  9:28               ` Peter Zijlstra
2023-01-26  9:28                 ` Peter Zijlstra
2023-01-28 19:12                 ` Paul E. McKenney
2023-01-23 20:50 ` [PATCH 4/6] x86: Mark sched_clock() noinstr Peter Zijlstra
2023-01-23 20:50   ` Peter Zijlstra
2023-01-23 20:50 ` [PATCH 5/6] sched/clock: Make local_clock() noinstr Peter Zijlstra
2023-01-23 20:50   ` Peter Zijlstra
2023-01-23 20:50 ` [PATCH 6/6] cpuidle: Fix poll_idle() noinstr annotation Peter Zijlstra
2023-01-23 20:50   ` Peter Zijlstra
2023-01-24 14:24   ` Rafael J. Wysocki
2023-01-24 14:24     ` Rafael J. Wysocki
2023-01-24 16:34 ` [PATCH 0/6] A few cpuidle vs rcu fixes Mark Rutland
2023-01-24 16:34   ` Mark Rutland
2023-01-24 17:30   ` Mark Rutland
2023-01-24 17:30     ` Mark Rutland
2023-01-24 18:39     ` Mark Rutland
2023-01-24 18:39       ` Mark Rutland
2023-01-25  9:35       ` Peter Zijlstra
2023-01-25  9:35         ` Peter Zijlstra
2023-01-25  9:40         ` Peter Zijlstra
2023-01-25  9:40           ` Peter Zijlstra
2023-01-25 10:23           ` Mark Rutland
2023-01-25 10:23             ` Mark Rutland
2023-01-31 14:22           ` [tip: sched/core] cpuidle: tracing, preempt: Squash _rcuidle tracing tip-bot2 for Peter Zijlstra
2023-01-25  9:31   ` [PATCH 0/6] A few cpuidle vs rcu fixes Peter Zijlstra
2023-01-25  9:31     ` Peter Zijlstra
2023-01-25  9:36     ` Mark Rutland
2023-01-25  9:36       ` Mark Rutland
2023-01-25 15:20 ` Mark Rutland
2023-01-25 15:20   ` Mark Rutland
