linux-kernel.vger.kernel.org archive mirror
* [PATCH RT v3 0/5] RCU fixes
@ 2019-09-11 16:57 Scott Wood
  2019-09-11 16:57 ` [PATCH RT v3 1/5] rcu: Acquire RCU lock when disabling BHs Scott Wood
                   ` (4 more replies)
  0 siblings, 5 replies; 35+ messages in thread
From: Scott Wood @ 2019-09-11 16:57 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

With these patches, rcutorture works on PREEMPT_RT_FULL.

Scott Wood (5):
  rcu: Acquire RCU lock when disabling BHs
  sched: Rename sleeping_lock to rt_invol_sleep
  sched: migrate_dis/enable: Use rt_invol_sleep
  rcu: Disable use_softirq on PREEMPT_RT
  rcutorture: Avoid problematic critical section nesting

 include/linux/rcupdate.h   | 40 +++++++++++++++----
 include/linux/sched.h      | 19 ++++-----
 kernel/cpu.c               |  2 +
 kernel/locking/rtmutex.c   | 14 +++----
 kernel/locking/rwlock-rt.c | 16 ++++----
 kernel/rcu/rcutorture.c    | 96 +++++++++++++++++++++++++++++++++++++++-------
 kernel/rcu/tree.c          |  9 ++++-
 kernel/rcu/tree_plugin.h   |  8 ++--
 kernel/sched/core.c        |  4 ++
 kernel/softirq.c           | 14 +++++--
 kernel/time/hrtimer.c      |  4 +-
 11 files changed, 168 insertions(+), 58 deletions(-)

-- 
1.8.3.1


* [PATCH RT v3 1/5] rcu: Acquire RCU lock when disabling BHs
  2019-09-11 16:57 [PATCH RT v3 0/5] RCU fixes Scott Wood
@ 2019-09-11 16:57 ` Scott Wood
  2019-09-12 22:09   ` Joel Fernandes
  2019-09-17  7:44   ` Sebastian Andrzej Siewior
  2019-09-11 16:57 ` [PATCH RT v3 2/5] sched: Rename sleeping_lock to rt_invol_sleep Scott Wood
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 35+ messages in thread
From: Scott Wood @ 2019-09-11 16:57 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams, Scott Wood

A plain local_bh_disable() is documented as creating an RCU critical
section, and (at least) rcutorture expects this to be the case.  However,
in_softirq() doesn't block a grace period on PREEMPT_RT, since RCU checks
preempt_count() directly.  Even if RCU were changed to check
in_softirq(), that wouldn't allow blocked BH disablers to be boosted.

Fix this by calling rcu_read_lock() from local_bh_disable(), and update
rcu_read_lock_bh_held() accordingly.
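
For context, here is a minimal sketch of the kind of reader that depends on
this guarantee (hypothetical code, not from this series; the structure,
pointer, and function names are invented):

  /* Hypothetical reader relying on local_bh_disable() as an RCU critical section */
  struct foo { int val; };                 /* example structure */
  static struct foo __rcu *foo_ptr;        /* example RCU-protected pointer */

  static void example_reader(void)
  {
          struct foo *p;

          local_bh_disable();
          p = rcu_dereference_bh(foo_ptr); /* updater frees via call_rcu() */
          if (p)
                  pr_info("val=%d\n", p->val);
          local_bh_enable();               /* a grace period may now complete */
  }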

Signed-off-by: Scott Wood <swood@redhat.com>
---
v3: Remove change to rcu_read_lock_bh_held(), and move debug portions
of rcu_read_[un]lock_bh() to separate functions
---
 include/linux/rcupdate.h | 40 ++++++++++++++++++++++++++++++++--------
 kernel/softirq.c         | 12 +++++++++---
 2 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 388ace315f32..9ce7c5006a5e 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -600,6 +600,36 @@ static inline void rcu_read_unlock(void)
 	rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
 }
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+/*
+ * On RT, local_bh_disable() calls rcu_read_lock() -- no need to
+ * track it twice.
+ */
+static inline void rcu_bh_lock_acquire(void)
+{
+}
+
+static inline void rcu_bh_lock_release(void)
+{
+}
+#else
+static inline void rcu_bh_lock_acquire(void)
+{
+	__acquire(RCU_BH);
+	rcu_lock_acquire(&rcu_bh_lock_map);
+	RCU_LOCKDEP_WARN(!rcu_is_watching(),
+			 "rcu_read_lock_bh() used illegally while idle");
+}
+
+static inline void rcu_bh_lock_release(void)
+{
+	RCU_LOCKDEP_WARN(!rcu_is_watching(),
+			 "rcu_read_unlock_bh() used illegally while idle");
+	rcu_lock_release(&rcu_bh_lock_map);
+	__release(RCU_BH);
+}
+#endif
+
 /**
  * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
  *
@@ -615,10 +645,7 @@ static inline void rcu_read_unlock(void)
 static inline void rcu_read_lock_bh(void)
 {
 	local_bh_disable();
-	__acquire(RCU_BH);
-	rcu_lock_acquire(&rcu_bh_lock_map);
-	RCU_LOCKDEP_WARN(!rcu_is_watching(),
-			 "rcu_read_lock_bh() used illegally while idle");
+	rcu_bh_lock_acquire();
 }
 
 /*
@@ -628,10 +655,7 @@ static inline void rcu_read_lock_bh(void)
  */
 static inline void rcu_read_unlock_bh(void)
 {
-	RCU_LOCKDEP_WARN(!rcu_is_watching(),
-			 "rcu_read_unlock_bh() used illegally while idle");
-	rcu_lock_release(&rcu_bh_lock_map);
-	__release(RCU_BH);
+	rcu_bh_lock_release();
 	local_bh_enable();
 }
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d16d080a74f7..6080c9328df1 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -115,8 +115,10 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
 	long soft_cnt;
 
 	WARN_ON_ONCE(in_irq());
-	if (!in_atomic())
+	if (!in_atomic()) {
 		local_lock(bh_lock);
+		rcu_read_lock();
+	}
 	soft_cnt = this_cpu_inc_return(softirq_counter);
 	WARN_ON_ONCE(soft_cnt == 0);
 	current->softirq_count += SOFTIRQ_DISABLE_OFFSET;
@@ -151,8 +153,10 @@ void _local_bh_enable(void)
 #endif
 
 	current->softirq_count -= SOFTIRQ_DISABLE_OFFSET;
-	if (!in_atomic())
+	if (!in_atomic()) {
+		rcu_read_unlock();
 		local_unlock(bh_lock);
+	}
 }
 
 void _local_bh_enable_rt(void)
@@ -185,8 +189,10 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
 	WARN_ON_ONCE(count < 0);
 	local_irq_enable();
 
-	if (!in_atomic())
+	if (!in_atomic()) {
+		rcu_read_unlock();
 		local_unlock(bh_lock);
+	}
 
 	current->softirq_count -= SOFTIRQ_DISABLE_OFFSET;
 	preempt_check_resched();
-- 
1.8.3.1


* [PATCH RT v3 2/5] sched: Rename sleeping_lock to rt_invol_sleep
  2019-09-11 16:57 [PATCH RT v3 0/5] RCU fixes Scott Wood
  2019-09-11 16:57 ` [PATCH RT v3 1/5] rcu: Acquire RCU lock when disabling BHs Scott Wood
@ 2019-09-11 16:57 ` Scott Wood
  2019-09-17  7:52   ` Sebastian Andrzej Siewior
  2019-09-11 16:57 ` [PATCH RT v3 3/5] sched: migrate_dis/enable: Use rt_invol_sleep Scott Wood
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 35+ messages in thread
From: Scott Wood @ 2019-09-11 16:57 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams, Scott Wood

It's already used for one situation other than acquiring a lock, and the
next patch will add another, so change the name to avoid confusion.
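
For illustration, the renamed helpers simply bracket a sleep caused by an
RT-specific mechanism; a minimal usage sketch, mirroring the cpu_chill()
hunk below:

  /* Mark this sleep as forced by an RT mechanism rather than voluntary */
  rt_invol_sleep_inc();
  schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL_HARD);
  rt_invol_sleep_dec();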

Signed-off-by: Scott Wood <swood@redhat.com>
---
 include/linux/sched.h      | 15 ++++++++-------
 kernel/locking/rtmutex.c   | 14 +++++++-------
 kernel/locking/rwlock-rt.c | 16 ++++++++--------
 kernel/rcu/tree_plugin.h   |  6 +++---
 kernel/softirq.c           |  2 +-
 kernel/time/hrtimer.c      |  4 ++--
 6 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7e892e727f12..edc93b74f7d8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -674,7 +674,8 @@ struct task_struct {
 # endif
 #endif
 #ifdef CONFIG_PREEMPT_RT_FULL
-	int				sleeping_lock;
+	/* Task is blocking due to RT-specific mechanisms, not voluntarily */
+	int				rt_invol_sleep;
 #endif
 
 #ifdef CONFIG_PREEMPT_RCU
@@ -1882,20 +1883,20 @@ static __always_inline bool need_resched(void)
 }
 
 #ifdef CONFIG_PREEMPT_RT_FULL
-static inline void sleeping_lock_inc(void)
+static inline void rt_invol_sleep_inc(void)
 {
-	current->sleeping_lock++;
+	current->rt_invol_sleep++;
 }
 
-static inline void sleeping_lock_dec(void)
+static inline void rt_invol_sleep_dec(void)
 {
-	current->sleeping_lock--;
+	current->rt_invol_sleep--;
 }
 
 #else
 
-static inline void sleeping_lock_inc(void) { }
-static inline void sleeping_lock_dec(void) { }
+static inline void rt_invol_sleep_inc(void) { }
+static inline void rt_invol_sleep_dec(void) { }
 #endif
 
 /*
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 5ccbb45131e5..d7100586c597 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1135,7 +1135,7 @@ void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
 
 void __lockfunc rt_spin_lock(spinlock_t *lock)
 {
-	sleeping_lock_inc();
+	rt_invol_sleep_inc();
 	migrate_disable();
 	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
 	rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
@@ -1150,7 +1150,7 @@ void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
 {
-	sleeping_lock_inc();
+	rt_invol_sleep_inc();
 	migrate_disable();
 	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
 	rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
@@ -1164,7 +1164,7 @@ void __lockfunc rt_spin_unlock(spinlock_t *lock)
 	spin_release(&lock->dep_map, 1, _RET_IP_);
 	rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
 	migrate_enable();
-	sleeping_lock_dec();
+	rt_invol_sleep_dec();
 }
 EXPORT_SYMBOL(rt_spin_unlock);
 
@@ -1190,14 +1190,14 @@ int __lockfunc rt_spin_trylock(spinlock_t *lock)
 {
 	int ret;
 
-	sleeping_lock_inc();
+	rt_invol_sleep_inc();
 	migrate_disable();
 	ret = __rt_mutex_trylock(&lock->lock);
 	if (ret) {
 		spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
 	} else {
 		migrate_enable();
-		sleeping_lock_dec();
+		rt_invol_sleep_dec();
 	}
 	return ret;
 }
@@ -1210,7 +1210,7 @@ int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
 	local_bh_disable();
 	ret = __rt_mutex_trylock(&lock->lock);
 	if (ret) {
-		sleeping_lock_inc();
+		rt_invol_sleep_inc();
 		migrate_disable();
 		spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
 	} else
@@ -1226,7 +1226,7 @@ int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
 	*flags = 0;
 	ret = __rt_mutex_trylock(&lock->lock);
 	if (ret) {
-		sleeping_lock_inc();
+		rt_invol_sleep_inc();
 		migrate_disable();
 		spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
 	}
diff --git a/kernel/locking/rwlock-rt.c b/kernel/locking/rwlock-rt.c
index c3b91205161c..de025a7cc9c4 100644
--- a/kernel/locking/rwlock-rt.c
+++ b/kernel/locking/rwlock-rt.c
@@ -305,14 +305,14 @@ int __lockfunc rt_read_trylock(rwlock_t *rwlock)
 {
 	int ret;
 
-	sleeping_lock_inc();
+	rt_invol_sleep_inc();
 	migrate_disable();
 	ret = do_read_rt_trylock(rwlock);
 	if (ret) {
 		rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
 	} else {
 		migrate_enable();
-		sleeping_lock_dec();
+		rt_invol_sleep_dec();
 	}
 	return ret;
 }
@@ -322,14 +322,14 @@ int __lockfunc rt_write_trylock(rwlock_t *rwlock)
 {
 	int ret;
 
-	sleeping_lock_inc();
+	rt_invol_sleep_inc();
 	migrate_disable();
 	ret = do_write_rt_trylock(rwlock);
 	if (ret) {
 		rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
 	} else {
 		migrate_enable();
-		sleeping_lock_dec();
+		rt_invol_sleep_dec();
 	}
 	return ret;
 }
@@ -337,7 +337,7 @@ int __lockfunc rt_write_trylock(rwlock_t *rwlock)
 
 void __lockfunc rt_read_lock(rwlock_t *rwlock)
 {
-	sleeping_lock_inc();
+	rt_invol_sleep_inc();
 	migrate_disable();
 	rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
 	do_read_rt_lock(rwlock);
@@ -346,7 +346,7 @@ void __lockfunc rt_read_lock(rwlock_t *rwlock)
 
 void __lockfunc rt_write_lock(rwlock_t *rwlock)
 {
-	sleeping_lock_inc();
+	rt_invol_sleep_inc();
 	migrate_disable();
 	rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
 	do_write_rt_lock(rwlock);
@@ -358,7 +358,7 @@ void __lockfunc rt_read_unlock(rwlock_t *rwlock)
 	rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
 	do_read_rt_unlock(rwlock);
 	migrate_enable();
-	sleeping_lock_dec();
+	rt_invol_sleep_dec();
 }
 EXPORT_SYMBOL(rt_read_unlock);
 
@@ -367,7 +367,7 @@ void __lockfunc rt_write_unlock(rwlock_t *rwlock)
 	rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
 	do_write_rt_unlock(rwlock);
 	migrate_enable();
-	sleeping_lock_dec();
+	rt_invol_sleep_dec();
 }
 EXPORT_SYMBOL(rt_write_unlock);
 
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 23a54e4b649c..0da4b975cd71 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -287,15 +287,15 @@ void rcu_note_context_switch(bool preempt)
 	struct task_struct *t = current;
 	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 	struct rcu_node *rnp;
-	int sleeping_l = 0;
+	int rt_invol = 0;
 
 	barrier(); /* Avoid RCU read-side critical sections leaking down. */
 	trace_rcu_utilization(TPS("Start context switch"));
 	lockdep_assert_irqs_disabled();
 #if defined(CONFIG_PREEMPT_RT_FULL)
-	sleeping_l = t->sleeping_lock;
+	rt_invol = t->rt_invol_sleep;
 #endif
-	WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0 && !sleeping_l);
+	WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0 && !rt_invol);
 	if (t->rcu_read_lock_nesting > 0 &&
 	    !t->rcu_read_unlock_special.b.blocked) {
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 6080c9328df1..daa21a87838a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -879,7 +879,7 @@ void softirq_check_pending_idle(void)
 	 */
 	raw_spin_lock(&tsk->pi_lock);
 	if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING ||
-	    (tsk->state == TASK_UNINTERRUPTIBLE && tsk->sleeping_lock)) {
+	    (tsk->state == TASK_UNINTERRUPTIBLE && tsk->rt_invol_sleep)) {
 		okay = true;
 	}
 	raw_spin_unlock(&tsk->pi_lock);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5eb45a868de9..0e6e2dcf6fa4 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1863,9 +1863,9 @@ void cpu_chill(void)
 	chill_time = ktime_set(0, NSEC_PER_MSEC);
 
 	current->flags |= PF_NOFREEZE;
-	sleeping_lock_inc();
+	rt_invol_sleep_inc();
 	schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL_HARD);
-	sleeping_lock_dec();
+	rt_invol_sleep_dec();
 	if (!freeze_flag)
 		current->flags &= ~PF_NOFREEZE;
 
-- 
1.8.3.1


* [PATCH RT v3 3/5] sched: migrate_dis/enable: Use rt_invol_sleep
  2019-09-11 16:57 [PATCH RT v3 0/5] RCU fixes Scott Wood
  2019-09-11 16:57 ` [PATCH RT v3 1/5] rcu: Acquire RCU lock when disabling BHs Scott Wood
  2019-09-11 16:57 ` [PATCH RT v3 2/5] sched: Rename sleeping_lock to rt_invol_sleep Scott Wood
@ 2019-09-11 16:57 ` Scott Wood
  2019-09-17  7:59   ` Sebastian Andrzej Siewior
  2019-09-11 16:57 ` [PATCH RT v3 4/5] rcu: Disable use_softirq on PREEMPT_RT Scott Wood
  2019-09-11 16:57 ` [PATCH RT v3 5/5] rcutorture: Avoid problematic critical section nesting on RT Scott Wood
  4 siblings, 1 reply; 35+ messages in thread
From: Scott Wood @ 2019-09-11 16:57 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams, Scott Wood

Without this, rcu_note_context_switch() will complain if an RCU read lock
is held when migrate_enable() calls stop_one_cpu().  Likewise when
migrate_disable() calls pin_current_cpu() which calls __read_rt_lock() --
which bypasses the part of the mutex code that calls rt_invol_sleep_inc().
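
A simplified, hypothetical sketch of the sequence that trips the warning
(some_lock is an invented name; on RT, spin_lock()/spin_unlock() map to the
rtmutex-based code that does migrate_disable()/migrate_enable()):

  rcu_read_lock();
  spin_lock(&some_lock);        /* RT: migrate_disable() under the hood */
  /* ... */
  spin_unlock(&some_lock);      /* RT: migrate_enable(); may sleep in
                                 * stop_one_cpu() waiting for the stopper
                                 * thread.  Without the rt_invol_sleep markers,
                                 * rcu_note_context_switch() sees
                                 * rcu_read_lock_nesting > 0 and WARNs.
                                 */
  rcu_read_unlock();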

Signed-off-by: Scott Wood <swood@redhat.com>
---
v3: Add to pin_current_cpu as well

 include/linux/sched.h    | 4 ++--
 kernel/cpu.c             | 2 ++
 kernel/rcu/tree_plugin.h | 2 +-
 kernel/sched/core.c      | 4 ++++
 4 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index edc93b74f7d8..ecf5cbb23335 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -673,7 +673,7 @@ struct task_struct {
 	int				migrate_disable_atomic;
 # endif
 #endif
-#ifdef CONFIG_PREEMPT_RT_FULL
+#ifdef CONFIG_PREEMPT_RT_BASE
 	/* Task is blocking due to RT-specific mechanisms, not voluntarily */
 	int				rt_invol_sleep;
 #endif
@@ -1882,7 +1882,7 @@ static __always_inline bool need_resched(void)
 	return unlikely(tif_need_resched());
 }
 
-#ifdef CONFIG_PREEMPT_RT_FULL
+#ifdef CONFIG_PREEMPT_RT_BASE
 static inline void rt_invol_sleep_inc(void)
 {
 	current->rt_invol_sleep++;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 885a195dfbe0..32c6175b63b6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -308,7 +308,9 @@ void pin_current_cpu(void)
 	preempt_lazy_enable();
 	preempt_enable();
 
+	rt_invol_sleep_inc();
 	__read_rt_lock(cpuhp_pin);
+	rt_invol_sleep_dec();
 
 	preempt_disable();
 	preempt_lazy_disable();
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0da4b975cd71..6d92dafeeca5 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -292,7 +292,7 @@ void rcu_note_context_switch(bool preempt)
 	barrier(); /* Avoid RCU read-side critical sections leaking down. */
 	trace_rcu_utilization(TPS("Start context switch"));
 	lockdep_assert_irqs_disabled();
-#if defined(CONFIG_PREEMPT_RT_FULL)
+#if defined(CONFIG_PREEMPT_RT_BASE)
 	rt_invol = t->rt_invol_sleep;
 #endif
 	WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0 && !rt_invol);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e1bdd7f9be05..a151332474d8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7405,7 +7405,11 @@ void migrate_enable(void)
 			unpin_current_cpu();
 			preempt_lazy_enable();
 			preempt_enable();
+
+			rt_invol_sleep_inc();
 			stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
+			rt_invol_sleep_dec();
+
 			return;
 		}
 	}
-- 
1.8.3.1


* [PATCH RT v3 4/5] rcu: Disable use_softirq on PREEMPT_RT
  2019-09-11 16:57 [PATCH RT v3 0/5] RCU fixes Scott Wood
                   ` (2 preceding siblings ...)
  2019-09-11 16:57 ` [PATCH RT v3 3/5] sched: migrate_dis/enable: Use rt_invol_sleep Scott Wood
@ 2019-09-11 16:57 ` Scott Wood
  2019-09-12 21:38   ` Joel Fernandes
  2019-09-17 14:08   ` Scott Wood
  2019-09-11 16:57 ` [PATCH RT v3 5/5] rcutorture: Avoid problematic critical section nesting on RT Scott Wood
  4 siblings, 2 replies; 35+ messages in thread
From: Scott Wood @ 2019-09-11 16:57 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams, Scott Wood

Besides restoring behavior that used to be default on RT, this avoids
a deadlock on scheduler locks:

[  136.894657] 039: ============================================
[  136.900401] 039: WARNING: possible recursive locking detected
[  136.906146] 039: 5.2.9-rt3.dbg+ #174 Tainted: G            E
[  136.912152] 039: --------------------------------------------
[  136.917896] 039: rcu_torture_rea/13474 is trying to acquire lock:
[  136.923990] 039: 000000005f25146d
[  136.927310] 039:  (
[  136.929414] 039: &p->pi_lock
[  136.932303] 039: ){-...}
[  136.934840] 039: , at: try_to_wake_up+0x39/0x920
[  136.939461] 039:
but task is already holding lock:
[  136.944425] 039: 000000005f25146d
[  136.947745] 039:  (
[  136.949852] 039: &p->pi_lock
[  136.952738] 039: ){-...}
[  136.955274] 039: , at: try_to_wake_up+0x39/0x920
[  136.959895] 039:
other info that might help us debug this:
[  136.965555] 039:  Possible unsafe locking scenario:

[  136.970608] 039:        CPU0
[  136.973493] 039:        ----
[  136.976378] 039:   lock(
[  136.978918] 039: &p->pi_lock
[  136.981806] 039: );
[  136.983911] 039:   lock(
[  136.986451] 039: &p->pi_lock
[  136.989336] 039: );
[  136.991444] 039:
 *** DEADLOCK ***

[  136.995194] 039:  May be due to missing lock nesting notation

[  137.001115] 039: 3 locks held by rcu_torture_rea/13474:
[  137.006341] 039:  #0:
[  137.008707] 039: 000000005f25146d
[  137.012024] 039:  (
[  137.014131] 039: &p->pi_lock
[  137.017015] 039: ){-...}
[  137.019558] 039: , at: try_to_wake_up+0x39/0x920
[  137.024175] 039:  #1:
[  137.026540] 039: 0000000011c8e51d
[  137.029859] 039:  (
[  137.031966] 039: &rq->lock
[  137.034679] 039: ){-...}
[  137.037217] 039: , at: try_to_wake_up+0x241/0x920
[  137.041924] 039:  #2:
[  137.044291] 039: 00000000098649b9
[  137.047610] 039:  (
[  137.049714] 039: rcu_read_lock
[  137.052774] 039: ){....}
[  137.055314] 039: , at: cpuacct_charge+0x33/0x1e0
[  137.059934] 039:
stack backtrace:
[  137.063425] 039: CPU: 39 PID: 13474 Comm: rcu_torture_rea Kdump: loaded Tainted: G            E     5.2.9-rt3.dbg+ #174
[  137.074197] 039: Hardware name: Intel Corporation S2600BT/S2600BT, BIOS SE5C620.86B.01.00.0763.022420181017 02/24/2018
[  137.084886] 039: Call Trace:
[  137.087773] 039:  <IRQ>
[  137.090226] 039:  dump_stack+0x5e/0x8b
[  137.093997] 039:  __lock_acquire+0x725/0x1100
[  137.098358] 039:  lock_acquire+0xc0/0x240
[  137.102374] 039:  ? try_to_wake_up+0x39/0x920
[  137.106737] 039:  _raw_spin_lock_irqsave+0x47/0x90
[  137.111534] 039:  ? try_to_wake_up+0x39/0x920
[  137.115910] 039:  try_to_wake_up+0x39/0x920
[  137.120098] 039:  rcu_read_unlock_special+0x65/0xb0
[  137.124977] 039:  __rcu_read_unlock+0x5d/0x70
[  137.129337] 039:  cpuacct_charge+0xd9/0x1e0
[  137.133522] 039:  ? cpuacct_charge+0x33/0x1e0
[  137.137880] 039:  update_curr+0x14b/0x420
[  137.141894] 039:  enqueue_entity+0x42/0x370
[  137.146080] 039:  enqueue_task_fair+0xa9/0x490
[  137.150528] 039:  activate_task+0x5a/0xf0
[  137.154539] 039:  ttwu_do_activate+0x4e/0x90
[  137.158813] 039:  try_to_wake_up+0x277/0x920
[  137.163086] 039:  irq_exit+0xb6/0xf0
[  137.166661] 039:  smp_apic_timer_interrupt+0xe3/0x3a0
[  137.171714] 039:  apic_timer_interrupt+0xf/0x20
[  137.176249] 039:  </IRQ>
[  137.178785] 039: RIP: 0010:__schedule+0x0/0x8e0
[  137.183319] 039: Code: 00 02 48 89 43 20 e8 0f 5a 00 00 48 8d 7b 28 e8 86 f2 fd ff 31 c0 5b 5d 41 5c c3 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 <55> 48 89 e5 41 57 41 56 49 c7 c6 c0 ca 1e 00 41 55 41 89 fd 41 54
[  137.202498] 039: RSP: 0018:ffffc9005835fbc0 EFLAGS: 00000246
[  137.208158] 039:  ORIG_RAX: ffffffffffffff13
[  137.212428] 039: RAX: 0000000000000000 RBX: ffff8897c3e1bb00 RCX: 0000000000000001
[  137.219994] 039: RDX: 0000000080004008 RSI: 0000000000000006 RDI: 0000000000000001
[  137.227560] 039: RBP: ffff8897c3e1bb00 R08: 0000000000000000 R09: 0000000000000000
[  137.235126] 039: R10: 0000000000000001 R11: 0000000000000001 R12: ffffffff81001fd1
[  137.242694] 039: R13: 0000000000000044 R14: 0000000000000000 R15: ffffc9005835fcac
[  137.250259] 039:  ? ___preempt_schedule+0x16/0x18
[  137.254969] 039:  preempt_schedule_common+0x32/0x80
[  137.259846] 039:  ___preempt_schedule+0x16/0x18
[  137.264379] 039:  rcutorture_one_extend+0x33a/0x510 [rcutorture]
[  137.270397] 039:  rcu_torture_one_read+0x18c/0x450 [rcutorture]
[  137.276334] 039:  rcu_torture_reader+0xac/0x1f0 [rcutorture]
[  137.281998] 039:  ? rcu_torture_reader+0x1f0/0x1f0 [rcutorture]
[  137.287920] 039:  kthread+0x106/0x140
[  137.291591] 039:  ? rcu_torture_one_read+0x450/0x450 [rcutorture]
[  137.297681] 039:  ? kthread_bind+0x10/0x10
[  137.301783] 039:  ret_from_fork+0x3a/0x50

Signed-off-by: Scott Wood <swood@redhat.com>
---
The prohibition on use_softirq should be able to be dropped once RT gets
the latest RCU code, but the question of what use_softirq should default
to on PREEMPT_RT remains.
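
For clarity, an illustrative expansion of what the IS_ENABLED() default works
out to (not literal patch code):

  #if IS_ENABLED(CONFIG_PREEMPT_RT_FULL)
  static bool use_softirq;              /* false: per-CPU rcuc kthreads */
  #else
  static bool use_softirq = true;       /* RCU_SOFTIRQ, the historical default */
  #endif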

v3: Use IS_ENABLED
---
 kernel/rcu/tree.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index fc8b00c61b32..ee0a5ec2c30f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -98,9 +98,14 @@ struct rcu_state rcu_state = {
 /* Dump rcu_node combining tree at boot to verify correct setup. */
 static bool dump_tree;
 module_param(dump_tree, bool, 0444);
-/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
-static bool use_softirq = 1;
+/*
+ * By default, use RCU_SOFTIRQ instead of rcuc kthreads.
+ * But, avoid RCU_SOFTIRQ on PREEMPT_RT due to pi/rq deadlocks.
+ */
+static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
+#ifndef CONFIG_PREEMPT_RT_FULL
 module_param(use_softirq, bool, 0444);
+#endif
 /* Control rcu_node-tree auto-balancing at boot time. */
 static bool rcu_fanout_exact;
 module_param(rcu_fanout_exact, bool, 0444);
-- 
1.8.3.1


* [PATCH RT v3 5/5] rcutorture: Avoid problematic critical section nesting on RT
  2019-09-11 16:57 [PATCH RT v3 0/5] RCU fixes Scott Wood
                   ` (3 preceding siblings ...)
  2019-09-11 16:57 ` [PATCH RT v3 4/5] rcu: Disable use_softirq on PREEMPT_RT Scott Wood
@ 2019-09-11 16:57 ` Scott Wood
  2019-09-12 22:17   ` Joel Fernandes
  4 siblings, 1 reply; 35+ messages in thread
From: Scott Wood @ 2019-09-11 16:57 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams, Scott Wood

rcutorture was generating some nesting scenarios that are not
reasonable.  Constrain the state selection to avoid them.

Example #1:

1. preempt_disable()
2. local_bh_disable()
3. preempt_enable()
4. local_bh_enable()

On PREEMPT_RT, BH disabling takes a local lock only when called in
non-atomic context.  Thus, atomic context must be retained until after BH
is re-enabled.  Likewise, if BH is initially disabled in non-atomic
context, it cannot be re-enabled in atomic context.
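
In other words (illustrative sketch, not patch code), the BH-disabled region
must stay on one side of the atomic boundary:

  /* OK on PREEMPT_RT: BH section fully inside the atomic section */
  preempt_disable();
  local_bh_disable();   /* atomic: only the softirq counter is bumped */
  local_bh_enable();
  preempt_enable();

  /* Broken on PREEMPT_RT (the sequence above): entered atomically, exited non-atomically */
  preempt_disable();
  local_bh_disable();   /* atomic: the bh_lock local lock is NOT taken */
  preempt_enable();
  local_bh_enable();    /* non-atomic: tries to release a lock it never took */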

Example #2:

1. rcu_read_lock()
2. local_irq_disable()
3. rcu_read_unlock()
4. local_irq_enable()

If the thread is preempted between steps 1 and 2,
rcu_read_unlock_special.b.blocked will be set, but it won't be
acted on in step 3 because IRQs are disabled.  Thus, reporting of the
quiescent state will be delayed beyond the local_irq_enable().
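
By contrast (sketch only), keeping the unlock outside the IRQ-disabled region
lets the deferred quiescent state be reported without that delay:

  rcu_read_lock();
  /* preemption here sets rcu_read_unlock_special.b.blocked */
  local_irq_disable();
  /* ... */
  local_irq_enable();
  rcu_read_unlock();    /* unlock-special handling can run with IRQs enabled */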

For now, these scenarios will continue to be tested on non-PREEMPT_RT
kernels, until debug checks are added to ensure that they are not
happening elsewhere.

Signed-off-by: Scott Wood <swood@redhat.com>
---
v3: Limit to RT kernels, and remove one constraint that, while it
is bad on both RT and non-RT (missing a schedule), does not oops or
otherwise prevent using rcutorture.  It would be added once debug checks
are implemented.

 kernel/rcu/rcutorture.c | 96 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 82 insertions(+), 14 deletions(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index efaa5b3f4d3f..ecb82cc432af 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -60,10 +60,13 @@
 #define RCUTORTURE_RDR_RBH	 0x08	/*  ... rcu_read_lock_bh(). */
 #define RCUTORTURE_RDR_SCHED	 0x10	/*  ... rcu_read_lock_sched(). */
 #define RCUTORTURE_RDR_RCU	 0x20	/*  ... entering another RCU reader. */
-#define RCUTORTURE_RDR_NBITS	 6	/* Number of bits defined above. */
+#define RCUTORTURE_RDR_ATOM_BH	 0x40	/*  ... disabling bh while atomic */
+#define RCUTORTURE_RDR_ATOM_RBH	 0x80	/*  ... RBH while atomic */
+#define RCUTORTURE_RDR_NBITS	 8	/* Number of bits defined above. */
 #define RCUTORTURE_MAX_EXTEND	 \
 	(RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \
-	 RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED)
+	 RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED | \
+	 RCUTORTURE_RDR_ATOM_BH | RCUTORTURE_RDR_ATOM_RBH)
 #define RCUTORTURE_RDR_MAX_LOOPS 0x7	/* Maximum reader extensions. */
 					/* Must be power of two minus one. */
 #define RCUTORTURE_RDR_MAX_SEGS (RCUTORTURE_RDR_MAX_LOOPS + 3)
@@ -1092,31 +1095,52 @@ static void rcutorture_one_extend(int *readstate, int newstate,
 	WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1);
 	rtrsp->rt_readstate = newstate;
 
-	/* First, put new protection in place to avoid critical-section gap. */
+	/*
+	 * First, put new protection in place to avoid critical-section gap.
+	 * Disable preemption around the ATOM disables to ensure that
+	 * in_atomic() is true.
+	 */
 	if (statesnew & RCUTORTURE_RDR_BH)
 		local_bh_disable();
+	if (statesnew & RCUTORTURE_RDR_RBH)
+		rcu_read_lock_bh();
 	if (statesnew & RCUTORTURE_RDR_IRQ)
 		local_irq_disable();
 	if (statesnew & RCUTORTURE_RDR_PREEMPT)
 		preempt_disable();
-	if (statesnew & RCUTORTURE_RDR_RBH)
-		rcu_read_lock_bh();
 	if (statesnew & RCUTORTURE_RDR_SCHED)
 		rcu_read_lock_sched();
+	preempt_disable();
+	if (statesnew & RCUTORTURE_RDR_ATOM_BH)
+		local_bh_disable();
+	if (statesnew & RCUTORTURE_RDR_ATOM_RBH)
+		rcu_read_lock_bh();
+	preempt_enable();
 	if (statesnew & RCUTORTURE_RDR_RCU)
 		idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT;
 
-	/* Next, remove old protection, irq first due to bh conflict. */
+	/*
+	 * Next, remove old protection, in decreasing order of strength
+	 * to avoid unlock paths that aren't safe in the stronger
+	 * context.  Disable preemption around the ATOM enables in
+	 * case the context was only atomic due to IRQ disabling.
+	 */
+	preempt_disable();
 	if (statesold & RCUTORTURE_RDR_IRQ)
 		local_irq_enable();
-	if (statesold & RCUTORTURE_RDR_BH)
+	if (statesold & RCUTORTURE_RDR_ATOM_BH)
 		local_bh_enable();
+	if (statesold & RCUTORTURE_RDR_ATOM_RBH)
+		rcu_read_unlock_bh();
+	preempt_enable();
 	if (statesold & RCUTORTURE_RDR_PREEMPT)
 		preempt_enable();
-	if (statesold & RCUTORTURE_RDR_RBH)
-		rcu_read_unlock_bh();
 	if (statesold & RCUTORTURE_RDR_SCHED)
 		rcu_read_unlock_sched();
+	if (statesold & RCUTORTURE_RDR_BH)
+		local_bh_enable();
+	if (statesold & RCUTORTURE_RDR_RBH)
+		rcu_read_unlock_bh();
 	if (statesold & RCUTORTURE_RDR_RCU)
 		cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT);
 
@@ -1152,6 +1176,12 @@ static int rcutorture_extend_mask_max(void)
 	int mask = rcutorture_extend_mask_max();
 	unsigned long randmask1 = torture_random(trsp) >> 8;
 	unsigned long randmask2 = randmask1 >> 3;
+	unsigned long preempts = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+	unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ;
+	unsigned long nonatomic_bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
+	unsigned long atomic_bhs = RCUTORTURE_RDR_ATOM_BH |
+				   RCUTORTURE_RDR_ATOM_RBH;
+	unsigned long tmp;
 
 	WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT);
 	/* Mostly only one bit (need preemption!), sometimes lots of bits. */
@@ -1159,11 +1189,49 @@ static int rcutorture_extend_mask_max(void)
 		mask = mask & randmask2;
 	else
 		mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS));
-	/* Can't enable bh w/irq disabled. */
-	if ((mask & RCUTORTURE_RDR_IRQ) &&
-	    ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) ||
-	     (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH))))
-		mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
+
+	/*
+	 * Can't enable bh w/irq disabled.
+	 */
+	tmp = atomic_bhs | nonatomic_bhs;
+	if (mask & RCUTORTURE_RDR_IRQ)
+		mask |= oldmask & tmp;
+
+	/*
+	 * Ideally these sequences would be detected in debug builds
+	 * (regardless of RT), but until then don't stop testing
+	 * them on non-RT.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
+		/*
+		 * Can't release the outermost rcu lock in an irq disabled
+		 * section without preemption also being disabled, if irqs
+		 * had ever been enabled during this RCU critical section
+		 * (could leak a special flag and delay reporting the qs).
+		 */
+		if ((oldmask & RCUTORTURE_RDR_RCU) &&
+		    (mask & RCUTORTURE_RDR_IRQ) &&
+		    !(mask & preempts))
+			mask |= RCUTORTURE_RDR_RCU;
+
+		/* Can't modify atomic bh in non-atomic context */
+		if ((oldmask & atomic_bhs) && (mask & atomic_bhs) &&
+		    !(mask & preempts_irq)) {
+			mask |= oldmask & preempts_irq;
+			if (mask & RCUTORTURE_RDR_IRQ)
+				mask |= oldmask & tmp;
+		}
+		if ((mask & atomic_bhs) && !(mask & preempts_irq))
+			mask |= RCUTORTURE_RDR_PREEMPT;
+
+		/* Can't modify non-atomic bh in atomic context */
+		tmp = nonatomic_bhs;
+		if (oldmask & preempts_irq)
+			mask &= ~tmp;
+		if ((oldmask | mask) & preempts_irq)
+			mask |= oldmask & tmp;
+	}
+
 	return mask ?: RCUTORTURE_RDR_RCU;
 }
 
-- 
1.8.3.1


* Re: [PATCH RT v3 4/5] rcu: Disable use_softirq on PREEMPT_RT
  2019-09-11 16:57 ` [PATCH RT v3 4/5] rcu: Disable use_softirq on PREEMPT_RT Scott Wood
@ 2019-09-12 21:38   ` Joel Fernandes
  2019-09-12 22:19     ` Joel Fernandes
  2019-09-17  9:31     ` Sebastian Andrzej Siewior
  2019-09-17 14:08   ` Scott Wood
  1 sibling, 2 replies; 35+ messages in thread
From: Joel Fernandes @ 2019-09-12 21:38 UTC (permalink / raw)
  To: Scott Wood
  Cc: Sebastian Andrzej Siewior, linux-rt-users, linux-kernel,
	Paul E . McKenney, Thomas Gleixner, Steven Rostedt,
	Peter Zijlstra, Juri Lelli, Clark Williams, rcu

Hi Scott,

Would you mind CC'ing rcu@vger.kernel.org on RCU related patches? I added it
for this time.

On Wed, Sep 11, 2019 at 05:57:28PM +0100, Scott Wood wrote:
> Besides restoring behavior that used to be default on RT, this avoids
> a deadlock on scheduler locks:
> 
> [  136.894657] 039: ============================================
> [  136.900401] 039: WARNING: possible recursive locking detected
> [  136.906146] 039: 5.2.9-rt3.dbg+ #174 Tainted: G            E
> [  136.912152] 039: --------------------------------------------
> [  136.917896] 039: rcu_torture_rea/13474 is trying to acquire lock:
> [  136.923990] 039: 000000005f25146d
> [  136.927310] 039:  (
> [  136.929414] 039: &p->pi_lock
> [  136.932303] 039: ){-...}
> [  136.934840] 039: , at: try_to_wake_up+0x39/0x920
> [  136.939461] 039:
> but task is already holding lock:
> [  136.944425] 039: 000000005f25146d
> [  136.947745] 039:  (
> [  136.949852] 039: &p->pi_lock
> [  136.952738] 039: ){-...}
> [  136.955274] 039: , at: try_to_wake_up+0x39/0x920
> [  136.959895] 039:
> other info that might help us debug this:
> [  136.965555] 039:  Possible unsafe locking scenario:
> 
> [  136.970608] 039:        CPU0
> [  136.973493] 039:        ----
> [  136.976378] 039:   lock(
> [  136.978918] 039: &p->pi_lock
> [  136.981806] 039: );
> [  136.983911] 039:   lock(
> [  136.986451] 039: &p->pi_lock
> [  136.989336] 039: );
> [  136.991444] 039:
>  *** DEADLOCK ***
> 
> [  136.995194] 039:  May be due to missing lock nesting notation
> 
> [  137.001115] 039: 3 locks held by rcu_torture_rea/13474:
> [  137.006341] 039:  #0:
> [  137.008707] 039: 000000005f25146d
> [  137.012024] 039:  (
> [  137.014131] 039: &p->pi_lock
> [  137.017015] 039: ){-...}
> [  137.019558] 039: , at: try_to_wake_up+0x39/0x920
> [  137.024175] 039:  #1:
> [  137.026540] 039: 0000000011c8e51d
> [  137.029859] 039:  (
> [  137.031966] 039: &rq->lock
> [  137.034679] 039: ){-...}
> [  137.037217] 039: , at: try_to_wake_up+0x241/0x920
> [  137.041924] 039:  #2:
> [  137.044291] 039: 00000000098649b9
> [  137.047610] 039:  (
> [  137.049714] 039: rcu_read_lock
> [  137.052774] 039: ){....}
> [  137.055314] 039: , at: cpuacct_charge+0x33/0x1e0
> [  137.059934] 039:
> stack backtrace:
> [  137.063425] 039: CPU: 39 PID: 13474 Comm: rcu_torture_rea Kdump: loaded Tainted: G            E     5.2.9-rt3.dbg+ #174
> [  137.074197] 039: Hardware name: Intel Corporation S2600BT/S2600BT, BIOS SE5C620.86B.01.00.0763.022420181017 02/24/2018
> [  137.084886] 039: Call Trace:
> [  137.087773] 039:  <IRQ>
> [  137.090226] 039:  dump_stack+0x5e/0x8b
> [  137.093997] 039:  __lock_acquire+0x725/0x1100
> [  137.098358] 039:  lock_acquire+0xc0/0x240
> [  137.102374] 039:  ? try_to_wake_up+0x39/0x920
> [  137.106737] 039:  _raw_spin_lock_irqsave+0x47/0x90
> [  137.111534] 039:  ? try_to_wake_up+0x39/0x920
> [  137.115910] 039:  try_to_wake_up+0x39/0x920
> [  137.120098] 039:  rcu_read_unlock_special+0x65/0xb0
> [  137.124977] 039:  __rcu_read_unlock+0x5d/0x70
> [  137.129337] 039:  cpuacct_charge+0xd9/0x1e0
> [  137.133522] 039:  ? cpuacct_charge+0x33/0x1e0
> [  137.137880] 039:  update_curr+0x14b/0x420
> [  137.141894] 039:  enqueue_entity+0x42/0x370
> [  137.146080] 039:  enqueue_task_fair+0xa9/0x490
> [  137.150528] 039:  activate_task+0x5a/0xf0
> [  137.154539] 039:  ttwu_do_activate+0x4e/0x90
> [  137.158813] 039:  try_to_wake_up+0x277/0x920
> [  137.163086] 039:  irq_exit+0xb6/0xf0
> [  137.166661] 039:  smp_apic_timer_interrupt+0xe3/0x3a0
> [  137.171714] 039:  apic_timer_interrupt+0xf/0x20
> [  137.176249] 039:  </IRQ>
> [  137.178785] 039: RIP: 0010:__schedule+0x0/0x8e0
> [  137.183319] 039: Code: 00 02 48 89 43 20 e8 0f 5a 00 00 48 8d 7b 28 e8 86 f2 fd ff 31 c0 5b 5d 41 5c c3 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 <55> 48 89 e5 41 57 41 56 49 c7 c6 c0 ca 1e 00 41 55 41 89 fd 41 54
> [  137.202498] 039: RSP: 0018:ffffc9005835fbc0 EFLAGS: 00000246
> [  137.208158] 039:  ORIG_RAX: ffffffffffffff13
> [  137.212428] 039: RAX: 0000000000000000 RBX: ffff8897c3e1bb00 RCX: 0000000000000001
> [  137.219994] 039: RDX: 0000000080004008 RSI: 0000000000000006 RDI: 0000000000000001
> [  137.227560] 039: RBP: ffff8897c3e1bb00 R08: 0000000000000000 R09: 0000000000000000
> [  137.235126] 039: R10: 0000000000000001 R11: 0000000000000001 R12: ffffffff81001fd1
> [  137.242694] 039: R13: 0000000000000044 R14: 0000000000000000 R15: ffffc9005835fcac
> [  137.250259] 039:  ? ___preempt_schedule+0x16/0x18
> [  137.254969] 039:  preempt_schedule_common+0x32/0x80
> [  137.259846] 039:  ___preempt_schedule+0x16/0x18
> [  137.264379] 039:  rcutorture_one_extend+0x33a/0x510 [rcutorture]
> [  137.270397] 039:  rcu_torture_one_read+0x18c/0x450 [rcutorture]
> [  137.276334] 039:  rcu_torture_reader+0xac/0x1f0 [rcutorture]
> [  137.281998] 039:  ? rcu_torture_reader+0x1f0/0x1f0 [rcutorture]
> [  137.287920] 039:  kthread+0x106/0x140
> [  137.291591] 039:  ? rcu_torture_one_read+0x450/0x450 [rcutorture]
> [  137.297681] 039:  ? kthread_bind+0x10/0x10
> [  137.301783] 039:  ret_from_fork+0x3a/0x50
> 
> Signed-off-by: Scott Wood <swood@redhat.com>
> ---
> The prohibition on use_softirq should be able to be dropped once RT gets
> the latest RCU code, but the question of what use_softirq should default
> to on PREEMPT_RT remains.
> 
> v3: Use IS_ENABLED

Out of curiosity, does PREEMPT_RT use the NOCB callback offloading? If no,
should it use it? IIUC, that reduces the work the softirq has to do, since
the callbacks are executed in threaded context.

If yes, can RT tolerate use_softirq=false and what could a realistic softirq
running/completion time be that PREEMPT_RT can tolerate? I guess that can be
answered by running rcuperf on PREEMPT_RT with a NOCB configuration and
measuring softirq worst-case start/completion times.

I could run these tests myself but I am vacation for the next week or so.

thanks,

 - Joel


> ---
>  kernel/rcu/tree.c | 9 +++++++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index fc8b00c61b32..ee0a5ec2c30f 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -98,9 +98,14 @@ struct rcu_state rcu_state = {
>  /* Dump rcu_node combining tree at boot to verify correct setup. */
>  static bool dump_tree;
>  module_param(dump_tree, bool, 0444);
> -/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
> -static bool use_softirq = 1;
> +/*
> + * By default, use RCU_SOFTIRQ instead of rcuc kthreads.
> + * But, avoid RCU_SOFTIRQ on PREEMPT_RT due to pi/rq deadlocks.
> + */
> +static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
> +#ifndef CONFIG_PREEMPT_RT_FULL
>  module_param(use_softirq, bool, 0444);
> +#endif
>  /* Control rcu_node-tree auto-balancing at boot time. */
>  static bool rcu_fanout_exact;
>  module_param(rcu_fanout_exact, bool, 0444);
> -- 
> 1.8.3.1
> 

* Re: [PATCH RT v3 1/5] rcu: Acquire RCU lock when disabling BHs
  2019-09-11 16:57 ` [PATCH RT v3 1/5] rcu: Acquire RCU lock when disabling BHs Scott Wood
@ 2019-09-12 22:09   ` Joel Fernandes
  2019-09-17  7:44   ` Sebastian Andrzej Siewior
  1 sibling, 0 replies; 35+ messages in thread
From: Joel Fernandes @ 2019-09-12 22:09 UTC (permalink / raw)
  To: Scott Wood
  Cc: Sebastian Andrzej Siewior, linux-rt-users, linux-kernel,
	Paul E . McKenney, Thomas Gleixner, Steven Rostedt,
	Peter Zijlstra, Juri Lelli, Clark Williams

On Wed, Sep 11, 2019 at 05:57:25PM +0100, Scott Wood wrote:
> A plain local_bh_disable() is documented as creating an RCU critical
> section, and (at least) rcutorture expects this to be the case.  However,
> in_softirq() doesn't block a grace period on PREEMPT_RT, since RCU checks
> preempt_count() directly.  Even if RCU were changed to check
> in_softirq(), that wouldn't allow blocked BH disablers to be boosted.
> 
> Fix this by calling rcu_read_lock() from local_bh_disable(), and update
> rcu_read_lock_bh_held() accordingly.
> 
> Signed-off-by: Scott Wood <swood@redhat.com>

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>

thanks,

 - Joel

> ---
> v3: Remove change to rcu_read_lock_bh_held(), and move debug portions
> of rcu_read_[un]lock_bh() to separate functions
> ---
>  include/linux/rcupdate.h | 40 ++++++++++++++++++++++++++++++++--------
>  kernel/softirq.c         | 12 +++++++++---
>  2 files changed, 41 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 388ace315f32..9ce7c5006a5e 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -600,6 +600,36 @@ static inline void rcu_read_unlock(void)
>  	rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
>  }
>  
> +#ifdef CONFIG_PREEMPT_RT_FULL
> +/*
> + * On RT, local_bh_disable() calls rcu_read_lock() -- no need to
> + * track it twice.
> + */
> +static inline void rcu_bh_lock_acquire(void)
> +{
> +}
> +
> +static inline void rcu_bh_lock_release(void)
> +{
> +}
> +#else
> +static inline void rcu_bh_lock_acquire(void)
> +{
> +	__acquire(RCU_BH);
> +	rcu_lock_acquire(&rcu_bh_lock_map);
> +	RCU_LOCKDEP_WARN(!rcu_is_watching(),
> +			 "rcu_read_lock_bh() used illegally while idle");
> +}
> +
> +static inline void rcu_bh_lock_release(void)
> +{
> +	RCU_LOCKDEP_WARN(!rcu_is_watching(),
> +			 "rcu_read_unlock_bh() used illegally while idle");
> +	rcu_lock_release(&rcu_bh_lock_map);
> +	__release(RCU_BH);
> +}
> +#endif
> +
>  /**
>   * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
>   *
> @@ -615,10 +645,7 @@ static inline void rcu_read_unlock(void)
>  static inline void rcu_read_lock_bh(void)
>  {
>  	local_bh_disable();
> -	__acquire(RCU_BH);
> -	rcu_lock_acquire(&rcu_bh_lock_map);
> -	RCU_LOCKDEP_WARN(!rcu_is_watching(),
> -			 "rcu_read_lock_bh() used illegally while idle");
> +	rcu_bh_lock_acquire();
>  }
>  
>  /*
> @@ -628,10 +655,7 @@ static inline void rcu_read_lock_bh(void)
>   */
>  static inline void rcu_read_unlock_bh(void)
>  {
> -	RCU_LOCKDEP_WARN(!rcu_is_watching(),
> -			 "rcu_read_unlock_bh() used illegally while idle");
> -	rcu_lock_release(&rcu_bh_lock_map);
> -	__release(RCU_BH);
> +	rcu_bh_lock_release();
>  	local_bh_enable();
>  }
>  
> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index d16d080a74f7..6080c9328df1 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -115,8 +115,10 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
>  	long soft_cnt;
>  
>  	WARN_ON_ONCE(in_irq());
> -	if (!in_atomic())
> +	if (!in_atomic()) {
>  		local_lock(bh_lock);
> +		rcu_read_lock();
> +	}
>  	soft_cnt = this_cpu_inc_return(softirq_counter);
>  	WARN_ON_ONCE(soft_cnt == 0);
>  	current->softirq_count += SOFTIRQ_DISABLE_OFFSET;
> @@ -151,8 +153,10 @@ void _local_bh_enable(void)
>  #endif
>  
>  	current->softirq_count -= SOFTIRQ_DISABLE_OFFSET;
> -	if (!in_atomic())
> +	if (!in_atomic()) {
> +		rcu_read_unlock();
>  		local_unlock(bh_lock);
> +	}
>  }
>  
>  void _local_bh_enable_rt(void)
> @@ -185,8 +189,10 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
>  	WARN_ON_ONCE(count < 0);
>  	local_irq_enable();
>  
> -	if (!in_atomic())
> +	if (!in_atomic()) {
> +		rcu_read_unlock();
>  		local_unlock(bh_lock);
> +	}
>  
>  	current->softirq_count -= SOFTIRQ_DISABLE_OFFSET;
>  	preempt_check_resched();
> -- 
> 1.8.3.1
> 

* Re: [PATCH RT v3 5/5] rcutorture: Avoid problematic critical section nesting on RT
  2019-09-11 16:57 ` [PATCH RT v3 5/5] rcutorture: Avoid problematic critical section nesting on RT Scott Wood
@ 2019-09-12 22:17   ` Joel Fernandes
  2019-09-16 16:55     ` Scott Wood
  0 siblings, 1 reply; 35+ messages in thread
From: Joel Fernandes @ 2019-09-12 22:17 UTC (permalink / raw)
  To: Scott Wood
  Cc: Sebastian Andrzej Siewior, linux-rt-users, linux-kernel,
	Paul E . McKenney, Thomas Gleixner, Steven Rostedt,
	Peter Zijlstra, Juri Lelli, Clark Williams

On Wed, Sep 11, 2019 at 05:57:29PM +0100, Scott Wood wrote:
> rcutorture was generating some nesting scenarios that are not
> reasonable.  Constrain the state selection to avoid them.
> 
> Example #1:
> 
> 1. preempt_disable()
> 2. local_bh_disable()
> 3. preempt_enable()
> 4. local_bh_enable()
> 
> On PREEMPT_RT, BH disabling takes a local lock only when called in
> non-atomic context.  Thus, atomic context must be retained until after BH
> is re-enabled.  Likewise, if BH is initially disabled in non-atomic
> context, it cannot be re-enabled in atomic context.
> 
> Example #2:
> 
> 1. rcu_read_lock()
> 2. local_irq_disable()
> 3. rcu_read_unlock()
> 4. local_irq_enable()

If I understand correctly, these examples are not unrealistic in the real
world unless RCU is used in the scheduler.

> 
> If the thread is preempted between steps 1 and 2,
> rcu_read_unlock_special.b.blocked will be set, but it won't be
> acted on in step 3 because IRQs are disabled.  Thus, reporting of the
> quiescent state will be delayed beyond the local_irq_enable().

Yes, with consolidated RCU this can happen, but AFAIK it has not been seen to
be a problem since deferred QS reporting will take care of it, which can also
happen from a subsequent rcu_read_unlock_special().

> For now, these scenarios will continue to be tested on non-PREEMPT_RT
> kernels, until debug checks are added to ensure that they are not
> happening elsewhere.

Are you seeing real issues that need this patch? It would be good to not
complicate rcutorture if not needed.

thanks,

 - Joel


> 
> Signed-off-by: Scott Wood <swood@redhat.com>
> ---
> v3: Limit to RT kernels, and remove one constraint that, while it
> is bad on both RT and non-RT (missing a schedule), does not oops or
> otherwise prevent using rcutorture.  It would be added once debug checks
> are implemented.
> 
>  kernel/rcu/rcutorture.c | 96 +++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 82 insertions(+), 14 deletions(-)
> 
> diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
> index efaa5b3f4d3f..ecb82cc432af 100644
> --- a/kernel/rcu/rcutorture.c
> +++ b/kernel/rcu/rcutorture.c
> @@ -60,10 +60,13 @@
>  #define RCUTORTURE_RDR_RBH	 0x08	/*  ... rcu_read_lock_bh(). */
>  #define RCUTORTURE_RDR_SCHED	 0x10	/*  ... rcu_read_lock_sched(). */
>  #define RCUTORTURE_RDR_RCU	 0x20	/*  ... entering another RCU reader. */
> -#define RCUTORTURE_RDR_NBITS	 6	/* Number of bits defined above. */
> +#define RCUTORTURE_RDR_ATOM_BH	 0x40	/*  ... disabling bh while atomic */
> +#define RCUTORTURE_RDR_ATOM_RBH	 0x80	/*  ... RBH while atomic */
> +#define RCUTORTURE_RDR_NBITS	 8	/* Number of bits defined above. */
>  #define RCUTORTURE_MAX_EXTEND	 \
>  	(RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \
> -	 RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED)
> +	 RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED | \
> +	 RCUTORTURE_RDR_ATOM_BH | RCUTORTURE_RDR_ATOM_RBH)
>  #define RCUTORTURE_RDR_MAX_LOOPS 0x7	/* Maximum reader extensions. */
>  					/* Must be power of two minus one. */
>  #define RCUTORTURE_RDR_MAX_SEGS (RCUTORTURE_RDR_MAX_LOOPS + 3)
> @@ -1092,31 +1095,52 @@ static void rcutorture_one_extend(int *readstate, int newstate,
>  	WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1);
>  	rtrsp->rt_readstate = newstate;
>  
> -	/* First, put new protection in place to avoid critical-section gap. */
> +	/*
> +	 * First, put new protection in place to avoid critical-section gap.
> +	 * Disable preemption around the ATOM disables to ensure that
> +	 * in_atomic() is true.
> +	 */
>  	if (statesnew & RCUTORTURE_RDR_BH)
>  		local_bh_disable();
> +	if (statesnew & RCUTORTURE_RDR_RBH)
> +		rcu_read_lock_bh();
>  	if (statesnew & RCUTORTURE_RDR_IRQ)
>  		local_irq_disable();
>  	if (statesnew & RCUTORTURE_RDR_PREEMPT)
>  		preempt_disable();
> -	if (statesnew & RCUTORTURE_RDR_RBH)
> -		rcu_read_lock_bh();
>  	if (statesnew & RCUTORTURE_RDR_SCHED)
>  		rcu_read_lock_sched();
> +	preempt_disable();
> +	if (statesnew & RCUTORTURE_RDR_ATOM_BH)
> +		local_bh_disable();
> +	if (statesnew & RCUTORTURE_RDR_ATOM_RBH)
> +		rcu_read_lock_bh();
> +	preempt_enable();
>  	if (statesnew & RCUTORTURE_RDR_RCU)
>  		idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT;
>  
> -	/* Next, remove old protection, irq first due to bh conflict. */
> +	/*
> +	 * Next, remove old protection, in decreasing order of strength
> +	 * to avoid unlock paths that aren't safe in the stronger
> +	 * context.  Disable preemption around the ATOM enables in
> +	 * case the context was only atomic due to IRQ disabling.
> +	 */
> +	preempt_disable();
>  	if (statesold & RCUTORTURE_RDR_IRQ)
>  		local_irq_enable();
> -	if (statesold & RCUTORTURE_RDR_BH)
> +	if (statesold & RCUTORTURE_RDR_ATOM_BH)
>  		local_bh_enable();
> +	if (statesold & RCUTORTURE_RDR_ATOM_RBH)
> +		rcu_read_unlock_bh();
> +	preempt_enable();
>  	if (statesold & RCUTORTURE_RDR_PREEMPT)
>  		preempt_enable();
> -	if (statesold & RCUTORTURE_RDR_RBH)
> -		rcu_read_unlock_bh();
>  	if (statesold & RCUTORTURE_RDR_SCHED)
>  		rcu_read_unlock_sched();
> +	if (statesold & RCUTORTURE_RDR_BH)
> +		local_bh_enable();
> +	if (statesold & RCUTORTURE_RDR_RBH)
> +		rcu_read_unlock_bh();
>  	if (statesold & RCUTORTURE_RDR_RCU)
>  		cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT);
>  
> @@ -1152,6 +1176,12 @@ static int rcutorture_extend_mask_max(void)
>  	int mask = rcutorture_extend_mask_max();
>  	unsigned long randmask1 = torture_random(trsp) >> 8;
>  	unsigned long randmask2 = randmask1 >> 3;
> +	unsigned long preempts = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
> +	unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ;
> +	unsigned long nonatomic_bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
> +	unsigned long atomic_bhs = RCUTORTURE_RDR_ATOM_BH |
> +				   RCUTORTURE_RDR_ATOM_RBH;
> +	unsigned long tmp;
>  
>  	WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT);
>  	/* Mostly only one bit (need preemption!), sometimes lots of bits. */
> @@ -1159,11 +1189,49 @@ static int rcutorture_extend_mask_max(void)
>  		mask = mask & randmask2;
>  	else
>  		mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS));
> -	/* Can't enable bh w/irq disabled. */
> -	if ((mask & RCUTORTURE_RDR_IRQ) &&
> -	    ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) ||
> -	     (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH))))
> -		mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
> +
> +	/*
> +	 * Can't enable bh w/irq disabled.
> +	 */
> +	tmp = atomic_bhs | nonatomic_bhs;
> +	if (mask & RCUTORTURE_RDR_IRQ)
> +		mask |= oldmask & tmp;
> +
> +	/*
> +	 * Ideally these sequences would be detected in debug builds
> +	 * (regardless of RT), but until then don't stop testing
> +	 * them on non-RT.
> +	 */
> +	if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
> +		/*
> +		 * Can't release the outermost rcu lock in an irq disabled
> +		 * section without preemption also being disabled, if irqs
> +		 * had ever been enabled during this RCU critical section
> +		 * (could leak a special flag and delay reporting the qs).
> +		 */
> +		if ((oldmask & RCUTORTURE_RDR_RCU) &&
> +		    (mask & RCUTORTURE_RDR_IRQ) &&
> +		    !(mask & preempts))
> +			mask |= RCUTORTURE_RDR_RCU;
> +
> +		/* Can't modify atomic bh in non-atomic context */
> +		if ((oldmask & atomic_bhs) && (mask & atomic_bhs) &&
> +		    !(mask & preempts_irq)) {
> +			mask |= oldmask & preempts_irq;
> +			if (mask & RCUTORTURE_RDR_IRQ)
> +				mask |= oldmask & tmp;
> +		}
> +		if ((mask & atomic_bhs) && !(mask & preempts_irq))
> +			mask |= RCUTORTURE_RDR_PREEMPT;
> +
> +		/* Can't modify non-atomic bh in atomic context */
> +		tmp = nonatomic_bhs;
> +		if (oldmask & preempts_irq)
> +			mask &= ~tmp;
> +		if ((oldmask | mask) & preempts_irq)
> +			mask |= oldmask & tmp;
> +	}
> +
>  	return mask ?: RCUTORTURE_RDR_RCU;
>  }
>  
> -- 
> 1.8.3.1
> 

* Re: [PATCH RT v3 4/5] rcu: Disable use_softirq on PREEMPT_RT
  2019-09-12 21:38   ` Joel Fernandes
@ 2019-09-12 22:19     ` Joel Fernandes
  2019-09-17  9:31     ` Sebastian Andrzej Siewior
  1 sibling, 0 replies; 35+ messages in thread
From: Joel Fernandes @ 2019-09-12 22:19 UTC (permalink / raw)
  To: Scott Wood
  Cc: Sebastian Andrzej Siewior, linux-rt-users, linux-kernel,
	Paul E . McKenney, Thomas Gleixner, Steven Rostedt,
	Peter Zijlstra, Juri Lelli, Clark Williams, rcu

On Thu, Sep 12, 2019 at 05:38:43PM -0400, Joel Fernandes wrote:
> Hi Scott,
> 
> Would you mind CC'ing rcu@vger.kernel.org on RCU related patches? I added it
> for this time.
> 
> On Wed, Sep 11, 2019 at 05:57:28PM +0100, Scott Wood wrote:
> > Besides restoring behavior that used to be default on RT, this avoids
> > a deadlock on scheduler locks:
[snip]
> > [  136.995194] 039:  May be due to missing lock nesting notation
> > 
> > [  137.001115] 039: 3 locks held by rcu_torture_rea/13474:
> > [  137.006341] 039:  #0:
> > [  137.008707] 039: 000000005f25146d
> > [  137.012024] 039:  (
> > [  137.014131] 039: &p->pi_lock
> > [  137.017015] 039: ){-...}
> > [  137.019558] 039: , at: try_to_wake_up+0x39/0x920
> > [  137.024175] 039:  #1:
> > [  137.026540] 039: 0000000011c8e51d
> > [  137.029859] 039:  (
> > [  137.031966] 039: &rq->lock
> > [  137.034679] 039: ){-...}
> > [  137.037217] 039: , at: try_to_wake_up+0x241/0x920
> > [  137.041924] 039:  #2:
> > [  137.044291] 039: 00000000098649b9
> > [  137.047610] 039:  (
> > [  137.049714] 039: rcu_read_lock
> > [  137.052774] 039: ){....}
> > [  137.055314] 039: , at: cpuacct_charge+0x33/0x1e0
> > [  137.059934] 039:
> > stack backtrace:
> > [  137.063425] 039: CPU: 39 PID: 13474 Comm: rcu_torture_rea Kdump: loaded Tainted: G            E     5.2.9-rt3.dbg+ #174
> > [  137.074197] 039: Hardware name: Intel Corporation S2600BT/S2600BT, BIOS SE5C620.86B.01.00.0763.022420181017 02/24/2018
> > [  137.084886] 039: Call Trace:
> > [  137.087773] 039:  <IRQ>
> > [  137.090226] 039:  dump_stack+0x5e/0x8b
> > [  137.093997] 039:  __lock_acquire+0x725/0x1100
> > [  137.098358] 039:  lock_acquire+0xc0/0x240
> > [  137.102374] 039:  ? try_to_wake_up+0x39/0x920
> > [  137.106737] 039:  _raw_spin_lock_irqsave+0x47/0x90
> > [  137.111534] 039:  ? try_to_wake_up+0x39/0x920
> > [  137.115910] 039:  try_to_wake_up+0x39/0x920
> > [  137.120098] 039:  rcu_read_unlock_special+0x65/0xb0
> > [  137.124977] 039:  __rcu_read_unlock+0x5d/0x70
> > [  137.129337] 039:  cpuacct_charge+0xd9/0x1e0
> > [  137.133522] 039:  ? cpuacct_charge+0x33/0x1e0
> > [  137.137880] 039:  update_curr+0x14b/0x420
> > [  137.141894] 039:  enqueue_entity+0x42/0x370
> > [  137.146080] 039:  enqueue_task_fair+0xa9/0x490
> > [  137.150528] 039:  activate_task+0x5a/0xf0
> > [  137.154539] 039:  ttwu_do_activate+0x4e/0x90
> > [  137.158813] 039:  try_to_wake_up+0x277/0x920
> > [  137.163086] 039:  irq_exit+0xb6/0xf0
[snip]
> > Signed-off-by: Scott Wood <swood@redhat.com>
> > ---
> > The prohibition on use_softirq should be able to be dropped once RT gets
> > the latest RCU code, but the question of what use_softirq should default
> > to on PREEMPT_RT remains.
> > 
> > v3: Use IS_ENABLED
> 
> Out of curiosity, does PREEMPT_RT use the NOCB callback offloading? If no,
> should it use it? IIUC, that reduces the work the softirq has to do, since
> the callbacks are executed in threaded context.
> 
> If yes, can RT tolerate use_softirq=false and what could a realistic softirq

s/use_softirq=false/use_softirq=true/

thanks,

 - Joel


* Re: [PATCH RT v3 5/5] rcutorture: Avoid problematic critical section nesting on RT
  2019-09-12 22:17   ` Joel Fernandes
@ 2019-09-16 16:55     ` Scott Wood
  2019-09-17 10:07       ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 35+ messages in thread
From: Scott Wood @ 2019-09-16 16:55 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Sebastian Andrzej Siewior, linux-rt-users, linux-kernel,
	Paul E . McKenney, Thomas Gleixner, Steven Rostedt,
	Peter Zijlstra, Juri Lelli, Clark Williams

On Thu, 2019-09-12 at 18:17 -0400, Joel Fernandes wrote:
> On Wed, Sep 11, 2019 at 05:57:29PM +0100, Scott Wood wrote:
> > rcutorture was generating some nesting scenarios that are not
> > reasonable.  Constrain the state selection to avoid them.
> > 
> > Example #1:
> > 
> > 1. preempt_disable()
> > 2. local_bh_disable()
> > 3. preempt_enable()
> > 4. local_bh_enable()
> > 
> > On PREEMPT_RT, BH disabling takes a local lock only when called in
> > non-atomic context.  Thus, atomic context must be retained until after
> > BH
> > is re-enabled.  Likewise, if BH is initially disabled in non-atomic
> > context, it cannot be re-enabled in atomic context.
> > 
> > Example #2:
> > 
> > 1. rcu_read_lock()
> > 2. local_irq_disable()
> > 3. rcu_read_unlock()
> > 4. local_irq_enable()
> 
> If I understand correctly, these examples are not unrealistic in the real
> world unless RCU is used in the scheduler.

I hope you mean "not realistic", at least when it comes to explicit
preempt/irq disabling rather than spinlock variants that don't disable
preempt/irqs on PREEMPT_RT.

> > If the thread is preempted between steps 1 and 2,
> > rcu_read_unlock_special.b.blocked will be set, but it won't be
> > acted on in step 3 because IRQs are disabled.  Thus, reporting of the
> > quiescent state will be delayed beyond the local_irq_enable().
> 
> Yes, with consolidated RCU this can happen but AFAIK it has not been seen
> to be a problem, since deferred QS reporting will take care of it, which can
> also happen from a subsequent rcu_read_unlock_special().

The defer_qs_iw_pending stuff isn't in 5.2-rt.  Still, given patch 4/5 (and
special.b.deferred_qs on mainline) this shouldn't present a deadlock concern
(letting the test run a bit now to double check) so this patch could
probably be limited to the "example #1" sequence.

> > For now, these scenarios will continue to be tested on non-PREEMPT_RT
> > kernels, until debug checks are added to ensure that they are not
> > happening elsewhere.
> 
> Are you seeing real issues that need this patch? It would be good to not
> complicate rcutorture if not needed.

rcutorture crashes on RT without this patch (in particular due to the
local_bh_disable misordering).

-Scott



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 1/5] rcu: Acquire RCU lock when disabling BHs
  2019-09-11 16:57 ` [PATCH RT v3 1/5] rcu: Acquire RCU lock when disabling BHs Scott Wood
  2019-09-12 22:09   ` Joel Fernandes
@ 2019-09-17  7:44   ` Sebastian Andrzej Siewior
  2019-09-17 14:06     ` Scott Wood
  1 sibling, 1 reply; 35+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-09-17  7:44 UTC (permalink / raw)
  To: Scott Wood
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On 2019-09-11 17:57:25 [+0100], Scott Wood wrote:
> index 388ace315f32..9ce7c5006a5e 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -600,6 +600,36 @@ static inline void rcu_read_unlock(void)
>  	rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
>  }
>  
> +#ifdef CONFIG_PREEMPT_RT_FULL
> +/*
> + * On RT, local_bh_disable() calls rcu_read_lock() -- no need to
> + * track it twice.
> + */
> +static inline void rcu_bh_lock_acquire(void)
> +{
> +}
> +
> +static inline void rcu_bh_lock_release(void)
> +{
> +}
> +#else
> +static inline void rcu_bh_lock_acquire(void)
> +{
> +	__acquire(RCU_BH);
> +	rcu_lock_acquire(&rcu_bh_lock_map);
> +	RCU_LOCKDEP_WARN(!rcu_is_watching(),
> +			 "rcu_read_lock_bh() used illegally while idle");
> +}
> +
> +static inline void rcu_bh_lock_release(void)
> +{
> +	RCU_LOCKDEP_WARN(!rcu_is_watching(),
> +			 "rcu_read_unlock_bh() used illegally while idle");
> +	rcu_lock_release(&rcu_bh_lock_map);
> +	__release(RCU_BH);
> +}
> +#endif
> +
>  /**
>   * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
>   *
> @@ -615,10 +645,7 @@ static inline void rcu_read_unlock(void)
>  static inline void rcu_read_lock_bh(void)
>  {
>  	local_bh_disable();
> -	__acquire(RCU_BH);
> -	rcu_lock_acquire(&rcu_bh_lock_map);
> -	RCU_LOCKDEP_WARN(!rcu_is_watching(),
> -			 "rcu_read_lock_bh() used illegally while idle");
> +	rcu_bh_lock_acquire();
>  }
>  
>  /*

I asked previously why do you need to change rcu_read_lock_bh() and you
replied that you don't remember:
   https://lore.kernel.org/linux-rt-users/b948ec6cccda31925ed8dc123bd0f55423fff3d4.camel@redhat.com/

Did this change?

Sebastian

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 2/5] sched: Rename sleeping_lock to rt_invol_sleep
  2019-09-11 16:57 ` [PATCH RT v3 2/5] sched: Rename sleeping_lock to rt_invol_sleep Scott Wood
@ 2019-09-17  7:52   ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 35+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-09-17  7:52 UTC (permalink / raw)
  To: Scott Wood
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On 2019-09-11 17:57:26 [+0100], Scott Wood wrote:
> It's already used for one situation other than acquiring a lock, and the
> next patch will add another, so change the name to avoid confusion.

I know it has been suggested, but please don't rename it; keep it as is.
The _only_ reason why you have it is to avoid an RCU-related warning.
PeterZ asked if we could maybe utilize a task-state bit for this
annotation instead. I will look into this and change the mechanism to
something different if that is preferred over this one, so you don't have
to worry about it. Please use what is here at the moment.

> Signed-off-by: Scott Wood <swood@redhat.com>

Sebastian

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 3/5] sched: migrate_dis/enable: Use rt_invol_sleep
  2019-09-11 16:57 ` [PATCH RT v3 3/5] sched: migrate_dis/enable: Use rt_invol_sleep Scott Wood
@ 2019-09-17  7:59   ` Sebastian Andrzej Siewior
  2019-09-17 14:06     ` Scott Wood
  0 siblings, 1 reply; 35+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-09-17  7:59 UTC (permalink / raw)
  To: Scott Wood
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On 2019-09-11 17:57:27 [+0100], Scott Wood wrote:
> diff --git a/kernel/cpu.c b/kernel/cpu.c
> index 885a195dfbe0..32c6175b63b6 100644
> --- a/kernel/cpu.c
> +++ b/kernel/cpu.c
> @@ -308,7 +308,9 @@ void pin_current_cpu(void)
>  	preempt_lazy_enable();
>  	preempt_enable();
>  
> +	rt_invol_sleep_inc();
>  	__read_rt_lock(cpuhp_pin);
> +	rt_invol_sleep_dec();
>  
>  	preempt_disable();
>  	preempt_lazy_disable();

I understand the other one. But now looking at it, both end up in
rt_spin_lock_slowlock_locked() which would be the proper place to do
that annotation. Okay.

Sebastian

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 4/5] rcu: Disable use_softirq on PREEMPT_RT
  2019-09-12 21:38   ` Joel Fernandes
  2019-09-12 22:19     ` Joel Fernandes
@ 2019-09-17  9:31     ` Sebastian Andrzej Siewior
  1 sibling, 0 replies; 35+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-09-17  9:31 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Scott Wood, linux-rt-users, linux-kernel, Paul E . McKenney,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams, rcu

On 2019-09-12 17:38:43 [-0400], Joel Fernandes wrote:
> > The prohibition on use_softirq should be able to be dropped once RT gets
> > the latest RCU code, but the question of what use_softirq should default
> > to on PREEMPT_RT remains.
> > 
> > v3: Use IS_ENABLED

I'm going to pick it up.

> Out of curiosity, does PREEMPT_RT use the NOCB callback offloading? If no,
> should it use it? IIUC, that does reduce the work the softirq has to do,
> since the callbacks are executed in threaded context.

It can use it; it is recommended to do so, and to move those threads off
the CPU(s) that are dedicated to the RT workload. But then there are those
setups with only a single CPU.

> If yes, can RT tolerate use_softirq=false and what could a realistic softirq
> running/completion time be that PREEMPT_RT can tolerate? I guess that can be
> answered by running rcuperf on PREEMPT_RT with a NOCB configuration and
> measuring softirq worst-case start/completion times.

It depends. On RT as of now, the softirq can be preempted; this includes the
RCU softirq. That means a wakeup of a high-priority RT task can preempt the
RCU softirq, so this might not be an issue.
The softirq code takes a per-CPU lock shared by all vectors, which means we
can only have one softirq vector running at a time (we used to have the
possibility of multiple vectors running in parallel, but that is gone now).
So this enforces the behaviour we have in !RT as of today.  If you rely on
"timely" executed softirqs, then a long-running RCU softirq might be bad for
you. But so is everything else that might cause long-running softirqs.

What I don't know is how RCU boosting and a preempted softirq interact.

> I could run these tests myself but I am vacation for the next week or so.
> 
> thanks,
> 
>  - Joel

Sebastian

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 5/5] rcutorture: Avoid problematic critical section nesting on RT
  2019-09-16 16:55     ` Scott Wood
@ 2019-09-17 10:07       ` Sebastian Andrzej Siewior
  2019-09-17 14:36         ` Scott Wood
  0 siblings, 1 reply; 35+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-09-17 10:07 UTC (permalink / raw)
  To: Scott Wood
  Cc: Joel Fernandes, linux-rt-users, linux-kernel, Paul E . McKenney,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On 2019-09-16 11:55:57 [-0500], Scott Wood wrote:
> On Thu, 2019-09-12 at 18:17 -0400, Joel Fernandes wrote:
> > On Wed, Sep 11, 2019 at 05:57:29PM +0100, Scott Wood wrote:
> > > rcutorture was generating some nesting scenarios that are not
> > > reasonable.  Constrain the state selection to avoid them.
> > > 
> > > Example #1:
> > > 
> > > 1. preempt_disable()
> > > 2. local_bh_disable()
> > > 3. preempt_enable()
> > > 4. local_bh_enable()
> > > 
> > > On PREEMPT_RT, BH disabling takes a local lock only when called in
> > > non-atomic context.  Thus, atomic context must be retained until after
> > > BH
> > > is re-enabled.  Likewise, if BH is initially disabled in non-atomic
> > > context, it cannot be re-enabled in atomic context.
> > > 
> > > Example #2:
> > > 
> > > 1. rcu_read_lock()
> > > 2. local_irq_disable()
> > > 3. rcu_read_unlock()
> > > 4. local_irq_enable()
> > 
> > If I understand correctly, these examples are not unrealistic in the real
> > world unless RCU is used in the scheduler.
> 
> I hope you mean "not realistic", at least when it comes to explicit
> preempt/irq disabling rather than spinlock variants that don't disable
> preempt/irqs on PREEMPT_RT.

We have:
- local_irq_disable() (+save)
- spin_lock()
- local_bh_disable()
- preempt_disable()

On non-RT you can (but should not) use the counterparts of these functions
in random order, like:
	local_bh_disable();
	local_irq_disable();
	local_bh_enable();
	local_irq_enable();

The non-RT will survive this. On RT the counterpart functions have to be
used in reverse order:
	local_bh_disable();
	local_irq_disable();
	local_irq_enable();
	local_bh_enable();

or the kernel will fall apart.

Since you _can_ use them in random order, Paul wants to test that random
use of those functions does not break RCU in any way. Since they cannot be
used on RT in random order, it has been agreed that we keep the test for
!RT but disable it on RT.

> -Scott

Sebastian

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 1/5] rcu: Acquire RCU lock when disabling BHs
  2019-09-17  7:44   ` Sebastian Andrzej Siewior
@ 2019-09-17 14:06     ` Scott Wood
  2019-09-17 14:42       ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 35+ messages in thread
From: Scott Wood @ 2019-09-17 14:06 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On Tue, 2019-09-17 at 09:44 +0200, Sebastian Andrzej Siewior wrote:
> On 2019-09-11 17:57:25 [+0100], Scott Wood wrote:
> > 
> > @@ -615,10 +645,7 @@ static inline void rcu_read_unlock(void)
> >  static inline void rcu_read_lock_bh(void)
> >  {
> >  	local_bh_disable();
> > -	__acquire(RCU_BH);
> > -	rcu_lock_acquire(&rcu_bh_lock_map);
> > -	RCU_LOCKDEP_WARN(!rcu_is_watching(),
> > -			 "rcu_read_lock_bh() used illegally while idle");
> > +	rcu_bh_lock_acquire();
> >  }
> >  
> >  /*
> 
> I asked previously why do you need to change rcu_read_lock_bh() and you
> replied that you don't remember:
>    https://lore.kernel.org/linux-rt-users/b948ec6cccda31925ed8dc123bd0f55423fff3d4.camel@redhat.com/
> 
> Did this change?

Sorry, I missed that you were asking about rcu_read_lock_bh() as well.  I
did remove the change to rcu_read_lock_bh_held().

With this patch, local_bh_disable() calls rcu_read_lock() on RT, which
handles this debug stuff.  Doing it twice shouldn't be actively harmful,
but it's redundant, and debug kernels are slow enough as it is.

-Scott



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 3/5] sched: migrate_dis/enable: Use rt_invol_sleep
  2019-09-17  7:59   ` Sebastian Andrzej Siewior
@ 2019-09-17 14:06     ` Scott Wood
  2019-09-23 16:59       ` Scott Wood
  0 siblings, 1 reply; 35+ messages in thread
From: Scott Wood @ 2019-09-17 14:06 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On Tue, 2019-09-17 at 09:59 +0200, Sebastian Andrzej Siewior wrote:
> On 2019-09-11 17:57:27 [+0100], Scott Wood wrote:
> > diff --git a/kernel/cpu.c b/kernel/cpu.c
> > index 885a195dfbe0..32c6175b63b6 100644
> > --- a/kernel/cpu.c
> > +++ b/kernel/cpu.c
> > @@ -308,7 +308,9 @@ void pin_current_cpu(void)
> >  	preempt_lazy_enable();
> >  	preempt_enable();
> >  
> > +	rt_invol_sleep_inc();
> >  	__read_rt_lock(cpuhp_pin);
> > +	rt_invol_sleep_dec();
> >  
> >  	preempt_disable();
> >  	preempt_lazy_disable();
> 
> I understand the other one. But now looking at it, both end up in
> rt_spin_lock_slowlock_locked() which would be the proper place to do
> that annotation. Okay.

FWIW, if my lazy migrate patchset is accepted, then there will be no users
of __read_rt_lock() outside rwlock-rt.c and it'll be moot.

-Scott



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 4/5] rcu: Disable use_softirq on PREEMPT_RT
  2019-09-11 16:57 ` [PATCH RT v3 4/5] rcu: Disable use_softirq on PREEMPT_RT Scott Wood
  2019-09-12 21:38   ` Joel Fernandes
@ 2019-09-17 14:08   ` Scott Wood
  1 sibling, 0 replies; 35+ messages in thread
From: Scott Wood @ 2019-09-17 14:08 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On Wed, 2019-09-11 at 17:57 +0100, Scott Wood wrote:
>  kernel/rcu/tree.c | 9 +++++++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index fc8b00c61b32..ee0a5ec2c30f 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -98,9 +98,14 @@ struct rcu_state rcu_state = {
>  /* Dump rcu_node combining tree at boot to verify correct setup. */
>  static bool dump_tree;
>  module_param(dump_tree, bool, 0444);
> -/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
> -static bool use_softirq = 1;
> +/*
> + * By default, use RCU_SOFTIRQ instead of rcuc kthreads.
> + * But, avoid RCU_SOFTIRQ on PREEMPT_RT due to pi/rq deadlocks.
> + */
> +static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
> +#ifdef CONFIG_PREEMPT_RT_FULL
>  module_param(use_softirq, bool, 0444);
> +#endif

Ugh, that should be s/ifdef/ifndef/
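i.e. the corrected hunk would look roughly like this (just a sketch, keeping
the parameter read-only and only exposing it when RT is not enabled):

	static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
	#ifndef CONFIG_PREEMPT_RT_FULL
	module_param(use_softirq, bool, 0444);
	#endif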

-Scott



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 5/5] rcutorture: Avoid problematic critical section nesting on RT
  2019-09-17 10:07       ` Sebastian Andrzej Siewior
@ 2019-09-17 14:36         ` Scott Wood
  2019-09-17 14:50           ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 35+ messages in thread
From: Scott Wood @ 2019-09-17 14:36 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Joel Fernandes, linux-rt-users, linux-kernel, Paul E . McKenney,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On Tue, 2019-09-17 at 12:07 +0200, Sebastian Andrzej Siewior wrote:
> On 2019-09-16 11:55:57 [-0500], Scott Wood wrote:
> > On Thu, 2019-09-12 at 18:17 -0400, Joel Fernandes wrote:
> > > On Wed, Sep 11, 2019 at 05:57:29PM +0100, Scott Wood wrote:
> > > > rcutorture was generating some nesting scenarios that are not
> > > > reasonable.  Constrain the state selection to avoid them.
> > > > 
> > > > Example #1:
> > > > 
> > > > 1. preempt_disable()
> > > > 2. local_bh_disable()
> > > > 3. preempt_enable()
> > > > 4. local_bh_enable()
> > > > 
> > > > On PREEMPT_RT, BH disabling takes a local lock only when called in
> > > > non-atomic context.  Thus, atomic context must be retained until
> > > > after
> > > > BH
> > > > is re-enabled.  Likewise, if BH is initially disabled in non-atomic
> > > > context, it cannot be re-enabled in atomic context.
> > > > 
> > > > Example #2:
> > > > 
> > > > 1. rcu_read_lock()
> > > > 2. local_irq_disable()
> > > > 3. rcu_read_unlock()
> > > > 4. local_irq_enable()
> > > 
> > > If I understand correctly, these examples are not unrealistic in the
> > > real
> > > world unless RCU is used in the scheduler.
> > 
> > I hope you mean "not realistic", at least when it comes to explicit
> > preempt/irq disabling rather than spinlock variants that don't disable
> > preempt/irqs on PREEMPT_RT.
> 
> We have:
> - local_irq_disable() (+save)
> - spin_lock()
> - local_bh_disable()
> - preempt_disable()
> 
> On non-RT you can (but should not) use the counter part of the function
> in random order like:
> 	local_bh_disable();
> 	local_irq_disable();
> 	local_bh_enable();
> 	local_irq_enable();

Actually, even non-RT will assert if you do local_bh_enable() with IRQs
disabled -- but the other combinations do work, and are used in some places
via spinlocks.  If they are used via direct calls to preempt_disable() or
local_irq_disable() (or via raw spinlocks), then that will not go away on RT
and we'll have a problem.
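For instance (a minimal sketch, not from the patches), on a !RT kernel with
CONFIG_PROVE_LOCKING the lockdep_assert_irqs_enabled() check in
__local_bh_enable_ip() should fire at step 3:

	local_bh_disable();	/* 1 */
	local_irq_disable();	/* 2 */
	local_bh_enable();	/* 3: asserts -- enabling BH with IRQs disabled */
	local_irq_enable();	/* 4 */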

> The non-RT will survive this. On RT the counterpart functions have to be
> used in reverse order:
> 	local_bh_disable();
> 	local_irq_disable();
> 	local_irq_enable();
> 	local_bh_enable();
> 
> or the kernel will fall apart.
> 
> Since you _can_ use it in random order Paul wants to test that the
> random use of those function does not break RCU in any way. Since they
> can not be used on RT in random order it has been agreed that we keep
> the test for !RT but disable it on RT.

For now, yes.  Long term it would be good to keep track of when
preemption/irqs would be disabled on RT, even when running a non-RT debug
kernel, and assert when bad things are done with it (assuming an RT-capable
arch).  Besides detecting these fairly unusual patterns, it could also
detect earlier the much more common problem of nesting a non-raw spinlock
inside a raw spinlock or other RT-atomic context.
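E.g. (a sketch, lock names made up -- fine on !RT, invalid on RT because
spin_lock() becomes a sleeping rtmutex-based lock there):

	raw_spin_lock(&raw_lock);	/* stays a real spinlock on RT */
	spin_lock(&normal_lock);	/* sleeping lock on RT: can't nest here */
	...
	spin_unlock(&normal_lock);
	raw_spin_unlock(&raw_lock);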

-Scott



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 1/5] rcu: Acquire RCU lock when disabling BHs
  2019-09-17 14:06     ` Scott Wood
@ 2019-09-17 14:42       ` Sebastian Andrzej Siewior
  2019-09-17 16:12         ` Scott Wood
  0 siblings, 1 reply; 35+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-09-17 14:42 UTC (permalink / raw)
  To: Scott Wood
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On 2019-09-17 09:06:28 [-0500], Scott Wood wrote:
> Sorry, I missed that you were asking about rcu_read_lock_bh() as well.  I
> did remove the change to rcu_read_lock_bh_held().

Sorry for not being clear here.

> With this patch, local_bh_disable() calls rcu_read_lock() on RT which
> handles this debug stuff.  Doing it twice shouldn't be explicitly harmful,
> but it's redundant, and debug kernels are slow enough as is.

rcu_read_lock() does:
|         __rcu_read_lock();
|         __acquire(RCU);
|         rcu_lock_acquire(&rcu_lock_map);
|         RCU_LOCKDEP_WARN(!rcu_is_watching(),
|                          "rcu_read_lock() used illegally while idle");

__acquire() is removed by cpp.
That RCU_LOCKDEP_WARN is doing the same thing as above and is redundant.
Am I right to assume that you consider
	rcu_lock_acquire(&rcu_bh_lock_map);

redundant because the only user of that is also checking for
rcu_lock_map?

> -Scott

Sebastian

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 5/5] rcutorture: Avoid problematic critical section nesting on RT
  2019-09-17 14:36         ` Scott Wood
@ 2019-09-17 14:50           ` Sebastian Andrzej Siewior
  2019-09-17 16:32             ` Scott Wood
  0 siblings, 1 reply; 35+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-09-17 14:50 UTC (permalink / raw)
  To: Scott Wood
  Cc: Joel Fernandes, linux-rt-users, linux-kernel, Paul E . McKenney,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On 2019-09-17 09:36:22 [-0500], Scott Wood wrote:
> > On non-RT you can (but should not) use the counter part of the function
> > in random order like:
> > 	local_bh_disable();
> > 	local_irq_disable();
> > 	local_bh_enable();
> > 	local_irq_enable();
> 
> Actually even non-RT will assert if you do local_bh_enable() with IRQs
> disabled -- but the other combinations do work, and are used some places via
> spinlocks.  If they are used via direct calls to preempt_disable() or
> local_irq_disable() (or via raw spinlocks), then that will not go away on RT
> and we'll have a problem.

lockdep_assert_irqs_enabled() is a nop with CONFIG_PROVE_LOCKING=N and
RT breaks either way. 

> > Since you _can_ use it in random order Paul wants to test that the
> > random use of those function does not break RCU in any way. Since they
> > can not be used on RT in random order it has been agreed that we keep
> > the test for !RT but disable it on RT.
> 
> For now, yes.  Long term it would be good to keep track of when
> preemption/irqs would be disabled on RT, even when running a non-RT debug
> kernel, and assert when bad things are done with it (assuming an RT-capable
> arch).  Besides detecting these fairly unusual patterns, it could also
> detect earlier the much more common problem of nesting a non-raw spinlock
> inside a raw spinlock or other RT-atomic context.

You will be surprised, but we have patches for that. We first need to get
rid of other "false positives" before plugging this in.

> -Scott

Sebastian

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 1/5] rcu: Acquire RCU lock when disabling BHs
  2019-09-17 14:42       ` Sebastian Andrzej Siewior
@ 2019-09-17 16:12         ` Scott Wood
  2019-09-23 16:41           ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 35+ messages in thread
From: Scott Wood @ 2019-09-17 16:12 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On Tue, 2019-09-17 at 16:42 +0200, Sebastian Andrzej Siewior wrote:
> On 2019-09-17 09:06:28 [-0500], Scott Wood wrote:
> > Sorry, I missed that you were asking about rcu_read_lock_bh() as
> > well.  I
> > did remove the change to rcu_read_lock_bh_held().
> 
> Sorry for not being clear here.
> 
> > With this patch, local_bh_disable() calls rcu_read_lock() on RT which
> > handles this debug stuff.  Doing it twice shouldn't be explicitly
> > harmful,
> > but it's redundant, and debug kernels are slow enough as is.
> 
> rcu_read_lock() does:
> >         __rcu_read_lock();
> >         __acquire(RCU);
> >         rcu_lock_acquire(&rcu_lock_map);
> >         RCU_LOCKDEP_WARN(!rcu_is_watching(),
> >                          "rcu_read_lock() used illegally while idle");
> 
> __acquire() is removed removed by cpp.
> That RCU_LOCKDEP_WARN is doing the same thing as above and redundant.
> Am I right to assume that you consider
> 	rcu_lock_acquire(&rcu_bh_lock_map);
> 
> redundant because the only user of that is also checking for
> rcu_lock_map?

Yes.

-Scott



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 5/5] rcutorture: Avoid problematic critical section nesting on RT
  2019-09-17 14:50           ` Sebastian Andrzej Siewior
@ 2019-09-17 16:32             ` Scott Wood
  2019-09-23 16:25               ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 35+ messages in thread
From: Scott Wood @ 2019-09-17 16:32 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Joel Fernandes, linux-rt-users, linux-kernel, Paul E . McKenney,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On Tue, 2019-09-17 at 16:50 +0200, Sebastian Andrzej Siewior wrote:
> On 2019-09-17 09:36:22 [-0500], Scott Wood wrote:
> > > On non-RT you can (but should not) use the counter part of the
> > > function
> > > in random order like:
> > > 	local_bh_disable();
> > > 	local_irq_disable();
> > > 	local_bh_enable();
> > > 	local_irq_enable();
> > 
> > Actually even non-RT will assert if you do local_bh_enable() with IRQs
> > disabled -- but the other combinations do work, and are used some places
> > via
> > spinlocks.  If they are used via direct calls to preempt_disable() or
> > local_irq_disable() (or via raw spinlocks), then that will not go away
> > on RT
> > and we'll have a problem.
> 
> lockdep_assert_irqs_enabled() is a nop with CONFIG_PROVE_LOCKING=N and
> RT breaks either way. 

Right, I meant a non-RT kernel with debug checks enabled.

> > > Since you _can_ use it in random order Paul wants to test that the
> > > random use of those function does not break RCU in any way. Since they
> > > can not be used on RT in random order it has been agreed that we keep
> > > the test for !RT but disable it on RT.
> > 
> > For now, yes.  Long term it would be good to keep track of when
> > preemption/irqs would be disabled on RT, even when running a non-RT
> > debug
> > kernel, and assert when bad things are done with it (assuming an RT-
> > capable
> > arch).  Besides detecting these fairly unusual patterns, it could also
> > detect earlier the much more common problem of nesting a non-raw
> > spinlock
> > inside a raw spinlock or other RT-atomic context.
> 
> you will be surprised but we have patches for that. We need first get
> rid of other "false positives" before plugging this in.

Nice!  Are the "false positives" real issues from components that are
currently blacklisted on RT, or something different?

-Scott



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 5/5] rcutorture: Avoid problematic critical section nesting on RT
  2019-09-17 16:32             ` Scott Wood
@ 2019-09-23 16:25               ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 35+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-09-23 16:25 UTC (permalink / raw)
  To: Scott Wood
  Cc: Joel Fernandes, linux-rt-users, linux-kernel, Paul E . McKenney,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On 2019-09-17 11:32:11 [-0500], Scott Wood wrote:
> Nice!  Are the "false positives" real issues from components that are
> currently blacklisted on RT, or something different?

So first, a little bit of infrastructure like commit d5096aa65acd0
("sched: Mark hrtimers to expire in hard interrupt context") is required so
that lockdep can see it all properly without RT enabled. Then we need
patches to avoid lockdep complaining about things that RT does not complain
about, because there the lock is converted to a raw_spinlock_t or something
different is applied so we don't get the warning.
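For the hrtimer side that boils down to something like this sketch (timer
and expiry names are made up); the point is the _HARD mode variants, which
make the timer expire in hard interrupt context even without RT, so lockdep
models it the same way on both configurations:

	hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
	hrtimer_start(&timer, expires, HRTIMER_MODE_REL_HARD);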

> -Scott

Sebastian

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 1/5] rcu: Acquire RCU lock when disabling BHs
  2019-09-17 16:12         ` Scott Wood
@ 2019-09-23 16:41           ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 35+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-09-23 16:41 UTC (permalink / raw)
  To: Scott Wood
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On 2019-09-17 11:12:48 [-0500], Scott Wood wrote:
> > rcu_read_lock() does:
> > >         __rcu_read_lock();
> > >         __acquire(RCU);
> > >         rcu_lock_acquire(&rcu_lock_map);
> > >         RCU_LOCKDEP_WARN(!rcu_is_watching(),
> > >                          "rcu_read_lock() used illegally while idle");
> > 
> > __acquire() is removed removed by cpp.
> > That RCU_LOCKDEP_WARN is doing the same thing as above and redundant.
> > Am I right to assume that you consider
> > 	rcu_lock_acquire(&rcu_bh_lock_map);
> > 
> > redundant because the only user of that is also checking for
> > rcu_lock_map?
> Yes.

I'm going to drop that hunk. It makes the patch smaller and the result more
obvious. If the additional lock annotation (rcu_bh_lock_map) is too much,
then we can still remove it entirely later (for RT), but for now I would
like to keep the changes simple and small.

> -Scott

Sebastian

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 3/5] sched: migrate_dis/enable: Use rt_invol_sleep
  2019-09-17 14:06     ` Scott Wood
@ 2019-09-23 16:59       ` Scott Wood
  2019-09-23 17:52         ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 35+ messages in thread
From: Scott Wood @ 2019-09-23 16:59 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On Tue, 2019-09-17 at 09:06 -0500, Scott Wood wrote:
> On Tue, 2019-09-17 at 09:59 +0200, Sebastian Andrzej Siewior wrote:
> > On 2019-09-11 17:57:27 [+0100], Scott Wood wrote:
> > > diff --git a/kernel/cpu.c b/kernel/cpu.c
> > > index 885a195dfbe0..32c6175b63b6 100644
> > > --- a/kernel/cpu.c
> > > +++ b/kernel/cpu.c
> > > @@ -308,7 +308,9 @@ void pin_current_cpu(void)
> > >  	preempt_lazy_enable();
> > >  	preempt_enable();
> > >  
> > > +	rt_invol_sleep_inc();
> > >  	__read_rt_lock(cpuhp_pin);
> > > +	rt_invol_sleep_dec();
> > >  
> > >  	preempt_disable();
> > >  	preempt_lazy_disable();
> > 
> > I understand the other one. But now looking at it, both end up in
> > rt_spin_lock_slowlock_locked() which would be the proper place to do
> > that annotation. Okay.
> 
> FWIW, if my lazy migrate patchset is accepted, then there will be no users
> of __read_rt_lock() outside rwlock-rt.c and it'll be moot.

I missed the "both" -- which is the "other one" that ends up in
rt_spin_lock_slowlock_locked()?  stop_one_cpu() doesn't...

-Scott



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 3/5] sched: migrate_dis/enable: Use rt_invol_sleep
  2019-09-23 16:59       ` Scott Wood
@ 2019-09-23 17:52         ` Sebastian Andrzej Siewior
  2019-09-24 11:21           ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 35+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-09-23 17:52 UTC (permalink / raw)
  To: Scott Wood
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On 2019-09-23 11:59:23 [-0500], Scott Wood wrote:
> On Tue, 2019-09-17 at 09:06 -0500, Scott Wood wrote:
> > On Tue, 2019-09-17 at 09:59 +0200, Sebastian Andrzej Siewior wrote:
> > > On 2019-09-11 17:57:27 [+0100], Scott Wood wrote:
> > > > diff --git a/kernel/cpu.c b/kernel/cpu.c
> > > > index 885a195dfbe0..32c6175b63b6 100644
> > > > --- a/kernel/cpu.c
> > > > +++ b/kernel/cpu.c
> > > > @@ -308,7 +308,9 @@ void pin_current_cpu(void)
> > > >  	preempt_lazy_enable();
> > > >  	preempt_enable();
> > > >  
> > > > +	rt_invol_sleep_inc();
> > > >  	__read_rt_lock(cpuhp_pin);
> > > > +	rt_invol_sleep_dec();
> > > >  
> > > >  	preempt_disable();
> > > >  	preempt_lazy_disable();
> > > 
> > > I understand the other one. But now looking at it, both end up in
> > > rt_spin_lock_slowlock_locked() which would be the proper place to do
> > > that annotation. Okay.
> > 
> > FWIW, if my lazy migrate patchset is accepted, then there will be no users
> > of __read_rt_lock() outside rwlock-rt.c and it'll be moot.
> 
> I missed the "both" -- which is the "other one" that ends up in
> rt_spin_lock_slowlock_locked()?  stop_one_cpu() doesn't...

That one used here:
 __read_rt_lock()
    -> rt_spin_lock_slowlock_locked()

The official one (including the write part):
 rt_read_lock() (annotation)
   -> do_read_rt_lock()
     -> __read_rt_lock()
       -> rt_spin_lock_slowlock_locked()


and the only one missing from the party of sleeping locks is:
rt_spin_lock() (annotation)
  -> rt_spin_lock_slowlock()
    -> rt_spin_lock_slowlock_locked()

> -Scott

Sebastian

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 3/5] sched: migrate_dis/enable: Use rt_invol_sleep
  2019-09-23 17:52         ` Sebastian Andrzej Siewior
@ 2019-09-24 11:21           ` Sebastian Andrzej Siewior
  2019-09-24 13:53             ` Scott Wood
  0 siblings, 1 reply; 35+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-09-24 11:21 UTC (permalink / raw)
  To: Scott Wood
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On 2019-09-23 19:52:33 [+0200], To Scott Wood wrote:

I made dis:

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 885a195dfbe02..25afa2bb1a2cf 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -308,7 +308,9 @@ void pin_current_cpu(void)
 	preempt_lazy_enable();
 	preempt_enable();
 
+	sleeping_lock_inc();
 	__read_rt_lock(cpuhp_pin);
+	sleeping_lock_dec();
 
 	preempt_disable();
 	preempt_lazy_disable();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e1bdd7f9be054..63a6420d01053 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7388,6 +7388,7 @@ void migrate_enable(void)
 
 		WARN_ON(smp_processor_id() != task_cpu(p));
 		if (!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+			struct task_struct *self = current;
 			const struct cpumask *cpu_valid_mask = cpu_active_mask;
 			struct migration_arg arg;
 			unsigned int dest_cpu;
@@ -7405,7 +7406,21 @@ void migrate_enable(void)
 			unpin_current_cpu();
 			preempt_lazy_enable();
 			preempt_enable();
+			rt_invol_sleep_inc();
+
+			raw_spin_lock_irq(&self->pi_lock);
+			self->saved_state = self->state;
+			__set_current_state_no_track(TASK_RUNNING);
+			raw_spin_unlock_irq(&self->pi_lock);
+
 			stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
+
+			raw_spin_lock_irq(&self->pi_lock);
+			__set_current_state_no_track(self->saved_state);
+			self->saved_state = TASK_RUNNING;
+			raw_spin_unlock_irq(&self->pi_lock);
+
+			rt_invol_sleep_dec();
 			return;
 		}
 	}

I think we need to preserve the current state, otherwise we will lose
anything != TASK_RUNNING here. So the spin_lock() would preserve it
while waiting, but the migrate_enable() will lose it if it needs to
change the CPU at the end.
I will try to prepare all commits for the next release before I release,
so you can have a look first and yell if needed.

> > -Scott
 
Sebastian

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 3/5] sched: migrate_dis/enable: Use rt_invol_sleep
  2019-09-24 11:21           ` Sebastian Andrzej Siewior
@ 2019-09-24 13:53             ` Scott Wood
  2019-09-24 15:25               ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 35+ messages in thread
From: Scott Wood @ 2019-09-24 13:53 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On Tue, 2019-09-24 at 13:21 +0200, Sebastian Andrzej Siewior wrote:
> On 2019-09-23 19:52:33 [+0200], To Scott Wood wrote:
> 
> I made dis:
> 
> diff --git a/kernel/cpu.c b/kernel/cpu.c
> index 885a195dfbe02..25afa2bb1a2cf 100644
> --- a/kernel/cpu.c
> +++ b/kernel/cpu.c
> @@ -308,7 +308,9 @@ void pin_current_cpu(void)
>  	preempt_lazy_enable();
>  	preempt_enable();
>  
> +	sleeping_lock_inc();
>  	__read_rt_lock(cpuhp_pin);
> +	sleeping_lock_dec();
>  
>  	preempt_disable();
>  	preempt_lazy_disable();
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e1bdd7f9be054..63a6420d01053 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7388,6 +7388,7 @@ void migrate_enable(void)
>  
>  		WARN_ON(smp_processor_id() != task_cpu(p));
>  		if (!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
> +			struct task_struct *self = current;
>  			const struct cpumask *cpu_valid_mask =
> cpu_active_mask;
>  			struct migration_arg arg;
>  			unsigned int dest_cpu;
> @@ -7405,7 +7406,21 @@ void migrate_enable(void)
>  			unpin_current_cpu();
>  			preempt_lazy_enable();
>  			preempt_enable();
> +			rt_invol_sleep_inc();
> +
> +			raw_spin_lock_irq(&self->pi_lock);
> +			self->saved_state = self->state;
> +			__set_current_state_no_track(TASK_RUNNING);
> +			raw_spin_unlock_irq(&self->pi_lock);
> +
>  			stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
> +
> +			raw_spin_lock_irq(&self->pi_lock);
> +			__set_current_state_no_track(self->saved_state);
> +			self->saved_state = TASK_RUNNING;
> +			raw_spin_unlock_irq(&self->pi_lock);
> +
> +			rt_invol_sleep_dec();
>  			return;
>  		}
>  	}
> 
> I think we need to preserve the current state, otherwise we will lose
> anything != TASK_RUNNING here. So the spin_lock() would preserve it
> while waiting but the migrate_enable() will lose it if it needs to
> change the CPU at the end.
> I will try to prepare all commits for the next release before I release
> so you can have a look first and yell if needed.

As I pointed out in the "[PATCH RT 6/8] sched: migrate_enable: Set state to
TASK_RUNNING" discussion, we can get here inside the rtmutex code (e.g. from
debug_rt_mutex_print_deadlock) where saved_state is already holding
something -- plus, the waker won't have WF_LOCK_SLEEPER and therefore
saved_state will get cleared anyway.

-Scott



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 3/5] sched: migrate_dis/enable: Use rt_invol_sleep
  2019-09-24 13:53             ` Scott Wood
@ 2019-09-24 15:25               ` Sebastian Andrzej Siewior
  2019-09-24 15:47                 ` Scott Wood
  0 siblings, 1 reply; 35+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-09-24 15:25 UTC (permalink / raw)
  To: Scott Wood
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On 2019-09-24 08:53:43 [-0500], Scott Wood wrote:
> As I pointed out in the "[PATCH RT 6/8] sched: migrate_enable: Set state to
> TASK_RUNNING" discussion, we can get here inside the rtmutex code (e.g. from
> debug_rt_mutex_print_deadlock) where saved_state is already holding
> something -- plus, the waker won't have WF_LOCK_SLEEPER and therefore
> saved_state will get cleared anyway.

So let me drop the saved_state pieces and get back to it once I get to
the other thread (which you replied to and I didn't realise until now).

Regarding the WF_LOCK_SLEEPER part. I think this works as expected.
Imagine:

CPU0						CPU1
spin_lock();
set_current_state(TASK_UNINTERRUPTIBLE);
…
spin_unlock()
 -> migrate_enable();
   -> stop_one_cpu();				<-- A)
other_func();					<-- B)
schedule();

So, with only CPU0 we enter schedule() with TASK_UNINTERRUPTIBLE because
the state gets preserved with the change I added (which is expected).
If CPU1 sends a wake_up() at A) then the saved_state gets overwritten
and we enter schedule() with TASK_RUNNING. The same happens if it is sent
at point B), which is outside of any migrate/spinlock related code.

Was this clear or did I miss the point?

> -Scott

Sebastian

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 3/5] sched: migrate_dis/enable: Use rt_invol_sleep
  2019-09-24 15:25               ` Sebastian Andrzej Siewior
@ 2019-09-24 15:47                 ` Scott Wood
  2019-09-24 16:05                   ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 35+ messages in thread
From: Scott Wood @ 2019-09-24 15:47 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On Tue, 2019-09-24 at 17:25 +0200, Sebastian Andrzej Siewior wrote:
> On 2019-09-24 08:53:43 [-0500], Scott Wood wrote:
> > As I pointed out in the "[PATCH RT 6/8] sched: migrate_enable: Set state
> > to
> > TASK_RUNNING" discussion, we can get here inside the rtmutex code (e.g.
> > from
> > debug_rt_mutex_print_deadlock) where saved_state is already holding
> > something -- plus, the waker won't have WF_LOCK_SLEEPER and therefore
> > saved_state will get cleared anyway.
> 
> So let me drop the saved_state pieces and get back to it once I get to
> the other thread (which you replied and I didn't realised until now).
> 
> Regarding the WF_LOCK_SLEEPER part. I think this works as expected.
> Imagine:
> 
> CPU0						CPU1
> spin_lock();
> set_current_state(TASK_UNINTERRUPTIBLE);
> …
> spin_unlock()
>  -> migrate_enable();
>    -> stop_one_cpu();				<-- A)
> other_func();					<-- B)
> schedule();
> 
> So. With only CPU0 we enter schedule() with TASK_UNINTERRUPTIBLE because
> the state gets preserved with the change I added (which is expected).
> If CPU1 sends a wake_up() at A) then the saved_state gets overwritten
> and we enter schedule() with TASK_RUNNING. Same happens if it is sent at
> B) point which is outside of any migrate/spin lock related code. 
> 
> Was this clear or did I miss the point?

When the stop machine finishes it will do a wake_up_process() via
complete().  Since this does not pass WF_LOCK_SLEEPER, saved_state will be
cleared, and you'll have TASK_RUNNING when you get to other_func() and
schedule(), regardless of whether CPU1 sends wake_up() -- so this change
doesn't actually accomplish anything.

While, as noted in the other thread, I don't think these spurious wakeups
are a huge problem, we could avoid them by doing stop_one_cpu_nowait() and
then schedule() without messing with the task state.  Since we're stopping
our own CPU, it should be guaranteed that the stopper has finished by the
time we exit schedule().
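Roughly something like this (only a sketch; 'work' is a hypothetical
on-stack cpu_stop_work, which should be fine if the stopper really does
finish on this CPU before schedule() returns):

	unpin_current_cpu();
	preempt_lazy_enable();
	preempt_enable();
	stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, &arg, &work);
	schedule();	/* stopper preempts us; task state is left untouched */
	return;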

-Scott



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 3/5] sched: migrate_dis/enable: Use rt_invol_sleep
  2019-09-24 15:47                 ` Scott Wood
@ 2019-09-24 16:05                   ` Sebastian Andrzej Siewior
  2019-09-24 16:35                     ` Scott Wood
  0 siblings, 1 reply; 35+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-09-24 16:05 UTC (permalink / raw)
  To: Scott Wood
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On 2019-09-24 10:47:36 [-0500], Scott Wood wrote:
> When the stop machine finishes it will do a wake_up_process() via
> complete().  Since this does not pass WF_LOCK_SLEEPER, saved_state will be
> cleared, and you'll have TASK_RUNNING when you get to other_func() and
> schedule(), regardless of whether CPU1 sends wake_up() -- so this change
> doesn't actually accomplish anything.

True, I completely missed that part.

> While as noted in the other thread I don't think these spurious wakeups are
> a huge problem, we could avoid them by doing stop_one_cpu_nowait() and then
> schedule() without messing with task state.  Since we're stopping our own
> cpu, it should be guaranteed that the stopper has finished by the time we
> exit schedule().

I remember that losing a state can be a problem. Let's say it is not "just"
TASK_UNINTERRUPTIBLE -> TASK_RUNNING, which sounds harmless, but
__TASK_TRACED, and you lose it as part of unlocking siglock.

> -Scott

Sebastian

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 3/5] sched: migrate_dis/enable: Use rt_invol_sleep
  2019-09-24 16:05                   ` Sebastian Andrzej Siewior
@ 2019-09-24 16:35                     ` Scott Wood
  2019-10-04 16:45                       ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 35+ messages in thread
From: Scott Wood @ 2019-09-24 16:35 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On Tue, 2019-09-24 at 18:05 +0200, Sebastian Andrzej Siewior wrote:
> On 2019-09-24 10:47:36 [-0500], Scott Wood wrote:
> > When the stop machine finishes it will do a wake_up_process() via
> > complete().  Since this does not pass WF_LOCK_SLEEPER, saved_state will
> > be
> > cleared, and you'll have TASK_RUNNING when you get to other_func() and
> > schedule(), regardless of whether CPU1 sends wake_up() -- so this change
> > doesn't actually accomplish anything.
> 
> True, I completely missed that part.
> 
> > While as noted in the other thread I don't think these spurious wakeups
> > are
> > a huge problem, we could avoid them by doing stop_one_cpu_nowait() and
> > then
> > schedule() without messing with task state.  Since we're stopping our
> > own
> > cpu, it should be guaranteed that the stopper has finished by the time
> > we
> > exit schedule().
> 
> I remember loosing a state can be a problem. Lets say it is not "just"
> TASK_UNINTERRUPTIBLE -> TASK_RUNNING which sounds harmless but it is
> __TASK_TRACED and you lose it as part of unlocking siglock.

OK, sounds like stop_one_cpu_nowait() is the way to go then.

-Scott



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RT v3 3/5] sched: migrate_dis/enable: Use rt_invol_sleep
  2019-09-24 16:35                     ` Scott Wood
@ 2019-10-04 16:45                       ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 35+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-10-04 16:45 UTC (permalink / raw)
  To: Scott Wood
  Cc: linux-rt-users, linux-kernel, Paul E . McKenney, Joel Fernandes,
	Thomas Gleixner, Steven Rostedt, Peter Zijlstra, Juri Lelli,
	Clark Williams

On 2019-09-24 11:35:16 [-0500], Scott Wood wrote:
> 
> OK, sounds like stop_one_cpu_nowait() is the way to go then.

So I applied the last three patches from the migrate-disable() series
and it looks good. Nothing blew up in my testing.
There were no objections to the stop_one_cpu_nowait() suggestion, and I
think it is important not to lose the state at the end.
Could you please repost the patches?

> -Scott

Sebastian

^ permalink raw reply	[flat|nested] 35+ messages in thread

Thread overview: 35+ messages
2019-09-11 16:57 [PATCH RT v3 0/5] RCU fixes Scott Wood
2019-09-11 16:57 ` [PATCH RT v3 1/5] rcu: Acquire RCU lock when disabling BHs Scott Wood
2019-09-12 22:09   ` Joel Fernandes
2019-09-17  7:44   ` Sebastian Andrzej Siewior
2019-09-17 14:06     ` Scott Wood
2019-09-17 14:42       ` Sebastian Andrzej Siewior
2019-09-17 16:12         ` Scott Wood
2019-09-23 16:41           ` Sebastian Andrzej Siewior
2019-09-11 16:57 ` [PATCH RT v3 2/5] sched: Rename sleeping_lock to rt_invol_sleep Scott Wood
2019-09-17  7:52   ` Sebastian Andrzej Siewior
2019-09-11 16:57 ` [PATCH RT v3 3/5] sched: migrate_dis/enable: Use rt_invol_sleep Scott Wood
2019-09-17  7:59   ` Sebastian Andrzej Siewior
2019-09-17 14:06     ` Scott Wood
2019-09-23 16:59       ` Scott Wood
2019-09-23 17:52         ` Sebastian Andrzej Siewior
2019-09-24 11:21           ` Sebastian Andrzej Siewior
2019-09-24 13:53             ` Scott Wood
2019-09-24 15:25               ` Sebastian Andrzej Siewior
2019-09-24 15:47                 ` Scott Wood
2019-09-24 16:05                   ` Sebastian Andrzej Siewior
2019-09-24 16:35                     ` Scott Wood
2019-10-04 16:45                       ` Sebastian Andrzej Siewior
2019-09-11 16:57 ` [PATCH RT v3 4/5] rcu: Disable use_softirq on PREEMPT_RT Scott Wood
2019-09-12 21:38   ` Joel Fernandes
2019-09-12 22:19     ` Joel Fernandes
2019-09-17  9:31     ` Sebastian Andrzej Siewior
2019-09-17 14:08   ` Scott Wood
2019-09-11 16:57 ` [PATCH RT v3 5/5] rcutorture: Avoid problematic critical section nesting on RT Scott Wood
2019-09-12 22:17   ` Joel Fernandes
2019-09-16 16:55     ` Scott Wood
2019-09-17 10:07       ` Sebastian Andrzej Siewior
2019-09-17 14:36         ` Scott Wood
2019-09-17 14:50           ` Sebastian Andrzej Siewior
2019-09-17 16:32             ` Scott Wood
2019-09-23 16:25               ` Sebastian Andrzej Siewior
