linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [ANNOUNCE] v5.14-rc4-rt4
@ 2021-08-02 16:27 Sebastian Andrzej Siewior
  2021-08-04  8:24 ` Daniel Wagner
  0 siblings, 1 reply; 30+ messages in thread
From: Sebastian Andrzej Siewior @ 2021-08-02 16:27 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, linux-rt-users, Steven Rostedt

Dear RT folks!

I'm pleased to announce the v5.14-rc4-rt4 patch set. 

Changes since v5.14-rc4-rt3:

  - Updating the locking bits:

    - Use proper task state in ww_mutex_lock_interruptible(), reported
      by Mike Galbraith.

    - Fix the wake_q handling for a task which blocks simultaneously as
      a regular task and additionally as a sleeper on a sleeping lock.
      Regression introduced in the v5.14 cycle, reported by Mike
      Galbraith, patched by Thomas Gleixner.

    - Address the futex related review comments by Peter Zijlstra.

Known issues
     - netconsole triggers WARN.

     - The "Memory controller" (CONFIG_MEMCG) has been disabled.

     - An RCU and ARM64 warning has been fixed by Valentin Schneider. It is
       still not clear if the RCU related change is correct.

The delta patch against v5.14-rc4-rt3 is appended below and can be found here:
 
     https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.14/incr/patch-5.14-rc4-rt3-rt4.patch.xz

You can get this release via the git tree at:

    git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v5.14-rc4-rt4

The RT patch against v5.14-rc4 can be found here:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.14/older/patch-5.14-rc4-rt4.patch.xz

The split quilt queue is available at:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.14/older/patches-5.14-rc4-rt4.tar.xz

Sebastian

diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h
index cdd98d57c0e6a..8e57a6660aad1 100644
--- a/include/linux/sched/wake_q.h
+++ b/include/linux/sched/wake_q.h
@@ -61,11 +61,5 @@ static inline bool wake_q_empty(struct wake_q_head *head)
 
 extern void wake_q_add(struct wake_q_head *head, struct task_struct *task);
 extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task);
-extern void __wake_up_q(struct wake_q_head *head, unsigned int state);
-
-static inline void wake_up_q(struct wake_q_head *head)
-{
-	__wake_up_q(head, TASK_NORMAL);
-}
-
+extern void wake_up_q(struct wake_q_head *head);
 #endif /* _LINUX_SCHED_WAKE_Q_H */
diff --git a/kernel/futex.c b/kernel/futex.c
index 7fc061ee5f2d4..c4e28bd37abcb 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1853,15 +1853,16 @@ enum {
 static inline bool futex_requeue_pi_prepare(struct futex_q *q,
 					    struct futex_pi_state *pi_state)
 {
-	int cur, res, new;
+	int old, new;
 
 	/*
 	 * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has
 	 * already set Q_REQUEUE_PI_IGNORE to signal that requeue should
 	 * ignore the waiter.
 	 */
-	for (cur = atomic_read(&q->requeue_state);; cur = res) {
-		if (cur == Q_REQUEUE_PI_IGNORE)
+	old = atomic_read_acquire(&q->requeue_state);
+	do {
+		if (old == Q_REQUEUE_PI_IGNORE)
 			return false;
 
 		/*
@@ -1872,74 +1873,68 @@ static inline bool futex_requeue_pi_prepare(struct futex_q *q,
 		 * trylock, but that would just add more conditionals
 		 * all over the place for a dubious value.
 		 */
-		if (cur != Q_REQUEUE_PI_NONE)
+		if (old != Q_REQUEUE_PI_NONE)
 			break;
 
 		new = Q_REQUEUE_PI_IN_PROGRESS;
-		res = atomic_cmpxchg(&q->requeue_state, cur, new);
-		if (likely(cur == res))
-			break;
-	}
+	} while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
+
 	q->pi_state = pi_state;
 	return true;
 }
 
 static inline void futex_requeue_pi_complete(struct futex_q *q, int locked)
 {
-	int cur, res, new;
+	int old, new;
 
-	for (cur = atomic_read(&q->requeue_state);; cur = res) {
+	old = atomic_read_acquire(&q->requeue_state);
+	do {
 		if (locked >= 0) {
 			/* Requeue succeeded. Set DONE or LOCKED */
 			new = Q_REQUEUE_PI_DONE + locked;
-		} else if (cur == Q_REQUEUE_PI_IN_PROGRESS) {
+		} else if (old == Q_REQUEUE_PI_IN_PROGRESS) {
 			/* Deadlock, no early wakeup interleave */
 			new = Q_REQUEUE_PI_NONE;
 		} else {
 			/* Deadlock, early wakeup interleave. */
 			new = Q_REQUEUE_PI_IGNORE;
 		}
-
-		res = atomic_cmpxchg(&q->requeue_state, cur, new);
-		if (likely(cur == res))
-			break;
-	}
+	} while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
 
 #ifdef CONFIG_PREEMPT_RT
 	/* If the waiter interleaved with the requeue let it know */
-	if (unlikely(cur == Q_REQUEUE_PI_WAIT))
+	if (unlikely(old == Q_REQUEUE_PI_WAIT))
 		rcuwait_wake_up(&q->requeue_wait);
 #endif
 }
 
 static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q)
 {
-	int cur, new, res;
+	int old, new;
 
-	for (cur = atomic_read(&q->requeue_state);; cur = res) {
+	old = atomic_read_acquire(&q->requeue_state);
+	do {
 		/* Is requeue done already? */
-		if (cur >= Q_REQUEUE_PI_DONE)
-			break;
+		if (old >= Q_REQUEUE_PI_DONE)
+			return old;
 
 		/*
 		 * If not done, then tell the requeue code to either ignore
 		 * the waiter or to wake it up once the requeue is done.
 		 */
-		new = !cur ? Q_REQUEUE_PI_IGNORE : Q_REQUEUE_PI_WAIT;
-		res = atomic_cmpxchg(&q->requeue_state, cur, new);
-		if (likely(cur == res))
-			break;
-	}
+		new = Q_REQUEUE_PI_WAIT;
+		if (old == Q_REQUEUE_PI_NONE)
+			new = Q_REQUEUE_PI_IGNORE;
+	} while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
 
 	/* If the requeue was in progress, wait for it to complete */
-	if (cur == Q_REQUEUE_PI_IN_PROGRESS) {
+	if (old == Q_REQUEUE_PI_IN_PROGRESS) {
 #ifdef CONFIG_PREEMPT_RT
 		rcuwait_wait_event(&q->requeue_wait,
 				   atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT,
 				   TASK_UNINTERRUPTIBLE);
 #else
-		while (atomic_read(&q->requeue_state) == Q_REQUEUE_PI_WAIT)
-			cpu_relax();
+		(void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT);
 #endif
 	}
 
@@ -2039,7 +2034,10 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
 	if (!top_waiter)
 		return 0;
 
-	/* Ensure that this is a waiter sitting in futex_wait_requeue_pi() */
+	/*
+	 * Ensure that this is a waiter sitting in futex_wait_requeue_pi()
+	 * and waiting on the 'waitqueue' futex which is always !PI.
+	 */
 	if (!top_waiter->rt_waiter || top_waiter->pi_state)
 		ret = -EINVAL;
 
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index e347bbc12641d..eadaface1fd29 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -429,7 +429,10 @@ static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh,
 						struct rt_mutex_waiter *w)
 {
 	if (IS_ENABLED(CONFIG_PREEMPT_RT) && w->wake_state != TASK_NORMAL) {
-		wake_q_add(&wqh->rt_head, w->task);
+		if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+			WARN_ON_ONCE(wqh->rtlock_task);
+		get_task_struct(w->task);
+		wqh->rtlock_task = w->task;
 	} else {
 		wake_q_add(&wqh->head, w->task);
 	}
@@ -437,8 +440,11 @@ static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh,
 
 static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh)
 {
-	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !wake_q_empty(&wqh->rt_head))
-		__wake_up_q(&wqh->rt_head, TASK_RTLOCK_WAIT);
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) {
+		wake_up_state(wqh->rtlock_task, TASK_RTLOCK_WAIT);
+		put_task_struct(wqh->rtlock_task);
+		wqh->rtlock_task = NULL;
+	}
 
 	if (!wake_q_empty(&wqh->head))
 		wake_up_q(&wqh->head);
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 9697b04c40e68..61256de5bd66a 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -44,18 +44,18 @@ struct rt_mutex_waiter {
 /**
  * rt_wake_q_head - Wrapper around regular wake_q_head to support
  *		    "sleeping" spinlocks on RT
- * @head:	The regular wake_q_head for sleeping lock variants
- * @rt_head:	The wake_q_head for RT lock (spin/rwlock) variants
+ * @head:		The regular wake_q_head for sleeping lock variants
+ * @rtlock_task:	Task pointer for RT lock (spin/rwlock) wakeups
  */
 struct rt_wake_q_head {
 	struct wake_q_head	head;
-	struct wake_q_head	rt_head;
+	struct task_struct	*rtlock_task;
 };
 
 #define DEFINE_RT_WAKE_Q(name)						\
 	struct rt_wake_q_head name = {					\
 		.head		= WAKE_Q_HEAD_INITIALIZER(name.head),	\
-		.rt_head	= WAKE_Q_HEAD_INITIALIZER(name.rt_head),\
+		.rtlock_task	= NULL,					\
 	}
 
 /*
diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c
index 9b7e3f413a828..519092ee5e88e 100644
--- a/kernel/locking/ww_rt_mutex.c
+++ b/kernel/locking/ww_rt_mutex.c
@@ -60,7 +60,7 @@ EXPORT_SYMBOL(ww_mutex_lock);
 int __sched
 ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
-	return __ww_rt_mutex_lock(lock, ctx, TASK_UNINTERRUPTIBLE, _RET_IP_);
+	return __ww_rt_mutex_lock(lock, ctx, TASK_INTERRUPTIBLE, _RET_IP_);
 }
 EXPORT_SYMBOL(ww_mutex_lock_interruptible);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 87e0ad6665260..a742e9dc9a7cb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -920,7 +920,7 @@ void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
 		put_task_struct(task);
 }
 
-void __wake_up_q(struct wake_q_head *head, unsigned int state)
+void wake_up_q(struct wake_q_head *head)
 {
 	struct wake_q_node *node = head->first;
 
@@ -936,7 +936,7 @@ void __wake_up_q(struct wake_q_head *head, unsigned int state)
 		 * wake_up_process() executes a full barrier, which pairs with
 		 * the queueing in wake_q_add() so as not to miss wakeups.
 		 */
-		wake_up_state(task, state);
+		wake_up_process(task);
 		put_task_struct(task);
 	}
 }
diff --git a/localversion-rt b/localversion-rt
index 1445cd65885cd..ad3da1bcab7e8 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt3
+-rt4

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-02 16:27 [ANNOUNCE] v5.14-rc4-rt4 Sebastian Andrzej Siewior
@ 2021-08-04  8:24 ` Daniel Wagner
  2021-08-04 10:43   ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 30+ messages in thread
From: Daniel Wagner @ 2021-08-04  8:24 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

Hi Sebastian,

On Mon, Aug 02, 2021 at 06:27:50PM +0200, Sebastian Andrzej Siewior wrote:
> Dear RT folks!
> 
> I'm pleased to announce the v5.14-rc4-rt4 patch set. 

stress-ng is able to trigger a BUG on my old x86_64 box very
easily. Unfortunately, there is not much in the logs though.

+ ./cyclictest.sh -D 15m -p 98 -i 1000 -t 2 -a 0-1 -h 100 -w 'stress-ng --class io --all 1'
# /dev/cpu_dma_latency set to 0us
[   21.059000] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
[   21.059008] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 321, name: iou-wrk-314
[   21.059010] Preemption disabled at:
[   21.059011] [<ffffffffa5223ce7>] schedule+0xa7/0xf0
[   23.076059] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
[   23.076064] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 324, name: iou-wrk-314
[   23.076066] Preemption disabled at:
[   23.076067] [<ffffffffa5223ce7>] schedule+0xa7/0xf0
[   37.416490] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
[   37.416495] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 369, name: iou-wrk-314
[   37.416498] Preemption disabled at:
[   37.416498] [<ffffffffa5223ce7>] schedule+0xa7/0xf0
[   38.416858] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
[   38.416863] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 411, name: iou-wrk-314
[   38.416864] Preemption disabled at:
[   38.416865] [<ffffffffa5223ce7>] schedule+0xa7/0xf0
[   43.486360] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
[   43.486366] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 414, name: iou-wrk-314
[   43.486368] Preemption disabled at:
[   43.486369] [<ffffffffa5223ce7>] schedule+0xa7/0xf0

Thanks,
Daniel

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04  8:24 ` Daniel Wagner
@ 2021-08-04 10:43   ` Sebastian Andrzej Siewior
  2021-08-04 10:48     ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 30+ messages in thread
From: Sebastian Andrzej Siewior @ 2021-08-04 10:43 UTC (permalink / raw)
  To: Daniel Wagner; +Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On 2021-08-04 10:24:18 [+0200], Daniel Wagner wrote:
> Hi Sebastian,
> 
> On Mon, Aug 02, 2021 at 06:27:50PM +0200, Sebastian Andrzej Siewior wrote:
> > Dear RT folks!
> > 
> > I'm pleased to announce the v5.14-rc4-rt4 patch set. 
> 
> stress-ng is able to trigger a BUG on my old x86_64 box very
> easy. Unfortunately, there is not much in the logs though.

Odd. Do you have a config for that, please?

> Thanks,
> Daniel

Sebastian

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 10:43   ` Sebastian Andrzej Siewior
@ 2021-08-04 10:48     ` Sebastian Andrzej Siewior
  2021-08-04 11:00       ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 30+ messages in thread
From: Sebastian Andrzej Siewior @ 2021-08-04 10:48 UTC (permalink / raw)
  To: Daniel Wagner; +Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On 2021-08-04 12:43:42 [+0200], To Daniel Wagner wrote:
> Odd. Do you have a config for that, please?

No need.
| [   90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
| [   90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
| [   90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G        W         5.14.0-rc4-rt4+ #89
| [   90.202559] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
| [   90.202561] Call Trace:
| [   90.202577]  dump_stack_lvl+0x34/0x44
| [   90.202584]  ___might_sleep.cold+0x87/0x94
| [   90.202588]  rt_spin_lock+0x19/0x70
| [   90.202593]  ___slab_alloc+0xcb/0x7d0
| [   90.202598]  ? newidle_balance.constprop.0+0xf5/0x3b0
| [   90.202603]  ? dequeue_entity+0xc3/0x290
| [   90.202605]  ? io_wqe_dec_running.isra.0+0x98/0xe0
| [   90.202610]  ? pick_next_task_fair+0xb9/0x330
| [   90.202612]  ? __schedule+0x670/0x1410
| [   90.202615]  ? io_wqe_dec_running.isra.0+0x98/0xe0
| [   90.202618]  kmem_cache_alloc_trace+0x79/0x1f0
| [   90.202621]  io_wqe_dec_running.isra.0+0x98/0xe0
| [   90.202625]  io_wq_worker_sleeping+0x37/0x50
| [   90.202628]  schedule+0x30/0xd0
| [   90.202630]  schedule_timeout+0x8f/0x1a0
| [   90.202634]  ? __bpf_trace_tick_stop+0x10/0x10
| [   90.202637]  io_wqe_worker+0xfd/0x320
| [   90.202641]  ? finish_task_switch.isra.0+0xd3/0x290
| [   90.202644]  ? io_worker_handle_work+0x670/0x670
| [   90.202646]  ? io_worker_handle_work+0x670/0x670
| [   90.202649]  ret_from_fork+0x22/0x30

le look.

> > Thanks,
> > Daniel

Sebastian

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 10:48     ` Sebastian Andrzej Siewior
@ 2021-08-04 11:00       ` Sebastian Andrzej Siewior
  2021-08-04 13:17         ` Peter Zijlstra
  0 siblings, 1 reply; 30+ messages in thread
From: Sebastian Andrzej Siewior @ 2021-08-04 11:00 UTC (permalink / raw)
  To: Daniel Wagner
  Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt,
	Peter Zijlstra, Jens Axboe

On 2021-08-04 12:48:05 [+0200], To Daniel Wagner wrote:
> On 2021-08-04 12:43:42 [+0200], To Daniel Wagner wrote:
> > Odd. Do you have a config for that, please?
> 
> No need.
> | [   90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
> | [   90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
> | [   90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G        W         5.14.0-rc4-rt4+ #89
> | [   90.202561] Call Trace:
> | [   90.202588]  rt_spin_lock+0x19/0x70
> | [   90.202593]  ___slab_alloc+0xcb/0x7d0
> | [   90.202618]  kmem_cache_alloc_trace+0x79/0x1f0
> | [   90.202621]  io_wqe_dec_running.isra.0+0x98/0xe0
> | [   90.202625]  io_wq_worker_sleeping+0x37/0x50
> | [   90.202628]  schedule+0x30/0xd0
> 
> le look.

So this is due to commit
  685fe7feedb96 ("io-wq: eliminate the need for a manager thread")

introduced in the v5.13-rc1 merge window. The call chain is
  schedule()
   sched_submit_work()
    preempt_disable();
    io_wq_worker_sleeping()
      raw_spin_lock_irq(&worker->wqe->lock);
      io_wqe_dec_running(worker);
       io_queue_worker_create()
        kmalloc(sizeof(*cwd), GFP_ATOMIC);

The lock wqe::lock has been turned into a raw_spinlock_t in commit
   95da84659226d ("io_wq: Make io_wqe::lock a raw_spinlock_t")

after a careful analysis of the code at that time. This commit breaks
things. Is this really needed?

> > > Thanks,
> > > Daniel

Sebastian

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 11:00       ` Sebastian Andrzej Siewior
@ 2021-08-04 13:17         ` Peter Zijlstra
  2021-08-04 13:32           ` Jens Axboe
  0 siblings, 1 reply; 30+ messages in thread
From: Peter Zijlstra @ 2021-08-04 13:17 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Daniel Wagner, Thomas Gleixner, LKML, linux-rt-users,
	Steven Rostedt, Jens Axboe

On Wed, Aug 04, 2021 at 01:00:57PM +0200, Sebastian Andrzej Siewior wrote:
> On 2021-08-04 12:48:05 [+0200], To Daniel Wagner wrote:
> > On 2021-08-04 12:43:42 [+0200], To Daniel Wagner wrote:
> > > Odd. Do you have a config for that, please?
> > 
> > No need.
> > | [   90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
> > | [   90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
> > | [   90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G        W         5.14.0-rc4-rt4+ #89
> > | [   90.202561] Call Trace:
> …
> > | [   90.202588]  rt_spin_lock+0x19/0x70
> > | [   90.202593]  ___slab_alloc+0xcb/0x7d0
> …
> > | [   90.202618]  kmem_cache_alloc_trace+0x79/0x1f0
> > | [   90.202621]  io_wqe_dec_running.isra.0+0x98/0xe0
> > | [   90.202625]  io_wq_worker_sleeping+0x37/0x50
> > | [   90.202628]  schedule+0x30/0xd0
> > 
> > le look.
> 
> So this is due to commit
>   685fe7feedb96 ("io-wq: eliminate the need for a manager thread")
> 
> introduced in the v5.13-rc1 merge window. The call chain is
>   schedule()
>    sched_submit_work()
>     preempt_disable();
>     io_wq_worker_sleeping()
>       raw_spin_lock_irq(&worker->wqe->lock);
>       io_wqe_dec_running(worker);
>        io_queue_worker_create()
>         kmalloc(sizeof(*cwd), GFP_ATOMIC);
> 
> The lock wqe::lock has been turned into a raw_spinlock_t in commit
>    95da84659226d ("io_wq: Make io_wqe::lock a raw_spinlock_t")
> 
> after a careful analysis of the code at that time. This commit breaks
> things. Is this really needed?

Urgh, doing allocs from schedule seems really yuck. Can we please not do
this?

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 13:17         ` Peter Zijlstra
@ 2021-08-04 13:32           ` Jens Axboe
  2021-08-04 14:23             ` Jens Axboe
  0 siblings, 1 reply; 30+ messages in thread
From: Jens Axboe @ 2021-08-04 13:32 UTC (permalink / raw)
  To: Peter Zijlstra, Sebastian Andrzej Siewior
  Cc: Daniel Wagner, Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On 8/4/21 7:17 AM, Peter Zijlstra wrote:
> On Wed, Aug 04, 2021 at 01:00:57PM +0200, Sebastian Andrzej Siewior wrote:
>> On 2021-08-04 12:48:05 [+0200], To Daniel Wagner wrote:
>>> On 2021-08-04 12:43:42 [+0200], To Daniel Wagner wrote:
>>>> Odd. Do you have a config for that, please?
>>>
>>> No need.
>>> | [   90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
>>> | [   90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
>>> | [   90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G        W         5.14.0-rc4-rt4+ #89
>>> | [   90.202561] Call Trace:
>> …
>>> | [   90.202588]  rt_spin_lock+0x19/0x70
>>> | [   90.202593]  ___slab_alloc+0xcb/0x7d0
>> …
>>> | [   90.202618]  kmem_cache_alloc_trace+0x79/0x1f0
>>> | [   90.202621]  io_wqe_dec_running.isra.0+0x98/0xe0
>>> | [   90.202625]  io_wq_worker_sleeping+0x37/0x50
>>> | [   90.202628]  schedule+0x30/0xd0
>>>
>>> le look.
>>
>> So this is due to commit
>>   685fe7feedb96 ("io-wq: eliminate the need for a manager thread")
>>
>> introduced in the v5.13-rc1 merge window. The call chain is
>>   schedule()
>>    sched_submit_work()
>>     preempt_disable();
>>     io_wq_worker_sleeping()
>>       raw_spin_lock_irq(&worker->wqe->lock);
>>       io_wqe_dec_running(worker);
>>        io_queue_worker_create()
>>         kmalloc(sizeof(*cwd), GFP_ATOMIC);
>>
>> The lock wqe::lock has been turned into a raw_spinlock_t in commit
>>    95da84659226d ("io_wq: Make io_wqe::lock a raw_spinlock_t")
>>
>> after a careful analysis of the code at that time. This commit breaks
>> things. Is this really needed?
> 
> Urgh, doing allocs from schedule seems really yuck. Can we please not do
> this?

Agree, I have an idea of how to get rid of it. Let me experiment a bit...

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 13:32           ` Jens Axboe
@ 2021-08-04 14:23             ` Jens Axboe
  2021-08-04 15:33               ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 30+ messages in thread
From: Jens Axboe @ 2021-08-04 14:23 UTC (permalink / raw)
  To: Peter Zijlstra, Sebastian Andrzej Siewior
  Cc: Daniel Wagner, Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On 8/4/21 7:32 AM, Jens Axboe wrote:
> On 8/4/21 7:17 AM, Peter Zijlstra wrote:
>> On Wed, Aug 04, 2021 at 01:00:57PM +0200, Sebastian Andrzej Siewior wrote:
>>> On 2021-08-04 12:48:05 [+0200], To Daniel Wagner wrote:
>>>> On 2021-08-04 12:43:42 [+0200], To Daniel Wagner wrote:
>>>>> Odd. Do you have a config for that, please?
>>>>
>>>> No need.
>>>> | [   90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
>>>> | [   90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
>>>> | [   90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G        W         5.14.0-rc4-rt4+ #89
>>>> | [   90.202561] Call Trace:
>>> …
>>>> | [   90.202588]  rt_spin_lock+0x19/0x70
>>>> | [   90.202593]  ___slab_alloc+0xcb/0x7d0
>>> …
>>>> | [   90.202618]  kmem_cache_alloc_trace+0x79/0x1f0
>>>> | [   90.202621]  io_wqe_dec_running.isra.0+0x98/0xe0
>>>> | [   90.202625]  io_wq_worker_sleeping+0x37/0x50
>>>> | [   90.202628]  schedule+0x30/0xd0
>>>>
>>>> le look.
>>>
>>> So this is due to commit
>>>   685fe7feedb96 ("io-wq: eliminate the need for a manager thread")
>>>
>>> introduced in the v5.13-rc1 merge window. The call chain is
>>>   schedule()
>>>    sched_submit_work()
>>>     preempt_disable();
>>>     io_wq_worker_sleeping()
>>>       raw_spin_lock_irq(&worker->wqe->lock);
>>>       io_wqe_dec_running(worker);
>>>        io_queue_worker_create()
>>>         kmalloc(sizeof(*cwd), GFP_ATOMIC);
>>>
>>> The lock wqe::lock has been turned into a raw_spinlock_t in commit
>>>    95da84659226d ("io_wq: Make io_wqe::lock a raw_spinlock_t")
>>>
>>> after a careful analysis of the code at that time. This commit breaks
>>> things. Is this really needed?
>>
>> Urgh, doing allocs from schedule seems really yuck. Can we please not do
>> this?
> 
> Agree, I have an idea of how to get rid of it. Let me experiment a bit...

Something like this should do it - the only thing we need the allocation for
is short lived, queueing a task_work item to create a new worker. We can
manage this on a per-existing worker basis, and just have the tw/index
stored in the worker itself. That avoids an allocation off schedule ->
going to sleep.

Totally untested, but I think the principle is sound. I'll run it through
some testing.


diff --git a/fs/io-wq.c b/fs/io-wq.c
index 50dc93ffc153..97eaaf25a429 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -51,6 +51,10 @@ struct io_worker {
 
 	struct completion ref_done;
 
+	unsigned long create_state;
+	struct callback_head create_work;
+	int create_index;
+
 	struct rcu_head rcu;
 };
 
@@ -261,42 +265,44 @@ static void io_wqe_inc_running(struct io_worker *worker)
 	atomic_inc(&acct->nr_running);
 }
 
-struct create_worker_data {
-	struct callback_head work;
-	struct io_wqe *wqe;
-	int index;
-};
-
 static void create_worker_cb(struct callback_head *cb)
 {
-	struct create_worker_data *cwd;
+	struct io_worker *worker;
 	struct io_wq *wq;
 
-	cwd = container_of(cb, struct create_worker_data, work);
-	wq = cwd->wqe->wq;
-	create_io_worker(wq, cwd->wqe, cwd->index);
-	kfree(cwd);
+	worker = container_of(cb, struct io_worker, create_work);
+	wq = worker->wqe->wq;
+	create_io_worker(wq, worker->wqe, worker->create_index);
+	clear_bit_unlock(0, &worker->create_state);
+	io_worker_release(worker);
 }
 
-static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct)
+static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker,
+				   struct io_wqe_acct *acct)
 {
-	struct create_worker_data *cwd;
 	struct io_wq *wq = wqe->wq;
 
 	/* raced with exit, just ignore create call */
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
 		goto fail;
+	/*
+	 * create_state manages ownership of create_work/index. We should
+	 * only need one entry per worker, as the worker going to sleep
+	 * will trigger the condition, and waking will clear it once it
+	 * runs the task_work.
+	 */
+	if (test_bit(0, &worker->create_state) ||
+	    test_and_set_bit_lock(0, &worker->create_state))
+		goto fail;
 
-	cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC);
-	if (cwd) {
-		init_task_work(&cwd->work, create_worker_cb);
-		cwd->wqe = wqe;
-		cwd->index = acct->index;
-		if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL))
-			return;
+	io_worker_get(worker);
+	init_task_work(&worker->create_work, create_worker_cb);
+	worker->create_index = acct->index;
+	if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+		return;
 
-		kfree(cwd);
-	}
+	clear_bit_unlock(0, &worker->create_state);
+	io_worker_release(worker);
 fail:
 	atomic_dec(&acct->nr_running);
 	io_worker_ref_put(wq);
@@ -314,7 +320,7 @@ static void io_wqe_dec_running(struct io_worker *worker)
 	if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
 		atomic_inc(&acct->nr_running);
 		atomic_inc(&wqe->wq->worker_refs);
-		io_queue_worker_create(wqe, acct);
+		io_queue_worker_create(wqe, worker, acct);
 	}
 }
 
@@ -973,12 +979,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 
 static bool io_task_work_match(struct callback_head *cb, void *data)
 {
-	struct create_worker_data *cwd;
+	struct io_worker *worker;
 
 	if (cb->func != create_worker_cb)
 		return false;
-	cwd = container_of(cb, struct create_worker_data, work);
-	return cwd->wqe->wq == data;
+	worker = container_of(cb, struct io_worker, create_work);
+	return worker->wqe->wq == data;
 }
 
 void io_wq_exit_start(struct io_wq *wq)
@@ -995,12 +1001,13 @@ static void io_wq_exit_workers(struct io_wq *wq)
 		return;
 
 	while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
-		struct create_worker_data *cwd;
+		struct io_worker *worker;
 
-		cwd = container_of(cb, struct create_worker_data, work);
-		atomic_dec(&cwd->wqe->acct[cwd->index].nr_running);
+		worker = container_of(cb, struct io_worker, create_work);
+		atomic_dec(&worker->wqe->acct[worker->create_index].nr_running);
 		io_worker_ref_put(wq);
-		kfree(cwd);
+		clear_bit_unlock(0, &worker->create_state);
+		io_worker_release(worker);
 	}
 
 	rcu_read_lock();

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 14:23             ` Jens Axboe
@ 2021-08-04 15:33               ` Sebastian Andrzej Siewior
  2021-08-04 15:39                 ` Jens Axboe
  0 siblings, 1 reply; 30+ messages in thread
From: Sebastian Andrzej Siewior @ 2021-08-04 15:33 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Peter Zijlstra, Daniel Wagner, Thomas Gleixner, LKML,
	linux-rt-users, Steven Rostedt

On 2021-08-04 08:23:55 [-0600], Jens Axboe wrote:
> Totally untested, but I think the principle is sound. I'll run it through
> some testing.

This is needed:

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 1192ee2abd982..77ec6896edaa5 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -428,9 +428,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 	}
 
 	if (stall_hash != -1U) {
-		raw_spin_unlock(&wqe->lock);
+		raw_spin_unlock_irq(&wqe->lock);
 		io_wait_on_hash(wqe, stall_hash);
-		raw_spin_lock(&wqe->lock);
+		raw_spin_lock_irq(&wqe->lock);
 	}
 
 	return NULL;


otherwise the spinlock_t lock in io_wait_on_hash() is acquired with
disabled interrupts which is a no-no on -RT.
With that it all looks good as far as I can tell.
Thank you.

Sebastian

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 15:33               ` Sebastian Andrzej Siewior
@ 2021-08-04 15:39                 ` Jens Axboe
  2021-08-04 15:47                   ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 30+ messages in thread
From: Jens Axboe @ 2021-08-04 15:39 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Peter Zijlstra, Daniel Wagner, Thomas Gleixner, LKML,
	linux-rt-users, Steven Rostedt

On 8/4/21 9:33 AM, Sebastian Andrzej Siewior wrote:
> On 2021-08-04 08:23:55 [-0600], Jens Axboe wrote:
>> Totally untested, but I think the principle is sound. I'll run it through
>> some testing.
> 
> This is needed:
> 
> diff --git a/fs/io-wq.c b/fs/io-wq.c
> index 1192ee2abd982..77ec6896edaa5 100644
> --- a/fs/io-wq.c
> +++ b/fs/io-wq.c
> @@ -428,9 +428,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
>  	}
>  
>  	if (stall_hash != -1U) {
> -		raw_spin_unlock(&wqe->lock);
> +		raw_spin_unlock_irq(&wqe->lock);
>  		io_wait_on_hash(wqe, stall_hash);
> -		raw_spin_lock(&wqe->lock);
> +		raw_spin_lock_irq(&wqe->lock);
>  	}
>  
>  	return NULL;
> 
> 
> otherwise the spinlock_t lock in io_wait_on_hash() is acquired with
> disabled interrupts which is a no-no on -RT.
> With that it all looks good as far as I can tell.
> Thank you.

I'm confused, the waitqueue locks are always IRQ disabling.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 15:39                 ` Jens Axboe
@ 2021-08-04 15:47                   ` Sebastian Andrzej Siewior
  2021-08-04 15:49                     ` Jens Axboe
  0 siblings, 1 reply; 30+ messages in thread
From: Sebastian Andrzej Siewior @ 2021-08-04 15:47 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Peter Zijlstra, Daniel Wagner, Thomas Gleixner, LKML,
	linux-rt-users, Steven Rostedt

On 2021-08-04 09:39:30 [-0600], Jens Axboe wrote:
> I'm confused, the waitqueue locks are always IRQ disabling.

spin_lock_irq() does not disable interrupts on -RT. The patch above
produces:

| BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
| in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 2020, name: iou-wrk-2018
| 1 lock held by iou-wrk-2018/2020:
|  #0: ffff888111a47de8 (&hash->wait){+.+.}-{0:0}, at: io_worker_handle_work+0x443/0x630
| irq event stamp: 10
| hardirqs last  enabled at (9): [<ffffffff81c47818>] _raw_spin_unlock_irqrestore+0x28/0x70
| hardirqs last disabled at (10): [<ffffffff81c4769e>] _raw_spin_lock_irq+0x3e/0x40
| softirqs last  enabled at (0): [<ffffffff81077238>] copy_process+0x8f8/0x2020
| softirqs last disabled at (0): [<0000000000000000>] 0x0
| CPU: 5 PID: 2020 Comm: iou-wrk-2018 Tainted: G        W         5.14.0-rc4-rt4+ #97
| Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
| Call Trace:
|  dump_stack_lvl+0x45/0x59
|  ___might_sleep.cold+0xa6/0xb6
|  rt_spin_lock+0x35/0xc0
|  ? io_worker_handle_work+0x443/0x630
|  io_worker_handle_work+0x443/0x630
|  io_wqe_worker+0xb4/0x340
|  ? lockdep_hardirqs_on_prepare+0xd4/0x170
|  ? _raw_spin_unlock_irqrestore+0x28/0x70
|  ? _raw_spin_unlock_irqrestore+0x28/0x70
|  ? io_worker_handle_work+0x630/0x630
|  ? rt_mutex_slowunlock+0x2ba/0x310
|  ? io_worker_handle_work+0x630/0x630
|  ret_from_fork+0x22/0x30


But indeed, you are right, my snippet breaks non-RT. So this then maybe:

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 57d3cdddcdb3e..0b931ac3c83e6 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -384,7 +384,7 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
 {
 	struct io_wq *wq = wqe->wq;
 
-	spin_lock(&wq->hash->wait.lock);
+	spin_lock_irq(&wq->hash->wait.lock);
 	if (list_empty(&wqe->wait.entry)) {
 		__add_wait_queue(&wq->hash->wait, &wqe->wait);
 		if (!test_bit(hash, &wq->hash->map)) {
@@ -392,7 +392,7 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
 			list_del_init(&wqe->wait.entry);
 		}
 	}
-	spin_unlock(&wq->hash->wait.lock);
+	spin_unlock_irq(&wq->hash->wait.lock);
 }
 
 static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
@@ -430,9 +430,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 	}
 
 	if (stall_hash != -1U) {
-		raw_spin_unlock(&wqe->lock);
+		raw_spin_unlock_irq(&wqe->lock);
 		io_wait_on_hash(wqe, stall_hash);
-		raw_spin_lock(&wqe->lock);
+		raw_spin_lock_irq(&wqe->lock);
 	}
 
 	return NULL;

(this is on-top of the patch you sent earlier and Daniel Cc: me on after
I checked that the problem/warning still exists).

Sebastian

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 15:47                   ` Sebastian Andrzej Siewior
@ 2021-08-04 15:49                     ` Jens Axboe
  2021-08-04 15:57                       ` Sebastian Andrzej Siewior
  2021-08-04 16:17                       ` [ANNOUNCE] v5.14-rc4-rt4 Steven Rostedt
  0 siblings, 2 replies; 30+ messages in thread
From: Jens Axboe @ 2021-08-04 15:49 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Peter Zijlstra, Daniel Wagner, Thomas Gleixner, LKML,
	linux-rt-users, Steven Rostedt

On 8/4/21 9:47 AM, Sebastian Andrzej Siewior wrote:
> On 2021-08-04 09:39:30 [-0600], Jens Axboe wrote:
>> I'm confused, the waitqueue locks are always IRQ disabling.
> 
> spin_lock_irq() does not disable interrupts on -RT. The patch above
> produces:
> 
> | BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
> | in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 2020, name: iou-wrk-2018
> | 1 lock held by iou-wrk-2018/2020:
> |  #0: ffff888111a47de8 (&hash->wait){+.+.}-{0:0}, at: io_worker_handle_work+0x443/0x630
> | irq event stamp: 10
> | hardirqs last  enabled at (9): [<ffffffff81c47818>] _raw_spin_unlock_irqrestore+0x28/0x70
> | hardirqs last disabled at (10): [<ffffffff81c4769e>] _raw_spin_lock_irq+0x3e/0x40
> | softirqs last  enabled at (0): [<ffffffff81077238>] copy_process+0x8f8/0x2020
> | softirqs last disabled at (0): [<0000000000000000>] 0x0
> | CPU: 5 PID: 2020 Comm: iou-wrk-2018 Tainted: G        W         5.14.0-rc4-rt4+ #97
> | Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
> | Call Trace:
> |  dump_stack_lvl+0x45/0x59
> |  ___might_sleep.cold+0xa6/0xb6
> |  rt_spin_lock+0x35/0xc0
> |  ? io_worker_handle_work+0x443/0x630
> |  io_worker_handle_work+0x443/0x630
> |  io_wqe_worker+0xb4/0x340
> |  ? lockdep_hardirqs_on_prepare+0xd4/0x170
> |  ? _raw_spin_unlock_irqrestore+0x28/0x70
> |  ? _raw_spin_unlock_irqrestore+0x28/0x70
> |  ? io_worker_handle_work+0x630/0x630
> |  ? rt_mutex_slowunlock+0x2ba/0x310
> |  ? io_worker_handle_work+0x630/0x630
> |  ret_from_fork+0x22/0x30
> 
> 
> But indeed, you are right, my snippet breaks non-RT. So this then maybe:
> 
> diff --git a/fs/io-wq.c b/fs/io-wq.c
> index 57d3cdddcdb3e..0b931ac3c83e6 100644
> --- a/fs/io-wq.c
> +++ b/fs/io-wq.c
> @@ -384,7 +384,7 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
>  {
>  	struct io_wq *wq = wqe->wq;
>  
> -	spin_lock(&wq->hash->wait.lock);
> +	spin_lock_irq(&wq->hash->wait.lock);
>  	if (list_empty(&wqe->wait.entry)) {
>  		__add_wait_queue(&wq->hash->wait, &wqe->wait);
>  		if (!test_bit(hash, &wq->hash->map)) {
> @@ -392,7 +392,7 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
>  			list_del_init(&wqe->wait.entry);
>  		}
>  	}
> -	spin_unlock(&wq->hash->wait.lock);
> +	spin_unlock_irq(&wq->hash->wait.lock);
>  }
>  
>  static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
> @@ -430,9 +430,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
>  	}
>  
>  	if (stall_hash != -1U) {
> -		raw_spin_unlock(&wqe->lock);
> +		raw_spin_unlock_irq(&wqe->lock);
>  		io_wait_on_hash(wqe, stall_hash);
> -		raw_spin_lock(&wqe->lock);
> +		raw_spin_lock_irq(&wqe->lock);
>  	}
>  
>  	return NULL;
> 
> (this is on-top of the patch you sent earlier and Daniel Cc: me on after
> I checked that the problem/warning still exists).

That'd work on non-RT as well, but it makes it worse on non-RT as well with
the irq enable/disable dance. While that's not the end of the world, would
be nice to have a solution that doesn't sacrifice anything, yet doesn't
make RT unhappy.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 15:49                     ` Jens Axboe
@ 2021-08-04 15:57                       ` Sebastian Andrzej Siewior
  2021-08-04 16:05                         ` Jens Axboe
  2021-08-04 16:17                       ` [ANNOUNCE] v5.14-rc4-rt4 Steven Rostedt
  1 sibling, 1 reply; 30+ messages in thread
From: Sebastian Andrzej Siewior @ 2021-08-04 15:57 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Peter Zijlstra, Daniel Wagner, Thomas Gleixner, LKML,
	linux-rt-users, Steven Rostedt

On 2021-08-04 09:49:48 [-0600], Jens Axboe wrote:
> 
> That'd work on non-RT as well, but it makes it worse on non-RT as well with
> the irq enable/disable dance. While that's not the end of the world, would
> be nice to have a solution that doesn't sacrifice anything, yet doesn't
> make RT unhappy.

There were plans to make local_irq_disable() mostly a nop (similar to
what ppc64 does). But I have no idea what happened. I hope that work
gets picked up.

Sebastian

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 15:57                       ` Sebastian Andrzej Siewior
@ 2021-08-04 16:05                         ` Jens Axboe
  2021-08-04 16:20                           ` Sebastian Andrzej Siewior
  2021-08-04 16:20                           ` Steven Rostedt
  0 siblings, 2 replies; 30+ messages in thread
From: Jens Axboe @ 2021-08-04 16:05 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Peter Zijlstra, Daniel Wagner, Thomas Gleixner, LKML,
	linux-rt-users, Steven Rostedt

On 8/4/21 9:57 AM, Sebastian Andrzej Siewior wrote:
> On 2021-08-04 09:49:48 [-0600], Jens Axboe wrote:
>>
>> That'd work on non-RT as well, but it makes it worse on non-RT as well with
>> the irq enable/disable dance. While that's not the end of the world, would
>> be nice to have a solution that doesn't sacrifice anything, yet doesn't
>> make RT unhappy.
> 
> There were plans to make local_irq_disable() mostly a nop (similar to
> what ppc64 does). But I have no idea what happened. I hope that work
> gets picked up.

So what do you propose in the interim? As far as io_uring is concerned,
it's not a _huge_ deal to do the IRQ dance, but it does bother me that
we're making things slightly worse for the mainline kernel just to make
the out-of-tree patches happy.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 15:49                     ` Jens Axboe
  2021-08-04 15:57                       ` Sebastian Andrzej Siewior
@ 2021-08-04 16:17                       ` Steven Rostedt
  2021-08-04 16:22                         ` Sebastian Andrzej Siewior
  1 sibling, 1 reply; 30+ messages in thread
From: Steven Rostedt @ 2021-08-04 16:17 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Sebastian Andrzej Siewior, Peter Zijlstra, Daniel Wagner,
	Thomas Gleixner, LKML, linux-rt-users

On Wed, 4 Aug 2021 09:49:48 -0600
Jens Axboe <axboe@kernel.dk> wrote:

> > @@ -430,9 +430,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
> >  	}
> >  
> >  	if (stall_hash != -1U) {
> > -		raw_spin_unlock(&wqe->lock);
> > +		raw_spin_unlock_irq(&wqe->lock);
> >  		io_wait_on_hash(wqe, stall_hash);
> > -		raw_spin_lock(&wqe->lock);
> > +		raw_spin_lock_irq(&wqe->lock);
> >  	}
> >  
> >  	return NULL;
> > 
> > (this is on-top of the patch you sent earlier and Daniel Cc: me on after
> > I checked that the problem/warning still exists).  
> 
> That'd work on non-RT as well, but it makes it worse on non-RT as well with
> the irq enable/disable dance. While that's not the end of the world, would
> be nice to have a solution that doesn't sacrifice anything, yet doesn't
> make RT unhappy.

We use to have something like:

	local_irq_disable_rt()

that would only disable irqs when PREEMPT_RT was configured, but this
was considered "ugly" and removed to try to only use spin_lock_irq()
and raw_spin_lock_irq(). But for this situation, it looks like it would
do exactly what you wanted. Not sacrifice anything yet keep RT happy.

Not sure that's a solution still. :-/

Perhaps in this situation, we could open code it to:

  	if (stall_hash != -1U) {
		raw_spin_unlock(&wqe->lock);

		/* On RT the spin_lock_irq() does not disable interrupts */
		if (IS_ENABLED(CONFIG_PREEMPT_RT))
			local_irq_enable();

 		io_wait_on_hash(wqe, stall_hash);

		if (IS_ENABLED(CONFIG_PREEMPT_RT))
			local_irq_disable();
		raw_spin_lock(&wqe->lock);
 	}

Note, I haven't looked at the rest of the code to know the ripple
effect of this, but I'm just suggesting the idea.

-- Steve

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 16:05                         ` Jens Axboe
@ 2021-08-04 16:20                           ` Sebastian Andrzej Siewior
  2021-08-04 16:20                             ` Jens Axboe
  2021-08-04 16:20                           ` Steven Rostedt
  1 sibling, 1 reply; 30+ messages in thread
From: Sebastian Andrzej Siewior @ 2021-08-04 16:20 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Peter Zijlstra, Daniel Wagner, Thomas Gleixner, LKML,
	linux-rt-users, Steven Rostedt

On 2021-08-04 10:05:39 [-0600], Jens Axboe wrote:
> So what do you propose in the interim? As far as io_uring is concerned,
> it's not a _huge_ deal to do the IRQ dance, but it does bother me that
> we're making things slightly worse for the mainline kernel just to make
> the out-of-tree patches happy.

I'm sorry but I propose nothing other than the dance. I don't think
PeterZ/ tglx will appreciate the ifdefery to avoid the IRQ-on/off here.
I have no other ideas here.

Sebastian

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 16:05                         ` Jens Axboe
  2021-08-04 16:20                           ` Sebastian Andrzej Siewior
@ 2021-08-04 16:20                           ` Steven Rostedt
  2021-08-04 16:22                             ` Jens Axboe
  1 sibling, 1 reply; 30+ messages in thread
From: Steven Rostedt @ 2021-08-04 16:20 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Sebastian Andrzej Siewior, Peter Zijlstra, Daniel Wagner,
	Thomas Gleixner, LKML, linux-rt-users

On Wed, 4 Aug 2021 10:05:39 -0600
Jens Axboe <axboe@kernel.dk> wrote:

> So what do you propose in the interim? As far as io_uring is concerned,
> it's not a _huge_ deal to do the IRQ dance, but it does bother me that
> we're making things slightly worse for the mainline kernel just to make
> the out-of-tree patches happy.

Note that the purpose of these patches is to be able to bring those
out-of-tree patches into the kernel such that they are no longer
out-of-tree.

-- Steve

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 16:20                           ` Sebastian Andrzej Siewior
@ 2021-08-04 16:20                             ` Jens Axboe
  0 siblings, 0 replies; 30+ messages in thread
From: Jens Axboe @ 2021-08-04 16:20 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Peter Zijlstra, Daniel Wagner, Thomas Gleixner, LKML,
	linux-rt-users, Steven Rostedt

On 8/4/21 10:20 AM, Sebastian Andrzej Siewior wrote:
> On 2021-08-04 10:05:39 [-0600], Jens Axboe wrote:
>> So what do you propose in the interim? As far as io_uring is concerned,
>> it's not a _huge_ deal to do the IRQ dance, but it does bother me that
>> we're making things slightly worse for the mainline kernel just to make
>> the out-of-tree patches happy.
> 
> I'm sorry but I propose nothing other than the dance. I don't think
> PeterZ/ tglx will appreciate the ifdefery to avoid the IRQ-on/off here.
> I have no other ideas here.

Alright, I'll see if I can think of something. Don't like doing ifdefs
either, or enable checks - but at least it'd only hurt readability
slightly at that point.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 16:17                       ` [ANNOUNCE] v5.14-rc4-rt4 Steven Rostedt
@ 2021-08-04 16:22                         ` Sebastian Andrzej Siewior
  2021-08-04 16:25                           ` Steven Rostedt
  0 siblings, 1 reply; 30+ messages in thread
From: Sebastian Andrzej Siewior @ 2021-08-04 16:22 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Jens Axboe, Peter Zijlstra, Daniel Wagner, Thomas Gleixner, LKML,
	linux-rt-users

On 2021-08-04 12:17:04 [-0400], Steven Rostedt wrote:
> Perhaps in this situation, we could open code it to:
> 
>   	if (stall_hash != -1U) {
> 		raw_spin_unlock(&wqe->lock);
> 
> 		/* On RT the spin_lock_irq() does not disable interrupts */
> 		if (IS_ENABLED(CONFIG_PREEMPT_RT))
> 			local_irq_enable();

no preemption happens here with NEED_RESCHED set.

>  		io_wait_on_hash(wqe, stall_hash);
> 
> 		if (IS_ENABLED(CONFIG_PREEMPT_RT))
> 			local_irq_disable();
> 		raw_spin_lock(&wqe->lock);
>  	}
> 
> Note, I haven't looked at the rest of the code to know the ripple
> effect of this, but I'm just suggesting the idea.
> 

Sebastian

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 16:20                           ` Steven Rostedt
@ 2021-08-04 16:22                             ` Jens Axboe
  2021-08-04 16:47                               ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 30+ messages in thread
From: Jens Axboe @ 2021-08-04 16:22 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Sebastian Andrzej Siewior, Peter Zijlstra, Daniel Wagner,
	Thomas Gleixner, LKML, linux-rt-users

On 8/4/21 10:20 AM, Steven Rostedt wrote:
> On Wed, 4 Aug 2021 10:05:39 -0600
> Jens Axboe <axboe@kernel.dk> wrote:
> 
>> So what do you propose in the interim? As far as io_uring is concerned,
>> it's not a _huge_ deal to do the IRQ dance, but it does bother me that
>> we're making things slightly worse for the mainline kernel just to make
>> the out-of-tree patches happy.
> 
> Note that the purpose of these patches are to be able to bring those
> out-of-tree patches into the kernel such that they are no longer
> out-of-tree.

Sure, I realize that. And I've always been accommodating to making
pieces of code more RT friendly, I just don't like doing it for cases
where we are making mainline worse.

In that regard, I do still consider those patches out-of-tree, which
they are. And while I'm more sympathetic to them compared to other
out-of-tree code as there's a long term plan to get it all in, it's
still out-of-tree. Best solution here is probably to just carry that
particular change in the RT patchset for now.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 16:22                         ` Sebastian Andrzej Siewior
@ 2021-08-04 16:25                           ` Steven Rostedt
  2021-08-04 16:31                             ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 30+ messages in thread
From: Steven Rostedt @ 2021-08-04 16:25 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Jens Axboe, Peter Zijlstra, Daniel Wagner, Thomas Gleixner, LKML,
	linux-rt-users

On Wed, 4 Aug 2021 18:22:31 +0200
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> no preemption happens here with NEED_RESCHED set.

But if interrupts were disabled, how would NEED_RESCHED be set? As soon
as you enable interrupts, the interrupt that sets NEED_RESCHED would
trigger the preemption.

-- Steve

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 16:25                           ` Steven Rostedt
@ 2021-08-04 16:31                             ` Sebastian Andrzej Siewior
  2021-08-04 16:47                               ` Steven Rostedt
  0 siblings, 1 reply; 30+ messages in thread
From: Sebastian Andrzej Siewior @ 2021-08-04 16:31 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Jens Axboe, Peter Zijlstra, Daniel Wagner, Thomas Gleixner, LKML,
	linux-rt-users

On 2021-08-04 12:25:41 [-0400], Steven Rostedt wrote:
> On Wed, 4 Aug 2021 18:22:31 +0200
> Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> 
> > no preemption happens here with NEED_RESCHED set.
> 
> But if interrupts were disabled, how would NEED_RESCHED be set? As soon
> as you enable interrupts, the interrupt that sets NEED_RESCHED would
> trigger the preemption.

CPU-local wake-ups just set NEED_RESCHED and wait for preempt_enable()
to do the magic. Just because the code does not perform wake_up() now does
not mean it will not do so in the future. Also it is here as an example
which might be copied somewhere else.

> -- Steve

Sebastian

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 16:22                             ` Jens Axboe
@ 2021-08-04 16:47                               ` Sebastian Andrzej Siewior
  2021-08-04 16:57                                 ` Jens Axboe
  0 siblings, 1 reply; 30+ messages in thread
From: Sebastian Andrzej Siewior @ 2021-08-04 16:47 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Steven Rostedt, Peter Zijlstra, Daniel Wagner, Thomas Gleixner,
	LKML, linux-rt-users

On 2021-08-04 10:22:59 [-0600], Jens Axboe wrote:
> 
> In that regard, I do still consider those patches out-of-tree, which
> they are. And while I'm more sympathetic to them compared to other
> out-of-tree code as there's a long term plan to get it all in, it's
> still out-of-tree. Best solution here is probably to just carry that
> particular change in the RT patchset for now.

So today in the morning I learned that there is a memory allocation in
an IRQ-off section and now, a patch later, it is almost gone. So that
makes me actually happy :)

The spin_lock_irq() vs local_irq_disable() + spin_lock() is documented
in Documentation/locking/locktypes.rst.
That said I have no problem by carrying that patch in the RT-patchset
and revisit it later.

Sebastian

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 16:31                             ` Sebastian Andrzej Siewior
@ 2021-08-04 16:47                               ` Steven Rostedt
  2021-08-04 16:57                                 ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 30+ messages in thread
From: Steven Rostedt @ 2021-08-04 16:47 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Jens Axboe, Peter Zijlstra, Daniel Wagner, Thomas Gleixner, LKML,
	linux-rt-users

On Wed, 4 Aug 2021 18:31:19 +0200
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> CPU-local wake-ups just set NEED_RESCHED and wait for preempt_enable()
> to do the magic. Just because the code does not perform wake_up() now does
> not mean it will not do so in the future. Also it is here as an example
> which might be copied somewhere else.

Does this mean all local_irq_disable/enable() is audited? What do you do for;

	local_irq_disable();
	[..]
	wakeup_process(x); /* on local CPU */
	[..]
	local_irq_enable();

And if local_irq_disable() is not used anymore, or seldom, what harm
would it be to add a preemption check to that caller? And change
local_irq_enable() that is used internally by other atom functions be
called __local_irq_enable()?

Not to mention that we could just open code that too:

	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
		local_irq_enable();
		preempt_check_resched();
	}

And make it ugly enough that nobody will want to copy it :-)

-- Steve

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 16:47                               ` Sebastian Andrzej Siewior
@ 2021-08-04 16:57                                 ` Jens Axboe
  2021-08-04 17:02                                   ` Sebastian Andrzej Siewior
  2021-08-10  7:40                                   ` Sebastian Andrzej Siewior
  0 siblings, 2 replies; 30+ messages in thread
From: Jens Axboe @ 2021-08-04 16:57 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Steven Rostedt, Peter Zijlstra, Daniel Wagner, Thomas Gleixner,
	LKML, linux-rt-users

On 8/4/21 10:47 AM, Sebastian Andrzej Siewior wrote:
> On 2021-08-04 10:22:59 [-0600], Jens Axboe wrote:
>>
>> In that regard, I do still consider those patches out-of-tree, which
>> they are. And while I'm more sympathetic to them compared to other
>> out-of-tree code as there's a long term plan to get it all in, it's
>> still out-of-tree. Best solution here is probably to just carry that
>> particular change in the RT patchset for now.
> 
> So today in the morning I learned that there is a memory allocation in
> an IRQ-off section and now, a patch later, it is almost gone. So that
> makes me actually happy :)

1 out of 2 is better than 0 ;-)

> The spin_lock_irq() vs local_irq_disable() + spin_lock() is documented
> in Documentation/locking/locktypes.rst.
> That said I have no problem by carrying that patch in the RT-patchset
> and revisit it later.

Right, I suspect that was added as a pre RT patch dump at some point.
It's a newer thing. Is it actually possible to set PREEMPT_RT in the
mainline kernel? Looks like it depends on ARCH_SUPPORTS_RT and nobody
sets that.

So I agree that just carrying your solution in the RT patchset is fine
for now, we can revisit later.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 16:47                               ` Steven Rostedt
@ 2021-08-04 16:57                                 ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 30+ messages in thread
From: Sebastian Andrzej Siewior @ 2021-08-04 16:57 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Jens Axboe, Peter Zijlstra, Daniel Wagner, Thomas Gleixner, LKML,
	linux-rt-users

On 2021-08-04 12:47:55 [-0400], Steven Rostedt wrote:
> On Wed, 4 Aug 2021 18:31:19 +0200
> Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> 
> > CPU-local wake-ups just set NEED_RESCHED and wait for preempt_enable()
> > to do the magic. Just because the code not perform wake_up() now does
> > not mean it will not do so in the future. Also it is here as an example
> > which might be copied somewhere else.
> 
> Does this mean all local_irq_disable/enable() is audited? What do you do for;
> 
> 	local_irq_disable();
> 	[..]
> 	wakeup_process(x); /* on local CPU */
> 	[..]
> 	local_irq_enable();

I hunted and fixed a few of those. I still have few
preempt_check_resched_rt() which I want fix other than what is in RT.

> And if local_irq_disable() is not used anymore, or seldom, what harm
> would it be to add a preemption check to that caller? And change
> local_irq_enable() that is used internally by other atom functions be
> called __local_irq_enable()?
> 
> Not to mention that we could just open code that too:
> 
> 	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
> 		local_irq_enable();
> 		preempt_check_resched();
> 	}
> 
> And make it ugly enough that nobody will want to copy it :-)

I remember that the amount of enthusiasm was quite low when it was
suggested that local_irq_enable() gets additionally the preempt-check.
Maybe was due to the people involved :)
But we managed to work around it for most callers. Therefore I
wouldn't suggest local_irq_disable_rt(). We had it in -RT, we had a
bunch of users and all of them were fixed in a different way.

Same goes btw. for preempt_disable_rt() which has been reduced to vmstat
and had previously more users :)

> -- Steve

Sebastian

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 16:57                                 ` Jens Axboe
@ 2021-08-04 17:02                                   ` Sebastian Andrzej Siewior
  2021-08-10  7:40                                   ` Sebastian Andrzej Siewior
  1 sibling, 0 replies; 30+ messages in thread
From: Sebastian Andrzej Siewior @ 2021-08-04 17:02 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Steven Rostedt, Peter Zijlstra, Daniel Wagner, Thomas Gleixner,
	LKML, linux-rt-users

On 2021-08-04 10:57:11 [-0600], Jens Axboe wrote:
> > The spin_lock_irq() vs local_irq_disable() + spin_lock() is documented
> > in Documentation/locking/locktypes.rst.
> > That said I have no problem by carrying that patch in the RT-patchset
> > and revisit it later.
> 
> Right, I suspect that was added as a pre RT patch dump at some point.
> It's a newer thing. Is it actually possible to set PREEMPT_RT in the
> mainline kernel? Looks like it depends on ARCH_SUPPORTS_RT and nobody
> sets that.

Yes. It is upstream now. The documentation and CONFIG_PREEMPT_RT symbol.
The Kconfig symbol has already users outside of the -RT patch.

You can't enable it yet since we require a few things which are on their
way.

Having the symbol and the documentation helped to convince some people
:) 

> So I agree that just carrying your solution in the RT patchset is fine
> for now, we can revisit later.

Okay.

Sebastian

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [ANNOUNCE] v5.14-rc4-rt4
  2021-08-04 16:57                                 ` Jens Axboe
  2021-08-04 17:02                                   ` Sebastian Andrzej Siewior
@ 2021-08-10  7:40                                   ` Sebastian Andrzej Siewior
  2021-08-10 11:22                                     ` [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path kernel test robot
  2021-08-10 15:22                                     ` kernel test robot
  1 sibling, 2 replies; 30+ messages in thread
From: Sebastian Andrzej Siewior @ 2021-08-10  7:40 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Steven Rostedt, Peter Zijlstra, Daniel Wagner, Thomas Gleixner,
	LKML, linux-rt-users

On 2021-08-04 10:57:11 [-0600], Jens Axboe wrote:
> 
> 1 out of 2 is better than 0 ;-)

The patch at the end of the email is what I ended up after rebasing to
-rc5. Also after testing I figured out that tools/io_uring/io_uring-cp.c
is no longer working. The resulting file has a size of either 0 or 4KiB.
Then, tglx pointed out that the example in liburing shares the same file
name and is actually working.

From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 4 Aug 2021 08:43:43 -0600
Subject: [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path

Daniel reports that the v5.14-rc4-rt4 kernel throws a BUG when running
stress-ng:

| [   90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
| [   90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
| [   90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G        W         5.14.0-rc4-rt4+ #89
| [   90.202559] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
| [   90.202561] Call Trace:
| [   90.202577]  dump_stack_lvl+0x34/0x44
| [   90.202584]  ___might_sleep.cold+0x87/0x94
| [   90.202588]  rt_spin_lock+0x19/0x70
| [   90.202593]  ___slab_alloc+0xcb/0x7d0
| [   90.202598]  ? newidle_balance.constprop.0+0xf5/0x3b0
| [   90.202603]  ? dequeue_entity+0xc3/0x290
| [   90.202605]  ? io_wqe_dec_running.isra.0+0x98/0xe0
| [   90.202610]  ? pick_next_task_fair+0xb9/0x330
| [   90.202612]  ? __schedule+0x670/0x1410
| [   90.202615]  ? io_wqe_dec_running.isra.0+0x98/0xe0
| [   90.202618]  kmem_cache_alloc_trace+0x79/0x1f0
| [   90.202621]  io_wqe_dec_running.isra.0+0x98/0xe0
| [   90.202625]  io_wq_worker_sleeping+0x37/0x50
| [   90.202628]  schedule+0x30/0xd0
| [   90.202630]  schedule_timeout+0x8f/0x1a0
| [   90.202634]  ? __bpf_trace_tick_stop+0x10/0x10
| [   90.202637]  io_wqe_worker+0xfd/0x320
| [   90.202641]  ? finish_task_switch.isra.0+0xd3/0x290
| [   90.202644]  ? io_worker_handle_work+0x670/0x670
| [   90.202646]  ? io_worker_handle_work+0x670/0x670
| [   90.202649]  ret_from_fork+0x22/0x30

which is due to the RT kernel not liking a GFP_ATOMIC allocation inside
a raw spinlock. Besides that not working on RT, doing any kind of
allocation from inside schedule() is kind of nasty and should be avoided
if at all possible.

This particular path happens when an io-wq worker goes to sleep, and we
need a new worker to handle pending work. We currently allocate a small
data item to hold the information we need to create a new worker, but we
can instead include this data in the io_worker struct itself and just
protect it with a single bit lock. We only really need one per worker
anyway, as we will have run pending work between two sleep cycles.

https://lore.kernel.org/lkml/20210804082418.fbibprcwtzyt5qax@beryllium.lan/

Reported-by: Daniel Wagner <dwagner@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Tested-by: Daniel Wagner <dwagner@suse.de>
Link: https://lore.kernel.org/r/a673a130-e0e4-5aa8-4165-f35d1262fc6a@kernel.dk
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 fs/io-wq.c |   75 ++++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 42 insertions(+), 33 deletions(-)

--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -51,6 +51,10 @@ struct io_worker {
 
 	struct completion ref_done;
 
+	unsigned long create_state;
+	struct callback_head create_work;
+	int create_index;
+
 	struct rcu_head rcu;
 };
 
@@ -270,50 +274,54 @@ static void io_wqe_inc_running(struct io
 	atomic_inc(&acct->nr_running);
 }
 
-struct create_worker_data {
-	struct callback_head work;
-	struct io_wqe *wqe;
-	int index;
-};
-
 static void create_worker_cb(struct callback_head *cb)
 {
-	struct create_worker_data *cwd;
+	struct io_worker *worker;
 	struct io_wq *wq;
 	struct io_wqe *wqe;
 	struct io_wqe_acct *acct;
 
-	cwd = container_of(cb, struct create_worker_data, work);
-	wqe = cwd->wqe;
-	wq = wqe->wq;
-	acct = &wqe->acct[cwd->index];
+	worker = container_of(cb, struct io_worker, create_work);
+	wqe = worker->wqe;
+	wq = worker->wq;
+	acct = &wqe->acct[worker->create_index];
 	raw_spin_lock_irq(&wqe->lock);
 	if (acct->nr_workers < acct->max_workers)
 		acct->nr_workers++;
 	raw_spin_unlock_irq(&wqe->lock);
-	create_io_worker(wq, cwd->wqe, cwd->index);
-	kfree(cwd);
+	create_io_worker(wq, wqe, worker->create_index);
+	clear_bit_unlock(0, &worker->create_state);
+	io_worker_release(worker);
 }
 
-static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct)
+static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker
+				   *worker, struct io_wqe_acct *acct)
 {
-	struct create_worker_data *cwd;
 	struct io_wq *wq = wqe->wq;
 
 	/* raced with exit, just ignore create call */
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
 		goto fail;
+	if (!io_worker_get(worker))
+		goto fail;
+	/*
+	 * create_state manages ownership of create_work/index. We should
+	 * only need one entry per worker, as the worker going to sleep
+	 * will trigger the condition, and waking will clear it once it
+	 * runs the task_work.
+	 */
+	if (test_bit(0, &worker->create_state) ||
+	    test_and_set_bit_lock(0, &worker->create_state))
+		goto fail_release;
+
+	init_task_work(&worker->create_work, create_worker_cb);
+	worker->create_index = acct->index;
+	if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+		return;
 
-	cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC);
-	if (cwd) {
-		init_task_work(&cwd->work, create_worker_cb);
-		cwd->wqe = wqe;
-		cwd->index = acct->index;
-		if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL))
-			return;
-
-		kfree(cwd);
-	}
+	clear_bit_unlock(0, &worker->create_state);
+fail_release:
+	io_worker_release(worker);
 fail:
 	atomic_dec(&acct->nr_running);
 	io_worker_ref_put(wq);
@@ -331,7 +339,7 @@ static void io_wqe_dec_running(struct io
 	if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
 		atomic_inc(&acct->nr_running);
 		atomic_inc(&wqe->wq->worker_refs);
-		io_queue_worker_create(wqe, acct);
+		io_queue_worker_create(wqe, worker, acct);
 	}
 }
 
@@ -992,12 +1000,12 @@ struct io_wq *io_wq_create(unsigned boun
 
 static bool io_task_work_match(struct callback_head *cb, void *data)
 {
-	struct create_worker_data *cwd;
+	struct io_worker *worker;
 
 	if (cb->func != create_worker_cb)
 		return false;
-	cwd = container_of(cb, struct create_worker_data, work);
-	return cwd->wqe->wq == data;
+	worker = container_of(cb, struct io_worker, create_work);
+	return worker->wqe->wq == data;
 }
 
 void io_wq_exit_start(struct io_wq *wq)
@@ -1014,12 +1022,13 @@ static void io_wq_exit_workers(struct io
 		return;
 
 	while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
-		struct create_worker_data *cwd;
+		struct io_worker *worker;
 
-		cwd = container_of(cb, struct create_worker_data, work);
-		atomic_dec(&cwd->wqe->acct[cwd->index].nr_running);
+		worker = container_of(cb, struct io_worker, create_work);
+		atomic_dec(&worker->wqe->acct[worker->create_index].nr_running);
 		io_worker_ref_put(wq);
-		kfree(cwd);
+		clear_bit_unlock(0, &worker->create_state);
+		io_worker_release(worker);
 	}
 
 	rcu_read_lock();

Sebastian

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path
  2021-08-10  7:40                                   ` Sebastian Andrzej Siewior
@ 2021-08-10 11:22                                     ` kernel test robot
  2021-08-10 15:22                                     ` kernel test robot
  1 sibling, 0 replies; 30+ messages in thread
From: kernel test robot @ 2021-08-10 11:22 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Jens Axboe
  Cc: kbuild-all, Steven Rostedt, Peter Zijlstra, Daniel Wagner,
	Thomas Gleixner, LKML, linux-rt-users

[-- Attachment #1: Type: text/plain, Size: 2470 bytes --]

Hi Sebastian,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.14-rc5]
[cannot apply to next-20210810]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Sebastian-Andrzej-Siewior/io-wq-remove-GFP_ATOMIC-allocation-off-schedule-out-path/20210810-154135
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 9a73fa375d58fee5262dd16473c8e7522bdf44de
config: um-x86_64_defconfig (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/7fd11421b3672d0230a9b529445014d99185b387
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Sebastian-Andrzej-Siewior/io-wq-remove-GFP_ATOMIC-allocation-off-schedule-out-path/20210810-154135
        git checkout 7fd11421b3672d0230a9b529445014d99185b387
        # save the attached .config to linux build tree
        mkdir build_dir
        make W=1 O=build_dir ARCH=um SUBARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   fs/io-wq.c: In function 'create_worker_cb':
>> fs/io-wq.c:286:15: error: 'struct io_worker' has no member named 'wq'; did you mean 'wqe'?
     286 |  wq = worker->wq;
         |               ^~
         |               wqe


vim +286 fs/io-wq.c

   276	
   277	static void create_worker_cb(struct callback_head *cb)
   278	{
   279		struct io_worker *worker;
   280		struct io_wq *wq;
   281		struct io_wqe *wqe;
   282		struct io_wqe_acct *acct;
   283	
   284		worker = container_of(cb, struct io_worker, create_work);
   285		wqe = worker->wqe;
 > 286		wq = worker->wq;
   287		acct = &wqe->acct[worker->create_index];
   288		raw_spin_lock_irq(&wqe->lock);
   289		if (acct->nr_workers < acct->max_workers)
   290			acct->nr_workers++;
   291		raw_spin_unlock_irq(&wqe->lock);
   292		create_io_worker(wq, wqe, worker->create_index);
   293		clear_bit_unlock(0, &worker->create_state);
   294		io_worker_release(worker);
   295	}
   296	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 9537 bytes --]

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path
  2021-08-10  7:40                                   ` Sebastian Andrzej Siewior
  2021-08-10 11:22                                     ` [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path kernel test robot
@ 2021-08-10 15:22                                     ` kernel test robot
  1 sibling, 0 replies; 30+ messages in thread
From: kernel test robot @ 2021-08-10 15:22 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Jens Axboe
  Cc: clang-built-linux, kbuild-all, Steven Rostedt, Peter Zijlstra,
	Daniel Wagner, Thomas Gleixner, LKML, linux-rt-users

[-- Attachment #1: Type: text/plain, Size: 2653 bytes --]

Hi Sebastian,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.14-rc5]
[cannot apply to next-20210810]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Sebastian-Andrzej-Siewior/io-wq-remove-GFP_ATOMIC-allocation-off-schedule-out-path/20210810-154135
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 9a73fa375d58fee5262dd16473c8e7522bdf44de
config: x86_64-randconfig-a012-20210809 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project 614c7d03877fd99c2de47429b15be3f00306a3bd)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/7fd11421b3672d0230a9b529445014d99185b387
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Sebastian-Andrzej-Siewior/io-wq-remove-GFP_ATOMIC-allocation-off-schedule-out-path/20210810-154135
        git checkout 7fd11421b3672d0230a9b529445014d99185b387
        # save the attached .config to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> fs/io-wq.c:286:15: error: no member named 'wq' in 'struct io_worker'
           wq = worker->wq;
                ~~~~~~  ^
   1 error generated.


vim +286 fs/io-wq.c

   276	
   277	static void create_worker_cb(struct callback_head *cb)
   278	{
   279		struct io_worker *worker;
   280		struct io_wq *wq;
   281		struct io_wqe *wqe;
   282		struct io_wqe_acct *acct;
   283	
   284		worker = container_of(cb, struct io_worker, create_work);
   285		wqe = worker->wqe;
 > 286		wq = worker->wq;
   287		acct = &wqe->acct[worker->create_index];
   288		raw_spin_lock_irq(&wqe->lock);
   289		if (acct->nr_workers < acct->max_workers)
   290			acct->nr_workers++;
   291		raw_spin_unlock_irq(&wqe->lock);
   292		create_io_worker(wq, wqe, worker->create_index);
   293		clear_bit_unlock(0, &worker->create_state);
   294		io_worker_release(worker);
   295	}
   296	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 32132 bytes --]

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2021-08-10 15:23 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-02 16:27 [ANNOUNCE] v5.14-rc4-rt4 Sebastian Andrzej Siewior
2021-08-04  8:24 ` Daniel Wagner
2021-08-04 10:43   ` Sebastian Andrzej Siewior
2021-08-04 10:48     ` Sebastian Andrzej Siewior
2021-08-04 11:00       ` Sebastian Andrzej Siewior
2021-08-04 13:17         ` Peter Zijlstra
2021-08-04 13:32           ` Jens Axboe
2021-08-04 14:23             ` Jens Axboe
2021-08-04 15:33               ` Sebastian Andrzej Siewior
2021-08-04 15:39                 ` Jens Axboe
2021-08-04 15:47                   ` Sebastian Andrzej Siewior
2021-08-04 15:49                     ` Jens Axboe
2021-08-04 15:57                       ` Sebastian Andrzej Siewior
2021-08-04 16:05                         ` Jens Axboe
2021-08-04 16:20                           ` Sebastian Andrzej Siewior
2021-08-04 16:20                             ` Jens Axboe
2021-08-04 16:20                           ` Steven Rostedt
2021-08-04 16:22                             ` Jens Axboe
2021-08-04 16:47                               ` Sebastian Andrzej Siewior
2021-08-04 16:57                                 ` Jens Axboe
2021-08-04 17:02                                   ` Sebastian Andrzej Siewior
2021-08-10  7:40                                   ` Sebastian Andrzej Siewior
2021-08-10 11:22                                     ` [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path kernel test robot
2021-08-10 15:22                                     ` kernel test robot
2021-08-04 16:17                       ` [ANNOUNCE] v5.14-rc4-rt4 Steven Rostedt
2021-08-04 16:22                         ` Sebastian Andrzej Siewior
2021-08-04 16:25                           ` Steven Rostedt
2021-08-04 16:31                             ` Sebastian Andrzej Siewior
2021-08-04 16:47                               ` Steven Rostedt
2021-08-04 16:57                                 ` Sebastian Andrzej Siewior

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).