linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Jens Axboe <axboe@kernel.dk>
To: Peter Zijlstra <peterz@infradead.org>,
	Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Daniel Wagner <wagi@monom.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	LKML <linux-kernel@vger.kernel.org>,
	linux-rt-users@vger.kernel.org,
	Steven Rostedt <rostedt@goodmis.org>
Subject: Re: [ANNOUNCE] v5.14-rc4-rt4
Date: Wed, 4 Aug 2021 08:23:55 -0600	[thread overview]
Message-ID: <feebf183-2e33-36b5-4538-62a40b2a58b6@kernel.dk> (raw)
In-Reply-To: <4f549344-1040-c677-6a6a-53e243c5f364@kernel.dk>

On 8/4/21 7:32 AM, Jens Axboe wrote:
> On 8/4/21 7:17 AM, Peter Zijlstra wrote:
>> On Wed, Aug 04, 2021 at 01:00:57PM +0200, Sebastian Andrzej Siewior wrote:
>>> On 2021-08-04 12:48:05 [+0200], To Daniel Wagner wrote:
>>>> On 2021-08-04 12:43:42 [+0200], To Daniel Wagner wrote:
>>>>> Odd. Do you have a config for that, please?
>>>>
>>>> No need.
>>>> | [   90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
>>>> | [   90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
>>>> | [   90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G        W         5.14.0-rc4-rt4+ #89
>>>> | [   90.202561] Call Trace:
>>> …
>>>> | [   90.202588]  rt_spin_lock+0x19/0x70
>>>> | [   90.202593]  ___slab_alloc+0xcb/0x7d0
>>> …
>>>> | [   90.202618]  kmem_cache_alloc_trace+0x79/0x1f0
>>>> | [   90.202621]  io_wqe_dec_running.isra.0+0x98/0xe0
>>>> | [   90.202625]  io_wq_worker_sleeping+0x37/0x50
>>>> | [   90.202628]  schedule+0x30/0xd0
>>>>
>>>> Let me look.
>>>
>>> So this is due to commit
>>>   685fe7feedb96 ("io-wq: eliminate the need for a manager thread")
>>>
>>> introduced in the v5.13-rc1 merge window. The call chain is
>>>   schedule()
>>>    sched_submit_work()
>>>     preempt_disable();
>>>     io_wq_worker_sleeping()
>>>       raw_spin_lock_irq(&worker->wqe->lock);
>>>       io_wqe_dec_running(worker);
>>>        io_queue_worker_create()
>>>         kmalloc(sizeof(*cwd), GFP_ATOMIC);
>>>
>>> The lock wqe::lock has been turned into a raw_spinlock_t in commit
>>>    95da84659226d ("io_wq: Make io_wqe::lock a raw_spinlock_t")
>>>
>>> after a careful analysis of the code at that time. This commit breaks
>>> things. Is this really needed?
>>
>> Urgh, doing allocs from schedule seems really yuck. Can we please not do
>> this?
> 
> Agree, I have an idea of how to get rid of it. Let me experiment a bit...

Something like this should do it - the only thing we need the allocation for
is short lived, queueing a task_work item to create a new worker. We can
manage this on a per-existing worker basis, and just have the tw/index
stored in the worker itself. That avoids doing an allocation in the
schedule() -> going-to-sleep path.

Totally untested, but I think the principle is sound. I'll run it through
some testing.


diff --git a/fs/io-wq.c b/fs/io-wq.c
index 50dc93ffc153..97eaaf25a429 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -51,6 +51,10 @@ struct io_worker {
 
 	struct completion ref_done;
 
+	unsigned long create_state;
+	struct callback_head create_work;
+	int create_index;
+
 	struct rcu_head rcu;
 };
 
@@ -261,42 +265,44 @@ static void io_wqe_inc_running(struct io_worker *worker)
 	atomic_inc(&acct->nr_running);
 }
 
-struct create_worker_data {
-	struct callback_head work;
-	struct io_wqe *wqe;
-	int index;
-};
-
 static void create_worker_cb(struct callback_head *cb)
 {
-	struct create_worker_data *cwd;
+	struct io_worker *worker;
 	struct io_wq *wq;
 
-	cwd = container_of(cb, struct create_worker_data, work);
-	wq = cwd->wqe->wq;
-	create_io_worker(wq, cwd->wqe, cwd->index);
-	kfree(cwd);
+	worker = container_of(cb, struct io_worker, create_work);
+	wq = worker->wqe->wq;
+	create_io_worker(wq, worker->wqe, worker->create_index);
+	clear_bit_unlock(0, &worker->create_state);
+	io_worker_release(worker);
 }
 
-static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct)
+static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker,
+				   struct io_wqe_acct *acct)
 {
-	struct create_worker_data *cwd;
 	struct io_wq *wq = wqe->wq;
 
 	/* raced with exit, just ignore create call */
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
 		goto fail;
+	/*
+	 * create_state manages ownership of create_work/index. We should
+	 * only need one entry per worker, as the worker going to sleep
+	 * will trigger the condition, and waking will clear it once it
+	 * runs the task_work.
+	 */
+	if (test_bit(0, &worker->create_state) ||
+	    test_and_set_bit_lock(0, &worker->create_state))
+		goto fail;
 
-	cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC);
-	if (cwd) {
-		init_task_work(&cwd->work, create_worker_cb);
-		cwd->wqe = wqe;
-		cwd->index = acct->index;
-		if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL))
-			return;
+	io_worker_get(worker);
+	init_task_work(&worker->create_work, create_worker_cb);
+	worker->create_index = acct->index;
+	if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+		return;
 
-		kfree(cwd);
-	}
+	clear_bit_unlock(0, &worker->create_state);
+	io_worker_release(worker);
 fail:
 	atomic_dec(&acct->nr_running);
 	io_worker_ref_put(wq);
@@ -314,7 +320,7 @@ static void io_wqe_dec_running(struct io_worker *worker)
 	if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
 		atomic_inc(&acct->nr_running);
 		atomic_inc(&wqe->wq->worker_refs);
-		io_queue_worker_create(wqe, acct);
+		io_queue_worker_create(wqe, worker, acct);
 	}
 }
 
@@ -973,12 +979,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 
 static bool io_task_work_match(struct callback_head *cb, void *data)
 {
-	struct create_worker_data *cwd;
+	struct io_worker *worker;
 
 	if (cb->func != create_worker_cb)
 		return false;
-	cwd = container_of(cb, struct create_worker_data, work);
-	return cwd->wqe->wq == data;
+	worker = container_of(cb, struct io_worker, create_work);
+	return worker->wqe->wq == data;
 }
 
 void io_wq_exit_start(struct io_wq *wq)
@@ -995,12 +1001,13 @@ static void io_wq_exit_workers(struct io_wq *wq)
 		return;
 
 	while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
-		struct create_worker_data *cwd;
+		struct io_worker *worker;
 
-		cwd = container_of(cb, struct create_worker_data, work);
-		atomic_dec(&cwd->wqe->acct[cwd->index].nr_running);
+		worker = container_of(cb, struct io_worker, create_work);
+		atomic_dec(&worker->wqe->acct[worker->create_index].nr_running);
 		io_worker_ref_put(wq);
-		kfree(cwd);
+		clear_bit_unlock(0, &worker->create_state);
+		io_worker_release(worker);
 	}
 
 	rcu_read_lock();

-- 
Jens Axboe


  reply	other threads:[~2021-08-04 14:24 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-08-02 16:27 [ANNOUNCE] v5.14-rc4-rt4 Sebastian Andrzej Siewior
2021-08-04  8:24 ` Daniel Wagner
2021-08-04 10:43   ` Sebastian Andrzej Siewior
2021-08-04 10:48     ` Sebastian Andrzej Siewior
2021-08-04 11:00       ` Sebastian Andrzej Siewior
2021-08-04 13:17         ` Peter Zijlstra
2021-08-04 13:32           ` Jens Axboe
2021-08-04 14:23             ` Jens Axboe [this message]
2021-08-04 15:33               ` Sebastian Andrzej Siewior
2021-08-04 15:39                 ` Jens Axboe
2021-08-04 15:47                   ` Sebastian Andrzej Siewior
2021-08-04 15:49                     ` Jens Axboe
2021-08-04 15:57                       ` Sebastian Andrzej Siewior
2021-08-04 16:05                         ` Jens Axboe
2021-08-04 16:20                           ` Sebastian Andrzej Siewior
2021-08-04 16:20                             ` Jens Axboe
2021-08-04 16:20                           ` Steven Rostedt
2021-08-04 16:22                             ` Jens Axboe
2021-08-04 16:47                               ` Sebastian Andrzej Siewior
2021-08-04 16:57                                 ` Jens Axboe
2021-08-04 17:02                                   ` Sebastian Andrzej Siewior
2021-08-10  7:40                                   ` Sebastian Andrzej Siewior
2021-08-10 11:22                                     ` [PATCH] io-wq: remove GFP_ATOMIC allocation off schedule out path kernel test robot
2021-08-10 15:22                                     ` kernel test robot
2021-08-04 16:17                       ` [ANNOUNCE] v5.14-rc4-rt4 Steven Rostedt
2021-08-04 16:22                         ` Sebastian Andrzej Siewior
2021-08-04 16:25                           ` Steven Rostedt
2021-08-04 16:31                             ` Sebastian Andrzej Siewior
2021-08-04 16:47                               ` Steven Rostedt
2021-08-04 16:57                                 ` Sebastian Andrzej Siewior

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=feebf183-2e33-36b5-4538-62a40b2a58b6@kernel.dk \
    --to=axboe@kernel.dk \
    --cc=bigeasy@linutronix.de \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rt-users@vger.kernel.org \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=tglx@linutronix.de \
    --cc=wagi@monom.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).