From: Jens Axboe <axboe@kernel.dk>
To: Jann Horn <jannh@google.com>
Cc: Glauber Costa <glauber@scylladb.com>,
	io-uring <io-uring@vger.kernel.org>,
	Avi Kivity <avi@scylladb.com>
Subject: Re: Kernel BUG when registering the ring
Date: Thu, 13 Feb 2020 08:20:43 -0700
Message-ID: <bb412ce9-7bb4-f279-e475-757e6829f47c@kernel.dk>
In-Reply-To: <CAG48ez0ffvnfnPEemkKn1ZkGE-uAsAyBcAeJWdJ8j-eVeyaxfQ@mail.gmail.com>

On 2/12/20 3:31 PM, Jann Horn wrote:
> On Tue, Feb 11, 2020 at 7:58 PM Jens Axboe <axboe@kernel.dk> wrote:
> [...]
>> @@ -849,6 +857,8 @@ void io_wq_cancel_all(struct io_wq *wq)
>>         for_each_node(node) {
>>                 struct io_wqe *wqe = wq->wqes[node];
>>
>> +               if (!node_online(node))
>> +                       continue;
>>                 io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL);
>>         }
>>         rcu_read_unlock();
> 
> What is this going to do if a NUMA node is marked as offline (through
> a call to node_set_offline() from try_offline_node()) while it has a
> worker running, and then afterwards, with the worker still running,
> io_wq_cancel_all() is executed? Is that going to potentially hang
> because some op is still executing on that node's worker? Or is there
> a reason why that can't happen?

I folded in the incremental below last night; it drops those
node_online() checks in the callers and instead has the worker iterator
skip workers that never got a task, so a worker whose node goes offline
while it's running still gets the callback. I think it's a better
approach as well.
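
Spelled out, with the surrounding context lines filled in from memory
(so take the non-hunk lines as approximate), the iterator ends up like
this; the skip decision is per-worker (does it have a task?) rather than
per-node (is the node online right now?):

static bool io_wq_for_each_worker(struct io_wqe *wqe,
				  bool (*func)(struct io_worker *, void *),
				  void *data)
{
	struct io_worker *worker;
	bool ret = false;

	list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
		if (io_worker_get(worker)) {
			/* no task if node is/was offline */
			if (worker->task)
				ret = func(worker, data);
			io_worker_release(worker);
			if (ret)
				break;
		}
	}

	return ret;
}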

> [...]
>> @@ -1084,6 +1100,8 @@ void io_wq_flush(struct io_wq *wq)
>>         for_each_node(node) {
>>                 struct io_wqe *wqe = wq->wqes[node];
>>
>> +               if (!node_online(node))
>> +                       continue;
>>                 init_completion(&data.done);
>>                 INIT_IO_WORK(&data.work, io_wq_flush_func);
>>                 data.work.flags |= IO_WQ_WORK_INTERNAL;
> 
> (io_wq_flush() is dead code since 05f3fb3c5397, right? Are there plans
> to use it again?)

It should probably just be removed for now; I generally don't like
carrying dead code. It's easy enough to bring back if we need it, though
I suspect that if we do, we'll make it work like workqueue flushing,
where we guarantee that all work queued before the flush has completed
by the time it returns.
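
For context, the flush in question is the "queue a sentinel work item
carrying a completion on each node, then wait for it" pattern that the
quoted hunk belongs to. Roughly (reconstructed for illustration, not
verbatim, and assuming io_wqe_enqueue() is the per-node queueing
helper):

struct io_wq_flush_data {
	struct io_wq_work work;
	struct completion done;
};

static void io_wq_flush_func(struct io_wq_work **workptr)
{
	struct io_wq_work *work = *workptr;
	struct io_wq_flush_data *data;

	data = container_of(work, struct io_wq_flush_data, work);
	complete(&data->done);
}

void io_wq_flush(struct io_wq *wq)
{
	struct io_wq_flush_data data;
	int node;

	for_each_node(node) {
		struct io_wqe *wqe = wq->wqes[node];

		init_completion(&data.done);
		INIT_IO_WORK(&data.work, io_wq_flush_func);
		data.work.flags |= IO_WQ_WORK_INTERNAL;
		io_wqe_enqueue(wqe, &data.work);
		wait_for_completion(&data.done);
	}
}

As written it essentially only waits for the sentinel itself to start
running on each node, which is weaker than the flush_workqueue() style
guarantee that all previously queued work has completed; that stronger
semantic is what we'd want if this ever comes back.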


diff --git a/fs/io-wq.c b/fs/io-wq.c
index 2d741fb76098..0a5ab1a8f69a 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -837,7 +837,9 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,
 
 	list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
 		if (io_worker_get(worker)) {
-			ret = func(worker, data);
+			/* no task if node is/was offline */
+			if (worker->task)
+				ret = func(worker, data);
 			io_worker_release(worker);
 			if (ret)
 				break;
@@ -857,8 +859,6 @@ void io_wq_cancel_all(struct io_wq *wq)
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
-		if (!node_online(node))
-			continue;
 		io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL);
 	}
 	rcu_read_unlock();
@@ -939,8 +939,6 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
-		if (!node_online(node))
-			continue;
 		ret = io_wqe_cancel_cb_work(wqe, cancel, data);
 		if (ret != IO_WQ_CANCEL_NOTFOUND)
 			break;
@@ -1033,8 +1031,6 @@ enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
-		if (!node_online(node))
-			continue;
 		ret = io_wqe_cancel_work(wqe, &match);
 		if (ret != IO_WQ_CANCEL_NOTFOUND)
 			break;
@@ -1064,8 +1060,6 @@ enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid)
 	for_each_node(node) {
 		struct io_wqe *wqe = wq->wqes[node];
 
-		if (!node_online(node))
-			continue;
 		ret = io_wqe_cancel_work(wqe, &match);
 		if (ret != IO_WQ_CANCEL_NOTFOUND)
 			break;
@@ -1204,11 +1198,8 @@ static void __io_wq_destroy(struct io_wq *wq)
 		kthread_stop(wq->manager);
 
 	rcu_read_lock();
-	for_each_node(node) {
-		if (!node_online(node))
-			continue;
+	for_each_node(node)
 		io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
-	}
 	rcu_read_unlock();
 
 	wait_for_completion(&wq->done);

-- 
Jens Axboe

