Re: [PATCH] io_uring: fix race with shadow drain deferrals

From: Jens Axboe <axboe@kernel.dk>
To: Jackie Liu <liuyun01@kylinos.cn>
Cc: io-uring@vger.kernel.org
Subject: Re: [PATCH] io_uring: fix race with shadow drain deferrals
Date: Wed, 20 Nov 2019 18:49:29 -0700	[thread overview]
Message-ID: <2005c339-5ed3-6c2e-f011-5bc89dac3f5c@kernel.dk> (raw)
In-Reply-To: <b70c7e29-408c-af72-5dc1-85456c904c7a@kernel.dk>

On 11/20/19 6:40 PM, Jens Axboe wrote:
> On 11/20/19 6:35 PM, Jackie Liu wrote:
>>
>>
>>> 2019年11月21日 09:32，Jackie Liu <liuyun01@kylinos.cn> 写道：
>>>
>>> 2019年11月21日 07:58，Jens Axboe <axboe@kernel.dk> 写道：
>>>
>>>>
>>>> On 11/20/19 4:07 PM, Jens Axboe wrote:
>>>>> When we go and queue requests with drain, we check if we need to defer
>>>>> based on sequence. This is done safely under the lock, but then we drop
>>>>> the lock before actually inserting the shadow. If the original request
>>>>> is found on the deferred list by another completion in the mean time,
>>>>> it could have been started AND completed by the time we insert the
>>>>> shadow, which will stall the queue.
>>>>>
>>>>> After re-grabbing the completion lock, check if the original request is
>>>>> still in the deferred list. If it isn't, then we know that someone else
>>>>> already found and issued it. If that happened, then our job is done, we
>>>>> can simply free the shadow.
>>>>>
>>>>> Cc: Jackie Liu <liuyun01@kylinos.cn>
>>>>> Fixes: 4fe2c963154c ("io_uring: add support for link with drain")
>>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>>
>>>> BTW, the other solution here is to not release the completion_lock if
>>>> we're going to return -EIOCBQUEUED, and let the caller do what it needs
>>>> before releasing it. That'd look something like this, with some sparse
>>>> annotations to keep things happy.
>>>>
>>>> I think the original I posted here is easier to follow, and the
>>>> deferral list is going to be tiny in general so it won't really add
>>>> any extra overhead.
>>>>
>>>> Let me know what you think and prefer.
>>>>
>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>> index 6175e2e195c0..0d1f33bcedc0 100644
>>>> --- a/fs/io_uring.c
>>>> +++ b/fs/io_uring.c
>>>> @@ -2552,6 +2552,11 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>>>> 	return 0;
>>>> }
>>>>
>>>> +/*
>>>> + * Returns with ctx->completion_lock held if -EIOCBQUEUED is returned, so
>>>> + * the caller can make decisions based on the deferral without worrying about
>>>> + * the request being found and issued in the mean time.
>>>> + */
>>>> static int io_req_defer(struct io_kiocb *req)
>>>> {
>>>> 	const struct io_uring_sqe *sqe = req->submit.sqe;
>>>> @@ -2579,7 +2584,7 @@ static int io_req_defer(struct io_kiocb *req)
>>>>
>>>> 	trace_io_uring_defer(ctx, req, false);
>>>> 	list_add_tail(&req->list, &ctx->defer_list);
>>>> -	spin_unlock_irq(&ctx->completion_lock);
>>>> +	__release(&ctx->completion_lock);
>>>> 	return -EIOCBQUEUED;
>>>> }
>>>>
>>>> @@ -2954,6 +2959,7 @@ static void __io_queue_sqe(struct io_kiocb *req)
>>>>
>>>> static void io_queue_sqe(struct io_kiocb *req)
>>>> {
>>>> +	struct io_ring_ctx *ctx = req->ctx;
>>>> 	int ret;
>>>>
>>>> 	ret = io_req_defer(req);
>>>> @@ -2963,6 +2969,9 @@ static void io_queue_sqe(struct io_kiocb *req)
>>>> 			if (req->flags & REQ_F_LINK)
>>>> 				req->flags |= REQ_F_FAIL_LINK;
>>>> 			io_double_put_req(req);
>>>> +		} else {
>>>> +			__acquire(&ctx->completion_lock);
>>>> +			spin_unlock_irq(&ctx->completion_lock);
>>>> 		}
>>>> 	} else
>>>> 		__io_queue_sqe(req);
>>>> @@ -3001,16 +3010,17 @@ static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
>>>> 				__io_free_req(shadow);
>>>> 			return;
>>>> 		}
>>>> +		__acquire(&ctx->completion_lock);
>>>> 	} else {
>>>> 		/*
>>>> 		 * If ret == 0 means that all IOs in front of link io are
>>>> 		 * running done. let's queue link head.
>>>> 		 */
>>>> 		need_submit = true;
>>>> +		spin_lock_irq(&ctx->completion_lock);
>>>> 	}
>>>>
>>>> 	/* Insert shadow req to defer_list, blocking next IOs */
>>>> -	spin_lock_irq(&ctx->completion_lock);
>>>> 	trace_io_uring_defer(ctx, shadow, true);
>>>> 	list_add_tail(&shadow->list, &ctx->defer_list);
>>>> 	spin_unlock_irq(&ctx->completion_lock);
>>>
>>> This is indeed a potential lock issue, thanks. I prefer this solution; it is clearer than the first one.
>>> But it may be a bit difficult for other people who read the code. Maybe use 'io_req_defer_may_lock'?
>>>
>>> How about this?
>>>
>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>> index 5ad652f..6fdaeb1 100644
>>> --- a/fs/io_uring.c
>>> +++ b/fs/io_uring.c
>>> @@ -2469,7 +2469,7 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
>>>          return 0;
>>> }
>>>
>>> -static int io_req_defer(struct io_kiocb *req)
>>> +static int __io_req_defer(struct io_kiocb *req)
>>> {
>>>          const struct io_uring_sqe *sqe = req->submit.sqe;
>>>          struct io_uring_sqe *sqe_copy;
>>> @@ -2495,8 +2495,21 @@ static int io_req_defer(struct io_kiocb *req)
>>>
>>>          trace_io_uring_defer(ctx, req, false);
>>>          list_add_tail(&req->list, &ctx->defer_list);
>>> +
>>> +       return -EIOCBQUEUED;
>>> +}
>>> +
>>> +static int io_req_defer(struct io_kiocb *req)
>>> +{
>>> +       int ret = __io_req_defer(req);
>>
>> There is a problem here that needs fixing.
>>
>> static int io_req_defer(struct io_kiocb *req)
>> {
>> 	int ret = __io_req_defer(req);
>> 	if (ret == -EIOCBQUEUED)
>> 		spin_unlock_irq(&ctx->completion_lock);
>> 	return ret;
>> }
> 
> Mid-air collision, indeed.
> 
> But as I wrote in the previous email, I don't think this one improves on
> the situation... And fwiw, I did test both of mine, both are verified to
> fix the issue.

Maybe we can compromise on something like this? Doesn't introduce any
may_lock() naming, just uses the __io_req_defer() to take that blame.
And uses the right sparse annotations to keep things happy with C=2 as
well. Uses your trick to make io_req_defer() do the lock drop for the
other caller.

Ran it through 400x rounds of testing, confirmed as well.

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6175e2e195c0..299a218e9552 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2552,7 +2552,12 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
-static int io_req_defer(struct io_kiocb *req)
+/*
+ * Returns with ctx->completion_lock held if -EIOCBQUEUED is returned, so
+ * the caller can make decisions based on the deferral without worrying about
+ * the request being found and issued in the mean time.
+ */
+static int __io_req_defer(struct io_kiocb *req)
 {
 	const struct io_uring_sqe *sqe = req->submit.sqe;
 	struct io_uring_sqe *sqe_copy;
@@ -2579,10 +2584,23 @@ static int io_req_defer(struct io_kiocb *req)
 
 	trace_io_uring_defer(ctx, req, false);
 	list_add_tail(&req->list, &ctx->defer_list);
-	spin_unlock_irq(&ctx->completion_lock);
+	__release(&ctx->completion_lock);
 	return -EIOCBQUEUED;
 }
 
+static int io_req_defer(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	int ret;
+
+	ret = __io_req_defer(req);
+	if (ret == -EIOCBQUEUED) {
+		__acquire(&ctx->completion_lock);
+		spin_unlock_irq(&ctx->completion_lock);
+	}
+	return ret;
+}
+
 static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
 			   bool force_nonblock)
 {
@@ -2957,15 +2975,14 @@ static void io_queue_sqe(struct io_kiocb *req)
 	int ret;
 
 	ret = io_req_defer(req);
-	if (ret) {
-		if (ret != -EIOCBQUEUED) {
-			io_cqring_add_event(req, ret);
-			if (req->flags & REQ_F_LINK)
-				req->flags |= REQ_F_FAIL_LINK;
-			io_double_put_req(req);
-		}
-	} else
+	if (!ret) {
 		__io_queue_sqe(req);
+	} else if (ret != -EIOCBQUEUED) {
+		io_cqring_add_event(req, ret);
+		if (req->flags & REQ_F_LINK)
+			req->flags |= REQ_F_FAIL_LINK;
+		io_double_put_req(req);
+	}
 }
 
 static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
@@ -2989,7 +3006,7 @@ static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
 	 * list.
 	 */
 	req->flags |= REQ_F_IO_DRAIN;
-	ret = io_req_defer(req);
+	ret = __io_req_defer(req);
 	if (ret) {
 		if (ret != -EIOCBQUEUED) {
 err:
@@ -3001,16 +3018,17 @@ static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
 				__io_free_req(shadow);
 			return;
 		}
+		__acquire(&ctx->completion_lock);
 	} else {
 		/*
 		 * If ret == 0 means that all IOs in front of link io are
 		 * running done. let's queue link head.
 		 */
 		need_submit = true;
+		spin_lock_irq(&ctx->completion_lock);
 	}
 
 	/* Insert shadow req to defer_list, blocking next IOs */
-	spin_lock_irq(&ctx->completion_lock);
 	trace_io_uring_defer(ctx, shadow, true);
 	list_add_tail(&shadow->list, &ctx->defer_list);
 	spin_unlock_irq(&ctx->completion_lock);

-- 
Jens Axboe