From: Jens Axboe <axboe@kernel.dk>
To: Andres Freund <andres@anarazel.de>
Cc: linux-block@vger.kernel.org, io-uring <io-uring@vger.kernel.org>
Subject: Re: kernel (or fio) bug with IOSQE_IO_DRAIN
Date: Wed, 4 Dec 2019 08:43:07 -0700	[thread overview]
Message-ID: <5c22cf17-a2ea-80f3-c7fb-b3b673ce9bee@kernel.dk> (raw)
In-Reply-To: <84b6083c-861c-c586-a5f2-09143d96b74c@kernel.dk>

On 12/4/19 7:58 AM, Jens Axboe wrote:
> On 9/13/19 3:58 PM, Andres Freund wrote:
>> Hi Jens,
>>
>> While I'm not absolutely certain, this looks like a kernel side bug. I
>> modified fio to set DRAIN (because I'd need that in postgres, to not
>> actually just lose data as my current prototype does).  But the
>> submissions currently fail after a very short amount of time, as soon as
>> I use a queue depth bigger than one.
>>
>> I modified fio's master like (obviously not intended as an actual fio
>> patch):
>>
>> diff --git i/engines/io_uring.c w/engines/io_uring.c
>> index e5edfcd2..a3fe461f 100644
>> --- i/engines/io_uring.c
>> +++ w/engines/io_uring.c
>> @@ -181,16 +181,20 @@ static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u)
>>                           sqe->len = 1;
>>                   }
>>                   sqe->off = io_u->offset;
>> +               sqe->flags |= IOSQE_IO_DRAIN;
>>           } else if (ddir_sync(io_u->ddir)) {
>>                   if (io_u->ddir == DDIR_SYNC_FILE_RANGE) {
>>                           sqe->off = f->first_write;
>>                           sqe->len = f->last_write - f->first_write;
>>                           sqe->sync_range_flags = td->o.sync_file_range;
>>                           sqe->opcode = IORING_OP_SYNC_FILE_RANGE;
>> +
>> +                       //sqe->flags |= IOSQE_IO_DRAIN;
>>                   } else {
>>                           if (io_u->ddir == DDIR_DATASYNC)
>>                                   sqe->fsync_flags |= IORING_FSYNC_DATASYNC;
>>                           sqe->opcode = IORING_OP_FSYNC;
>> +                       //sqe->flags |= IOSQE_IO_DRAIN;
>>                   }
>>           }
>>    
>>
>> I pretty much immediately get failed requests back with a simple fio
>> job:
>>
>> [global]
>> name=fio-drain-bug
>>
>> size=1G
>> nrfiles=1
>>
>> iodepth=2
>> ioengine=io_uring
>>
>> [test]
>> rw=write
>>
>>
>> andres@alap4:~/src/fio$ ~/build/fio/install/bin/fio  ~/tmp/fio-drain-bug.fio
>> test: (g=0): rw=randwrite, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=io_uring, iodepth=2
>> fio-3.15
>> Starting 1 process
>> files=0
>> fio: io_u error on file test.0.0: Invalid argument: write offset=794877952, buflen=4096
>> fio: pid=3075, err=22/file:/home/andres/src/fio/io_u.c:1787, func=io_u error, error=Invalid argument
>>
>> test: (groupid=0, jobs=1): err=22 (file:/home/andres/src/fio/io_u.c:1787, func=io_u error, error=Invalid argument): pid=3075: Fri Sep 13 12:45:19 2019
>>
>>
>> Where I think the EINVAL might come from
>>
>> 	if (unlikely(s->index >= ctx->sq_entries))
>> 		return -EINVAL;
>>
>> based on the stack trace a perf probe returns (hacketyhack):
>>
>> kworker/u16:0-e  6304 [006] 28733.064781: probe:__io_submit_sqe: (ffffffff81356719)
>>           ffffffff8135671a __io_submit_sqe+0xfa (/lib/modules/5.3.0-rc8-andres-00007-g3120b9a6a3f7-dirty/build/vmlinux)
>>           ffffffff81356da0 io_sq_wq_submit_work+0xe0 (/lib/modules/5.3.0-rc8-andres-00007-g3120b9a6a3f7-dirty/build/vmlinux)
>>           ffffffff81137392 process_one_work+0x1d2 (/lib/modules/5.3.0-rc8-andres-00007-g3120b9a6a3f7-dirty/build/vmlinux)
>>           ffffffff8113758d worker_thread+0x4d (/lib/modules/5.3.0-rc8-andres-00007-g3120b9a6a3f7-dirty/build/vmlinux)
>>           ffffffff8113d1e6 kthread+0x106 (/lib/modules/5.3.0-rc8-andres-00007-g3120b9a6a3f7-dirty/build/vmlinux)
>>           ffffffff8200021a ret_from_fork+0x3a (/lib/modules/5.3.0-rc8-andres-00007-g3120b9a6a3f7-dirty/build/vmlinux
>>
>> /home/andres/src/kernel/fs/io_uring.c:1800
>>
>> which is precisely that return:
>> 	if (unlikely(s->index >= ctx->sq_entries))
>> 		return -EINVAL;
>>
>> This is with your change to limit the number of workqueue threads
>> applied, but I don't think that should matter.
>>
>> I suspect that this is hit due to ->index being uninitialized, rather
>> than actually being too large. In io_sq_wq_submit_work() the sqe_submit
>> embedded in io_kiocb is used.  ISTM that e.g. when
>> io_uring_enter()->io_ring_submit()->io_submit_sqe()
>> allocates the new io_kiocb, and io_queue_sqe()->io_req_defer() then
>> submits that to io_sq_wq_submit_work(), io_kiocb->submit.index (and some
>> other fields?) is never initialized.
> 
> For some reason I completely missed this one. I just tested this with
> the current tree, and I don't see any issues, but we also changed a few
> things there in this area and could have fixed this one inadvertently.
> I'll try 5.4/5.3, we might need a stable addition for this if it's still
> an issue.
> 
> Also see:
> 
> https://github.com/axboe/liburing/issues/33

I think your analysis is spot on, and it's affecting 5.2..5.4. Below is
the stable patch for 5.4; it should apply to 5.2 and 5.3 as well by just
dropping the link hunk, since 5.2/5.3 don't have that.

5.5-rc isn't affected.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2c819c3c855d..0393545a39a7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2016,7 +2017,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  }
  
  static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			const struct io_uring_sqe *sqe)
+			struct sqe_submit *s)
  {
  	struct io_uring_sqe *sqe_copy;
  
@@ -2034,7 +2035,8 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
  		return 0;
  	}
  
-	memcpy(sqe_copy, sqe, sizeof(*sqe_copy));
+	memcpy(&req->submit, s, sizeof(*s));
+	memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
  	req->submit.sqe = sqe_copy;
  
  	INIT_WORK(&req->work, io_sq_wq_submit_work);
@@ -2399,7 +2401,7 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
  {
  	int ret;
  
-	ret = io_req_defer(ctx, req, s->sqe);
+	ret = io_req_defer(ctx, req, s);
  	if (ret) {
  		if (ret != -EIOCBQUEUED) {
  			io_free_req(req);
@@ -2426,7 +2428,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
  	 * list.
  	 */
  	req->flags |= REQ_F_IO_DRAIN;
-	ret = io_req_defer(ctx, req, s->sqe);
+	ret = io_req_defer(ctx, req, s);
  	if (ret) {
  		if (ret != -EIOCBQUEUED) {
  			io_free_req(req);

-- 
Jens Axboe


Thread overview: 3+ messages
2019-09-13 21:58 kernel (or fio) bug with IOSQE_IO_DRAIN Andres Freund
2019-12-04 14:58 ` Jens Axboe
2019-12-04 15:43   ` Jens Axboe
