* [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
@ 2020-05-30 14:39 Xiaoguang Wang
  2020-05-30 14:39 ` [PATCH v4 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature Xiaoguang Wang
  2020-05-30 16:44 ` [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Jens Axboe
  0 siblings, 2 replies; 27+ messages in thread
From: Xiaoguang Wang @ 2020-05-30 14:39 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, asml.silence, joseph.qi, Xiaoguang Wang

If requests can be submitted and completed inline, we don't need to
initialize the whole io_wq_work in io_init_req(), which is an expensive
operation. Add a new flag 'REQ_F_WORK_INITIALIZED' to control whether
io_wq_work has been initialized.

I used /dev/nullb0 to evaluate the performance improvement on my physical
machine:
  modprobe null_blk nr_devices=1 completion_nsec=0
  sudo taskset -c 60 fio  -name=fiotest -filename=/dev/nullb0 -iodepth=128
  -thread -rw=read -ioengine=io_uring -direct=1 -bs=4k -size=100G -numjobs=1
  -time_based -runtime=120

before this patch:
Run status group 0 (all jobs):
   READ: bw=724MiB/s (759MB/s), 724MiB/s-724MiB/s (759MB/s-759MB/s),
   io=84.8GiB (91.1GB), run=120001-120001msec

With this patch:
Run status group 0 (all jobs):
   READ: bw=761MiB/s (798MB/s), 761MiB/s-761MiB/s (798MB/s-798MB/s),
   io=89.2GiB (95.8GB), run=120001-120001msec

About 5% improvement.

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>

---
V4:
  add io_req_init_async() helper
---
 fs/io-wq.h    |  5 ----
 fs/io_uring.c | 66 ++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/fs/io-wq.h b/fs/io-wq.h
index 5ba12de7572f..3d85d365d764 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -94,11 +94,6 @@ struct io_wq_work {
 	pid_t task_pid;
 };
 
-#define INIT_IO_WORK(work, _func)				\
-	do {							\
-		*(work) = (struct io_wq_work){ .func = _func };	\
-	} while (0)						\
-
 static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
 {
 	if (!work->list.next)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 95df63b0b2ce..12296ce3e8b9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -538,6 +538,7 @@ enum {
 	REQ_F_POLLED_BIT,
 	REQ_F_BUFFER_SELECTED_BIT,
 	REQ_F_NO_FILE_TABLE_BIT,
+	REQ_F_WORK_INITIALIZED_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -593,6 +594,8 @@ enum {
 	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
 	/* doesn't need file table for this request */
 	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
+	/* io_wq_work is initialized */
+	REQ_F_WORK_INITIALIZED	= BIT(REQ_F_WORK_INITIALIZED_BIT),
 };
 
 struct async_poll {
@@ -638,6 +641,7 @@ struct io_kiocb {
 	unsigned int		flags;
 	refcount_t		refs;
 	struct task_struct	*task;
+	const struct cred	*creds;
 	unsigned long		fsize;
 	u64			user_data;
 	u32			result;
@@ -900,6 +904,17 @@ EXPORT_SYMBOL(io_uring_get_socket);
 
 static void io_file_put_work(struct work_struct *work);
 
+static inline void io_req_init_async(struct io_kiocb *req,
+			void (*func)(struct io_wq_work **))
+{
+	if (req->flags & REQ_F_WORK_INITIALIZED)
+		req->work.func = func;
+	else {
+		req->work = (struct io_wq_work){ .func = func };
+		req->flags |= REQ_F_WORK_INITIALIZED;
+	}
+}
+
 static inline bool io_async_submit(struct io_ring_ctx *ctx)
 {
 	return ctx->flags & IORING_SETUP_SQPOLL;
@@ -1025,8 +1040,15 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
 		mmgrab(current->mm);
 		req->work.mm = current->mm;
 	}
-	if (!req->work.creds)
-		req->work.creds = get_current_cred();
+
+	if (!req->work.creds) {
+		if (!req->creds)
+			req->work.creds = get_current_cred();
+		else {
+			req->work.creds = req->creds;
+			req->creds = NULL;
+		}
+	}
 	if (!req->work.fs && def->needs_fs) {
 		spin_lock(&current->fs->lock);
 		if (!current->fs->in_exec) {
@@ -1043,6 +1065,9 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
 
 static inline void io_req_work_drop_env(struct io_kiocb *req)
 {
+	if (!(req->flags & REQ_F_WORK_INITIALIZED))
+		return;
+
 	if (req->work.mm) {
 		mmdrop(req->work.mm);
 		req->work.mm = NULL;
@@ -3045,7 +3070,7 @@ static int io_fsync(struct io_kiocb *req, bool force_nonblock)
 {
 	/* fsync always requires a blocking context */
 	if (force_nonblock) {
-		req->work.func = io_fsync_finish;
+		io_req_init_async(req, io_fsync_finish);
 		return -EAGAIN;
 	}
 	__io_fsync(req);
@@ -3093,7 +3118,7 @@ static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
 {
 	/* fallocate always requiring blocking context */
 	if (force_nonblock) {
-		req->work.func = io_fallocate_finish;
+		io_req_init_async(req, io_fallocate_finish);
 		return -EAGAIN;
 	}
 
@@ -3618,7 +3643,7 @@ static int io_close(struct io_kiocb *req, bool force_nonblock)
 	if (req->close.put_file->f_op->flush && force_nonblock) {
 		/* avoid grabbing files - we don't need the files */
 		req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT;
-		req->work.func = io_close_finish;
+		io_req_init_async(req, io_close_finish);
 		return -EAGAIN;
 	}
 
@@ -3675,7 +3700,7 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
 {
 	/* sync_file_range always requires a blocking context */
 	if (force_nonblock) {
-		req->work.func = io_sync_file_range_finish;
+		io_req_init_async(req, io_sync_file_range_finish);
 		return -EAGAIN;
 	}
 
@@ -4144,7 +4169,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock)
 
 	ret = __io_accept(req, force_nonblock);
 	if (ret == -EAGAIN && force_nonblock) {
-		req->work.func = io_accept_finish;
+		io_req_init_async(req, io_accept_finish);
 		return -EAGAIN;
 	}
 	return 0;
@@ -5144,6 +5169,8 @@ static int io_req_defer_prep(struct io_kiocb *req,
 	if (!sqe)
 		return 0;
 
+	io_req_init_async(req, io_wq_submit_work);
+
 	if (io_op_defs[req->opcode].file_table) {
 		ret = io_grab_files(req);
 		if (unlikely(ret))
@@ -5779,19 +5806,24 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_kiocb *linked_timeout;
 	struct io_kiocb *nxt;
-	const struct cred *old_creds = NULL;
+	const struct cred *creds, *old_creds = NULL;
 	int ret;
 
 again:
 	linked_timeout = io_prep_linked_timeout(req);
 
-	if (req->work.creds && req->work.creds != current_cred()) {
+	if (req->flags & REQ_F_WORK_INITIALIZED)
+		creds = req->work.creds;
+	else
+		creds = req->creds;
+
+	if (creds && creds != current_cred()) {
 		if (old_creds)
 			revert_creds(old_creds);
-		if (old_creds == req->work.creds)
+		if (old_creds == creds)
 			old_creds = NULL; /* restored original creds */
 		else
-			old_creds = override_creds(req->work.creds);
+			old_creds = override_creds(creds);
 	}
 
 	ret = io_issue_sqe(req, sqe, true);
@@ -5808,6 +5840,8 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 			goto exit;
 		}
 punt:
+		io_req_init_async(req, io_wq_submit_work);
+
 		if (io_op_defs[req->opcode].file_table) {
 			ret = io_grab_files(req);
 			if (ret)
@@ -6060,7 +6094,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	refcount_set(&req->refs, 2);
 	req->task = NULL;
 	req->result = 0;
-	INIT_IO_WORK(&req->work, io_wq_submit_work);
 
 	if (unlikely(req->opcode >= IORING_OP_LAST))
 		return -EINVAL;
@@ -6082,11 +6115,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 	id = READ_ONCE(sqe->personality);
 	if (id) {
-		req->work.creds = idr_find(&ctx->personality_idr, id);
-		if (unlikely(!req->work.creds))
+		req->creds = idr_find(&ctx->personality_idr, id);
+		if (unlikely(!req->creds))
 			return -EINVAL;
-		get_cred(req->work.creds);
-	}
+		get_cred(req->creds);
+	} else
+		req->creds = NULL;
 
 	/* same numerical values with corresponding REQ_F_*, safe to copy */
 	req->flags |= sqe_flags;
-- 
2.17.2



* [PATCH v4 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature
  2020-05-30 14:39 [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Xiaoguang Wang
@ 2020-05-30 14:39 ` Xiaoguang Wang
  2020-05-30 16:44 ` [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Jens Axboe
  1 sibling, 0 replies; 27+ messages in thread
From: Xiaoguang Wang @ 2020-05-30 14:39 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, asml.silence, joseph.qi, Xiaoguang Wang

Basically, the IORING_OP_POLL_ADD command and async armed poll handlers for
regular commands don't touch io_wq_work, so we only need to do the io_wq_work
copy and restore when REQ_F_WORK_INITIALIZED is set.

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>

---
V3:
  drop the REQ_F_WORK_NEED_RESTORE flag introduced in V2 patch, just
  use REQ_F_WORK_INITIALIZED to control whether to do io_wq_work copy
  and restore.
---
 fs/io_uring.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 12296ce3e8b9..a7d61f3366a5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4513,7 +4513,8 @@ static void io_async_task_func(struct callback_head *cb)
 	spin_unlock_irq(&ctx->completion_lock);
 
 	/* restore ->work in case we need to retry again */
-	memcpy(&req->work, &apoll->work, sizeof(req->work));
+	if (req->flags & REQ_F_WORK_INITIALIZED)
+		memcpy(&req->work, &apoll->work, sizeof(req->work));
 	kfree(apoll);
 
 	if (!canceled) {
@@ -4610,7 +4611,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
 		return false;
 
 	req->flags |= REQ_F_POLLED;
-	memcpy(&apoll->work, &req->work, sizeof(req->work));
+	if (req->flags & REQ_F_WORK_INITIALIZED)
+		memcpy(&apoll->work, &req->work, sizeof(req->work));
 	had_io = req->io != NULL;
 
 	get_task_struct(current);
@@ -4635,7 +4637,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
 		if (!had_io)
 			io_poll_remove_double(req);
 		spin_unlock_irq(&ctx->completion_lock);
-		memcpy(&req->work, &apoll->work, sizeof(req->work));
+		if (req->flags & REQ_F_WORK_INITIALIZED)
+			memcpy(&req->work, &apoll->work, sizeof(req->work));
 		kfree(apoll);
 		return false;
 	}
@@ -4680,7 +4683,9 @@ static bool io_poll_remove_one(struct io_kiocb *req)
 			 * io_req_work_drop_env below when dropping the
 			 * final reference.
 			 */
-			memcpy(&req->work, &apoll->work, sizeof(req->work));
+			if (req->flags & REQ_F_WORK_INITIALIZED)
+				memcpy(&req->work, &apoll->work,
+				       sizeof(req->work));
 			kfree(apoll);
 		}
 	}
-- 
2.17.2



* Re: [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-30 14:39 [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Xiaoguang Wang
  2020-05-30 14:39 ` [PATCH v4 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature Xiaoguang Wang
@ 2020-05-30 16:44 ` Jens Axboe
  2020-05-30 17:14   ` Jens Axboe
  1 sibling, 1 reply; 27+ messages in thread
From: Jens Axboe @ 2020-05-30 16:44 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: asml.silence, joseph.qi

On 5/30/20 8:39 AM, Xiaoguang Wang wrote:
> If requests can be submitted and completed inline, we don't need to
> initialize whole io_wq_work in io_init_req(), which is an expensive
> operation, add a new 'REQ_F_WORK_INITIALIZED' to control whether
> io_wq_work is initialized.
> 
> I use /dev/nullb0 to evaluate performance improvement in my physical
> machine:
>   modprobe null_blk nr_devices=1 completion_nsec=0
>   sudo taskset -c 60 fio  -name=fiotest -filename=/dev/nullb0 -iodepth=128
>   -thread -rw=read -ioengine=io_uring -direct=1 -bs=4k -size=100G -numjobs=1
>   -time_based -runtime=120
> 
> before this patch:
> Run status group 0 (all jobs):
>    READ: bw=724MiB/s (759MB/s), 724MiB/s-724MiB/s (759MB/s-759MB/s),
>    io=84.8GiB (91.1GB), run=120001-120001msec
> 
> With this patch:
> Run status group 0 (all jobs):
>    READ: bw=761MiB/s (798MB/s), 761MiB/s-761MiB/s (798MB/s-798MB/s),
>    io=89.2GiB (95.8GB), run=120001-120001msec
> 
> About 5% improvement.

There's something funky going on here. I ran the liburing test
suite on this, and get a lot of left behind workers:

Tests _maybe_ failed:  ring-leak open-close open-close file-update file-update accept-reuse accept-reuse poll-v-poll poll-v-poll fadvise fadvise madvise madvise short-read short-read openat2 openat2 probe probe shared-wq shared-wq personality personality eventfd eventfd send_recv send_recv eventfd-ring eventfd-ring across-fork across-fork sq-poll-kthread sq-poll-kthread splice splice lfs-openat lfs-openat lfs-openat-write lfs-openat-write iopoll iopoll d4ae271dfaae-test d4ae271dfaae-test eventfd-disable eventfd-disable write-file write-file buf-rw buf-rw statx statx

and also saw this:

[  168.208940] ==================================================================
[  168.209311] BUG: KASAN: use-after-free in __lock_acquire+0x8bf/0x3000
[  168.209626] Read of size 8 at addr ffff88806801c0d8 by task io_wqe_worker-0/41761
[  168.209987] 
[  168.210069] CPU: 0 PID: 41761 Comm: io_wqe_worker-0 Not tainted 5.7.0-rc7+ #6318
[  168.210424] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
[  168.210857] Call Trace:
[  168.210991]  dump_stack+0x97/0xe0
[  168.211164]  print_address_description.constprop.0+0x1a/0x210
[  168.211446]  ? __lock_acquire+0x8bf/0x3000
[  168.211649]  __kasan_report.cold+0x20/0x39
[  168.211851]  ? __lock_acquire+0x8bf/0x3000
[  168.212051]  kasan_report+0x30/0x40
[  168.212226]  __lock_acquire+0x8bf/0x3000
[  168.212432]  ? ret_from_fork+0x24/0x30
[  168.212623]  ? stack_trace_save+0x81/0xa0
[  168.212821]  ? lockdep_hardirqs_on+0x270/0x270
[  168.213039]  ? save_stack+0x32/0x40
[  168.213212]  lock_acquire+0x122/0x570
[  168.213398]  ? __close_fd_get_file+0x40/0x150
[  168.213615]  ? lock_release+0x3f0/0x3f0
[  168.213814]  ? __lock_acquire+0x87e/0x3000
[  168.214016]  _raw_spin_lock+0x2c/0x40
[  168.214196]  ? __close_fd_get_file+0x40/0x150
[  168.214408]  __close_fd_get_file+0x40/0x150
[  168.214618]  io_issue_sqe+0x57f/0x22f0
[  168.214803]  ? lockdep_hardirqs_on+0x270/0x270
[  168.215019]  ? mark_held_locks+0x24/0x90
[  168.215211]  ? quarantine_put+0x6f/0x190
[  168.215404]  ? io_assign_current_work+0x59/0x80
[  168.215623]  ? __ia32_sys_io_uring_setup+0x30/0x30
[  168.215855]  ? find_held_lock+0xcb/0x100
[  168.216054]  ? io_worker_handle_work+0x289/0x980
[  168.216280]  ? lock_downgrade+0x340/0x340
[  168.216476]  ? io_wq_submit_work+0x5d/0x140
[  168.216679]  ? _raw_spin_unlock_irq+0x24/0x40
[  168.216890]  io_wq_submit_work+0x5d/0x140
[  168.217087]  io_worker_handle_work+0x30a/0x980
[  168.217305]  ? io_wqe_dec_running.isra.0+0x70/0x70
[  168.217537]  ? do_raw_spin_lock+0x100/0x180
[  168.217742]  ? rwlock_bug.part.0+0x60/0x60
[  168.217943]  io_wqe_worker+0x5fd/0x780
[  168.218126]  ? lock_downgrade+0x340/0x340
[  168.218323]  ? io_worker_handle_work+0x980/0x980
[  168.218546]  ? lockdep_hardirqs_on+0x17d/0x270
[  168.218765]  ? __kthread_parkme+0xca/0xe0
[  168.218961]  ? io_worker_handle_work+0x980/0x980
[  168.219186]  kthread+0x1f0/0x220
[  168.219346]  ? kthread_create_worker_on_cpu+0xb0/0xb0
[  168.219590]  ret_from_fork+0x24/0x30
[  168.219768] 
[  168.219846] Allocated by task 41758:
[  168.220021]  save_stack+0x1b/0x40
[  168.220185]  __kasan_kmalloc.constprop.0+0xc2/0xd0
[  168.220416]  kmem_cache_alloc+0xe0/0x290
[  168.220607]  dup_fd+0x4e/0x5a0
[  168.220758]  copy_process+0xe35/0x2bf0
[  168.220942]  _do_fork+0xd8/0x550
[  168.221102]  __do_sys_clone+0xb5/0xe0
[  168.221282]  do_syscall_64+0x5e/0xe0
[  168.221457]  entry_SYSCALL_64_after_hwframe+0x49/0xb3
[  168.221729] 
[  168.221848] Freed by task 41759:
[  168.222088]  save_stack+0x1b/0x40
[  168.222336]  __kasan_slab_free+0x12f/0x180
[  168.222632]  slab_free_freelist_hook+0x4d/0x120
[  168.222959]  kmem_cache_free+0x90/0x2e0
[  168.223239]  do_exit+0x5d2/0x12e0
[  168.223482]  do_group_exit+0x6f/0x130
[  168.223754]  __x64_sys_exit_group+0x28/0x30
[  168.224061]  do_syscall_64+0x5e/0xe0
[  168.224326]  entry_SYSCALL_64_after_hwframe+0x49/0xb3
[  168.224686] 

which indicates that current->files is no longer valid.

-- 
Jens Axboe



* Re: [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-30 16:44 ` [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Jens Axboe
@ 2020-05-30 17:14   ` Jens Axboe
  2020-05-30 17:29     ` Jens Axboe
  0 siblings, 1 reply; 27+ messages in thread
From: Jens Axboe @ 2020-05-30 17:14 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: asml.silence, joseph.qi

On 5/30/20 10:44 AM, Jens Axboe wrote:
> On 5/30/20 8:39 AM, Xiaoguang Wang wrote:
>> If requests can be submitted and completed inline, we don't need to
>> initialize whole io_wq_work in io_init_req(), which is an expensive
>> operation, add a new 'REQ_F_WORK_INITIALIZED' to control whether
>> io_wq_work is initialized.
>>
>> I use /dev/nullb0 to evaluate performance improvement in my physical
>> machine:
>>   modprobe null_blk nr_devices=1 completion_nsec=0
>>   sudo taskset -c 60 fio  -name=fiotest -filename=/dev/nullb0 -iodepth=128
>>   -thread -rw=read -ioengine=io_uring -direct=1 -bs=4k -size=100G -numjobs=1
>>   -time_based -runtime=120
>>
>> before this patch:
>> Run status group 0 (all jobs):
>>    READ: bw=724MiB/s (759MB/s), 724MiB/s-724MiB/s (759MB/s-759MB/s),
>>    io=84.8GiB (91.1GB), run=120001-120001msec
>>
>> With this patch:
>> Run status group 0 (all jobs):
>>    READ: bw=761MiB/s (798MB/s), 761MiB/s-761MiB/s (798MB/s-798MB/s),
>>    io=89.2GiB (95.8GB), run=120001-120001msec
>>
>> About 5% improvement.
> 
> There's something funky going on here. I ran the liburing test
> suite on this, and get a lot of left behind workers:
> 
> Tests _maybe_ failed:  ring-leak open-close open-close file-update file-update accept-reuse accept-reuse poll-v-poll poll-v-poll fadvise fadvise madvise madvise short-read short-read openat2 openat2 probe probe shared-wq shared-wq personality personality eventfd eventfd send_recv send_recv eventfd-ring eventfd-ring across-fork across-fork sq-poll-kthread sq-poll-kthread splice splice lfs-openat lfs-openat lfs-openat-write lfs-openat-write iopoll iopoll d4ae271dfaae-test d4ae271dfaae-test eventfd-disable eventfd-disable write-file write-file buf-rw buf-rw statx statx
> 
> and also saw this:
> 
> [  168.208940] ==================================================================
> [  168.209311] BUG: KASAN: use-after-free in __lock_acquire+0x8bf/0x3000
> [  168.209626] Read of size 8 at addr ffff88806801c0d8 by task io_wqe_worker-0/41761
> [  168.209987] 
> [  168.210069] CPU: 0 PID: 41761 Comm: io_wqe_worker-0 Not tainted 5.7.0-rc7+ #6318
> [  168.210424] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
> [  168.210857] Call Trace:
> [  168.210991]  dump_stack+0x97/0xe0
> [  168.211164]  print_address_description.constprop.0+0x1a/0x210
> [  168.211446]  ? __lock_acquire+0x8bf/0x3000
> [  168.211649]  __kasan_report.cold+0x20/0x39
> [  168.211851]  ? __lock_acquire+0x8bf/0x3000
> [  168.212051]  kasan_report+0x30/0x40
> [  168.212226]  __lock_acquire+0x8bf/0x3000
> [  168.212432]  ? ret_from_fork+0x24/0x30
> [  168.212623]  ? stack_trace_save+0x81/0xa0
> [  168.212821]  ? lockdep_hardirqs_on+0x270/0x270
> [  168.213039]  ? save_stack+0x32/0x40
> [  168.213212]  lock_acquire+0x122/0x570
> [  168.213398]  ? __close_fd_get_file+0x40/0x150
> [  168.213615]  ? lock_release+0x3f0/0x3f0
> [  168.213814]  ? __lock_acquire+0x87e/0x3000
> [  168.214016]  _raw_spin_lock+0x2c/0x40
> [  168.214196]  ? __close_fd_get_file+0x40/0x150
> [  168.214408]  __close_fd_get_file+0x40/0x150
> [  168.214618]  io_issue_sqe+0x57f/0x22f0
> [  168.214803]  ? lockdep_hardirqs_on+0x270/0x270
> [  168.215019]  ? mark_held_locks+0x24/0x90
> [  168.215211]  ? quarantine_put+0x6f/0x190
> [  168.215404]  ? io_assign_current_work+0x59/0x80
> [  168.215623]  ? __ia32_sys_io_uring_setup+0x30/0x30
> [  168.215855]  ? find_held_lock+0xcb/0x100
> [  168.216054]  ? io_worker_handle_work+0x289/0x980
> [  168.216280]  ? lock_downgrade+0x340/0x340
> [  168.216476]  ? io_wq_submit_work+0x5d/0x140
> [  168.216679]  ? _raw_spin_unlock_irq+0x24/0x40
> [  168.216890]  io_wq_submit_work+0x5d/0x140
> [  168.217087]  io_worker_handle_work+0x30a/0x980
> [  168.217305]  ? io_wqe_dec_running.isra.0+0x70/0x70
> [  168.217537]  ? do_raw_spin_lock+0x100/0x180
> [  168.217742]  ? rwlock_bug.part.0+0x60/0x60
> [  168.217943]  io_wqe_worker+0x5fd/0x780
> [  168.218126]  ? lock_downgrade+0x340/0x340
> [  168.218323]  ? io_worker_handle_work+0x980/0x980
> [  168.218546]  ? lockdep_hardirqs_on+0x17d/0x270
> [  168.218765]  ? __kthread_parkme+0xca/0xe0
> [  168.218961]  ? io_worker_handle_work+0x980/0x980
> [  168.219186]  kthread+0x1f0/0x220
> [  168.219346]  ? kthread_create_worker_on_cpu+0xb0/0xb0
> [  168.219590]  ret_from_fork+0x24/0x30
> [  168.219768] 
> [  168.219846] Allocated by task 41758:
> [  168.220021]  save_stack+0x1b/0x40
> [  168.220185]  __kasan_kmalloc.constprop.0+0xc2/0xd0
> [  168.220416]  kmem_cache_alloc+0xe0/0x290
> [  168.220607]  dup_fd+0x4e/0x5a0
> [  168.220758]  copy_process+0xe35/0x2bf0
> [  168.220942]  _do_fork+0xd8/0x550
> [  168.221102]  __do_sys_clone+0xb5/0xe0
> [  168.221282]  do_syscall_64+0x5e/0xe0
> [  168.221457]  entry_SYSCALL_64_after_hwframe+0x49/0xb3
> [  168.221729] 
> [  168.221848] Freed by task 41759:
> [  168.222088]  save_stack+0x1b/0x40
> [  168.222336]  __kasan_slab_free+0x12f/0x180
> [  168.222632]  slab_free_freelist_hook+0x4d/0x120
> [  168.222959]  kmem_cache_free+0x90/0x2e0
> [  168.223239]  do_exit+0x5d2/0x12e0
> [  168.223482]  do_group_exit+0x6f/0x130
> [  168.223754]  __x64_sys_exit_group+0x28/0x30
> [  168.224061]  do_syscall_64+0x5e/0xe0
> [  168.224326]  entry_SYSCALL_64_after_hwframe+0x49/0xb3
> [  168.224686] 
> 
> which indicates that current->files is no longer valid.

Narrowed it down to the test/open-close test, and in particular where
it closes the ring itself:

ret = test_close(&ring, ring.ring_fd, 1);

This seems to be because you do io_req_init_async() after calling
io_issue_sqe(), and the command handler may have set something
else for ->func at that point. Hence we never call the right
handler if the close needs to be deferred, as it does for the
io_uring fd, since it has ->flush() defined.
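
(For reference, this is the punt path that v4 adds to __io_queue_sqe() in the
diff above; because the v4 io_req_init_async() unconditionally overwrites
->func when REQ_F_WORK_INITIALIZED is already set, a handler the opcode
installed earlier is lost here:)

punt:
	/* io_close() already did io_req_init_async(req, io_close_finish),
	 * so REQ_F_WORK_INITIALIZED is set and this call replaces ->func
	 * with io_wq_submit_work, dropping the private handler */
	io_req_init_async(req, io_wq_submit_work);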

Why isn't io_req_init_async() just doing:

static inline void io_req_init_async(struct io_kiocb *req,
                         void (*func)(struct io_wq_work **))
{                                                                               
        if (!(req->flags & REQ_F_WORK_INITIALIZED)) {
                req->work = (struct io_wq_work){ .func = func };
                req->flags |= REQ_F_WORK_INITIALIZED;
        }
}

?

-- 
Jens Axboe



* Re: [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-30 17:14   ` Jens Axboe
@ 2020-05-30 17:29     ` Jens Axboe
  2020-05-30 17:36       ` Pavel Begunkov
  2020-05-31 14:12       ` Xiaoguang Wang
  0 siblings, 2 replies; 27+ messages in thread
From: Jens Axboe @ 2020-05-30 17:29 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: asml.silence, joseph.qi

On 5/30/20 11:14 AM, Jens Axboe wrote:
> On 5/30/20 10:44 AM, Jens Axboe wrote:
>> On 5/30/20 8:39 AM, Xiaoguang Wang wrote:
>>> If requests can be submitted and completed inline, we don't need to
>>> initialize whole io_wq_work in io_init_req(), which is an expensive
>>> operation, add a new 'REQ_F_WORK_INITIALIZED' to control whether
>>> io_wq_work is initialized.
>>>
>>> I use /dev/nullb0 to evaluate performance improvement in my physical
>>> machine:
>>>   modprobe null_blk nr_devices=1 completion_nsec=0
>>>   sudo taskset -c 60 fio  -name=fiotest -filename=/dev/nullb0 -iodepth=128
>>>   -thread -rw=read -ioengine=io_uring -direct=1 -bs=4k -size=100G -numjobs=1
>>>   -time_based -runtime=120
>>>
>>> before this patch:
>>> Run status group 0 (all jobs):
>>>    READ: bw=724MiB/s (759MB/s), 724MiB/s-724MiB/s (759MB/s-759MB/s),
>>>    io=84.8GiB (91.1GB), run=120001-120001msec
>>>
>>> With this patch:
>>> Run status group 0 (all jobs):
>>>    READ: bw=761MiB/s (798MB/s), 761MiB/s-761MiB/s (798MB/s-798MB/s),
>>>    io=89.2GiB (95.8GB), run=120001-120001msec
>>>
>>> About 5% improvement.
>>
>> There's something funky going on here. I ran the liburing test
>> suite on this, and get a lot of left behind workers:
>>
>> Tests _maybe_ failed:  ring-leak open-close open-close file-update file-update accept-reuse accept-reuse poll-v-poll poll-v-poll fadvise fadvise madvise madvise short-read short-read openat2 openat2 probe probe shared-wq shared-wq personality personality eventfd eventfd send_recv send_recv eventfd-ring eventfd-ring across-fork across-fork sq-poll-kthread sq-poll-kthread splice splice lfs-openat lfs-openat lfs-openat-write lfs-openat-write iopoll iopoll d4ae271dfaae-test d4ae271dfaae-test eventfd-disable eventfd-disable write-file write-file buf-rw buf-rw statx statx
>>
>> and also saw this:
>>
>> [  168.208940] ==================================================================
>> [  168.209311] BUG: KASAN: use-after-free in __lock_acquire+0x8bf/0x3000
>> [  168.209626] Read of size 8 at addr ffff88806801c0d8 by task io_wqe_worker-0/41761
>> [  168.209987] 
>> [  168.210069] CPU: 0 PID: 41761 Comm: io_wqe_worker-0 Not tainted 5.7.0-rc7+ #6318
>> [  168.210424] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
>> [  168.210857] Call Trace:
>> [  168.210991]  dump_stack+0x97/0xe0
>> [  168.211164]  print_address_description.constprop.0+0x1a/0x210
>> [  168.211446]  ? __lock_acquire+0x8bf/0x3000
>> [  168.211649]  __kasan_report.cold+0x20/0x39
>> [  168.211851]  ? __lock_acquire+0x8bf/0x3000
>> [  168.212051]  kasan_report+0x30/0x40
>> [  168.212226]  __lock_acquire+0x8bf/0x3000
>> [  168.212432]  ? ret_from_fork+0x24/0x30
>> [  168.212623]  ? stack_trace_save+0x81/0xa0
>> [  168.212821]  ? lockdep_hardirqs_on+0x270/0x270
>> [  168.213039]  ? save_stack+0x32/0x40
>> [  168.213212]  lock_acquire+0x122/0x570
>> [  168.213398]  ? __close_fd_get_file+0x40/0x150
>> [  168.213615]  ? lock_release+0x3f0/0x3f0
>> [  168.213814]  ? __lock_acquire+0x87e/0x3000
>> [  168.214016]  _raw_spin_lock+0x2c/0x40
>> [  168.214196]  ? __close_fd_get_file+0x40/0x150
>> [  168.214408]  __close_fd_get_file+0x40/0x150
>> [  168.214618]  io_issue_sqe+0x57f/0x22f0
>> [  168.214803]  ? lockdep_hardirqs_on+0x270/0x270
>> [  168.215019]  ? mark_held_locks+0x24/0x90
>> [  168.215211]  ? quarantine_put+0x6f/0x190
>> [  168.215404]  ? io_assign_current_work+0x59/0x80
>> [  168.215623]  ? __ia32_sys_io_uring_setup+0x30/0x30
>> [  168.215855]  ? find_held_lock+0xcb/0x100
>> [  168.216054]  ? io_worker_handle_work+0x289/0x980
>> [  168.216280]  ? lock_downgrade+0x340/0x340
>> [  168.216476]  ? io_wq_submit_work+0x5d/0x140
>> [  168.216679]  ? _raw_spin_unlock_irq+0x24/0x40
>> [  168.216890]  io_wq_submit_work+0x5d/0x140
>> [  168.217087]  io_worker_handle_work+0x30a/0x980
>> [  168.217305]  ? io_wqe_dec_running.isra.0+0x70/0x70
>> [  168.217537]  ? do_raw_spin_lock+0x100/0x180
>> [  168.217742]  ? rwlock_bug.part.0+0x60/0x60
>> [  168.217943]  io_wqe_worker+0x5fd/0x780
>> [  168.218126]  ? lock_downgrade+0x340/0x340
>> [  168.218323]  ? io_worker_handle_work+0x980/0x980
>> [  168.218546]  ? lockdep_hardirqs_on+0x17d/0x270
>> [  168.218765]  ? __kthread_parkme+0xca/0xe0
>> [  168.218961]  ? io_worker_handle_work+0x980/0x980
>> [  168.219186]  kthread+0x1f0/0x220
>> [  168.219346]  ? kthread_create_worker_on_cpu+0xb0/0xb0
>> [  168.219590]  ret_from_fork+0x24/0x30
>> [  168.219768] 
>> [  168.219846] Allocated by task 41758:
>> [  168.220021]  save_stack+0x1b/0x40
>> [  168.220185]  __kasan_kmalloc.constprop.0+0xc2/0xd0
>> [  168.220416]  kmem_cache_alloc+0xe0/0x290
>> [  168.220607]  dup_fd+0x4e/0x5a0
>> [  168.220758]  copy_process+0xe35/0x2bf0
>> [  168.220942]  _do_fork+0xd8/0x550
>> [  168.221102]  __do_sys_clone+0xb5/0xe0
>> [  168.221282]  do_syscall_64+0x5e/0xe0
>> [  168.221457]  entry_SYSCALL_64_after_hwframe+0x49/0xb3
>> [  168.221729] 
>> [  168.221848] Freed by task 41759:
>> [  168.222088]  save_stack+0x1b/0x40
>> [  168.222336]  __kasan_slab_free+0x12f/0x180
>> [  168.222632]  slab_free_freelist_hook+0x4d/0x120
>> [  168.222959]  kmem_cache_free+0x90/0x2e0
>> [  168.223239]  do_exit+0x5d2/0x12e0
>> [  168.223482]  do_group_exit+0x6f/0x130
>> [  168.223754]  __x64_sys_exit_group+0x28/0x30
>> [  168.224061]  do_syscall_64+0x5e/0xe0
>> [  168.224326]  entry_SYSCALL_64_after_hwframe+0x49/0xb3
>> [  168.224686] 
>>
>> which indicates that current->files is no longer valid.
> 
> Narrowed it down to the test/open-close test, and in particular where
> it closes the ring itself:
> 
> ret = test_close(&ring, ring.ring_fd, 1);
> 
> This seems to be because you do io_req_init_async() after calling
> io_issue_sqe(), and the command handler may have set something
> else for ->func at that point. Hence we never call the right
> handler if the close needs to be deferred, as it needs to for
> the io_uring as it has ->flush() defined.
> 
> Why isn't io_req_init_async() just doing:
> 
> static inline void io_req_init_async(struct io_kiocb *req,
>                          void (*func)(struct io_wq_work **))
> {                                                                               
>         if (!(req->flags & REQ_F_WORK_INITIALIZED)) {
>                 req->work = (struct io_wq_work){ .func = func };
>                 req->flags |= REQ_F_WORK_INITIALIZED;
>         }
> }
> 
> ?

I guess that won't always work, if the request has been deferred and
we're now setting a new work func. So we really want the 'reset to
io_wq_submit_work' to only happen if the opcode hasn't already set
a private handler. Can you make that change?
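
A minimal sketch of that direction (only install the default handler when the
opcode hasn't already set a private one; sketch only, not necessarily the
final v5 code):

static inline void io_req_init_async(struct io_kiocb *req,
			void (*func)(struct io_wq_work **))
{
	if (req->flags & REQ_F_WORK_INITIALIZED) {
		/* keep any handler the opcode already installed */
		if (!req->work.func)
			req->work.func = func;
	} else {
		req->work = (struct io_wq_work){ .func = func };
		req->flags |= REQ_F_WORK_INITIALIZED;
	}
}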

Also please fix up missing braces. The cases of:

if (something) {
	line 1
	line 2
} else
	line 3

should always include braces if one clause has them.
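
i.e., with braces on both branches:

if (something) {
	line 1
	line 2
} else {
	line 3
}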

A v5 with those two things would be ready to commit.

-- 
Jens Axboe



* Re: [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-30 17:29     ` Jens Axboe
@ 2020-05-30 17:36       ` Pavel Begunkov
  2020-05-31 13:57         ` Xiaoguang Wang
  2020-05-31 14:12       ` Xiaoguang Wang
  1 sibling, 1 reply; 27+ messages in thread
From: Pavel Begunkov @ 2020-05-30 17:36 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 30/05/2020 20:29, Jens Axboe wrote:
> On 5/30/20 11:14 AM, Jens Axboe wrote:
>> On 5/30/20 10:44 AM, Jens Axboe wrote:
>>> On 5/30/20 8:39 AM, Xiaoguang Wang wrote:
>>>> If requests can be submitted and completed inline, we don't need to
>>>> initialize whole io_wq_work in io_init_req(), which is an expensive
>>>> operation, add a new 'REQ_F_WORK_INITIALIZED' to control whether
>>>> io_wq_work is initialized.
>>>>
>>>> I use /dev/nullb0 to evaluate performance improvement in my physical
>>>> machine:
>>>>   modprobe null_blk nr_devices=1 completion_nsec=0
>>>>   sudo taskset -c 60 fio  -name=fiotest -filename=/dev/nullb0 -iodepth=128
>>>>   -thread -rw=read -ioengine=io_uring -direct=1 -bs=4k -size=100G -numjobs=1
>>>>   -time_based -runtime=120
>>>>
>>>> before this patch:
>>>> Run status group 0 (all jobs):
>>>>    READ: bw=724MiB/s (759MB/s), 724MiB/s-724MiB/s (759MB/s-759MB/s),
>>>>    io=84.8GiB (91.1GB), run=120001-120001msec
>>>>
>>>> With this patch:
>>>> Run status group 0 (all jobs):
>>>>    READ: bw=761MiB/s (798MB/s), 761MiB/s-761MiB/s (798MB/s-798MB/s),
>>>>    io=89.2GiB (95.8GB), run=120001-120001msec
>>>>
>>>> About 5% improvement.
>>>
>>> There's something funky going on here. I ran the liburing test
>>> suite on this, and get a lot of left behind workers:
>>>
>>> Tests _maybe_ failed:  ring-leak open-close open-close file-update file-update accept-reuse accept-reuse poll-v-poll poll-v-poll fadvise fadvise madvise madvise short-read short-read openat2 openat2 probe probe shared-wq shared-wq personality personality eventfd eventfd send_recv send_recv eventfd-ring eventfd-ring across-fork across-fork sq-poll-kthread sq-poll-kthread splice splice lfs-openat lfs-openat lfs-openat-write lfs-openat-write iopoll iopoll d4ae271dfaae-test d4ae271dfaae-test eventfd-disable eventfd-disable write-file write-file buf-rw buf-rw statx statx
>>>
>>> and also saw this:
>>>
>>> [  168.208940] ==================================================================
>>> [  168.209311] BUG: KASAN: use-after-free in __lock_acquire+0x8bf/0x3000
>>> [  168.209626] Read of size 8 at addr ffff88806801c0d8 by task io_wqe_worker-0/41761
>>> [  168.209987] 
>>> [  168.210069] CPU: 0 PID: 41761 Comm: io_wqe_worker-0 Not tainted 5.7.0-rc7+ #6318
>>> [  168.210424] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
>>> [  168.210857] Call Trace:
>>> [  168.210991]  dump_stack+0x97/0xe0
>>> [  168.211164]  print_address_description.constprop.0+0x1a/0x210
>>> [  168.211446]  ? __lock_acquire+0x8bf/0x3000
>>> [  168.211649]  __kasan_report.cold+0x20/0x39
>>> [  168.211851]  ? __lock_acquire+0x8bf/0x3000
>>> [  168.212051]  kasan_report+0x30/0x40
>>> [  168.212226]  __lock_acquire+0x8bf/0x3000
>>> [  168.212432]  ? ret_from_fork+0x24/0x30
>>> [  168.212623]  ? stack_trace_save+0x81/0xa0
>>> [  168.212821]  ? lockdep_hardirqs_on+0x270/0x270
>>> [  168.213039]  ? save_stack+0x32/0x40
>>> [  168.213212]  lock_acquire+0x122/0x570
>>> [  168.213398]  ? __close_fd_get_file+0x40/0x150
>>> [  168.213615]  ? lock_release+0x3f0/0x3f0
>>> [  168.213814]  ? __lock_acquire+0x87e/0x3000
>>> [  168.214016]  _raw_spin_lock+0x2c/0x40
>>> [  168.214196]  ? __close_fd_get_file+0x40/0x150
>>> [  168.214408]  __close_fd_get_file+0x40/0x150
>>> [  168.214618]  io_issue_sqe+0x57f/0x22f0
>>> [  168.214803]  ? lockdep_hardirqs_on+0x270/0x270
>>> [  168.215019]  ? mark_held_locks+0x24/0x90
>>> [  168.215211]  ? quarantine_put+0x6f/0x190
>>> [  168.215404]  ? io_assign_current_work+0x59/0x80
>>> [  168.215623]  ? __ia32_sys_io_uring_setup+0x30/0x30
>>> [  168.215855]  ? find_held_lock+0xcb/0x100
>>> [  168.216054]  ? io_worker_handle_work+0x289/0x980
>>> [  168.216280]  ? lock_downgrade+0x340/0x340
>>> [  168.216476]  ? io_wq_submit_work+0x5d/0x140
>>> [  168.216679]  ? _raw_spin_unlock_irq+0x24/0x40
>>> [  168.216890]  io_wq_submit_work+0x5d/0x140
>>> [  168.217087]  io_worker_handle_work+0x30a/0x980
>>> [  168.217305]  ? io_wqe_dec_running.isra.0+0x70/0x70
>>> [  168.217537]  ? do_raw_spin_lock+0x100/0x180
>>> [  168.217742]  ? rwlock_bug.part.0+0x60/0x60
>>> [  168.217943]  io_wqe_worker+0x5fd/0x780
>>> [  168.218126]  ? lock_downgrade+0x340/0x340
>>> [  168.218323]  ? io_worker_handle_work+0x980/0x980
>>> [  168.218546]  ? lockdep_hardirqs_on+0x17d/0x270
>>> [  168.218765]  ? __kthread_parkme+0xca/0xe0
>>> [  168.218961]  ? io_worker_handle_work+0x980/0x980
>>> [  168.219186]  kthread+0x1f0/0x220
>>> [  168.219346]  ? kthread_create_worker_on_cpu+0xb0/0xb0
>>> [  168.219590]  ret_from_fork+0x24/0x30
>>> [  168.219768] 
>>> [  168.219846] Allocated by task 41758:
>>> [  168.220021]  save_stack+0x1b/0x40
>>> [  168.220185]  __kasan_kmalloc.constprop.0+0xc2/0xd0
>>> [  168.220416]  kmem_cache_alloc+0xe0/0x290
>>> [  168.220607]  dup_fd+0x4e/0x5a0
>>> [  168.220758]  copy_process+0xe35/0x2bf0
>>> [  168.220942]  _do_fork+0xd8/0x550
>>> [  168.221102]  __do_sys_clone+0xb5/0xe0
>>> [  168.221282]  do_syscall_64+0x5e/0xe0
>>> [  168.221457]  entry_SYSCALL_64_after_hwframe+0x49/0xb3
>>> [  168.221729] 
>>> [  168.221848] Freed by task 41759:
>>> [  168.222088]  save_stack+0x1b/0x40
>>> [  168.222336]  __kasan_slab_free+0x12f/0x180
>>> [  168.222632]  slab_free_freelist_hook+0x4d/0x120
>>> [  168.222959]  kmem_cache_free+0x90/0x2e0
>>> [  168.223239]  do_exit+0x5d2/0x12e0
>>> [  168.223482]  do_group_exit+0x6f/0x130
>>> [  168.223754]  __x64_sys_exit_group+0x28/0x30
>>> [  168.224061]  do_syscall_64+0x5e/0xe0
>>> [  168.224326]  entry_SYSCALL_64_after_hwframe+0x49/0xb3
>>> [  168.224686] 
>>>
>>> which indicates that current->files is no longer valid.
>>
>> Narrowed it down to the test/open-close test, and in particular where
>> it closes the ring itself:
>>
>> ret = test_close(&ring, ring.ring_fd, 1);
>>
>> This seems to be because you do io_req_init_async() after calling
>> io_issue_sqe(), and the command handler may have set something
>> else for ->func at that point. Hence we never call the right
>> handler if the close needs to be deferred, as it needs to for
>> the io_uring as it has ->flush() defined.
>>
>> Why isn't io_req_init_async() just doing:
>>
>> static inline void io_req_init_async(struct io_kiocb *req,
>>                          void (*func)(struct io_wq_work **))
>> {                                                                               
>>         if (!(req->flags & REQ_F_WORK_INITIALIZED)) {
>>                 req->work = (struct io_wq_work){ .func = func };
>>                 req->flags |= REQ_F_WORK_INITIALIZED;
>>         }
>> }
>>
>> ?
> 
> I guess that won't always work, if the request has been deferred and
> we're now setting a new work func. So we really want the 'reset to
> io_wq_submit_work' to only happen if the opcode hasn't already set
> a private handler. Can you make that change?
> 
> Also please fix up missing braces. The cases of:
> 
> if (something) {
> 	line 1
> 	line 2
> } else
> 	line 3
> 
> should always includes braces, if one clause has it.
> 
> A v5 with those two things would be ready to commit.
> 

There is another thing:

io_submit_sqes()
    -> io_close() (let ->flush == NULL)
        -> __io_close_finish()
            -> filp_close(req->close.put_file, *req->work.files*);

where req->work.files is garbage.

-- 
Pavel Begunkov


* Re: [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-30 17:36       ` Pavel Begunkov
@ 2020-05-31 13:57         ` Xiaoguang Wang
  2020-05-31 14:49           ` Pavel Begunkov
  0 siblings, 1 reply; 27+ messages in thread
From: Xiaoguang Wang @ 2020-05-31 13:57 UTC (permalink / raw)
  To: Pavel Begunkov, Jens Axboe, io-uring; +Cc: joseph.qi

hi,

> On 30/05/2020 20:29, Jens Axboe wrote:
>> On 5/30/20 11:14 AM, Jens Axboe wrote:
>>> On 5/30/20 10:44 AM, Jens Axboe wrote:
>>>> On 5/30/20 8:39 AM, Xiaoguang Wang wrote:
>>>>> If requests can be submitted and completed inline, we don't need to
>>>>> initialize whole io_wq_work in io_init_req(), which is an expensive
>>>>> operation, add a new 'REQ_F_WORK_INITIALIZED' to control whether
>>>>> io_wq_work is initialized.
>>>>>
>>>>> I use /dev/nullb0 to evaluate performance improvement in my physical
>>>>> machine:
>>>>>    modprobe null_blk nr_devices=1 completion_nsec=0
>>>>>    sudo taskset -c 60 fio  -name=fiotest -filename=/dev/nullb0 -iodepth=128
>>>>>    -thread -rw=read -ioengine=io_uring -direct=1 -bs=4k -size=100G -numjobs=1
>>>>>    -time_based -runtime=120
>>>>>
>>>>> before this patch:
>>>>> Run status group 0 (all jobs):
>>>>>     READ: bw=724MiB/s (759MB/s), 724MiB/s-724MiB/s (759MB/s-759MB/s),
>>>>>     io=84.8GiB (91.1GB), run=120001-120001msec
>>>>>
>>>>> With this patch:
>>>>> Run status group 0 (all jobs):
>>>>>     READ: bw=761MiB/s (798MB/s), 761MiB/s-761MiB/s (798MB/s-798MB/s),
>>>>>     io=89.2GiB (95.8GB), run=120001-120001msec
>>>>>
>>>>> About 5% improvement.
>>>>
>>>> There's something funky going on here. I ran the liburing test
>>>> suite on this, and get a lot of left behind workers:
>>>>
>>>> Tests _maybe_ failed:  ring-leak open-close open-close file-update file-update accept-reuse accept-reuse poll-v-poll poll-v-poll fadvise fadvise madvise madvise short-read short-read openat2 openat2 probe probe shared-wq shared-wq personality personality eventfd eventfd send_recv send_recv eventfd-ring eventfd-ring across-fork across-fork sq-poll-kthread sq-poll-kthread splice splice lfs-openat lfs-openat lfs-openat-write lfs-openat-write iopoll iopoll d4ae271dfaae-test d4ae271dfaae-test eventfd-disable eventfd-disable write-file write-file buf-rw buf-rw statx statx
>>>>
>>>> and also saw this:
>>>>
>>>> [  168.208940] ==================================================================
>>>> [  168.209311] BUG: KASAN: use-after-free in __lock_acquire+0x8bf/0x3000
>>>> [  168.209626] Read of size 8 at addr ffff88806801c0d8 by task io_wqe_worker-0/41761
>>>> [  168.209987]
>>>> [  168.210069] CPU: 0 PID: 41761 Comm: io_wqe_worker-0 Not tainted 5.7.0-rc7+ #6318
>>>> [  168.210424] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
>>>> [  168.210857] Call Trace:
>>>> [  168.210991]  dump_stack+0x97/0xe0
>>>> [  168.211164]  print_address_description.constprop.0+0x1a/0x210
>>>> [  168.211446]  ? __lock_acquire+0x8bf/0x3000
>>>> [  168.211649]  __kasan_report.cold+0x20/0x39
>>>> [  168.211851]  ? __lock_acquire+0x8bf/0x3000
>>>> [  168.212051]  kasan_report+0x30/0x40
>>>> [  168.212226]  __lock_acquire+0x8bf/0x3000
>>>> [  168.212432]  ? ret_from_fork+0x24/0x30
>>>> [  168.212623]  ? stack_trace_save+0x81/0xa0
>>>> [  168.212821]  ? lockdep_hardirqs_on+0x270/0x270
>>>> [  168.213039]  ? save_stack+0x32/0x40
>>>> [  168.213212]  lock_acquire+0x122/0x570
>>>> [  168.213398]  ? __close_fd_get_file+0x40/0x150
>>>> [  168.213615]  ? lock_release+0x3f0/0x3f0
>>>> [  168.213814]  ? __lock_acquire+0x87e/0x3000
>>>> [  168.214016]  _raw_spin_lock+0x2c/0x40
>>>> [  168.214196]  ? __close_fd_get_file+0x40/0x150
>>>> [  168.214408]  __close_fd_get_file+0x40/0x150
>>>> [  168.214618]  io_issue_sqe+0x57f/0x22f0
>>>> [  168.214803]  ? lockdep_hardirqs_on+0x270/0x270
>>>> [  168.215019]  ? mark_held_locks+0x24/0x90
>>>> [  168.215211]  ? quarantine_put+0x6f/0x190
>>>> [  168.215404]  ? io_assign_current_work+0x59/0x80
>>>> [  168.215623]  ? __ia32_sys_io_uring_setup+0x30/0x30
>>>> [  168.215855]  ? find_held_lock+0xcb/0x100
>>>> [  168.216054]  ? io_worker_handle_work+0x289/0x980
>>>> [  168.216280]  ? lock_downgrade+0x340/0x340
>>>> [  168.216476]  ? io_wq_submit_work+0x5d/0x140
>>>> [  168.216679]  ? _raw_spin_unlock_irq+0x24/0x40
>>>> [  168.216890]  io_wq_submit_work+0x5d/0x140
>>>> [  168.217087]  io_worker_handle_work+0x30a/0x980
>>>> [  168.217305]  ? io_wqe_dec_running.isra.0+0x70/0x70
>>>> [  168.217537]  ? do_raw_spin_lock+0x100/0x180
>>>> [  168.217742]  ? rwlock_bug.part.0+0x60/0x60
>>>> [  168.217943]  io_wqe_worker+0x5fd/0x780
>>>> [  168.218126]  ? lock_downgrade+0x340/0x340
>>>> [  168.218323]  ? io_worker_handle_work+0x980/0x980
>>>> [  168.218546]  ? lockdep_hardirqs_on+0x17d/0x270
>>>> [  168.218765]  ? __kthread_parkme+0xca/0xe0
>>>> [  168.218961]  ? io_worker_handle_work+0x980/0x980
>>>> [  168.219186]  kthread+0x1f0/0x220
>>>> [  168.219346]  ? kthread_create_worker_on_cpu+0xb0/0xb0
>>>> [  168.219590]  ret_from_fork+0x24/0x30
>>>> [  168.219768]
>>>> [  168.219846] Allocated by task 41758:
>>>> [  168.220021]  save_stack+0x1b/0x40
>>>> [  168.220185]  __kasan_kmalloc.constprop.0+0xc2/0xd0
>>>> [  168.220416]  kmem_cache_alloc+0xe0/0x290
>>>> [  168.220607]  dup_fd+0x4e/0x5a0
>>>> [  168.220758]  copy_process+0xe35/0x2bf0
>>>> [  168.220942]  _do_fork+0xd8/0x550
>>>> [  168.221102]  __do_sys_clone+0xb5/0xe0
>>>> [  168.221282]  do_syscall_64+0x5e/0xe0
>>>> [  168.221457]  entry_SYSCALL_64_after_hwframe+0x49/0xb3
>>>> [  168.221729]
>>>> [  168.221848] Freed by task 41759:
>>>> [  168.222088]  save_stack+0x1b/0x40
>>>> [  168.222336]  __kasan_slab_free+0x12f/0x180
>>>> [  168.222632]  slab_free_freelist_hook+0x4d/0x120
>>>> [  168.222959]  kmem_cache_free+0x90/0x2e0
>>>> [  168.223239]  do_exit+0x5d2/0x12e0
>>>> [  168.223482]  do_group_exit+0x6f/0x130
>>>> [  168.223754]  __x64_sys_exit_group+0x28/0x30
>>>> [  168.224061]  do_syscall_64+0x5e/0xe0
>>>> [  168.224326]  entry_SYSCALL_64_after_hwframe+0x49/0xb3
>>>> [  168.224686]
>>>>
>>>> which indicates that current->files is no longer valid.
>>>
>>> Narrowed it down to the test/open-close test, and in particular where
>>> it closes the ring itself:
>>>
>>> ret = test_close(&ring, ring.ring_fd, 1);
>>>
>>> This seems to be because you do io_req_init_async() after calling
>>> io_issue_sqe(), and the command handler may have set something
>>> else for ->func at that point. Hence we never call the right
>>> handler if the close needs to be deferred, as it needs to for
>>> the io_uring as it has ->flush() defined.
>>>
>>> Why isn't io_req_init_async() just doing:
>>>
>>> static inline void io_req_init_async(struct io_kiocb *req,
>>>                           void (*func)(struct io_wq_work **))
>>> {
>>>          if (!(req->flags & REQ_F_WORK_INITIALIZED)) {
>>>                  req->work = (struct io_wq_work){ .func = func };
>>>                  req->flags |= REQ_F_WORK_INITIALIZED;
>>>          }
>>> }
>>>
>>> ?
>>
>> I guess that won't always work, if the request has been deferred and
>> we're now setting a new work func. So we really want the 'reset to
>> io_wq_submit_work' to only happen if the opcode hasn't already set
>> a private handler. Can you make that change?
>>
>> Also please fix up missing braces. The cases of:
>>
>> if (something) {
>> 	line 1
>> 	line 2
>> } else
>> 	line 3
>>
>> should always includes braces, if one clause has it.
>>
>> A v5 with those two things would be ready to commit.
>>
> 
> There is another thing:
> 
> io_submit_sqes()
>      -> io_close() (let ->flush == NULL)
>          -> __io_close_finish()
>              -> filp_close(req->close.put_file, *req->work.files*);
> 
> where req->work.files is garbage.
I think this bug is independent of my patch. Even without my patches, if a close
request is submitted and completed inline, req->work.files will be NULL, which is
still problematic. Should we use current->files here?
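
Purely to illustrate that question, a hypothetical guard (a sketch only, not a
tested fix) could look like:

	/* sketch: fall back to current->files when ->work was never
	 * initialized for this request */
	struct files_struct *files = (req->flags & REQ_F_WORK_INITIALIZED) ?
				     req->work.files : current->files;

	filp_close(req->close.put_file, files);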

Regards,
Xiaoguang Wang
> 


* Re: [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-30 17:29     ` Jens Axboe
  2020-05-30 17:36       ` Pavel Begunkov
@ 2020-05-31 14:12       ` Xiaoguang Wang
  2020-05-31 14:31         ` Xiaoguang Wang
  2020-05-31 14:33         ` [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Pavel Begunkov
  1 sibling, 2 replies; 27+ messages in thread
From: Xiaoguang Wang @ 2020-05-31 14:12 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: asml.silence, joseph.qi

hi jens,

> On 5/30/20 11:14 AM, Jens Axboe wrote:
>> On 5/30/20 10:44 AM, Jens Axboe wrote:
>>> On 5/30/20 8:39 AM, Xiaoguang Wang wrote:
>>>> If requests can be submitted and completed inline, we don't need to
>>>> initialize whole io_wq_work in io_init_req(), which is an expensive
>>>> operation, add a new 'REQ_F_WORK_INITIALIZED' to control whether
>>>> io_wq_work is initialized.
>>>>
>>>> I use /dev/nullb0 to evaluate performance improvement in my physical
>>>> machine:
>>>>    modprobe null_blk nr_devices=1 completion_nsec=0
>>>>    sudo taskset -c 60 fio  -name=fiotest -filename=/dev/nullb0 -iodepth=128
>>>>    -thread -rw=read -ioengine=io_uring -direct=1 -bs=4k -size=100G -numjobs=1
>>>>    -time_based -runtime=120
>>>>
>>>> before this patch:
>>>> Run status group 0 (all jobs):
>>>>     READ: bw=724MiB/s (759MB/s), 724MiB/s-724MiB/s (759MB/s-759MB/s),
>>>>     io=84.8GiB (91.1GB), run=120001-120001msec
>>>>
>>>> With this patch:
>>>> Run status group 0 (all jobs):
>>>>     READ: bw=761MiB/s (798MB/s), 761MiB/s-761MiB/s (798MB/s-798MB/s),
>>>>     io=89.2GiB (95.8GB), run=120001-120001msec
>>>>
>>>> About 5% improvement.
>>>
>>> There's something funky going on here. I ran the liburing test
>>> suite on this, and get a lot of left behind workers:
>>>
>>> Tests _maybe_ failed:  ring-leak open-close open-close file-update file-update accept-reuse accept-reuse poll-v-poll poll-v-poll fadvise fadvise madvise madvise short-read short-read openat2 openat2 probe probe shared-wq shared-wq personality personality eventfd eventfd send_recv send_recv eventfd-ring eventfd-ring across-fork across-fork sq-poll-kthread sq-poll-kthread splice splice lfs-openat lfs-openat lfs-openat-write lfs-openat-write iopoll iopoll d4ae271dfaae-test d4ae271dfaae-test eventfd-disable eventfd-disable write-file write-file buf-rw buf-rw statx statx
>>>
>>> and also saw this:
>>>
>>> [  168.208940] ==================================================================
>>> [  168.209311] BUG: KASAN: use-after-free in __lock_acquire+0x8bf/0x3000
>>> [  168.209626] Read of size 8 at addr ffff88806801c0d8 by task io_wqe_worker-0/41761
>>> [  168.209987]
>>> [  168.210069] CPU: 0 PID: 41761 Comm: io_wqe_worker-0 Not tainted 5.7.0-rc7+ #6318
>>> [  168.210424] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
>>> [  168.210857] Call Trace:
>>> [  168.210991]  dump_stack+0x97/0xe0
>>> [  168.211164]  print_address_description.constprop.0+0x1a/0x210
>>> [  168.211446]  ? __lock_acquire+0x8bf/0x3000
>>> [  168.211649]  __kasan_report.cold+0x20/0x39
>>> [  168.211851]  ? __lock_acquire+0x8bf/0x3000
>>> [  168.212051]  kasan_report+0x30/0x40
>>> [  168.212226]  __lock_acquire+0x8bf/0x3000
>>> [  168.212432]  ? ret_from_fork+0x24/0x30
>>> [  168.212623]  ? stack_trace_save+0x81/0xa0
>>> [  168.212821]  ? lockdep_hardirqs_on+0x270/0x270
>>> [  168.213039]  ? save_stack+0x32/0x40
>>> [  168.213212]  lock_acquire+0x122/0x570
>>> [  168.213398]  ? __close_fd_get_file+0x40/0x150
>>> [  168.213615]  ? lock_release+0x3f0/0x3f0
>>> [  168.213814]  ? __lock_acquire+0x87e/0x3000
>>> [  168.214016]  _raw_spin_lock+0x2c/0x40
>>> [  168.214196]  ? __close_fd_get_file+0x40/0x150
>>> [  168.214408]  __close_fd_get_file+0x40/0x150
>>> [  168.214618]  io_issue_sqe+0x57f/0x22f0
>>> [  168.214803]  ? lockdep_hardirqs_on+0x270/0x270
>>> [  168.215019]  ? mark_held_locks+0x24/0x90
>>> [  168.215211]  ? quarantine_put+0x6f/0x190
>>> [  168.215404]  ? io_assign_current_work+0x59/0x80
>>> [  168.215623]  ? __ia32_sys_io_uring_setup+0x30/0x30
>>> [  168.215855]  ? find_held_lock+0xcb/0x100
>>> [  168.216054]  ? io_worker_handle_work+0x289/0x980
>>> [  168.216280]  ? lock_downgrade+0x340/0x340
>>> [  168.216476]  ? io_wq_submit_work+0x5d/0x140
>>> [  168.216679]  ? _raw_spin_unlock_irq+0x24/0x40
>>> [  168.216890]  io_wq_submit_work+0x5d/0x140
>>> [  168.217087]  io_worker_handle_work+0x30a/0x980
>>> [  168.217305]  ? io_wqe_dec_running.isra.0+0x70/0x70
>>> [  168.217537]  ? do_raw_spin_lock+0x100/0x180
>>> [  168.217742]  ? rwlock_bug.part.0+0x60/0x60
>>> [  168.217943]  io_wqe_worker+0x5fd/0x780
>>> [  168.218126]  ? lock_downgrade+0x340/0x340
>>> [  168.218323]  ? io_worker_handle_work+0x980/0x980
>>> [  168.218546]  ? lockdep_hardirqs_on+0x17d/0x270
>>> [  168.218765]  ? __kthread_parkme+0xca/0xe0
>>> [  168.218961]  ? io_worker_handle_work+0x980/0x980
>>> [  168.219186]  kthread+0x1f0/0x220
>>> [  168.219346]  ? kthread_create_worker_on_cpu+0xb0/0xb0
>>> [  168.219590]  ret_from_fork+0x24/0x30
>>> [  168.219768]
>>> [  168.219846] Allocated by task 41758:
>>> [  168.220021]  save_stack+0x1b/0x40
>>> [  168.220185]  __kasan_kmalloc.constprop.0+0xc2/0xd0
>>> [  168.220416]  kmem_cache_alloc+0xe0/0x290
>>> [  168.220607]  dup_fd+0x4e/0x5a0
>>> [  168.220758]  copy_process+0xe35/0x2bf0
>>> [  168.220942]  _do_fork+0xd8/0x550
>>> [  168.221102]  __do_sys_clone+0xb5/0xe0
>>> [  168.221282]  do_syscall_64+0x5e/0xe0
>>> [  168.221457]  entry_SYSCALL_64_after_hwframe+0x49/0xb3
>>> [  168.221729]
>>> [  168.221848] Freed by task 41759:
>>> [  168.222088]  save_stack+0x1b/0x40
>>> [  168.222336]  __kasan_slab_free+0x12f/0x180
>>> [  168.222632]  slab_free_freelist_hook+0x4d/0x120
>>> [  168.222959]  kmem_cache_free+0x90/0x2e0
>>> [  168.223239]  do_exit+0x5d2/0x12e0
>>> [  168.223482]  do_group_exit+0x6f/0x130
>>> [  168.223754]  __x64_sys_exit_group+0x28/0x30
>>> [  168.224061]  do_syscall_64+0x5e/0xe0
>>> [  168.224326]  entry_SYSCALL_64_after_hwframe+0x49/0xb3
>>> [  168.224686]
>>>
>>> which indicates that current->files is no longer valid.
>>
>> Narrowed it down to the test/open-close test, and in particular where
>> it closes the ring itself:
>>
>> ret = test_close(&ring, ring.ring_fd, 1);
>>
>> This seems to be because you do io_req_init_async() after calling
>> io_issue_sqe(), and the command handler may have set something
>> else for ->func at that point. Hence we never call the right
>> handler if the close needs to be deferred, as it needs to for
>> the io_uring as it has ->flush() defined.
>>
>> Why isn't io_req_init_async() just doing:
>>
>> static inline void io_req_init_async(struct io_kiocb *req,
>>                           void (*func)(struct io_wq_work **))
>> {
>>          if (!(req->flags & REQ_F_WORK_INITIALIZED)) {
>>                  req->work = (struct io_wq_work){ .func = func };
>>                  req->flags |= REQ_F_WORK_INITIALIZED;
>>          }
>> }
>>
>> ?
> 
> I guess that won't always work, if the request has been deferred and
> we're now setting a new work func. So we really want the 'reset to
> io_wq_submit_work' to only happen if the opcode hasn't already set
> a private handler. Can you make that change?
> 
> Also please fix up missing braces. The cases of:
> 
> if (something) {
> 	line 1
> 	line 2
> } else
> 	line 3
> 
> should always includes braces, if one clause has it.
> 
> A v5 with those two things would be ready to commit.
First, thanks for figuring out this bug.
Indeed, I had run the test cases in liburing/test for multiple rounds before
sending the V4 patches, and they all passed with no kernel issues. Here is my
kernel build config; it seems that I had KASAN enabled.
[lege@localhost linux]$ cat .config | grep KASAN
CONFIG_KASAN_SHADOW_OFFSET=0xdffffc0000000000
CONFIG_HAVE_ARCH_KASAN=y
CONFIG_HAVE_ARCH_KASAN_VMALLOC=y
CONFIG_CC_HAS_KASAN_GENERIC=y
CONFIG_KASAN=y
CONFIG_KASAN_GENERIC=y
CONFIG_KASAN_OUTLINE=y
# CONFIG_KASAN_INLINE is not set
CONFIG_KASAN_STACK=1
# CONFIG_KASAN_VMALLOC is not set
# CONFIG_TEST_KASAN is not set
But I don't know why I didn't reproduce the bug, sorry.

Are you fine with the below changes?
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 12296ce3e8b9..2a3a02838f7b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -907,9 +907,10 @@ static void io_file_put_work(struct work_struct *work);
  static inline void io_req_init_async(struct io_kiocb *req,
                         void (*func)(struct io_wq_work **))
  {
-       if (req->flags & REQ_F_WORK_INITIALIZED)
-               req->work.func = func;
-       else {
+       if (req->flags & REQ_F_WORK_INITIALIZED) {
+               if (!req->work.func)
+                       req->work.func = func;
+       } else {
                 req->work = (struct io_wq_work){ .func = func };
                 req->flags |= REQ_F_WORK_INITIALIZED;
         }
@@ -2920,6 +2921,8 @@ static int __io_splice_prep(struct io_kiocb *req,
                 return ret;
         req->flags |= REQ_F_NEED_CLEANUP;

+       /* Splice will be punted async, so initialize io_wq_work first */
+       io_req_init_async(req, io_wq_submit_work);
         if (!S_ISREG(file_inode(sp->file_in)->i_mode))
                 req->work.flags |= IO_WQ_WORK_UNBOUND;

@@ -3592,6 +3595,9 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock)

  static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  {
+       /* Close may be punted async, so initialize io_wq_work first */
+       io_req_init_async(req, io_wq_submit_work);
+
         /*
          * If we queue this for async, it must not be cancellable. That would
          * leave the 'file' in an undeterminate state.

Regards,
Xiaoguang Wang


> 


* Re: [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-31 14:12       ` Xiaoguang Wang
@ 2020-05-31 14:31         ` Xiaoguang Wang
  2020-05-31 14:57           ` Pavel Begunkov
  2020-05-31 14:33         ` [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Pavel Begunkov
  1 sibling, 1 reply; 27+ messages in thread
From: Xiaoguang Wang @ 2020-05-31 14:31 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: asml.silence, joseph.qi

hi,

> hi jens,
> 
>> On 5/30/20 11:14 AM, Jens Axboe wrote:
>>> On 5/30/20 10:44 AM, Jens Axboe wrote:
>>>> On 5/30/20 8:39 AM, Xiaoguang Wang wrote:
>>>>> If requests can be submitted and completed inline, we don't need to
>>>>> initialize whole io_wq_work in io_init_req(), which is an expensive
>>>>> operation, add a new 'REQ_F_WORK_INITIALIZED' to control whether
>>>>> io_wq_work is initialized.
>>>>>
>>>>> I use /dev/nullb0 to evaluate performance improvement in my physical
>>>>> machine:
>>>>>    modprobe null_blk nr_devices=1 completion_nsec=0
>>>>>    sudo taskset -c 60 fio  -name=fiotest -filename=/dev/nullb0 -iodepth=128
>>>>>    -thread -rw=read -ioengine=io_uring -direct=1 -bs=4k -size=100G -numjobs=1
>>>>>    -time_based -runtime=120
>>>>>
>>>>> before this patch:
>>>>> Run status group 0 (all jobs):
>>>>>     READ: bw=724MiB/s (759MB/s), 724MiB/s-724MiB/s (759MB/s-759MB/s),
>>>>>     io=84.8GiB (91.1GB), run=120001-120001msec
>>>>>
>>>>> With this patch:
>>>>> Run status group 0 (all jobs):
>>>>>     READ: bw=761MiB/s (798MB/s), 761MiB/s-761MiB/s (798MB/s-798MB/s),
>>>>>     io=89.2GiB (95.8GB), run=120001-120001msec
>>>>>
>>>>> About 5% improvement.
>>>>
>>>> There's something funky going on here. I ran the liburing test
>>>> suite on this, and get a lot of left behind workers:
>>>>
>>>> Tests _maybe_ failed:  ring-leak open-close open-close file-update file-update accept-reuse accept-reuse poll-v-poll poll-v-poll fadvise fadvise madvise madvise short-read short-read openat2 
>>>> openat2 probe probe shared-wq shared-wq personality personality eventfd eventfd send_recv send_recv eventfd-ring eventfd-ring across-fork across-fork sq-poll-kthread sq-poll-kthread splice splice 
>>>> lfs-openat lfs-openat lfs-openat-write lfs-openat-write iopoll iopoll d4ae271dfaae-test d4ae271dfaae-test eventfd-disable eventfd-disable write-file write-file buf-rw buf-rw statx statx
>>>>
>>>> and also saw this:
>>>>
>>>> [  168.208940] ==================================================================
>>>> [  168.209311] BUG: KASAN: use-after-free in __lock_acquire+0x8bf/0x3000
>>>> [  168.209626] Read of size 8 at addr ffff88806801c0d8 by task io_wqe_worker-0/41761
>>>> [  168.209987]
>>>> [  168.210069] CPU: 0 PID: 41761 Comm: io_wqe_worker-0 Not tainted 5.7.0-rc7+ #6318
>>>> [  168.210424] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
>>>> [  168.210857] Call Trace:
>>>> [  168.210991]  dump_stack+0x97/0xe0
>>>> [  168.211164]  print_address_description.constprop.0+0x1a/0x210
>>>> [  168.211446]  ? __lock_acquire+0x8bf/0x3000
>>>> [  168.211649]  __kasan_report.cold+0x20/0x39
>>>> [  168.211851]  ? __lock_acquire+0x8bf/0x3000
>>>> [  168.212051]  kasan_report+0x30/0x40
>>>> [  168.212226]  __lock_acquire+0x8bf/0x3000
>>>> [  168.212432]  ? ret_from_fork+0x24/0x30
>>>> [  168.212623]  ? stack_trace_save+0x81/0xa0
>>>> [  168.212821]  ? lockdep_hardirqs_on+0x270/0x270
>>>> [  168.213039]  ? save_stack+0x32/0x40
>>>> [  168.213212]  lock_acquire+0x122/0x570
>>>> [  168.213398]  ? __close_fd_get_file+0x40/0x150
>>>> [  168.213615]  ? lock_release+0x3f0/0x3f0
>>>> [  168.213814]  ? __lock_acquire+0x87e/0x3000
>>>> [  168.214016]  _raw_spin_lock+0x2c/0x40
>>>> [  168.214196]  ? __close_fd_get_file+0x40/0x150
>>>> [  168.214408]  __close_fd_get_file+0x40/0x150
>>>> [  168.214618]  io_issue_sqe+0x57f/0x22f0
>>>> [  168.214803]  ? lockdep_hardirqs_on+0x270/0x270
>>>> [  168.215019]  ? mark_held_locks+0x24/0x90
>>>> [  168.215211]  ? quarantine_put+0x6f/0x190
>>>> [  168.215404]  ? io_assign_current_work+0x59/0x80
>>>> [  168.215623]  ? __ia32_sys_io_uring_setup+0x30/0x30
>>>> [  168.215855]  ? find_held_lock+0xcb/0x100
>>>> [  168.216054]  ? io_worker_handle_work+0x289/0x980
>>>> [  168.216280]  ? lock_downgrade+0x340/0x340
>>>> [  168.216476]  ? io_wq_submit_work+0x5d/0x140
>>>> [  168.216679]  ? _raw_spin_unlock_irq+0x24/0x40
>>>> [  168.216890]  io_wq_submit_work+0x5d/0x140
>>>> [  168.217087]  io_worker_handle_work+0x30a/0x980
>>>> [  168.217305]  ? io_wqe_dec_running.isra.0+0x70/0x70
>>>> [  168.217537]  ? do_raw_spin_lock+0x100/0x180
>>>> [  168.217742]  ? rwlock_bug.part.0+0x60/0x60
>>>> [  168.217943]  io_wqe_worker+0x5fd/0x780
>>>> [  168.218126]  ? lock_downgrade+0x340/0x340
>>>> [  168.218323]  ? io_worker_handle_work+0x980/0x980
>>>> [  168.218546]  ? lockdep_hardirqs_on+0x17d/0x270
>>>> [  168.218765]  ? __kthread_parkme+0xca/0xe0
>>>> [  168.218961]  ? io_worker_handle_work+0x980/0x980
>>>> [  168.219186]  kthread+0x1f0/0x220
>>>> [  168.219346]  ? kthread_create_worker_on_cpu+0xb0/0xb0
>>>> [  168.219590]  ret_from_fork+0x24/0x30
>>>> [  168.219768]
>>>> [  168.219846] Allocated by task 41758:
>>>> [  168.220021]  save_stack+0x1b/0x40
>>>> [  168.220185]  __kasan_kmalloc.constprop.0+0xc2/0xd0
>>>> [  168.220416]  kmem_cache_alloc+0xe0/0x290
>>>> [  168.220607]  dup_fd+0x4e/0x5a0
>>>> [  168.220758]  copy_process+0xe35/0x2bf0
>>>> [  168.220942]  _do_fork+0xd8/0x550
>>>> [  168.221102]  __do_sys_clone+0xb5/0xe0
>>>> [  168.221282]  do_syscall_64+0x5e/0xe0
>>>> [  168.221457]  entry_SYSCALL_64_after_hwframe+0x49/0xb3
>>>> [  168.221729]
>>>> [  168.221848] Freed by task 41759:
>>>> [  168.222088]  save_stack+0x1b/0x40
>>>> [  168.222336]  __kasan_slab_free+0x12f/0x180
>>>> [  168.222632]  slab_free_freelist_hook+0x4d/0x120
>>>> [  168.222959]  kmem_cache_free+0x90/0x2e0
>>>> [  168.223239]  do_exit+0x5d2/0x12e0
>>>> [  168.223482]  do_group_exit+0x6f/0x130
>>>> [  168.223754]  __x64_sys_exit_group+0x28/0x30
>>>> [  168.224061]  do_syscall_64+0x5e/0xe0
>>>> [  168.224326]  entry_SYSCALL_64_after_hwframe+0x49/0xb3
>>>> [  168.224686]
>>>>
>>>> which indicates that current->files is no longer valid.
>>>
>>> Narrowed it down to the test/open-close test, and in particular where
>>> it closes the ring itself:
>>>
>>> ret = test_close(&ring, ring.ring_fd, 1);
>>>
>>> This seems to be because you do io_req_init_async() after calling
>>> io_issue_sqe(), and the command handler may have set something
>>> else for ->func at that point. Hence we never call the right
>>> handler if the close needs to be deferred, as it needs to for
>>> the io_uring as it has ->flush() defined.
>>>
>>> Why isn't io_req_init_async() just doing:
>>>
>>> static inline void io_req_init_async(struct io_kiocb *req,
>>>                           void (*func)(struct io_wq_work **))
>>> {
>>>          if (!(req->flags & REQ_F_WORK_INITIALIZED)) {
>>>                  req->work = (struct io_wq_work){ .func = func };
>>>                  req->flags |= REQ_F_WORK_INITIALIZED;
>>>          }
>>> }
>>>
>>> ?
>>
>> I guess that won't always work, if the request has been deferred and
>> we're now setting a new work func. So we really want the 'reset to
>> io_wq_submit_work' to only happen if the opcode hasn't already set
>> a private handler. Can you make that change?
>>
>> Also please fix up missing braces. The cases of:
>>
>> if (something) {
>>     line 1
>>     line 2
>> } else
>>     line 3
>>
>> should always include braces, if one clause has it.
>>
>> A v5 with those two things would be ready to commit.
> First, thanks for figuring out this bug.
> Indeed, I had run the test cases in liburing/test for multiple rounds before sending
> the V4 patches, and all test cases passed with no kernel issues. Here is my kernel
> build config; it seems I had KASAN enabled.
> [lege@localhost linux]$ cat .config | grep KASAN
> CONFIG_KASAN_SHADOW_OFFSET=0xdffffc0000000000
> CONFIG_HAVE_ARCH_KASAN=y
> CONFIG_HAVE_ARCH_KASAN_VMALLOC=y
> CONFIG_CC_HAS_KASAN_GENERIC=y
> CONFIG_KASAN=y
> CONFIG_KASAN_GENERIC=y
> CONFIG_KASAN_OUTLINE=y
> # CONFIG_KASAN_INLINE is not set
> CONFIG_KASAN_STACK=1
> # CONFIG_KASAN_VMALLOC is not set
> # CONFIG_TEST_KASAN is not set
> But I don't know why I didn't reproduce the bug, sorry.
> 
> Are you fine with the changes below?
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 12296ce3e8b9..2a3a02838f7b 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -907,9 +907,10 @@ static void io_file_put_work(struct work_struct *work);
>   static inline void io_req_init_async(struct io_kiocb *req,
>                          void (*func)(struct io_wq_work **))
>   {
> -       if (req->flags & REQ_F_WORK_INITIALIZED)
> -               req->work.func = func;
> -       else {
> +       if (req->flags & REQ_F_WORK_INITIALIZED) {
> +               if (!req->work.func)
> +                       req->work.func = func;
> +       } else {
>                  req->work = (struct io_wq_work){ .func = func };
>                  req->flags |= REQ_F_WORK_INITIALIZED;
>          }
> @@ -2920,6 +2921,8 @@ static int __io_splice_prep(struct io_kiocb *req,
>                  return ret;
>          req->flags |= REQ_F_NEED_CLEANUP;
> 
> +       /* Splice will be punted aync, so initialize io_wq_work firstly_*/
> +       io_req_init_async(req, io_wq_submit_work);
>          if (!S_ISREG(file_inode(sp->file_in)->i_mode))
>                  req->work.flags |= IO_WQ_WORK_UNBOUND;
> 
> @@ -3592,6 +3595,9 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock)
> 
>   static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>   {
> +        /* Close may be punted aync, so initialize io_wq_work firstly */
> +       io_req_init_async(req, io_wq_submit_work);
> +
For splice and close requests, these two io_req_init_async() calls should be
io_req_init_async(req, NULL), because they change req->work.flags first.
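
To make the intended ordering concrete, a rough sketch of what __io_splice_prep()
would then do (based on the diff above and the v4 helper; not a tested patch):

	/* zero ->work before touching work.flags, but leave ->func unset so a
	 * later io_req_init_async() call can still install the real handler */
	io_req_init_async(req, NULL);
	if (!S_ISREG(file_inode(sp->file_in)->i_mode))
		req->work.flags |= IO_WQ_WORK_UNBOUND;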

Regards,
Xiaoguang Wang

>          /*
>           * If we queue this for async, it must not be cancellable. That would
>           * leave the 'file' in an undeterminate state.
> 
> Regards,
> Xiaoguang Wang
> 
> 
>>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-31 14:12       ` Xiaoguang Wang
  2020-05-31 14:31         ` Xiaoguang Wang
@ 2020-05-31 14:33         ` Pavel Begunkov
  1 sibling, 0 replies; 27+ messages in thread
From: Pavel Begunkov @ 2020-05-31 14:33 UTC (permalink / raw)
  To: Xiaoguang Wang, Jens Axboe, io-uring; +Cc: joseph.qi

On 31/05/2020 17:12, Xiaoguang Wang wrote:
>> A v5 with those two things would be ready to commit.
> First, thanks for figuring out this bug.
> Indeed, I had run the test cases in liburing/test for multiple rounds before sending
> the V4 patches, and all test cases passed with no kernel issues. Here is my kernel
> build config; it seems I had KASAN enabled.
> [lege@localhost linux]$ cat .config | grep KASAN
> CONFIG_KASAN_SHADOW_OFFSET=0xdffffc0000000000
> CONFIG_HAVE_ARCH_KASAN=y
> CONFIG_HAVE_ARCH_KASAN_VMALLOC=y
> CONFIG_CC_HAS_KASAN_GENERIC=y
> CONFIG_KASAN=y
> CONFIG_KASAN_GENERIC=y
> CONFIG_KASAN_OUTLINE=y
> # CONFIG_KASAN_INLINE is not set
> CONFIG_KASAN_STACK=1
> # CONFIG_KASAN_VMALLOC is not set
> # CONFIG_TEST_KASAN is not set
> But I don't know why I didn't reproduce the bug, sorry.

Hmm, I can't find an option to poison allocated memory. Try filling req->work
with garbage by hand (just after allocation); that should help to reproduce it.
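
Something along these lines should do it (debug-only sketch, placed right after
the request is set up, e.g. at the top of io_init_req(); 0x6b is just an
arbitrary poison byte):

	/* poison the uninitialized work struct so any stale read fails loudly */
	memset(&req->work, 0x6b, sizeof(req->work));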

> 
> Are you fine with the changes below?
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 12296ce3e8b9..2a3a02838f7b 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -907,9 +907,10 @@ static void io_file_put_work(struct work_struct *work);
>  static inline void io_req_init_async(struct io_kiocb *req,
>                         void (*func)(struct io_wq_work **))
>  {
> -       if (req->flags & REQ_F_WORK_INITIALIZED)
> -               req->work.func = func;
> -       else {
> +       if (req->flags & REQ_F_WORK_INITIALIZED) {
> +               if (!req->work.func)

->func should be initialised after alloc then, if not already.

> +                       req->work.func = func;
> +       } else {
>                 req->work = (struct io_wq_work){ .func = func };
>                 req->flags |= REQ_F_WORK_INITIALIZED;
>         }
> @@ -2920,6 +2921,8 @@ static int __io_splice_prep(struct io_kiocb *req,
>                 return ret;
>         req->flags |= REQ_F_NEED_CLEANUP;
> 
> +       /* Splice will be punted aync, so initialize io_wq_work firstly_*/
> +       io_req_init_async(req, io_wq_submit_work);
>         if (!S_ISREG(file_inode(sp->file_in)->i_mode))
>                 req->work.flags |= IO_WQ_WORK_UNBOUND;
> 
> @@ -3592,6 +3595,9 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock)
> 
>  static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  {
> +        /* Close may be punted aync, so initialize io_wq_work firstly */

s/aync/async/

> +       io_req_init_async(req, io_wq_submit_work);
> +
>         /*
>          * If we queue this for async, it must not be cancellable. That would
>          * leave the 'file' in an undeterminate state.
> 
> Regards,
> Xiaoguang Wang
> 
> 
>>

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-31 13:57         ` Xiaoguang Wang
@ 2020-05-31 14:49           ` Pavel Begunkov
  0 siblings, 0 replies; 27+ messages in thread
From: Pavel Begunkov @ 2020-05-31 14:49 UTC (permalink / raw)
  To: Xiaoguang Wang, Jens Axboe, io-uring; +Cc: joseph.qi

On 31/05/2020 16:57, Xiaoguang Wang wrote:
>> There is another thing:
>>
>> io_submit_sqes()
>>      -> io_close() (let ->flush == NULL)
>>          -> __io_close_finish()
>>              -> filp_close(req->close.put_file, *req->work.files*);
>>
>> where req->work.files is garbage.
> I think this bug is independent of my patch. Without my patches, if close request

It looks like it's ok to pass NULL, at least Jens did it here and I see an
occurrence of ->flush(NULL). And it's usually not referenced or completely
ignored. I'll check later.

> will be submitted and completed inline, req->work.files will be NULL, it's still
> problematic, should we use current->files here?
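
For reference, a trimmed sketch of what filp_close() does with that argument
(paraphrased from fs/open.c of that era; the owner id is only forwarded to
->flush() and the POSIX-lock cleanup, so for many files a NULL owner simply
goes unused):

	int filp_close(struct file *filp, fl_owner_t id)
	{
		int retval = 0;

		if (filp->f_op->flush)
			retval = filp->f_op->flush(filp, id);

		if (likely(!(filp->f_mode & FMODE_PATH))) {
			dnotify_flush(filp, id);
			locks_remove_posix(filp, id);
		}
		fput(filp);
		return retval;
	}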

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-31 14:31         ` Xiaoguang Wang
@ 2020-05-31 14:57           ` Pavel Begunkov
  2020-05-31 16:15             ` Xiaoguang Wang
  2020-06-01  4:56             ` [PATCH v5 " Xiaoguang Wang
  0 siblings, 2 replies; 27+ messages in thread
From: Pavel Begunkov @ 2020-05-31 14:57 UTC (permalink / raw)
  To: Xiaoguang Wang, Jens Axboe, io-uring; +Cc: joseph.qi

>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index 12296ce3e8b9..2a3a02838f7b 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -907,9 +907,10 @@ static void io_file_put_work(struct work_struct *work);
>>   static inline void io_req_init_async(struct io_kiocb *req,
>>                          void (*func)(struct io_wq_work **))
>>   {
>> -       if (req->flags & REQ_F_WORK_INITIALIZED)
>> -               req->work.func = func;
>> -       else {
>> +       if (req->flags & REQ_F_WORK_INITIALIZED) {
>> +               if (!req->work.func)
>> +                       req->work.func = func;
>> +       } else {
>>                  req->work = (struct io_wq_work){ .func = func };
>>                  req->flags |= REQ_F_WORK_INITIALIZED;
>>          }
>> @@ -2920,6 +2921,8 @@ static int __io_splice_prep(struct io_kiocb *req,
>>                  return ret;
>>          req->flags |= REQ_F_NEED_CLEANUP;
>>
>> +       /* Splice will be punted aync, so initialize io_wq_work firstly_*/
>> +       io_req_init_async(req, io_wq_submit_work);
>>          if (!S_ISREG(file_inode(sp->file_in)->i_mode))
>>                  req->work.flags |= IO_WQ_WORK_UNBOUND;
>>
>> @@ -3592,6 +3595,9 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock)
>>
>>   static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>   {
>> +        /* Close may be punted aync, so initialize io_wq_work firstly */
>> +       io_req_init_async(req, io_wq_submit_work);
>> +
> For splice and close requests, these two io_req_init_async() calls should be
> io_req_init_async(req, NULL), because they change req->work.flags first.

Please no. Such assumptions/dependencies are prone to break.
It'll get us subtle bugs in no time. 

BTW, why not io_wq_submit_work in place of NULL?

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-31 14:57           ` Pavel Begunkov
@ 2020-05-31 16:15             ` Xiaoguang Wang
  2020-06-01 10:43               ` Pavel Begunkov
  2020-06-01  4:56             ` [PATCH v5 " Xiaoguang Wang
  1 sibling, 1 reply; 27+ messages in thread
From: Xiaoguang Wang @ 2020-05-31 16:15 UTC (permalink / raw)
  To: Pavel Begunkov, Jens Axboe, io-uring; +Cc: joseph.qi

hi,

>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>> index 12296ce3e8b9..2a3a02838f7b 100644
>>> --- a/fs/io_uring.c
>>> +++ b/fs/io_uring.c
>>> @@ -907,9 +907,10 @@ static void io_file_put_work(struct work_struct *work);
>>>    static inline void io_req_init_async(struct io_kiocb *req,
>>>                           void (*func)(struct io_wq_work **))
>>>    {
>>> -       if (req->flags & REQ_F_WORK_INITIALIZED)
>>> -               req->work.func = func;
>>> -       else {
>>> +       if (req->flags & REQ_F_WORK_INITIALIZED) {
>>> +               if (!req->work.func)
>>> +                       req->work.func = func;
>>> +       } else {
>>>                   req->work = (struct io_wq_work){ .func = func };
>>>                   req->flags |= REQ_F_WORK_INITIALIZED;
>>>           }
>>> @@ -2920,6 +2921,8 @@ static int __io_splice_prep(struct io_kiocb *req,
>>>                   return ret;
>>>           req->flags |= REQ_F_NEED_CLEANUP;
>>>
>>> +       /* Splice will be punted aync, so initialize io_wq_work firstly_*/
>>> +       io_req_init_async(req, io_wq_submit_work);
>>>           if (!S_ISREG(file_inode(sp->file_in)->i_mode))
>>>                   req->work.flags |= IO_WQ_WORK_UNBOUND;
>>>
>>> @@ -3592,6 +3595,9 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock)
>>>
>>>    static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>    {
>>> +        /* Close may be punted aync, so initialize io_wq_work firstly */
>>> +       io_req_init_async(req, io_wq_submit_work);
>>> +
>> For splice and close requests, these two io_req_init_async() calls should be
>> io_req_init_async(req, NULL), because they change req->work.flags first.
> 
> Please no. Such assumptions/dependencies are prone to break.
> It'll get us subtle bugs in no time.
> 
> BTW, why not io_wq_submit_work in place of NULL?
At the beginning of __io_splice_prep() or io_close_prep(), the current io_uring mainline code
modifies req->work.flags first, so we need to call io_req_init_async() to initialize
io_wq_work before that work.flags modification.
For the code below:
static inline void io_req_init_async(struct io_kiocb *req,
                         void (*func)(struct io_wq_work **))
{
         if (req->flags & REQ_F_WORK_INITIALIZED) {
                 if (!req->work.func)
                         req->work.func = func;
         } else {
                 req->work = (struct io_wq_work){ .func = func };
                 req->flags |= REQ_F_WORK_INITIALIZED;
         }
}

if we don't pass NULL for the 'func' parameter, e.g. we pass io_wq_submit_work, then
we cannot use io_req_init_async() to pass io_close_finish later.
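
In other words, the sequence being relied on looks like this (a sketch using the
helper quoted above; io_close_finish is the existing punted-close handler in
mainline):

	/* io_close_prep(): make sure ->work is initialized early, but don't
	 * pick a handler yet */
	io_req_init_async(req, NULL);

	/* io_close(), when it decides to punt: installing the private handler
	 * works only because ->func is still NULL at this point */
	io_req_init_async(req, io_close_finish);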

Now I'm confused how to write better codes based on current io_uring mainline codes :)
If you have some free time, please have a deeper look, thanks.

Regards,
Xiaoguang Wang


> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v5 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-31 14:57           ` Pavel Begunkov
  2020-05-31 16:15             ` Xiaoguang Wang
@ 2020-06-01  4:56             ` Xiaoguang Wang
  2020-06-01  4:56               ` [PATCH v5 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature Xiaoguang Wang
  1 sibling, 1 reply; 27+ messages in thread
From: Xiaoguang Wang @ 2020-06-01  4:56 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, asml.silence, joseph.qi, Xiaoguang Wang

If requests can be submitted and completed inline, we don't need to
initialize whole io_wq_work in io_init_req(), which is an expensive
operation, add a new 'REQ_F_WORK_INITIALIZED' to control whether
io_wq_work is initialized.

I use /dev/nullb0 to evaluate performance improvement in my physical
machine:
  modprobe null_blk nr_devices=1 completion_nsec=0
  sudo taskset -c 60 fio  -name=fiotest -filename=/dev/nullb0 -iodepth=128
  -thread -rw=read -ioengine=io_uring -direct=1 -bs=4k -size=100G -numjobs=1
  -time_based -runtime=120

before this patch:
Run status group 0 (all jobs):
   READ: bw=724MiB/s (759MB/s), 724MiB/s-724MiB/s (759MB/s-759MB/s),
   io=84.8GiB (91.1GB), run=120001-120001msec

With this patch:
Run status group 0 (all jobs):
   READ: bw=761MiB/s (798MB/s), 761MiB/s-761MiB/s (798MB/s-798MB/s),
   io=89.2GiB (95.8GB), run=120001-120001msec

About 5% improvement.

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>

---
V4:
  add io_req_init_async() helper

V5:
  refactor io_req_init_async() to io_init_req_work() and io_init_req_work_func
  in case we need to change io_wq_work.func separately.
---
 fs/io-wq.h    |  5 ----
 fs/io_uring.c | 83 +++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 67 insertions(+), 21 deletions(-)

diff --git a/fs/io-wq.h b/fs/io-wq.h
index 5ba12de7572f..3d85d365d764 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -94,11 +94,6 @@ struct io_wq_work {
 	pid_t task_pid;
 };
 
-#define INIT_IO_WORK(work, _func)				\
-	do {							\
-		*(work) = (struct io_wq_work){ .func = _func };	\
-	} while (0)						\
-
 static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
 {
 	if (!work->list.next)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 95df63b0b2ce..8e022d0f0c86 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -538,6 +538,7 @@ enum {
 	REQ_F_POLLED_BIT,
 	REQ_F_BUFFER_SELECTED_BIT,
 	REQ_F_NO_FILE_TABLE_BIT,
+	REQ_F_WORK_INITIALIZED_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -593,6 +594,8 @@ enum {
 	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
 	/* doesn't need file table for this request */
 	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
+	/* io_wq_work is initialized */
+	REQ_F_WORK_INITIALIZED	= BIT(REQ_F_WORK_INITIALIZED_BIT),
 };
 
 struct async_poll {
@@ -638,6 +641,7 @@ struct io_kiocb {
 	unsigned int		flags;
 	refcount_t		refs;
 	struct task_struct	*task;
+	const struct cred	*creds;
 	unsigned long		fsize;
 	u64			user_data;
 	u32			result;
@@ -900,6 +904,21 @@ EXPORT_SYMBOL(io_uring_get_socket);
 
 static void io_file_put_work(struct work_struct *work);
 
+static inline void io_init_req_work(struct io_kiocb *req)
+{
+	if (req->flags & REQ_F_WORK_INITIALIZED)
+		return;
+
+	memset(&req->work, 0, sizeof(req->work));
+	req->flags |= REQ_F_WORK_INITIALIZED;
+}
+
+static inline void io_init_req_work_func(struct io_kiocb *req,
+			void (*func)(struct io_wq_work **))
+{
+	req->work.func = func;
+}
+
 static inline bool io_async_submit(struct io_ring_ctx *ctx)
 {
 	return ctx->flags & IORING_SETUP_SQPOLL;
@@ -1025,8 +1044,15 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
 		mmgrab(current->mm);
 		req->work.mm = current->mm;
 	}
-	if (!req->work.creds)
-		req->work.creds = get_current_cred();
+
+	if (!req->work.creds) {
+		if (!req->creds)
+			req->work.creds = get_current_cred();
+		else {
+			req->work.creds = req->creds;
+			req->creds = NULL;
+		}
+	}
 	if (!req->work.fs && def->needs_fs) {
 		spin_lock(&current->fs->lock);
 		if (!current->fs->in_exec) {
@@ -1043,6 +1069,9 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
 
 static inline void io_req_work_drop_env(struct io_kiocb *req)
 {
+	if (!(req->flags & REQ_F_WORK_INITIALIZED))
+		return;
+
 	if (req->work.mm) {
 		mmdrop(req->work.mm);
 		req->work.mm = NULL;
@@ -2895,6 +2924,8 @@ static int __io_splice_prep(struct io_kiocb *req,
 		return ret;
 	req->flags |= REQ_F_NEED_CLEANUP;
 
+	/* Splice will be punted aync, so initialize io_wq_work firstly. */
+	io_init_req_work(req);
 	if (!S_ISREG(file_inode(sp->file_in)->i_mode))
 		req->work.flags |= IO_WQ_WORK_UNBOUND;
 
@@ -3045,7 +3076,8 @@ static int io_fsync(struct io_kiocb *req, bool force_nonblock)
 {
 	/* fsync always requires a blocking context */
 	if (force_nonblock) {
-		req->work.func = io_fsync_finish;
+		io_init_req_work(req);
+		io_init_req_work_func(req, io_fsync_finish);
 		return -EAGAIN;
 	}
 	__io_fsync(req);
@@ -3093,7 +3125,8 @@ static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
 {
 	/* fallocate always requiring blocking context */
 	if (force_nonblock) {
-		req->work.func = io_fallocate_finish;
+		io_init_req_work(req);
+		io_init_req_work_func(req, io_fallocate_finish);
 		return -EAGAIN;
 	}
 
@@ -3567,6 +3600,9 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock)
 
 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
+	 /* Close may be punted aync, so initialize io_wq_work firstly */
+	io_init_req_work(req);
+
 	/*
 	 * If we queue this for async, it must not be cancellable. That would
 	 * leave the 'file' in an undeterminate state.
@@ -3618,7 +3654,8 @@ static int io_close(struct io_kiocb *req, bool force_nonblock)
 	if (req->close.put_file->f_op->flush && force_nonblock) {
 		/* avoid grabbing files - we don't need the files */
 		req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT;
-		req->work.func = io_close_finish;
+		io_init_req_work(req);
+		io_init_req_work_func(req, io_close_finish);
 		return -EAGAIN;
 	}
 
@@ -3675,7 +3712,8 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
 {
 	/* sync_file_range always requires a blocking context */
 	if (force_nonblock) {
-		req->work.func = io_sync_file_range_finish;
+		io_init_req_work(req);
+		io_init_req_work_func(req, io_sync_file_range_finish);
 		return -EAGAIN;
 	}
 
@@ -4144,7 +4182,8 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock)
 
 	ret = __io_accept(req, force_nonblock);
 	if (ret == -EAGAIN && force_nonblock) {
-		req->work.func = io_accept_finish;
+		io_init_req_work(req);
+		io_init_req_work_func(req, io_accept_finish);
 		return -EAGAIN;
 	}
 	return 0;
@@ -5144,6 +5183,9 @@ static int io_req_defer_prep(struct io_kiocb *req,
 	if (!sqe)
 		return 0;
 
+	io_init_req_work(req);
+	io_init_req_work_func(req, io_wq_submit_work);
+
 	if (io_op_defs[req->opcode].file_table) {
 		ret = io_grab_files(req);
 		if (unlikely(ret))
@@ -5779,19 +5821,24 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_kiocb *linked_timeout;
 	struct io_kiocb *nxt;
-	const struct cred *old_creds = NULL;
+	const struct cred *creds, *old_creds = NULL;
 	int ret;
 
 again:
 	linked_timeout = io_prep_linked_timeout(req);
 
-	if (req->work.creds && req->work.creds != current_cred()) {
+	if (req->flags & REQ_F_WORK_INITIALIZED)
+		creds = req->work.creds;
+	else
+		creds = req->creds;
+
+	if (creds && creds != current_cred()) {
 		if (old_creds)
 			revert_creds(old_creds);
-		if (old_creds == req->work.creds)
+		if (old_creds == creds)
 			old_creds = NULL; /* restored original creds */
 		else
-			old_creds = override_creds(req->work.creds);
+			old_creds = override_creds(creds);
 	}
 
 	ret = io_issue_sqe(req, sqe, true);
@@ -5808,6 +5855,10 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 			goto exit;
 		}
 punt:
+		io_init_req_work(req);
+		if (!req->work.func)
+			io_init_req_work_func(req, io_wq_submit_work);
+
 		if (io_op_defs[req->opcode].file_table) {
 			ret = io_grab_files(req);
 			if (ret)
@@ -6060,7 +6111,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	refcount_set(&req->refs, 2);
 	req->task = NULL;
 	req->result = 0;
-	INIT_IO_WORK(&req->work, io_wq_submit_work);
 
 	if (unlikely(req->opcode >= IORING_OP_LAST))
 		return -EINVAL;
@@ -6082,11 +6132,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 	id = READ_ONCE(sqe->personality);
 	if (id) {
-		req->work.creds = idr_find(&ctx->personality_idr, id);
-		if (unlikely(!req->work.creds))
+		req->creds = idr_find(&ctx->personality_idr, id);
+		if (unlikely(!req->creds))
 			return -EINVAL;
-		get_cred(req->work.creds);
-	}
+		get_cred(req->creds);
+	} else
+		req->creds = NULL;
 
 	/* same numerical values with corresponding REQ_F_*, safe to copy */
 	req->flags |= sqe_flags;
-- 
2.17.2


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v5 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature
  2020-06-01  4:56             ` [PATCH v5 " Xiaoguang Wang
@ 2020-06-01  4:56               ` Xiaoguang Wang
  2020-06-02  1:16                 ` Xiaoguang Wang
  0 siblings, 1 reply; 27+ messages in thread
From: Xiaoguang Wang @ 2020-06-01  4:56 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, asml.silence, joseph.qi, Xiaoguang Wang

Basically the IORING_OP_POLL_ADD command and async armed poll handlers for
regular commands don't touch io_wq_work, so only when REQ_F_WORK_INITIALIZED
is set do we do the io_wq_work copy and restore.

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
---
V3:
  drop the REQ_F_WORK_NEED_RESTORE flag introduced in V2 patch, just
  use REQ_F_WORK_INITIALIZED to control whether to do io_wq_work copy
  and restore.
---
 fs/io_uring.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8e022d0f0c86..b761ef7366f9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4527,7 +4527,8 @@ static void io_async_task_func(struct callback_head *cb)
 	spin_unlock_irq(&ctx->completion_lock);
 
 	/* restore ->work in case we need to retry again */
-	memcpy(&req->work, &apoll->work, sizeof(req->work));
+	if (req->flags & REQ_F_WORK_INITIALIZED)
+		memcpy(&req->work, &apoll->work, sizeof(req->work));
 	kfree(apoll);
 
 	if (!canceled) {
@@ -4624,7 +4625,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
 		return false;
 
 	req->flags |= REQ_F_POLLED;
-	memcpy(&apoll->work, &req->work, sizeof(req->work));
+	if (req->flags & REQ_F_WORK_INITIALIZED)
+		memcpy(&apoll->work, &req->work, sizeof(req->work));
 	had_io = req->io != NULL;
 
 	get_task_struct(current);
@@ -4649,7 +4651,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
 		if (!had_io)
 			io_poll_remove_double(req);
 		spin_unlock_irq(&ctx->completion_lock);
-		memcpy(&req->work, &apoll->work, sizeof(req->work));
+		if (req->flags & REQ_F_WORK_INITIALIZED)
+			memcpy(&req->work, &apoll->work, sizeof(req->work));
 		kfree(apoll);
 		return false;
 	}
@@ -4694,7 +4697,9 @@ static bool io_poll_remove_one(struct io_kiocb *req)
 			 * io_req_work_drop_env below when dropping the
 			 * final reference.
 			 */
-			memcpy(&req->work, &apoll->work, sizeof(req->work));
+			if (req->flags & REQ_F_WORK_INITIALIZED)
+				memcpy(&req->work, &apoll->work,
+				       sizeof(req->work));
 			kfree(apoll);
 		}
 	}
-- 
2.17.2


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-05-31 16:15             ` Xiaoguang Wang
@ 2020-06-01 10:43               ` Pavel Begunkov
  2020-06-01 11:50                 ` Xiaoguang Wang
  0 siblings, 1 reply; 27+ messages in thread
From: Pavel Begunkov @ 2020-06-01 10:43 UTC (permalink / raw)
  To: Xiaoguang Wang, Jens Axboe, io-uring; +Cc: joseph.qi

On 31/05/2020 19:15, Xiaoguang Wang wrote:
> At the beginning of __io_splice_prep() or io_close_prep(), the current io_uring mainline code
> modifies req->work.flags first, so we need to call io_req_init_async() to initialize
> io_wq_work before that work.flags modification.
> For the code below:
> static inline void io_req_init_async(struct io_kiocb *req,
>                         void (*func)(struct io_wq_work **))
> {
>         if (req->flags & REQ_F_WORK_INITIALIZED) {
>                 if (!req->work.func)
>                         req->work.func = func;
>         } else {
>                 req->work = (struct io_wq_work){ .func = func };
>                 req->flags |= REQ_F_WORK_INITIALIZED;
>         }
> }
> 
> if we don't pass NULL for the 'func' parameter, e.g. we pass io_wq_submit_work, then
> we cannot use io_req_init_async() to pass io_close_finish later.

It's not as bad; it's just that the thing you poked at is overused and doesn't have strict rules.
I have a feeling that, for it to be done right, it'd need more fundamental refactoring,
putting everything related to ->work closer to io_queue_async_work().
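
Something in that direction could look like the following (a very rough sketch,
not a patch; the helper names are placeholders, and the point is only that the
lazy ->work setup would live in one place on the punt path rather than in every
prep/issue hook):

	static void io_queue_async_work(struct io_kiocb *req)
	{
		/* hypothetical func-less lazy init: zero ->work, mark it live */
		io_req_init_async(req);
		io_req_work_grab_env(req, &io_op_defs[req->opcode]);
		/* ... existing hashing/enqueue logic unchanged ... */
	}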

> 
> Now I'm confused how to write better codes based on current io_uring mainline codes :)
> If you have some free time, please have a deeper look, thanks.

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline
  2020-06-01 10:43               ` Pavel Begunkov
@ 2020-06-01 11:50                 ` Xiaoguang Wang
  0 siblings, 0 replies; 27+ messages in thread
From: Xiaoguang Wang @ 2020-06-01 11:50 UTC (permalink / raw)
  To: Pavel Begunkov, Jens Axboe, io-uring; +Cc: joseph.qi

hi,

> On 31/05/2020 19:15, Xiaoguang Wang wrote:
>> At the beginning of __io_splice_prep() or io_close_prep(), the current io_uring mainline code
>> modifies req->work.flags first, so we need to call io_req_init_async() to initialize
>> io_wq_work before that work.flags modification.
>> For the code below:
>> static inline void io_req_init_async(struct io_kiocb *req,
>>                          void (*func)(struct io_wq_work **))
>> {
>>          if (req->flags & REQ_F_WORK_INITIALIZED) {
>>                  if (!req->work.func)
>>                          req->work.func = func;
>>          } else {
>>                  req->work = (struct io_wq_work){ .func = func };
>>                  req->flags |= REQ_F_WORK_INITIALIZED;
>>          }
>> }
>>
>> if we don't pass NULL for the 'func' parameter, e.g. we pass io_wq_submit_work, then
>> we cannot use io_req_init_async() to pass io_close_finish later.
> 
> It's not as bad; it's just that the thing you poked at is overused and doesn't have strict rules.
> I have a feeling that, for it to be done right, it'd need more fundamental refactoring,
> putting everything related to ->work closer to io_queue_async_work().
Yes, I totally agree with you, that would be safer.

Regards,
Xiaoguang Wang

> 
>>
>> Now I'm confused how to write better codes based on current io_uring mainline codes :)
>> If you have some free time, please have a deeper look, thanks.
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature
  2020-06-01  4:56               ` [PATCH v5 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature Xiaoguang Wang
@ 2020-06-02  1:16                 ` Xiaoguang Wang
  2020-06-03 13:46                   ` Pavel Begunkov
  0 siblings, 1 reply; 27+ messages in thread
From: Xiaoguang Wang @ 2020-06-02  1:16 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, asml.silence, joseph.qi

hi Jens, Pavel,

Will you have a look at this V5 version? Or we hold on this patchset, and
do the refactoring work related io_wq_work firstly.

Regards,
Xiaoguang Wang


> Basically the IORING_OP_POLL_ADD command and async armed poll handlers for
> regular commands don't touch io_wq_work, so only when REQ_F_WORK_INITIALIZED
> is set do we do the io_wq_work copy and restore.
> 
> Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
> ---
> V3:
>    drop the REQ_F_WORK_NEED_RESTORE flag introduced in V2 patch, just
>    use REQ_F_WORK_INITIALIZED to control whether to do io_wq_work copy
>    and restore.
> ---
>   fs/io_uring.c | 13 +++++++++----
>   1 file changed, 9 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 8e022d0f0c86..b761ef7366f9 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -4527,7 +4527,8 @@ static void io_async_task_func(struct callback_head *cb)
>   	spin_unlock_irq(&ctx->completion_lock);
>   
>   	/* restore ->work in case we need to retry again */
> -	memcpy(&req->work, &apoll->work, sizeof(req->work));
> +	if (req->flags & REQ_F_WORK_INITIALIZED)
> +		memcpy(&req->work, &apoll->work, sizeof(req->work));
>   	kfree(apoll);
>   
>   	if (!canceled) {
> @@ -4624,7 +4625,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
>   		return false;
>   
>   	req->flags |= REQ_F_POLLED;
> -	memcpy(&apoll->work, &req->work, sizeof(req->work));
> +	if (req->flags & REQ_F_WORK_INITIALIZED)
> +		memcpy(&apoll->work, &req->work, sizeof(req->work));
>   	had_io = req->io != NULL;
>   
>   	get_task_struct(current);
> @@ -4649,7 +4651,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
>   		if (!had_io)
>   			io_poll_remove_double(req);
>   		spin_unlock_irq(&ctx->completion_lock);
> -		memcpy(&req->work, &apoll->work, sizeof(req->work));
> +		if (req->flags & REQ_F_WORK_INITIALIZED)
> +			memcpy(&req->work, &apoll->work, sizeof(req->work));
>   		kfree(apoll);
>   		return false;
>   	}
> @@ -4694,7 +4697,9 @@ static bool io_poll_remove_one(struct io_kiocb *req)
>   			 * io_req_work_drop_env below when dropping the
>   			 * final reference.
>   			 */
> -			memcpy(&req->work, &apoll->work, sizeof(req->work));
> +			if (req->flags & REQ_F_WORK_INITIALIZED)
> +				memcpy(&req->work, &apoll->work,
> +				       sizeof(req->work));
>   			kfree(apoll);
>   		}
>   	}
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature
  2020-06-02  1:16                 ` Xiaoguang Wang
@ 2020-06-03 13:46                   ` Pavel Begunkov
  2020-06-07 15:02                     ` Jens Axboe
  0 siblings, 1 reply; 27+ messages in thread
From: Pavel Begunkov @ 2020-06-03 13:46 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: axboe, joseph.qi

On 02/06/2020 04:16, Xiaoguang Wang wrote:
> hi Jens, Pavel,
> 
> Will you have a look at this V5 version? Or we hold on this patchset, and
> do the refactoring work related io_wq_work firstly.

It's entirely up to Jens, but frankly, I think it'll bring more bugs than
merits in the current state of things.

> 
> 
>> Basically the IORING_OP_POLL_ADD command and async armed poll handlers for
>> regular commands don't touch io_wq_work, so only when REQ_F_WORK_INITIALIZED
>> is set do we do the io_wq_work copy and restore.
>>
>> Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
>> ---
>> V3:
>>    drop the REQ_F_WORK_NEED_RESTORE flag introduced in V2 patch, just
>>    use REQ_F_WORK_INITIALIZED to control whether to do io_wq_work copy
>>    and restore.
>> ---
>>   fs/io_uring.c | 13 +++++++++----
>>   1 file changed, 9 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index 8e022d0f0c86..b761ef7366f9 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -4527,7 +4527,8 @@ static void io_async_task_func(struct callback_head *cb)
>>       spin_unlock_irq(&ctx->completion_lock);
>>         /* restore ->work in case we need to retry again */
>> -    memcpy(&req->work, &apoll->work, sizeof(req->work));
>> +    if (req->flags & REQ_F_WORK_INITIALIZED)
>> +        memcpy(&req->work, &apoll->work, sizeof(req->work));
>>       kfree(apoll);
>>         if (!canceled) {
>> @@ -4624,7 +4625,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
>>           return false;
>>         req->flags |= REQ_F_POLLED;
>> -    memcpy(&apoll->work, &req->work, sizeof(req->work));
>> +    if (req->flags & REQ_F_WORK_INITIALIZED)
>> +        memcpy(&apoll->work, &req->work, sizeof(req->work));
>>       had_io = req->io != NULL;
>>         get_task_struct(current);
>> @@ -4649,7 +4651,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
>>           if (!had_io)
>>               io_poll_remove_double(req);
>>           spin_unlock_irq(&ctx->completion_lock);
>> -        memcpy(&req->work, &apoll->work, sizeof(req->work));
>> +        if (req->flags & REQ_F_WORK_INITIALIZED)
>> +            memcpy(&req->work, &apoll->work, sizeof(req->work));
>>           kfree(apoll);
>>           return false;
>>       }
>> @@ -4694,7 +4697,9 @@ static bool io_poll_remove_one(struct io_kiocb *req)
>>                * io_req_work_drop_env below when dropping the
>>                * final reference.
>>                */
>> -            memcpy(&req->work, &apoll->work, sizeof(req->work));
>> +            if (req->flags & REQ_F_WORK_INITIALIZED)
>> +                memcpy(&req->work, &apoll->work,
>> +                       sizeof(req->work));
>>               kfree(apoll);
>>           }
>>       }
>>

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature
  2020-06-03 13:46                   ` Pavel Begunkov
@ 2020-06-07 15:02                     ` Jens Axboe
  2020-06-07 20:36                       ` Pavel Begunkov
  0 siblings, 1 reply; 27+ messages in thread
From: Jens Axboe @ 2020-06-07 15:02 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 6/3/20 7:46 AM, Pavel Begunkov wrote:
> On 02/06/2020 04:16, Xiaoguang Wang wrote:
>> hi Jens, Pavel,
>>
>> Will you have a look at this V5 version? Or we hold on this patchset, and
>> do the refactoring work related io_wq_work firstly.
> 
> It's entirely up to Jens, but frankly, I think it'll bring more bugs than
> merits in the current state of things.

Well, I'd really like to reduce the overhead where we can, particularly
when the overhead just exists to cater to the slow path.

Planning on taking the next week off and not do too much, but I'll see
if I can get some testing in with the current patches.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature
  2020-06-07 15:02                     ` Jens Axboe
@ 2020-06-07 20:36                       ` Pavel Begunkov
  2020-06-07 20:57                         ` Pavel Begunkov
  2020-06-07 23:26                         ` Jens Axboe
  0 siblings, 2 replies; 27+ messages in thread
From: Pavel Begunkov @ 2020-06-07 20:36 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 07/06/2020 18:02, Jens Axboe wrote:
> On 6/3/20 7:46 AM, Pavel Begunkov wrote:
>> On 02/06/2020 04:16, Xiaoguang Wang wrote:
>>> hi Jens, Pavel,
>>>
>>> Will you have a look at this V5 version? Or we hold on this patchset, and
>>> do the refactoring work related io_wq_work firstly.
>>
>> It's entirely up to Jens, but frankly, I think it'll bring more bugs than
>> merits in the current state of things.
> 
> Well, I'd really like to reduce the overhead where we can, particularly
> when the overhead just exists to cater to the slow path.
> 
> Planning on taking the next week off and not do too much, but I'll see
> if I can get some testing in with the current patches.
> 

I just think it should not be done at expense of robustness.

e.g. instead of having tons of if's around ->func, we can get rid of
it and issue everything with io_wq_submit_work(). And there are plenty
of pros of doing that:
- freeing some space in io_kiocb (in req.work in particular)
- removing much of stuff with nice negative diffstat
- helping this series
- even safer than now -- can't be screwed with memcpy(req).

Extra switch-lookup in io-wq shouldn't even be noticeable considering
punting overhead. And even though io-wq loses some flexibility, as for
me that's fine as long as there is only 1 user.


And then we can go and fix every other problem until this patch set
looks good.

-- 
Pavel Begunkov



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature
  2020-06-07 20:36                       ` Pavel Begunkov
@ 2020-06-07 20:57                         ` Pavel Begunkov
  2020-06-07 23:29                           ` Jens Axboe
  2020-06-07 23:26                         ` Jens Axboe
  1 sibling, 1 reply; 27+ messages in thread
From: Pavel Begunkov @ 2020-06-07 20:57 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 07/06/2020 23:36, Pavel Begunkov wrote:
> On 07/06/2020 18:02, Jens Axboe wrote:
>> On 6/3/20 7:46 AM, Pavel Begunkov wrote:
>>> On 02/06/2020 04:16, Xiaoguang Wang wrote:
>>>> hi Jens, Pavel,
>>>>
>>>> Will you have a look at this V5 version? Or we hold on this patchset, and
>>>> do the refactoring work related io_wq_work firstly.
>>>
>>> It's entirely up to Jens, but frankly, I think it'll bring more bugs than
>>> merits in the current state of things.
>>
>> Well, I'd really like to reduce the overhead where we can, particularly
>> when the overhead just exists to cater to the slow path.
>>
>> Planning on taking the next week off and not do too much, but I'll see
>> if I can get some testing in with the current patches.
>>
> 
> I just think it should not be done at expense of robustness.
> 
> e.g. instead of having tons of if's around ->func, we can get rid of
> it and issue everything with io_wq_submit_work(). And there are plenty
> of pros of doing that:
> - freeing some space in io_kiocb (in req.work in particular)
> - removing much of stuff with nice negative diffstat
> - helping this series
> - even safer than now -- can't be screwed with memcpy(req).
> 
> Extra switch-lookup in io-wq shouldn't even be noticeable considering
> punting overhead. And even though io-wq loses some flexibility, as for
> me that's fine as long as there is only 1 user.

How about the diff below, if split and cooked properly?
3 files changed, 73 insertions(+), 164 deletions(-)


diff --git a/fs/io-wq.c b/fs/io-wq.c
index 2bfa9117bc28..a44ad3b98886 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -112,6 +112,7 @@ struct io_wq {
 	unsigned long state;
 
 	free_work_fn *free_work;
+	io_wq_work_fn *do_work;
 
 	struct task_struct *manager;
 	struct user_struct *user;
@@ -528,7 +529,7 @@ static void io_worker_handle_work(struct io_worker *worker)
 
 			hash = io_get_work_hash(work);
 			linked = old_work = work;
-			linked->func(&linked);
+			wq->do_work(&linked);
 			linked = (old_work == linked) ? NULL : linked;
 
 			work = next_hashed;
@@ -785,7 +786,7 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
 		struct io_wq_work *old_work = work;
 
 		work->flags |= IO_WQ_WORK_CANCEL;
-		work->func(&work);
+		wq->do_work(&work);
 		work = (work == old_work) ? NULL : work;
 		wq->free_work(old_work);
 	} while (work);
@@ -1027,7 +1028,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	int ret = -ENOMEM, node;
 	struct io_wq *wq;
 
-	if (WARN_ON_ONCE(!data->free_work))
+	if (WARN_ON_ONCE(!data->free_work || !data->do_work))
 		return ERR_PTR(-EINVAL);
 
 	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
@@ -1041,6 +1042,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	}
 
 	wq->free_work = data->free_work;
+	wq->do_work = data->do_work;
 
 	/* caller must already hold a reference to this */
 	wq->user = data->user;
@@ -1097,7 +1099,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 
 bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
 {
-	if (data->free_work != wq->free_work)
+	if (data->free_work != wq->free_work || data->do_work != wq->do_work)
 		return false;
 
 	return refcount_inc_not_zero(&wq->use_refs);
diff --git a/fs/io-wq.h b/fs/io-wq.h
index df8a4cd3236d..f3bb596f5a3f 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -85,7 +85,6 @@ static inline void wq_list_del(struct io_wq_work_list *list,
 
 struct io_wq_work {
 	struct io_wq_work_node list;
-	void (*func)(struct io_wq_work **);
 	struct files_struct *files;
 	struct mm_struct *mm;
 	const struct cred *creds;
@@ -94,9 +93,9 @@ struct io_wq_work {
 	pid_t task_pid;
 };
 
-#define INIT_IO_WORK(work, _func)				\
+#define INIT_IO_WORK(work)					\
 	do {							\
-		*(work) = (struct io_wq_work){ .func = _func };	\
+		*(work) = (struct io_wq_work){};		\
 	} while (0)						\
 
 static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
@@ -108,10 +107,12 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
 }
 
 typedef void (free_work_fn)(struct io_wq_work *);
+typedef void (io_wq_work_fn)(struct io_wq_work **);
 
 struct io_wq_data {
 	struct user_struct *user;
 
+	io_wq_work_fn *do_work;
 	free_work_fn *free_work;
 };
 
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3aebbf96c123..3bfce882ede5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -541,6 +541,7 @@ enum {
 	REQ_F_POLLED_BIT,
 	REQ_F_BUFFER_SELECTED_BIT,
 	REQ_F_NO_FILE_TABLE_BIT,
+	REQ_F_QUEUE_TIMEOUT_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -596,6 +597,8 @@ enum {
 	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
 	/* doesn't need file table for this request */
 	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
+	/* need to queue linked timeout */
+	REQ_F_QUEUE_TIMEOUT	= BIT(REQ_F_QUEUE_TIMEOUT_BIT),
 };
 
 struct async_poll {
@@ -1579,16 +1582,6 @@ static void io_free_req(struct io_kiocb *req)
 		io_queue_async_work(nxt);
 }
 
-static void io_link_work_cb(struct io_wq_work **workptr)
-{
-	struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-	struct io_kiocb *link;
-
-	link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
-	io_queue_linked_timeout(link);
-	io_wq_submit_work(workptr);
-}
-
 static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
 {
 	struct io_kiocb *link;
@@ -1600,7 +1593,7 @@ static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
 	*workptr = &nxt->work;
 	link = io_prep_linked_timeout(nxt);
 	if (link)
-		nxt->work.func = io_link_work_cb;
+		nxt->flags |= REQ_F_QUEUE_TIMEOUT;
 }
 
 /*
@@ -2940,23 +2933,15 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static bool io_req_cancelled(struct io_kiocb *req)
-{
-	if (req->work.flags & IO_WQ_WORK_CANCEL) {
-		req_set_fail_links(req);
-		io_cqring_add_event(req, -ECANCELED);
-		io_put_req(req);
-		return true;
-	}
-
-	return false;
-}
-
-static void __io_fsync(struct io_kiocb *req)
+static int io_fsync(struct io_kiocb *req, bool force_nonblock)
 {
 	loff_t end = req->sync.off + req->sync.len;
 	int ret;
 
+	/* fsync always requires a blocking context */
+	if (force_nonblock)
+		return -EAGAIN;
+
 	ret = vfs_fsync_range(req->file, req->sync.off,
 				end > 0 ? end : LLONG_MAX,
 				req->sync.flags & IORING_FSYNC_DATASYNC);
@@ -2964,53 +2949,9 @@ static void __io_fsync(struct io_kiocb *req)
 		req_set_fail_links(req);
 	io_cqring_add_event(req, ret);
 	io_put_req(req);
-}
-
-static void io_fsync_finish(struct io_wq_work **workptr)
-{
-	struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
-	if (io_req_cancelled(req))
-		return;
-	__io_fsync(req);
-	io_steal_work(req, workptr);
-}
-
-static int io_fsync(struct io_kiocb *req, bool force_nonblock)
-{
-	/* fsync always requires a blocking context */
-	if (force_nonblock) {
-		req->work.func = io_fsync_finish;
-		return -EAGAIN;
-	}
-	__io_fsync(req);
 	return 0;
 }
 
-static void __io_fallocate(struct io_kiocb *req)
-{
-	int ret;
-
-	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
-	ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
-				req->sync.len);
-	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
-	if (ret < 0)
-		req_set_fail_links(req);
-	io_cqring_add_event(req, ret);
-	io_put_req(req);
-}
-
-static void io_fallocate_finish(struct io_wq_work **workptr)
-{
-	struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
-	if (io_req_cancelled(req))
-		return;
-	__io_fallocate(req);
-	io_steal_work(req, workptr);
-}
-
 static int io_fallocate_prep(struct io_kiocb *req,
 			     const struct io_uring_sqe *sqe)
 {
@@ -3028,13 +2969,20 @@ static int io_fallocate_prep(struct io_kiocb *req,
 
 static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
 {
+	int ret;
+
 	/* fallocate always requiring blocking context */
-	if (force_nonblock) {
-		req->work.func = io_fallocate_finish;
+	if (force_nonblock)
 		return -EAGAIN;
-	}
 
-	__io_fallocate(req);
+	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
+	ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
+				req->sync.len);
+	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+	if (ret < 0)
+		req_set_fail_links(req);
+	io_cqring_add_event(req, ret);
+	io_put_req(req);
 	return 0;
 }
 
@@ -3479,53 +3427,37 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	    req->close.fd == req->ctx->ring_fd)
 		return -EBADF;
 
+	req->close.put_file = NULL;
 	return 0;
 }
 
-/* only called when __close_fd_get_file() is done */
-static void __io_close_finish(struct io_kiocb *req)
-{
-	int ret;
-
-	ret = filp_close(req->close.put_file, req->work.files);
-	if (ret < 0)
-		req_set_fail_links(req);
-	io_cqring_add_event(req, ret);
-	fput(req->close.put_file);
-	io_put_req(req);
-}
-
-static void io_close_finish(struct io_wq_work **workptr)
-{
-	struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
-	/* not cancellable, don't do io_req_cancelled() */
-	__io_close_finish(req);
-	io_steal_work(req, workptr);
-}
-
 static int io_close(struct io_kiocb *req, bool force_nonblock)
 {
+	struct io_close *close = &req->close;
 	int ret;
 
-	req->close.put_file = NULL;
-	ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
-	if (ret < 0)
-		return (ret == -ENOENT) ? -EBADF : ret;
+	/* might be already done during nonblock submission */
+	if (!close->put_file) {
+		ret = __close_fd_get_file(close->fd, &close->put_file);
+		if (ret < 0)
+			return (ret == -ENOENT) ? -EBADF : ret;
+	}
 
 	/* if the file has a flush method, be safe and punt to async */
-	if (req->close.put_file->f_op->flush && force_nonblock) {
+	if (close->put_file->f_op->flush && force_nonblock) {
 		/* avoid grabbing files - we don't need the files */
 		req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT;
-		req->work.func = io_close_finish;
 		return -EAGAIN;
 	}
 
-	/*
-	 * No ->flush(), safely close from here and just punt the
-	 * fput() to async context.
-	 */
-	__io_close_finish(req);
+	/* No ->flush() or already async, safely close from here */
+	ret = filp_close(close->put_file, req->work.files);
+	if (ret < 0)
+		req_set_fail_links(req);
+	io_cqring_add_event(req, ret);
+	fput(close->put_file);
+	close->put_file = NULL;
+	io_put_req(req);
 	return 0;
 }
 
@@ -3547,38 +3479,20 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static void __io_sync_file_range(struct io_kiocb *req)
+static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
 {
 	int ret;
 
+	/* sync_file_range always requires a blocking context */
+	if (force_nonblock)
+		return -EAGAIN;
+
 	ret = sync_file_range(req->file, req->sync.off, req->sync.len,
 				req->sync.flags);
 	if (ret < 0)
 		req_set_fail_links(req);
 	io_cqring_add_event(req, ret);
 	io_put_req(req);
-}
-
-
-static void io_sync_file_range_finish(struct io_wq_work **workptr)
-{
-	struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
-	if (io_req_cancelled(req))
-		return;
-	__io_sync_file_range(req);
-	io_steal_work(req, workptr);
-}
-
-static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
-{
-	/* sync_file_range always requires a blocking context */
-	if (force_nonblock) {
-		req->work.func = io_sync_file_range_finish;
-		return -EAGAIN;
-	}
-
-	__io_sync_file_range(req);
 	return 0;
 }
 
@@ -4000,49 +3914,27 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static int __io_accept(struct io_kiocb *req, bool force_nonblock)
+static int io_accept(struct io_kiocb *req, bool force_nonblock)
 {
 	struct io_accept *accept = &req->accept;
-	unsigned file_flags;
+	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
 	int ret;
 
-	file_flags = force_nonblock ? O_NONBLOCK : 0;
 	ret = __sys_accept4_file(req->file, file_flags, accept->addr,
 					accept->addr_len, accept->flags,
 					accept->nofile);
 	if (ret == -EAGAIN && force_nonblock)
 		return -EAGAIN;
-	if (ret == -ERESTARTSYS)
-		ret = -EINTR;
-	if (ret < 0)
+	if (ret < 0) {
+		if (ret == -ERESTARTSYS)
+			ret = -EINTR;
 		req_set_fail_links(req);
+	}
 	io_cqring_add_event(req, ret);
 	io_put_req(req);
 	return 0;
 }
 
-static void io_accept_finish(struct io_wq_work **workptr)
-{
-	struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
-	if (io_req_cancelled(req))
-		return;
-	__io_accept(req, false);
-	io_steal_work(req, workptr);
-}
-
-static int io_accept(struct io_kiocb *req, bool force_nonblock)
-{
-	int ret;
-
-	ret = __io_accept(req, force_nonblock);
-	if (ret == -EAGAIN && force_nonblock) {
-		req->work.func = io_accept_finish;
-		return -EAGAIN;
-	}
-	return 0;
-}
-
 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_connect *conn = &req->connect;
@@ -5434,12 +5326,25 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
+static void io_link_work_cb(struct io_kiocb *req)
+{
+	struct io_kiocb *link;
+
+	if (!(req->flags & REQ_F_QUEUE_TIMEOUT))
+		return;
+
+	link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+	io_queue_linked_timeout(link);
+}
+
 static void io_wq_submit_work(struct io_wq_work **workptr)
 {
 	struct io_wq_work *work = *workptr;
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 	int ret = 0;
 
+	io_link_work_cb(req);
+
 	/* if NO_CANCEL is set, we must still run the work */
 	if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
 				IO_WQ_WORK_CANCEL) {
@@ -5974,7 +5879,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	refcount_set(&req->refs, 2);
 	req->task = NULL;
 	req->result = 0;
-	INIT_IO_WORK(&req->work, io_wq_submit_work);
+	INIT_IO_WORK(&req->work);
 
 	if (unlikely(req->opcode >= IORING_OP_LAST))
 		return -EINVAL;
@@ -6990,6 +6895,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
 
 	data.user = ctx->user;
 	data.free_work = io_free_work;
+	data.do_work = io_wq_submit_work;
 
 	if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
 		/* Do QD, or 4 * CPUS, whatever is smallest */
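
The common pattern in the close/sync_file_range/accept conversions above,
boiled down to a minimal self-contained sketch (illustrative names and
simplified types, not the kernel structures): the handler returns -EAGAIN
when it would have to block under force_nonblock, and the punted execution
re-enters the same handler with blocking allowed, so no per-opcode *_finish
callback is needed anymore:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct request {
	bool needs_blocking;	/* stand-in for "file has a ->flush method", etc. */
};

/* One handler serves both the inline and the punted path. */
static int handle(struct request *req, bool force_nonblock)
{
	if (req->needs_blocking && force_nonblock)
		return -EAGAIN;		/* ask the core to punt to io-wq */

	printf("completed (force_nonblock=%d)\n", force_nonblock);
	return 0;
}

int main(void)
{
	struct request req = { .needs_blocking = true };

	/* Inline submission: may not block. */
	if (handle(&req, true) == -EAGAIN)
		/* Worker context: the same handler again, blocking now allowed. */
		handle(&req, false);
	return 0;
}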


-- 
Pavel Begunkov

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature
  2020-06-07 20:36                       ` Pavel Begunkov
  2020-06-07 20:57                         ` Pavel Begunkov
@ 2020-06-07 23:26                         ` Jens Axboe
  1 sibling, 0 replies; 27+ messages in thread
From: Jens Axboe @ 2020-06-07 23:26 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 6/7/20 2:36 PM, Pavel Begunkov wrote:
> On 07/06/2020 18:02, Jens Axboe wrote:
>> On 6/3/20 7:46 AM, Pavel Begunkov wrote:
>>> On 02/06/2020 04:16, Xiaoguang Wang wrote:
>>>> hi Jens, Pavel,
>>>>
>>>> Will you have a look at this V5 version? Or should we hold off on this
>>>> patchset and do the refactoring work related to io_wq_work first?
>>>
>>> It's entirely up to Jens, but frankly, I think it'll bring more bugs than
>>> benefits in the current state of things.
>>
>> Well, I'd really like to reduce the overhead where we can, particularly
>> when the overhead just exists to cater to the slow path.
>>
>> Planning on taking the next week off and not do too much, but I'll see
>> if I can get some testing in with the current patches.
>>
> 
> I just think it should not be done at expense of robustness.

Oh I totally agree, which is why I've been holding back. I'm just
saying that we're currently doing a number of semi-expensive operations
up front for slow-path setup, which is annoying and would be nice NOT
to do.

> e.g. instead of having tons of if's around ->func, we can get rid of
> it and issue everything with io_wq_submit_work(). And there are plenty
> of pros to doing that:
> - freeing some space in io_kiocb (in req.work in particular)
> - removing a lot of code, with a nice negative diffstat
> - helping this series
> - even safer than now -- it can't be broken by a memcpy() of the request.

Definitely sounds good.

> The extra switch lookup in io-wq shouldn't even be noticeable given the
> punting overhead. And even though io-wq loses some flexibility, that's
> fine by me as long as there is only one user.

Don't care too much about the extra lookups, it's a very minor cost
compared to the wins.
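
A minimal sketch of that shape, with simplified types and made-up helper
names rather than the real ones: io-wq keeps a single handler supplied at
creation time and every worker calls it, instead of reading a function
pointer out of each work item:

#include <stdio.h>

struct io_wq_work { int opcode; };

typedef void (*work_fn)(struct io_wq_work **work);

struct io_wq_data { work_fn do_work; };	/* supplied once at wq creation */
struct io_wq { work_fn do_work; };

/* The single handler: dispatch on the request itself at execution time. */
static void submit_work(struct io_wq_work **workptr)
{
	printf("executing opcode %d\n", (*workptr)->opcode);
}

static void wq_create(struct io_wq *wq, const struct io_wq_data *data)
{
	wq->do_work = data->do_work;	/* mirrors data.do_work = io_wq_submit_work */
}

static void worker_run(struct io_wq *wq, struct io_wq_work *work)
{
	wq->do_work(&work);		/* was: work->func(&work) */
}

int main(void)
{
	struct io_wq wq;
	struct io_wq_data data = { .do_work = submit_work };
	struct io_wq_work work = { .opcode = 1 };

	wq_create(&wq, &data);
	worker_run(&wq, &work);
	return 0;
}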

> And then we can go and fix every other problem until this patch set
> looks good.

I'll take a look at your patch.


-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature
  2020-06-07 20:57                         ` Pavel Begunkov
@ 2020-06-07 23:29                           ` Jens Axboe
  2020-06-08  7:43                             ` Pavel Begunkov
  0 siblings, 1 reply; 27+ messages in thread
From: Jens Axboe @ 2020-06-07 23:29 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 6/7/20 2:57 PM, Pavel Begunkov wrote:
> On 07/06/2020 23:36, Pavel Begunkov wrote:
>> On 07/06/2020 18:02, Jens Axboe wrote:
>>> On 6/3/20 7:46 AM, Pavel Begunkov wrote:
>>>> On 02/06/2020 04:16, Xiaoguang Wang wrote:
>>>>> hi Jens, Pavel,
>>>>>
>>>>> Will you have a look at this V5 version? Or should we hold off on this
>>>>> patchset and do the refactoring work related to io_wq_work first?
>>>>
>>>> It's entirely up to Jens, but frankly, I think it'll bring more bugs than
>>>> benefits in the current state of things.
>>>
>>> Well, I'd really like to reduce the overhead where we can, particularly
>>> when the overhead just exists to cater to the slow path.
>>>
>>> Planning on taking the next week off and not do too much, but I'll see
>>> if I can get some testing in with the current patches.
>>>
>>
>> I just think it should not be done at expense of robustness.
>>
>> e.g. instead of having tons of if's around ->func, we can get rid of
>> it and issue everything with io_wq_submit_work(). And there are plenty
>> of pros to doing that:
>> - freeing some space in io_kiocb (in req.work in particular)
>> - removing a lot of code, with a nice negative diffstat
>> - helping this series
>> - even safer than now -- it can't be broken by a memcpy() of the request.
>>
>> The extra switch lookup in io-wq shouldn't even be noticeable given the
>> punting overhead. And even though io-wq loses some flexibility, that's
>> fine by me as long as there is only one user.
> 
> How about the diff below, if split and cooked properly?
> 3 files changed, 73 insertions(+), 164 deletions(-)
> 
> 
> @@ -94,9 +93,9 @@ struct io_wq_work {
>  	pid_t task_pid;
>  };
>  
> -#define INIT_IO_WORK(work, _func)				\
> +#define INIT_IO_WORK(work)					\
>  	do {							\
> -		*(work) = (struct io_wq_work){ .func = _func };	\
> +		*(work) = (struct io_wq_work){};		\
>  	} while (0)						\
>  

It would be nice to optimize this one; it's a lot of clearing for
something we'll generally not use at all in the fast path. Or at least
do the clearing only right before we submit the work for async execution.
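
One possible shape of "only right before we submit the work for async
execution", as a minimal sketch with made-up names (the real request
layout and helpers differ): clear the io_wq_work only when a request is
actually punted, and track it with a flag so the clearing happens at most
once:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct io_wq_work_sketch {
	void *files;
	unsigned long fsize;	/* ... other slow-path state the fast path never reads */
};

struct request {
	bool work_initialized;
	struct io_wq_work_sketch work;
};

/* Clear the work struct only when we know the request will go async. */
static void init_work_lazily(struct request *req)
{
	if (!req->work_initialized) {
		memset(&req->work, 0, sizeof(req->work));
		req->work_initialized = true;
	}
}

static void punt_to_wq(struct request *req)
{
	init_work_lazily(req);	/* only the slow path pays for the clearing */
	printf("queued for async execution\n");
}

int main(void)
{
	struct request req = { .work_initialized = false };

	/* Fast path (inline completion): req.work is never touched at all. */
	/* Slow path: */
	punt_to_wq(&req);
	return 0;
}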

From a quick look at this, otherwise looks great! Please do split and
submit this.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature
  2020-06-07 23:29                           ` Jens Axboe
@ 2020-06-08  7:43                             ` Pavel Begunkov
  2020-06-08 12:14                               ` Xiaoguang Wang
  0 siblings, 1 reply; 27+ messages in thread
From: Pavel Begunkov @ 2020-06-08  7:43 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring; +Cc: joseph.qi

On 08/06/2020 02:29, Jens Axboe wrote:
> On 6/7/20 2:57 PM, Pavel Begunkov wrote:
>> -#define INIT_IO_WORK(work, _func)				\
>> +#define INIT_IO_WORK(work)					\
>>  	do {							\
>> -		*(work) = (struct io_wq_work){ .func = _func };	\
>> +		*(work) = (struct io_wq_work){};		\
>>  	} while (0)						\
>>  
> 
> It would be nice to optimize this one; it's a lot of clearing for
> something we'll generally not use at all in the fast path. Or at least
> do the clearing only right before we submit the work for async execution.

Let's leave it to Xiaoguang and the series on that topic.

> 
> From a quick look at this, otherwise looks great! Please do split and
> submit this.

Sure. Have a great time off!


-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature
  2020-06-08  7:43                             ` Pavel Begunkov
@ 2020-06-08 12:14                               ` Xiaoguang Wang
  2020-06-08 20:08                                 ` Jens Axboe
  0 siblings, 1 reply; 27+ messages in thread
From: Xiaoguang Wang @ 2020-06-08 12:14 UTC (permalink / raw)
  To: Pavel Begunkov, Jens Axboe, io-uring; +Cc: joseph.qi

hi,

> On 08/06/2020 02:29, Jens Axboe wrote:
>> On 6/7/20 2:57 PM, Pavel Begunkov wrote:
>>> -#define INIT_IO_WORK(work, _func)				\
>>> +#define INIT_IO_WORK(work)					\
>>>   	do {							\
>>> -		*(work) = (struct io_wq_work){ .func = _func };	\
>>> +		*(work) = (struct io_wq_work){};		\
>>>   	} while (0)						\
>>>   
>>
>> It would be nice to optimize this one; it's a lot of clearing for
>> something we'll generally not use at all in the fast path. Or at least
>> do the clearing only right before we submit the work for async execution.
> 
> Let's leave it to Xiaoguang and the series on that topic.
Yeah, I'd be glad to continue this job, thanks.

Regards,
Xiaoguang Wang
> 
>>
>>  From a quick look at this, otherwise looks great! Please do split and
>> submit this.
> 
> Sure. Have a great time off!
> 
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature
  2020-06-08 12:14                               ` Xiaoguang Wang
@ 2020-06-08 20:08                                 ` Jens Axboe
  0 siblings, 0 replies; 27+ messages in thread
From: Jens Axboe @ 2020-06-08 20:08 UTC (permalink / raw)
  To: Xiaoguang Wang, Pavel Begunkov, io-uring; +Cc: joseph.qi

On 6/8/20 6:14 AM, Xiaoguang Wang wrote:
> hi,
> 
>> On 08/06/2020 02:29, Jens Axboe wrote:
>>> On 6/7/20 2:57 PM, Pavel Begunkov wrote:
>>>> -#define INIT_IO_WORK(work, _func)				\
>>>> +#define INIT_IO_WORK(work)					\
>>>>   	do {							\
>>>> -		*(work) = (struct io_wq_work){ .func = _func };	\
>>>> +		*(work) = (struct io_wq_work){};		\
>>>>   	} while (0)						\
>>>>   
>>>
>>> It would be nice to optimize this one; it's a lot of clearing for
>>> something we'll generally not use at all in the fast path. Or at least
>>> do the clearing only right before we submit the work for async execution.
>>
>> Let's leave it to Xiaoguang and the series on that topic.
> Yeah, I'd be glad to continue this job, thanks.

Perfect! I think io_uring-5.8 (as of now) will be a great base for
that.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2020-06-08 20:09 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-30 14:39 [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Xiaoguang Wang
2020-05-30 14:39 ` [PATCH v4 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature Xiaoguang Wang
2020-05-30 16:44 ` [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Jens Axboe
2020-05-30 17:14   ` Jens Axboe
2020-05-30 17:29     ` Jens Axboe
2020-05-30 17:36       ` Pavel Begunkov
2020-05-31 13:57         ` Xiaoguang Wang
2020-05-31 14:49           ` Pavel Begunkov
2020-05-31 14:12       ` Xiaoguang Wang
2020-05-31 14:31         ` Xiaoguang Wang
2020-05-31 14:57           ` Pavel Begunkov
2020-05-31 16:15             ` Xiaoguang Wang
2020-06-01 10:43               ` Pavel Begunkov
2020-06-01 11:50                 ` Xiaoguang Wang
2020-06-01  4:56             ` [PATCH v5 " Xiaoguang Wang
2020-06-01  4:56               ` [PATCH v5 2/2] io_uring: avoid unnecessary io_wq_work copy for fast poll feature Xiaoguang Wang
2020-06-02  1:16                 ` Xiaoguang Wang
2020-06-03 13:46                   ` Pavel Begunkov
2020-06-07 15:02                     ` Jens Axboe
2020-06-07 20:36                       ` Pavel Begunkov
2020-06-07 20:57                         ` Pavel Begunkov
2020-06-07 23:29                           ` Jens Axboe
2020-06-08  7:43                             ` Pavel Begunkov
2020-06-08 12:14                               ` Xiaoguang Wang
2020-06-08 20:08                                 ` Jens Axboe
2020-06-07 23:26                         ` Jens Axboe
2020-05-31 14:33         ` [PATCH v4 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline Pavel Begunkov
