* [PATCH] io_uring: create percpu io sq thread when IORING_SETUP_SQ_AFF is flagged
@ 2020-05-20 11:56 Xiaoguang Wang
  2020-05-20 12:11 ` Xiaoguang Wang
  2020-05-20 22:09 ` Pavel Begunkov
  0 siblings, 2 replies; 11+ messages in thread
From: Xiaoguang Wang @ 2020-05-20 11:56 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, joseph.qi, yujian.wu1, Xiaoguang Wang

Currently we can create multiple io_uring instances which all have SQPOLL
enabled and bind them to the same cpu core by setting the sq_thread_cpu
argument, but sometimes this isn't efficient. Imagine an extreme case: create
two io_uring instances, both with SQPOLL enabled and bound to the same cpu
core. One instance submits io every 500ms, the other submits io continually;
then obviously the 1st instance still constantly contends for cpu time, which
will impact the 2nd instance.

I have constructed a test case to evaluate the impact:
    1, a test program which creates one io_uring instance with SQPOLL
enabled, bound to cpu core 61.
    2, sudo taskset -c 60 fio  -name=fiotest -filename=/dev/nvme0n1 -iodepth=128
-thread -rw=read -ioengine=io_uring  -hipri=0 -sqthread_poll=1  -sqthread_poll_cpu=61
-registerfiles=1 -direct=1 -bs=4k -size=10G -numjobs=1  -time_based -runtime=120
Note that both io_uring instances have sq_thread_cpu set to cpu core 61.
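
For reference, the 1st job is roughly equivalent to the sketch below. This is not
part of the patch: it assumes liburing, the 500ms submit interval is borrowed from
the example above, the /dev/nvme0n1 target from the fio job, and the idle time is
arbitrary; the real test program may differ. Run as root, since SQPOLL requires
privileges.

#define _GNU_SOURCE
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/uio.h>
#include <liburing.h>

int main(void)
{
	struct io_uring_params p;
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	static char buf[4096] __attribute__((aligned(4096)));
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	int fd;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF;
	p.sq_thread_cpu = 61;		/* pin the sq thread to cpu core 61 */
	p.sq_thread_idle = 1000;	/* ms before the sq thread idles, arbitrary */

	if (io_uring_queue_init_params(128, &ring, &p) < 0)
		return 1;

	fd = open("/dev/nvme0n1", O_RDONLY | O_DIRECT);
	if (fd < 0)
		return 1;
	/* SQPOLL submission requires registered files */
	io_uring_register_files(&ring, &fd, 1);

	for (;;) {
		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_readv(sqe, 0, &iov, 1, 0);
		sqe->flags |= IOSQE_FIXED_FILE;
		io_uring_submit(&ring);		/* wakes the sq thread if it went idle */
		io_uring_wait_cqe(&ring, &cqe);
		io_uring_cqe_seen(&ring, cqe);
		usleep(500 * 1000);		/* submit io every 500ms */
	}
}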

With the current kernel, running the above two jobs concurrently, the 2nd job gets:
Run status group 0 (all jobs):
   READ: bw=1623MiB/s (1701MB/s), 1623MiB/s-1623MiB/s (1701MB/s-1701MB/s),
io=190GiB (204GB), run=120010-120010msec

We can see that the 1st instance has a huge impact on the 2nd instance. I think
it's because of unordered competition for the same cpu resource: every instance
gets roughly 50% cpu usage.

To fix this issue, when an io_uring instance uses IORING_SETUP_SQ_AFF to specify
a cpu, we create a percpu io sq_thread to handle multiple io_uring instances' io
requests serially. With this patch, in the same environment, we get a huge improvement:
Run status group 0 (all jobs):
   READ: bw=3231MiB/s (3388MB/s), 3231MiB/s-3231MiB/s (3388MB/s-3388MB/s),
io=379GiB (407GB), run=120001-120001msec

Link: https://lore.kernel.org/io-uring/c94098d2-279e-a552-91ec-8a8f177d770a@linux.alibaba.com/T/#t
Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
---
 fs/io_uring.c | 257 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 232 insertions(+), 25 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ed0c22eb9808..e49ff1f67681 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -258,8 +258,13 @@ struct io_ring_ctx {
 	/* IO offload */
 	struct io_wq		*io_wq;
 	struct task_struct	*sqo_thread;	/* if using sq thread polling */
+	wait_queue_head_t	*sqo_wait;
+	int			submit_status;
+	int			sq_thread_cpu;
+	struct list_head	node;
+
 	struct mm_struct	*sqo_mm;
-	wait_queue_head_t	sqo_wait;
+	wait_queue_head_t	__sqo_wait;
 
 	/*
 	 * If used, fixed file set. Writers must ensure that ->refs is dead,
@@ -330,6 +335,16 @@ struct io_ring_ctx {
 	struct work_struct		exit_work;
 };
 
+struct io_percpu_thread {
+	struct list_head ctx_list;
+	wait_queue_head_t sqo_percpu_wait;
+	struct mutex lock;
+	struct task_struct *sqo_thread;
+	unsigned int sq_thread_idle;
+};
+
+static struct io_percpu_thread __percpu *percpu_threads;
+
 /*
  * First field must be the file pointer in all the
  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
@@ -926,9 +941,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 		goto err;
 
 	ctx->flags = p->flags;
-	init_waitqueue_head(&ctx->sqo_wait);
+	init_waitqueue_head(&ctx->__sqo_wait);
+	ctx->sqo_wait = &ctx->__sqo_wait;
 	init_waitqueue_head(&ctx->cq_wait);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
+	INIT_LIST_HEAD(&ctx->node);
 	init_completion(&ctx->completions[0]);
 	init_completion(&ctx->completions[1]);
 	idr_init(&ctx->io_buffer_idr);
@@ -1157,8 +1174,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 {
 	if (waitqueue_active(&ctx->wait))
 		wake_up(&ctx->wait);
-	if (waitqueue_active(&ctx->sqo_wait))
-		wake_up(&ctx->sqo_wait);
+	if (waitqueue_active(ctx->sqo_wait))
+		wake_up(ctx->sqo_wait);
 	if (io_should_trigger_evfd(ctx))
 		eventfd_signal(ctx->cq_ev_fd, 1);
 }
@@ -1980,8 +1997,8 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
 		list_add_tail(&req->list, &ctx->poll_list);
 
 	if ((ctx->flags & IORING_SETUP_SQPOLL) &&
-	    wq_has_sleeper(&ctx->sqo_wait))
-		wake_up(&ctx->sqo_wait);
+	    wq_has_sleeper(ctx->sqo_wait))
+		wake_up(ctx->sqo_wait);
 }
 
 static void io_file_put(struct io_submit_state *state)
@@ -5994,7 +6011,7 @@ static int io_sq_thread(void *data)
 				continue;
 			}
 
-			prepare_to_wait(&ctx->sqo_wait, &wait,
+			prepare_to_wait(ctx->sqo_wait, &wait,
 						TASK_INTERRUPTIBLE);
 
 			/*
@@ -6006,7 +6023,7 @@ static int io_sq_thread(void *data)
 			 */
 			if ((ctx->flags & IORING_SETUP_IOPOLL) &&
 			    !list_empty_careful(&ctx->poll_list)) {
-				finish_wait(&ctx->sqo_wait, &wait);
+				finish_wait(ctx->sqo_wait, &wait);
 				continue;
 			}
 
@@ -6018,23 +6035,23 @@ static int io_sq_thread(void *data)
 			to_submit = io_sqring_entries(ctx);
 			if (!to_submit || ret == -EBUSY) {
 				if (kthread_should_park()) {
-					finish_wait(&ctx->sqo_wait, &wait);
+					finish_wait(ctx->sqo_wait, &wait);
 					break;
 				}
 				if (current->task_works) {
 					task_work_run();
-					finish_wait(&ctx->sqo_wait, &wait);
+					finish_wait(ctx->sqo_wait, &wait);
 					continue;
 				}
 				if (signal_pending(current))
 					flush_signals(current);
 				schedule();
-				finish_wait(&ctx->sqo_wait, &wait);
+				finish_wait(ctx->sqo_wait, &wait);
 
 				ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
 				continue;
 			}
-			finish_wait(&ctx->sqo_wait, &wait);
+			finish_wait(ctx->sqo_wait, &wait);
 
 			ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
 		}
@@ -6058,6 +6075,133 @@ static int io_sq_thread(void *data)
 	return 0;
 }
 
+static int process_ctx(struct io_ring_ctx *ctx)
+{
+	unsigned int to_submit;
+	int ret = 0;
+
+	if (!list_empty(&ctx->poll_list)) {
+		unsigned nr_events = 0;
+
+		mutex_lock(&ctx->uring_lock);
+		if (!list_empty(&ctx->poll_list))
+			io_iopoll_getevents(ctx, &nr_events, 0);
+		mutex_unlock(&ctx->uring_lock);
+	}
+
+	to_submit = io_sqring_entries(ctx);
+	if (to_submit) {
+		mutex_lock(&ctx->uring_lock);
+		if (likely(!percpu_ref_is_dying(&ctx->refs)))
+			ret = io_submit_sqes(ctx, to_submit, NULL, -1, true);
+		mutex_unlock(&ctx->uring_lock);
+	}
+
+	if (current->task_works)
+		task_work_run();
+
+	io_sq_thread_drop_mm(ctx);
+	return ret;
+}
+
+static int io_sq_percpu_thread(void *data)
+{
+	struct io_percpu_thread *t = data;
+	struct io_ring_ctx *ctx, *tmp;
+	mm_segment_t old_fs;
+	const struct cred *saved_creds, *cur_creds, *old_creds;
+	unsigned long timeout;
+	DEFINE_WAIT(wait);
+	int iters = 0;
+
+	timeout = jiffies + t->sq_thread_idle;
+	old_fs = get_fs();
+	set_fs(USER_DS);
+	saved_creds = cur_creds = NULL;
+	while (!kthread_should_park()) {
+		bool continue_run;
+		bool needs_wait;
+		unsigned int to_submit;
+
+		mutex_lock(&t->lock);
+again:
+		continue_run = false;
+		list_for_each_entry_safe(ctx, tmp, &t->ctx_list, node) {
+			if (cur_creds != ctx->creds) {
+				old_creds = override_creds(ctx->creds);
+				cur_creds = ctx->creds;
+				if (saved_creds)
+					put_cred(old_creds);
+				else
+					saved_creds = old_creds;
+			}
+			ctx->submit_status = process_ctx(ctx);
+
+			to_submit = io_sqring_entries(ctx);
+			if (!continue_run &&
+			    ((to_submit && ctx->submit_status != -EBUSY) ||
+			    !list_empty(&ctx->poll_list)))
+				continue_run = true;
+		}
+		if (continue_run && (++iters & 7)) {
+			timeout = jiffies + t->sq_thread_idle;
+			goto again;
+		}
+		mutex_unlock(&t->lock);
+		if (continue_run) {
+			timeout = jiffies + t->sq_thread_idle;
+			continue;
+		}
+		if (!time_after(jiffies, timeout)) {
+			cond_resched();
+			continue;
+		}
+
+		needs_wait = true;
+		prepare_to_wait(&t->sqo_percpu_wait, &wait, TASK_INTERRUPTIBLE);
+		mutex_lock(&t->lock);
+		list_for_each_entry_safe(ctx, tmp, &t->ctx_list, node) {
+			if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+			    !list_empty_careful(&ctx->poll_list)) {
+				needs_wait = false;
+				break;
+			}
+			to_submit = io_sqring_entries(ctx);
+			if (to_submit && ctx->submit_status != -EBUSY) {
+				needs_wait = false;
+				break;
+			}
+		}
+		if (needs_wait) {
+			list_for_each_entry_safe(ctx, tmp, &t->ctx_list, node)
+				ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
+			smp_mb();
+
+		}
+		mutex_unlock(&t->lock);
+
+		if (needs_wait) {
+			schedule();
+			mutex_lock(&t->lock);
+			list_for_each_entry_safe(ctx, tmp,
+						 &t->ctx_list, node)
+				ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+			mutex_unlock(&t->lock);
+			finish_wait(&t->sqo_percpu_wait, &wait);
+		} else
+			finish_wait(&t->sqo_percpu_wait, &wait);
+		timeout = jiffies + t->sq_thread_idle;
+		cond_resched();
+	}
+
+	if (current->task_works)
+		task_work_run();
+	set_fs(old_fs);
+	revert_creds(saved_creds);
+	kthread_parkme();
+	return 0;
+}
+
 struct io_wait_queue {
 	struct wait_queue_entry wq;
 	struct io_ring_ctx *ctx;
@@ -6219,18 +6363,23 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	return 0;
 }
 
+static void destroy_io_percpu_thread(struct io_ring_ctx *ctx, int cpu);
+
 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
 {
 	if (ctx->sqo_thread) {
-		wait_for_completion(&ctx->completions[1]);
-		/*
-		 * The park is a bit of a work-around, without it we get
-		 * warning spews on shutdown with SQPOLL set and affinity
-		 * set to a single CPU.
-		 */
-		kthread_park(ctx->sqo_thread);
-		kthread_stop(ctx->sqo_thread);
-		ctx->sqo_thread = NULL;
+		if (!(ctx->flags & IORING_SETUP_SQ_AFF)) {
+			wait_for_completion(&ctx->completions[1]);
+			/*
+			 * The park is a bit of a work-around, without it we get
+			 * warning spews on shutdown with SQPOLL set and affinity
+			 * set to a single CPU.
+			 */
+			kthread_park(ctx->sqo_thread);
+			kthread_stop(ctx->sqo_thread);
+			ctx->sqo_thread = NULL;
+		} else
+			destroy_io_percpu_thread(ctx, ctx->sq_thread_cpu);
 	}
 }
 
@@ -6841,6 +6990,52 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
 	return ret;
 }
 
+static void create_io_percpu_thread(struct io_ring_ctx *ctx, int cpu)
+{
+	struct io_percpu_thread *t;
+
+	t = per_cpu_ptr(percpu_threads, cpu);
+	mutex_lock(&t->lock);
+	if (!t->sqo_thread) {
+		t->sqo_thread = kthread_create_on_cpu(io_sq_percpu_thread, t,
+					cpu, "io_uring_percpu-sq");
+		if (IS_ERR(t->sqo_thread)) {
+			ctx->sqo_thread = t->sqo_thread;
+			t->sqo_thread = NULL;
+			mutex_unlock(&t->lock);
+			return;
+		}
+	}
+
+	if (t->sq_thread_idle < ctx->sq_thread_idle)
+		t->sq_thread_idle = ctx->sq_thread_idle;
+	ctx->sqo_wait = &t->sqo_percpu_wait;
+	ctx->sq_thread_cpu = cpu;
+	list_add_tail(&ctx->node, &t->ctx_list);
+	ctx->sqo_thread = t->sqo_thread;
+	mutex_unlock(&t->lock);
+}
+
+static void destroy_io_percpu_thread(struct io_ring_ctx *ctx, int cpu)
+{
+	struct io_percpu_thread *t;
+	struct task_struct *sqo_thread = NULL;
+
+	t = per_cpu_ptr(percpu_threads, cpu);
+	mutex_lock(&t->lock);
+	list_del(&ctx->node);
+	if (list_empty(&t->ctx_list)) {
+		sqo_thread = t->sqo_thread;
+		t->sqo_thread = NULL;
+	}
+	mutex_unlock(&t->lock);
+
+	if (sqo_thread) {
+		kthread_park(sqo_thread);
+		kthread_stop(sqo_thread);
+	}
+}
+
 static int io_sq_offload_start(struct io_ring_ctx *ctx,
 			       struct io_uring_params *p)
 {
@@ -6867,9 +7062,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 			if (!cpu_online(cpu))
 				goto err;
 
-			ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
-							ctx, cpu,
-							"io_uring-sq");
+			create_io_percpu_thread(ctx, cpu);
 		} else {
 			ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
 							"io_uring-sq");
@@ -7516,7 +7709,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		if (!list_empty_careful(&ctx->cq_overflow_list))
 			io_cqring_overflow_flush(ctx, false);
 		if (flags & IORING_ENTER_SQ_WAKEUP)
-			wake_up(&ctx->sqo_wait);
+			wake_up(ctx->sqo_wait);
 		submitted = to_submit;
 	} else if (to_submit) {
 		mutex_lock(&ctx->uring_lock);
@@ -8102,6 +8295,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 
 static int __init io_uring_init(void)
 {
+	int cpu;
+
 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
 	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
 	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
@@ -8141,6 +8336,18 @@ static int __init io_uring_init(void)
 	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
 	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
 	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+
+	percpu_threads = alloc_percpu(struct io_percpu_thread);
+	for_each_possible_cpu(cpu) {
+		struct io_percpu_thread *t;
+
+		t = per_cpu_ptr(percpu_threads, cpu);
+		INIT_LIST_HEAD(&t->ctx_list);
+		init_waitqueue_head(&t->sqo_percpu_wait);
+		mutex_init(&t->lock);
+		t->sqo_thread = NULL;
+		t->sq_thread_idle = 0;
+	}
 	return 0;
 };
 __initcall(io_uring_init);
-- 
2.17.2



* Re: [PATCH] io_uring: create percpu io sq thread when IORING_SETUP_SQ_AFF is flagged
  2020-05-20 11:56 [PATCH] io_uring: create percpu io sq thread when IORING_SETUP_SQ_AFF is flagged Xiaoguang Wang
@ 2020-05-20 12:11 ` Xiaoguang Wang
  2020-05-22 11:17   ` Yu Jian Wu
  2020-05-20 22:09 ` Pavel Begunkov
  1 sibling, 1 reply; 11+ messages in thread
From: Xiaoguang Wang @ 2020-05-20 12:11 UTC (permalink / raw)
  To: io-uring; +Cc: axboe, joseph.qi, yujian.wu1

hi,

There is still some work left to do, for example using srcu to protect the
multiple-ctx iteration to reduce mutex lock contention and making the percpu
thread aware of cpu hotplug, but I'm sending it now for some early comments.
Thanks in advance!

Regards,
Xiaoguang Wang



* Re: [PATCH] io_uring: create percpu io sq thread when IORING_SETUP_SQ_AFF is flagged
  2020-05-20 11:56 [PATCH] io_uring: create percpu io sq thread when IORING_SETUP_SQ_AFF is flagged Xiaoguang Wang
  2020-05-20 12:11 ` Xiaoguang Wang
@ 2020-05-20 22:09 ` Pavel Begunkov
  2020-05-22  8:33   ` Xiaoguang Wang
  1 sibling, 1 reply; 11+ messages in thread
From: Pavel Begunkov @ 2020-05-20 22:09 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: axboe, joseph.qi, yujian.wu1

On 20/05/2020 14:56, Xiaoguang Wang wrote:
> Currently we can create multiple io_uring instances which all have SQPOLL
> enabled and bind them to the same cpu core by setting the sq_thread_cpu
> argument, but sometimes this isn't efficient. Imagine an extreme case: create
> two io_uring instances, both with SQPOLL enabled and bound to the same cpu
> core. One instance submits io every 500ms, the other submits io continually;
> then obviously the 1st instance still constantly contends for cpu time, which
> will impact the 2nd instance.
> 
> I have constructed a test case to evaluate the impact:
>     1, a test program which creates one io_uring instance with SQPOLL
> enabled, bound to cpu core 61.
>     2, sudo taskset -c 60 fio  -name=fiotest -filename=/dev/nvme0n1 -iodepth=128
> -thread -rw=read -ioengine=io_uring  -hipri=0 -sqthread_poll=1  -sqthread_poll_cpu=61
> -registerfiles=1 -direct=1 -bs=4k -size=10G -numjobs=1  -time_based -runtime=120
> Note that both io_uring instances have sq_thread_cpu set to cpu core 61.
> 
> With the current kernel, running the above two jobs concurrently, the 2nd job gets:
> Run status group 0 (all jobs):
>    READ: bw=1623MiB/s (1701MB/s), 1623MiB/s-1623MiB/s (1701MB/s-1701MB/s),
> io=190GiB (204GB), run=120010-120010msec
> 
> We can see that the 1st instance has a huge impact on the 2nd instance. I think
> it's because of unordered competition for the same cpu resource: every instance
> gets roughly 50% cpu usage.
> 
> To fix this issue, when an io_uring instance uses IORING_SETUP_SQ_AFF to specify
> a cpu, we create a percpu io sq_thread to handle multiple io_uring instances' io
> requests serially. With this patch, in the same environment, we get a huge improvement:

Consider that a user can issue a long sequence of requests, say 2^15 of them,
and all of them will happen in a single call to io_submit_sqes(). Just preparing
them would take much time, apart from that it can do some effective work for
each of them, e.g. copying a page. And now there is another io_uring, caring
much about latencies and nevertheless waiting for _very_ long for its turn.

Another problem is that with it a user can't even guess when its SQ would be
emptied, and would need to constantly poll.

In essence, the problem is in bypassing thread scheduling, and re-implementing
poor man's version of it (round robin by io_uring).
The idea and the reasoning are compelling, but I think we need to do something
about unrelated io_uring instances obstructing each other. At least not making
it mandatory behaviour.

E.g. it's totally fine by me if a sqpoll kthread is shared between a specified
bunch of io_urings -- the user would be responsible for binding them and not
screwing up latencies/whatever. Most probably there won't be many (privileged)
users using SQPOLL, and they'll all be part of a single app, e.g. with
multiple/per-thread io_urings.
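
To make that shape concrete: struct io_uring_params already carries a wq_fd field
used with IORING_SETUP_ATTACH_WQ to share the io-wq backend between rings, and a
ring could in principle opt into another ring's sqpoll kthread through the same
mechanism. A minimal sketch assuming liburing, with the sq-thread-sharing semantics
being purely hypothetical at this point in the discussion:

#include <string.h>
#include <liburing.h>

/*
 * Hypothetical: create a new SQPOLL ring that shares the sq thread of an
 * already-created SQPOLL ring identified by its fd.  Today ATTACH_WQ only
 * shares io-wq; extending it to the sq thread is the idea being floated.
 */
static int create_attached_ring(struct io_uring *ring, int existing_ring_fd)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_ATTACH_WQ;
	p.wq_fd = existing_ring_fd;
	return io_uring_queue_init_params(128, ring, &p);
}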

Another way would be to switch between io_urings faster, e.g. after processing
not all requests but 1 or some N of them. But that's very thin ice, and I
already see another bag of issues.
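
As a very rough illustration of that direction, reusing the helpers introduced by
this patch (pseudo-kernel code only, not a tested implementation; the budget is
arbitrary and the iopoll/creds/idle handling is omitted):

/* cap how much one ctx may submit before moving on to the next one */
#define SQ_PERCPU_BUDGET	8

list_for_each_entry_safe(ctx, tmp, &t->ctx_list, node) {
	unsigned int to_submit = io_sqring_entries(ctx);

	if (to_submit > SQ_PERCPU_BUDGET)
		to_submit = SQ_PERCPU_BUDGET;
	if (to_submit) {
		mutex_lock(&ctx->uring_lock);
		ctx->submit_status = io_submit_sqes(ctx, to_submit,
						    NULL, -1, true);
		mutex_unlock(&ctx->uring_lock);
	}
}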

Ideas?



-- 
Pavel Begunkov


* Re: [PATCH] io_uring: create percpu io sq thread when IORING_SETUP_SQ_AFF is flagged
  2020-05-20 22:09 ` Pavel Begunkov
@ 2020-05-22  8:33   ` Xiaoguang Wang
  2020-05-24 11:46     ` Pavel Begunkov
  0 siblings, 1 reply; 11+ messages in thread
From: Xiaoguang Wang @ 2020-05-22  8:33 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring; +Cc: axboe, joseph.qi, yujian.wu1

hi Pavel,

First thanks for reviewing!
> On 20/05/2020 14:56, Xiaoguang Wang wrote:
>>
>> To fix this issue, when an io_uring instance uses IORING_SETUP_SQ_AFF to specify
>> a cpu, we create a percpu io sq_thread to handle multiple io_uring instances' io
>> requests serially. With this patch, in the same environment, we get a huge improvement:
> 
> Consider that a user can issue a long sequence of requests, say 2^15 of them,
> and all of them will happen in a single call to io_submit_sqes(). Just preparing
> them would take much time, apart from that it can do some effective work for
> each of them, e.g. copying a page. And now there is another io_uring, caring
> much about latencies and nevertheless waiting for _very_ long for its turn.
Indeed I had thought about this case before and don't clearly know how to optimize it yet.
But I think if users have such usage scenarios, they probably shouldn't make those io_uring
instances share the same cpu core. I'd like to explain more about our usage scenario.
In a physical machine, say there are 96 cores, and it runs multiple cgroups; every
cgroup runs the same application and monopolizes 16 cpu cores. This application will
create 16 io threads, every io thread will create an io_uring instance, and every
thread will be bound to a different cpu core; these io threads will receive io requests.
If we enable SQPOLL for these io threads, we allocate at most one or two cpu cores for
these io_uring instances, so they must share the allocated cpu cores. It's a total
disaster that some io_uring instances' busy looping during their sq_thread_idle period
impacts other io_uring instances which have io requests to handle.
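
Roughly, each of those 16 io threads sets itself up like the sketch below (not our
actual application code; it assumes liburing, and the cpu numbers are placeholders
chosen by the caller):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <string.h>
#include <liburing.h>

/*
 * worker_cpu: the core this io thread runs on; sqpoll_cpu: one of the one or
 * two cores reserved for sq threads -- every io_uring instance of the
 * application sets the same sq_thread_cpu, which is exactly the sharing case
 * this patch targets.
 */
static int setup_io_thread(struct io_uring *ring, int worker_cpu, int sqpoll_cpu)
{
	struct io_uring_params p;
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(worker_cpu, &set);
	pthread_setaffinity_np(pthread_self(), sizeof(set), &set);

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF;
	p.sq_thread_cpu = sqpoll_cpu;
	p.sq_thread_idle = 1000;	/* ms, arbitrary */

	return io_uring_queue_init_params(256, ring, &p);
}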

> 
> Another problem is that with it a user can't even guess when its SQ would be
> emptied, and would need to constantly poll.
In this patch, in every iteration we only handle io requests that are already
queued; we do not constantly poll.

> 
> In essence, the problem is in bypassing thread scheduling, and re-implementing
> poor man's version of it (round robin by io_uring).
Yes :) Currently I use a round-robin strategy to handle the multiple io_uring
instances in every iteration.

> The idea and the reasoning are compelling, but I think we need to do something
> about unrelated io_uring instances obstructing each other. At least not making
> it mandatory behaviour.
> 
> E.g. it's totally fine by me if a sqpoll kthread is shared between a specified
> bunch of io_urings -- the user would be responsible for binding them and not
> screwing up latencies/whatever. Most probably there won't be many (privileged)
> users using SQPOLL, and they'll all be part of a single app, e.g. with
> multiple/per-thread io_urings.
Did you read my patch? In this patch, I have implemented this idea :)

> 
> Another way would be to switch between io_urings faster, e.g. after processing
> not all requests but 1 or some N of them. But that's very thin ice, and I
> already see another bag of issues.
Sounds good, could you please list the detailed issues? Thanks.

Regards,
Xiaoguang Wang


* Re: [PATCH] io_uring: create percpu io sq thread when IORING_SETUP_SQ_AFF is flagged
  2020-05-20 12:11 ` Xiaoguang Wang
@ 2020-05-22 11:17   ` Yu Jian Wu
  2020-05-25  3:16     ` Xiaoguang Wang
  0 siblings, 1 reply; 11+ messages in thread
From: Yu Jian Wu @ 2020-05-22 11:17 UTC (permalink / raw)
  To: Xiaoguang Wang; +Cc: io-uring, axboe, joseph.qi, asml.silence

On Wed, May 20, 2020 at 08:11:04PM +0800, Xiaoguang Wang wrote:
> hi,
> 
> There're still some left work to do, fox example, use srcu to protect multiple
> ctxs iteration to reduce mutex lock competition, make percpu thread aware of
> cpu hotplug, but I send it now for some early comments, thanks in advance!
> 
> Regards,
> Xiaoguang Wang

Hi,

Thanks for doing this! I'm speaking as someone who tried this myself and
really struggled with a few parts.

A few comments below.

W.r.t. Pavel's comments on how to do this fairly, I think the only way for
this multi-ctx thread to handle all the ctxs fairly is to queue all of the
work and run it async rather than inline.
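
Very roughly, something along these lines (only a sketch to show the idea;
io_submit_sqes_async_only() is a made-up helper, not an existing function):

static int process_ctx_async(struct io_ring_ctx *ctx)
{
	unsigned int to_submit = io_sqring_entries(ctx);
	int ret = 0;

	if (to_submit) {
		mutex_lock(&ctx->uring_lock);
		if (likely(!percpu_ref_is_dying(&ctx->refs)))
			/*
			 * made-up helper: turn every SQE into an io-wq work
			 * item (e.g. by forcing REQ_F_FORCE_ASYNC) instead of
			 * issuing it inline, so one busy ctx can't stall the
			 * other ctxs on this thread for long
			 */
			ret = io_submit_sqes_async_only(ctx, to_submit);
		mutex_unlock(&ctx->uring_lock);
	}
	return ret;
}

The per-cpu thread then only pays the cost of queueing, and fairness between
ctxs falls back on io-wq.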

> 
> > Currently we can create multiple io_uring instances which all have SQPOLL
> > enabled and make them bound to same cpu core by setting sq_thread_cpu argument,
> > but sometimes this isn't efficient. Imagine such extreme case, create two io
> > uring instances, which both have SQPOLL enabled and are bound to same cpu core.
> > One instance submits io per 500ms, another instance submits io continually,
> > then obviously the 1st instance still always contend for cpu resource, which
> > will impact 2nd instance.
> > 
> > I have constructed tests case to the evaluate impact:
> >      1, a test progarm which creates one io_uring instance with SQPOLL
> > enabled, and bound to cpu core 61.
> >      2, sudo taskset -c 60 fio  -name=fiotest -filename=/dev/nvme0n1 -iodepth=128
> > -thread -rw=read -ioengine=io_uring  -hipri=0 -sqthread_poll=1  -sqthread_poll_cpu=61
> > -registerfiles=1 -direct=1 -bs=4k -size=10G -numjobs=1  -time_based -runtime=120
> > Note both io_uring instance's sq_thread_cpu are bound to cpu core 61.
> > 
> > In current kernel, run above two jobs concurrently, and the 2nd job will get:
> > Run status group 0 (all jobs):
> >     READ: bw=1623MiB/s (1701MB/s), 1623MiB/s-1623MiB/s (1701MB/s-1701MB/s),
> > io=190GiB (204GB), run=120010-120010msec
> > 
> > We can see that 1st instance has a huge impact to 2nd instance, I think it's
> > because of unordered competition for same cpu resource, every instance gets
> > 50% cpu usage.
> > 
> > To fix this issue, when io_uring instance uses IORING_SETUP_SQ_AFF to specify a
> > cpu,  we create a percpu io sq_thread to handle multiple io_uring instances' io
> > requests serially. With this patch, in same environment, we get a huge improvement:
> > Run status group 0 (all jobs):
> >     READ: bw=3231MiB/s (3388MB/s), 3231MiB/s-3231MiB/s (3388MB/s-3388MB/s),
> > io=379GiB (407GB), run=120001-120001msec
> > 
> > Link: https://lore.kernel.org/io-uring/c94098d2-279e-a552-91ec-8a8f177d770a@linux.alibaba.com/T/#t
> > Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
> > ---
> >   fs/io_uring.c | 257 +++++++++++++++++++++++++++++++++++++++++++++-----
> >   1 file changed, 232 insertions(+), 25 deletions(-)
> > 
> > diff --git a/fs/io_uring.c b/fs/io_uring.c
> > index ed0c22eb9808..e49ff1f67681 100644
> > --- a/fs/io_uring.c
> > +++ b/fs/io_uring.c
> > @@ -258,8 +258,13 @@ struct io_ring_ctx {
> >   	/* IO offload */
> >   	struct io_wq		*io_wq;
> >   	struct task_struct	*sqo_thread;	/* if using sq thread polling */
> > +	wait_queue_head_t	*sqo_wait;
> > +	int			submit_status;
> > +	int			sq_thread_cpu;
> > +	struct list_head	node;
> > +
> >   	struct mm_struct	*sqo_mm;
> > -	wait_queue_head_t	sqo_wait;
> > +	wait_queue_head_t	__sqo_wait;
> >   	/*
> >   	 * If used, fixed file set. Writers must ensure that ->refs is dead,
> > @@ -330,6 +335,16 @@ struct io_ring_ctx {
> >   	struct work_struct		exit_work;
> >   };
> > +struct io_percpu_thread {
> > +	struct list_head ctx_list;
> > +	wait_queue_head_t sqo_percpu_wait;
> > +	struct mutex lock;
> > +	struct task_struct *sqo_thread;
> > +	unsigned int sq_thread_idle;
> > +};
> > +
> > +static struct io_percpu_thread __percpu *percpu_threads;
> > +
> >   /*
> >    * First field must be the file pointer in all the
> >    * iocb unions! See also 'struct kiocb' in <linux/fs.h>
> > @@ -926,9 +941,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
> >   		goto err;
> >   	ctx->flags = p->flags;
> > -	init_waitqueue_head(&ctx->sqo_wait);
> > +	init_waitqueue_head(&ctx->__sqo_wait);
> > +	ctx->sqo_wait = &ctx->__sqo_wait;
> >   	init_waitqueue_head(&ctx->cq_wait);
> >   	INIT_LIST_HEAD(&ctx->cq_overflow_list);
> > +	INIT_LIST_HEAD(&ctx->node);
> >   	init_completion(&ctx->completions[0]);
> >   	init_completion(&ctx->completions[1]);
> >   	idr_init(&ctx->io_buffer_idr);
> > @@ -1157,8 +1174,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
> >   {
> >   	if (waitqueue_active(&ctx->wait))
> >   		wake_up(&ctx->wait);
> > -	if (waitqueue_active(&ctx->sqo_wait))
> > -		wake_up(&ctx->sqo_wait);
> > +	if (waitqueue_active(ctx->sqo_wait))
> > +		wake_up(ctx->sqo_wait);
> >   	if (io_should_trigger_evfd(ctx))
> >   		eventfd_signal(ctx->cq_ev_fd, 1);
> >   }
> > @@ -1980,8 +1997,8 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
> >   		list_add_tail(&req->list, &ctx->poll_list);
> >   	if ((ctx->flags & IORING_SETUP_SQPOLL) &&
> > -	    wq_has_sleeper(&ctx->sqo_wait))
> > -		wake_up(&ctx->sqo_wait);
> > +	    wq_has_sleeper(ctx->sqo_wait))
> > +		wake_up(ctx->sqo_wait);
> >   }
> >   static void io_file_put(struct io_submit_state *state)
> > @@ -5994,7 +6011,7 @@ static int io_sq_thread(void *data)
> >   				continue;
> >   			}
> > -			prepare_to_wait(&ctx->sqo_wait, &wait,
> > +			prepare_to_wait(ctx->sqo_wait, &wait,
> >   						TASK_INTERRUPTIBLE);
> >   			/*
> > @@ -6006,7 +6023,7 @@ static int io_sq_thread(void *data)
> >   			 */
> >   			if ((ctx->flags & IORING_SETUP_IOPOLL) &&
> >   			    !list_empty_careful(&ctx->poll_list)) {
> > -				finish_wait(&ctx->sqo_wait, &wait);
> > +				finish_wait(ctx->sqo_wait, &wait);
> >   				continue;
> >   			}
> > @@ -6018,23 +6035,23 @@ static int io_sq_thread(void *data)
> >   			to_submit = io_sqring_entries(ctx);
> >   			if (!to_submit || ret == -EBUSY) {
> >   				if (kthread_should_park()) {
> > -					finish_wait(&ctx->sqo_wait, &wait);
> > +					finish_wait(ctx->sqo_wait, &wait);
> >   					break;
> >   				}
> >   				if (current->task_works) {
> >   					task_work_run();
> > -					finish_wait(&ctx->sqo_wait, &wait);
> > +					finish_wait(ctx->sqo_wait, &wait);
> >   					continue;
> >   				}
> >   				if (signal_pending(current))
> >   					flush_signals(current);
> >   				schedule();
> > -				finish_wait(&ctx->sqo_wait, &wait);
> > +				finish_wait(ctx->sqo_wait, &wait);
> >   				ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
> >   				continue;
> >   			}
> > -			finish_wait(&ctx->sqo_wait, &wait);
> > +			finish_wait(ctx->sqo_wait, &wait);
> >   			ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
> >   		}
> > @@ -6058,6 +6075,133 @@ static int io_sq_thread(void *data)
> >   	return 0;
> >   }
> > +static int process_ctx(struct io_ring_ctx *ctx)
> > +{
> > +	unsigned int to_submit;
> > +	int ret = 0;
> > +
> > +	if (!list_empty(&ctx->poll_list)) {
> > +		unsigned nr_events = 0;
> > +
> > +		mutex_lock(&ctx->uring_lock);
> > +		if (!list_empty(&ctx->poll_list))
> > +			io_iopoll_getevents(ctx, &nr_events, 0);
> > +		mutex_unlock(&ctx->uring_lock);
> > +	}
> > +
> > +	to_submit = io_sqring_entries(ctx);
> > +	if (to_submit) {
> > +		mutex_lock(&ctx->uring_lock);
> > +		if (likely(!percpu_ref_is_dying(&ctx->refs)))
> > +			ret = io_submit_sqes(ctx, to_submit, NULL, -1, true);
> > +		mutex_unlock(&ctx->uring_lock);
> > +	}
> > +
> > +	if (current->task_works)
> > +		task_work_run();
> > +
> > +	io_sq_thread_drop_mm(ctx);
> > +	return ret;
> > +}
> > +
> > +static int io_sq_percpu_thread(void *data)
> > +{
> > +	struct io_percpu_thread *t = data;
> > +	struct io_ring_ctx *ctx, *tmp;
> > +	mm_segment_t old_fs;
> > +	const struct cred *saved_creds, *cur_creds, *old_creds;
> > +	unsigned long timeout;
> > +	DEFINE_WAIT(wait);
> > +	int iters = 0;
> > +
> > +	timeout = jiffies + t->sq_thread_idle;
> > +	old_fs = get_fs();
> > +	set_fs(USER_DS);
> > +	saved_creds = cur_creds = NULL;
> > +	while (!kthread_should_park()) {
> > +		bool continue_run;
> > +		bool needs_wait;
> > +		unsigned int to_submit;
> > +
> > +		mutex_lock(&t->lock);
> > +again:
> > +		continue_run = false;
> > +		list_for_each_entry_safe(ctx, tmp, &t->ctx_list, node) {
> > +			if (cur_creds != ctx->creds) {
> > +				old_creds = override_creds(ctx->creds);
> > +				cur_creds = ctx->creds;
> > +				if (saved_creds)
> > +					put_cred(old_creds);
> > +				else
> > +					saved_creds = old_creds;
> > +			}
> > +			ctx->submit_status = process_ctx(ctx);
> > +
> > +			to_submit = io_sqring_entries(ctx);
> > +			if (!continue_run &&
> > +			    ((to_submit && ctx->submit_status != -EBUSY) ||
> > +			    !list_empty(&ctx->poll_list)))
> > +				continue_run = true;
> > +		}
> > +		if (continue_run && (++iters & 7)) {
> > +			timeout = jiffies + t->sq_thread_idle;
> > +			goto again;
> > +		}
> > +		mutex_unlock(&t->lock);
> > +		if (continue_run) {
> > +			timeout = jiffies + t->sq_thread_idle;
> > +			continue;
> > +		}
> > +		if (!time_after(jiffies, timeout)) {
> > +			cond_resched();
> > +			continue;
> > +		}
> > +
> > +		needs_wait = true;
> > +		prepare_to_wait(&t->sqo_percpu_wait, &wait, TASK_INTERRUPTIBLE);
> > +		mutex_lock(&t->lock);
> > +		list_for_each_entry_safe(ctx, tmp, &t->ctx_list, node) {
> > +			if ((ctx->flags & IORING_SETUP_IOPOLL) &&
> > +			    !list_empty_careful(&ctx->poll_list)) {
> > +				needs_wait = false;
> > +				break;
> > +			}
> > +			to_submit = io_sqring_entries(ctx);

Unless I'm mistaken, I don't think these are submitted anywhere.

> > +			if (to_submit && ctx->submit_status != -EBUSY) {
> > +				needs_wait = false;
> > +				break;
> > +			}
> > +		}
> > +		if (needs_wait) {
> > +			list_for_each_entry_safe(ctx, tmp, &t->ctx_list, node)
> > +				ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
> > +			smp_mb();
> > +
> > +		}
> > +		mutex_unlock(&t->lock);
> > +
> > +		if (needs_wait) {
> > +			schedule();
> > +			mutex_lock(&t->lock);
> > +			list_for_each_entry_safe(ctx, tmp,
> > +						 &t->ctx_list, node)
> > +				ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
> > +			mutex_unlock(&t->lock);
> > +			finish_wait(&t->sqo_percpu_wait, &wait);
> > +		} else
> > +			finish_wait(&t->sqo_percpu_wait, &wait);
> > +		timeout = jiffies + t->sq_thread_idle;
> > +		cond_resched();
> > +	}
> > +
> > +	if (current->task_works)
> > +		task_work_run();
> > +	set_fs(old_fs);
> > +	revert_creds(saved_creds);
> > +	kthread_parkme();
> > +	return 0;
> > +}
> > +
> >   struct io_wait_queue {
> >   	struct wait_queue_entry wq;
> >   	struct io_ring_ctx *ctx;
> > @@ -6219,18 +6363,23 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
> >   	return 0;
> >   }
> > +static void destroy_io_percpu_thread(struct io_ring_ctx *ctx, int cpu);
> > +
> >   static void io_sq_thread_stop(struct io_ring_ctx *ctx)
> >   {
> >   	if (ctx->sqo_thread) {
> > -		wait_for_completion(&ctx->completions[1]);
> > -		/*
> > -		 * The park is a bit of a work-around, without it we get
> > -		 * warning spews on shutdown with SQPOLL set and affinity
> > -		 * set to a single CPU.
> > -		 */
> > -		kthread_park(ctx->sqo_thread);
> > -		kthread_stop(ctx->sqo_thread);
> > -		ctx->sqo_thread = NULL;
> > +		if (!(ctx->flags & IORING_SETUP_SQ_AFF)) {
> > +			wait_for_completion(&ctx->completions[1]);
> > +			/*
> > +			 * The park is a bit of a work-around, without it we get
> > +			 * warning spews on shutdown with SQPOLL set and affinity
> > +			 * set to a single CPU.
> > +			 */
> > +			kthread_park(ctx->sqo_thread);
> > +			kthread_stop(ctx->sqo_thread);
> > +			ctx->sqo_thread = NULL;
> > +		} else
> > +			destroy_io_percpu_thread(ctx, ctx->sq_thread_cpu);
> >   	}
> >   }
> > @@ -6841,6 +6990,52 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
> >   	return ret;
> >   }
> > +static void create_io_percpu_thread(struct io_ring_ctx *ctx, int cpu)
> > +{
> > +	struct io_percpu_thread *t;
> > +
> > +	t = per_cpu_ptr(percpu_threads, cpu);
> > +	mutex_lock(&t->lock);
> > +	if (!t->sqo_thread) {
> > +		t->sqo_thread = kthread_create_on_cpu(io_sq_percpu_thread, t,
> > +					cpu, "io_uring_percpu-sq");
> > +		if (IS_ERR(t->sqo_thread)) {
> > +			ctx->sqo_thread = t->sqo_thread;
> > +			t->sqo_thread = NULL;
> > +			mutex_unlock(&t->lock);
> > +			return;
> > +		}
> > +	}
> > +
> > +	if (t->sq_thread_idle < ctx->sq_thread_idle)
> > +		t->sq_thread_idle = ctx->sq_thread_idle;

Is max really the best way to do this? 
Or should it be per ctx?

Suppose the first ctx has a sq_thread_idle of something very small and the
second has a sq_thread_idle of 500ms; this will cause the loop to keep
iterating over the first ctx even though it should have been considered idle
a long time ago.
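
Something like the fragment below is what I have in mind; it's only a sketch
against the loop above, and the per-ctx idle_deadline field is hypothetical:

	list_for_each_entry_safe(ctx, tmp, &t->ctx_list, node) {
		if (io_sqring_entries(ctx) ||
		    ((ctx->flags & IORING_SETUP_IOPOLL) &&
		     !list_empty(&ctx->poll_list))) {
			/* this ctx has work: handle it and refresh its own deadline */
			ctx->submit_status = process_ctx(ctx);
			ctx->idle_deadline = jiffies + ctx->sq_thread_idle;
			continue_run = true;
		} else if (!time_after(jiffies, ctx->idle_deadline)) {
			/* recently active: keep it in the busy-poll window */
			continue_run = true;
		}
		/* otherwise this ctx is idle and gets skipped until a wakeup */
	}

That way a ring with a tiny sq_thread_idle stops being polled on its own
schedule, instead of inheriting the largest idle period in the set.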

> > +	ctx->sqo_wait = &t->sqo_percpu_wait;
> > +	ctx->sq_thread_cpu = cpu;
> > +	list_add_tail(&ctx->node, &t->ctx_list);
> > +	ctx->sqo_thread = t->sqo_thread;
> > +	mutex_unlock(&t->lock);
> > +}
> > +
> > +static void destroy_io_percpu_thread(struct io_ring_ctx *ctx, int cpu)
> > +{
> > +	struct io_percpu_thread *t;
> > +	struct task_struct *sqo_thread = NULL;
> > +
> > +	t = per_cpu_ptr(percpu_threads, cpu);
> > +	mutex_lock(&t->lock);
> > +	list_del(&ctx->node);
> > +	if (list_empty(&t->ctx_list)) {
> > +		sqo_thread = t->sqo_thread;
> > +		t->sqo_thread = NULL;
> > +	}
> > +	mutex_unlock(&t->lock);
> > +
> > +	if (sqo_thread) {
> > +		kthread_park(sqo_thread);
> > +		kthread_stop(sqo_thread);
> > +	}
> > +}
> > +
> >   static int io_sq_offload_start(struct io_ring_ctx *ctx,
> >   			       struct io_uring_params *p)
> >   {
> > @@ -6867,9 +7062,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
> >   			if (!cpu_online(cpu))
> >   				goto err;
> > -			ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
> > -							ctx, cpu,
> > -							"io_uring-sq");
> > +			create_io_percpu_thread(ctx, cpu);
> >   		} else {
> >   			ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
> >   							"io_uring-sq");
> > @@ -7516,7 +7709,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
> >   		if (!list_empty_careful(&ctx->cq_overflow_list))
> >   			io_cqring_overflow_flush(ctx, false);
> >   		if (flags & IORING_ENTER_SQ_WAKEUP)
> > -			wake_up(&ctx->sqo_wait);
> > +			wake_up(ctx->sqo_wait);
> >   		submitted = to_submit;
> >   	} else if (to_submit) {
> >   		mutex_lock(&ctx->uring_lock);
> > @@ -8102,6 +8295,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
> >   static int __init io_uring_init(void)
> >   {
> > +	int cpu;
> > +
> >   #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
> >   	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
> >   	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
> > @@ -8141,6 +8336,18 @@ static int __init io_uring_init(void)
> >   	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
> >   	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
> >   	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
> > +
> > +	percpu_threads = alloc_percpu(struct io_percpu_thread);
> > +	for_each_possible_cpu(cpu) {
> > +		struct io_percpu_thread *t;
> > +
> > +		t = per_cpu_ptr(percpu_threads, cpu);
> > +		INIT_LIST_HEAD(&t->ctx_list);
> > +		init_waitqueue_head(&t->sqo_percpu_wait);
> > +		mutex_init(&t->lock);
> > +		t->sqo_thread = NULL;
> > +		t->sq_thread_idle = 0;
> > +	}
> >   	return 0;
> >   };
> >   __initcall(io_uring_init);
> > 
Thanks!


* Re: [PATCH] io_uring: create percpu io sq thread when IORING_SETUP_SQ_AFF is flagged
  2020-05-22  8:33   ` Xiaoguang Wang
@ 2020-05-24 11:46     ` Pavel Begunkov
  2020-05-26 14:42       ` Xiaoguang Wang
  2020-05-28  7:56       ` Xiaoguang Wang
  0 siblings, 2 replies; 11+ messages in thread
From: Pavel Begunkov @ 2020-05-24 11:46 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: axboe, joseph.qi, yujian.wu1

On 22/05/2020 11:33, Xiaoguang Wang wrote:
> hi Pavel,
> 
> First thanks for reviewing!

sure, you're welcome!

>> On 20/05/2020 14:56, Xiaoguang Wang wrote:
>>>
>>> To fix this issue, when io_uring instance uses IORING_SETUP_SQ_AFF to specify a
>>> cpu,  we create a percpu io sq_thread to handle multiple io_uring instances' io
>>> requests serially. With this patch, in same environment, we get a huge
>>> improvement:
>>
>> Consider that a user can issue a long sequence of requests, say 2^15 of them,
>> and all of them will happen in a single call to io_submit_sqes(). Just preparing
>> them would take much time, apart from that it can do some effective work for
>> each of them, e.g. copying a page. And now there is another io_uring, caring
>> much about latencies and nevertheless waiting for _very_ long for its turn.
> Indeed I had thought this case before and don't clearly know how to optimize it
> yet.
> But I think if user has above usage scenarios, they probably shouldn't make
> io_uring
> instances share same cpu core. I'd like to explain more about our usage scenarios.

IIRC, you create _globally_ available per-cpu threads, so 2 totally independent
users may greatly affect each other, e.g. 2 docker containers in a hosted
server, and that would be nasty. But if it's limited to a "single user", it'd be
fine by me, see below for more details.

BTW, out of curiosity, what's the performance/latency impact of disabling SQPOLL
entirely for your app? Say, compared with the non-contended case.

> In a physical machine, say there are 96 cores, and it runs multiple cgroups, every
> cgroup run same application and will monopoly 16 cpu cores. This application will
> create 16 io threads and every io thread will create an io_uring instance and every
> thread will be bound to a different cpu core, these io threads will receive io
> requests.
> If we enable SQPOLL for these io threads, we allocate one or two cpu cores for
> these
> io_uring instances at most, so they must share allocated cpu core. It's totally
> disaster
> that some io_uring instances' busy loop in their sq_thread_idle period will
> impact other
> io_uring instances which have io requests to handle.
> 
>>
>> Another problem is that with it a user can't even guess when its SQ would be
>> emptied, and would need to constantly poll.
> In this patch, in every iteration, we only handle io requests already queued,
> will not constantly poll.

I was talking about a user polling _from userspace_ its full SQ, to understand
when it can submit more. That's if it doesn't want to wait for CQEs yet for some
reason (e.g. useful for net apps).

> 
>>
>> In essence, the problem is in bypassing thread scheduling, and re-implementing
>> poor man's version of it (round robin by io_uring).
> Yes :) Currently I use round robin strategy to handle multiple io_uring instance
> in every iteration.
> 
>> The idea and the reasoning are compelling, but I think we need to do something
>> about unrelated io_uring instances obstructing each other. At least not making
>> it mandatory behaviour.
>>
>> E.g. it's totally fine by me, if a sqpoll kthread is shared between specified
>> bunch of io_urings -- the user would be responsible for binding them and not
>> screwing up latencies/whatever. Most probably there won't be much (priviledged)
>> users using SQPOLL, and they all be a part of a single app, e.g. with
>> multiple/per-thread io_urings.
> Did you read my patch? In this patch, I have implemented this idea :)

Took a glance, may have overlooked things. I meant to do it as in your patch, but
not sharing threads between ALL io_urings in the system, rather between a
specified set of them. In other words, making your @percpu_threads not global,
but binding it to a set of io_urings.

e.g. create 2 independent per-cpu sets of threads. 1st one for {a1,a2,a3}, 2nd
for {b1,b2,b3}.

a1 = create_uring()
a2 = create_uring(shared_sq_threads=a1)
a3 = create_uring(shared_sq_threads=a1)

b1 = create_uring()
b2 = create_uring(shared_sq_threads=b1)
b3 = create_uring(shared_sq_threads=b1)

And then:
- it somehow solves the problem. As long as it doesn't affect other users much,
it's ok to let userspace screw itself by submitting 2^16 requests.

- there is still a problem with a simple round robin, e.g. >100 io_urings per
such set. Even though a user may decide that for itself, it's worth thinking
about. I don't want another scheduling framework here, e.g. first round-robin,
then a weighted one, etc.

- it's actually not a set of threads (i.e. per-cpu) that the API should operate on,
but just binding io_urings to a single SQPOLL thread.

- there is no need to restrict it to the cpu-pinned case.

>>
>> Another way would be to switch between io_urings faster, e.g. after processing
>> not all requests but 1 or some N of them. But that's very thin ice, and I
>> already see other bag of issues.
> Sounds good, could you please lift detailed issues? Thanks.

Sounds terrible, TBH. Especially with arbitrary length links.


-- 
Pavel Begunkov


* Re: [PATCH] io_uring: create percpu io sq thread when IORING_SETUP_SQ_AFF is flagged
  2020-05-22 11:17   ` Yu Jian Wu
@ 2020-05-25  3:16     ` Xiaoguang Wang
  0 siblings, 0 replies; 11+ messages in thread
From: Xiaoguang Wang @ 2020-05-25  3:16 UTC (permalink / raw)
  To: Yu Jian Wu; +Cc: io-uring, axboe, joseph.qi, asml.silence

hi,

> On Wed, May 20, 2020 at 08:11:04PM +0800, Xiaoguang Wang wrote:
>> hi,
>>
>> There're still some left work to do, fox example, use srcu to protect multiple
>> ctxs iteration to reduce mutex lock competition, make percpu thread aware of
>> cpu hotplug, but I send it now for some early comments, thanks in advance!
>>
>> Regards,
>> Xiaoguang Wang
> 
> Hi,
> 
> Thanks for doing this!
> Speaking as someone who tried this and really struggled with a few parts.
> 
> Few comments below.
> 
> W.r.t Pavel's comments on how to do this fairly, I think the only way is
> for this multiple ctx thread to handle all the ctxs fairly is to queue all
> the work and do it async rather than inline.
It looks like you mean that every io_sq_thread would always use REQ_F_FORCE_ASYNC
to submit reqs. I'm not sure that's efficient. Queuing work items to io-wq should
be a fast job, which means io_sq_thread would mostly just busy-loop apart from the
queuing, wasting the cpu resource the io_sq_thread is bound to. What I want to
express is that io_sq_thread should do some real work, not just queue the work.

>>> +		needs_wait = true;
>>> +		prepare_to_wait(&t->sqo_percpu_wait, &wait, TASK_INTERRUPTIBLE);
>>> +		mutex_lock(&t->lock);
>>> +		list_for_each_entry_safe(ctx, tmp, &t->ctx_list, node) {
>>> +			if ((ctx->flags & IORING_SETUP_IOPOLL) &&
>>> +			    !list_empty_careful(&ctx->poll_list)) {
>>> +				needs_wait = false;
>>> +				break;
>>> +			}
>>> +			to_submit = io_sqring_entries(ctx);
> 
> Unless I'm mistaken, I don't think these are submitted anywhere.
Yes, before io_sq_thread goes to sleep, it checks whether any ctx has
new sqes to handle; if "to_submit" is greater than zero, io_sq_thread will
skip the sleep and continue to handle these new sqes.

> 
>>> +			if (to_submit && ctx->submit_status != -EBUSY) {
>>> @@ -6841,6 +6990,52 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
>>>    	return ret;
>>>    }
>>> +static void create_io_percpu_thread(struct io_ring_ctx *ctx, int cpu)
>>> +{
>>> +	struct io_percpu_thread *t;
>>> +
>>> +	t = per_cpu_ptr(percpu_threads, cpu);
>>> +	mutex_lock(&t->lock);
>>> +	if (!t->sqo_thread) {
>>> +		t->sqo_thread = kthread_create_on_cpu(io_sq_percpu_thread, t,
>>> +					cpu, "io_uring_percpu-sq");
>>> +		if (IS_ERR(t->sqo_thread)) {
>>> +			ctx->sqo_thread = t->sqo_thread;
>>> +			t->sqo_thread = NULL;
>>> +			mutex_unlock(&t->lock);
>>> +			return;
>>> +		}
>>> +	}
>>> +
>>> +	if (t->sq_thread_idle < ctx->sq_thread_idle)
>>> +		t->sq_thread_idle = ctx->sq_thread_idle;
> 
> Is max really the best way to do this?
> Or should it be per ctx?
Because these ctxs share the same io_sq_thread, and sq_thread_idle controls when
io_sq_thread may go to sleep, I think we should choose the max value.

Regards,
Xiaoguang Wang

> 
> Suppose the first ctx has a
> sq_thread_idle of something very small, and the second has a
> sq_thread_idle of 500ms, this will cause the loop to iterate over the
> first ctx even though it should have been considered idle a long time
> ago.

> 
>>> +	ctx->sqo_wait = &t->sqo_percpu_wait;
>>> +	ctx->sq_thread_cpu = cpu;
>>> +	list_add_tail(&ctx->node, &t->ctx_list);
>>> +	ctx->sqo_thread = t->sqo_thread;
>>> +	mutex_unlock(&t->lock);
>>> +}
>>> +
>>> +static void destroy_io_percpu_thread(struct io_ring_ctx *ctx, int cpu)
>>> +{
>>> +	struct io_percpu_thread *t;
>>> +	struct task_struct *sqo_thread = NULL;
>>> +
>>> +	t = per_cpu_ptr(percpu_threads, cpu);
>>> +	mutex_lock(&t->lock);
>>> +	list_del(&ctx->node);
>>> +	if (list_empty(&t->ctx_list)) {
>>> +		sqo_thread = t->sqo_thread;
>>> +		t->sqo_thread = NULL;
>>> +	}
>>> +	mutex_unlock(&t->lock);
>>> +
>>> +	if (sqo_thread) {
>>> +		kthread_park(sqo_thread);
>>> +		kthread_stop(sqo_thread);
>>> +	}
>>> +}
>>> +
>>>    static int io_sq_offload_start(struct io_ring_ctx *ctx,
>>>    			       struct io_uring_params *p)
>>>    {
>>> @@ -6867,9 +7062,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
>>>    			if (!cpu_online(cpu))
>>>    				goto err;
>>> -			ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
>>> -							ctx, cpu,
>>> -							"io_uring-sq");
>>> +			create_io_percpu_thread(ctx, cpu);
>>>    		} else {
>>>    			ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
>>>    							"io_uring-sq");
>>> @@ -7516,7 +7709,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
>>>    		if (!list_empty_careful(&ctx->cq_overflow_list))
>>>    			io_cqring_overflow_flush(ctx, false);
>>>    		if (flags & IORING_ENTER_SQ_WAKEUP)
>>> -			wake_up(&ctx->sqo_wait);
>>> +			wake_up(ctx->sqo_wait);
>>>    		submitted = to_submit;
>>>    	} else if (to_submit) {
>>>    		mutex_lock(&ctx->uring_lock);
>>> @@ -8102,6 +8295,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
>>>    static int __init io_uring_init(void)
>>>    {
>>> +	int cpu;
>>> +
>>>    #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
>>>    	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
>>>    	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
>>> @@ -8141,6 +8336,18 @@ static int __init io_uring_init(void)
>>>    	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
>>>    	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
>>>    	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
>>> +
>>> +	percpu_threads = alloc_percpu(struct io_percpu_thread);
>>> +	for_each_possible_cpu(cpu) {
>>> +		struct io_percpu_thread *t;
>>> +
>>> +		t = per_cpu_ptr(percpu_threads, cpu);
>>> +		INIT_LIST_HEAD(&t->ctx_list);
>>> +		init_waitqueue_head(&t->sqo_percpu_wait);
>>> +		mutex_init(&t->lock);
>>> +		t->sqo_thread = NULL;
>>> +		t->sq_thread_idle = 0;
>>> +	}
>>>    	return 0;
>>>    };
>>>    __initcall(io_uring_init);
>>>
> Thanks!
> 


* Re: [PATCH] io_uring: create percpu io sq thread when IORING_SETUP_SQ_AFF is flagged
  2020-05-24 11:46     ` Pavel Begunkov
@ 2020-05-26 14:42       ` Xiaoguang Wang
  2020-06-03 14:47         ` Pavel Begunkov
  2020-05-28  7:56       ` Xiaoguang Wang
  1 sibling, 1 reply; 11+ messages in thread
From: Xiaoguang Wang @ 2020-05-26 14:42 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring; +Cc: axboe, joseph.qi, yujian.wu1

hi,

> On 22/05/2020 11:33, Xiaoguang Wang wrote:
>> hi Pavel,
>>
>> First thanks for reviewing!
> 
> sure, you're welcome!
> 
>>> On 20/05/2020 14:56, Xiaoguang Wang wrote:
>>>>
>>>> To fix this issue, when io_uring instance uses IORING_SETUP_SQ_AFF to specify a
>>>> cpu,  we create a percpu io sq_thread to handle multiple io_uring instances' io
>>>> requests serially. With this patch, in same environment, we get a huge
>>>> improvement:
>>>
>>> Consider that a user can issue a long sequence of requests, say 2^15 of them,
>>> and all of them will happen in a single call to io_submit_sqes(). Just preparing
>>> them would take much time, apart from that it can do some effective work for
>>> each of them, e.g. copying a page. And now there is another io_uring, caring
>>> much about latencies and nevertheless waiting for _very_ long for its turn.
>> Indeed I had thought this case before and don't clearly know how to optimize it
>> yet.
>> But I think if user has above usage scenarios, they probably shouldn't make
>> io_uring
>> instances share same cpu core. I'd like to explain more about our usage scenarios.
> 
> IIRC, you create _globally_ available per-cpu threads, so 2 totally independent
> users may greatly affect each other, e.g. 2 docker containers in a hosted
> server, and that would be nasty. But if it's limited to a "single user", it'd be
> fine by me, see below for more details.
Not sure I follow you.
You say "2 totally independent users may greatly affect each other"; can you explain more?
With this patch, if user A's and user B's io_uring instances are bound to the same cpu core,
say 10, this patch will only create one io_sq_thread for cpu 10, and user A and user B
will share it. If their io_uring instances are bound to different cpu cores, I think user A
and user B won't affect each other.

> 
> BTW, out of curiosity, what's the performance\latency impact of disabling SQPOLL
> at all for your app? Say, comparing with non contended case.
Sorry, our application based on io_uring is still under development. When it's
ready later, I'll share more information.

> 
>> In a physical machine, say there are 96 cores, and it runs multiple cgroups, every
>> cgroup run same application and will monopoly 16 cpu cores. This application will
>> create 16 io threads and every io thread will create an io_uring instance and every
>> thread will be bound to a different cpu core, these io threads will receive io
>> requests.
>> If we enable SQPOLL for these io threads, we allocate one or two cpu cores for
>> these
>> io_uring instances at most, so they must share allocated cpu core. It's totally
>> disaster
>> that some io_uring instances' busy loop in their sq_thread_idle period will
>> impact other
>> io_uring instances which have io requests to handle.
>>
>>>
>>> Another problem is that with it a user can't even guess when its SQ would be
>>> emptied, and would need to constantly poll.
>> In this patch, in every iteration, we only handle io requests already queued,
>> will not constantly poll.
> 
> I was talking about a user polling _from userspace_ its full SQ, to understand
> when it can submit more. That's if it doesn't want to wait for CQEs yet for some
> reason (e.g. useful for net apps).
> 
>>
>>>
>>> In essence, the problem is in bypassing thread scheduling, and re-implementing
>>> poor man's version of it (round robin by io_uring).
>> Yes :) Currently I use round robin strategy to handle multiple io_uring instance
>> in every iteration.
>>
>>> The idea and the reasoning are compelling, but I think we need to do something
>>> about unrelated io_uring instances obstructing each other. At least not making
>>> it mandatory behaviour.
>>>
>>> E.g. it's totally fine by me, if a sqpoll kthread is shared between specified
>>> bunch of io_urings -- the user would be responsible for binding them and not
>>> screwing up latencies/whatever. Most probably there won't be much (priviledged)
>>> users using SQPOLL, and they all be a part of a single app, e.g. with
>>> multiple/per-thread io_urings.
>> Did you read my patch? In this patch, I have implemented this idea :)
> 
> Took a glance, may have overlooked things. I meant to do as in your patch, but
> not sharing threads between ALL io_uring in the system, but rather between a
Yes, I'm not trying to make all io_uring instances in the system share threads; I just
make io_uring instances that are bound to the same cpu core share one io_sq_thread,
which is created only once per cpu core.
Otherwise, with the current io_uring mainline code, we'd better not bind different io_uring
instances to the same cpu core: some instances' busy looping in their sq_thread_idle period
will impact other instances that currently have reqs to handle.

> specified set of them. In other words, making yours @percpu_threads not global,
> but rather binding to a set of io_urings.
> 
> e.g. create 2 independent per-cpu sets of threads. 1st one for {a1,a2,a3}, 2nd
> for {b1,b2,b3}.
> 
> a1 = create_uring()
> a2 = create_uring(shared_sq_threads=a1)
> a3 = create_uring(shared_sq_threads=a1)
> 
> b1 = create_uring()
> b2 = create_uring(shared_sq_threads=b1)
> b3 = create_uring(shared_sq_threads=b1)
Thanks for your suggestions, I'll try to consider them in a V2 patch.

But I still have one question: "shared_sq_threads=a1" and "shared_sq_threads=b1"
can now still be bound to the same cpu core? I think that's still not efficient.
E.g. "shared_sq_threads=a1" busy looping in its sq_thread_idle period will impact
"shared_sq_threads=b1", which currently has reqs to handle.

Now I also wonder whether an io_uring instance that has SQPOLL enabled but does not
specify a sq_thread_cpu can be used in a real business environment: the io_sq_thread
may run on any cpu core, which may affect any other application, e.g. through cpu
resource contention. So if we try to use SQPOLL, we'd better allocate a specific cpu.

Regards,
Xiaoguang Wang

> 
> And then:
> - it somehow solves the problem. As long as it doesn't effect much other users,
> it's ok to let userspace screw itself by submitting 2^16 requests.
> 
> - there is still a problem with a simple round robin. E.g. >100 io_urings per
> such set. Even though, a user may decide for itself, it worth to think about. I
> don't want another scheduling framework here. E.g. first round-robin, then
> weighted one, etc.
> 
> - it's actually not a set of threads (i.e. per-cpu) the API should operate on,
> but just binding io_urings to a single SQPOLL thread.
> 
> - there is no need to restrict it to cpu-pinned case.
> 
>>>
>>> Another way would be to switch between io_urings faster, e.g. after processing
>>> not all requests but 1 or some N of them. But that's very thin ice, and I
>>> already see other bag of issues.
>> Sounds good, could you please lift detailed issues? Thanks.
> 
> Sounds terrible, TBH. Especially with arbitrary length links.
> 
> 


* Re: [PATCH] io_uring: create percpu io sq thread when IORING_SETUP_SQ_AFF is flagged
  2020-05-24 11:46     ` Pavel Begunkov
  2020-05-26 14:42       ` Xiaoguang Wang
@ 2020-05-28  7:56       ` Xiaoguang Wang
  1 sibling, 0 replies; 11+ messages in thread
From: Xiaoguang Wang @ 2020-05-28  7:56 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring; +Cc: axboe, joseph.qi, yujian.wu1

hi Pavel,

> On 22/05/2020 11:33, Xiaoguang Wang wrote:
>> hi Pavel,
>>
>> First thanks for reviewing!
> 
> sure, you're welcome!
> 
>>> On 20/05/2020 14:56, Xiaoguang Wang wrote:
>>>>
>>>> To fix this issue, when io_uring instance uses IORING_SETUP_SQ_AFF to specify a
>>>> cpu,  we create a percpu io sq_thread to handle multiple io_uring instances' io
>>>> requests serially. With this patch, in same environment, we get a huge
>>>> improvement:
>>>
>>> Consider that a user can issue a long sequence of requests, say 2^15 of them,
>>> and all of them will happen in a single call to io_submit_sqes(). Just preparing
>>> them would take much time, apart from that it can do some effective work for
>>> each of them, e.g. copying a page. And now there is another io_uring, caring
>>> much about latencies and nevertheless waiting for _very_ long for its turn.
>> Indeed I had thought this case before and don't clearly know how to optimize it
>> yet.
>> But I think if user has above usage scenarios, they probably shouldn't make
>> io_uring
>> instances share same cpu core. I'd like to explain more about our usage scenarios.
> 
> IIRC, you create _globally_ available per-cpu threads, so 2 totally independent
> users may greatly affect each other, e.g. 2 docker containers in a hosted
> server, and that would be nasty. But if it's limited to a "single user", it'd be
> fine by me, see below for more details.
> 
> BTW, out of curiosity, what's the performance\latency impact of disabling SQPOLL
> at all for your app? Say, comparing with non contended case.
> 
>> In a physical machine, say there are 96 cores, and it runs multiple cgroups, every
>> cgroup run same application and will monopoly 16 cpu cores. This application will
>> create 16 io threads and every io thread will create an io_uring instance and every
>> thread will be bound to a different cpu core, these io threads will receive io
>> requests.
>> If we enable SQPOLL for these io threads, we allocate one or two cpu cores for
>> these
>> io_uring instances at most, so they must share allocated cpu core. It's totally
>> disaster
>> that some io_uring instances' busy loop in their sq_thread_idle period will
>> impact other
>> io_uring instances which have io requests to handle.
>>
>>>
>>> Another problem is that with it a user can't even guess when its SQ would be
>>> emptied, and would need to constantly poll.
>> In this patch, in every iteration, we only handle io requests already queued,
>> will not constantly poll.
> 
> I was talking about a user polling _from userspace_ its full SQ, to understand
> when it can submit more. That's if it doesn't want to wait for CQEs yet for some
> reason (e.g. useful for net apps).
> 
>>
>>>
>>> In essence, the problem is in bypassing thread scheduling, and re-implementing
>>> poor man's version of it (round robin by io_uring).
>> Yes :) Currently I use round robin strategy to handle multiple io_uring instance
>> in every iteration.
>>
>>> The idea and the reasoning are compelling, but I think we need to do something
>>> about unrelated io_uring instances obstructing each other. At least not making
>>> it mandatory behaviour.
>>>
>>> E.g. it's totally fine by me, if a sqpoll kthread is shared between specified
>>> bunch of io_urings -- the user would be responsible for binding them and not
>>> screwing up latencies/whatever. Most probably there won't be much (priviledged)
>>> users using SQPOLL, and they all be a part of a single app, e.g. with
>>> multiple/per-thread io_urings.
>> Did you read my patch? In this patch, I have implemented this idea :)
> 
> Took a glance, may have overlooked things. I meant to do as in your patch, but
> not sharing threads between ALL io_uring in the system, but rather between a
> specified set of them. In other words, making yours @percpu_threads not global,
> but rather binding to a set of io_urings.
> 
> e.g. create 2 independent per-cpu sets of threads. 1st one for {a1,a2,a3}, 2nd
> for {b1,b2,b3}.
> 
> a1 = create_uring()
> a2 = create_uring(shared_sq_threads=a1)
> a3 = create_uring(shared_sq_threads=a1)
> 
> b1 = create_uring()
> b2 = create_uring(shared_sq_threads=b1)
> b3 = create_uring(shared_sq_threads=b1)
Thinking about your suggestions some more, it seems you suggest the implementation
method io-wq adopts, which requires a valid wq_fd; that means we can only share io-wq
between io_uring instances belonging to the same process, and the same would hold for
io_sq_thread. With this method it is hard to share an io_sq_thread between processes.
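
For reference, a rough userspace-side sketch of that kind of single-process setup;
it assumes the existing IORING_SETUP_ATTACH_WQ/wq_fd interface could also be taught
to share the SQPOLL thread, which is not what it does today:

#include <string.h>
#include <liburing.h>

/* sketch: the first ring owns the sq thread, the others attach to it */
static int setup_ring_group(struct io_uring *rings, int nr, int cpu)
{
	struct io_uring_params p;
	int i, ret;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF;
	p.sq_thread_cpu = cpu;

	ret = io_uring_queue_init_params(256, &rings[0], &p);
	if (ret < 0)
		return ret;

	for (i = 1; i < nr; i++) {
		/* assumption: attaching would reuse ring 0's sq thread */
		p.flags |= IORING_SETUP_ATTACH_WQ;
		p.wq_fd = rings[0].ring_fd;
		ret = io_uring_queue_init_params(256, &rings[i], &p);
		if (ret < 0)
			return ret;
	}
	return 0;
}

Then every io thread of one application could create its own ring and attach to
the first one, without any global per-cpu state in the kernel.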

As I said in a previous mail, I guess io_uring instances that have SQPOLL enabled but
do not specify a valid sq_thread_cpu are not likely to be used in real business
environments, so I'd like to preserve the current design in my patch, which is simple
and only affects the cpu-pinned case.

What do you think? Thanks.

Regards,
Xiaoguang Wang

> 
> And then:
> - it somehow solves the problem. As long as it doesn't effect much other users,
> it's ok to let userspace screw itself by submitting 2^16 requests.
> 
> - there is still a problem with a simple round robin. E.g. >100 io_urings per
> such set. Even though, a user may decide for itself, it worth to think about. I
> don't want another scheduling framework here. E.g. first round-robin, then
> weighted one, etc.
> 
> - it's actually not a set of threads (i.e. per-cpu) the API should operate on,
> but just binding io_urings to a single SQPOLL thread.
> 
> - there is no need to restrict it to cpu-pinned case.
> 
>>>
>>> Another way would be to switch between io_urings faster, e.g. after processing
>>> not all requests but 1 or some N of them. But that's very thin ice, and I
>>> already see other bag of issues.
>> Sounds good, could you please lift detailed issues? Thanks.
> 
> Sounds terrible, TBH. Especially with arbitrary length links.
> 
> 


* Re: [PATCH] io_uring: create percpu io sq thread when IORING_SETUP_SQ_AFF is flagged
  2020-05-26 14:42       ` Xiaoguang Wang
@ 2020-06-03 14:47         ` Pavel Begunkov
  2020-06-03 18:48           ` Pavel Begunkov
  0 siblings, 1 reply; 11+ messages in thread
From: Pavel Begunkov @ 2020-06-03 14:47 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: axboe, joseph.qi, yujian.wu1

On 26/05/2020 17:42, Xiaoguang Wang wrote:
> Yes, I don't try to make all io_uring instances in the system share threads, I just
> make io_uring instances which are bound to same cpu core, share one io_sq_thread that
> only is created once for every cpu core.
> Otherwise in current io_uring mainline codes, we'd better not bind different io_uring
> instances to same cpu core,  some instances' busy loop in its sq_thread_idle period will
> impact other instanes who currently there are reqs to handle.

I got a bit carried away from your initial case, but there is this case:
let's say we have 2 unrelated apps that create SQPOLL io_uring instances, and for the
sake of argument they are in 2 different docker containers (and let's just assume a
docker container user can create such instances).

The first app (app1) submits 32K fat requests as described before. The second one (app2)
is a low-latency app that submits reqs one at a time but expects them to be picked up
really fast. And let's assume their SQPOLL threads are pinned to the same CPU.

1. old version:
The CPU spends some time allocated by the scheduler on the 32K requests of app1,
probably not issuing them all, but that's fine. And then it goes to app2.
So, the submit-to-pickup latency for app2 is capped by the task scheduler.
That's somewhat fair.

2. your version:
io_sq_thread first processes all 32K requests of app1, and only then goes to app2.
app2 is screwed, unfair as life can be. And a malicious user can create many io_uring
instances like app1's, so the latency will be multiplied further.


Any solution I can think of is ugly and won't ever land upstream: creating your
own scheduling framework for io_uring, wiring in some kind of cgroups, etc. And
actually SQPOLL shouldn't be so ubiquitous (+ it needs privileges). E.g. I expect
there will be a single app per system using it, e.g. a database consuming most of the
resources anyway. And that's why I think it's better not to try to solve your original
issue.


However, what the patch can be easily remade into is sharing an SQPOLL thread between
io_uring instances of a single app/user, like the fd passing described before.
The most obvious example is to share 1 SQPOLL thread (or N << num_cpus) between all
user threads, so
- still creating io_uring per thread to not synchronise SQ
- retaining CPU time for real user work (instead of having N SQPOLL threads)
- increasing polling efficiency (more work -- less idle polling)
- and scheduling/task migration, etc.


note: it would be great to check that it has all the necessary cond_resched() calls

> 
>> specified set of them. In other words, making yours @percpu_threads not global,
>> but rather binding to a set of io_urings.
>>
>> e.g. create 2 independent per-cpu sets of threads. 1st one for {a1,a2,a3}, 2nd
>> for {b1,b2,b3}.
>>
>> a1 = create_uring()
>> a2 = create_uring(shared_sq_threads=a1)
>> a3 = create_uring(shared_sq_threads=a1)
>>
>> b1 = create_uring()
>> b2 = create_uring(shared_sq_threads=b1)
>> b3 = create_uring(shared_sq_threads=b1)
> Thanks your suggestions, I'll try to consider it in V2 patch.
> 
> But I still have one question: now "shared_sq_threads=a1" and "shared_sq_threads=b1"
> can be bound to same cpu core? I think it's still not efficient. E.g. "shared_sq_threads=a1"
> busy loop in its sq_thread_idle period will impact shared_sq_threads=b1 who currently
> there are reqs to handle.
> 
> Now I also wonder whether a io_uring instance,which has SQPOLL enabed and does not
> specify a sq_thread_cpu, can be used in real business environment, the io_sq_thread
> may run in any cpu cores, which may affect any other application, e.g. cpu resource
> contend. So if trying to use SQPOLL, we'd better allocate specified cpu.

I think it's ok.
- needs privileges to create SQPOLL
- don't expect it to be used without thinking twice
- for the not-pinned case everything will be rescheduled to other cpus as appropriate;
  it shouldn't affect much in normal circumstances.

-- 
Pavel Begunkov


* Re: [PATCH] io_uring: create percpu io sq thread when IORING_SETUP_SQ_AFF is flagged
  2020-06-03 14:47         ` Pavel Begunkov
@ 2020-06-03 18:48           ` Pavel Begunkov
  0 siblings, 0 replies; 11+ messages in thread
From: Pavel Begunkov @ 2020-06-03 18:48 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: axboe, joseph.qi, yujian.wu1

On 03/06/2020 17:47, Pavel Begunkov wrote:
> On 26/05/2020 17:42, Xiaoguang Wang wrote:
>> Yes, I don't try to make all io_uring instances in the system share threads, I just
>> make io_uring instances which are bound to same cpu core, share one io_sq_thread that
>> only is created once for every cpu core.
>> Otherwise in current io_uring mainline codes, we'd better not bind different io_uring
>> instances to same cpu core,  some instances' busy loop in its sq_thread_idle period will
>> impact other instanes who currently there are reqs to handle.
> 
> I got a bit carried away from your initial case, but there is the case:
> Let's we have 2 unrelated apps that create SQPOLL io_uring instances. Let's say they are
> in 2 different docker container for the argument (and let's just assume a docker container
> user can create such).
> 
> The first app1 submits 32K fat requests as described before. The second one (app2) is a
> low-latency app, submits reqs by 1, but expects it to be picked really fast. And let's
> assume their SQPOLL threads pinned to the same CPU.
> 
> 1. old version:
> The CPU spends some time allocated by a scheduler on 32K requests of app1,
> probably not issuing them all but that's fine. And then it goes to the app2.
> So, the submit-to-pickup latency for app2 is capped by a task scheduler.
> That's somewhat fair. 
> 
> 2. your version:
> io_sq_thread first processes all 32K of requests of app1, and only then goes to app2.
> app2 is screwed, unfair as life can be. And a malicious user can create many io_uring
> instances as in app1. So the latency will be further multiplied.
> 
> 
> Any solution I can think of is ugly and won't ever land upstream. Like creating your
> own scheduling framework for io_uring, wiring kindof cgroups, etc. And actually SQPOLL
> shouldn't be so ubiquitous (+needs privileges). E.g. I expect there will be a single
> app per system using it, e.g. a database consuming most of the resources anyway.
> And that's why I think it's better to not trying to solve your original issue.
> 
> 
> However, what the patch can be easily remade into is sharing an SQPOLL thread between
> io_uring instances of a single app/user, like passing fd described before.
> The most obvious example is to share 1 SQPOLL thread (or N << num_cpus) between all
> user threads, so
> - still creating io_uring per thread to not synchronise SQ
> - retaining CPU time for real user work (instead of having N SQPOLL threads)
> - increasing polling efficiency (more work -- less idle polling)
> - and scheduling/task migration, etc.
> 

Just to add one thing: basically this doesn't differ much from having 1 io_uring per
bunch of threads, except that SQ synchronisation is replaced with round-robin polling.
If we go this way, it'd need a thorough evaluation of the performance benefits (if any).

> 
> note: would be great to check, that it has all necessary cond_resched()
> 

-- 
Pavel Begunkov


Thread overview: 11+ messages
2020-05-20 11:56 [PATCH] io_uring: create percpu io sq thread when IORING_SETUP_SQ_AFF is flagged Xiaoguang Wang
2020-05-20 12:11 ` Xiaoguang Wang
2020-05-22 11:17   ` Yu Jian Wu
2020-05-25  3:16     ` Xiaoguang Wang
2020-05-20 22:09 ` Pavel Begunkov
2020-05-22  8:33   ` Xiaoguang Wang
2020-05-24 11:46     ` Pavel Begunkov
2020-05-26 14:42       ` Xiaoguang Wang
2020-06-03 14:47         ` Pavel Begunkov
2020-06-03 18:48           ` Pavel Begunkov
2020-05-28  7:56       ` Xiaoguang Wang
