* [PATCH v2 0/2] Optimise io_uring completion waiting
@ 2019-09-22  8:08 Pavel Begunkov (Silence)
  2019-09-22  8:08 ` [PATCH v2 1/2] sched/wait: Add wait_threshold Pavel Begunkov (Silence)
                   ` (2 more replies)
  0 siblings, 3 replies; 38+ messages in thread
From: Pavel Begunkov (Silence) @ 2019-09-22  8:08 UTC (permalink / raw)
  To: Jens Axboe, Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel
  Cc: Pavel Begunkov

From: Pavel Begunkov <asml.silence@gmail.com>

There can be a lot of overhead within the generic wait_event_*() helpers
when waiting for a large number of completions. The patchset removes much
of it by using a custom wait event (wait_threshold).

A synthetic test showed a ~40% performance boost (see patch 2).

v2: rebase

Pavel Begunkov (2):
  sched/wait: Add wait_threshold
  io_uring: Optimise cq waiting with wait_threshold

 fs/io_uring.c                  | 35 ++++++++++++------
 include/linux/wait_threshold.h | 67 ++++++++++++++++++++++++++++++++++
 kernel/sched/Makefile          |  2 +-
 kernel/sched/wait_threshold.c  | 26 +++++++++++++
 4 files changed, 118 insertions(+), 12 deletions(-)
 create mode 100644 include/linux/wait_threshold.h
 create mode 100644 kernel/sched/wait_threshold.c

-- 
2.23.0



* [PATCH v2 1/2] sched/wait: Add wait_threshold
  2019-09-22  8:08 [PATCH v2 0/2] Optimise io_uring completion waiting Pavel Begunkov (Silence)
@ 2019-09-22  8:08 ` Pavel Begunkov (Silence)
  2019-09-23  7:19   ` Peter Zijlstra
  2019-09-22  8:08 ` [PATCH v2 2/2] io_uring: Optimise cq waiting with wait_threshold Pavel Begunkov (Silence)
  2019-09-22 15:51 ` [PATCH v2 0/2] Optimise io_uring completion waiting Jens Axboe
  2 siblings, 1 reply; 38+ messages in thread
From: Pavel Begunkov (Silence) @ 2019-09-22  8:08 UTC (permalink / raw)
  To: Jens Axboe, Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel
  Cc: Pavel Begunkov

From: Pavel Begunkov <asml.silence@gmail.com>

Add wait_threshold -- a custom wait_event derivative that waits until
a value is equal to or greater than the specified threshold.
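
A minimal usage sketch (illustrative only, not part of the patch; wq,
completed and ret are made-up names): each waiter supplies its own
threshold, and the waker passes in the current counter value:

```
struct wait_queue_head wq;	/* initialised elsewhere */
unsigned int completed;		/* the caller's event counter */
int ret;

/* waiter: sleep until the waker reports a value >= 128 */
ret = wait_threshold_interruptible(wq, 128, completed >= 128);

/* waker: only waiters whose thresholds have been reached are woken */
wake_up_threshold(&wq, completed);

/* wake everybody regardless of thresholds (e.g. on timeout) */
wake_up_threshold(&wq, WQ_THRESHOLD_WAKE_ALWAYS);
```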

v2: rebase
1. use full condition instead of event number generator
2. add WQ_THRESHOLD_WAKE_ALWAYS constant

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/linux/wait_threshold.h | 67 ++++++++++++++++++++++++++++++++++
 kernel/sched/Makefile          |  2 +-
 kernel/sched/wait_threshold.c  | 26 +++++++++++++
 3 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/wait_threshold.h
 create mode 100644 kernel/sched/wait_threshold.c

diff --git a/include/linux/wait_threshold.h b/include/linux/wait_threshold.h
new file mode 100644
index 000000000000..d8b054504c26
--- /dev/null
+++ b/include/linux/wait_threshold.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_WAIT_THRESHOLD_H
+#define _LINUX_WAIT_THRESHOLD_H
+
+#include <linux/wait.h>
+
+#define WQ_THRESHOLD_WAKE_ALWAYS	(~0U)
+
+struct wait_threshold_queue_entry {
+	struct wait_queue_entry wq_entry;
+	unsigned int threshold;
+};
+
+
+void init_wait_threshold_entry(struct wait_threshold_queue_entry *wtq_entry,
+				unsigned int threshold);
+
+static inline void wake_up_threshold(struct wait_queue_head *wq_head,
+					unsigned int val)
+{
+	void *arg = (void *)(unsigned long)val;
+
+	__wake_up(wq_head, TASK_NORMAL, 1, arg);
+}
+
+#define ___wait_threshold_event(q, thresh, condition, state,		\
+				exclusive, ret, cmd)			\
+({									\
+	__label__ __out;						\
+	struct wait_queue_head *__wq_head = &q;				\
+	struct wait_threshold_queue_entry __wtq_entry;			\
+	struct wait_queue_entry *__wq_entry = &__wtq_entry.wq_entry;	\
+	long __ret = ret; /* explicit shadow */				\
+									\
+	init_wait_threshold_entry(&__wtq_entry, thresh);		\
+	for (;;) {							\
+		long __int = prepare_to_wait_event(__wq_head,		\
+						   __wq_entry,		\
+						   state);		\
+		if (condition)						\
+			break;						\
+									\
+		if (___wait_is_interruptible(state) && __int) {		\
+			__ret = __int;					\
+			goto __out;					\
+		}							\
+									\
+		cmd;							\
+	}								\
+	finish_wait(__wq_head, __wq_entry);				\
+__out:	__ret;								\
+})
+
+#define __wait_threshold_interruptible(q, thresh, condition)		\
+	___wait_threshold_event(q, thresh, condition, TASK_INTERRUPTIBLE, 0, 0,\
+			  schedule())
+
+#define wait_threshold_interruptible(q, threshold, condition)	\
+({								\
+	int __ret = 0;						\
+	might_sleep();						\
+	if (!(condition))					\
+		__ret = __wait_threshold_interruptible(q,	\
+			threshold, condition);			\
+	__ret;							\
+})
+#endif /* _LINUX_WAIT_THRESHOLD_H */
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 21fb5a5662b5..bb895a3184f9 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -18,7 +18,7 @@ endif
 
 obj-y += core.o loadavg.o clock.o cputime.o
 obj-y += idle.o fair.o rt.o deadline.o
-obj-y += wait.o wait_bit.o swait.o completion.o
+obj-y += wait.o wait_bit.o wait_threshold.o swait.o completion.o
 
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
diff --git a/kernel/sched/wait_threshold.c b/kernel/sched/wait_threshold.c
new file mode 100644
index 000000000000..80a027c02ff3
--- /dev/null
+++ b/kernel/sched/wait_threshold.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "sched.h"
+#include <linux/wait_threshold.h>
+
+static int wake_threshold_function(struct wait_queue_entry *wq_entry,
+				   unsigned int mode, int sync, void *arg)
+{
+	unsigned int val = (unsigned int)(unsigned long)arg;
+	struct wait_threshold_queue_entry *wtq_entry =
+		container_of(wq_entry, struct wait_threshold_queue_entry,
+			wq_entry);
+
+	if (val < wtq_entry->threshold)
+		return 0;
+
+	return default_wake_function(wq_entry, mode, sync, arg);
+}
+
+void init_wait_threshold_entry(struct wait_threshold_queue_entry *wtq_entry,
+			       unsigned int threshold)
+{
+	init_wait_entry(&wtq_entry->wq_entry, 0);
+	wtq_entry->wq_entry.func = wake_threshold_function;
+	wtq_entry->threshold = threshold;
+}
+EXPORT_SYMBOL(init_wait_threshold_entry);
-- 
2.23.0



* [PATCH v2 2/2] io_uring: Optimise cq waiting with wait_threshold
  2019-09-22  8:08 [PATCH v2 0/2] Optimise io_uring completion waiting Pavel Begunkov (Silence)
  2019-09-22  8:08 ` [PATCH v2 1/2] sched/wait: Add wait_threshold Pavel Begunkov (Silence)
@ 2019-09-22  8:08 ` Pavel Begunkov (Silence)
  2019-09-22 15:51 ` [PATCH v2 0/2] Optimise io_uring completion waiting Jens Axboe
  2 siblings, 0 replies; 38+ messages in thread
From: Pavel Begunkov (Silence) @ 2019-09-22  8:08 UTC (permalink / raw)
  To: Jens Axboe, Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel
  Cc: Pavel Begunkov

From: Pavel Begunkov <asml.silence@gmail.com>

While waiting for completion events in io_cqring_wait(), the process
will be woken up inside wait_event_interruptible() on every request
completion, will check the number of events in the completion queue,
and will potentially go back to sleep.

There can be a lot of such spurious wakeups, each carrying overhead.
This especially manifests itself when min_events is large and
completions arrive one by one or in small batches (which is usually
the case).

E.g. if the device completes requests one by one and io_uring_enter()
is waiting for 100 events, there will be ~99 spurious wakeups.

Use the new wait_threshold_*() helpers instead, which won't wake the
task up until the necessary number of events has been collected.
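
For illustration, the pre-patch pattern this removes boils down to the
following (a condensed sketch; the exact hunks are below):

```
/* completion side: fires a wakeup on every single CQE */
if (waitqueue_active(&ctx->wait))
	wake_up(&ctx->wait);

/* wait side: gets scheduled in on every wake_up() just to re-check */
ret = wait_event_interruptible(ctx->wait,
			io_cqring_events(rings) >= min_events || ...);
```

With wait_threshold_*(), the threshold check moves into the wake
function, so the waiting task is never scheduled in for counts below
min_events.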

Performance test:
The first thread generates requests (QD=512) one by one, so they are
completed in a similar pattern. The second thread waits for 128 events
to complete.

Tested with null_blk with a 5us delay, on a 3.8GHz Intel CPU.

throughput before: 270 KIOPS
throughput after:  370 KIOPS
A ~40% throughput boost; the workload is synthetic and exaggerates the
effect, but it makes the point.
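
The waiting side of the test is essentially the following (a sketch;
the raw syscall is shown for clarity, and ring_fd stands for an
already set-up io_uring fd):

```
/* block until at least 128 completions are in the CQ ring */
ret = syscall(__NR_io_uring_enter, ring_fd, 0, 128,
	      IORING_ENTER_GETEVENTS, NULL, 0);
```

which ends up in io_cqring_wait() with min_events == 128.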

v2: wake always in io_timeout_fn() with WQ_THRESHOLD_WAKE_ALWAYS

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5c3f2bb81637..05f4391c7bbe 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -70,6 +70,7 @@
 #include <linux/nospec.h>
 #include <linux/sizes.h>
 #include <linux/hugetlb.h>
+#include <linux/wait_threshold.h>
 
 #include <uapi/linux/io_uring.h>
 
@@ -414,6 +415,13 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	return ctx;
 }
 
+static unsigned int io_cqring_events(struct io_rings *rings)
+{
+	/* See comment at the top of this file */
+	smp_rmb();
+	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
+}
+
 static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
 				     struct io_kiocb *req)
 {
@@ -559,16 +567,27 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 	}
 }
 
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+static void __io_cqring_ev_posted(struct io_ring_ctx *ctx,
+				unsigned int nr_events)
 {
 	if (waitqueue_active(&ctx->wait))
-		wake_up(&ctx->wait);
+		wake_up_threshold(&ctx->wait, nr_events);
 	if (waitqueue_active(&ctx->sqo_wait))
 		wake_up(&ctx->sqo_wait);
 	if (ctx->cq_ev_fd)
 		eventfd_signal(ctx->cq_ev_fd, 1);
 }
 
+static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+	__io_cqring_ev_posted(ctx, io_cqring_events(ctx->rings));
+}
+
+static inline void io_cqring_timeout_posted(struct io_ring_ctx *ctx)
+{
+	__io_cqring_ev_posted(ctx, WQ_THRESHOLD_WAKE_ALWAYS);
+}
+
 static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
 				long res)
 {
@@ -587,7 +606,7 @@ static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
 	percpu_ref_put_many(&ctx->refs, refs);
 
 	if (waitqueue_active(&ctx->wait))
-		wake_up(&ctx->wait);
+		wake_up_threshold(&ctx->wait, io_cqring_events(ctx->rings));
 }
 
 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
@@ -722,12 +741,6 @@ static void io_put_req(struct io_kiocb *req)
 		io_free_req(req);
 }
 
-static unsigned io_cqring_events(struct io_rings *rings)
-{
-	/* See comment at the top of this file */
-	smp_rmb();
-	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
-}
 
 /*
  * Find and free completed poll iocbs
@@ -1824,7 +1837,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 	io_commit_cqring(ctx);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-	io_cqring_ev_posted(ctx);
+	io_cqring_timeout_posted(ctx);
 
 	io_put_req(req);
 	return HRTIMER_NORESTART;
@@ -2723,7 +2736,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	 * we started waiting. For timeouts, we always want to return to
 	 * userspace.
 	 */
-	ret = wait_event_interruptible(ctx->wait,
+	ret = wait_threshold_interruptible(ctx->wait, min_events,
 				io_cqring_events(rings) >= min_events ||
 				atomic_read(&ctx->cq_timeouts) != nr_timeouts);
 	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
-- 
2.23.0



* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-22  8:08 [PATCH v2 0/2] Optimise io_uring completion waiting Pavel Begunkov (Silence)
  2019-09-22  8:08 ` [PATCH v2 1/2] sched/wait: Add wait_threshold Pavel Begunkov (Silence)
  2019-09-22  8:08 ` [PATCH v2 2/2] io_uring: Optimise cq waiting with wait_threshold Pavel Begunkov (Silence)
@ 2019-09-22 15:51 ` Jens Axboe
  2019-09-23  8:35   ` Ingo Molnar
  2 siblings, 1 reply; 38+ messages in thread
From: Jens Axboe @ 2019-09-22 15:51 UTC (permalink / raw)
  To: Pavel Begunkov (Silence),
	Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel

On 9/22/19 2:08 AM, Pavel Begunkov (Silence) wrote:
> From: Pavel Begunkov <asml.silence@gmail.com>
> 
> There can be a lot of overhead within the generic wait_event_*() helpers
> when waiting for a large number of completions. The patchset removes much
> of it by using a custom wait event (wait_threshold).
> 
> A synthetic test showed a ~40% performance boost (see patch 2).

I'm fine with the io_uring side of things, but to queue this up we
really need Peter or Ingo to sign off on the core wakeup bits...

Peter?

-- 
Jens Axboe



* Re: [PATCH v2 1/2] sched/wait: Add wait_threshold
  2019-09-22  8:08 ` [PATCH v2 1/2] sched/wait: Add wait_threshold Pavel Begunkov (Silence)
@ 2019-09-23  7:19   ` Peter Zijlstra
  2019-09-23 16:37     ` Pavel Begunkov
  0 siblings, 1 reply; 38+ messages in thread
From: Peter Zijlstra @ 2019-09-23  7:19 UTC (permalink / raw)
  To: Pavel Begunkov (Silence)
  Cc: Jens Axboe, Ingo Molnar, linux-block, linux-kernel

On Sun, Sep 22, 2019 at 11:08:50AM +0300, Pavel Begunkov (Silence) wrote:
> From: Pavel Begunkov <asml.silence@gmail.com>
> 
> Add wait_threshold -- a custom wait_event derivative that waits until
> a value is equal to or greater than the specified threshold.

This is quite insufficient justification for this monster... what exact
semantics do you want?

Why can't you do this exact same thing with a slightly more complicated
@cond?


* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-22 15:51 ` [PATCH v2 0/2] Optimise io_uring completion waiting Jens Axboe
@ 2019-09-23  8:35   ` Ingo Molnar
  2019-09-23 16:21     ` Pavel Begunkov
  0 siblings, 1 reply; 38+ messages in thread
From: Ingo Molnar @ 2019-09-23  8:35 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Pavel Begunkov (Silence),
	Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel


* Jens Axboe <axboe@kernel.dk> wrote:

> On 9/22/19 2:08 AM, Pavel Begunkov (Silence) wrote:
> > From: Pavel Begunkov <asml.silence@gmail.com>
> > 
> > There can be a lot of overhead within the generic wait_event_*() helpers
> > when waiting for a large number of completions. The patchset removes much
> > of it by using a custom wait event (wait_threshold).
> > 
> > A synthetic test showed a ~40% performance boost (see patch 2).
> 
> I'm fine with the io_uring side of things, but to queue this up we
> really need Peter or Ingo to sign off on the core wakeup bits...
> 
> Peter?

I'm not sure an extension is needed for such a special interface, why not 
just put a ->threshold value next to the ctx->wait field and use either 
the regular wait_event() APIs with the proper condition, or 
wait_event_cmd() style APIs if you absolutely need something more complex 
to happen inside?

Should result in a much lower linecount and no scheduler changes. :-)

Thanks,

	Ingo


* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-23  8:35   ` Ingo Molnar
@ 2019-09-23 16:21     ` Pavel Begunkov
  2019-09-23 16:32       ` Pavel Begunkov
  0 siblings, 1 reply; 38+ messages in thread
From: Pavel Begunkov @ 2019-09-23 16:21 UTC (permalink / raw)
  To: Ingo Molnar, Jens Axboe
  Cc: Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel



Hi, and thanks for the feedback.

It could be done with @cond indeed; that's how it works now. However,
this patchset addresses performance issues only.

The problem with wait_event_*() is that, if we have a counter and are
trying to wake up tasks after each increment, it would schedule each
waiting task O(threshold) times just for it to spuriously check @cond
and go back to sleep. All that overhead (memory barriers, register
save/load, accounting, etc.) turned out to be enough for some workloads
to slow down the system.

With this specialisation it still traverses a wait list and makes
indirect calls to the checker callback, but the list supposedly is
fairly small, so performance there shouldn't be a problem, at least for
now.

Regarding semantics: it should wake a task when the value passed to
wake_up_threshold() is greater than or equal to that task's threshold,
which is specified individually for each task in wait_threshold_*().

In pseudo code:
```
def wake_up_threshold(n, wait_queue):
	for waiter in wait_queue:
		waiter.wake_up_if(n >= waiter.threshold);
```

Any thoughts how to do it better? Ideas are very welcome.

BTW, this monster is mostly a copy-paste from wait_event_*(),
wait_bit_*(). We could try to extract some common parts from these
three, but that's another topic.


On 23/09/2019 11:35, Ingo Molnar wrote:
> 
> * Jens Axboe <axboe@kernel.dk> wrote:
> 
>> On 9/22/19 2:08 AM, Pavel Begunkov (Silence) wrote:
>>> From: Pavel Begunkov <asml.silence@gmail.com>
>>>
>>> There can be a lot of overhead within the generic wait_event_*() helpers
>>> when waiting for a large number of completions. The patchset removes much
>>> of it by using a custom wait event (wait_threshold).
>>>
>>> A synthetic test showed a ~40% performance boost (see patch 2).
>>
>> I'm fine with the io_uring side of things, but to queue this up we
>> really need Peter or Ingo to sign off on the core wakeup bits...
>>
>> Peter?
> 
> I'm not sure an extension is needed for such a special interface, why not 
> just put a ->threshold value next to the ctx->wait field and use either 
> the regular wait_event() APIs with the proper condition, or 
> wait_event_cmd() style APIs if you absolutely need something more complex 
> to happen inside?
> 
> Should result in a much lower linecount and no scheduler changes. :-)
> 
> Thanks,
> 
> 	Ingo
> 

-- 
Yours sincerely,
Pavel Begunkov




* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-23 16:21     ` Pavel Begunkov
@ 2019-09-23 16:32       ` Pavel Begunkov
  2019-09-23 20:48         ` Jens Axboe
  0 siblings, 1 reply; 38+ messages in thread
From: Pavel Begunkov @ 2019-09-23 16:32 UTC (permalink / raw)
  To: Ingo Molnar, Jens Axboe
  Cc: Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel



Sorry, mixed the threads.

>>
>> I'm not sure an extension is needed for such a special interface, why not 
>> just put a ->threshold value next to the ctx->wait field and use either 
>> the regular wait_event() APIs with the proper condition, or 
>> wait_event_cmd() style APIs if you absolutely need something more complex 
>> to happen inside?
Ingo,
io_uring works well without this patch, just using wait_event_*() with
a proper condition, but there are performance issues with spurious
wakeups. There is a detailed description in the previous mail.
Am I missing something?
Thanks


>>
>> Should result in a much lower linecount and no scheduler changes. :-)
>>
>> Thanks,
>>
>> 	Ingo
>>
> 

-- 
Yours sincerely,
Pavel Begunkov




* Re: [PATCH v2 1/2] sched/wait: Add wait_threshold
  2019-09-23  7:19   ` Peter Zijlstra
@ 2019-09-23 16:37     ` Pavel Begunkov
  2019-09-23 19:27       ` Peter Zijlstra
  0 siblings, 1 reply; 38+ messages in thread
From: Pavel Begunkov @ 2019-09-23 16:37 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Jens Axboe, Ingo Molnar, linux-block, linux-kernel



Just in case duplicating a mail from the cover-letter thread:


It could be done with @cond indeed; that's how it works now. However,
this patchset addresses performance issues only.

The problem with wait_event_*() is that, if we have a counter and are
trying to wake up tasks after each increment, it would schedule each
waiting task O(threshold) times just for it to spuriously check @cond
and go back to sleep. All that overhead (memory barriers, register
save/load, accounting, etc.) turned out to be enough for some workloads
to slow down the system.

With this specialisation it still traverses a wait list and makes
indirect calls to the checker callback, but the list supposedly is
fairly small, so performance there shouldn't be a problem, at least for
now.

Regarding semantics: it should wake a task when the value passed to
wake_up_threshold() is greater than or equal to that task's threshold,
which is specified individually for each task in wait_threshold_*().

In pseudo code:
```
def wake_up_threshold(n, wait_queue):
	for waiter in wait_queue:
		waiter.wake_up_if(n >= waiter.threshold);
```

Any thoughts how to do it better? Ideas are very welcome.

BTW, this monster is mostly a copy-paste from wait_event_*(),
wait_bit_*(). We could try to extract some common parts from these
three, but that's another topic.


On 23/09/2019 10:19, Peter Zijlstra wrote:
> On Sun, Sep 22, 2019 at 11:08:50AM +0300, Pavel Begunkov (Silence) wrote:
>> From: Pavel Begunkov <asml.silence@gmail.com>
>>
>> Add wait_threshold -- a custom wait_event derivative that waits until
>> a value is equal to or greater than the specified threshold.
> 
> This is quite insufficient justification for this monster... what exact
> semantics do you want?
> 
> Why can't you do this exact same thing with a slightly more complicated
> @cond?
> 

-- 
Yours sincerely,
Pavel Begunkov




* Re: [PATCH v2 1/2] sched/wait: Add wait_threshold
  2019-09-23 16:37     ` Pavel Begunkov
@ 2019-09-23 19:27       ` Peter Zijlstra
  2019-09-23 20:23         ` Peter Zijlstra
  2019-09-24  6:44         ` Pavel Begunkov
  0 siblings, 2 replies; 38+ messages in thread
From: Peter Zijlstra @ 2019-09-23 19:27 UTC (permalink / raw)
  To: Pavel Begunkov; +Cc: Jens Axboe, Ingo Molnar, linux-block, linux-kernel


A: Because it messes up the order in which people normally read text.
Q: Why is top-posting such a bad thing?
A: Top-posting.
Q: What is the most annoying thing in e-mail?

On Mon, Sep 23, 2019 at 07:37:46PM +0300, Pavel Begunkov wrote:
> Just in case duplicating a mail from the cover-letter thread:

Just because every patch should have a self-contained and coherent
Changelog.

> It could be done with @cond indeed; that's how it works now. However,
> this patchset addresses performance issues only.
> 
> The problem with wait_event_*() is that, if we have a counter and are
> trying to wake up tasks after each increment, it would schedule each
> waiting task O(threshold) times just for it to spuriously check @cond
> and go back to sleep. All that overhead (memory barriers, register
> save/load, accounting, etc.) turned out to be enough for some workloads
> to slow down the system.
> 
> With this specialisation it still traverses a wait list and makes
> indirect calls to the checker callback, but the list supposedly is
> fairly small, so performance there shouldn't be a problem, at least for
> now.
> 
> Regarding semantics: it should wake a task when the value passed to
> wake_up_threshold() is greater than or equal to that task's threshold,
> which is specified individually for each task in wait_threshold_*().
> 
> In pseudo code:
> ```
> def wake_up_threshold(n, wait_queue):
> 	for waiter in wait_queue:
> 		waiter.wake_up_if(n >= waiter.threshold);
> ```
> 
> Any thoughts how to do it better? Ideas are very welcome.
> 
> BTW, this monster is mostly a copy-paste from wait_event_*(),
> wait_bit_*(). We could try to extract some common parts from these
> three, but that's another topic.

I don't think that is another topic at all. It is a quality of
implementation issue. We already have too many copies of all that (3).

So basically you want to fudge the wake function to do the/a @cond test,
not unlike what wait_bit already does, but differently.
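
For reference, the wait_bit variant of this trick looks like the
following (paraphrased from kernel/sched/wait_bit.c; the wake function
filters on its key before falling through to the default behaviour):

```
int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode,
		      int sync, void *arg)
{
	struct wait_bit_key *key = arg;
	struct wait_bit_queue_entry *wait_bit =
		container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);

	if (wait_bit->key.flags != key->flags ||
			wait_bit->key.bit_nr != key->bit_nr ||
			test_bit(key->bit_nr, key->flags))
		return 0;	/* not our bit, or still set: don't wake */

	return autoremove_wake_function(wq_entry, mode, sync, key);
}
```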

I'm really rather annoyed with C for not having proper lambda functions;
that would make all this so much easier. Anyway, let me have a poke at
this in the morning; it's late already.

Also, is anything actually using wait_queue_entry::private ? I'm
not finding any in a hurry.




* Re: [PATCH v2 1/2] sched/wait: Add wait_threshold
  2019-09-23 19:27       ` Peter Zijlstra
@ 2019-09-23 20:23         ` Peter Zijlstra
  2019-09-24  6:44         ` Pavel Begunkov
  1 sibling, 0 replies; 38+ messages in thread
From: Peter Zijlstra @ 2019-09-23 20:23 UTC (permalink / raw)
  To: Pavel Begunkov
  Cc: Jens Axboe, Ingo Molnar, linux-block, linux-kernel,
	Sebastian Andrzej Siewior

On Mon, Sep 23, 2019 at 09:27:42PM +0200, Peter Zijlstra wrote:

> Also, is anything actually using wait_queue_entry::private ? I'm
> not finding any in a hurry.

That is; outside of default_wake_function(). I don't see a sensible
reason for that field to be 'void *' nor for the name 'private'.

But _that_ is something we can fix later.


* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-23 16:32       ` Pavel Begunkov
@ 2019-09-23 20:48         ` Jens Axboe
  2019-09-23 23:00           ` Jens Axboe
  0 siblings, 1 reply; 38+ messages in thread
From: Jens Axboe @ 2019-09-23 20:48 UTC (permalink / raw)
  To: Pavel Begunkov, Ingo Molnar
  Cc: Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel

On 9/23/19 10:32 AM, Pavel Begunkov wrote:
> Sorry, mixed the threads.
> 
>>>
>>> I'm not sure an extension is needed for such a special interface, why not
>>> just put a ->threshold value next to the ctx->wait field and use either
>>> the regular wait_event() APIs with the proper condition, or
>>> wait_event_cmd() style APIs if you absolutely need something more complex
>>> to happen inside?
> Ingo,
> io_uring works well without this patch just using wait_event_*() with
> proper condition, but there are performance issues with spurious
> wakeups. Detailed description in the previous mail.
> Am I missing something?

I think we can do the same thing, just wrapping the waitqueue in a
structure with a count in it, on the stack. Got some flight time
coming up later today, let me try and cook up a patch.

-- 
Jens Axboe



* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-23 20:48         ` Jens Axboe
@ 2019-09-23 23:00           ` Jens Axboe
  2019-09-24  7:06             ` Pavel Begunkov
  0 siblings, 1 reply; 38+ messages in thread
From: Jens Axboe @ 2019-09-23 23:00 UTC (permalink / raw)
  To: Pavel Begunkov, Ingo Molnar
  Cc: Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel

On 9/23/19 2:48 PM, Jens Axboe wrote:
> On 9/23/19 10:32 AM, Pavel Begunkov wrote:
>> Sorry, mixed the threads.
>>
>>>>
>>>> I'm not sure an extension is needed for such a special interface, why not
>>>> just put a ->threshold value next to the ctx->wait field and use either
>>>> the regular wait_event() APIs with the proper condition, or
>>>> wait_event_cmd() style APIs if you absolutely need something more complex
>>>> to happen inside?
>> Ingo,
>> io_uring works well without this patch just using wait_event_*() with
>> proper condition, but there are performance issues with spurious
>> wakeups. Detailed description in the previous mail.
>> Am I missing something?
> 
> I think we can do the same thing, just wrapping the waitqueue in a
> structure with a count in it, on the stack. Got some flight time
> coming up later today, let me try and cook up a patch.

Totally untested, and sent out 5 min before departure... But something
like this.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index ca7570aca430..c2f9e1da26dd 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2768,6 +2768,37 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
 	return submit;
 }
 
+struct io_wait_queue {
+	struct wait_queue_entry wq;
+	struct io_ring_ctx *ctx;
+	struct task_struct *task;
+	unsigned to_wait;
+	unsigned nr_timeouts;
+};
+
+static inline bool io_should_wake(struct io_wait_queue *iowq)
+{
+	struct io_ring_ctx *ctx = iowq->ctx;
+
+	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
+			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
+}
+
+static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
+			    int wake_flags, void *key)
+{
+	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
+							wq);
+
+	if (io_should_wake(iowq)) {
+		list_del_init(&curr->entry);
+		wake_up_process(iowq->task);
+		return 1;
+	}
+
+	return -1;
+}
+
 /*
  * Wait until events become available, if we don't already have some. The
  * application must reap them itself, as they reside on the shared cq ring.
@@ -2775,8 +2806,16 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			  const sigset_t __user *sig, size_t sigsz)
 {
+	struct io_wait_queue iowq = {
+		.wq = {
+			.func	= io_wake_function,
+			.entry	= LIST_HEAD_INIT(iowq.wq.entry),
+		},
+		.task		= current,
+		.ctx		= ctx,
+		.to_wait	= min_events,
+	};
 	struct io_rings *rings = ctx->rings;
-	unsigned nr_timeouts;
 	int ret;
 
 	if (io_cqring_events(rings) >= min_events)
@@ -2795,15 +2834,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			return ret;
 	}
 
-	nr_timeouts = atomic_read(&ctx->cq_timeouts);
-	/*
-	 * Return if we have enough events, or if a timeout occured since
-	 * we started waiting. For timeouts, we always want to return to
-	 * userspace.
-	 */
-	ret = wait_event_interruptible(ctx->wait,
-				io_cqring_events(rings) >= min_events ||
-				atomic_read(&ctx->cq_timeouts) != nr_timeouts);
+	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
+	prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE);
+	do {
+		if (io_should_wake(&iowq))
+			break;
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	} while (1);
+	finish_wait(&ctx->wait, &iowq.wq);
+
 	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;

-- 
Jens Axboe



* Re: [PATCH v2 1/2] sched/wait: Add wait_threshold
  2019-09-23 19:27       ` Peter Zijlstra
  2019-09-23 20:23         ` Peter Zijlstra
@ 2019-09-24  6:44         ` Pavel Begunkov
  1 sibling, 0 replies; 38+ messages in thread
From: Pavel Begunkov @ 2019-09-24  6:44 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Jens Axboe, Ingo Molnar, linux-block, linux-kernel




On 23/09/2019 22:27, Peter Zijlstra wrote:
> 
> A: Because it messes up the order in which people normally read text.
> Q: Why is top-posting such a bad thing?
> A: Top-posting.
> Q: What is the most annoying thing in e-mail?
> 
> On Mon, Sep 23, 2019 at 07:37:46PM +0300, Pavel Begunkov wrote:
>> Just in case duplicating a mail from the cover-letter thread:
> 
> Just because every patch should have a self-contained and coherent
> Changelog.

Well, I will expand the patch description, if we agree on the
implementation (at least conceptually).


>>
>> BTW, this monster is mostly a copy-paste from wait_event_*(),
>> wait_bit_*(). We could try to extract some common parts from these
>> three, but that's another topic.
> 
> I don't think that is another topic at all. It is a quality of
> implementation issue. We already have too many copies of all that (3).

For example, ___wait_event() is copied in ___wait_var_event(). Instead,
it could accept a wait-entry generator, or just accept an entry from
above, and be reused in both cases. I've had such a patch, but I want
to think about what else could be done.

e.g.
```
#define generic_wait_event(ENTRY_GEN, ...)
	ENTRY_GEN(wq_entry_name);
	do_wait_event(wq_entry_name);

#define WBQ_ENTRY_GEN(name)
	struct wait_bit_queue_entry tmp = WBQ_INITIALIZER;
	struct wait_queue_entry *name = &tmp.wq_entry;
```


> 
> So basically you want to fudge the wake function to do the/a @cond test,
> not unlike what wait_bit already does, but differently.
> 
Yes

> I'm really rather annoyed with C for not having proper lambda functions;
> that would make all this so much easier. Anyway, let me have a poke at
> this in the morning; it's late already.
> 
> Also, is anything actually using wait_queue_entry::private ? I'm
> not finding any in a hurry.
> 
> 

-- 
Yours sincerely,
Pavel Begunkov




* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-23 23:00           ` Jens Axboe
@ 2019-09-24  7:06             ` Pavel Begunkov
  2019-09-24  8:02               ` Jens Axboe
  0 siblings, 1 reply; 38+ messages in thread
From: Pavel Begunkov @ 2019-09-24  7:06 UTC (permalink / raw)
  To: Jens Axboe, Ingo Molnar
  Cc: Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel



On 24/09/2019 02:00, Jens Axboe wrote:
>> I think we can do the same thing, just wrapping the waitqueue in a
>> structure with a count in it, on the stack. Got some flight time
>> coming up later today, let me try and cook up a patch.
> 
> Totally untested, and sent out 5 min before departure... But something
> like this.
Hmm, this reminds me of my first version. Basically that's the same
thing but with the macros inlined. I wanted to make it reusable and
self-contained, though.

If you don't think it could be useful in other places, sure, we could do
something like that. Is that so?

> 
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index ca7570aca430..c2f9e1da26dd 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -2768,6 +2768,37 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
>  	return submit;
>  }
>  
> +struct io_wait_queue {
> +	struct wait_queue_entry wq;
> +	struct io_ring_ctx *ctx;
> +	struct task_struct *task;
> +	unsigned to_wait;
> +	unsigned nr_timeouts;
> +};
> +
> +static inline bool io_should_wake(struct io_wait_queue *iowq)
> +{
> +	struct io_ring_ctx *ctx = iowq->ctx;
> +
> +	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
> +			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
> +}
> +
> +static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
> +			    int wake_flags, void *key)
> +{
> +	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
> +							wq);
> +
> +	if (io_should_wake(iowq)) {
> +		list_del_init(&curr->entry);
> +		wake_up_process(iowq->task);
> +		return 1;
> +	}
> +
> +	return -1;
> +}
> +
>  /*
>   * Wait until events become available, if we don't already have some. The
>   * application must reap them itself, as they reside on the shared cq ring.
> @@ -2775,8 +2806,16 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
>  static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>  			  const sigset_t __user *sig, size_t sigsz)
>  {
> +	struct io_wait_queue iowq = {
> +		.wq = {
> +			.func	= io_wake_function,
> +			.entry	= LIST_HEAD_INIT(iowq.wq.entry),
> +		},
> +		.task		= current,
> +		.ctx		= ctx,
> +		.to_wait	= min_events,
> +	};
>  	struct io_rings *rings = ctx->rings;
> -	unsigned nr_timeouts;
>  	int ret;
>  
>  	if (io_cqring_events(rings) >= min_events)
> @@ -2795,15 +2834,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>  			return ret;
>  	}
>  
> -	nr_timeouts = atomic_read(&ctx->cq_timeouts);
> -	/*
> -	 * Return if we have enough events, or if a timeout occured since
> -	 * we started waiting. For timeouts, we always want to return to
> -	 * userspace.
> -	 */
> -	ret = wait_event_interruptible(ctx->wait,
> -				io_cqring_events(rings) >= min_events ||
> -				atomic_read(&ctx->cq_timeouts) != nr_timeouts);
> +	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
> +	prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE);
> +	do {
> +		if (io_should_wake(&iowq))
> +			break;
> +		schedule();
> +		set_current_state(TASK_INTERRUPTIBLE);
> +	} while (1);
> +	finish_wait(&ctx->wait, &iowq.wq);
> +
>  	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
>  	if (ret == -ERESTARTSYS)
>  		ret = -EINTR;
> 

-- 
Yours sincerely,
Pavel Begunkov




* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24  7:06             ` Pavel Begunkov
@ 2019-09-24  8:02               ` Jens Axboe
  2019-09-24  8:27                 ` Jens Axboe
  2019-09-24  9:21                 ` Pavel Begunkov
  0 siblings, 2 replies; 38+ messages in thread
From: Jens Axboe @ 2019-09-24  8:02 UTC (permalink / raw)
  To: Pavel Begunkov, Ingo Molnar
  Cc: Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel

On 9/24/19 1:06 AM, Pavel Begunkov wrote:
> On 24/09/2019 02:00, Jens Axboe wrote:
>>> I think we can do the same thing, just wrapping the waitqueue in a
>>> structure with a count in it, on the stack. Got some flight time
>>> coming up later today, let me try and cook up a patch.
>>
>> Totally untested, and sent out 5 min before departure... But something
>> like this.
> Hmm, this reminds me of my first version. Basically that's the same
> thing but with the macros inlined. I wanted to make it reusable and
> self-contained, though.
> 
> If you don't think it could be useful in other places, sure, we could do
> something like that. Is that so?

I totally agree it could be useful in other places. Maybe formalized and
used with wake_up_nr() instead of adding a new primitive? Haven't looked
into that, I may be talking nonsense.

In any case, I did get a chance to test it and it works for me. Here's
the "finished" version, slightly cleaned up and with a comment added
for good measure.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index ca7570aca430..14fae454cf75 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2768,6 +2768,42 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
 	return submit;
 }
 
+struct io_wait_queue {
+	struct wait_queue_entry wq;
+	struct io_ring_ctx *ctx;
+	struct task_struct *task;
+	unsigned to_wait;
+	unsigned nr_timeouts;
+};
+
+static inline bool io_should_wake(struct io_wait_queue *iowq)
+{
+	struct io_ring_ctx *ctx = iowq->ctx;
+
+	/*
+	 * Wake up if we have enough events, or if a timeout occurred since we
+	 * started waiting. For timeouts, we always want to return to userspace,
+	 * regardless of event count.
+	 */
+	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
+			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
+}
+
+static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
+			    int wake_flags, void *key)
+{
+	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
+							wq);
+
+	if (io_should_wake(iowq)) {
+		list_del_init(&curr->entry);
+		wake_up_process(iowq->task);
+		return 1;
+	}
+
+	return -1;
+}
+
 /*
  * Wait until events become available, if we don't already have some. The
  * application must reap them itself, as they reside on the shared cq ring.
@@ -2775,8 +2811,16 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			  const sigset_t __user *sig, size_t sigsz)
 {
+	struct io_wait_queue iowq = {
+		.wq = {
+			.func	= io_wake_function,
+			.entry	= LIST_HEAD_INIT(iowq.wq.entry),
+		},
+		.task		= current,
+		.ctx		= ctx,
+		.to_wait	= min_events,
+	};
 	struct io_rings *rings = ctx->rings;
-	unsigned nr_timeouts;
 	int ret;
 
 	if (io_cqring_events(rings) >= min_events)
@@ -2795,15 +2839,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			return ret;
 	}
 
-	nr_timeouts = atomic_read(&ctx->cq_timeouts);
-	/*
-	 * Return if we have enough events, or if a timeout occured since
-	 * we started waiting. For timeouts, we always want to return to
-	 * userspace.
-	 */
-	ret = wait_event_interruptible(ctx->wait,
-				io_cqring_events(rings) >= min_events ||
-				atomic_read(&ctx->cq_timeouts) != nr_timeouts);
+	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
+	prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE);
+	do {
+		if (io_should_wake(&iowq))
+			break;
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	} while (1);
+	finish_wait(&ctx->wait, &iowq.wq);
+
 	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;

-- 
Jens Axboe



* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24  8:02               ` Jens Axboe
@ 2019-09-24  8:27                 ` Jens Axboe
  2019-09-24  8:36                   ` Jens Axboe
  2019-09-24  9:20                   ` Pavel Begunkov
  2019-09-24  9:21                 ` Pavel Begunkov
  1 sibling, 2 replies; 38+ messages in thread
From: Jens Axboe @ 2019-09-24  8:27 UTC (permalink / raw)
  To: Pavel Begunkov, Ingo Molnar
  Cc: Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel

On 9/24/19 2:02 AM, Jens Axboe wrote:
> On 9/24/19 1:06 AM, Pavel Begunkov wrote:
>> On 24/09/2019 02:00, Jens Axboe wrote:
>>>> I think we can do the same thing, just wrapping the waitqueue in a
>>>> structure with a count in it, on the stack. Got some flight time
>>>> coming up later today, let me try and cook up a patch.
>>>
>>> Totally untested, and sent out 5 min before departure... But something
>>> like this.
>> Hmm, this reminds me of my first version. Basically that's the same
>> thing but with the macros inlined. I wanted to make it reusable and
>> self-contained, though.
>>
>> If you don't think it could be useful in other places, sure, we could do
>> something like that. Is that so?
> 
> I totally agree it could be useful in other places. Maybe formalized and
> used with wake_up_nr() instead of adding a new primitive? Haven't looked
> into that, I may be talking nonsense.
> 
> In any case, I did get a chance to test it and it works for me. Here's
> the "finished" version, slightly cleaned up and with a comment added
> for good measure.

Notes:

This version gets the ordering right: you need exclusive waits to get
FIFO ordering on the waitqueue.

Both versions (yours and mine) suffer from the problem of potentially
waking too many. I don't think this is a real issue, as generally we
don't do threaded access to the io_urings. But if you had the following
tasks wait on the cqring:

[min_events = 32], [min_events = 8], [min_events = 8]

and we reach the io_cqring_events() == threshold, we'll wake all three.
I don't see a good solution to this, so I suspect we just live with it
until it's proven to be an issue. Both versions are much better than
what we have now.
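
For the record, this relies on how the wake loop treats the wake
function's return value (a paraphrased sketch of __wake_up_common() in
kernel/sched/wait.c):

```
list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
	unsigned flags = curr->flags;
	int ret = curr->func(curr, mode, wake_flags, key);

	if (ret < 0)		/* io_wake_function returned -1... */
		break;		/* ...so the scan stops at this waiter */
	if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
		break;		/* woke enough exclusive waiters */
}
```

An exclusive waiter whose condition isn't met halts the scan, which is
what preserves the FIFO ordering for everyone queued behind it.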

-- 
Jens Axboe



* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24  8:27                 ` Jens Axboe
@ 2019-09-24  8:36                   ` Jens Axboe
  2019-09-24  9:33                     ` Pavel Begunkov
  2019-09-24  9:49                     ` Peter Zijlstra
  2019-09-24  9:20                   ` Pavel Begunkov
  1 sibling, 2 replies; 38+ messages in thread
From: Jens Axboe @ 2019-09-24  8:36 UTC (permalink / raw)
  To: Pavel Begunkov, Ingo Molnar
  Cc: Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel

On 9/24/19 2:27 AM, Jens Axboe wrote:
> On 9/24/19 2:02 AM, Jens Axboe wrote:
>> On 9/24/19 1:06 AM, Pavel Begunkov wrote:
>>> On 24/09/2019 02:00, Jens Axboe wrote:
>>>>> I think we can do the same thing, just wrapping the waitqueue in a
>>>>> structure with a count in it, on the stack. Got some flight time
>>>>> coming up later today, let me try and cook up a patch.
>>>>
>>>> Totally untested, and sent out 5 min before departure... But something
>>>> like this.
>>> Hmm, this reminds me of my first version. Basically that's the same
>>> thing but with the macros inlined. I wanted to make it reusable and
>>> self-contained, though.
>>>
>>> If you don't think it could be useful in other places, sure, we could do
>>> something like that. Is that so?
>>
>> I totally agree it could be useful in other places. Maybe formalized and
>> used with wake_up_nr() instead of adding a new primitive? Haven't looked
>> into that, I may be talking nonsense.
>>
>> In any case, I did get a chance to test it and it works for me. Here's
>> the "finished" version, slightly cleaned up and with a comment added
>> for good measure.
> 
> Notes:
> 
> This version gets the ordering right: you need exclusive waits to get
> FIFO ordering on the waitqueue.
> 
> Both versions (yours and mine) suffer from the problem of potentially
> waking too many. I don't think this is a real issue, as generally we
> don't do threaded access to the io_urings. But if you had the following
> tasks wait on the cqring:
> 
> [min_events = 32], [min_events = 8], [min_events = 8]
> 
> and we reach the io_cqring_events() == threshold, we'll wake all three.
> I don't see a good solution to this, so I suspect we just live with it
> until it's proven to be an issue. Both versions are much better than
> what we have now.

I forgot an issue around signal handling; the version below adds the
right check for that too.

Curious what your test case was for this?


diff --git a/fs/io_uring.c b/fs/io_uring.c
index ca7570aca430..3fbab5692f14 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2768,6 +2768,42 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
 	return submit;
 }
 
+struct io_wait_queue {
+	struct wait_queue_entry wq;
+	struct io_ring_ctx *ctx;
+	struct task_struct *task;
+	unsigned to_wait;
+	unsigned nr_timeouts;
+};
+
+static inline bool io_should_wake(struct io_wait_queue *iowq)
+{
+	struct io_ring_ctx *ctx = iowq->ctx;
+
+	/*
+	 * Wake up if we have enough events, or if a timeout occurred since we
+	 * started waiting. For timeouts, we always want to return to userspace,
+	 * regardless of event count.
+	 */
+	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
+			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
+}
+
+static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
+			    int wake_flags, void *key)
+{
+	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
+							wq);
+
+	if (io_should_wake(iowq)) {
+		list_del_init(&curr->entry);
+		wake_up_process(iowq->task);
+		return 1;
+	}
+
+	return -1;
+}
+
 /*
  * Wait until events become available, if we don't already have some. The
  * application must reap them itself, as they reside on the shared cq ring.
@@ -2775,8 +2811,16 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			  const sigset_t __user *sig, size_t sigsz)
 {
+	struct io_wait_queue iowq = {
+		.wq = {
+			.func	= io_wake_function,
+			.entry	= LIST_HEAD_INIT(iowq.wq.entry),
+		},
+		.task		= current,
+		.ctx		= ctx,
+		.to_wait	= min_events,
+	};
 	struct io_rings *rings = ctx->rings;
-	unsigned nr_timeouts;
 	int ret;
 
 	if (io_cqring_events(rings) >= min_events)
@@ -2795,15 +2839,18 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			return ret;
 	}
 
-	nr_timeouts = atomic_read(&ctx->cq_timeouts);
-	/*
-	 * Return if we have enough events, or if a timeout occured since
-	 * we started waiting. For timeouts, we always want to return to
-	 * userspace.
-	 */
-	ret = wait_event_interruptible(ctx->wait,
-				io_cqring_events(rings) >= min_events ||
-				atomic_read(&ctx->cq_timeouts) != nr_timeouts);
+	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
+	prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE);
+	do {
+		if (io_should_wake(&iowq))
+			break;
+		schedule();
+		if (signal_pending(current))
+			break;
+		set_current_state(TASK_INTERRUPTIBLE);
+	} while (1);
+	finish_wait(&ctx->wait, &iowq.wq);
+
 	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;

-- 
Jens Axboe



* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24  8:27                 ` Jens Axboe
  2019-09-24  8:36                   ` Jens Axboe
@ 2019-09-24  9:20                   ` Pavel Begunkov
  2019-09-24 10:09                     ` Jens Axboe
  1 sibling, 1 reply; 38+ messages in thread
From: Pavel Begunkov @ 2019-09-24  9:20 UTC (permalink / raw)
  To: Jens Axboe, Ingo Molnar
  Cc: Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel





On 24/09/2019 11:27, Jens Axboe wrote:
> On 9/24/19 2:02 AM, Jens Axboe wrote:
>> On 9/24/19 1:06 AM, Pavel Begunkov wrote:
>>> On 24/09/2019 02:00, Jens Axboe wrote:
>>>>> I think we can do the same thing, just wrapping the waitqueue in a
>>>>> structure with a count in it, on the stack. Got some flight time
>>>>> coming up later today, let me try and cook up a patch.
>>>>
>>>> Totally untested, and sent out 5 min before departure... But something
>>>> like this.
>>> Hmm, this reminds me of my first version. Basically that's the same
>>> thing but with the macros inlined. I wanted to make it reusable and
>>> self-contained, though.
>>>
>>> If you don't think it could be useful in other places, sure, we could do
>>> something like that. Is that so?
>>
>> I totally agree it could be useful in other places. Maybe formalized and
>> used with wake_up_nr() instead of adding a new primitive? Haven't looked
>> into that, I may be talking nonsense.
>>
>> In any case, I did get a chance to test it and it works for me. Here's
>> the "finished" version, slightly cleaned up and with a comment added
>> for good measure.
> 
> Notes:
> 
> This version gets the ordering right: you need exclusive waits to get
> FIFO ordering on the waitqueue.
> 
> Both versions (yours and mine) suffer from the problem of potentially
> waking too many. I don't think this is a real issue, as generally we
> don't do threaded access to the io_urings. But if you had the following
> tasks wait on the cqring:
> 
> [min_events = 32], [min_events = 8], [min_events = 8]
> 
> and we reach the io_cqring_events() == threshold, we'll wake all three.
> I don't see a good solution to this, so I suspect we just live with it
> until it's proven to be an issue. Both versions are much better than
> what we have now.
> 
If io_cqring_events() == 8, only the last two would be woken up in both
implementations, as to_wait/threshold is specified per waiter, wouldn't
they?

I agree with living with it for now; I don't see a good real-life case
for that which couldn't be handled efficiently in userspace.


-- 
Yours sincerely,
Pavel Begunkov




* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24  8:02               ` Jens Axboe
  2019-09-24  8:27                 ` Jens Axboe
@ 2019-09-24  9:21                 ` Pavel Begunkov
  2019-09-24 10:09                   ` Jens Axboe
  1 sibling, 1 reply; 38+ messages in thread
From: Pavel Begunkov @ 2019-09-24  9:21 UTC (permalink / raw)
  To: Jens Axboe, Ingo Molnar
  Cc: Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel



On 24/09/2019 11:02, Jens Axboe wrote:
> On 9/24/19 1:06 AM, Pavel Begunkov wrote:
>> On 24/09/2019 02:00, Jens Axboe wrote:
>>>> I think we can do the same thing, just wrapping the waitqueue in a
>>>> structure with a count in it, on the stack. Got some flight time
>>>> coming up later today, let me try and cook up a patch.
>>>
>>> Totally untested, and sent out 5 min before departure... But something
>>> like this.
>> Hmm, this reminds me of my first version. Basically that's the same
>> thing but with the macros inlined. I wanted to make it reusable and
>> self-contained, though.
>>
>> If you don't think it could be useful in other places, sure, we could do
>> something like that. Is that so?
> 
> I totally agree it could be useful in other places. Maybe formalized and
> used with wake_up_nr() instead of adding a new primitive? Haven't looked
> into that, I may be talking nonsense.

@nr there is the number of tasks to wake up; AFAIK it doesn't solve the
problem.
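
For reference, it is defined as (include/linux/wait.h):

```
#define wake_up_nr(x, nr)	__wake_up(x, TASK_NORMAL, nr, NULL)
```

i.e. it only controls how many exclusive waiters a single call may
wake, not which waiters qualify.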


> 
> In any case, I did get a chance to test it and it works for me. Here's
> the "finished" version, slightly cleaned up and with a comment added
> for good measure.
> 
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index ca7570aca430..14fae454cf75 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -2768,6 +2768,42 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
>  	return submit;
>  }
>  
> +struct io_wait_queue {
> +	struct wait_queue_entry wq;
> +	struct io_ring_ctx *ctx;
> +	struct task_struct *task;
> +	unsigned to_wait;
> +	unsigned nr_timeouts;
> +};
> +
> +static inline bool io_should_wake(struct io_wait_queue *iowq)
> +{
> +	struct io_ring_ctx *ctx = iowq->ctx;
> +
> +	/*
> +	 * Wake up if we have enough events, or if a timeout occurred since we
> +	 * started waiting. For timeouts, we always want to return to userspace,
> +	 * regardless of event count.
> +	 */
> +	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
> +			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
> +}
> +
> +static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
> +			    int wake_flags, void *key)
> +{
> +	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
> +							wq);
> +
> +	if (io_should_wake(iowq)) {
> +		list_del_init(&curr->entry);
> +		wake_up_process(iowq->task);
> +		return 1;
> +	}
> +
> +	return -1;
> +}
> +
>  /*
>   * Wait until events become available, if we don't already have some. The
>   * application must reap them itself, as they reside on the shared cq ring.
> @@ -2775,8 +2811,16 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
>  static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>  			  const sigset_t __user *sig, size_t sigsz)
>  {
> +	struct io_wait_queue iowq = {
> +		.wq = {
> +			.func	= io_wake_function,
> +			.entry	= LIST_HEAD_INIT(iowq.wq.entry),
> +		},
> +		.task		= current,
> +		.ctx		= ctx,
> +		.to_wait	= min_events,
> +	};
>  	struct io_rings *rings = ctx->rings;
> -	unsigned nr_timeouts;
>  	int ret;
>  
>  	if (io_cqring_events(rings) >= min_events)
> @@ -2795,15 +2839,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>  			return ret;
>  	}
>  
> -	nr_timeouts = atomic_read(&ctx->cq_timeouts);
> -	/*
> -	 * Return if we have enough events, or if a timeout occured since
> -	 * we started waiting. For timeouts, we always want to return to
> -	 * userspace.
> -	 */
> -	ret = wait_event_interruptible(ctx->wait,
> -				io_cqring_events(rings) >= min_events ||
> -				atomic_read(&ctx->cq_timeouts) != nr_timeouts);
> +	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
> +	prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE);
> +	do {
> +		if (io_should_wake(&iowq))
> +			break;
> +		schedule();
> +		set_current_state(TASK_INTERRUPTIBLE);
> +	} while (1);
> +	finish_wait(&ctx->wait, &iowq.wq);
> +
>  	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
>  	if (ret == -ERESTARTSYS)
>  		ret = -EINTR;
> 

-- 
Yours sincerely,
Pavel Begunkov




* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24  8:36                   ` Jens Axboe
@ 2019-09-24  9:33                     ` Pavel Begunkov
  2019-09-24 10:11                       ` Jens Axboe
  2019-09-24  9:49                     ` Peter Zijlstra
  1 sibling, 1 reply; 38+ messages in thread
From: Pavel Begunkov @ 2019-09-24  9:33 UTC (permalink / raw)
  To: Jens Axboe, Ingo Molnar
  Cc: Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel





On 24/09/2019 11:36, Jens Axboe wrote:
> On 9/24/19 2:27 AM, Jens Axboe wrote:
>> On 9/24/19 2:02 AM, Jens Axboe wrote:
>>> On 9/24/19 1:06 AM, Pavel Begunkov wrote:
>>>> On 24/09/2019 02:00, Jens Axboe wrote:
>>>>>> I think we can do the same thing, just wrapping the waitqueue in a
>>>>>> structure with a count in it, on the stack. Got some flight time
>>>>>> coming up later today, let me try and cook up a patch.
>>>>>
>>>>> Totally untested, and sent out 5 min before departure... But something
>>>>> like this.
>>>> Hmm, this reminds me of my first version. Basically that's the same
>>>> thing but with the macros inlined. I wanted to make it reusable and
>>>> self-contained, though.
>>>>
>>>> If you don't think it could be useful in other places, sure, we could do
>>>> something like that. Is that so?
>>>
>>> I totally agree it could be useful in other places. Maybe formalized and
>>> used with wake_up_nr() instead of adding a new primitive? Haven't looked
>>> into that, I may be talking nonsense.
>>>
>>> In any case, I did get a chance to test it and it works for me. Here's
>>> the "finished" version, slightly cleaned up and with a comment added
>>> for good measure.
>>
>> Notes:
>>
>> This version gets the ordering right, you need exclusive waits to get
>> fifo ordering on the waitqueue.
>>
>> Both versions (yours and mine) suffer from the problem of potentially
>> waking too many. I don't think this is a real issue, as generally we
>> don't do threaded access to the io_urings. But if you had the following
>> tasks wait on the cqring:
>>
>> [min_events = 32], [min_events = 8], [min_events = 8]
>>
>> and we reach the io_cqring_events() == threshold, we'll wake all three.
>> I don't see a good solution to this, so I suspect we just live with it
>> until it's proven an issue. Both versions are much better than what we
>> have now.
> 
> Forgot an issue around signal handling, version below adds the
> right check for that too.

That seems like a good reason not to keep reimplementing the
"prepare_to_wait*() + wait loop" pattern every time, but to keep it in
sched :)

> 
> Curious what your test case was for this?
You mean a performance test case? It's briefly described in a comment
for the second patch. That's just a rewritten io_uring-bench, with
1. one thread generating 1 request per call in a loop,
2. and a second thread waiting for ~128 events.
Both are pinned to the same core.
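
For illustration, a minimal userspace sketch of that kind of test,
assuming liburing (the actual modified io_uring-bench is not in the
thread, so names and counts here are invented):

	/* Hypothetical benchmark sketch, assuming liburing; not the real test. */
	#include <liburing.h>
	#include <pthread.h>

	#define WAIT_NR		128

	static struct io_uring ring;

	static void *submitter(void *arg)
	{
		for (;;) {
			struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

			if (!sqe)
				continue;	/* SQ full: let the reaper catch up */
			io_uring_prep_nop(sqe);
			io_uring_submit(&ring);	/* one request per call */
		}
		return NULL;
	}

	int main(void)
	{
		unsigned long reaped = 0;
		pthread_t thr;

		io_uring_queue_init(1024, &ring, 0);
		pthread_create(&thr, NULL, submitter, NULL);
		/* both threads are pinned to one core in the real test,
		 * e.g. via pthread_setaffinity_np(); omitted for brevity */
		while (reaped < (1UL << 24)) {
			struct io_uring_cqe *cqe;
			unsigned head, nr = 0;

			io_uring_wait_cqe_nr(&ring, &cqe, WAIT_NR);
			io_uring_for_each_cqe(&ring, head, cqe)
				nr++;
			io_uring_cq_advance(&ring, nr);
			reaped += nr;
		}
		return 0;
	}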

> 
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index ca7570aca430..3fbab5692f14 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -2768,6 +2768,42 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
>  	return submit;
>  }
>  
> +struct io_wait_queue {
> +	struct wait_queue_entry wq;
> +	struct io_ring_ctx *ctx;
> +	struct task_struct *task;
> +	unsigned to_wait;
> +	unsigned nr_timeouts;
> +};
> +
> +static inline bool io_should_wake(struct io_wait_queue *iowq)
> +{
> +	struct io_ring_ctx *ctx = iowq->ctx;
> +
> +	/*
> +	 * Wake up if we have enough events, or if a timeout occurred since we
> +	 * started waiting. For timeouts, we always want to return to userspace,
> +	 * regardless of event count.
> +	 */
> +	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
> +			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
> +}
> +
> +static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
> +			    int wake_flags, void *key)
> +{
> +	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
> +							wq);
> +
> +	if (io_should_wake(iowq)) {
> +		list_del_init(&curr->entry);
> +		wake_up_process(iowq->task);
> +		return 1;
> +	}
> +
> +	return -1;
> +}
> +
>  /*
>   * Wait until events become available, if we don't already have some. The
>   * application must reap them itself, as they reside on the shared cq ring.
> @@ -2775,8 +2811,16 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
>  static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>  			  const sigset_t __user *sig, size_t sigsz)
>  {
> +	struct io_wait_queue iowq = {
> +		.wq = {
> +			.func	= io_wake_function,
> +			.entry	= LIST_HEAD_INIT(iowq.wq.entry),
> +		},
> +		.task		= current,
> +		.ctx		= ctx,
> +		.to_wait	= min_events,
> +	};
>  	struct io_rings *rings = ctx->rings;
> -	unsigned nr_timeouts;
>  	int ret;
>  
>  	if (io_cqring_events(rings) >= min_events)
> @@ -2795,15 +2839,18 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>  			return ret;
>  	}
>  
> -	nr_timeouts = atomic_read(&ctx->cq_timeouts);
> -	/*
> -	 * Return if we have enough events, or if a timeout occured since
> -	 * we started waiting. For timeouts, we always want to return to
> -	 * userspace.
> -	 */
> -	ret = wait_event_interruptible(ctx->wait,
> -				io_cqring_events(rings) >= min_events ||
> -				atomic_read(&ctx->cq_timeouts) != nr_timeouts);
> +	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
> +	prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE);
> +	do {
> +		if (io_should_wake(&iowq))
> +			break;
> +		schedule();
> +		if (signal_pending(current))
> +			break;
> +		set_current_state(TASK_INTERRUPTIBLE);
> +	} while (1);
> +	finish_wait(&ctx->wait, &iowq.wq);
> +
>  	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
>  	if (ret == -ERESTARTSYS)
>  		ret = -EINTR;
> 

-- 
Yours sincerely,
Pavel Begunkov



^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24  8:36                   ` Jens Axboe
  2019-09-24  9:33                     ` Pavel Begunkov
@ 2019-09-24  9:49                     ` Peter Zijlstra
  2019-09-24 10:13                       ` Jens Axboe
  1 sibling, 1 reply; 38+ messages in thread
From: Peter Zijlstra @ 2019-09-24  9:49 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Pavel Begunkov, Ingo Molnar, Ingo Molnar, linux-block, linux-kernel

On Tue, Sep 24, 2019 at 10:36:28AM +0200, Jens Axboe wrote:

> +struct io_wait_queue {
> +	struct wait_queue_entry wq;
> +	struct io_ring_ctx *ctx;
> +	struct task_struct *task;

wq.private is where the normal waitqueue stores the task pointer.

(I'm going to rename that)
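
For reference, the entry layout in <linux/wait.h> is roughly:

	struct wait_queue_entry {
		unsigned int		flags;
		void			*private;	/* normally the woken task */
		wait_queue_func_t	func;
		struct list_head	entry;
	};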

> +	unsigned to_wait;
> +	unsigned nr_timeouts;
> +};
> +
> +static inline bool io_should_wake(struct io_wait_queue *iowq)
> +{
> +	struct io_ring_ctx *ctx = iowq->ctx;
> +
> +	/*
> +	 * Wake up if we have enough events, or if a timeout occurred since we
> +	 * started waiting. For timeouts, we always want to return to userspace,
> +	 * regardless of event count.
> +	 */
> +	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
> +			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
> +}
> +
> +static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
> +			    int wake_flags, void *key)
> +{
> +	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
> +							wq);
> +
> +	if (io_should_wake(iowq)) {
> +		list_del_init(&curr->entry);
> +		wake_up_process(iowq->task);

Then you can use autoremove_wake_function() here.

> +		return 1;
> +	}
> +
> +	return -1;
> +}

Ideally we'd get wait_event()'s @cond in a custom wake function. Then we
can _always_ do this.

This is one I'd love to have lambda functions for. It would actually
work with GCC nested functions, because the wake function will always be
in scope, but we can't use those in the kernel for other reasons :/
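
For reference, autoremove_wake_function() in kernel/sched/wait.c is
approximately:

	int autoremove_wake_function(struct wait_queue_entry *wq_entry,
				     unsigned mode, int sync, void *key)
	{
		int ret = default_wake_function(wq_entry, mode, sync, key);

		/* unlink the entry only if the task was actually woken */
		if (ret)
			list_del_init(&wq_entry->entry);
		return ret;
	}

i.e. exactly the wake-and-unlink pair that io_wake_function() open-codes
above.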


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24  9:20                   ` Pavel Begunkov
@ 2019-09-24 10:09                     ` Jens Axboe
  0 siblings, 0 replies; 38+ messages in thread
From: Jens Axboe @ 2019-09-24 10:09 UTC (permalink / raw)
  To: Pavel Begunkov, Ingo Molnar
  Cc: Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel

On 9/24/19 3:20 AM, Pavel Begunkov wrote:
> 
> 
> On 24/09/2019 11:27, Jens Axboe wrote:
>> On 9/24/19 2:02 AM, Jens Axboe wrote:
>>> On 9/24/19 1:06 AM, Pavel Begunkov wrote:
>>>> On 24/09/2019 02:00, Jens Axboe wrote:
>>>>>> I think we can do the same thing, just wrapping the waitqueue in a
>>>>>> structure with a count in it, on the stack. Got some flight time
>>>>>> coming up later today, let me try and cook up a patch.
>>>>>
>>>>> Totally untested, and sent out 5 min before departure... But something
>>>>> like this.
>>>> Hmm, reminds me of my first version. Basically that's the same thing,
>>>> but with the macros inlined. I wanted to make it reusable and
>>>> self-contained, though.
>>>>
>>>> If you don't think it could be useful in other places, sure, we could
>>>> do something like that. Is that the case?
>>>
>>> I totally agree it could be useful in other places. Maybe formalized and
>>> used with wake_up_nr() instead of adding a new primitive? Haven't looked
>>> into that, I may be talking nonsense.
>>>
>>> In any case, I did get a chance to test it and it works for me. Here's
>>> the "finished" version, slightly cleaned up and with a comment added
>>> for good measure.
>>
>> Notes:
>>
>> This version gets the ordering right, you need exclusive waits to get
>> fifo ordering on the waitqueue.
>>
>> Both versions (yours and mine) suffer from the problem of potentially
>> waking too many. I don't think this is a real issue, as generally we
>> don't do threaded access to the io_urings. But if you had the following
>> tasks wait on the cqring:
>>
>> [min_events = 32], [min_events = 8], [min_events = 8]
>>
>> and we reach the io_cqring_events() == threshold, we'll wake all three.
>> I don't see a good solution to this, so I suspect we just live with it
>> until it's proven an issue. Both versions are much better than what we
>> have now.
>>
> If io_cqring_events() == 8, only the last two would be woken up in both
> implementations, as to_wait/threshold is specified per waiter. Isn't it?

If io_cqring_events() == 8, then none would be woken in my
implementation since the first one will break the wakeup loop.
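
(That works because the core scan in __wake_up_common() stops as soon as
a wake function returns a negative value; stripped of the bookmark
handling, the loop in kernel/sched/wait.c is roughly:

	list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
		unsigned int flags = curr->flags;
		int ret = curr->func(curr, mode, wake_flags, key);

		if (ret < 0)	/* wake function vetoed: stop the scan */
			break;
		if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;
	}

so returning -1 from io_wake_function() blocks everyone behind it in the
queue, not just the current entry.)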

> Agreed about the waiting; I don't see a good real-life case for that
> which couldn't be done efficiently in userspace.

Exactly

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24  9:21                 ` Pavel Begunkov
@ 2019-09-24 10:09                   ` Jens Axboe
  0 siblings, 0 replies; 38+ messages in thread
From: Jens Axboe @ 2019-09-24 10:09 UTC (permalink / raw)
  To: Pavel Begunkov, Ingo Molnar
  Cc: Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel

On 9/24/19 3:21 AM, Pavel Begunkov wrote:
> On 24/09/2019 11:02, Jens Axboe wrote:
>> On 9/24/19 1:06 AM, Pavel Begunkov wrote:
>>> On 24/09/2019 02:00, Jens Axboe wrote:
>>>>> I think we can do the same thing, just wrapping the waitqueue in a
>>>>> structure with a count in it, on the stack. Got some flight time
>>>>> coming up later today, let me try and cook up a patch.
>>>>
>>>> Totally untested, and sent out 5 min before departure... But something
>>>> like this.
>>> Hmm, reminds me of my first version. Basically that's the same thing,
>>> but with the macros inlined. I wanted to make it reusable and
>>> self-contained, though.
>>>
>>> If you don't think it could be useful in other places, sure, we could
>>> do something like that. Is that the case?
>>
>> I totally agree it could be useful in other places. Maybe formalized and
>> used with wake_up_nr() instead of adding a new primitive? Haven't looked
>> into that, I may be talking nonsense.
> 
> @nr there is the number of tasks to wake up. AFAIK it doesn't solve
> the problem.

Ah right, embarrassingly I'm actually the one who added that
functionality ages ago...

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24  9:33                     ` Pavel Begunkov
@ 2019-09-24 10:11                       ` Jens Axboe
  0 siblings, 0 replies; 38+ messages in thread
From: Jens Axboe @ 2019-09-24 10:11 UTC (permalink / raw)
  To: Pavel Begunkov, Ingo Molnar
  Cc: Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel

On 9/24/19 3:33 AM, Pavel Begunkov wrote:
> 
> 
> On 24/09/2019 11:36, Jens Axboe wrote:
>> On 9/24/19 2:27 AM, Jens Axboe wrote:
>>> On 9/24/19 2:02 AM, Jens Axboe wrote:
>>>> On 9/24/19 1:06 AM, Pavel Begunkov wrote:
>>>>> On 24/09/2019 02:00, Jens Axboe wrote:
>>>>>>> I think we can do the same thing, just wrapping the waitqueue in a
>>>>>>> structure with a count in it, on the stack. Got some flight time
>>>>>>> coming up later today, let me try and cook up a patch.
>>>>>>
>>>>>> Totally untested, and sent out 5 min before departure... But something
>>>>>> like this.
>>>>> Hmm, reminds me of my first version. Basically that's the same thing,
>>>>> but with the macros inlined. I wanted to make it reusable and
>>>>> self-contained, though.
>>>>>
>>>>> If you don't think it could be useful in other places, sure, we could
>>>>> do something like that. Is that the case?
>>>>
>>>> I totally agree it could be useful in other places. Maybe formalized and
>>>> used with wake_up_nr() instead of adding a new primitive? Haven't looked
>>>> into that, I may be talking nonsense.
>>>>
>>>> In any case, I did get a chance to test it and it works for me. Here's
>>>> the "finished" version, slightly cleaned up and with a comment added
>>>> for good measure.
>>>
>>> Notes:
>>>
>>> This version gets the ordering right, you need exclusive waits to get
>>> fifo ordering on the waitqueue.
>>>
>>> Both versions (yours and mine) suffer from the problem of potentially
>>> waking too many. I don't think this is a real issue, as generally we
>>> don't do threaded access to the io_urings. But if you had the following
>>> tasks wait on the cqring:
>>>
>>> [min_events = 32], [min_events = 8], [min_events = 8]
>>>
>>> and we reach the io_cqring_events() == threshold, we'll wake all three.
>>> I don't see a good solution to this, so I suspect we just live with it
>>> until it's proven an issue. Both versions are much better than what we
>>> have now.
>>
>> Forgot an issue around signal handling, version below adds the
>> right check for that too.
> 
> That seems like a good reason not to keep reimplementing the
> "prepare_to_wait*() + wait loop" pattern every time, but to keep it in
> sched :)

I think if we do the ->private cleanup that Peter mentioned, then
there's not much left in terms of consolidation. Not convinced the case
is interesting enough to warrant a special helper. If others show up,
it's easy enough to consolidate the use cases and unify them.

If you look at wake_up_nr(), I would have thought that would be more
widespread. But it really isn't.
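
(For reference, wake_up_nr() in <linux/wait.h> is simply

	#define wake_up_nr(x, nr)	__wake_up(x, TASK_NORMAL, nr, NULL)

where @nr caps how many exclusive waiters are woken per call; it bounds
the number of tasks woken, not the number of events waited for, which is
why it doesn't fit here.)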

>> Curious what your test case was for this?
> You mean a performance test case? It's briefly described in a comment
> for the second patch. That's just a rewritten io_uring-bench, with
> 1. one thread generating 1 request per call in a loop,
> 2. and a second thread waiting for ~128 events.
> Both are pinned to the same core.

Gotcha, thanks.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24  9:49                     ` Peter Zijlstra
@ 2019-09-24 10:13                       ` Jens Axboe
  2019-09-24 10:34                         ` Jens Axboe
  0 siblings, 1 reply; 38+ messages in thread
From: Jens Axboe @ 2019-09-24 10:13 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Pavel Begunkov, Ingo Molnar, Ingo Molnar, linux-block, linux-kernel

On 9/24/19 3:49 AM, Peter Zijlstra wrote:
> On Tue, Sep 24, 2019 at 10:36:28AM +0200, Jens Axboe wrote:
> 
>> +struct io_wait_queue {
>> +	struct wait_queue_entry wq;
>> +	struct io_ring_ctx *ctx;
>> +	struct task_struct *task;
> 
> wq.private is where the normal waitqueue stores the task pointer.
> 
> (I'm going to rename that)

If you do that, then we can just base the io_uring parts on that. 

>> +	unsigned to_wait;
>> +	unsigned nr_timeouts;
>> +};
>> +
>> +static inline bool io_should_wake(struct io_wait_queue *iowq)
>> +{
>> +	struct io_ring_ctx *ctx = iowq->ctx;
>> +
>> +	/*
>> +	 * Wake up if we have enough events, or if a timeout occurred since we
>> +	 * started waiting. For timeouts, we always want to return to userspace,
>> +	 * regardless of event count.
>> +	 */
>> +	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
>> +			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
>> +}
>> +
>> +static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>> +			    int wake_flags, void *key)
>> +{
>> +	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
>> +							wq);
>> +
>> +	if (io_should_wake(iowq)) {
>> +		list_del_init(&curr->entry);
>> +		wake_up_process(iowq->task);
> 
> Then you can use autoremove_wake_function() here.
> 
>> +		return 1;
>> +	}
>> +
>> +	return -1;
>> +}
> 
> Ideally we'd get wait_event()'s @cond in a custom wake function. Then we
> can _always_ do this.
> 
> This is one I'd love to have lambda functions for. It would actually
> work with GCC nested functions, because the wake function will always be
> in scope, but we can't use those in the kernel for other reasons :/

I'll be happy enough if I can just call autoremove_wake_function(), I
think that will simplify the case enough for io_uring to not really make
me care too much about going further. I'll leave that to you, if you
have the desire :-)

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24 10:13                       ` Jens Axboe
@ 2019-09-24 10:34                         ` Jens Axboe
  2019-09-24 11:11                           ` Pavel Begunkov
  2019-09-24 11:33                           ` Peter Zijlstra
  0 siblings, 2 replies; 38+ messages in thread
From: Jens Axboe @ 2019-09-24 10:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Pavel Begunkov, Ingo Molnar, Ingo Molnar, linux-block, linux-kernel

On 9/24/19 4:13 AM, Jens Axboe wrote:
> On 9/24/19 3:49 AM, Peter Zijlstra wrote:
>> On Tue, Sep 24, 2019 at 10:36:28AM +0200, Jens Axboe wrote:
>>
>>> +struct io_wait_queue {
>>> +	struct wait_queue_entry wq;
>>> +	struct io_ring_ctx *ctx;
>>> +	struct task_struct *task;
>>
>> wq.private is where the normal waitqueue stores the task pointer.
>>
>> (I'm going to rename that)
> 
> If you do that, then we can just base the io_uring parts on that.

Just took a quick look at it, and ran into block/kyber-iosched.c that
actually uses the private pointer for something that isn't a task
struct...

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24 10:34                         ` Jens Axboe
@ 2019-09-24 11:11                           ` Pavel Begunkov
  2019-09-24 11:15                             ` Jens Axboe
  2019-09-24 11:43                             ` Peter Zijlstra
  2019-09-24 11:33                           ` Peter Zijlstra
  1 sibling, 2 replies; 38+ messages in thread
From: Pavel Begunkov @ 2019-09-24 11:11 UTC (permalink / raw)
  To: Jens Axboe, Peter Zijlstra
  Cc: Ingo Molnar, Ingo Molnar, linux-block, linux-kernel

On 24/09/2019 13:34, Jens Axboe wrote:
> On 9/24/19 4:13 AM, Jens Axboe wrote:
>> On 9/24/19 3:49 AM, Peter Zijlstra wrote:
>>> On Tue, Sep 24, 2019 at 10:36:28AM +0200, Jens Axboe wrote:
>>>
>>>> +struct io_wait_queue {
>>>> +	struct wait_queue_entry wq;
>>>> +	struct io_ring_ctx *ctx;
>>>> +	struct task_struct *task;
>>>
>>> wq.private is where the normal waitqueue stores the task pointer.
>>>
>>> (I'm going to rename that)
>>
>> If you do that, then we can just base the io_uring parts on that.
> 
> Just took a quick look at it, and ran into block/kyber-iosched.c that
> actually uses the private pointer for something that isn't a task
> struct...
> 

Let's reuse autoremove_wake_function() anyway. I changed your patch a bit:


diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5c3f2bb81637..a77971290fdd 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2690,6 +2690,38 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
 	return submit;
 }
 
+struct io_wait_queue {
+	struct wait_queue_entry wq;
+	struct io_ring_ctx *ctx;
+	unsigned to_wait;
+	unsigned nr_timeouts;
+};
+
+static inline bool io_should_wake(struct io_wait_queue *iowq)
+{
+	struct io_ring_ctx *ctx = iowq->ctx;
+
+	/*
+	 * Wake up if we have enough events, or if a timeout occurred since we
+	 * started waiting. For timeouts, we always want to return to userspace,
+	 * regardless of event count.
+	 */
+	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
+			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
+}
+
+static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
+			    int wake_flags, void *key)
+{
+	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
+							wq);
+
+	if (!io_should_wake(iowq))
+		return -1;
+
+	return autoremove_wake_function(curr, mode, wake_flags, key);
+}
+
 /*
  * Wait until events become available, if we don't already have some. The
  * application must reap them itself, as they reside on the shared cq ring.
@@ -2697,8 +2729,16 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			  const sigset_t __user *sig, size_t sigsz)
 {
+	struct io_wait_queue iowq = {
+		.wq = {
+			.private = current,
+			.func	= io_wake_function,
+			.entry	= LIST_HEAD_INIT(iowq.wq.entry),
+		},
+		.ctx		= ctx,
+		.to_wait	= min_events,
+	};
 	struct io_rings *rings = ctx->rings;
-	unsigned nr_timeouts;
 	int ret;
 
 	if (io_cqring_events(rings) >= min_events)
@@ -2717,15 +2757,18 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			return ret;
 	}
 
-	nr_timeouts = atomic_read(&ctx->cq_timeouts);
-	/*
-	 * Return if we have enough events, or if a timeout occured since
-	 * we started waiting. For timeouts, we always want to return to
-	 * userspace.
-	 */
-	ret = wait_event_interruptible(ctx->wait,
-				io_cqring_events(rings) >= min_events ||
-				atomic_read(&ctx->cq_timeouts) != nr_timeouts);
+	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
+	prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE);
+	do {
+		if (io_should_wake(&iowq))
+			break;
+		schedule();
+		if (signal_pending(current))
+			break;
+		set_current_state(TASK_INTERRUPTIBLE);
+	} while (1);
+	finish_wait(&ctx->wait, &iowq.wq);
+
 	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;


-- 
Yours sincerely,
Pavel Begunkov

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24 11:11                           ` Pavel Begunkov
@ 2019-09-24 11:15                             ` Jens Axboe
  2019-09-24 11:23                               ` Pavel Begunkov
  2019-09-24 11:43                             ` Peter Zijlstra
  1 sibling, 1 reply; 38+ messages in thread
From: Jens Axboe @ 2019-09-24 11:15 UTC (permalink / raw)
  To: Pavel Begunkov, Peter Zijlstra
  Cc: Ingo Molnar, Ingo Molnar, linux-block, linux-kernel

On 9/24/19 5:11 AM, Pavel Begunkov wrote:
> On 24/09/2019 13:34, Jens Axboe wrote:
>> On 9/24/19 4:13 AM, Jens Axboe wrote:
>>> On 9/24/19 3:49 AM, Peter Zijlstra wrote:
>>>> On Tue, Sep 24, 2019 at 10:36:28AM +0200, Jens Axboe wrote:
>>>>
>>>>> +struct io_wait_queue {
>>>>> +	struct wait_queue_entry wq;
>>>>> +	struct io_ring_ctx *ctx;
>>>>> +	struct task_struct *task;
>>>>
>>>> wq.private is where the normal waitqueue stores the task pointer.
>>>>
>>>> (I'm going to rename that)
>>>
>>> If you do that, then we can just base the io_uring parts on that.
>>
>> Just took a quick look at it, and ran into block/kyber-iosched.c that
>> actually uses the private pointer for something that isn't a task
>> struct...
>>
> 
> Let's reuse autoremove_wake_function() anyway. I changed your patch a bit:

Yep that should do it, and saves 8 bytes of stack as well.

BTW, did you test my patch, this one or the previous? Just curious if it
worked for you.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24 11:15                             ` Jens Axboe
@ 2019-09-24 11:23                               ` Pavel Begunkov
  2019-09-24 13:13                                 ` Jens Axboe
  0 siblings, 1 reply; 38+ messages in thread
From: Pavel Begunkov @ 2019-09-24 11:23 UTC (permalink / raw)
  To: Jens Axboe, Peter Zijlstra
  Cc: Ingo Molnar, Ingo Molnar, linux-block, linux-kernel



On 24/09/2019 14:15, Jens Axboe wrote:
> On 9/24/19 5:11 AM, Pavel Begunkov wrote:
>> On 24/09/2019 13:34, Jens Axboe wrote:
>>> On 9/24/19 4:13 AM, Jens Axboe wrote:
>>>> On 9/24/19 3:49 AM, Peter Zijlstra wrote:
>>>>> On Tue, Sep 24, 2019 at 10:36:28AM +0200, Jens Axboe wrote:
>>>>>
>>>>>> +struct io_wait_queue {
>>>>>> +	struct wait_queue_entry wq;
>>>>>> +	struct io_ring_ctx *ctx;
>>>>>> +	struct task_struct *task;
>>>>>
>>>>> wq.private is where the normal waitqueue stores the task pointer.
>>>>>
>>>>> (I'm going to rename that)
>>>>
>>>> If you do that, then we can just base the io_uring parts on that.
>>>
>>> Just took a quick look at it, and ran into block/kyber-iosched.c that
>>> actually uses the private pointer for something that isn't a task
>>> struct...
>>>
>>
>> Let's reuse autoremove_wake_function() anyway. I changed your patch a bit:
> 
> Yep that should do it, and saves 8 bytes of stack as well.
> 
> BTW, did you test my patch, this one or the previous? Just curious if it
> worked for you.
> 
Not yet, going to do that tonight

-- 
Yours sincerely,
Pavel Begunkov



^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24 10:34                         ` Jens Axboe
  2019-09-24 11:11                           ` Pavel Begunkov
@ 2019-09-24 11:33                           ` Peter Zijlstra
  1 sibling, 0 replies; 38+ messages in thread
From: Peter Zijlstra @ 2019-09-24 11:33 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Pavel Begunkov, Ingo Molnar, Ingo Molnar, linux-block, linux-kernel

On Tue, Sep 24, 2019 at 12:34:17PM +0200, Jens Axboe wrote:

> Just took a quick look at it, and ran into block/kyber-iosched.c that
> actually uses the private pointer for something that isn't a task
> struct...

Argh... that's some 'creative' abuse of the waitqueue API :/

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24 11:11                           ` Pavel Begunkov
  2019-09-24 11:15                             ` Jens Axboe
@ 2019-09-24 11:43                             ` Peter Zijlstra
  2019-09-24 12:57                               ` Jens Axboe
  1 sibling, 1 reply; 38+ messages in thread
From: Peter Zijlstra @ 2019-09-24 11:43 UTC (permalink / raw)
  To: Pavel Begunkov
  Cc: Jens Axboe, Ingo Molnar, Ingo Molnar, linux-block, linux-kernel

On Tue, Sep 24, 2019 at 02:11:29PM +0300, Pavel Begunkov wrote:

> @@ -2717,15 +2757,18 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>  			return ret;
>  	}
>  
> +	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
> +	prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE);
> +	do {
> +		if (io_should_wake(&iowq))
> +			break;
> +		schedule();
> +		if (signal_pending(current))
> +			break;
> +		set_current_state(TASK_INTERRUPTIBLE);
> +	} while (1);
> +	finish_wait(&ctx->wait, &iowq.wq);

It is likely OK, but for paranoia's sake, I'd prefer this form:

	for (;;) {
		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE);
		if (io_should_wake(&iowq))
			break;
		schedule();
		if (signal_pending(current))
			break;
	}
	finish_wait(&ctx->wait, &iowq.wq);

The thing is, if we ever succeed with io_wake_function() (that CPU
observes io_should_wake()), but the task waking up here does not observe
io_should_wake() and goes to sleep again, we might never wake up, since
we don't put ourselves back on the wait-list again.
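
Spelled out as a hypothetical interleaving:

	waker CPU                       waiter CPU
	---------                       ----------
	io_wake_function()
	  io_should_wake() -> true
	  list_del_init(&curr->entry)   returns from schedule() for some
	                                other reason
	  wake_up_process(task)         io_should_wake() -> false (stale)
	                                set_current_state(TASK_INTERRUPTIBLE)
	                                schedule()
	                                /* now off the wait-list: no future
	                                   wakeup can find this task */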


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24 11:43                             ` Peter Zijlstra
@ 2019-09-24 12:57                               ` Jens Axboe
  0 siblings, 0 replies; 38+ messages in thread
From: Jens Axboe @ 2019-09-24 12:57 UTC (permalink / raw)
  To: Peter Zijlstra, Pavel Begunkov
  Cc: Ingo Molnar, Ingo Molnar, linux-block, linux-kernel

On 9/24/19 5:43 AM, Peter Zijlstra wrote:
> On Tue, Sep 24, 2019 at 02:11:29PM +0300, Pavel Begunkov wrote:
> 
>> @@ -2717,15 +2757,18 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>>   			return ret;
>>   	}
>>   
>> +	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
>> +	prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE);
>> +	do {
>> +		if (io_should_wake(&iowq))
>> +			break;
>> +		schedule();
>> +		if (signal_pending(current))
>> +			break;
>> +		set_current_state(TASK_INTERRUPTIBLE);
>> +	} while (1);
>> +	finish_wait(&ctx->wait, &iowq.wq);
> 
> It is likely OK, but for paranoia's sake, I'd prefer this form:
> 
> 	for (;;) {
> 		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE);
> 		if (io_should_wake(&iowq))
> 			break;
> 		schedule();
> 		if (signal_pending(current))
> 			break;
> 	}
> 	finish_wait(&ctx->wait, &iowq.wq);
> 
> The thing is, if we ever succeed with io_wake_function() (that CPU
> observes io_should_wake()), but the task waking up here does not observe
> io_should_wake() and goes to sleep again, we might never wake up, since
> we don't put ourselves back on the wait-list again.

Might be paranoia indeed, but especially after this change, we don't
expect to make frequent roundtrips there. So better safe than sorry,
I'll make the change.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24 11:23                               ` Pavel Begunkov
@ 2019-09-24 13:13                                 ` Jens Axboe
  2019-09-24 17:33                                   ` Pavel Begunkov
  0 siblings, 1 reply; 38+ messages in thread
From: Jens Axboe @ 2019-09-24 13:13 UTC (permalink / raw)
  To: Pavel Begunkov, Peter Zijlstra
  Cc: Ingo Molnar, Ingo Molnar, linux-block, linux-kernel

On 9/24/19 5:23 AM, Pavel Begunkov wrote:
>> Yep that should do it, and saves 8 bytes of stack as well.
>>
>> BTW, did you test my patch, this one or the previous? Just curious if it
>> worked for you.
>>
> Not yet, going to do that tonight

Thanks! For reference, the final version is below. There was still a
signal mishap in there; now it should all be correct, afaict.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9b84232e5cc4..d2a86164d520 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2768,6 +2768,38 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
 	return submit;
 }
 
+struct io_wait_queue {
+	struct wait_queue_entry wq;
+	struct io_ring_ctx *ctx;
+	unsigned to_wait;
+	unsigned nr_timeouts;
+};
+
+static inline bool io_should_wake(struct io_wait_queue *iowq)
+{
+	struct io_ring_ctx *ctx = iowq->ctx;
+
+	/*
+	 * Wake up if we have enough events, or if a timeout occurred since we
+	 * started waiting. For timeouts, we always want to return to userspace,
+	 * regardless of event count.
+	 */
+	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
+			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
+}
+
+static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
+			    int wake_flags, void *key)
+{
+	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
+							wq);
+
+	if (!io_should_wake(iowq))
+		return -1;
+
+	return autoremove_wake_function(curr, mode, wake_flags, key);
+}
+
 /*
  * Wait until events become available, if we don't already have some. The
  * application must reap them itself, as they reside on the shared cq ring.
@@ -2775,8 +2807,16 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			  const sigset_t __user *sig, size_t sigsz)
 {
+	struct io_wait_queue iowq = {
+		.wq = {
+			.private	= current,
+			.func		= io_wake_function,
+			.entry		= LIST_HEAD_INIT(iowq.wq.entry),
+		},
+		.ctx		= ctx,
+		.to_wait	= min_events,
+	};
 	struct io_rings *rings = ctx->rings;
-	unsigned nr_timeouts;
 	int ret;
 
 	if (io_cqring_events(rings) >= min_events)
@@ -2795,15 +2835,20 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			return ret;
 	}
 
-	nr_timeouts = atomic_read(&ctx->cq_timeouts);
-	/*
-	 * Return if we have enough events, or if a timeout occured since
-	 * we started waiting. For timeouts, we always want to return to
-	 * userspace.
-	 */
-	ret = wait_event_interruptible(ctx->wait,
-				io_cqring_events(rings) >= min_events ||
-				atomic_read(&ctx->cq_timeouts) != nr_timeouts);
+	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
+	do {
+		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
+						TASK_INTERRUPTIBLE);
+		if (io_should_wake(&iowq))
+			break;
+		schedule();
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+	} while (1);
+	finish_wait(&ctx->wait, &iowq.wq);
+
 	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24 13:13                                 ` Jens Axboe
@ 2019-09-24 17:33                                   ` Pavel Begunkov
  2019-09-24 17:46                                     ` Jens Axboe
  0 siblings, 1 reply; 38+ messages in thread
From: Pavel Begunkov @ 2019-09-24 17:33 UTC (permalink / raw)
  To: Jens Axboe, Peter Zijlstra
  Cc: Ingo Molnar, Ingo Molnar, linux-block, linux-kernel



On 24/09/2019 16:13, Jens Axboe wrote:
> On 9/24/19 5:23 AM, Pavel Begunkov wrote:
>>> Yep that should do it, and saves 8 bytes of stack as well.
>>>
>>> BTW, did you test my patch, this one or the previous? Just curious if it
>>> worked for you.
>>>
>> Not yet, going to do that tonight
> 
> Thanks! For reference, the final version is below. There was still a
> signal mishap in there; now it should all be correct, afaict.
> 
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 9b84232e5cc4..d2a86164d520 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -2768,6 +2768,38 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
>  	return submit;
>  }
>  
> +struct io_wait_queue {
> +	struct wait_queue_entry wq;
> +	struct io_ring_ctx *ctx;
> +	unsigned to_wait;
> +	unsigned nr_timeouts;
> +};
> +
> +static inline bool io_should_wake(struct io_wait_queue *iowq)
> +{
> +	struct io_ring_ctx *ctx = iowq->ctx;
> +
> +	/*
> +	 * Wake up if we have enough events, or if a timeout occurred since we
> +	 * started waiting. For timeouts, we always want to return to userspace,
> +	 * regardless of event count.
> +	 */
> +	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
> +			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
> +}
> +
> +static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
> +			    int wake_flags, void *key)
> +{
> +	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
> +							wq);
> +
> +	if (!io_should_wake(iowq))
> +		return -1;

It would try to schedule only the first task in the wait list. Is that the
semantic you want?
E.g. for waiters=[32,8] and nr_events == 8, io_wake_function() returns
after @32, and won't wake up the second one. 

> +
> +	return autoremove_wake_function(curr, mode, wake_flags, key);
> +}
> +
>  /*
>   * Wait until events become available, if we don't already have some. The
>   * application must reap them itself, as they reside on the shared cq ring.
> @@ -2775,8 +2807,16 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
>  static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>  			  const sigset_t __user *sig, size_t sigsz)
>  {
> +	struct io_wait_queue iowq = {
> +		.wq = {
> +			.private	= current,
> +			.func		= io_wake_function,
> +			.entry		= LIST_HEAD_INIT(iowq.wq.entry),
> +		},
> +		.ctx		= ctx,
> +		.to_wait	= min_events,
> +	};
>  	struct io_rings *rings = ctx->rings;
> -	unsigned nr_timeouts;
>  	int ret;
>  
>  	if (io_cqring_events(rings) >= min_events)
> @@ -2795,15 +2835,20 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
>  			return ret;
>  	}
>  
> -	nr_timeouts = atomic_read(&ctx->cq_timeouts);
> -	/*
> -	 * Return if we have enough events, or if a timeout occured since
> -	 * we started waiting. For timeouts, we always want to return to
> -	 * userspace.
> -	 */
> -	ret = wait_event_interruptible(ctx->wait,
> -				io_cqring_events(rings) >= min_events ||
> -				atomic_read(&ctx->cq_timeouts) != nr_timeouts);
> +	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
> +	do {
> +		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
> +						TASK_INTERRUPTIBLE);
> +		if (io_should_wake(&iowq))
> +			break;
> +		schedule();
> +		if (signal_pending(current)) {
> +			ret = -ERESTARTSYS;
> +			break;
> +		}
> +	} while (1);
> +	finish_wait(&ctx->wait, &iowq.wq);
> +
>  	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
>  	if (ret == -ERESTARTSYS)
>  		ret = -EINTR;
> 

-- 
Yours sincerely,
Pavel Begunkov



^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24 17:33                                   ` Pavel Begunkov
@ 2019-09-24 17:46                                     ` Jens Axboe
  2019-09-24 18:28                                       ` Pavel Begunkov
  0 siblings, 1 reply; 38+ messages in thread
From: Jens Axboe @ 2019-09-24 17:46 UTC (permalink / raw)
  To: Pavel Begunkov, Peter Zijlstra
  Cc: Ingo Molnar, Ingo Molnar, linux-block, linux-kernel

On 9/24/19 11:33 AM, Pavel Begunkov wrote:
> On 24/09/2019 16:13, Jens Axboe wrote:
>> On 9/24/19 5:23 AM, Pavel Begunkov wrote:
>>>> Yep that should do it, and saves 8 bytes of stack as well.
>>>>
>>>> BTW, did you test my patch, this one or the previous? Just curious if it
>>>> worked for you.
>>>>
>>> Not yet, going to do that tonight
>>
>> Thanks! For reference, the final version is below. There was still a
>> signal mishap in there; now it should all be correct, afaict.
>>
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index 9b84232e5cc4..d2a86164d520 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -2768,6 +2768,38 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
>>   	return submit;
>>   }
>>   
>> +struct io_wait_queue {
>> +	struct wait_queue_entry wq;
>> +	struct io_ring_ctx *ctx;
>> +	unsigned to_wait;
>> +	unsigned nr_timeouts;
>> +};
>> +
>> +static inline bool io_should_wake(struct io_wait_queue *iowq)
>> +{
>> +	struct io_ring_ctx *ctx = iowq->ctx;
>> +
>> +	/*
>> +	 * Wake up if we have enough events, or if a timeout occurred since we
>> +	 * started waiting. For timeouts, we always want to return to userspace,
>> +	 * regardless of event count.
>> +	 */
>> +	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
>> +			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
>> +}
>> +
>> +static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>> +			    int wake_flags, void *key)
>> +{
>> +	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
>> +							wq);
>> +
>> +	if (!io_should_wake(iowq))
>> +		return -1;
> 
> It would try to schedule only the first task in the wait list. Is that the
> semantic you want?
> E.g. for waiters=[32,8] and nr_events == 8, io_wake_function() returns
> after @32, and won't wake up the second one.

Right, those are the semantics I want. We keep the list ordered by using
the exclusive wait addition. Which means that for the case you list,
waiters=32 came first, and we should not wake others before that task
gets the completions it wants. Otherwise we could potentially starve
higher count waiters, if we always keep going and new waiters come in.
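
A worked example with waiters queued FIFO as [32, 8, 8]:

	events == 8  : the scan hits the 32-waiter first, io_should_wake()
	               is false, the wake function returns -1 and the scan
	               stops; nobody is woken yet.
	events == 32 : the 32-waiter is satisfied, woken and unlinked;
	               subsequent wakeups then reach the two 8-waiters,
	               whose condition also holds, so they are woken as
	               well (the "waking too many" case noted earlier).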

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24 17:46                                     ` Jens Axboe
@ 2019-09-24 18:28                                       ` Pavel Begunkov
  2019-09-24 19:32                                         ` Jens Axboe
  0 siblings, 1 reply; 38+ messages in thread
From: Pavel Begunkov @ 2019-09-24 18:28 UTC (permalink / raw)
  To: Jens Axboe, Peter Zijlstra
  Cc: Ingo Molnar, Ingo Molnar, linux-block, linux-kernel



On 24/09/2019 20:46, Jens Axboe wrote:
> On 9/24/19 11:33 AM, Pavel Begunkov wrote:
>> On 24/09/2019 16:13, Jens Axboe wrote:
>>> On 9/24/19 5:23 AM, Pavel Begunkov wrote:
>>>>> Yep that should do it, and saves 8 bytes of stack as well.
>>>>>
>>>>> BTW, did you test my patch, this one or the previous? Just curious if it
>>>>> worked for you.
>>>>>
>>>> Not yet, going to do that tonight
>>>
>>> Thanks! For reference, the final version is below. There was still a
>>> signal mishap in there; now it should all be correct, afaict.
>>>
>>>
>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>> index 9b84232e5cc4..d2a86164d520 100644
>>> --- a/fs/io_uring.c
>>> +++ b/fs/io_uring.c
>>> @@ -2768,6 +2768,38 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
>>>   	return submit;
>>>   }
>>>   
>>> +struct io_wait_queue {
>>> +	struct wait_queue_entry wq;
>>> +	struct io_ring_ctx *ctx;
>>> +	unsigned to_wait;
>>> +	unsigned nr_timeouts;
>>> +};
>>> +
>>> +static inline bool io_should_wake(struct io_wait_queue *iowq)
>>> +{
>>> +	struct io_ring_ctx *ctx = iowq->ctx;
>>> +
>>> +	/*
>>> +	 * Wake up if we have enough events, or if a timeout occurred since we
>>> +	 * started waiting. For timeouts, we always want to return to userspace,
>>> +	 * regardless of event count.
>>> +	 */
>>> +	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
>>> +			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
>>> +}
>>> +
>>> +static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>>> +			    int wake_flags, void *key)
>>> +{
>>> +	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
>>> +							wq);
>>> +
>>> +	if (!io_should_wake(iowq))
>>> +		return -1;
>>
>> It would try to schedule only the first task in the wait list. Is that the
>> semantic you want?
>> E.g. for waiters=[32,8] and nr_events == 8, io_wake_function() returns
>> after @32, and won't wake up the second one.
> 
> Right, those are the semantics I want. We keep the list ordered by using
> the exclusive wait addition. Which means that for the case you list,
> waiters=32 came first, and we should not wake others before that task
> gets the completions it wants. Otherwise we could potentially starve
> higher count waiters, if we always keep going and new waiters come in.
> 
Yes. I think it would be better to document this in the userspace API. I
could imagine some crazy case deadlocking userspace, e.g.:
thread 1: wait_events(8), reap_events
thread 2: wait_events(32), wait(thread 1), reap_events
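
Concretely (hypothetical trace, with thread 2 queued first):

	cq events available : 8
	wait queue (FIFO)   : [thread 2: to_wait=32] -> [thread 1: to_wait=8]

The wake scan stops at thread 2 (8 < 32, so its wake function returns -1),
thread 1 is never woken despite having enough events, and thread 2 in
turn blocks on wait(thread 1): deadlock.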

The patch works well:
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Tested-by: Pavel Begunkov <asml.silence@gmail.com>

BTW, I searched for wait_event*(), and it seems there are plenty of
similar use cases. So, a generic version would be useful, but that's for
later.



-- 
Yours sincerely,
Pavel Begunkov



^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH v2 0/2] Optimise io_uring completion waiting
  2019-09-24 18:28                                       ` Pavel Begunkov
@ 2019-09-24 19:32                                         ` Jens Axboe
  0 siblings, 0 replies; 38+ messages in thread
From: Jens Axboe @ 2019-09-24 19:32 UTC (permalink / raw)
  To: Pavel Begunkov, Peter Zijlstra
  Cc: Ingo Molnar, Ingo Molnar, linux-block, linux-kernel

On 9/24/19 12:28 PM, Pavel Begunkov wrote:
> On 24/09/2019 20:46, Jens Axboe wrote:
>> On 9/24/19 11:33 AM, Pavel Begunkov wrote:
>>> On 24/09/2019 16:13, Jens Axboe wrote:
>>>> On 9/24/19 5:23 AM, Pavel Begunkov wrote:
>>>>>> Yep that should do it, and saves 8 bytes of stack as well.
>>>>>>
>>>>>> BTW, did you test my patch, this one or the previous? Just curious if it
>>>>>> worked for you.
>>>>>>
>>>>> Not yet, going to do that tonight
>>>>
>>>> Thanks! For reference, the final version is below. There was still a
>>>> signal mishap in there; now it should all be correct, afaict.
>>>>
>>>>
>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>> index 9b84232e5cc4..d2a86164d520 100644
>>>> --- a/fs/io_uring.c
>>>> +++ b/fs/io_uring.c
>>>> @@ -2768,6 +2768,38 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
>>>>    	return submit;
>>>>    }
>>>>    
>>>> +struct io_wait_queue {
>>>> +	struct wait_queue_entry wq;
>>>> +	struct io_ring_ctx *ctx;
>>>> +	unsigned to_wait;
>>>> +	unsigned nr_timeouts;
>>>> +};
>>>> +
>>>> +static inline bool io_should_wake(struct io_wait_queue *iowq)
>>>> +{
>>>> +	struct io_ring_ctx *ctx = iowq->ctx;
>>>> +
>>>> +	 * Wake up if we have enough events, or if a timeout occurred since we
>>>> +	 * Wake up if we have enough events, or if a timeout occured since we
>>>> +	 * started waiting. For timeouts, we always want to return to userspace,
>>>> +	 * regardless of event count.
>>>> +	 */
>>>> +	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
>>>> +			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
>>>> +}
>>>> +
>>>> +static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
>>>> +			    int wake_flags, void *key)
>>>> +{
>>>> +	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
>>>> +							wq);
>>>> +
>>>> +	if (!io_should_wake(iowq))
>>>> +		return -1;
>>>
>>> It would try to schedule only the first task in the wait list. Is that the
>>> semantic you want?
>>> E.g. for waiters=[32,8] and nr_events == 8, io_wake_function() returns
>>> after @32, and won't wake up the second one.
>>
>> Right, those are the semantics I want. We keep the list ordered by using
>> the exclusive wait addition. Which means that for the case you list,
>> waiters=32 came first, and we should not wake others before that task
>> gets the completions it wants. Otherwise we could potentially starve
>> higher count waiters, if we always keep going and new waiters come in.
>>
> Yes. I think it would be better to document this in the userspace API. I
> could imagine some crazy case deadlocking userspace, e.g.:
> thread 1: wait_events(8), reap_events
> thread 2: wait_events(32), wait(thread 1), reap_events

No matter how you handle cases like this, deadlocks will always be
possible... So I don't think that's a huge concern. It's more important
not to have intentional livelocks, which we would have if we always
allowed the lowest wait count to get woken and steal the budget every
time.

> The patch works well:
> Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
> Tested-by: Pavel Begunkov <asml.silence@gmail.com>

Thanks, will add!

> BTW, I searched for wait_event*(), and it seems there are plenty of
> similar use cases. So, a generic version would be useful, but that's
> for later.

Agree, it would undoubtedly be useful.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 38+ messages in thread

end of thread, other threads:[~2019-09-24 19:32 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-09-22  8:08 [PATCH v2 0/2] Optimise io_uring completion waiting Pavel Begunkov (Silence)
2019-09-22  8:08 ` [PATCH v2 1/2] sched/wait: Add wait_threshold Pavel Begunkov (Silence)
2019-09-23  7:19   ` Peter Zijlstra
2019-09-23 16:37     ` Pavel Begunkov
2019-09-23 19:27       ` Peter Zijlstra
2019-09-23 20:23         ` Peter Zijlstra
2019-09-24  6:44         ` Pavel Begunkov
2019-09-22  8:08 ` [PATCH v2 2/2] io_uring: Optimise cq waiting with wait_threshold Pavel Begunkov (Silence)
2019-09-22 15:51 ` [PATCH v2 0/2] Optimise io_uring completion waiting Jens Axboe
2019-09-23  8:35   ` Ingo Molnar
2019-09-23 16:21     ` Pavel Begunkov
2019-09-23 16:32       ` Pavel Begunkov
2019-09-23 20:48         ` Jens Axboe
2019-09-23 23:00           ` Jens Axboe
2019-09-24  7:06             ` Pavel Begunkov
2019-09-24  8:02               ` Jens Axboe
2019-09-24  8:27                 ` Jens Axboe
2019-09-24  8:36                   ` Jens Axboe
2019-09-24  9:33                     ` Pavel Begunkov
2019-09-24 10:11                       ` Jens Axboe
2019-09-24  9:49                     ` Peter Zijlstra
2019-09-24 10:13                       ` Jens Axboe
2019-09-24 10:34                         ` Jens Axboe
2019-09-24 11:11                           ` Pavel Begunkov
2019-09-24 11:15                             ` Jens Axboe
2019-09-24 11:23                               ` Pavel Begunkov
2019-09-24 13:13                                 ` Jens Axboe
2019-09-24 17:33                                   ` Pavel Begunkov
2019-09-24 17:46                                     ` Jens Axboe
2019-09-24 18:28                                       ` Pavel Begunkov
2019-09-24 19:32                                         ` Jens Axboe
2019-09-24 11:43                             ` Peter Zijlstra
2019-09-24 12:57                               ` Jens Axboe
2019-09-24 11:33                           ` Peter Zijlstra
2019-09-24  9:20                   ` Pavel Begunkov
2019-09-24 10:09                     ` Jens Axboe
2019-09-24  9:21                 ` Pavel Begunkov
2019-09-24 10:09                   ` Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).