Linux-Block Archive on lore.kernel.org
 help / color / Atom feed
* [RFC PATCH 0/2] Optimise io_uring completion waiting
@ 2019-09-13 22:28 Pavel Begunkov (Silence)
  2019-09-13 22:28 ` [RFC PATCH 1/2] sched/wait: Add wait_threshold Pavel Begunkov (Silence)
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Pavel Begunkov (Silence) @ 2019-09-13 22:28 UTC (permalink / raw)
  To: Jens Axboe, Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel
  Cc: Pavel Begunkov

From: Pavel Begunkov <asml.silence@gmail.com>

There could be a lot of overhead within generic wait_event_*() used for
waiting for large number of completions. The patchset removes much of
it by using custom wait event (wait_threshold).

Synthetic test showed ~40% performance boost. (see patch 2)

Pavel Begunkov (2):
  sched/wait: Add wait_threshold
  io_uring: Optimise cq waiting with wait_threshold

 fs/io_uring.c                  | 21 ++++++-----
 include/linux/wait_threshold.h | 64 ++++++++++++++++++++++++++++++++++
 kernel/sched/Makefile          |  2 +-
 kernel/sched/wait_threshold.c  | 26 ++++++++++++++
 4 files changed, 103 insertions(+), 10 deletions(-)
 create mode 100644 include/linux/wait_threshold.h
 create mode 100644 kernel/sched/wait_threshold.c

-- 
2.22.0


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [RFC PATCH 1/2] sched/wait: Add wait_threshold
  2019-09-13 22:28 [RFC PATCH 0/2] Optimise io_uring completion waiting Pavel Begunkov (Silence)
@ 2019-09-13 22:28 ` Pavel Begunkov (Silence)
  2019-09-13 22:28 ` [RFC PATCH 2/2] io_uring: Optimise cq waiting with wait_threshold Pavel Begunkov (Silence)
  2019-09-14  0:31 ` [RFC PATCH 0/2] Optimise io_uring completion waiting Jens Axboe
  2 siblings, 0 replies; 5+ messages in thread
From: Pavel Begunkov (Silence) @ 2019-09-13 22:28 UTC (permalink / raw)
  To: Jens Axboe, Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel
  Cc: Pavel Begunkov

From: Pavel Begunkov <asml.silence@gmail.com>

Add wait_threshold -- a custom wait_event derivative, that waits until
a value is equal to or greater than the specified threshold.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 include/linux/wait_threshold.h | 64 ++++++++++++++++++++++++++++++++++
 kernel/sched/Makefile          |  2 +-
 kernel/sched/wait_threshold.c  | 26 ++++++++++++++
 3 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/wait_threshold.h
 create mode 100644 kernel/sched/wait_threshold.c

diff --git a/include/linux/wait_threshold.h b/include/linux/wait_threshold.h
new file mode 100644
index 000000000000..01798c3aae1f
--- /dev/null
+++ b/include/linux/wait_threshold.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_WAIT_THRESHOLD_H
+#define _LINUX_WAIT_THRESHOLD_H
+
+#include <linux/wait.h>
+
+struct wait_threshold_queue_entry {
+	struct wait_queue_entry wq_entry;
+	unsigned int threshold;
+};
+
+void init_wait_threshold_entry(struct wait_threshold_queue_entry *wtq_entry,
+				unsigned int threshold);
+
+static inline void wake_up_threshold(struct wait_queue_head *wq_head,
+					unsigned int val)
+{
+	void *arg = (void *)(unsigned long)val;
+
+	__wake_up(wq_head, TASK_NORMAL, 1, arg);
+}
+
+#define ___wait_threshold_event(q, thresh, condition, state,		\
+				exclusive, ret, cmd)			\
+({									\
+	__label__ __out;						\
+	struct wait_queue_head *__wq_head = &q;				\
+	struct wait_threshold_queue_entry __wtq_entry;			\
+	struct wait_queue_entry *__wq_entry = &__wtq_entry.wq_entry;	\
+	long __ret = ret; /* explicit shadow */				\
+									\
+	init_wait_threshold_entry(&__wtq_entry, thresh);		\
+	for (;;) {							\
+		long __int = prepare_to_wait_event(__wq_head,		\
+						   __wq_entry,		\
+						   state);		\
+		if (condition)						\
+			break;						\
+									\
+		if (___wait_is_interruptible(state) && __int) {		\
+			__ret = __int;					\
+			goto __out;					\
+		}							\
+									\
+		cmd;							\
+	}								\
+	finish_wait(__wq_head, __wq_entry);				\
+__out:	__ret;								\
+})
+
+#define __wait_threshold_interruptible(q, thresh, condition)		\
+	___wait_threshold_event(q, thresh, condition, TASK_INTERRUPTIBLE, 0, 0,\
+			  schedule())
+
+#define wait_threshold_interruptible(q, threshold, val)		\
+({								\
+	int __ret = 0;						\
+	might_sleep();						\
+	if ((val) < (threshold))				\
+		__ret = __wait_threshold_interruptible(q,	\
+			threshold, ((val) >= (threshold)));	\
+	__ret;							\
+})
+#endif /* _LINUX_WAIT_THRESHOLD_H */
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 21fb5a5662b5..bb895a3184f9 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -18,7 +18,7 @@ endif
 
 obj-y += core.o loadavg.o clock.o cputime.o
 obj-y += idle.o fair.o rt.o deadline.o
-obj-y += wait.o wait_bit.o swait.o completion.o
+obj-y += wait.o wait_bit.o wait_threshold.o swait.o completion.o
 
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
diff --git a/kernel/sched/wait_threshold.c b/kernel/sched/wait_threshold.c
new file mode 100644
index 000000000000..80a027c02ff3
--- /dev/null
+++ b/kernel/sched/wait_threshold.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "sched.h"
+#include <linux/wait_threshold.h>
+
+static int wake_threshold_function(struct wait_queue_entry *wq_entry,
+				   unsigned int mode, int sync, void *arg)
+{
+	unsigned int val = (unsigned int)(unsigned long)arg;
+	struct wait_threshold_queue_entry *wtq_entry =
+		container_of(wq_entry, struct wait_threshold_queue_entry,
+			wq_entry);
+
+	if (val < wtq_entry->threshold)
+		return 0;
+
+	return default_wake_function(wq_entry, mode, sync, arg);
+}
+
+void init_wait_threshold_entry(struct wait_threshold_queue_entry *wtq_entry,
+			       unsigned int threshold)
+{
+	init_wait_entry(&wtq_entry->wq_entry, 0);
+	wtq_entry->wq_entry.func = wake_threshold_function;
+	wtq_entry->threshold = threshold;
+}
+EXPORT_SYMBOL(init_wait_threshold_entry);
-- 
2.22.0


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [RFC PATCH 2/2] io_uring: Optimise cq waiting with wait_threshold
  2019-09-13 22:28 [RFC PATCH 0/2] Optimise io_uring completion waiting Pavel Begunkov (Silence)
  2019-09-13 22:28 ` [RFC PATCH 1/2] sched/wait: Add wait_threshold Pavel Begunkov (Silence)
@ 2019-09-13 22:28 ` Pavel Begunkov (Silence)
  2019-09-14  0:31 ` [RFC PATCH 0/2] Optimise io_uring completion waiting Jens Axboe
  2 siblings, 0 replies; 5+ messages in thread
From: Pavel Begunkov (Silence) @ 2019-09-13 22:28 UTC (permalink / raw)
  To: Jens Axboe, Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel
  Cc: Pavel Begunkov

From: Pavel Begunkov <asml.silence@gmail.com>

While waiting for completion events in io_cqring_wait(), the process
will be waken up inside wait_threshold_interruptible() on any request
completion, check num of events in completion queue and potentially go
to sleep again.

Apparently, there could be a lot of such spurious wakeups with lots of
overhead. It especially manifests itself, when min_events is large, and
completions are arriving one by one or in small batches (that usually
is true).

E.g. if device completes requests one by one and io_uring_enter is
waiting for 100 events, then there will be ~99 spurious wakeups.

Use new wait_threshold_*() instead, which won't wake it up until
necessary number of events is collected.

Performance test:
The first thread generates requests (QD=512) one by one, so they will
be completed in the similar pattern. The second thread waiting for
128 events to complete.

Tested with null_blk with 5us delay
and 3.8GHz Intel CPU.

throughput before: 270 KIOPS
throughput after:  370 KIOPS
So, ~40% throughput boost on this exaggerate test.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 37395208a729..17d2d30b763a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -70,6 +70,7 @@
 #include <linux/nospec.h>
 #include <linux/sizes.h>
 #include <linux/hugetlb.h>
+#include <linux/wait_threshold.h>
 
 #include <uapi/linux/io_uring.h>
 
@@ -403,6 +404,13 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	return ctx;
 }
 
+static unsigned int io_cqring_events(struct io_rings *rings)
+{
+	/* See comment at the top of this file */
+	smp_rmb();
+	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
+}
+
 static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
 				     struct io_kiocb *req)
 {
@@ -521,7 +529,7 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 {
 	if (waitqueue_active(&ctx->wait))
-		wake_up(&ctx->wait);
+		wake_up_threshold(&ctx->wait, io_cqring_events(ctx->rings));
 	if (waitqueue_active(&ctx->sqo_wait))
 		wake_up(&ctx->sqo_wait);
 	if (ctx->cq_ev_fd)
@@ -546,7 +554,7 @@ static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
 	percpu_ref_put_many(&ctx->refs, refs);
 
 	if (waitqueue_active(&ctx->wait))
-		wake_up(&ctx->wait);
+		wake_up_threshold(&ctx->wait, io_cqring_events(ctx->rings));
 }
 
 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
@@ -681,12 +689,6 @@ static void io_put_req(struct io_kiocb *req)
 		io_free_req(req);
 }
 
-static unsigned io_cqring_events(struct io_rings *rings)
-{
-	/* See comment at the top of this file */
-	smp_rmb();
-	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
-}
 
 /*
  * Find and free completed poll iocbs
@@ -2591,7 +2593,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			return ret;
 	}
 
-	ret = wait_event_interruptible(ctx->wait, io_cqring_events(rings) >= min_events);
+	ret = wait_threshold_interruptible(ctx->wait, min_events,
+					   io_cqring_events(rings));
 	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
-- 
2.22.0


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC PATCH 0/2] Optimise io_uring completion waiting
  2019-09-13 22:28 [RFC PATCH 0/2] Optimise io_uring completion waiting Pavel Begunkov (Silence)
  2019-09-13 22:28 ` [RFC PATCH 1/2] sched/wait: Add wait_threshold Pavel Begunkov (Silence)
  2019-09-13 22:28 ` [RFC PATCH 2/2] io_uring: Optimise cq waiting with wait_threshold Pavel Begunkov (Silence)
@ 2019-09-14  0:31 ` Jens Axboe
  2019-09-14 10:11   ` Pavel Begunkov
  2 siblings, 1 reply; 5+ messages in thread
From: Jens Axboe @ 2019-09-14  0:31 UTC (permalink / raw)
  To: Pavel Begunkov (Silence),
	Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel

On 9/13/19 4:28 PM, Pavel Begunkov (Silence) wrote:
> From: Pavel Begunkov <asml.silence@gmail.com>
> 
> There could be a lot of overhead within generic wait_event_*() used for
> waiting for large number of completions. The patchset removes much of
> it by using custom wait event (wait_threshold).
> 
> Synthetic test showed ~40% performance boost. (see patch 2)

Nifty, from an io_uring perspective, I like this a lot.

The core changes needed to support it look fine as well. I'll await
Peter/Ingo's comments on it.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC PATCH 0/2] Optimise io_uring completion waiting
  2019-09-14  0:31 ` [RFC PATCH 0/2] Optimise io_uring completion waiting Jens Axboe
@ 2019-09-14 10:11   ` Pavel Begunkov
  0 siblings, 0 replies; 5+ messages in thread
From: Pavel Begunkov @ 2019-09-14 10:11 UTC (permalink / raw)
  To: Jens Axboe, Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel

[-- Attachment #1.1: Type: text/plain, Size: 1234 bytes --]

It solves much of the problem, though still have overhead on traversing
a wait queue + indirect calls for checking.

I've been thinking to either
1. create n wait queues and bucketing waiter. E.g. log2(min_events)
bucketing  would remove at least half of such calls for arbitary
min_events and all if min_events is pow2.

2. or dig deeper and add custom wake_up with perhaps sorted wait_queue.
As I see it, it's pretty bulky and over-engineered, but maybe somebody
knows an easier way?

Anyway, I don't have performance numbers for that, so don't know if this
would be justified.


On 14/09/2019 03:31, Jens Axboe wrote:
> On 9/13/19 4:28 PM, Pavel Begunkov (Silence) wrote:
>> From: Pavel Begunkov <asml.silence@gmail.com>
>>
>> There could be a lot of overhead within generic wait_event_*() used for
>> waiting for large number of completions. The patchset removes much of
>> it by using custom wait event (wait_threshold).
>>
>> Synthetic test showed ~40% performance boost. (see patch 2)
> 
> Nifty, from an io_uring perspective, I like this a lot.
> 
> The core changes needed to support it look fine as well. I'll await
> Peter/Ingo's comments on it.
> 

-- 
Yours sincerely,
Pavel Begunkov


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, back to index

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-09-13 22:28 [RFC PATCH 0/2] Optimise io_uring completion waiting Pavel Begunkov (Silence)
2019-09-13 22:28 ` [RFC PATCH 1/2] sched/wait: Add wait_threshold Pavel Begunkov (Silence)
2019-09-13 22:28 ` [RFC PATCH 2/2] io_uring: Optimise cq waiting with wait_threshold Pavel Begunkov (Silence)
2019-09-14  0:31 ` [RFC PATCH 0/2] Optimise io_uring completion waiting Jens Axboe
2019-09-14 10:11   ` Pavel Begunkov

Linux-Block Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-block/0 linux-block/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-block linux-block/ https://lore.kernel.org/linux-block \
		linux-block@vger.kernel.org linux-block@archiver.kernel.org
	public-inbox-index linux-block

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-block


AGPL code for this site: git clone https://public-inbox.org/ public-inbox