* [RFC PATCH 1/2] sched/wait: Add wait_threshold
2019-09-13 22:28 [RFC PATCH 0/2] Optimise io_uring completion waiting Pavel Begunkov (Silence)
@ 2019-09-13 22:28 ` Pavel Begunkov (Silence)
2019-09-13 22:28 ` [RFC PATCH 2/2] io_uring: Optimise cq waiting with wait_threshold Pavel Begunkov (Silence)
2019-09-14 0:31 ` [RFC PATCH 0/2] Optimise io_uring completion waiting Jens Axboe
2 siblings, 0 replies; 5+ messages in thread
From: Pavel Begunkov (Silence) @ 2019-09-13 22:28 UTC (permalink / raw)
To: Jens Axboe, Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel
Cc: Pavel Begunkov
From: Pavel Begunkov <asml.silence@gmail.com>
Add wait_threshold -- a custom wait_event derivative that waits until
a value is equal to or greater than the specified threshold.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
include/linux/wait_threshold.h | 64 ++++++++++++++++++++++++++++++++++
kernel/sched/Makefile | 2 +-
kernel/sched/wait_threshold.c | 26 ++++++++++++++
3 files changed, 91 insertions(+), 1 deletion(-)
create mode 100644 include/linux/wait_threshold.h
create mode 100644 kernel/sched/wait_threshold.c
diff --git a/include/linux/wait_threshold.h b/include/linux/wait_threshold.h
new file mode 100644
index 000000000000..01798c3aae1f
--- /dev/null
+++ b/include/linux/wait_threshold.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_WAIT_THRESHOLD_H
+#define _LINUX_WAIT_THRESHOLD_H
+
+#include <linux/wait.h>
+
+struct wait_threshold_queue_entry {
+ struct wait_queue_entry wq_entry;
+ unsigned int threshold;
+};
+
+void init_wait_threshold_entry(struct wait_threshold_queue_entry *wtq_entry,
+ unsigned int threshold);
+
+static inline void wake_up_threshold(struct wait_queue_head *wq_head,
+ unsigned int val)
+{
+ void *arg = (void *)(unsigned long)val;
+
+ __wake_up(wq_head, TASK_NORMAL, 1, arg);
+}
+
+#define ___wait_threshold_event(q, thresh, condition, state, \
+ exclusive, ret, cmd) \
+({ \
+ __label__ __out; \
+ struct wait_queue_head *__wq_head = &q; \
+ struct wait_threshold_queue_entry __wtq_entry; \
+ struct wait_queue_entry *__wq_entry = &__wtq_entry.wq_entry; \
+ long __ret = ret; /* explicit shadow */ \
+ \
+ init_wait_threshold_entry(&__wtq_entry, thresh); \
+ for (;;) { \
+ long __int = prepare_to_wait_event(__wq_head, \
+ __wq_entry, \
+ state); \
+ if (condition) \
+ break; \
+ \
+ if (___wait_is_interruptible(state) && __int) { \
+ __ret = __int; \
+ goto __out; \
+ } \
+ \
+ cmd; \
+ } \
+ finish_wait(__wq_head, __wq_entry); \
+__out: __ret; \
+})
+
+#define __wait_threshold_interruptible(q, thresh, condition) \
+ ___wait_threshold_event(q, thresh, condition, TASK_INTERRUPTIBLE, 0, 0,\
+ schedule())
+
+#define wait_threshold_interruptible(q, threshold, val) \
+({ \
+ int __ret = 0; \
+ might_sleep(); \
+ if ((val) < (threshold)) \
+ __ret = __wait_threshold_interruptible(q, \
+ threshold, ((val) >= (threshold))); \
+ __ret; \
+})
+#endif /* _LINUX_WAIT_THRESHOLD_H */
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 21fb5a5662b5..bb895a3184f9 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -18,7 +18,7 @@ endif
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle.o fair.o rt.o deadline.o
-obj-y += wait.o wait_bit.o swait.o completion.o
+obj-y += wait.o wait_bit.o wait_threshold.o swait.o completion.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
diff --git a/kernel/sched/wait_threshold.c b/kernel/sched/wait_threshold.c
new file mode 100644
index 000000000000..80a027c02ff3
--- /dev/null
+++ b/kernel/sched/wait_threshold.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "sched.h"
+#include <linux/wait_threshold.h>
+
+static int wake_threshold_function(struct wait_queue_entry *wq_entry,
+ unsigned int mode, int sync, void *arg)
+{
+ unsigned int val = (unsigned int)(unsigned long)arg;
+ struct wait_threshold_queue_entry *wtq_entry =
+ container_of(wq_entry, struct wait_threshold_queue_entry,
+ wq_entry);
+
+ if (val < wtq_entry->threshold)
+ return 0;
+
+ return default_wake_function(wq_entry, mode, sync, arg);
+}
+
+void init_wait_threshold_entry(struct wait_threshold_queue_entry *wtq_entry,
+ unsigned int threshold)
+{
+ init_wait_entry(&wtq_entry->wq_entry, 0);
+ wtq_entry->wq_entry.func = wake_threshold_function;
+ wtq_entry->threshold = threshold;
+}
+EXPORT_SYMBOL(init_wait_threshold_entry);
--
2.22.0
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [RFC PATCH 2/2] io_uring: Optimise cq waiting with wait_threshold
2019-09-13 22:28 [RFC PATCH 0/2] Optimise io_uring completion waiting Pavel Begunkov (Silence)
2019-09-13 22:28 ` [RFC PATCH 1/2] sched/wait: Add wait_threshold Pavel Begunkov (Silence)
@ 2019-09-13 22:28 ` Pavel Begunkov (Silence)
2019-09-14 0:31 ` [RFC PATCH 0/2] Optimise io_uring completion waiting Jens Axboe
2 siblings, 0 replies; 5+ messages in thread
From: Pavel Begunkov (Silence) @ 2019-09-13 22:28 UTC (permalink / raw)
To: Jens Axboe, Ingo Molnar, Peter Zijlstra, linux-block, linux-kernel
Cc: Pavel Begunkov
From: Pavel Begunkov <asml.silence@gmail.com>
While waiting for completion events in io_cqring_wait(), the process
will be woken up inside wait_event_interruptible() on any request
completion, check the number of events in the completion queue and
potentially go to sleep again.
Apparently, there could be a lot of such spurious wakeups with a lot of
overhead. It especially manifests itself when min_events is large and
completions arrive one by one or in small batches (which is usually
the case).
E.g. if device completes requests one by one and io_uring_enter is
waiting for 100 events, then there will be ~99 spurious wakeups.
Use the new wait_threshold_*() helpers instead, which won't wake the
task up until the necessary number of events is collected.
Performance test:
The first thread generates requests (QD=512) one by one, so they will
be completed in a similar pattern. The second thread waits for
128 events to complete.
Tested with null_blk with 5us delay
and 3.8GHz Intel CPU.
throughput before: 270 KIOPS
throughput after: 370 KIOPS
So, ~40% throughput boost on this exaggerated test.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
fs/io_uring.c | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 37395208a729..17d2d30b763a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -70,6 +70,7 @@
#include <linux/nospec.h>
#include <linux/sizes.h>
#include <linux/hugetlb.h>
+#include <linux/wait_threshold.h>
#include <uapi/linux/io_uring.h>
@@ -403,6 +404,13 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
return ctx;
}
+static unsigned int io_cqring_events(struct io_rings *rings)
+{
+ /* See comment at the top of this file */
+ smp_rmb();
+ return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
+}
+
static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
@@ -521,7 +529,7 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
+ wake_up_threshold(&ctx->wait, io_cqring_events(ctx->rings));
if (waitqueue_active(&ctx->sqo_wait))
wake_up(&ctx->sqo_wait);
if (ctx->cq_ev_fd)
@@ -546,7 +554,7 @@ static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
percpu_ref_put_many(&ctx->refs, refs);
if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
+ wake_up_threshold(&ctx->wait, io_cqring_events(ctx->rings));
}
static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
@@ -681,12 +689,6 @@ static void io_put_req(struct io_kiocb *req)
io_free_req(req);
}
-static unsigned io_cqring_events(struct io_rings *rings)
-{
- /* See comment at the top of this file */
- smp_rmb();
- return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
-}
/*
* Find and free completed poll iocbs
@@ -2591,7 +2593,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret;
}
- ret = wait_event_interruptible(ctx->wait, io_cqring_events(rings) >= min_events);
+ ret = wait_threshold_interruptible(ctx->wait, min_events,
+ io_cqring_events(rings));
restore_saved_sigmask_unless(ret == -ERESTARTSYS);
if (ret == -ERESTARTSYS)
ret = -EINTR;
--
2.22.0
^ permalink raw reply related [flat|nested] 5+ messages in thread