* [PATCH RFC for-next 0/8] io_uring: tw contention improvments
@ 2022-06-20 16:18 Dylan Yudaken
  2022-06-20 16:18 ` [PATCH RFC for-next 1/8] io_uring: remove priority tw list optimisation Dylan Yudaken
                   ` (8 more replies)
  0 siblings, 9 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken

Task work currently uses a spin lock to guard task_list and
task_running. Some use cases such as networking can trigger task_work_add
from multiple threads all at once, which suffers from contention here.

This can be changed to use a lockless list, which seems to have better
performance. Running the micro benchmark in [1] I see a 20% improvement
in multithreaded task work add. It required removing the priority tw
list optimisation; however, it isn't clear how important that
optimisation is, and additionally its semantics are fairly easy to
break.
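
As a rough illustration of the idea (a userspace sketch only, with
made-up names; it is neither the kernel code nor the benchmark in [1]),
the add path boils down to a single compare-and-swap push, and the
caller whose push finds the list empty takes over the job that the
task_running flag used to do under the spinlock:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct tw_node {
	struct tw_node *next;
	int id;
};

static _Atomic(struct tw_node *) task_list;	/* plays the role of llist_head */

/* Push a node; returns true if the list was empty beforehand (like llist_add). */
static bool tw_add(struct tw_node *n)
{
	struct tw_node *first = atomic_load(&task_list);

	do {
		n->next = first;
	} while (!atomic_compare_exchange_weak(&task_list, &first, n));

	return first == NULL;
}

int main(void)
{
	struct tw_node a = { .id = 1 }, b = { .id = 2 };

	if (tw_add(&a))
		puts("list was empty: this caller would call task_work_add()");
	if (!tw_add(&b))
		puts("work already pending: nothing extra to schedule");
	return 0;
}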

Patches 1-2 remove the priority tw list optimisation
Patches 3-5 add lockless lists for task work
Patch 6 fixes a bug I noticed in io_uring event tracing
Patches 7-8 add tracing for task_work_run

Dylan Yudaken (8):
  io_uring: remove priority tw list optimisation
  io_uring: remove __io_req_task_work_add
  io_uring: lockless task list
  io_uring: introduce llist helpers
  io_uring: batch task_work
  io_uring: move io_uring_get_opcode out of TP_printk
  io_uring: add trace event for running task work
  io_uring: trace task_work_run

 include/linux/io_uring_types.h  |   2 +-
 include/trace/events/io_uring.h |  72 +++++++++++++--
 io_uring/io_uring.c             | 150 ++++++++++++--------------------
 io_uring/io_uring.h             |   1 -
 io_uring/rw.c                   |   2 +-
 io_uring/tctx.c                 |   4 +-
 io_uring/tctx.h                 |   7 +-
 7 files changed, 127 insertions(+), 111 deletions(-)


base-commit: 094abe8fbccb0d79bef982c67eb7372e92452c0e
-- 
2.30.2


* [PATCH RFC for-next 1/8] io_uring: remove priority tw list optimisation
  2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
@ 2022-06-20 16:18 ` Dylan Yudaken
  2022-06-20 16:18 ` [PATCH RFC for-next 2/8] io_uring: remove __io_req_task_work_add Dylan Yudaken
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken

This optimisation has some built-in assumptions that make it easy to
introduce bugs. It also does not have clear wins that make it worth
keeping.

Signed-off-by: Dylan Yudaken <dylany@fb.com>
---
 io_uring/io_uring.c | 77 +++++++--------------------------------------
 io_uring/io_uring.h |  1 -
 io_uring/rw.c       |  2 +-
 io_uring/tctx.c     |  1 -
 io_uring/tctx.h     |  1 -
 5 files changed, 12 insertions(+), 70 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index afda42246d12..cc524d33748d 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -986,44 +986,6 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
 	percpu_ref_put(&ctx->refs);
 }
 
-static void handle_prev_tw_list(struct io_wq_work_node *node,
-				struct io_ring_ctx **ctx, bool *uring_locked)
-{
-	if (*ctx && !*uring_locked)
-		spin_lock(&(*ctx)->completion_lock);
-
-	do {
-		struct io_wq_work_node *next = node->next;
-		struct io_kiocb *req = container_of(node, struct io_kiocb,
-						    io_task_work.node);
-
-		prefetch(container_of(next, struct io_kiocb, io_task_work.node));
-
-		if (req->ctx != *ctx) {
-			if (unlikely(!*uring_locked && *ctx))
-				io_cq_unlock_post(*ctx);
-
-			ctx_flush_and_put(*ctx, uring_locked);
-			*ctx = req->ctx;
-			/* if not contended, grab and improve batching */
-			*uring_locked = mutex_trylock(&(*ctx)->uring_lock);
-			percpu_ref_get(&(*ctx)->refs);
-			if (unlikely(!*uring_locked))
-				io_cq_lock(*ctx);
-		}
-		if (likely(*uring_locked)) {
-			req->io_task_work.func(req, uring_locked);
-		} else {
-			req->cqe.flags = io_put_kbuf_comp(req);
-			__io_req_complete_post(req);
-		}
-		node = next;
-	} while (node);
-
-	if (unlikely(!*uring_locked))
-		io_cq_unlock_post(*ctx);
-}
-
 static void handle_tw_list(struct io_wq_work_node *node,
 			   struct io_ring_ctx **ctx, bool *locked)
 {
@@ -1054,27 +1016,20 @@ void tctx_task_work(struct callback_head *cb)
 						  task_work);
 
 	while (1) {
-		struct io_wq_work_node *node1, *node2;
+		struct io_wq_work_node *node;
 
 		spin_lock_irq(&tctx->task_lock);
-		node1 = tctx->prio_task_list.first;
-		node2 = tctx->task_list.first;
+		node = tctx->task_list.first;
 		INIT_WQ_LIST(&tctx->task_list);
-		INIT_WQ_LIST(&tctx->prio_task_list);
-		if (!node2 && !node1)
+		if (!node)
 			tctx->task_running = false;
 		spin_unlock_irq(&tctx->task_lock);
-		if (!node2 && !node1)
+		if (!node)
 			break;
-
-		if (node1)
-			handle_prev_tw_list(node1, &ctx, &uring_locked);
-		if (node2)
-			handle_tw_list(node2, &ctx, &uring_locked);
+		handle_tw_list(node, &ctx, &uring_locked);
 		cond_resched();
 
-		if (data_race(!tctx->task_list.first) &&
-		    data_race(!tctx->prio_task_list.first) && uring_locked)
+		if (data_race(!tctx->task_list.first) && uring_locked)
 			io_submit_flush_completions(ctx);
 	}
 
@@ -1086,8 +1041,7 @@ void tctx_task_work(struct callback_head *cb)
 }
 
 static void __io_req_task_work_add(struct io_kiocb *req,
-				   struct io_uring_task *tctx,
-				   struct io_wq_work_list *list)
+				   struct io_uring_task *tctx)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_wq_work_node *node;
@@ -1095,7 +1049,7 @@ static void __io_req_task_work_add(struct io_kiocb *req,
 	bool running;
 
 	spin_lock_irqsave(&tctx->task_lock, flags);
-	wq_list_add_tail(&req->io_task_work.node, list);
+	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
 	running = tctx->task_running;
 	if (!running)
 		tctx->task_running = true;
@@ -1113,7 +1067,8 @@ static void __io_req_task_work_add(struct io_kiocb *req,
 
 	spin_lock_irqsave(&tctx->task_lock, flags);
 	tctx->task_running = false;
-	node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list);
+	node = tctx->task_list.first;
+	INIT_WQ_LIST(&tctx->task_list);
 	spin_unlock_irqrestore(&tctx->task_lock, flags);
 
 	while (node) {
@@ -1129,17 +1084,7 @@ void io_req_task_work_add(struct io_kiocb *req)
 {
 	struct io_uring_task *tctx = req->task->io_uring;
 
-	__io_req_task_work_add(req, tctx, &tctx->task_list);
-}
-
-void io_req_task_prio_work_add(struct io_kiocb *req)
-{
-	struct io_uring_task *tctx = req->task->io_uring;
-
-	if (req->ctx->flags & IORING_SETUP_SQPOLL)
-		__io_req_task_work_add(req, tctx, &tctx->prio_task_list);
-	else
-		__io_req_task_work_add(req, tctx, &tctx->task_list);
+	__io_req_task_work_add(req, tctx);
 }
 
 static void io_req_tw_post(struct io_kiocb *req, bool *locked)
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 7a00bbe85d35..864e552c76a5 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -35,7 +35,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 bool io_is_uring_fops(struct file *file);
 bool io_alloc_async_data(struct io_kiocb *req);
 void io_req_task_work_add(struct io_kiocb *req);
-void io_req_task_prio_work_add(struct io_kiocb *req);
 void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
 void io_req_task_queue(struct io_kiocb *req);
 void io_queue_iowq(struct io_kiocb *req, bool *dont_use);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 9166d8166b82..caf72f50c341 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -215,7 +215,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
 		return;
 	io_req_set_res(req, res, 0);
 	req->io_task_work.func = io_req_task_complete;
-	io_req_task_prio_work_add(req);
+	io_req_task_work_add(req);
 }
 
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 9b30fb0d3603..7a68ba9beec3 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -88,7 +88,6 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 	task->io_uring = tctx;
 	spin_lock_init(&tctx->task_lock);
 	INIT_WQ_LIST(&tctx->task_list);
-	INIT_WQ_LIST(&tctx->prio_task_list);
 	init_task_work(&tctx->task_work, tctx_task_work);
 	return 0;
 }
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index dead0ed00429..c8566ea5dca4 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -22,7 +22,6 @@ struct io_uring_task {
 		spinlock_t		task_lock;
 		bool			task_running;
 		struct io_wq_work_list	task_list;
-		struct io_wq_work_list	prio_task_list;
 		struct callback_head	task_work;
 	} ____cacheline_aligned_in_smp;
 };
-- 
2.30.2


* [PATCH RFC for-next 2/8] io_uring: remove __io_req_task_work_add
  2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
  2022-06-20 16:18 ` [PATCH RFC for-next 1/8] io_uring: remove priority tw list optimisation Dylan Yudaken
@ 2022-06-20 16:18 ` Dylan Yudaken
  2022-06-20 16:18 ` [PATCH RFC for-next 3/8] io_uring: lockless task list Dylan Yudaken
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken

This is no longer needed, as there is only one caller.

Signed-off-by: Dylan Yudaken <dylany@fb.com>
---
 io_uring/io_uring.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cc524d33748d..e1523b62103b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1040,9 +1040,9 @@ void tctx_task_work(struct callback_head *cb)
 		io_uring_drop_tctx_refs(current);
 }
 
-static void __io_req_task_work_add(struct io_kiocb *req,
-				   struct io_uring_task *tctx)
+void io_req_task_work_add(struct io_kiocb *req)
 {
+	struct io_uring_task *tctx = req->task->io_uring;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_wq_work_node *node;
 	unsigned long flags;
@@ -1080,13 +1080,6 @@ static void __io_req_task_work_add(struct io_kiocb *req,
 	}
 }
 
-void io_req_task_work_add(struct io_kiocb *req)
-{
-	struct io_uring_task *tctx = req->task->io_uring;
-
-	__io_req_task_work_add(req, tctx);
-}
-
 static void io_req_tw_post(struct io_kiocb *req, bool *locked)
 {
 	io_req_complete_post(req);
-- 
2.30.2


* [PATCH RFC for-next 3/8] io_uring: lockless task list
  2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
  2022-06-20 16:18 ` [PATCH RFC for-next 1/8] io_uring: remove priority tw list optimisation Dylan Yudaken
  2022-06-20 16:18 ` [PATCH RFC for-next 2/8] io_uring: remove __io_req_task_work_add Dylan Yudaken
@ 2022-06-20 16:18 ` Dylan Yudaken
  2022-06-20 16:18 ` [PATCH RFC for-next 4/8] io_uring: introduce llist helpers Dylan Yudaken
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken

With networking use cases we see contention on the spinlock used to
protect the task_list when multiple threads try to add completions at
once. Instead we can use a lockless list, and assume that the first
caller to add to the list is responsible for kicking off task work.

Signed-off-by: Dylan Yudaken <dylany@fb.com>
---
 include/linux/io_uring_types.h |  2 +-
 io_uring/io_uring.c            | 38 ++++++++--------------------------
 io_uring/tctx.c                |  3 +--
 io_uring/tctx.h                |  6 +++---
 4 files changed, 14 insertions(+), 35 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 5987f8acca38..918165a20053 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -428,7 +428,7 @@ typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
 
 struct io_task_work {
 	union {
-		struct io_wq_work_node	node;
+		struct llist_node	node;
 		struct llist_node	fallback_node;
 	};
 	io_req_tw_func_t		func;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e1523b62103b..985b46dfebb6 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -986,11 +986,12 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
 	percpu_ref_put(&ctx->refs);
 }
 
-static void handle_tw_list(struct io_wq_work_node *node,
+
+static void handle_tw_list(struct llist_node *node,
 			   struct io_ring_ctx **ctx, bool *locked)
 {
 	do {
-		struct io_wq_work_node *next = node->next;
+		struct llist_node *next = node->next;
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    io_task_work.node);
 
@@ -1014,23 +1015,11 @@ void tctx_task_work(struct callback_head *cb)
 	struct io_ring_ctx *ctx = NULL;
 	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
 						  task_work);
+	struct llist_node *node = llist_del_all(&tctx->task_list);
 
-	while (1) {
-		struct io_wq_work_node *node;
-
-		spin_lock_irq(&tctx->task_lock);
-		node = tctx->task_list.first;
-		INIT_WQ_LIST(&tctx->task_list);
-		if (!node)
-			tctx->task_running = false;
-		spin_unlock_irq(&tctx->task_lock);
-		if (!node)
-			break;
+	if (node) {
 		handle_tw_list(node, &ctx, &uring_locked);
 		cond_resched();
-
-		if (data_race(!tctx->task_list.first) && uring_locked)
-			io_submit_flush_completions(ctx);
 	}
 
 	ctx_flush_and_put(ctx, &uring_locked);
@@ -1044,16 +1033,10 @@ void io_req_task_work_add(struct io_kiocb *req)
 {
 	struct io_uring_task *tctx = req->task->io_uring;
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_wq_work_node *node;
-	unsigned long flags;
+	struct llist_node *node;
 	bool running;
 
-	spin_lock_irqsave(&tctx->task_lock, flags);
-	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
-	running = tctx->task_running;
-	if (!running)
-		tctx->task_running = true;
-	spin_unlock_irqrestore(&tctx->task_lock, flags);
+	running = !llist_add(&req->io_task_work.node, &tctx->task_list);
 
 	/* task_work already pending, we're done */
 	if (running)
@@ -1065,11 +1048,8 @@ void io_req_task_work_add(struct io_kiocb *req)
 	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
 		return;
 
-	spin_lock_irqsave(&tctx->task_lock, flags);
-	tctx->task_running = false;
-	node = tctx->task_list.first;
-	INIT_WQ_LIST(&tctx->task_list);
-	spin_unlock_irqrestore(&tctx->task_lock, flags);
+
+	node = llist_del_all(&tctx->task_list);
 
 	while (node) {
 		req = container_of(node, struct io_kiocb, io_task_work.node);
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 7a68ba9beec3..7f97d97fef0a 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -86,8 +86,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 	atomic_set(&tctx->in_idle, 0);
 	atomic_set(&tctx->inflight_tracked, 0);
 	task->io_uring = tctx;
-	spin_lock_init(&tctx->task_lock);
-	INIT_WQ_LIST(&tctx->task_list);
+	init_llist_head(&tctx->task_list);
 	init_task_work(&tctx->task_work, tctx_task_work);
 	return 0;
 }
diff --git a/io_uring/tctx.h b/io_uring/tctx.h
index c8566ea5dca4..8a33ff6e5d91 100644
--- a/io_uring/tctx.h
+++ b/io_uring/tctx.h
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <linux/llist.h>
+
 /*
  * Arbitrary limit, can be raised if need be
  */
@@ -19,9 +21,7 @@ struct io_uring_task {
 	struct percpu_counter		inflight;
 
 	struct { /* task_work */
-		spinlock_t		task_lock;
-		bool			task_running;
-		struct io_wq_work_list	task_list;
+		struct llist_head	task_list;
 		struct callback_head	task_work;
 	} ____cacheline_aligned_in_smp;
 };
-- 
2.30.2


* [PATCH RFC for-next 4/8] io_uring: introduce llist helpers
  2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
                   ` (2 preceding siblings ...)
  2022-06-20 16:18 ` [PATCH RFC for-next 3/8] io_uring: lockless task list Dylan Yudaken
@ 2022-06-20 16:18 ` Dylan Yudaken
  2022-06-20 16:18 ` [PATCH RFC for-next 5/8] io_uring: batch task_work Dylan Yudaken
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken

Introduce helpers to atomically switch an llist.

These will later be moved into common code.
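
In C11-atomics terms the two helpers are roughly the following
operations on the list head (a userspace sketch for illustration only;
the kernel versions below use xchg()/cmpxchg() on head->first):

#include <stdatomic.h>
#include <stddef.h>

struct lnode { struct lnode *next; };
struct lhead { _Atomic(struct lnode *) first; };

/* io_llist_xchg(): unconditionally install @node, return the old first entry. */
static struct lnode *model_xchg(struct lhead *head, struct lnode *node)
{
	return atomic_exchange(&head->first, node);
}

/* io_llist_cmpxchg(): install @new only if the first entry is still @old. */
static struct lnode *model_cmpxchg(struct lhead *head, struct lnode *old,
				   struct lnode *new)
{
	atomic_compare_exchange_strong(&head->first, &old, new);
	return old;	/* like cmpxchg(): returns the value actually found */
}

int main(void)
{
	struct lhead head = { .first = NULL };
	struct lnode n = { .next = NULL };

	model_xchg(&head, &n);		/* head now points at n */
	model_cmpxchg(&head, &n, NULL);	/* succeeds: head is cleared again */
	return 0;
}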

Signed-off-by: Dylan Yudaken <dylany@fb.com>
---
 io_uring/io_uring.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 985b46dfebb6..eb29e3f7da5c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1009,6 +1009,36 @@ static void handle_tw_list(struct llist_node *node,
 	} while (node);
 }
 
+/**
+ * io_llist_xchg - swap all entries in a lock-less list
+ * @head:	the head of lock-less list to delete all entries
+ * @node:	new entry as the head of the list
+ *
+ * If list is empty, return NULL, otherwise, return the pointer to the first entry.
+ * The order of entries returned is from the newest to the oldest added one.
+ */
+static inline struct llist_node *io_llist_xchg(struct llist_head *head,
+					       struct llist_node *node)
+{
+	return xchg(&head->first, node);
+}
+
+/**
+ * io_llist_cmpxchg - possibly swap all entries in a lock-less list
+ * @head:	the head of lock-less list to delete all entries
+ * @old:	expected old value of the first entry of the list
+ * @new:	new entry as the head of the list
+ *
+ * perform a cmpxchg on the first entry of the list.
+ */
+
+static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head,
+						  struct llist_node *old,
+						  struct llist_node *new)
+{
+	return cmpxchg(&head->first, old, new);
+}
+
 void tctx_task_work(struct callback_head *cb)
 {
 	bool uring_locked = false;
-- 
2.30.2


* [PATCH RFC for-next 5/8] io_uring: batch task_work
  2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
                   ` (3 preceding siblings ...)
  2022-06-20 16:18 ` [PATCH RFC for-next 4/8] io_uring: introduce llist helpers Dylan Yudaken
@ 2022-06-20 16:18 ` Dylan Yudaken
  2022-06-20 16:18 ` [PATCH RFC for-next 6/8] io_uring: move io_uring_get_opcode out of TP_printk Dylan Yudaken
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken

Batching up task work is an important performance optimisation, as
task_work_add is expensive.

In order to keep the semantics, replace the task_list with a fake node
while processing the old list, and then do a cmpxchg at the end to see
if there is more work.
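
A rough userspace sketch of why the fake node preserves the "task_work
already pending" semantics (the names below are hypothetical and the
real loop is tctx_task_work() in the diff): the sentinel keeps the list
non-empty while the batch runs, so concurrent adders do not re-arm task
work, and the final cmpxchg only clears the list if nothing new was
pushed on top of the sentinel. Note the batch is processed newest-first,
since the list is effectively a stack.

#include <stdatomic.h>
#include <stdio.h>

struct node {
	struct node *next;
	int id;
};

static _Atomic(struct node *) head;		/* the lockless task_list */

static void run_one(struct node *n)
{
	printf("running tw %d\n", n->id);	/* req->io_task_work.func() */
}

/* Walk a detached batch, stopping at the sentinel if it is chained in. */
static void handle_list(struct node *n, struct node *fake)
{
	while (n && n != fake) {
		struct node *next = n->next;

		run_one(n);
		n = next;
	}
}

static void tw_run(void)
{
	struct node fake = { 0 };
	/* Swap in the sentinel: concurrent adders still see a non-empty list. */
	struct node *n = atomic_exchange(&head, &fake);
	struct node *expected = &fake;

	handle_list(n, &fake);
	/* Clear the list only if nothing was pushed on top of the sentinel. */
	while (!atomic_compare_exchange_strong(&head, &expected, NULL)) {
		n = atomic_exchange(&head, &fake);
		handle_list(n, &fake);
		expected = &fake;
	}
}

int main(void)
{
	static struct node a = { .id = 1 };
	static struct node b = { .next = &a, .id = 2 };

	atomic_store(&head, &b);	/* b was queued after a */
	tw_run();			/* runs 2 then 1: newest first */
	return 0;
}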

Signed-off-by: Dylan Yudaken <dylany@fb.com>
---
 io_uring/io_uring.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index eb29e3f7da5c..0b4d3eb06d16 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -988,9 +988,10 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
 
 
 static void handle_tw_list(struct llist_node *node,
-			   struct io_ring_ctx **ctx, bool *locked)
+			   struct io_ring_ctx **ctx, bool *locked,
+			   struct llist_node *fake)
 {
-	do {
+	while (node && node != fake) {
 		struct llist_node *next = node->next;
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    io_task_work.node);
@@ -1006,7 +1007,10 @@ static void handle_tw_list(struct llist_node *node,
 		}
 		req->io_task_work.func(req, locked);
 		node = next;
-	} while (node);
+		count++;
+	}
+
+	return count;
 }
 
 /**
@@ -1045,11 +1049,15 @@ void tctx_task_work(struct callback_head *cb)
 	struct io_ring_ctx *ctx = NULL;
 	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
 						  task_work);
-	struct llist_node *node = llist_del_all(&tctx->task_list);
-
-	if (node) {
-		handle_tw_list(node, &ctx, &uring_locked);
-		cond_resched();
+	struct llist_node fake = {};
+	struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake);
+
+	handle_tw_list(node, &ctx, &uring_locked, &fake);
+	node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
+	while (node != &fake) {
+		node = io_llist_xchg(&tctx->task_list, &fake);
+		handle_tw_list(node, &ctx, &uring_locked, &fake);
+		node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
 	}
 
 	ctx_flush_and_put(ctx, &uring_locked);
-- 
2.30.2


* [PATCH RFC for-next 6/8] io_uring: move io_uring_get_opcode out of TP_printk
  2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
                   ` (4 preceding siblings ...)
  2022-06-20 16:18 ` [PATCH RFC for-next 5/8] io_uring: batch task_work Dylan Yudaken
@ 2022-06-20 16:18 ` Dylan Yudaken
  2022-06-20 16:19 ` [PATCH RFC for-next 7/8] io_uring: add trace event for running task work Dylan Yudaken
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:18 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken

The TP_printk macros are not supposed to use custom code ([1]) or else
tools such as perf cannot use these events.

Convert the opcode string representation to use the __string wiring that
the event framework provides ([2]).

[1]: https://lwn.net/Articles/379903/
[2]: https://lwn.net/Articles/381064/

Fixes: 033b87d2 ("io_uring: use the text representation of ops in trace")
Signed-off-by: Dylan Yudaken <dylany@fb.com>
---
 include/trace/events/io_uring.h | 42 +++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 5635912e1013..3bc8dec9acaa 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -151,6 +151,8 @@ TRACE_EVENT(io_uring_queue_async_work,
 		__field(  unsigned int,			flags		)
 		__field(  struct io_wq_work *,		work		)
 		__field(  int,				rw		)
+
+		__string( op_str, io_uring_get_opcode(req->opcode)	)
 	),
 
 	TP_fast_assign(
@@ -161,11 +163,13 @@ TRACE_EVENT(io_uring_queue_async_work,
 		__entry->opcode		= req->opcode;
 		__entry->work		= &req->work;
 		__entry->rw		= rw;
+
+		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p",
 		__entry->ctx, __entry->req, __entry->user_data,
-		io_uring_get_opcode(__entry->opcode),
+		__get_str(op_str),
 		__entry->flags, __entry->rw ? "hashed" : "normal", __entry->work)
 );
 
@@ -188,6 +192,8 @@ TRACE_EVENT(io_uring_defer,
 		__field(  void *,		req	)
 		__field(  unsigned long long,	data	)
 		__field(  u8,			opcode	)
+
+		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
 
 	TP_fast_assign(
@@ -195,11 +201,13 @@ TRACE_EVENT(io_uring_defer,
 		__entry->req	= req;
 		__entry->data	= req->cqe.user_data;
 		__entry->opcode	= req->opcode;
+
+		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s",
 		__entry->ctx, __entry->req, __entry->data,
-		io_uring_get_opcode(__entry->opcode))
+		__get_str(op_str))
 );
 
 /**
@@ -284,6 +292,8 @@ TRACE_EVENT(io_uring_fail_link,
 		__field(  unsigned long long,	user_data	)
 		__field(  u8,			opcode		)
 		__field(  void *,		link		)
+
+		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
 
 	TP_fast_assign(
@@ -292,11 +302,13 @@ TRACE_EVENT(io_uring_fail_link,
 		__entry->user_data	= req->cqe.user_data;
 		__entry->opcode		= req->opcode;
 		__entry->link		= link;
+
+		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p",
 		__entry->ctx, __entry->req, __entry->user_data,
-		io_uring_get_opcode(__entry->opcode), __entry->link)
+		__get_str(op_str), __entry->link)
 );
 
 /**
@@ -370,6 +382,8 @@ TRACE_EVENT(io_uring_submit_sqe,
 		__field(  u32,			flags		)
 		__field(  bool,			force_nonblock	)
 		__field(  bool,			sq_thread	)
+
+		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
 
 	TP_fast_assign(
@@ -380,11 +394,13 @@ TRACE_EVENT(io_uring_submit_sqe,
 		__entry->flags		= req->flags;
 		__entry->force_nonblock	= force_nonblock;
 		__entry->sq_thread	= req->ctx->flags & IORING_SETUP_SQPOLL;
+
+		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
 		  "non block %d, sq_thread %d", __entry->ctx, __entry->req,
-		  __entry->user_data, io_uring_get_opcode(__entry->opcode),
+		  __entry->user_data, __get_str(op_str),
 		  __entry->flags, __entry->force_nonblock, __entry->sq_thread)
 );
 
@@ -411,6 +427,8 @@ TRACE_EVENT(io_uring_poll_arm,
 		__field(  u8,			opcode		)
 		__field(  int,			mask		)
 		__field(  int,			events		)
+
+		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
 
 	TP_fast_assign(
@@ -420,11 +438,13 @@ TRACE_EVENT(io_uring_poll_arm,
 		__entry->opcode		= req->opcode;
 		__entry->mask		= mask;
 		__entry->events		= events;
+
+		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x",
 		  __entry->ctx, __entry->req, __entry->user_data,
-		  io_uring_get_opcode(__entry->opcode),
+		  __get_str(op_str),
 		  __entry->mask, __entry->events)
 );
 
@@ -447,6 +467,8 @@ TRACE_EVENT(io_uring_task_add,
 		__field(  unsigned long long,	user_data	)
 		__field(  u8,			opcode		)
 		__field(  int,			mask		)
+
+		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
 
 	TP_fast_assign(
@@ -455,11 +477,13 @@ TRACE_EVENT(io_uring_task_add,
 		__entry->user_data	= req->cqe.user_data;
 		__entry->opcode		= req->opcode;
 		__entry->mask		= mask;
+
+		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x",
 		__entry->ctx, __entry->req, __entry->user_data,
-		io_uring_get_opcode(__entry->opcode),
+		__get_str(op_str),
 		__entry->mask)
 );
 
@@ -495,6 +519,8 @@ TRACE_EVENT(io_uring_req_failed,
 		__field( u64,			pad1		)
 		__field( u64,			addr3		)
 		__field( int,			error		)
+
+		__string( op_str, io_uring_get_opcode(sqe->opcode) )
 	),
 
 	TP_fast_assign(
@@ -514,6 +540,8 @@ TRACE_EVENT(io_uring_req_failed,
 		__entry->pad1		= sqe->__pad2[0];
 		__entry->addr3		= sqe->addr3;
 		__entry->error		= error;
+
+		__assign_str(op_str, io_uring_get_opcode(sqe->opcode));
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, "
@@ -522,7 +550,7 @@ TRACE_EVENT(io_uring_req_failed,
 		  "personality=%d, file_index=%d, pad=0x%llx, addr3=%llx, "
 		  "error=%d",
 		  __entry->ctx, __entry->req, __entry->user_data,
-		  io_uring_get_opcode(__entry->opcode),
+		  __get_str(op_str),
 		  __entry->flags, __entry->ioprio,
 		  (unsigned long long)__entry->off,
 		  (unsigned long long) __entry->addr, __entry->len,
-- 
2.30.2


* [PATCH RFC for-next 7/8] io_uring: add trace event for running task work
  2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
                   ` (5 preceding siblings ...)
  2022-06-20 16:18 ` [PATCH RFC for-next 6/8] io_uring: move io_uring_get_opcode out of TP_printk Dylan Yudaken
@ 2022-06-20 16:19 ` Dylan Yudaken
  2022-06-20 16:19 ` [PATCH RFC for-next 8/8] io_uring: trace task_work_run Dylan Yudaken
  2022-06-21  5:10 ` [PATCH RFC for-next 0/8] io_uring: tw contention improvments Hao Xu
  8 siblings, 0 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:19 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken

This is useful for investigating whether task_work is batching.

Signed-off-by: Dylan Yudaken <dylany@fb.com>
---
 include/trace/events/io_uring.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 3bc8dec9acaa..918e3a43e4b2 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -600,6 +600,36 @@ TRACE_EVENT(io_uring_cqe_overflow,
 		  __entry->cflags, __entry->ocqe)
 );
 
+/*
+ * io_uring_task_work_run - ran task work
+ *
+ * @tctx:		pointer to a io_uring_task
+ * @count:		how many functions it ran
+ * @loops:		how many loops it ran
+ *
+ */
+TRACE_EVENT(io_uring_task_work_run,
+
+	TP_PROTO(void *tctx, unsigned int count, unsigned int loops),
+
+	TP_ARGS(tctx, count, loops),
+
+	TP_STRUCT__entry (
+		__field(  void *,		tctx		)
+		__field(  unsigned int,		count		)
+		__field(  unsigned int,		loops		)
+	),
+
+	TP_fast_assign(
+		__entry->tctx		= tctx;
+		__entry->count		= count;
+		__entry->loops		= loops;
+	),
+
+	TP_printk("tctx %p, count %u, loops %u",
+		 __entry->tctx, __entry->count, __entry->loops)
+);
+
 #endif /* _TRACE_IO_URING_H */
 
 /* This part must be outside protection */
-- 
2.30.2


* [PATCH RFC for-next 8/8] io_uring: trace task_work_run
  2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
                   ` (6 preceding siblings ...)
  2022-06-20 16:19 ` [PATCH RFC for-next 7/8] io_uring: add trace event for running task work Dylan Yudaken
@ 2022-06-20 16:19 ` Dylan Yudaken
  2022-06-21  5:10 ` [PATCH RFC for-next 0/8] io_uring: tw contention improvments Hao Xu
  8 siblings, 0 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-20 16:19 UTC (permalink / raw)
  To: axboe, asml.silence, io-uring; +Cc: Kernel-team, Dylan Yudaken

Trace task_work_run to help provide stats on how often task work is run
and what batch sizes are coming through.

Signed-off-by: Dylan Yudaken <dylany@fb.com>
---
 io_uring/io_uring.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 0b4d3eb06d16..8cf868315008 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -987,10 +987,12 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
 }
 
 
-static void handle_tw_list(struct llist_node *node,
-			   struct io_ring_ctx **ctx, bool *locked,
-			   struct llist_node *fake)
+static unsigned int handle_tw_list(struct llist_node *node,
+				   struct io_ring_ctx **ctx, bool *locked,
+				   struct llist_node *fake)
 {
+	unsigned int count = 0;
+
 	while (node && node != fake) {
 		struct llist_node *next = node->next;
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
@@ -1051,12 +1053,14 @@ void tctx_task_work(struct callback_head *cb)
 						  task_work);
 	struct llist_node fake = {};
 	struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake);
+	unsigned int loops = 1;
+	unsigned int count = handle_tw_list(node, &ctx, &uring_locked, &fake);
 
-	handle_tw_list(node, &ctx, &uring_locked, &fake);
 	node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
 	while (node != &fake) {
+		loops++;
 		node = io_llist_xchg(&tctx->task_list, &fake);
-		handle_tw_list(node, &ctx, &uring_locked, &fake);
+		count += handle_tw_list(node, &ctx, &uring_locked, &fake);
 		node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
 	}
 
@@ -1065,6 +1069,8 @@ void tctx_task_work(struct callback_head *cb)
 	/* relaxed read is enough as only the task itself sets ->in_idle */
 	if (unlikely(atomic_read(&tctx->in_idle)))
 		io_uring_drop_tctx_refs(current);
+
+	trace_io_uring_task_work_run(tctx, count, loops);
 }
 
 void io_req_task_work_add(struct io_kiocb *req)
-- 
2.30.2


* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
  2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
                   ` (7 preceding siblings ...)
  2022-06-20 16:19 ` [PATCH RFC for-next 8/8] io_uring: trace task_work_run Dylan Yudaken
@ 2022-06-21  5:10 ` Hao Xu
  2022-06-21  7:03   ` Dylan Yudaken
  8 siblings, 1 reply; 20+ messages in thread
From: Hao Xu @ 2022-06-21  5:10 UTC (permalink / raw)
  To: Dylan Yudaken, axboe, asml.silence, io-uring; +Cc: Kernel-team

On 6/21/22 00:18, Dylan Yudaken wrote:
> Task work currently uses a spin lock to guard task_list and
> task_running. Some use cases such as networking can trigger task_work_add
> from multiple threads all at once, which suffers from contention here.
> 
> This can be changed to use a lockless list which seems to have better
> performance. Running the micro benchmark in [1] I see 20% improvment in
> multithreaded task work add. It required removing the priority tw list
> optimisation, however it isn't clear how important that optimisation is.
> Additionally it has fairly easy to break semantics.
> 
> Patch 1-2 remove the priority tw list optimisation
> Patch 3-5 add lockless lists for task work
> Patch 6 fixes a bug I noticed in io_uring event tracing
> Patch 7-8 adds tracing for task_work_run
> 

Compared to the spinlock overhead, the prio task list optimization is
definitely unimportant, so I agree with removing it here.
Replacing the task list with an llist was something I considered, but I
gave it up since it changes the list to a stack, which means we have to
handle the tasks in reverse order. This may affect latency; do you have
some numbers for it, like avg and 99%/95% latencies?


* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
  2022-06-21  5:10 ` [PATCH RFC for-next 0/8] io_uring: tw contention improvments Hao Xu
@ 2022-06-21  7:03   ` Dylan Yudaken
  2022-06-21  7:34     ` Hao Xu
  2022-06-21  7:38     ` Hao Xu
  0 siblings, 2 replies; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-21  7:03 UTC (permalink / raw)
  To: axboe, hao.xu, asml.silence, io-uring; +Cc: Kernel Team

On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
> On 6/21/22 00:18, Dylan Yudaken wrote:
> > Task work currently uses a spin lock to guard task_list and
> > task_running. Some use cases such as networking can trigger
> > task_work_add
> > from multiple threads all at once, which suffers from contention
> > here.
> > 
> > This can be changed to use a lockless list which seems to have
> > better
> > performance. Running the micro benchmark in [1] I see 20%
> > improvment in
> > multithreaded task work add. It required removing the priority tw
> > list
> > optimisation, however it isn't clear how important that
> > optimisation is.
> > Additionally it has fairly easy to break semantics.
> > 
> > Patch 1-2 remove the priority tw list optimisation
> > Patch 3-5 add lockless lists for task work
> > Patch 6 fixes a bug I noticed in io_uring event tracing
> > Patch 7-8 adds tracing for task_work_run
> > 
> 
> Compared to the spinlock overhead, the prio task list optimization is
> definitely unimportant, so I agree with removing it here.
> Replace the task list with llisy was something I considered but I
> gave
> it up since it changes the list to a stack which means we have to
> handle
> the tasks in a reverse order. This may affect the latency, do you
> have
> some numbers for it, like avg and 99% 95% lat?
> 

Do you have an idea for how to test that? I used a microbenchmark as
well as a network benchmark [1] to verify that overall throughput is
higher. TW latency sounds a lot more complicated to measure as it's
difficult to trigger accurately.

My feeling is that with reasonable batching (say 8-16 items) the
latency will be low as TW is generally very quick, but if you have an
idea for benchmarking I can take a look

[1]: https://github.com/DylanZA/netbench

* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
  2022-06-21  7:03   ` Dylan Yudaken
@ 2022-06-21  7:34     ` Hao Xu
  2022-06-22  9:31       ` Dylan Yudaken
  2022-06-21  7:38     ` Hao Xu
  1 sibling, 1 reply; 20+ messages in thread
From: Hao Xu @ 2022-06-21  7:34 UTC (permalink / raw)
  To: Dylan Yudaken, axboe, asml.silence, io-uring; +Cc: Kernel Team

On 6/21/22 15:03, Dylan Yudaken wrote:
> On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
>> On 6/21/22 00:18, Dylan Yudaken wrote:
>>> Task work currently uses a spin lock to guard task_list and
>>> task_running. Some use cases such as networking can trigger
>>> task_work_add
>>> from multiple threads all at once, which suffers from contention
>>> here.
>>>
>>> This can be changed to use a lockless list which seems to have
>>> better
>>> performance. Running the micro benchmark in [1] I see 20%
>>> improvment in
>>> multithreaded task work add. It required removing the priority tw
>>> list
>>> optimisation, however it isn't clear how important that
>>> optimisation is.
>>> Additionally it has fairly easy to break semantics.
>>>
>>> Patch 1-2 remove the priority tw list optimisation
>>> Patch 3-5 add lockless lists for task work
>>> Patch 6 fixes a bug I noticed in io_uring event tracing
>>> Patch 7-8 adds tracing for task_work_run
>>>
>>
>> Compared to the spinlock overhead, the prio task list optimization is
>> definitely unimportant, so I agree with removing it here.
>> Replace the task list with llisy was something I considered but I
>> gave
>> it up since it changes the list to a stack which means we have to
>> handle
>> the tasks in a reverse order. This may affect the latency, do you
>> have
>> some numbers for it, like avg and 99% 95% lat?
>>
> 
> Do you have an idea for how to test that? I used a microbenchmark as
> well as a network benchmark [1] to verify that overall throughput is
> higher. TW latency sounds a lot more complicated to measure as it's
> difficult to trigger accurately.
> 
> My feeling is that with reasonable batching (say 8-16 items) the
> latency will be low as TW is generally very quick, but if you have an
> idea for benchmarking I can take a look
> 
> [1]: https://github.com/DylanZA/netbench

It can be normal IO requests, I think. We can test the latency with fio
doing small-size IO to a fast block device (like an NVMe drive) in
SQPOLL mode (since for non-SQPOLL it doesn't make a difference). This
way we can see the influence of the reverse-order handling.

Regards,
Hao

* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
  2022-06-21  7:03   ` Dylan Yudaken
  2022-06-21  7:34     ` Hao Xu
@ 2022-06-21  7:38     ` Hao Xu
  1 sibling, 0 replies; 20+ messages in thread
From: Hao Xu @ 2022-06-21  7:38 UTC (permalink / raw)
  To: Dylan Yudaken, axboe, asml.silence, io-uring; +Cc: Kernel Team

On 6/21/22 15:03, Dylan Yudaken wrote:
> On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
>> On 6/21/22 00:18, Dylan Yudaken wrote:
>>> Task work currently uses a spin lock to guard task_list and
>>> task_running. Some use cases such as networking can trigger
>>> task_work_add
>>> from multiple threads all at once, which suffers from contention
>>> here.
>>>
>>> This can be changed to use a lockless list which seems to have
>>> better
>>> performance. Running the micro benchmark in [1] I see 20%
>>> improvment in
>>> multithreaded task work add. It required removing the priority tw
>>> list
>>> optimisation, however it isn't clear how important that
>>> optimisation is.
>>> Additionally it has fairly easy to break semantics.
>>>
>>> Patch 1-2 remove the priority tw list optimisation
>>> Patch 3-5 add lockless lists for task work
>>> Patch 6 fixes a bug I noticed in io_uring event tracing
>>> Patch 7-8 adds tracing for task_work_run
>>>
>>
>> Compared to the spinlock overhead, the prio task list optimization is
>> definitely unimportant, so I agree with removing it here.
>> Replace the task list with llisy was something I considered but I
>> gave
>> it up since it changes the list to a stack which means we have to
>> handle
>> the tasks in a reverse order. This may affect the latency, do you
>> have
>> some numbers for it, like avg and 99% 95% lat?
>>
> 
> Do you have an idea for how to test that? I used a microbenchmark as
> well as a network benchmark [1] to verify that overall throughput is
> higher. TW latency sounds a lot more complicated to measure as it's

I think measuring the end-to-end latency is good enough; there is no
need to get an accurate number for the TW queueing time.

> difficult to trigger accurately.
> 
> My feeling is that with reasonable batching (say 8-16 items) the
> latency will be low as TW is generally very quick, but if you have an
> idea for benchmarking I can take a look
> 
> [1]: https://github.com/DylanZA/netbench


* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
  2022-06-21  7:34     ` Hao Xu
@ 2022-06-22  9:31       ` Dylan Yudaken
  2022-06-22 11:16         ` Hao Xu
  0 siblings, 1 reply; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-22  9:31 UTC (permalink / raw)
  To: axboe, hao.xu, asml.silence, io-uring; +Cc: Kernel Team

On Tue, 2022-06-21 at 15:34 +0800, Hao Xu wrote:
> On 6/21/22 15:03, Dylan Yudaken wrote:
> > On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
> > > On 6/21/22 00:18, Dylan Yudaken wrote:
> > > > Task work currently uses a spin lock to guard task_list and
> > > > task_running. Some use cases such as networking can trigger
> > > > task_work_add
> > > > from multiple threads all at once, which suffers from
> > > > contention
> > > > here.
> > > > 
> > > > This can be changed to use a lockless list which seems to have
> > > > better
> > > > performance. Running the micro benchmark in [1] I see 20%
> > > > improvment in
> > > > multithreaded task work add. It required removing the priority
> > > > tw
> > > > list
> > > > optimisation, however it isn't clear how important that
> > > > optimisation is.
> > > > Additionally it has fairly easy to break semantics.
> > > > 
> > > > Patch 1-2 remove the priority tw list optimisation
> > > > Patch 3-5 add lockless lists for task work
> > > > Patch 6 fixes a bug I noticed in io_uring event tracing
> > > > Patch 7-8 adds tracing for task_work_run
> > > > 
> > > 
> > > Compared to the spinlock overhead, the prio task list
> > > optimization is
> > > definitely unimportant, so I agree with removing it here.
> > > Replace the task list with llisy was something I considered but I
> > > gave
> > > it up since it changes the list to a stack which means we have to
> > > handle
> > > the tasks in a reverse order. This may affect the latency, do you
> > > have
> > > some numbers for it, like avg and 99% 95% lat?
> > > 
> > 
> > Do you have an idea for how to test that? I used a microbenchmark
> > as
> > well as a network benchmark [1] to verify that overall throughput
> > is
> > higher. TW latency sounds a lot more complicated to measure as it's
> > difficult to trigger accurately.
> > 
> > My feeling is that with reasonable batching (say 8-16 items) the
> > latency will be low as TW is generally very quick, but if you have
> > an
> > idea for benchmarking I can take a look
> > 
> > [1]: https://github.com/DylanZA/netbench
> 
> It can be normal IO requests I think. We can test the latency by fio
> with small size IO to a fast block device(like nvme) in SQPOLL
> mode(since for non-SQPOLL, it doesn't make difference). This way we
> can
> see the influence of reverse order handling.
> 
> Regards,
> Hao

I see little difference locally, but there is quite a big stdev so it's
possible my test setup is a bit wonky

new:
    clat (msec): min=2027, max=10544, avg=6347.10, stdev=2458.20
     lat (nsec): min=1440, max=16719k, avg=119714.72, stdev=153571.49
old:
    clat (msec): min=2738, max=10550, avg=6700.68, stdev=2251.77
     lat (nsec): min=1278, max=16610k, avg=121025.73, stdev=211896.14


* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
  2022-06-22  9:31       ` Dylan Yudaken
@ 2022-06-22 11:16         ` Hao Xu
  2022-06-22 11:24           ` Hao Xu
  0 siblings, 1 reply; 20+ messages in thread
From: Hao Xu @ 2022-06-22 11:16 UTC (permalink / raw)
  To: Dylan Yudaken, axboe, asml.silence, io-uring; +Cc: Kernel Team

On 6/22/22 17:31, Dylan Yudaken wrote:
> On Tue, 2022-06-21 at 15:34 +0800, Hao Xu wrote:
>> On 6/21/22 15:03, Dylan Yudaken wrote:
>>> On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
>>>> On 6/21/22 00:18, Dylan Yudaken wrote:
>>>>> Task work currently uses a spin lock to guard task_list and
>>>>> task_running. Some use cases such as networking can trigger
>>>>> task_work_add
>>>>> from multiple threads all at once, which suffers from
>>>>> contention
>>>>> here.
>>>>>
>>>>> This can be changed to use a lockless list which seems to have
>>>>> better
>>>>> performance. Running the micro benchmark in [1] I see 20%
>>>>> improvment in
>>>>> multithreaded task work add. It required removing the priority
>>>>> tw
>>>>> list
>>>>> optimisation, however it isn't clear how important that
>>>>> optimisation is.
>>>>> Additionally it has fairly easy to break semantics.
>>>>>
>>>>> Patch 1-2 remove the priority tw list optimisation
>>>>> Patch 3-5 add lockless lists for task work
>>>>> Patch 6 fixes a bug I noticed in io_uring event tracing
>>>>> Patch 7-8 adds tracing for task_work_run
>>>>>
>>>>
>>>> Compared to the spinlock overhead, the prio task list
>>>> optimization is
>>>> definitely unimportant, so I agree with removing it here.
>>>> Replace the task list with llisy was something I considered but I
>>>> gave
>>>> it up since it changes the list to a stack which means we have to
>>>> handle
>>>> the tasks in a reverse order. This may affect the latency, do you
>>>> have
>>>> some numbers for it, like avg and 99% 95% lat?
>>>>
>>>
>>> Do you have an idea for how to test that? I used a microbenchmark
>>> as
>>> well as a network benchmark [1] to verify that overall throughput
>>> is
>>> higher. TW latency sounds a lot more complicated to measure as it's
>>> difficult to trigger accurately.
>>>
>>> My feeling is that with reasonable batching (say 8-16 items) the
>>> latency will be low as TW is generally very quick, but if you have
>>> an
>>> idea for benchmarking I can take a look
>>>
>>> [1]: https://github.com/DylanZA/netbench
>>
>> It can be normal IO requests I think. We can test the latency by fio
>> with small size IO to a fast block device(like nvme) in SQPOLL
>> mode(since for non-SQPOLL, it doesn't make difference). This way we
>> can
>> see the influence of reverse order handling.
>>
>> Regards,
>> Hao
> 
> I see little difference locally, but there is quite a big stdev so it's
> possible my test setup is a bit wonky
> 
> new:
>      clat (msec): min=2027, max=10544, avg=6347.10, stdev=2458.20
>       lat (nsec): min=1440, max=16719k, avg=119714.72, stdev=153571.49
> old:
>      clat (msec): min=2738, max=10550, avg=6700.68, stdev=2251.77
>       lat (nsec): min=1278, max=16610k, avg=121025.73, stdev=211896.14
> 

Hi Dylan,

Could you post the arguments you used and the 99%/95% latencies as well?

Regards,
Hao


* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
  2022-06-22 11:16         ` Hao Xu
@ 2022-06-22 11:24           ` Hao Xu
  2022-06-22 11:51             ` Dylan Yudaken
  2022-06-22 11:52             ` Hao Xu
  0 siblings, 2 replies; 20+ messages in thread
From: Hao Xu @ 2022-06-22 11:24 UTC (permalink / raw)
  To: Dylan Yudaken, axboe, asml.silence, io-uring; +Cc: Kernel Team

On 6/22/22 19:16, Hao Xu wrote:
> On 6/22/22 17:31, Dylan Yudaken wrote:
>> On Tue, 2022-06-21 at 15:34 +0800, Hao Xu wrote:
>>> On 6/21/22 15:03, Dylan Yudaken wrote:
>>>> On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
>>>>> On 6/21/22 00:18, Dylan Yudaken wrote:
>>>>>> Task work currently uses a spin lock to guard task_list and
>>>>>> task_running. Some use cases such as networking can trigger
>>>>>> task_work_add
>>>>>> from multiple threads all at once, which suffers from
>>>>>> contention
>>>>>> here.
>>>>>>
>>>>>> This can be changed to use a lockless list which seems to have
>>>>>> better
>>>>>> performance. Running the micro benchmark in [1] I see 20%
>>>>>> improvment in
>>>>>> multithreaded task work add. It required removing the priority
>>>>>> tw
>>>>>> list
>>>>>> optimisation, however it isn't clear how important that
>>>>>> optimisation is.
>>>>>> Additionally it has fairly easy to break semantics.
>>>>>>
>>>>>> Patch 1-2 remove the priority tw list optimisation
>>>>>> Patch 3-5 add lockless lists for task work
>>>>>> Patch 6 fixes a bug I noticed in io_uring event tracing
>>>>>> Patch 7-8 adds tracing for task_work_run
>>>>>>
>>>>>
>>>>> Compared to the spinlock overhead, the prio task list
>>>>> optimization is
>>>>> definitely unimportant, so I agree with removing it here.
>>>>> Replace the task list with llisy was something I considered but I
>>>>> gave
>>>>> it up since it changes the list to a stack which means we have to
>>>>> handle
>>>>> the tasks in a reverse order. This may affect the latency, do you
>>>>> have
>>>>> some numbers for it, like avg and 99% 95% lat?
>>>>>
>>>>
>>>> Do you have an idea for how to test that? I used a microbenchmark
>>>> as
>>>> well as a network benchmark [1] to verify that overall throughput
>>>> is
>>>> higher. TW latency sounds a lot more complicated to measure as it's
>>>> difficult to trigger accurately.
>>>>
>>>> My feeling is that with reasonable batching (say 8-16 items) the
>>>> latency will be low as TW is generally very quick, but if you have
>>>> an
>>>> idea for benchmarking I can take a look
>>>>
>>>> [1]: https://github.com/DylanZA/netbench
>>>
>>> It can be normal IO requests I think. We can test the latency by fio
>>> with small size IO to a fast block device(like nvme) in SQPOLL
>>> mode(since for non-SQPOLL, it doesn't make difference). This way we
>>> can
>>> see the influence of reverse order handling.
>>>
>>> Regards,
>>> Hao
>>
>> I see little difference locally, but there is quite a big stdev so it's
>> possible my test setup is a bit wonky
>>
>> new:
>>      clat (msec): min=2027, max=10544, avg=6347.10, stdev=2458.20
>>       lat (nsec): min=1440, max=16719k, avg=119714.72, stdev=153571.49
>> old:
>>      clat (msec): min=2738, max=10550, avg=6700.68, stdev=2251.77
>>       lat (nsec): min=1278, max=16610k, avg=121025.73, stdev=211896.14
>>
> 
> Hi Dylan,
> 
> Could you post the arguments you use and the 99% 95% latency as well?
> 
> Regards,
> Hao
> 

One thing I'm worried about is that under heavy workloads, with TWs
continuously coming in, the TWs at the end of the TW list don't get the
chance to run, which leads to the latency of those ones becoming high.

* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
  2022-06-22 11:24           ` Hao Xu
@ 2022-06-22 11:51             ` Dylan Yudaken
  2022-06-22 12:28               ` Hao Xu
  2022-06-22 11:52             ` Hao Xu
  1 sibling, 1 reply; 20+ messages in thread
From: Dylan Yudaken @ 2022-06-22 11:51 UTC (permalink / raw)
  To: axboe, hao.xu, asml.silence, io-uring; +Cc: Kernel Team

On Wed, 2022-06-22 at 19:24 +0800, Hao Xu wrote:
> On 6/22/22 19:16, Hao Xu wrote:
> > On 6/22/22 17:31, Dylan Yudaken wrote:
> > > On Tue, 2022-06-21 at 15:34 +0800, Hao Xu wrote:
> > > > On 6/21/22 15:03, Dylan Yudaken wrote:
> > > > > On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
> > > > > > On 6/21/22 00:18, Dylan Yudaken wrote:
> > > > > > > Task work currently uses a spin lock to guard task_list
> > > > > > > and
> > > > > > > task_running. Some use cases such as networking can
> > > > > > > trigger
> > > > > > > task_work_add
> > > > > > > from multiple threads all at once, which suffers from
> > > > > > > contention
> > > > > > > here.
> > > > > > > 
> > > > > > > This can be changed to use a lockless list which seems to
> > > > > > > have
> > > > > > > better
> > > > > > > performance. Running the micro benchmark in [1] I see 20%
> > > > > > > improvment in
> > > > > > > multithreaded task work add. It required removing the
> > > > > > > priority
> > > > > > > tw
> > > > > > > list
> > > > > > > optimisation, however it isn't clear how important that
> > > > > > > optimisation is.
> > > > > > > Additionally it has fairly easy to break semantics.
> > > > > > > 
> > > > > > > Patch 1-2 remove the priority tw list optimisation
> > > > > > > Patch 3-5 add lockless lists for task work
> > > > > > > Patch 6 fixes a bug I noticed in io_uring event tracing
> > > > > > > Patch 7-8 adds tracing for task_work_run
> > > > > > > 
> > > > > > 
> > > > > > Compared to the spinlock overhead, the prio task list
> > > > > > optimization is
> > > > > > definitely unimportant, so I agree with removing it here.
> > > > > > Replace the task list with llisy was something I considered
> > > > > > but I
> > > > > > gave
> > > > > > it up since it changes the list to a stack which means we
> > > > > > have to
> > > > > > handle
> > > > > > the tasks in a reverse order. This may affect the latency,
> > > > > > do you
> > > > > > have
> > > > > > some numbers for it, like avg and 99% 95% lat?
> > > > > > 
> > > > > 
> > > > > Do you have an idea for how to test that? I used a
> > > > > microbenchmark
> > > > > as
> > > > > well as a network benchmark [1] to verify that overall
> > > > > throughput
> > > > > is
> > > > > higher. TW latency sounds a lot more complicated to measure
> > > > > as it's
> > > > > difficult to trigger accurately.
> > > > > 
> > > > > My feeling is that with reasonable batching (say 8-16 items)
> > > > > the
> > > > > latency will be low as TW is generally very quick, but if you
> > > > > have
> > > > > an
> > > > > idea for benchmarking I can take a look
> > > > > 
> > > > > [1]: https://github.com/DylanZA/netbench
> > > > 
> > > > It can be normal IO requests I think. We can test the latency
> > > > by fio
> > > > with small size IO to a fast block device(like nvme) in SQPOLL
> > > > mode(since for non-SQPOLL, it doesn't make difference). This
> > > > way we
> > > > can
> > > > see the influence of reverse order handling.
> > > > 
> > > > Regards,
> > > > Hao
> > > 
> > > I see little difference locally, but there is quite a big stdev
> > > so it's
> > > possible my test setup is a bit wonky
> > > 
> > > new:
> > >      clat (msec): min=2027, max=10544, avg=6347.10, stdev=2458.20
> > >       lat (nsec): min=1440, max=16719k, avg=119714.72,
> > > stdev=153571.49
> > > old:
> > >      clat (msec): min=2738, max=10550, avg=6700.68, stdev=2251.77
> > >       lat (nsec): min=1278, max=16610k, avg=121025.73,
> > > stdev=211896.14
> > > 
> > 
> > Hi Dylan,
> > 
> > Could you post the arguments you use and the 99% 95% latency as
> > well?
> > 
> > Regards,
> > Hao
> > 
> 
> One thing I'm worrying about is under heavy workloads, there are
> contiguous TWs coming in, thus the TWs at the end of the TW list
> doesn't
> get the chance to run, which leads to the latency of those ones
> becoming
> high.

Pavel mentioned I should change some arguments, so I reran it. I'll
just post all the output below, as I'm not sure exactly what you are
looking for. Note I checked that it is definitely batching: it is doing
batches of 10-20 in tctx_task_work.


*** config ***

[global]
ioengine=io_uring
sqthread_poll=1
registerfiles=1
fixedbufs=1
hipri=0
thread=1
direct=0
rw=randread
time_based=1
runtime=600
ramp_time=30
randrepeat=0
group_reporting=0
sqthread_poll_cpu=15
iodepth=32

[job0]
filename=/dev/nullb0
cpus_allowed=1
bs=512

*** new ***
job0: (g=0): rw=randread, bs=(R) 512B-512B, (W) 512B-512B, (T) 512B-
512B, ioengine=io_uring, iodepth=32
fio-3.30-59-gd4bf5
Starting 1 thread
Jobs: 1 (f=0): [f(1)][100.0%][r=360MiB/s][r=738k IOPS][eta 00m:00s]
job0: (groupid=0, jobs=1): err= 0: pid=37255: Wed Jun 22 03:44:23 2022
  read: IOPS=596k, BW=291MiB/s (305MB/s)(171GiB/600001msec)
    clat (msec): min=30343, max=630343, avg=369885.75, stdev=164921.26
     lat (usec): min=14, max=1802, avg=53.23, stdev=18.84
    clat percentiles (msec):
     |  1.00th=[17113],  5.00th=[17113], 10.00th=[17113],
20.00th=[17113],
     | 30.00th=[17113], 40.00th=[17113], 50.00th=[17113],
60.00th=[17113],
     | 70.00th=[17113], 80.00th=[17113], 90.00th=[17113],
95.00th=[17113],
     | 99.00th=[17113], 99.50th=[17113], 99.90th=[17113],
99.95th=[17113],
     | 99.99th=[17113]
   bw (  KiB/s): min=169237, max=381603, per=100.00%, avg=298171.22,
stdev=70580.65, samples=1199
   iops        : min=338474, max=763206, avg=596342.60,
stdev=141161.31, samples=1199
  lat (msec)   : >=2000=100.00%
  cpu          : usr=99.98%, sys=0.00%, ctx=4378, majf=0, minf=9
  IO depths    : 1=0.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=100.0%,
>=64=0.0%
     submit    : 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
>=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%,
>=64=0.0%
     issued rwts: total=357661967,0,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=32

Run status group 0 (all jobs):
   READ: bw=291MiB/s (305MB/s), 291MiB/s-291MiB/s (305MB/s-305MB/s),
io=171GiB (183GB), run=600001-600001msec

Disk stats (read/write):
  nullb0: ios=72127555/0, merge=11/0, ticks=1396298/0,
in_queue=1396298, util=100.00%
  
*** old ***

job0: (g=0): rw=randread, bs=(R) 512B-512B, (W) 512B-512B, (T) 512B-
512B, ioengine=io_uring, iodepth=32
fio-3.30-59-gd4bf5
Starting 1 thread
Jobs: 1 (f=1): [r(1)][100.0%][r=367MiB/s][r=751k IOPS][eta 00m:00s]
job0: (groupid=0, jobs=1): err= 0: pid=19216: Wed Jun 22 04:43:36 2022
  read: IOPS=609k, BW=297MiB/s (312MB/s)(174GiB/600001msec)
    clat (msec): min=30333, max=630333, avg=368961.53, stdev=164532.01
     lat (usec): min=14, max=5830, avg=52.11, stdev=18.64
    clat percentiles (msec):
     |  1.00th=[17113],  5.00th=[17113], 10.00th=[17113],
20.00th=[17113],
     | 30.00th=[17113], 40.00th=[17113], 50.00th=[17113],
60.00th=[17113],
     | 70.00th=[17113], 80.00th=[17113], 90.00th=[17113],
95.00th=[17113],
     | 99.00th=[17113], 99.50th=[17113], 99.90th=[17113],
99.95th=[17113],
     | 99.99th=[17113]
   bw (  KiB/s): min=170273, max=386932, per=100.00%, avg=304548.39,
stdev=70732.20, samples=1200
   iops        : min=340547, max=773864, avg=609096.94,
stdev=141464.41, samples=1200
  lat (msec)   : >=2000=100.00%
  cpu          : usr=99.98%, sys=0.00%, ctx=3912, majf=0, minf=5
  IO depths    : 1=0.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=100.0%,
>=64=0.0%
     submit    : 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
>=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%,
>=64=0.0%
     issued rwts: total=365258392,0,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=32

Run status group 0 (all jobs):
   READ: bw=297MiB/s (312MB/s), 297MiB/s-297MiB/s (312MB/s-312MB/s),
io=174GiB (187GB), run=600001-600001msec

Disk stats (read/write):
  nullb0: ios=69031421/0, merge=1/0, ticks=1323086/0, in_queue=1323086,
util=100.00%


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
  2022-06-22 11:24           ` Hao Xu
  2022-06-22 11:51             ` Dylan Yudaken
@ 2022-06-22 11:52             ` Hao Xu
  1 sibling, 0 replies; 20+ messages in thread
From: Hao Xu @ 2022-06-22 11:52 UTC (permalink / raw)
  To: Dylan Yudaken, axboe, asml.silence, io-uring; +Cc: Kernel Team

On 6/22/22 19:24, Hao Xu wrote:
> On 6/22/22 19:16, Hao Xu wrote:
>> On 6/22/22 17:31, Dylan Yudaken wrote:
>>> On Tue, 2022-06-21 at 15:34 +0800, Hao Xu wrote:
>>>> On 6/21/22 15:03, Dylan Yudaken wrote:
>>>>> On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
>>>>>> On 6/21/22 00:18, Dylan Yudaken wrote:
>>>>>>> Task work currently uses a spin lock to guard task_list and
>>>>>>> task_running. Some use cases such as networking can trigger
>>>>>>> task_work_add
>>>>>>> from multiple threads all at once, which suffers from
>>>>>>> contention
>>>>>>> here.
>>>>>>>
>>>>>>> This can be changed to use a lockless list which seems to have
>>>>>>> better
>>>>>>> performance. Running the micro benchmark in [1] I see 20%
>>>>>>> improvment in
>>>>>>> multithreaded task work add. It required removing the priority
>>>>>>> tw
>>>>>>> list
>>>>>>> optimisation, however it isn't clear how important that
>>>>>>> optimisation is.
>>>>>>> Additionally it has fairly easy to break semantics.
>>>>>>>
>>>>>>> Patch 1-2 remove the priority tw list optimisation
>>>>>>> Patch 3-5 add lockless lists for task work
>>>>>>> Patch 6 fixes a bug I noticed in io_uring event tracing
>>>>>>> Patch 7-8 adds tracing for task_work_run
>>>>>>>
>>>>>>
>>>>>> Compared to the spinlock overhead, the prio task list
>>>>>> optimization is
>>>>>> definitely unimportant, so I agree with removing it here.
>>>>>> Replacing the task list with llist was something I considered but I
>>>>>> gave
>>>>>> it up since it changes the list to a stack which means we have to
>>>>>> handle
>>>>>> the tasks in a reverse order. This may affect the latency, do you
>>>>>> have
>>>>>> some numbers for it, like avg and 99% 95% lat?
>>>>>>
>>>>>
>>>>> Do you have an idea for how to test that? I used a microbenchmark
>>>>> as
>>>>> well as a network benchmark [1] to verify that overall throughput
>>>>> is
>>>>> higher. TW latency sounds a lot more complicated to measure as it's
>>>>> difficult to trigger accurately.
>>>>>
>>>>> My feeling is that with reasonable batching (say 8-16 items) the
>>>>> latency will be low as TW is generally very quick, but if you have
>>>>> an
>>>>> idea for benchmarking I can take a look
>>>>>
>>>>> [1]: https://github.com/DylanZA/netbench
>>>>
>>>> It can be normal IO requests I think. We can test the latency by fio
>>>> with small size IO to a fast block device(like nvme) in SQPOLL
>>>> mode(since for non-SQPOLL, it doesn't make difference). This way we
>>>> can
>>>> see the influence of reverse order handling.
>>>>
>>>> Regards,
>>>> Hao
>>>
>>> I see little difference locally, but there is quite a big stdev so it's
>>> possible my test setup is a bit wonky
>>>
>>> new:
>>>      clat (msec): min=2027, max=10544, avg=6347.10, stdev=2458.20
>>>       lat (nsec): min=1440, max=16719k, avg=119714.72, stdev=153571.49
>>> old:
>>>      clat (msec): min=2738, max=10550, avg=6700.68, stdev=2251.77
>>>       lat (nsec): min=1278, max=16610k, avg=121025.73, stdev=211896.14
>>>
>>
>> Hi Dylan,
>>
>> Could you post the arguments you use and the 99% 95% latency as well?
>>
>> Regards,
>> Hao
>>
> 
> One thing I'm worrying about is under heavy workloads, there are
> contiguous TWs coming in, thus the TWs at the end of the TW list don't
> get the chance to run, which leads to the latency of those ones becoming
> high.

Ah, looking at the code again, it seems we take the whole list rather
than a single node each time, so it shouldn't be a big problem.
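
i.e. the add side stays a single lockless push and the run side takes
the whole pending chain in one step, so nothing that was already
queued can be skipped. Roughly (a sketch with hypothetical names, not
the exact patch):

#include <linux/llist.h>

static LLIST_HEAD(pending);	/* stands in for tctx->task_list */

/* producer: safe from any number of threads at once, no spinlock */
static void queue_tw(struct llist_node *n)
{
	llist_add(n, &pending);
}

/*
 * consumer: detaches the entire pending chain in one step; anything
 * added afterwards lands on the now-empty head and is handled by the
 * next run, so earlier items are not starved by later adds.
 */
static struct llist_node *take_all_tw(void)
{
	return llist_del_all(&pending);
}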


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
  2022-06-22 11:51             ` Dylan Yudaken
@ 2022-06-22 12:28               ` Hao Xu
  2022-06-22 12:29                 ` Hao Xu
  0 siblings, 1 reply; 20+ messages in thread
From: Hao Xu @ 2022-06-22 12:28 UTC (permalink / raw)
  To: Dylan Yudaken, axboe, asml.silence, io-uring; +Cc: Kernel Team

On 6/22/22 19:51, Dylan Yudaken wrote:
> On Wed, 2022-06-22 at 19:24 +0800, Hao Xu wrote:
>> On 6/22/22 19:16, Hao Xu wrote:
>>> On 6/22/22 17:31, Dylan Yudaken wrote:
>>>> On Tue, 2022-06-21 at 15:34 +0800, Hao Xu wrote:
>>>>> On 6/21/22 15:03, Dylan Yudaken wrote:
>>>>>> On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
>>>>>>> On 6/21/22 00:18, Dylan Yudaken wrote:
>>>>>>>> Task work currently uses a spin lock to guard task_list
>>>>>>>> and
>>>>>>>> task_running. Some use cases such as networking can
>>>>>>>> trigger
>>>>>>>> task_work_add
>>>>>>>> from multiple threads all at once, which suffers from
>>>>>>>> contention
>>>>>>>> here.
>>>>>>>>
>>>>>>>> This can be changed to use a lockless list which seems to
>>>>>>>> have
>>>>>>>> better
>>>>>>>> performance. Running the micro benchmark in [1] I see 20%
>>>>>>>> improvment in
>>>>>>>> multithreaded task work add. It required removing the
>>>>>>>> priority
>>>>>>>> tw
>>>>>>>> list
>>>>>>>> optimisation, however it isn't clear how important that
>>>>>>>> optimisation is.
>>>>>>>> Additionally it has fairly easy to break semantics.
>>>>>>>>
>>>>>>>> Patch 1-2 remove the priority tw list optimisation
>>>>>>>> Patch 3-5 add lockless lists for task work
>>>>>>>> Patch 6 fixes a bug I noticed in io_uring event tracing
>>>>>>>> Patch 7-8 adds tracing for task_work_run
>>>>>>>>
>>>>>>>
>>>>>>> Compared to the spinlock overhead, the prio task list
>>>>>>> optimization is
>>>>>>> definitely unimportant, so I agree with removing it here.
>>>>>>> Replacing the task list with llist was something I considered
>>>>>>> but I
>>>>>>> gave
>>>>>>> it up since it changes the list to a stack which means we
>>>>>>> have to
>>>>>>> handle
>>>>>>> the tasks in a reverse order. This may affect the latency,
>>>>>>> do you
>>>>>>> have
>>>>>>> some numbers for it, like avg and 99% 95% lat?
>>>>>>>
>>>>>>
>>>>>> Do you have an idea for how to test that? I used a
>>>>>> microbenchmark
>>>>>> as
>>>>>> well as a network benchmark [1] to verify that overall
>>>>>> throughput
>>>>>> is
>>>>>> higher. TW latency sounds a lot more complicated to measure
>>>>>> as it's
>>>>>> difficult to trigger accurately.
>>>>>>
>>>>>> My feeling is that with reasonable batching (say 8-16 items)
>>>>>> the
>>>>>> latency will be low as TW is generally very quick, but if you
>>>>>> have
>>>>>> an
>>>>>> idea for benchmarking I can take a look
>>>>>>
>>>>>> [1]: https://github.com/DylanZA/netbench
>>>>>
>>>>> It can be normal IO requests I think. We can test the latency
>>>>> by fio
>>>>> with small size IO to a fast block device(like nvme) in SQPOLL
>>>>> mode(since for non-SQPOLL, it doesn't make difference). This
>>>>> way we
>>>>> can
>>>>> see the influence of reverse order handling.
>>>>>
>>>>> Regards,
>>>>> Hao
>>>>
>>>> I see little difference locally, but there is quite a big stdev
>>>> so it's
>>>> possible my test setup is a bit wonky
>>>>
>>>> new:
>>>>       clat (msec): min=2027, max=10544, avg=6347.10, stdev=2458.20
>>>>        lat (nsec): min=1440, max=16719k, avg=119714.72,
>>>> stdev=153571.49
>>>> old:
>>>>       clat (msec): min=2738, max=10550, avg=6700.68, stdev=2251.77
>>>>        lat (nsec): min=1278, max=16610k, avg=121025.73,
>>>> stdev=211896.14
>>>>
>>>
>>> Hi Dylan,
>>>
>>> Could you post the arguments you use and the 99% 95% latency as
>>> well?
>>>
>>> Regards,
>>> Hao
>>>
>>
>> One thing I'm worrying about is under heavy workloads, there are
>> contiguous TWs coming in, thus the TWs at the end of the TW list
>> don't
>> get the chance to run, which leads to the latency of those ones
>> becoming
>> high.
> 
> Pavel mentioned I should change some arguments, so I reran it. I'll
> just post all the output below as I'm not sure exactly what you are looking
> for. Note I checked that it was definitely batching and it is doing
> batches of 10-20 in tctx_task_work
> 
> 
> *** config ***
> 
> [global]
> ioengine=io_uring
> sqthread_poll=1
> registerfiles=1
> fixedbufs=1
> hipri=0
> thread=1
> direct=0
> rw=randread
> time_based=1
> runtime=600
> ramp_time=30
> randrepeat=0
> group_reporting=0
> sqthread_poll_cpu=15
> iodepth=32
> 
> [job0]
> filename=/dev/nullb0
> cpus_allowed=1
> bs=512
> 
> *** new ***
> job0: (g=0): rw=randread, bs=(R) 512B-512B, (W) 512B-512B, (T) 512B-
> 512B, ioengine=io_uring, iodepth=32
> fio-3.30-59-gd4bf5
> Starting 1 thread
> Jobs: 1 (f=0): [f(1)][100.0%][r=360MiB/s][r=738k IOPS][eta 00m:00s]
> job0: (groupid=0, jobs=1): err= 0: pid=37255: Wed Jun 22 03:44:23 2022
>    read: IOPS=596k, BW=291MiB/s (305MB/s)(171GiB/600001msec)
>      clat (msec): min=30343, max=630343, avg=369885.75, stdev=164921.26
>       lat (usec): min=14, max=1802, avg=53.23, stdev=18.84
>      clat percentiles (msec):
>       |  1.00th=[17113],  5.00th=[17113], 10.00th=[17113],
> 20.00th=[17113],
>       | 30.00th=[17113], 40.00th=[17113], 50.00th=[17113],
> 60.00th=[17113],
>       | 70.00th=[17113], 80.00th=[17113], 90.00th=[17113],
> 95.00th=[17113],
>       | 99.00th=[17113], 99.50th=[17113], 99.90th=[17113],
> 99.95th=[17113],
>       | 99.99th=[17113]
>     bw (  KiB/s): min=169237, max=381603, per=100.00%, avg=298171.22,
> stdev=70580.65, samples=1199
>     iops        : min=338474, max=763206, avg=596342.60,
> stdev=141161.31, samples=1199
>    lat (msec)   : >=2000=100.00%
>    cpu          : usr=99.98%, sys=0.00%, ctx=4378, majf=0, minf=9
>    IO depths    : 1=0.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=100.0%,
>> =64=0.0%
>       submit    : 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
>> =64=0.0%
>       complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%,
>> =64=0.0%
>       issued rwts: total=357661967,0,0,0 short=0,0,0,0 dropped=0,0,0,0
>       latency   : target=0, window=0, percentile=100.00%, depth=32
> 
> Run status group 0 (all jobs):
>     READ: bw=291MiB/s (305MB/s), 291MiB/s-291MiB/s (305MB/s-305MB/s),
> io=171GiB (183GB), run=600001-600001msec
> 
> Disk stats (read/write):
>    nullb0: ios=72127555/0, merge=11/0, ticks=1396298/0,
> in_queue=1396298, util=100.00%
>    
> *** old ***
> 
> job0: (g=0): rw=randread, bs=(R) 512B-512B, (W) 512B-512B, (T) 512B-
> 512B, ioengine=io_uring, iodepth=32
> fio-3.30-59-gd4bf5
> Starting 1 thread
> Jobs: 1 (f=1): [r(1)][100.0%][r=367MiB/s][r=751k IOPS][eta 00m:00s]
> job0: (groupid=0, jobs=1): err= 0: pid=19216: Wed Jun 22 04:43:36 2022
>    read: IOPS=609k, BW=297MiB/s (312MB/s)(174GiB/600001msec)
>      clat (msec): min=30333, max=630333, avg=368961.53, stdev=164532.01
>       lat (usec): min=14, max=5830, avg=52.11, stdev=18.64
>      clat percentiles (msec):
>       |  1.00th=[17113],  5.00th=[17113], 10.00th=[17113],
> 20.00th=[17113],
>       | 30.00th=[17113], 40.00th=[17113], 50.00th=[17113],
> 60.00th=[17113],
>       | 70.00th=[17113], 80.00th=[17113], 90.00th=[17113],
> 95.00th=[17113],
>       | 99.00th=[17113], 99.50th=[17113], 99.90th=[17113],
> 99.95th=[17113],
>       | 99.99th=[17113]
>     bw (  KiB/s): min=170273, max=386932, per=100.00%, avg=304548.39,
> stdev=70732.20, samples=1200
>     iops        : min=340547, max=773864, avg=609096.94,
> stdev=141464.41, samples=1200
>    lat (msec)   : >=2000=100.00%
>    cpu          : usr=99.98%, sys=0.00%, ctx=3912, majf=0, minf=5
>    IO depths    : 1=0.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=100.0%,
>> =64=0.0%
>       submit    : 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
>> =64=0.0%
>       complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%,
>> =64=0.0%
>       issued rwts: total=365258392,0,0,0 short=0,0,0,0 dropped=0,0,0,0
>       latency   : target=0, window=0, percentile=100.00%, depth=32
> 
> Run status group 0 (all jobs):
>     READ: bw=297MiB/s (312MB/s), 297MiB/s-297MiB/s (312MB/s-312MB/s),
> io=174GiB (187GB), run=600001-600001msec
> 
> Disk stats (read/write):
>    nullb0: ios=69031421/0, merge=1/0, ticks=1323086/0, in_queue=1323086,
> util=100.00%
> 

Ok, the clat percentiles seems meanless here... from the min max and avg
data it should be fine. Thanks for testing.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH RFC for-next 0/8] io_uring: tw contention improvments
  2022-06-22 12:28               ` Hao Xu
@ 2022-06-22 12:29                 ` Hao Xu
  0 siblings, 0 replies; 20+ messages in thread
From: Hao Xu @ 2022-06-22 12:29 UTC (permalink / raw)
  To: Dylan Yudaken, axboe, asml.silence, io-uring; +Cc: Kernel Team

On 6/22/22 20:28, Hao Xu wrote:
> On 6/22/22 19:51, Dylan Yudaken wrote:
>> On Wed, 2022-06-22 at 19:24 +0800, Hao Xu wrote:
>>> On 6/22/22 19:16, Hao Xu wrote:
>>>> On 6/22/22 17:31, Dylan Yudaken wrote:
>>>>> On Tue, 2022-06-21 at 15:34 +0800, Hao Xu wrote:
>>>>>> On 6/21/22 15:03, Dylan Yudaken wrote:
>>>>>>> On Tue, 2022-06-21 at 13:10 +0800, Hao Xu wrote:
>>>>>>>> On 6/21/22 00:18, Dylan Yudaken wrote:
>>>>>>>>> Task work currently uses a spin lock to guard task_list
>>>>>>>>> and
>>>>>>>>> task_running. Some use cases such as networking can
>>>>>>>>> trigger
>>>>>>>>> task_work_add
>>>>>>>>> from multiple threads all at once, which suffers from
>>>>>>>>> contention
>>>>>>>>> here.
>>>>>>>>>
>>>>>>>>> This can be changed to use a lockless list which seems to
>>>>>>>>> have
>>>>>>>>> better
>>>>>>>>> performance. Running the micro benchmark in [1] I see 20%
>>>>>>>>> improvment in
>>>>>>>>> multithreaded task work add. It required removing the
>>>>>>>>> priority
>>>>>>>>> tw
>>>>>>>>> list
>>>>>>>>> optimisation, however it isn't clear how important that
>>>>>>>>> optimisation is.
>>>>>>>>> Additionally it has fairly easy to break semantics.
>>>>>>>>>
>>>>>>>>> Patch 1-2 remove the priority tw list optimisation
>>>>>>>>> Patch 3-5 add lockless lists for task work
>>>>>>>>> Patch 6 fixes a bug I noticed in io_uring event tracing
>>>>>>>>> Patch 7-8 adds tracing for task_work_run
>>>>>>>>>
>>>>>>>>
>>>>>>>> Compared to the spinlock overhead, the prio task list
>>>>>>>> optimization is
>>>>>>>> definitely unimportant, so I agree with removing it here.
>>>>>>>> Replacing the task list with llist was something I considered
>>>>>>>> but I
>>>>>>>> gave
>>>>>>>> it up since it changes the list to a stack which means we
>>>>>>>> have to
>>>>>>>> handle
>>>>>>>> the tasks in a reverse order. This may affect the latency,
>>>>>>>> do you
>>>>>>>> have
>>>>>>>> some numbers for it, like avg and 99% 95% lat?
>>>>>>>>
>>>>>>>
>>>>>>> Do you have an idea for how to test that? I used a
>>>>>>> microbenchmark
>>>>>>> as
>>>>>>> well as a network benchmark [1] to verify that overall
>>>>>>> throughput
>>>>>>> is
>>>>>>> higher. TW latency sounds a lot more complicated to measure
>>>>>>> as it's
>>>>>>> difficult to trigger accurately.
>>>>>>>
>>>>>>> My feeling is that with reasonable batching (say 8-16 items)
>>>>>>> the
>>>>>>> latency will be low as TW is generally very quick, but if you
>>>>>>> have
>>>>>>> an
>>>>>>> idea for benchmarking I can take a look
>>>>>>>
>>>>>>> [1]: https://github.com/DylanZA/netbench
>>>>>>
>>>>>> It can be normal IO requests I think. We can test the latency
>>>>>> by fio
>>>>>> with small size IO to a fast block device(like nvme) in SQPOLL
>>>>>> mode(since for non-SQPOLL, it doesn't make difference). This
>>>>>> way we
>>>>>> can
>>>>>> see the influence of reverse order handling.
>>>>>>
>>>>>> Regards,
>>>>>> Hao
>>>>>
>>>>> I see little difference locally, but there is quite a big stdev
>>>>> so it's
>>>>> possible my test setup is a bit wonky
>>>>>
>>>>> new:
>>>>>       clat (msec): min=2027, max=10544, avg=6347.10, stdev=2458.20
>>>>>        lat (nsec): min=1440, max=16719k, avg=119714.72,
>>>>> stdev=153571.49
>>>>> old:
>>>>>       clat (msec): min=2738, max=10550, avg=6700.68, stdev=2251.77
>>>>>        lat (nsec): min=1278, max=16610k, avg=121025.73,
>>>>> stdev=211896.14
>>>>>
>>>>
>>>> Hi Dylan,
>>>>
>>>> Could you post the arguments you use and the 99% 95% latency as
>>>> well?
>>>>
>>>> Regards,
>>>> Hao
>>>>
>>>
>>> One thing I'm worrying about is under heavy workloads, there are
>>> contiguous TWs coming in, thus the TWs at the end of the TW list
>>> don't
>>> get the chance to run, which leads to the latency of those ones
>>> becoming
>>> high.
>>
>> Pavel mentioned I should change some arguments, so I reran it. I'll
>> just post all the output below as I'm not sure exactly what you are looking
>> for. Note I checked that it was definitely batching and it is doing
>> batches of 10-20 in tctx_task_work
>>
>>
>> *** config ***
>>
>> [global]
>> ioengine=io_uring
>> sqthread_poll=1
>> registerfiles=1
>> fixedbufs=1
>> hipri=0
>> thread=1
>> direct=0
>> rw=randread
>> time_based=1
>> runtime=600
>> ramp_time=30
>> randrepeat=0
>> group_reporting=0
>> sqthread_poll_cpu=15
>> iodepth=32
>>
>> [job0]
>> filename=/dev/nullb0
>> cpus_allowed=1
>> bs=512
>>
>> *** new ***
>> job0: (g=0): rw=randread, bs=(R) 512B-512B, (W) 512B-512B, (T) 512B-
>> 512B, ioengine=io_uring, iodepth=32
>> fio-3.30-59-gd4bf5
>> Starting 1 thread
>> Jobs: 1 (f=0): [f(1)][100.0%][r=360MiB/s][r=738k IOPS][eta 00m:00s]
>> job0: (groupid=0, jobs=1): err= 0: pid=37255: Wed Jun 22 03:44:23 2022
>>    read: IOPS=596k, BW=291MiB/s (305MB/s)(171GiB/600001msec)
>>      clat (msec): min=30343, max=630343, avg=369885.75, stdev=164921.26
>>       lat (usec): min=14, max=1802, avg=53.23, stdev=18.84
>>      clat percentiles (msec):
>>       |  1.00th=[17113],  5.00th=[17113], 10.00th=[17113],
>> 20.00th=[17113],
>>       | 30.00th=[17113], 40.00th=[17113], 50.00th=[17113],
>> 60.00th=[17113],
>>       | 70.00th=[17113], 80.00th=[17113], 90.00th=[17113],
>> 95.00th=[17113],
>>       | 99.00th=[17113], 99.50th=[17113], 99.90th=[17113],
>> 99.95th=[17113],
>>       | 99.99th=[17113]
>>     bw (  KiB/s): min=169237, max=381603, per=100.00%, avg=298171.22,
>> stdev=70580.65, samples=1199
>>     iops        : min=338474, max=763206, avg=596342.60,
>> stdev=141161.31, samples=1199
>>    lat (msec)   : >=2000=100.00%
>>    cpu          : usr=99.98%, sys=0.00%, ctx=4378, majf=0, minf=9
>>    IO depths    : 1=0.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=100.0%,
>>> =64=0.0%
>>       submit    : 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
>>> =64=0.0%
>>       complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%,
>>> =64=0.0%
>>       issued rwts: total=357661967,0,0,0 short=0,0,0,0 dropped=0,0,0,0
>>       latency   : target=0, window=0, percentile=100.00%, depth=32
>>
>> Run status group 0 (all jobs):
>>     READ: bw=291MiB/s (305MB/s), 291MiB/s-291MiB/s (305MB/s-305MB/s),
>> io=171GiB (183GB), run=600001-600001msec
>>
>> Disk stats (read/write):
>>    nullb0: ios=72127555/0, merge=11/0, ticks=1396298/0,
>> in_queue=1396298, util=100.00%
>> *** old ***
>>
>> job0: (g=0): rw=randread, bs=(R) 512B-512B, (W) 512B-512B, (T) 512B-
>> 512B, ioengine=io_uring, iodepth=32
>> fio-3.30-59-gd4bf5
>> Starting 1 thread
>> Jobs: 1 (f=1): [r(1)][100.0%][r=367MiB/s][r=751k IOPS][eta 00m:00s]
>> job0: (groupid=0, jobs=1): err= 0: pid=19216: Wed Jun 22 04:43:36 2022
>>    read: IOPS=609k, BW=297MiB/s (312MB/s)(174GiB/600001msec)
>>      clat (msec): min=30333, max=630333, avg=368961.53, stdev=164532.01
>>       lat (usec): min=14, max=5830, avg=52.11, stdev=18.64
>>      clat percentiles (msec):
>>       |  1.00th=[17113],  5.00th=[17113], 10.00th=[17113],
>> 20.00th=[17113],
>>       | 30.00th=[17113], 40.00th=[17113], 50.00th=[17113],
>> 60.00th=[17113],
>>       | 70.00th=[17113], 80.00th=[17113], 90.00th=[17113],
>> 95.00th=[17113],
>>       | 99.00th=[17113], 99.50th=[17113], 99.90th=[17113],
>> 99.95th=[17113],
>>       | 99.99th=[17113]
>>     bw (  KiB/s): min=170273, max=386932, per=100.00%, avg=304548.39,
>> stdev=70732.20, samples=1200
>>     iops        : min=340547, max=773864, avg=609096.94,
>> stdev=141464.41, samples=1200
>>    lat (msec)   : >=2000=100.00%
>>    cpu          : usr=99.98%, sys=0.00%, ctx=3912, majf=0, minf=5
>>    IO depths    : 1=0.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=100.0%,
>>> =64=0.0%
>>       submit    : 0=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
>>> =64=0.0%
>>       complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.1%, 64=0.0%,
>>> =64=0.0%
>>       issued rwts: total=365258392,0,0,0 short=0,0,0,0 dropped=0,0,0,0
>>       latency   : target=0, window=0, percentile=100.00%, depth=32
>>
>> Run status group 0 (all jobs):
>>     READ: bw=297MiB/s (312MB/s), 297MiB/s-297MiB/s (312MB/s-312MB/s),
>> io=174GiB (187GB), run=600001-600001msec
>>
>> Disk stats (read/write):
>>    nullb0: ios=69031421/0, merge=1/0, ticks=1323086/0, in_queue=1323086,
>> util=100.00%
>>
> 
> Ok, the clat percentiles seems meanless here... from the min max and avg

                                     ^ meaningless

> data it should be fine. Thanks for testing.


^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2022-06-22 12:29 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-20 16:18 [PATCH RFC for-next 0/8] io_uring: tw contention improvments Dylan Yudaken
2022-06-20 16:18 ` [PATCH RFC for-next 1/8] io_uring: remove priority tw list optimisation Dylan Yudaken
2022-06-20 16:18 ` [PATCH RFC for-next 2/8] io_uring: remove __io_req_task_work_add Dylan Yudaken
2022-06-20 16:18 ` [PATCH RFC for-next 3/8] io_uring: lockless task list Dylan Yudaken
2022-06-20 16:18 ` [PATCH RFC for-next 4/8] io_uring: introduce llist helpers Dylan Yudaken
2022-06-20 16:18 ` [PATCH RFC for-next 5/8] io_uring: batch task_work Dylan Yudaken
2022-06-20 16:18 ` [PATCH RFC for-next 6/8] io_uring: move io_uring_get_opcode out of TP_printk Dylan Yudaken
2022-06-20 16:19 ` [PATCH RFC for-next 7/8] io_uring: add trace event for running task work Dylan Yudaken
2022-06-20 16:19 ` [PATCH RFC for-next 8/8] io_uring: trace task_work_run Dylan Yudaken
2022-06-21  5:10 ` [PATCH RFC for-next 0/8] io_uring: tw contention improvments Hao Xu
2022-06-21  7:03   ` Dylan Yudaken
2022-06-21  7:34     ` Hao Xu
2022-06-22  9:31       ` Dylan Yudaken
2022-06-22 11:16         ` Hao Xu
2022-06-22 11:24           ` Hao Xu
2022-06-22 11:51             ` Dylan Yudaken
2022-06-22 12:28               ` Hao Xu
2022-06-22 12:29                 ` Hao Xu
2022-06-22 11:52             ` Hao Xu
2022-06-21  7:38     ` Hao Xu
