IO-Uring Archive on lore.kernel.org
 help / color / Atom feed
* [RFC 0/2] 3 cacheline io_kiocb
@ 2020-07-25  8:31 Pavel Begunkov
  2020-07-25  8:31 ` [PATCH 1/2] io_uring: allocate req->work dynamically Pavel Begunkov
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Pavel Begunkov @ 2020-07-25  8:31 UTC (permalink / raw)
  To: Jens Axboe, io-uring

That's not final for several reasons, but good enough for discussion.
That brings io_kiocb down to 192B. I didn't try to benchmark it
properly, but quick nop test gave +5% throughput increase.
7531 vs 7910 KIOPS with fio/t/io_uring

The whole situation is obviously a bunch of tradeoffs. For instance,
instead of shrinking it, we can inline apoll to speed apoll path.

[2/2] just for a reference, I'm thinking about other ways to shrink it.
e.g. ->link_list can be a singly-linked list with linked timeouts
storing a back-reference. This can turn out to be better, because
that would move ->fixed_file_refs to the 2nd cacheline, so we won't
ever touch 3rd cacheline in the submission path.
Any other ideas?

note: on top of for-5.9/io_uring,
f56040b819998 ("io_uring: deduplicate io_grab_files() calls")

Pavel Begunkov (2):
  io_uring: allocate req->work dynamically
  io_uring: unionise ->apoll and ->work

 fs/io-wq.h    |   1 +
 fs/io_uring.c | 207 ++++++++++++++++++++++++++------------------------
 2 files changed, 110 insertions(+), 98 deletions(-)

-- 
2.24.0


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 1/2] io_uring: allocate req->work dynamically
  2020-07-25  8:31 [RFC 0/2] 3 cacheline io_kiocb Pavel Begunkov
@ 2020-07-25  8:31 ` Pavel Begunkov
  2020-07-25  8:31 ` [PATCH 2/2] io_uring: unionise ->apoll and ->work Pavel Begunkov
  2020-07-25 15:45 ` [RFC 0/2] 3 cacheline io_kiocb Jens Axboe
  2 siblings, 0 replies; 8+ messages in thread
From: Pavel Begunkov @ 2020-07-25  8:31 UTC (permalink / raw)
  To: Jens Axboe, io-uring

req->work takes a lot of space and is not needed in the hot path, so
don't embed it into struct io_kiocb; allocate it dynamically instead.
The changes are pretty straightforward; the only noticeable thing is a
new ->private field in io_wq_work, which grows it to 64B and is used to
store a reference to the request.

That shrinks io_kiocb to 200 bytes

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io-wq.h    |   1 +
 fs/io_uring.c | 203 +++++++++++++++++++++++++-------------------------
 2 files changed, 104 insertions(+), 100 deletions(-)

diff --git a/fs/io-wq.h b/fs/io-wq.h
index ddaf9614cf9b..1358e022ed4b 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -85,6 +85,7 @@ static inline void wq_list_del(struct io_wq_work_list *list,
 
 struct io_wq_work {
 	struct io_wq_work_node list;
+	void *private;
 	struct files_struct *files;
 	struct mm_struct *mm;
 	const struct cred *creds;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index c7e8e9a1b27b..ef4c6e50aa4f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -544,7 +544,6 @@ enum {
 	REQ_F_POLLED_BIT,
 	REQ_F_BUFFER_SELECTED_BIT,
 	REQ_F_NO_FILE_TABLE_BIT,
-	REQ_F_WORK_INITIALIZED_BIT,
 	REQ_F_TASK_PINNED_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
@@ -591,8 +590,6 @@ enum {
 	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
 	/* doesn't need file table for this request */
 	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
-	/* io_wq_work is initialized */
-	REQ_F_WORK_INITIALIZED	= BIT(REQ_F_WORK_INITIALIZED_BIT),
 	/* req->task is refcounted */
 	REQ_F_TASK_PINNED	= BIT(REQ_F_TASK_PINNED_BIT),
 };
@@ -600,7 +597,6 @@ enum {
 struct async_poll {
 	struct io_poll_iocb	poll;
 	struct io_poll_iocb	*double_poll;
-	struct io_wq_work	work;
 };
 
 /*
@@ -657,19 +653,13 @@ struct io_kiocb {
 
 	struct percpu_ref	*fixed_file_refs;
 
-	union {
-		/*
-		 * Only commands that never go async can use the below fields,
-		 * obviously. Right now only IORING_OP_POLL_ADD uses them, and
-		 * async armed poll handlers for regular commands. The latter
-		 * restore the work, if needed.
-		 */
-		struct {
-			struct hlist_node	hash_node;
-			struct async_poll	*apoll;
-		};
-		struct io_wq_work	work;
-	};
+	/*
+	 * Right now only IORING_OP_POLL_ADD uses it, and
+	 * async armed poll handlers for regular commands.
+	 */
+	struct hlist_node	hash_node;
+	struct async_poll	*apoll;
+	struct io_wq_work	*work;
 	struct callback_head	task_work;
 };
 
@@ -902,6 +892,7 @@ enum io_mem_account {
 	ACCT_PINNED,
 };
 
+static void io_req_complete(struct io_kiocb *req, long res);
 static bool io_rw_reissue(struct io_kiocb *req, long res);
 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 static void io_put_req(struct io_kiocb *req);
@@ -1008,13 +999,12 @@ static inline void req_set_fail_links(struct io_kiocb *req)
  * Note: must call io_req_init_async() for the first time you
  * touch any members of io_wq_work.
  */
-static inline void io_req_init_async(struct io_kiocb *req)
+static inline bool io_req_init_async(struct io_kiocb *req)
 {
-	if (req->flags & REQ_F_WORK_INITIALIZED)
-		return;
-
-	memset(&req->work, 0, sizeof(req->work));
-	req->flags |= REQ_F_WORK_INITIALIZED;
+	if (req->work)
+		return true;
+	req->work = kzalloc(sizeof(*req->work), GFP_KERNEL);
+	return req->work != NULL;
 }
 
 static inline bool io_async_submit(struct io_ring_ctx *ctx)
@@ -1121,72 +1111,85 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
 
 static void io_req_clean_work(struct io_kiocb *req)
 {
-	if (!(req->flags & REQ_F_WORK_INITIALIZED))
+	struct io_wq_work *work = req->work;
+
+	if (!work)
 		return;
 
-	if (req->work.mm) {
-		mmdrop(req->work.mm);
-		req->work.mm = NULL;
+	if (work->mm) {
+		mmdrop(work->mm);
+		work->mm = NULL;
 	}
-	if (req->work.creds) {
-		put_cred(req->work.creds);
-		req->work.creds = NULL;
+	if (work->creds) {
+		put_cred(work->creds);
+		work->creds = NULL;
 	}
-	if (req->work.fs) {
-		struct fs_struct *fs = req->work.fs;
+	if (work->fs) {
+		struct fs_struct *fs = work->fs;
 
-		spin_lock(&req->work.fs->lock);
+		spin_lock(&work->fs->lock);
 		if (--fs->users)
 			fs = NULL;
-		spin_unlock(&req->work.fs->lock);
+		spin_unlock(&work->fs->lock);
 		if (fs)
 			free_fs_struct(fs);
 	}
+	kfree(work);
 }
 
-static void io_prep_async_work(struct io_kiocb *req)
+static bool io_prep_async_work(struct io_kiocb *req)
 {
 	const struct io_op_def *def = &io_op_defs[req->opcode];
+	struct io_wq_work *work;
 
-	io_req_init_async(req);
+	if (!io_req_init_async(req))
+		return false;
+	work = req->work;
+	work->private = req;
 
 	if (req->flags & REQ_F_ISREG) {
 		if (def->hash_reg_file)
-			io_wq_hash_work(&req->work, file_inode(req->file));
+			io_wq_hash_work(work, file_inode(req->file));
 	} else {
 		if (def->unbound_nonreg_file)
-			req->work.flags |= IO_WQ_WORK_UNBOUND;
+			work->flags |= IO_WQ_WORK_UNBOUND;
 	}
-	if (!req->work.mm && def->needs_mm) {
+	if (!work->mm && def->needs_mm) {
 		mmgrab(current->mm);
-		req->work.mm = current->mm;
+		work->mm = current->mm;
 	}
-	if (!req->work.creds)
-		req->work.creds = get_current_cred();
-	if (!req->work.fs && def->needs_fs) {
+	if (!work->creds)
+		work->creds = get_current_cred();
+	if (!work->fs && def->needs_fs) {
 		spin_lock(&current->fs->lock);
 		if (!current->fs->in_exec) {
-			req->work.fs = current->fs;
-			req->work.fs->users++;
+			work->fs = current->fs;
+			work->fs->users++;
 		} else {
-			req->work.flags |= IO_WQ_WORK_CANCEL;
+			work->flags |= IO_WQ_WORK_CANCEL;
 		}
 		spin_unlock(&current->fs->lock);
 	}
 	if (def->needs_fsize)
-		req->work.fsize = rlimit(RLIMIT_FSIZE);
+		work->fsize = rlimit(RLIMIT_FSIZE);
 	else
-		req->work.fsize = RLIM_INFINITY;
+		work->fsize = RLIM_INFINITY;
+	return true;
 }
 
-static void io_prep_async_link(struct io_kiocb *req)
+static bool io_prep_async_link(struct io_kiocb *req)
 {
 	struct io_kiocb *cur;
 
-	io_prep_async_work(req);
-	if (req->flags & REQ_F_LINK_HEAD)
-		list_for_each_entry(cur, &req->link_list, link_list)
-			io_prep_async_work(cur);
+	if (!io_prep_async_work(req))
+		return false;
+	if (!(req->flags & REQ_F_LINK_HEAD))
+		return true;
+
+	list_for_each_entry(cur, &req->link_list, link_list)
+		if (!io_prep_async_work(cur))
+			return false;
+	return true;
 }
 
 static void __io_queue_async_work(struct io_kiocb *req)
@@ -1194,9 +1197,9 @@ static void __io_queue_async_work(struct io_kiocb *req)
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_kiocb *link = io_prep_linked_timeout(req);
 
-	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
-					&req->work, req->flags);
-	io_wq_enqueue(ctx->io_wq, &req->work);
+	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(req->work), req,
+					req->work, req->flags);
+	io_wq_enqueue(ctx->io_wq, req->work);
 
 	if (link)
 		io_queue_linked_timeout(link);
@@ -1205,7 +1208,12 @@ static void __io_queue_async_work(struct io_kiocb *req)
 static void io_queue_async_work(struct io_kiocb *req)
 {
 	/* init ->work of the whole link before punting */
-	io_prep_async_link(req);
+	if (!io_prep_async_link(req)) {
+		req_set_fail_links(req);
+		io_put_req(req);
+		io_req_complete(req, -ENOMEM);
+		return;
+	}
 	__io_queue_async_work(req);
 }
 
@@ -1898,7 +1906,7 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req)
 		return NULL;
 
 	nxt = io_req_find_next(req);
-	return nxt ? &nxt->work : NULL;
+	return nxt ? nxt->work : NULL;
 }
 
 /*
@@ -3226,8 +3234,9 @@ static int __io_splice_prep(struct io_kiocb *req,
 		 * Splice operation will be punted aync, and here need to
 		 * modify io_wq_work.flags, so initialize io_wq_work firstly.
 		 */
-		io_req_init_async(req);
-		req->work.flags |= IO_WQ_WORK_UNBOUND;
+		if (!io_req_init_async(req))
+			return -ENOMEM;
+		req->work->flags |= IO_WQ_WORK_UNBOUND;
 	}
 
 	return 0;
@@ -3804,8 +3813,9 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	 * leave the 'file' in an undeterminate state, and here need to modify
 	 * io_wq_work.flags, so initialize io_wq_work firstly.
 	 */
-	io_req_init_async(req);
-	req->work.flags |= IO_WQ_WORK_NO_CANCEL;
+	if (!io_req_init_async(req))
+		return -ENOMEM;
+	req->work->flags |= IO_WQ_WORK_NO_CANCEL;
 
 	if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
 		return -EINVAL;
@@ -3847,7 +3857,7 @@ static int io_close(struct io_kiocb *req, bool force_nonblock,
 	}
 
 	/* No ->flush() or already async, safely close from here */
-	ret = filp_close(close->put_file, req->work.files);
+	ret = filp_close(close->put_file, req->work->files);
 	if (ret < 0)
 		req_set_fail_links(req);
 	fput(close->put_file);
@@ -4666,10 +4676,6 @@ static void io_async_task_func(struct callback_head *cb)
 	io_poll_remove_double(req, apoll->double_poll);
 	spin_unlock_irq(&ctx->completion_lock);
 
-	/* restore ->work in case we need to retry again */
-	if (req->flags & REQ_F_WORK_INITIALIZED)
-		memcpy(&req->work, &apoll->work, sizeof(req->work));
-
 	if (!READ_ONCE(apoll->poll.canceled))
 		__io_req_task_submit(req);
 	else
@@ -4761,9 +4767,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
 	apoll->double_poll = NULL;
 
 	req->flags |= REQ_F_POLLED;
-	if (req->flags & REQ_F_WORK_INITIALIZED)
-		memcpy(&apoll->work, &req->work, sizeof(req->work));
-
 	io_get_req_task(req);
 	req->apoll = apoll;
 	INIT_HLIST_NODE(&req->hash_node);
@@ -4782,8 +4785,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
 	if (ret) {
 		io_poll_remove_double(req, apoll->double_poll);
 		spin_unlock_irq(&ctx->completion_lock);
-		if (req->flags & REQ_F_WORK_INITIALIZED)
-			memcpy(&req->work, &apoll->work, sizeof(req->work));
 		kfree(apoll->double_poll);
 		kfree(apoll);
 		return false;
@@ -4826,14 +4827,6 @@ static bool io_poll_remove_one(struct io_kiocb *req)
 		do_complete = __io_poll_remove_one(req, &apoll->poll);
 		if (do_complete) {
 			io_put_req(req);
-			/*
-			 * restore ->work because we will call
-			 * io_req_clean_work below when dropping the
-			 * final reference.
-			 */
-			if (req->flags & REQ_F_WORK_INITIALIZED)
-				memcpy(&req->work, &apoll->work,
-				       sizeof(req->work));
 			kfree(apoll->double_poll);
 			kfree(apoll);
 		}
@@ -5166,7 +5159,7 @@ static int io_timeout(struct io_kiocb *req)
 
 static bool io_cancel_cb(struct io_wq_work *work, void *data)
 {
-	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+	struct io_kiocb *req = work->private;
 
 	return req->user_data == (unsigned long) data;
 }
@@ -5434,7 +5427,9 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		if (ret)
 			return ret;
 	}
-	io_prep_async_link(req);
+	if (!io_prep_async_link(req))
+		return -ENOMEM;
+
 	de = kmalloc(sizeof(*de), GFP_KERNEL);
 	if (!de)
 		return -ENOMEM;
@@ -5757,7 +5752,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
 {
-	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+	struct io_kiocb *req = work->private;
 	struct io_kiocb *timeout;
 	int ret = 0;
 
@@ -5847,9 +5842,10 @@ static int io_grab_files(struct io_kiocb *req)
 	int ret = -EBADF;
 	struct io_ring_ctx *ctx = req->ctx;
 
-	io_req_init_async(req);
+	if (!io_req_init_async(req))
+		return -ENOMEM;
 
-	if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
+	if (req->work->files || (req->flags & REQ_F_NO_FILE_TABLE))
 		return 0;
 	if (!ctx->ring_file)
 		return -EBADF;
@@ -5865,7 +5861,7 @@ static int io_grab_files(struct io_kiocb *req)
 	if (fcheck(ctx->ring_fd) == ctx->ring_file) {
 		list_add(&req->inflight_entry, &ctx->inflight_list);
 		req->flags |= REQ_F_INFLIGHT;
-		req->work.files = current->files;
+		req->work->files = current->files;
 		ret = 0;
 	}
 	spin_unlock_irq(&ctx->inflight_lock);
@@ -5964,19 +5960,20 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	struct io_kiocb *linked_timeout;
 	struct io_kiocb *nxt;
 	const struct cred *old_creds = NULL;
+	struct io_wq_work *work;
 	int ret;
 
 again:
 	linked_timeout = io_prep_linked_timeout(req);
 
-	if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
-	    req->work.creds != current_cred()) {
+	work = req->work;
+	if (work && work->creds && work->creds != current_cred()) {
 		if (old_creds)
 			revert_creds(old_creds);
-		if (old_creds == req->work.creds)
+		if (old_creds == work->creds)
 			old_creds = NULL; /* restored original creds */
 		else
-			old_creds = override_creds(req->work.creds);
+			old_creds = override_creds(work->creds);
 	}
 
 	ret = io_issue_sqe(req, sqe, true, cs);
@@ -6050,12 +6047,16 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 				goto fail_req;
 		}
 
+		if (!io_req_init_async(req)) {
+			ret = -ENOMEM;
+			goto fail_req;
+		}
+
 		/*
 		 * Never try inline submit of IOSQE_ASYNC is set, go straight
 		 * to async execution.
 		 */
-		io_req_init_async(req);
-		req->work.flags |= IO_WQ_WORK_CONCURRENT;
+		req->work->flags |= IO_WQ_WORK_CONCURRENT;
 		io_queue_async_work(req);
 	} else {
 		__io_queue_sqe(req, sqe, cs);
@@ -6231,6 +6232,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	req->file = NULL;
 	req->ctx = ctx;
 	req->flags = 0;
+	req->work = NULL;
 	/* one is dropped after submission, the other at completion */
 	refcount_set(&req->refs, 2);
 	req->task = current;
@@ -6253,11 +6255,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 	id = READ_ONCE(sqe->personality);
 	if (id) {
-		io_req_init_async(req);
-		req->work.creds = idr_find(&ctx->personality_idr, id);
-		if (unlikely(!req->work.creds))
+		if (!io_req_init_async(req))
+			return -ENOMEM;
+		req->work->creds = idr_find(&ctx->personality_idr, id);
+		if (unlikely(!req->work->creds))
 			return -EINVAL;
-		get_cred(req->work.creds);
+		get_cred(req->work->creds);
 	}
 
 	/* same numerical values with corresponding REQ_F_*, safe to copy */
@@ -7237,7 +7240,7 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
 
 static void io_free_work(struct io_wq_work *work)
 {
-	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+	struct io_kiocb *req = work->private;
 
 	/* Consider that io_steal_work() relies on this ref */
 	io_put_req(req);
@@ -7853,7 +7856,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 
 		spin_lock_irq(&ctx->inflight_lock);
 		list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
-			if (req->work.files != files)
+			if (req->work->files != files)
 				continue;
 			/* req is being completed, ignore */
 			if (!refcount_inc_not_zero(&req->refs))
@@ -7894,7 +7897,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 				continue;
 			}
 		} else {
-			io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
+			io_wq_cancel_work(ctx->io_wq, cancel_req->work);
 			io_put_req(cancel_req);
 		}
 
@@ -7905,7 +7908,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 
 static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
 {
-	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+	struct io_kiocb *req = work->private;
 	struct task_struct *task = data;
 
 	return req->task == task;
-- 
2.24.0


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 2/2] io_uring: unionise ->apoll and ->work
  2020-07-25  8:31 [RFC 0/2] 3 cacheline io_kiocb Pavel Begunkov
  2020-07-25  8:31 ` [PATCH 1/2] io_uring: allocate req->work dynamically Pavel Begunkov
@ 2020-07-25  8:31 ` Pavel Begunkov
  2020-07-25 15:45 ` [RFC 0/2] 3 cacheline io_kiocb Jens Axboe
  2 siblings, 0 replies; 8+ messages in thread
From: Pavel Begunkov @ 2020-07-25  8:31 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Save a bit of space by placing ->apoll and ->work ptrs into a union,
making io_kiocb take 192B (3 cachelines).

note: this patch is just for reference; there are other, probably better,
ways to save 8B.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ef4c6e50aa4f..6894a9a5db30 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -597,6 +597,7 @@ enum {
 struct async_poll {
 	struct io_poll_iocb	poll;
 	struct io_poll_iocb	*double_poll;
+	struct io_wq_work	*work;
 };
 
 /*
@@ -658,8 +659,10 @@ struct io_kiocb {
 	 * async armed poll handlers for regular commands.
 	 */
 	struct hlist_node	hash_node;
-	struct async_poll	*apoll;
-	struct io_wq_work	*work;
+	union {
+		struct async_poll	*apoll;
+		struct io_wq_work	*work;
+	};
 	struct callback_head	task_work;
 };
 
@@ -4676,6 +4679,8 @@ static void io_async_task_func(struct callback_head *cb)
 	io_poll_remove_double(req, apoll->double_poll);
 	spin_unlock_irq(&ctx->completion_lock);
 
+	req->work = apoll->work;
+
 	if (!READ_ONCE(apoll->poll.canceled))
 		__io_req_task_submit(req);
 	else
@@ -4765,6 +4770,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
 	if (unlikely(!apoll))
 		return false;
 	apoll->double_poll = NULL;
+	apoll->work = req->work;
 
 	req->flags |= REQ_F_POLLED;
 	io_get_req_task(req);
@@ -4785,6 +4791,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
 	if (ret) {
 		io_poll_remove_double(req, apoll->double_poll);
 		spin_unlock_irq(&ctx->completion_lock);
+		req->work = apoll->work;
 		kfree(apoll->double_poll);
 		kfree(apoll);
 		return false;
@@ -4826,6 +4833,7 @@ static bool io_poll_remove_one(struct io_kiocb *req)
 		/* non-poll requests have submit ref still */
 		do_complete = __io_poll_remove_one(req, &apoll->poll);
 		if (do_complete) {
+			req->work = apoll->work;
 			io_put_req(req);
 			kfree(apoll->double_poll);
 			kfree(apoll);
@@ -4962,7 +4970,7 @@ static int io_poll_add(struct io_kiocb *req)
 
 	/* ->work is in union with hash_node and others */
 	io_req_clean_work(req);
-	req->flags &= ~REQ_F_WORK_INITIALIZED;
+	req->work = NULL;
 
 	INIT_HLIST_NODE(&req->hash_node);
 	ipt.pt._qproc = io_poll_queue_proc;
-- 
2.24.0


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC 0/2] 3 cacheline io_kiocb
  2020-07-25  8:31 [RFC 0/2] 3 cacheline io_kiocb Pavel Begunkov
  2020-07-25  8:31 ` [PATCH 1/2] io_uring: allocate req->work dynamically Pavel Begunkov
  2020-07-25  8:31 ` [PATCH 2/2] io_uring: unionise ->apoll and ->work Pavel Begunkov
@ 2020-07-25 15:45 ` Jens Axboe
  2020-07-25 18:24   ` Pavel Begunkov
  2 siblings, 1 reply; 8+ messages in thread
From: Jens Axboe @ 2020-07-25 15:45 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 7/25/20 2:31 AM, Pavel Begunkov wrote:
> That's not final for a several reasons, but good enough for discussion.
> That brings io_kiocb down to 192B. I didn't try to benchmark it
> properly, but quick nop test gave +5% throughput increase.
> 7531 vs 7910 KIOPS with fio/t/io_uring
> 
> The whole situation is obviously a bunch of tradeoffs. For instance,
> instead of shrinking it, we can inline apoll to speed apoll path.
> 
> [2/2] just for a reference, I'm thinking about other ways to shrink it.
> e.g. ->link_list can be a single-linked list with linked tiemouts
> storing a back-reference. This can turn out to be better, because
> that would move ->fixed_file_refs to the 2nd cacheline, so we won't
> ever touch 3rd cacheline in the submission path.
> Any other ideas?

Nothing noticeable for me, still about the same performance. But
generally speaking, I don't necessarily think we need to go all in on
making this as tiny as possible. It's much more important to chase the
items where we only use 2 cachelines for the hot path, and then we have
the extra space in there already for the semi hot paths like poll driven
retry. Yes, we're still allocating from a pool that has slightly larger
objects, but that doesn't really matter _that_ much. Avoiding an extra
kmalloc+kfree for the semi hot paths is a bigger deal than making
io_kiocb smaller and smaller.

That said, for no-brainer changes, we absolutely should make it smaller.
I just don't want to jump through convoluted hoops to get there.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC 0/2] 3 cacheline io_kiocb
  2020-07-25 15:45 ` [RFC 0/2] 3 cacheline io_kiocb Jens Axboe
@ 2020-07-25 18:24   ` Pavel Begunkov
  2020-07-25 19:40     ` Jens Axboe
  0 siblings, 1 reply; 8+ messages in thread
From: Pavel Begunkov @ 2020-07-25 18:24 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 25/07/2020 18:45, Jens Axboe wrote:
> On 7/25/20 2:31 AM, Pavel Begunkov wrote:
>> That's not final for a several reasons, but good enough for discussion.
>> That brings io_kiocb down to 192B. I didn't try to benchmark it
>> properly, but quick nop test gave +5% throughput increase.
>> 7531 vs 7910 KIOPS with fio/t/io_uring
>>
>> The whole situation is obviously a bunch of tradeoffs. For instance,
>> instead of shrinking it, we can inline apoll to speed apoll path.
>>
>> [2/2] just for a reference, I'm thinking about other ways to shrink it.
>> e.g. ->link_list can be a single-linked list with linked tiemouts
>> storing a back-reference. This can turn out to be better, because
>> that would move ->fixed_file_refs to the 2nd cacheline, so we won't
>> ever touch 3rd cacheline in the submission path.
>> Any other ideas?
> 
> Nothing noticeable for me, still about the same performance. But
> generally speaking, I don't necessarily think we need to go all in on
> making this as tiny as possible. It's much more important to chase the
> items where we only use 2 cachelines for the hot path, and then we have
> the extra space in there already for the semi hot paths like poll driven
> retry. Yes, we're still allocating from a pool that has slightly larger
> objects, but that doesn't really matter _that_ much. Avoiding an extra
> kmalloc+kfree for the semi hot paths are a bigger deal than making
> io_kiocb smaller and smaller.
> 
> That said, for no-brainer changes, we absolutely should make it smaller.
> I just don't want to jump through convoluted hoops to get there.

Agree, but that's not the end goal. The first point is to kill the union,
but it already has enough space for that.

The second is to see, whether we can use the space in a better way. From
the high level perspective ->apoll and ->work are alike and both serve to
provide asynchronous paths, hence the idea to swap them naturally comes to
mind. TBH, I don't think it'd do much, because init of ->io would probably
hide any benefit.

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC 0/2] 3 cacheline io_kiocb
  2020-07-25 18:24   ` Pavel Begunkov
@ 2020-07-25 19:40     ` Jens Axboe
  2020-07-25 20:14       ` Pavel Begunkov
  0 siblings, 1 reply; 8+ messages in thread
From: Jens Axboe @ 2020-07-25 19:40 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 7/25/20 12:24 PM, Pavel Begunkov wrote:
> On 25/07/2020 18:45, Jens Axboe wrote:
>> On 7/25/20 2:31 AM, Pavel Begunkov wrote:
>>> That's not final for a several reasons, but good enough for discussion.
>>> That brings io_kiocb down to 192B. I didn't try to benchmark it
>>> properly, but quick nop test gave +5% throughput increase.
>>> 7531 vs 7910 KIOPS with fio/t/io_uring
>>>
>>> The whole situation is obviously a bunch of tradeoffs. For instance,
>>> instead of shrinking it, we can inline apoll to speed apoll path.
>>>
>>> [2/2] just for a reference, I'm thinking about other ways to shrink it.
>>> e.g. ->link_list can be a single-linked list with linked tiemouts
>>> storing a back-reference. This can turn out to be better, because
>>> that would move ->fixed_file_refs to the 2nd cacheline, so we won't
>>> ever touch 3rd cacheline in the submission path.
>>> Any other ideas?
>>
>> Nothing noticeable for me, still about the same performance. But
>> generally speaking, I don't necessarily think we need to go all in on
>> making this as tiny as possible. It's much more important to chase the
>> items where we only use 2 cachelines for the hot path, and then we have
>> the extra space in there already for the semi hot paths like poll driven
>> retry. Yes, we're still allocating from a pool that has slightly larger
>> objects, but that doesn't really matter _that_ much. Avoiding an extra
>> kmalloc+kfree for the semi hot paths are a bigger deal than making
>> io_kiocb smaller and smaller.
>>
>> That said, for no-brainer changes, we absolutely should make it smaller.
>> I just don't want to jump through convoluted hoops to get there.
> 
> Agree, but that's not the end goal. The first point is to kill the union,
> but it already has enough space for that.

Right

> The second is to see, whether we can use the space in a better way. From
> the high level perspective ->apoll and ->work are alike and both serve to
> provide asynchronous paths, hence the idea to swap them naturally comes to
> mind.

Totally agree, which is why the union of those kind of makes sense.
We're definitely NOT using them at the same time, but the fact that we
had various mm/creds/whatnot in the work_struct made that a bit iffy.

> TBH, I don't think it'd do much, because init of ->io would probably
> hide any benefit.

There should be no ->io init/alloc for this test case.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC 0/2] 3 cacheline io_kiocb
  2020-07-25 19:40     ` Jens Axboe
@ 2020-07-25 20:14       ` Pavel Begunkov
  2020-07-25 20:25         ` Jens Axboe
  0 siblings, 1 reply; 8+ messages in thread
From: Pavel Begunkov @ 2020-07-25 20:14 UTC (permalink / raw)
  To: Jens Axboe, io-uring

 On 25/07/2020 22:40, Jens Axboe wrote:
> On 7/25/20 12:24 PM, Pavel Begunkov wrote:
>> On 25/07/2020 18:45, Jens Axboe wrote:
>>> On 7/25/20 2:31 AM, Pavel Begunkov wrote:
>>>> That's not final for a several reasons, but good enough for discussion.
>>>> That brings io_kiocb down to 192B. I didn't try to benchmark it
>>>> properly, but quick nop test gave +5% throughput increase.
>>>> 7531 vs 7910 KIOPS with fio/t/io_uring
>>>>
>>>> The whole situation is obviously a bunch of tradeoffs. For instance,
>>>> instead of shrinking it, we can inline apoll to speed apoll path.
>>>>
>>>> [2/2] just for a reference, I'm thinking about other ways to shrink it.
>>>> e.g. ->link_list can be a single-linked list with linked tiemouts
>>>> storing a back-reference. This can turn out to be better, because
>>>> that would move ->fixed_file_refs to the 2nd cacheline, so we won't
>>>> ever touch 3rd cacheline in the submission path.
>>>> Any other ideas?
>>>
>>> Nothing noticeable for me, still about the same performance. But
>>> generally speaking, I don't necessarily think we need to go all in on
>>> making this as tiny as possible. It's much more important to chase the
>>> items where we only use 2 cachelines for the hot path, and then we have
>>> the extra space in there already for the semi hot paths like poll driven
>>> retry. Yes, we're still allocating from a pool that has slightly larger
>>> objects, but that doesn't really matter _that_ much. Avoiding an extra
>>> kmalloc+kfree for the semi hot paths are a bigger deal than making
>>> io_kiocb smaller and smaller.
>>>
>>> That said, for no-brainer changes, we absolutely should make it smaller.
>>> I just don't want to jump through convoluted hoops to get there.
>>
>> Agree, but that's not the end goal. The first point is to kill the union,
>> but it already has enough space for that.
> 
> Right
> 
>> The second is to see, whether we can use the space in a better way. From
>> the high level perspective ->apoll and ->work are alike and both serve to
>> provide asynchronous paths, hence the idea to swap them naturally comes to
>> mind.
> 
> Totally agree, which is why the union of those kind of makes sense.
> We're definitely NOT using them at the same time, but the fact that we
> had various mm/creds/whatnot in the work_struct made that a bit iffy.

Thinking of it, if combined with work de-init as you proposed before, it's
probably possible to make a layout similar to the one below

struct io_kiocb {
	...
	struct hlist_node	hash_node;
	struct callback_head	task_work;	
	union {
		struct io_wq_work	work;
		struct async_poll	apoll;
	};
};

Saves the ->apoll kmalloc(), and the actual work de-init would be negligibly
rare. Worth a try.


> 
>> TBH, I don't think it'd do much, because init of ->io would probably
>> hide any benefit.
> 
> There should be no ->io init/alloc for this test case.

I mean, before getting into io_arm_poll_handler(), it should get -EAGAIN
in io_{read,write}() and initialise ->io in io_setup_async_rw(), at least
for READV, WRITEV.

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC 0/2] 3 cacheline io_kiocb
  2020-07-25 20:14       ` Pavel Begunkov
@ 2020-07-25 20:25         ` Jens Axboe
  0 siblings, 0 replies; 8+ messages in thread
From: Jens Axboe @ 2020-07-25 20:25 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 7/25/20 2:14 PM, Pavel Begunkov wrote:
>  On 25/07/2020 22:40, Jens Axboe wrote:
>> On 7/25/20 12:24 PM, Pavel Begunkov wrote:
>>> On 25/07/2020 18:45, Jens Axboe wrote:
>>>> On 7/25/20 2:31 AM, Pavel Begunkov wrote:
>>>>> That's not final for a several reasons, but good enough for discussion.
>>>>> That brings io_kiocb down to 192B. I didn't try to benchmark it
>>>>> properly, but quick nop test gave +5% throughput increase.
>>>>> 7531 vs 7910 KIOPS with fio/t/io_uring
>>>>>
>>>>> The whole situation is obviously a bunch of tradeoffs. For instance,
>>>>> instead of shrinking it, we can inline apoll to speed apoll path.
>>>>>
>>>>> [2/2] just for a reference, I'm thinking about other ways to shrink it.
>>>>> e.g. ->link_list can be a single-linked list with linked tiemouts
>>>>> storing a back-reference. This can turn out to be better, because
>>>>> that would move ->fixed_file_refs to the 2nd cacheline, so we won't
>>>>> ever touch 3rd cacheline in the submission path.
>>>>> Any other ideas?
>>>>
>>>> Nothing noticeable for me, still about the same performance. But
>>>> generally speaking, I don't necessarily think we need to go all in on
>>>> making this as tiny as possible. It's much more important to chase the
>>>> items where we only use 2 cachelines for the hot path, and then we have
>>>> the extra space in there already for the semi hot paths like poll driven
>>>> retry. Yes, we're still allocating from a pool that has slightly larger
>>>> objects, but that doesn't really matter _that_ much. Avoiding an extra
>>>> kmalloc+kfree for the semi hot paths are a bigger deal than making
>>>> io_kiocb smaller and smaller.
>>>>
>>>> That said, for no-brainer changes, we absolutely should make it smaller.
>>>> I just don't want to jump through convoluted hoops to get there.
>>>
>>> Agree, but that's not the end goal. The first point is to kill the union,
>>> but it already has enough space for that.
>>
>> Right
>>
>>> The second is to see, whether we can use the space in a better way. From
>>> the high level perspective ->apoll and ->work are alike and both serve to
>>> provide asynchronous paths, hence the idea to swap them naturally comes to
>>> mind.
>>
>> Totally agree, which is why the union of those kind of makes sense.
>> We're definitely NOT using them at the same time, but the fact that we
>> had various mm/creds/whatnot in the work_struct made that a bit iffy.
> 
> Thinking of it, if combined with work de-init as you proposed before, it's
> probably possible to make a layout similar to the one below
> 
> struct io_kiocb {
> 	...
> 	struct hlist_node	hash_node;
> 	struct callback_head	task_work;	
> 	union {
> 		struct io_wq_work	work;
> 		struct async_poll	apoll;
> 	};
> };
> 
> Saves ->apoll kmalloc(), and the actual work de-init would be negligibly
> rare. Worth to try

And hopefully get rid of the stupid apoll->work and the copy back and
forth... But yes, this would be most excellent, and an ideal layout.

>>> TBH, I don't think it'd do much, because init of ->io would probably
>>> hide any benefit.
>>
>> There should be no ->io init/alloc for this test case.
> 
> I mean, before getting into io_arm_poll_handler(), it should get -EAGAIN
> in io_{read,write}() and initialise ->io in io_setup_async_rw(), at least
> for READV, WRITEV.

Sure, but for my testing, there's never an EAGAIN, so I won't be
exercising that path for the peak testing.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, back to index

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-25  8:31 [RFC 0/2] 3 cacheline io_kiocb Pavel Begunkov
2020-07-25  8:31 ` [PATCH 1/2] io_uring: allocate req->work dynamically Pavel Begunkov
2020-07-25  8:31 ` [PATCH 2/2] io_uring: unionise ->apoll and ->work Pavel Begunkov
2020-07-25 15:45 ` [RFC 0/2] 3 cacheline io_kiocb Jens Axboe
2020-07-25 18:24   ` Pavel Begunkov
2020-07-25 19:40     ` Jens Axboe
2020-07-25 20:14       ` Pavel Begunkov
2020-07-25 20:25         ` Jens Axboe

IO-Uring Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/io-uring/0 io-uring/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 io-uring io-uring/ https://lore.kernel.org/io-uring \
		io-uring@vger.kernel.org
	public-inbox-index io-uring

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.io-uring


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git