All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jens Axboe <axboe@kernel.dk>
To: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>,
	io-uring@vger.kernel.org
Cc: asml.silence@gmail.com
Subject: Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
Date: Thu, 3 Mar 2022 09:31:36 -0700	[thread overview]
Message-ID: <951ea55c-b6a3-59e4-1011-4f46fae547b3@kernel.dk> (raw)
In-Reply-To: <040e9262-4ebb-8505-5a14-6f399e40332c@kernel.dk>

On 3/3/22 7:40 AM, Jens Axboe wrote:
> On 3/3/22 7:36 AM, Jens Axboe wrote:
>> The only potential oddity here is that the fd passed back is not a
>> legitimate fd. io_uring does support poll(2) on its file descriptor, so
>> that could cause some confusion even if I don't think anyone actually
>> does poll(2) on io_uring.
> 
> Side note - the only implication here is that we then likely can't make
> the optimized behavior the default, it has to be an IORING_SETUP_REG
> flag which tells us that the application is aware of this limitation.
> Though I guess close(2) might mess with that too... Hmm.

Not sure I can find a good approach for that. Tried out your patch and
made some fixes:

- Missing free on final tctx free
- Rename registered_files to registered_rings
- Fix off-by-ones in checking max registration count
- Use kcalloc
- Rename ENTER_FIXED_FILE -> ENTER_REGISTERED_RING
- Don't pass in tctx to io_uring_unreg_ringfd()
- Get rid of forward declaration for adding tctx node
- Get rid of extra file pointer in io_uring_enter()
- Fix deadlock in io_ringfd_register()
- Use io_uring_rsrc_update rather than add a new struct type

Patch I ran below.

Ran some testing here, and on my laptop, running:

axboe@m1pro-kvm ~/g/fio (master)> t/io_uring -N1 -s1 -f0
polled=1, fixedbufs=1/0, register_files=1, buffered=0, QD=128
Engine=io_uring, sq_ring=128, cq_ring=128
submitter=0, tid=673
IOPS=6627K, IOS/call=1/1, inflight=()
IOPS=6995K, IOS/call=1/1, inflight=()
IOPS=6992K, IOS/call=1/1, inflight=()
IOPS=7005K, IOS/call=1/1, inflight=()
IOPS=6999K, IOS/call=1/1, inflight=()

and with registered ring

axboe@m1pro-kvm ~/g/fio (master)> t/io_uring -N1 -s1 -f1
polled=1, fixedbufs=1/0, register_files=1, buffered=0, QD=128
Engine=io_uring, sq_ring=128, cq_ring=128
submitter=0, tid=687
ring register 0
IOPS=7714K, IOS/call=1/1, inflight=()
IOPS=8030K, IOS/call=1/1, inflight=()
IOPS=8025K, IOS/call=1/1, inflight=()
IOPS=8015K, IOS/call=1/1, inflight=()
IOPS=8037K, IOS/call=1/1, inflight=()

which is about a 15% improvement, pretty massive...

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ad3e0b0ab3b9..8a1f97054b71 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -466,6 +466,8 @@ struct io_ring_ctx {
 	};
 };
 
+#define IO_RINGFD_REG_MAX 16
+
 struct io_uring_task {
 	/* submission side */
 	int			cached_refs;
@@ -481,6 +483,7 @@ struct io_uring_task {
 	struct io_wq_work_list	task_list;
 	struct io_wq_work_list	prior_task_list;
 	struct callback_head	task_work;
+	struct file		**registered_rings;
 	bool			task_running;
 };
 
@@ -8806,8 +8809,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
 	if (unlikely(!tctx))
 		return -ENOMEM;
 
+	tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
+					 sizeof(struct file *), GFP_KERNEL);
+	if (unlikely(!tctx->registered_rings)) {
+		kfree(tctx);
+		return -ENOMEM;
+	}
+
 	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
 	if (unlikely(ret)) {
+		kfree(tctx->registered_rings);
 		kfree(tctx);
 		return ret;
 	}
@@ -8816,6 +8827,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
 	if (IS_ERR(tctx->io_wq)) {
 		ret = PTR_ERR(tctx->io_wq);
 		percpu_counter_destroy(&tctx->inflight);
+		kfree(tctx->registered_rings);
 		kfree(tctx);
 		return ret;
 	}
@@ -8840,6 +8852,7 @@ void __io_uring_free(struct task_struct *tsk)
 	WARN_ON_ONCE(tctx->io_wq);
 	WARN_ON_ONCE(tctx->cached_refs);
 
+	kfree(tctx->registered_rings);
 	percpu_counter_destroy(&tctx->inflight);
 	kfree(tctx);
 	tsk->io_uring = NULL;
@@ -10061,6 +10074,68 @@ void __io_uring_cancel(bool cancel_all)
 	io_uring_cancel_generic(cancel_all, NULL);
 }
 
+void io_uring_unreg_ringfd(void)
+{
+	struct io_uring_task *tctx = current->io_uring;
+	int i;
+
+	for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
+		if (tctx->registered_rings[i]) {
+			fput(tctx->registered_rings[i]);
+			tctx->registered_rings[i] = NULL;
+		}
+	}
+}
+
+static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_rsrc_update reg;
+	struct io_uring_task *tctx;
+	struct file *file;
+	int ret;
+
+	if (copy_from_user(&reg, arg, sizeof(reg)))
+		return -EFAULT;
+	if (reg.offset >= IO_RINGFD_REG_MAX)
+		return -EINVAL;
+
+	mutex_unlock(&ctx->uring_lock);
+	ret = io_uring_add_tctx_node(ctx);
+	mutex_lock(&ctx->uring_lock);
+	if (unlikely(ret))
+		return ret;
+
+	tctx = current->io_uring;
+	if (tctx->registered_rings[reg.offset])
+		return -EBUSY;
+	file = fget(reg.data);
+	if (unlikely(!file))
+		return -EBADF;
+	tctx->registered_rings[reg.offset] = file;
+	return 0;
+}
+
+static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_task *tctx = current->io_uring;
+	__u32 offset;
+
+	if (!tctx)
+		return 0;
+
+	if (copy_from_user(&offset, arg, sizeof(__u32)))
+		return -EFAULT;
+	if (offset >= IO_RINGFD_REG_MAX)
+		return -EINVAL;
+
+	if (tctx->registered_rings[offset]) {
+		fput(tctx->registered_rings[offset]);
+		tctx->registered_rings[offset] = NULL;
+	}
+
+	return 0;
+}
+
 static void *io_uring_validate_mmap_request(struct file *file,
 					    loff_t pgoff, size_t sz)
 {
@@ -10191,12 +10266,23 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	io_run_task_work();
 
 	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
-			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
+			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
+			       IORING_ENTER_REGISTERED_RING)))
 		return -EINVAL;
 
-	f = fdget(fd);
-	if (unlikely(!f.file))
-		return -EBADF;
+	if (flags & IORING_ENTER_REGISTERED_RING) {
+		struct io_uring_task *tctx = current->io_uring;
+
+		if (fd >= IO_RINGFD_REG_MAX || !tctx)
+			return -EINVAL;
+		f.file = tctx->registered_rings[fd];
+		if (unlikely(!f.file))
+			return -EBADF;
+	} else {
+		f = fdget(fd);
+		if (unlikely(!f.file))
+			return -EBADF;
+	}
 
 	ret = -EOPNOTSUPP;
 	if (unlikely(f.file->f_op != &io_uring_fops))
@@ -10270,7 +10356,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 out:
 	percpu_ref_put(&ctx->refs);
 out_fput:
-	fdput(f);
+	if (!(flags & IORING_ENTER_REGISTERED_RING))
+		fdput(f);
 	return submitted ? submitted : ret;
 }
 
@@ -11160,6 +11247,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_iowq_max_workers(ctx, arg);
 		break;
+	case IORING_REGISTER_IORINGFD:
+		ret = -EINVAL;
+		if (nr_args != 1)
+			break;
+		ret = io_ringfd_register(ctx, arg);
+		break;
+	case IORING_UNREGISTER_IORINGFD:
+		ret = -EINVAL;
+		if (nr_args != 1)
+			break;
+		ret = io_ringfd_unregister(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 649a4d7c241b..1814e698d861 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -9,11 +9,14 @@
 struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_cancel(bool cancel_all);
 void __io_uring_free(struct task_struct *tsk);
+void io_uring_unreg_ringfd(void);
 
 static inline void io_uring_files_cancel(void)
 {
-	if (current->io_uring)
+	if (current->io_uring) {
+		io_uring_unreg_ringfd();
 		__io_uring_cancel(false);
+	}
 }
 static inline void io_uring_task_cancel(void)
 {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 787f491f0d2a..a36d31f2cc66 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -257,10 +257,11 @@ struct io_cqring_offsets {
 /*
  * io_uring_enter(2) flags
  */
-#define IORING_ENTER_GETEVENTS	(1U << 0)
-#define IORING_ENTER_SQ_WAKEUP	(1U << 1)
-#define IORING_ENTER_SQ_WAIT	(1U << 2)
-#define IORING_ENTER_EXT_ARG	(1U << 3)
+#define IORING_ENTER_GETEVENTS		(1U << 0)
+#define IORING_ENTER_SQ_WAKEUP		(1U << 1)
+#define IORING_ENTER_SQ_WAIT		(1U << 2)
+#define IORING_ENTER_EXT_ARG		(1U << 3)
+#define IORING_ENTER_REGISTERED_RING	(1U << 4)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -325,6 +326,10 @@ enum {
 	/* set/get max number of io-wq workers */
 	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,
 
+	/* register/unregister io_uring fd with the ring */
+	IORING_REGISTER_IORINGFD		= 20,
+	IORING_UNREGISTER_IORINGFD		= 21,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };

-- 
Jens Axboe


  reply	other threads:[~2022-03-03 16:31 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-03-03  5:28 [PATCH] io_uring: add io_uring_enter(2) fixed file support Xiaoguang Wang
2022-03-03  8:56 ` Hao Xu
2022-03-03 13:38 ` Jens Axboe
2022-03-03 14:36   ` Jens Axboe
2022-03-03 14:40     ` Jens Axboe
2022-03-03 16:31       ` Jens Axboe [this message]
2022-03-03 17:18         ` Jens Axboe
2022-03-03 20:41           ` Jens Axboe
2022-03-03 21:19             ` Jens Axboe
2022-03-04  0:07               ` Jens Axboe
2022-03-04 13:39                 ` Xiaoguang Wang
2022-03-04 13:44                   ` Jens Axboe
2022-03-04 15:16                     ` Xiaoguang Wang
2022-03-04 15:22                       ` Jens Axboe
2022-03-08  8:38                         ` Xiaoguang Wang
2022-03-08 13:10                           ` Jens Axboe
2022-03-03 22:24             ` Vito Caputo
2022-03-03 22:26               ` Jens Axboe
2022-03-04  1:49         ` Pavel Begunkov
2022-03-04  2:18           ` Jens Axboe
2022-03-04  2:28             ` Pavel Begunkov
2022-03-04  2:35               ` Pavel Begunkov
2022-03-04  2:43               ` Jens Axboe
2022-03-04  1:52         ` Pavel Begunkov
2022-03-04  2:19           ` Jens Axboe
2022-03-04  2:39             ` Pavel Begunkov
2022-03-04  3:03               ` Jens Axboe
2022-04-21 14:16     ` Hao Xu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=951ea55c-b6a3-59e4-1011-4f46fae547b3@kernel.dk \
    --to=axboe@kernel.dk \
    --cc=asml.silence@gmail.com \
    --cc=io-uring@vger.kernel.org \
    --cc=xiaoguang.wang@linux.alibaba.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.