* [PATCH] io_uring: add io_uring_enter(2) fixed file support
@ 2022-03-03  5:28 Xiaoguang Wang
  2022-03-03  8:56 ` Hao Xu
  2022-03-03 13:38 ` Jens Axboe
  0 siblings, 2 replies; 28+ messages in thread
From: Xiaoguang Wang @ 2022-03-03  5:28 UTC
  To: io-uring; +Cc: axboe, asml.silence

IORING_REGISTER_FILES is a good feature for reducing fget/fput overhead
on each IO we issue against a file, but one source remains, namely
io_uring_enter(2), which still does an fget/fput on the io_uring fd. I
have observed this overhead in some of our internal coroutine
implementations based on io_uring with a low submit batch size. To
remove fget/fput overhead from io_uring entirely, we can add a small
struct file cache to io_uring_task and a new IORING_ENTER_FIXED_FILE
flag. The capacity of this file cache is currently 16, which I think
should be enough; also note that this cache is per-thread.

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
---
 fs/io_uring.c                 | 127 +++++++++++++++++++++++++++++++++++++-----
 include/linux/io_uring.h      |   5 +-
 include/uapi/linux/io_uring.h |   9 +++
 3 files changed, 126 insertions(+), 15 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 77b9c7e4793b..e1d4b537cb60 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -461,6 +461,8 @@ struct io_ring_ctx {
 	};
 };
 
+#define IO_RINGFD_REG_MAX 16
+
 struct io_uring_task {
 	/* submission side */
 	int			cached_refs;
@@ -476,6 +478,7 @@ struct io_uring_task {
 	struct io_wq_work_list	task_list;
 	struct io_wq_work_list	prior_task_list;
 	struct callback_head	task_work;
+	struct file		**registered_files;
 	bool			task_running;
 };
 
@@ -8739,8 +8742,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
 	if (unlikely(!tctx))
 		return -ENOMEM;
 
+	tctx->registered_files = kzalloc(sizeof(struct file *) * IO_RINGFD_REG_MAX,
+					 GFP_KERNEL);
+	if (unlikely(!tctx->registered_files)) {
+		kfree(tctx);
+		return -ENOMEM;
+	}
+
 	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
 	if (unlikely(ret)) {
+		kfree(tctx->registered_files);
 		kfree(tctx);
 		return ret;
 	}
@@ -8749,6 +8760,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
 	if (IS_ERR(tctx->io_wq)) {
 		ret = PTR_ERR(tctx->io_wq);
 		percpu_counter_destroy(&tctx->inflight);
+		kfree(tctx->registered_files);
 		kfree(tctx);
 		return ret;
 	}
@@ -9382,6 +9394,56 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
 	return -ENXIO;
 }
 
+static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked);
+
+static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_fd_reg reg;
+	struct io_uring_task *tctx;
+	struct file *file;
+	int ret;
+
+	if (copy_from_user(&reg, arg, sizeof(struct io_uring_fd_reg)))
+		return -EFAULT;
+	if (reg.offset > IO_RINGFD_REG_MAX)
+		return -EINVAL;
+
+	ret = io_uring_add_tctx_node(ctx, true);
+	if (unlikely(ret))
+		return ret;
+
+	tctx = current->io_uring;
+	if (tctx->registered_files[reg.offset])
+		return -EBUSY;
+	file = fget(reg.fd);
+	if (unlikely(!file))
+		return -EBADF;
+	tctx->registered_files[reg.offset] = file;
+	return 0;
+}
+
+static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_task *tctx;
+	__u32 offset;
+
+	if (!current->io_uring)
+		return 0;
+
+	if (copy_from_user(&offset, arg, sizeof(__u32)))
+		return -EFAULT;
+	if (offset > IO_RINGFD_REG_MAX)
+		return -EINVAL;
+
+	tctx = current->io_uring;
+	if (tctx->registered_files[offset]) {
+		fput(tctx->registered_files[offset]);
+		tctx->registered_files[offset] = NULL;
+	}
+
+	return 0;
+}
+
 static void io_destroy_buffers(struct io_ring_ctx *ctx)
 {
 	struct io_buffer *buf;
@@ -9790,7 +9852,7 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 	}
 }
 
-static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
+static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked)
 {
 	struct io_uring_task *tctx = current->io_uring;
 	struct io_tctx_node *node;
@@ -9825,9 +9887,11 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
 			return ret;
 		}
 
-		mutex_lock(&ctx->uring_lock);
+		if (!locked)
+			mutex_lock(&ctx->uring_lock);
 		list_add(&node->ctx_node, &ctx->tctx_list);
-		mutex_unlock(&ctx->uring_lock);
+		if (!locked)
+			mutex_unlock(&ctx->uring_lock);
 	}
 	tctx->last = ctx;
 	return 0;
@@ -9836,13 +9900,13 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
 /*
  * Note that this task has used io_uring. We use it for cancelation purposes.
  */
-static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
+static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked)
 {
 	struct io_uring_task *tctx = current->io_uring;
 
 	if (likely(tctx && tctx->last == ctx))
 		return 0;
-	return __io_uring_add_tctx_node(ctx);
+	return __io_uring_add_tctx_node(ctx, locked);
 }
 
 /*
@@ -9973,6 +10037,16 @@ void __io_uring_cancel(bool cancel_all)
 	io_uring_cancel_generic(cancel_all, NULL);
 }
 
+void io_uring_unreg_ringfd(struct io_uring_task *tctx)
+{
+	int i;
+
+	for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
+		if (tctx->registered_files[i])
+			fput(tctx->registered_files[i]);
+	}
+}
+
 static void *io_uring_validate_mmap_request(struct file *file,
 					    loff_t pgoff, size_t sz)
 {
@@ -10098,24 +10172,36 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	struct io_ring_ctx *ctx;
 	int submitted = 0;
 	struct fd f;
+	struct file *file;
 	long ret;
 
 	io_run_task_work();
 
 	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
-			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
+			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
+			       IORING_ENTER_FIXED_FILE)))
 		return -EINVAL;
 
-	f = fdget(fd);
-	if (unlikely(!f.file))
-		return -EBADF;
+	if (flags & IORING_ENTER_FIXED_FILE) {
+		if (fd > IO_RINGFD_REG_MAX || !current->io_uring)
+			return -EINVAL;
+
+		file = current->io_uring->registered_files[fd];
+		if (unlikely(!file))
+			return -EBADF;
+	} else {
+		f = fdget(fd);
+		if (unlikely(!f.file))
+			return -EBADF;
+		file = f.file;
+	}
 
 	ret = -EOPNOTSUPP;
-	if (unlikely(f.file->f_op != &io_uring_fops))
+	if (unlikely(file->f_op != &io_uring_fops))
 		goto out_fput;
 
 	ret = -ENXIO;
-	ctx = f.file->private_data;
+	ctx = file->private_data;
 	if (unlikely(!percpu_ref_tryget(&ctx->refs)))
 		goto out_fput;
 
@@ -10145,7 +10231,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		}
 		submitted = to_submit;
 	} else if (to_submit) {
-		ret = io_uring_add_tctx_node(ctx);
+		ret = io_uring_add_tctx_node(ctx, false);
 		if (unlikely(ret))
 			goto out;
 		mutex_lock(&ctx->uring_lock);
@@ -10182,7 +10268,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 out:
 	percpu_ref_put(&ctx->refs);
 out_fput:
-	fdput(f);
+	if (!(flags & IORING_ENTER_FIXED_FILE))
+		fdput(f);
 	return submitted ? submitted : ret;
 }
 
@@ -10413,7 +10500,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
 	if (fd < 0)
 		return fd;
 
-	ret = io_uring_add_tctx_node(ctx);
+	ret = io_uring_add_tctx_node(ctx, false);
 	if (ret) {
 		put_unused_fd(fd);
 		return ret;
@@ -11134,6 +11221,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_iowq_max_workers(ctx, arg);
 		break;
+	case IORING_REGISTER_IORINGFD:
+		ret = -EINVAL;
+		if (nr_args != 1)
+			break;
+		ret = io_ringfd_register(ctx, arg);
+		break;
+	case IORING_UNREGISTER_IORINGFD:
+		ret = -EINVAL;
+		if (nr_args != 1)
+			break;
+		ret = io_ringfd_unregister(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 649a4d7c241b..5ddea83912c7 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -9,11 +9,14 @@
 struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_cancel(bool cancel_all);
 void __io_uring_free(struct task_struct *tsk);
+void io_uring_unreg_ringfd(struct io_uring_task *tctx);
 
 static inline void io_uring_files_cancel(void)
 {
-	if (current->io_uring)
+	if (current->io_uring) {
+		io_uring_unreg_ringfd(current->io_uring);
 		__io_uring_cancel(false);
+	}
 }
 static inline void io_uring_task_cancel(void)
 {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 787f491f0d2a..f076a203317a 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -261,6 +261,7 @@ struct io_cqring_offsets {
 #define IORING_ENTER_SQ_WAKEUP	(1U << 1)
 #define IORING_ENTER_SQ_WAIT	(1U << 2)
 #define IORING_ENTER_EXT_ARG	(1U << 3)
+#define IORING_ENTER_FIXED_FILE	(1U << 4)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -325,6 +326,9 @@ enum {
 	/* set/get max number of io-wq workers */
 	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,
 
+	IORING_REGISTER_IORINGFD		= 20,
+	IORING_UNREGISTER_IORINGFD		= 21,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
@@ -422,4 +426,9 @@ struct io_uring_getevents_arg {
 	__u64	ts;
 };
 
+struct io_uring_fd_reg {
+	__u32	offset;
+	__s32	fd;
+};
+
 #endif
-- 
2.14.4.44.g2045bb6
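
For context, a minimal userspace sketch of how the interface proposed
above might be exercised. It assumes the uapi additions from this patch
and uses raw syscalls, since liburing has no wrappers for these new
opcodes; error handling is elided:

	#include <linux/io_uring.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* ring_fd is the real fd returned by io_uring_setup(2) */
	static int register_ring_fd(int ring_fd)
	{
		struct io_uring_fd_reg reg = {
			.offset	= 0,	/* slot in the per-thread cache */
			.fd	= ring_fd,
		};

		/* nr_args must be 1, per __io_uring_register() above */
		return syscall(__NR_io_uring_register, ring_fd,
			       IORING_REGISTER_IORINGFD, &reg, 1);
	}

	static int submit(unsigned int to_submit)
	{
		/* pass the registered offset in place of the real fd */
		return syscall(__NR_io_uring_enter, 0 /* reg.offset */,
			       to_submit, 0, IORING_ENTER_FIXED_FILE,
			       NULL, 0);
	}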



* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-03  5:28 [PATCH] io_uring: add io_uring_enter(2) fixed file support Xiaoguang Wang
@ 2022-03-03  8:56 ` Hao Xu
  2022-03-03 13:38 ` Jens Axboe
  1 sibling, 0 replies; 28+ messages in thread
From: Hao Xu @ 2022-03-03  8:56 UTC
  To: Xiaoguang Wang, io-uring; +Cc: axboe, asml.silence


On 3/3/22 13:28, Xiaoguang Wang wrote:
> IORING_REGISTER_FILES is a good feature for reducing fget/fput overhead
> on each IO we issue against a file, but one source remains, namely
> io_uring_enter(2), which still does an fget/fput on the io_uring fd. I
> have observed this overhead in some of our internal coroutine
> implementations based on io_uring with a low submit batch size. To
> remove fget/fput overhead from io_uring entirely, we can add a small
> struct file cache to io_uring_task and a new IORING_ENTER_FIXED_FILE
> flag. The capacity of this file cache is currently 16, which I think
> should be enough; also note that this cache is per-thread.
>
> Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
> ---
>   fs/io_uring.c                 | 127 +++++++++++++++++++++++++++++++++++++-----
>   include/linux/io_uring.h      |   5 +-
>   include/uapi/linux/io_uring.h |   9 +++
>   3 files changed, 126 insertions(+), 15 deletions(-)
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 77b9c7e4793b..e1d4b537cb60 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -461,6 +461,8 @@ struct io_ring_ctx {
>   	};
>   };
>   
> +#define IO_RINGFD_REG_MAX 16
> +
>   struct io_uring_task {
>   	/* submission side */
>   	int			cached_refs;
> @@ -476,6 +478,7 @@ struct io_uring_task {
>   	struct io_wq_work_list	task_list;
>   	struct io_wq_work_list	prior_task_list;
>   	struct callback_head	task_work;
> +	struct file		**registered_files;
>   	bool			task_running;
>   };
>   
> @@ -8739,8 +8742,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
>   	if (unlikely(!tctx))
>   		return -ENOMEM;
>   
> +	tctx->registered_files = kzalloc(sizeof(struct file *) * IO_RINGFD_REG_MAX,
> +					 GFP_KERNEL);
> +	if (unlikely(!tctx->registered_files)) {
> +		kfree(tctx);
> +		return -ENOMEM;
> +	}
> +
>   	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
>   	if (unlikely(ret)) {
> +		kfree(tctx->registered_files);
>   		kfree(tctx);
>   		return ret;
>   	}
> @@ -8749,6 +8760,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
>   	if (IS_ERR(tctx->io_wq)) {
>   		ret = PTR_ERR(tctx->io_wq);
>   		percpu_counter_destroy(&tctx->inflight);
> +		kfree(tctx->registered_files);
>   		kfree(tctx);
>   		return ret;
>   	}
> @@ -9382,6 +9394,56 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
>   	return -ENXIO;
>   }
>   
> +static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked);
> +
> +static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *arg)
> +{
> +	struct io_uring_fd_reg reg;
> +	struct io_uring_task *tctx;
> +	struct file *file;
> +	int ret;
> +
> +	if (copy_from_user(&reg, arg, sizeof(struct io_uring_fd_reg)))
> +		return -EFAULT;
> +	if (reg.offset > IO_RINGFD_REG_MAX)
should be >=
> +		return -EINVAL;
> +
> +	ret = io_uring_add_tctx_node(ctx, true);
> +	if (unlikely(ret))
> +		return ret;
> +
> +	tctx = current->io_uring;
> +	if (tctx->registered_files[reg.offset])
> +		return -EBUSY;
> +	file = fget(reg.fd);
> +	if (unlikely(!file))
> +		return -EBADF;
> +	tctx->registered_files[reg.offset] = file;
> +	return 0;
> +}
> +
> +static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *arg)
> +{
> +	struct io_uring_task *tctx;
> +	__u32 offset;
> +
> +	if (!current->io_uring)
> +		return 0;
> +
> +	if (copy_from_user(&offset, arg, sizeof(__u32)))
> +		return -EFAULT;
> +	if (offset > IO_RINGFD_REG_MAX)
ditto
> +		return -EINVAL;
> +
> +	tctx = current->io_uring;
> +	if (tctx->registered_files[offset]) {
> +		fput(tctx->registered_files[offset]);
> +		tctx->registered_files[offset] = NULL;
> +	}
> +
> +	return 0;
> +}
> +
>   static void io_destroy_buffers(struct io_ring_ctx *ctx)
>   {
>   	struct io_buffer *buf;
> @@ -9790,7 +9852,7 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
>   	}
>   }
>   
> -static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
> +static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked)
>   {
>   	struct io_uring_task *tctx = current->io_uring;
>   	struct io_tctx_node *node;
> @@ -9825,9 +9887,11 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
>   			return ret;
>   		}
>   
> -		mutex_lock(&ctx->uring_lock);
> +		if (!locked)
> +			mutex_lock(&ctx->uring_lock);
>   		list_add(&node->ctx_node, &ctx->tctx_list);
> -		mutex_unlock(&ctx->uring_lock);
> +		if (!locked)
> +			mutex_unlock(&ctx->uring_lock);
>   	}
>   	tctx->last = ctx;
>   	return 0;
> @@ -9836,13 +9900,13 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
>   /*
>    * Note that this task has used io_uring. We use it for cancelation purposes.
>    */
> -static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
> +static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked)
>   {
>   	struct io_uring_task *tctx = current->io_uring;
>   
>   	if (likely(tctx && tctx->last == ctx))
>   		return 0;
> -	return __io_uring_add_tctx_node(ctx);
> +	return __io_uring_add_tctx_node(ctx, locked);
>   }
>   
>   /*
> @@ -9973,6 +10037,16 @@ void __io_uring_cancel(bool cancel_all)
>   	io_uring_cancel_generic(cancel_all, NULL);
>   }
>   
> +void io_uring_unreg_ringfd(struct io_uring_task *tctx)
> +{
> +	int i;
> +
> +	for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
> +		if (tctx->registered_files[i])
> +			fput(tctx->registered_files[i]);
> +	}
> +}
> +
>   static void *io_uring_validate_mmap_request(struct file *file,
>   					    loff_t pgoff, size_t sz)
>   {
> @@ -10098,24 +10172,36 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
>   	struct io_ring_ctx *ctx;
>   	int submitted = 0;
>   	struct fd f;
> +	struct file *file;
>   	long ret;
>   
>   	io_run_task_work();
>   
>   	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
> -			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
> +			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
> +			       IORING_ENTER_FIXED_FILE)))
>   		return -EINVAL;
>   
> -	f = fdget(fd);
> -	if (unlikely(!f.file))
> -		return -EBADF;
> +	if (flags & IORING_ENTER_FIXED_FILE) {
> +		if (fd > IO_RINGFD_REG_MAX || !current->io_uring)
ditto
> +			return -EINVAL;
> +
> +		file = current->io_uring->registered_files[fd];
> +		if (unlikely(!file))
> +			return -EBADF;
> +	} else {
> +		f = fdget(fd);
> +		if (unlikely(!f.file))
> +			return -EBADF;
> +		file = f.file;
> +	}
>   
>   	ret = -EOPNOTSUPP;
> -	if (unlikely(f.file->f_op != &io_uring_fops))
> +	if (unlikely(file->f_op != &io_uring_fops))
>   		goto out_fput;
>   
>   	ret = -ENXIO;
> -	ctx = f.file->private_data;
> +	ctx = file->private_data;
>   	if (unlikely(!percpu_ref_tryget(&ctx->refs)))
>   		goto out_fput;
>   
> @@ -10145,7 +10231,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
>   		}
>   		submitted = to_submit;
>   	} else if (to_submit) {
> -		ret = io_uring_add_tctx_node(ctx);
> +		ret = io_uring_add_tctx_node(ctx, false);
>   		if (unlikely(ret))
>   			goto out;
>   		mutex_lock(&ctx->uring_lock);
> @@ -10182,7 +10268,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
>   out:
>   	percpu_ref_put(&ctx->refs);
>   out_fput:
> -	fdput(f);
> +	if (!(flags & IORING_ENTER_FIXED_FILE))
> +		fdput(f);
>   	return submitted ? submitted : ret;
>   }
>   
> @@ -10413,7 +10500,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
>   	if (fd < 0)
>   		return fd;
>   
> -	ret = io_uring_add_tctx_node(ctx);
> +	ret = io_uring_add_tctx_node(ctx, false);
>   	if (ret) {
>   		put_unused_fd(fd);
>   		return ret;
> @@ -11134,6 +11221,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
>   			break;
>   		ret = io_register_iowq_max_workers(ctx, arg);
>   		break;
> +	case IORING_REGISTER_IORINGFD:
> +		ret = -EINVAL;
> +		if (nr_args != 1)
> +			break;
> +		ret = io_ringfd_register(ctx, arg);
> +		break;
> +	case IORING_UNREGISTER_IORINGFD:
> +		ret = -EINVAL;
> +		if (nr_args != 1)
> +			break;
> +		ret = io_ringfd_unregister(ctx, arg);
> +		break;
>   	default:
>   		ret = -EINVAL;
>   		break;
> diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
> index 649a4d7c241b..5ddea83912c7 100644
> --- a/include/linux/io_uring.h
> +++ b/include/linux/io_uring.h
> @@ -9,11 +9,14 @@
>   struct sock *io_uring_get_socket(struct file *file);
>   void __io_uring_cancel(bool cancel_all);
>   void __io_uring_free(struct task_struct *tsk);
> +void io_uring_unreg_ringfd(struct io_uring_task *tctx);
>   
>   static inline void io_uring_files_cancel(void)
>   {
> -	if (current->io_uring)
> +	if (current->io_uring) {
> +		io_uring_unreg_ringfd(current->io_uring);
>   		__io_uring_cancel(false);
> +	}
>   }
>   static inline void io_uring_task_cancel(void)
>   {
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 787f491f0d2a..f076a203317a 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -261,6 +261,7 @@ struct io_cqring_offsets {
>   #define IORING_ENTER_SQ_WAKEUP	(1U << 1)
>   #define IORING_ENTER_SQ_WAIT	(1U << 2)
>   #define IORING_ENTER_EXT_ARG	(1U << 3)
> +#define IORING_ENTER_FIXED_FILE	(1U << 4)
>   
>   /*
>    * Passed in for io_uring_setup(2). Copied back with updated info on success
> @@ -325,6 +326,9 @@ enum {
>   	/* set/get max number of io-wq workers */
>   	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,
>   
> +	IORING_REGISTER_IORINGFD		= 20,
> +	IORING_UNREGISTER_IORINGFD		= 21,
> +
>   	/* this goes last */
>   	IORING_REGISTER_LAST
>   };
> @@ -422,4 +426,9 @@ struct io_uring_getevents_arg {
>   	__u64	ts;
>   };
>   
> +struct io_uring_fd_reg {
> +	__u32	offset;
> +	__s32	fd;
> +};
> +
>   #endif


* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-03  5:28 [PATCH] io_uring: add io_uring_enter(2) fixed file support Xiaoguang Wang
  2022-03-03  8:56 ` Hao Xu
@ 2022-03-03 13:38 ` Jens Axboe
  2022-03-03 14:36   ` Jens Axboe
  1 sibling, 1 reply; 28+ messages in thread
From: Jens Axboe @ 2022-03-03 13:38 UTC
  To: Xiaoguang Wang, io-uring; +Cc: asml.silence

On 3/2/22 10:28 PM, Xiaoguang Wang wrote:
> IORING_REGISTER_FILES is a good feature for reducing fget/fput overhead
> on each IO we issue against a file, but one source remains, namely
> io_uring_enter(2), which still does an fget/fput on the io_uring fd. I
> have observed this overhead in some of our internal coroutine
> implementations based on io_uring with a low submit batch size. To
> remove fget/fput overhead from io_uring entirely, we can add a small
> struct file cache to io_uring_task and a new IORING_ENTER_FIXED_FILE
> flag. The capacity of this file cache is currently 16, which I think
> should be enough; also note that this cache is per-thread.

Would indeed be nice to get rid of, can be a substantial amount of time
wasted in fdget/fdput. Does this resolve dependencies correctly if
someone passes the ring fd? Adding ring registration to test/ring-leak.c
from the liburing repo would be a useful exercise.

Comments below.

> @@ -8739,8 +8742,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
>  	if (unlikely(!tctx))
>  		return -ENOMEM;
>  
> +	tctx->registered_files = kzalloc(sizeof(struct file *) * IO_RINGFD_REG_MAX,
> +					 GFP_KERNEL);

kcalloc()

> +static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool locked);
> +
> +static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *arg)
> +{
> +	struct io_uring_fd_reg reg;
> +	struct io_uring_task *tctx;
> +	struct file *file;
> +	int ret;
> +
> +	if (copy_from_user(&reg, arg, sizeof(struct io_uring_fd_reg)))
> +		return -EFAULT;
> +	if (reg.offset > IO_RINGFD_REG_MAX)
> +		return -EINVAL;
> +
> +	ret = io_uring_add_tctx_node(ctx, true);
> +	if (unlikely(ret))
> +		return ret;

Can we safely drop ctx->uring_lock around this call instead? The locked
argument is always kind of ugly, and as far as I can tell you currently
have a deadlock: io_uring_alloc_task_context() is now reached with
ctx->uring_lock held, and it then calls io_init_wq_offload(), which will
grab it again.
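
For illustration, the suggested shape, which the revised patch later in
this thread ends up using, is to drop the lock around the tctx setup
rather than thread a 'locked' flag down the call chain:

	mutex_unlock(&ctx->uring_lock);
	ret = io_uring_add_tctx_node(ctx);
	mutex_lock(&ctx->uring_lock);
	if (ret)
		return ret;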

Why not just use io_uring_rsrc_update here rather than add a new type?

-- 
Jens Axboe



* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-03 13:38 ` Jens Axboe
@ 2022-03-03 14:36   ` Jens Axboe
  2022-03-03 14:40     ` Jens Axboe
  2022-04-21 14:16     ` Hao Xu
  0 siblings, 2 replies; 28+ messages in thread
From: Jens Axboe @ 2022-03-03 14:36 UTC
  To: Xiaoguang Wang, io-uring; +Cc: asml.silence

On 3/3/22 6:38 AM, Jens Axboe wrote:
> On 3/2/22 10:28 PM, Xiaoguang Wang wrote:
>> IORING_REGISTER_FILES is a good feature for reducing fget/fput overhead
>> on each IO we issue against a file, but one source remains, namely
>> io_uring_enter(2), which still does an fget/fput on the io_uring fd. I
>> have observed this overhead in some of our internal coroutine
>> implementations based on io_uring with a low submit batch size. To
>> remove fget/fput overhead from io_uring entirely, we can add a small
>> struct file cache to io_uring_task and a new IORING_ENTER_FIXED_FILE
>> flag. The capacity of this file cache is currently 16, which I think
>> should be enough; also note that this cache is per-thread.
> 
> Would indeed be nice to get rid of, can be a substantial amount of time
> wasted in fdget/fdput. Does this resolve dependencies correctly if
> someone passes the ring fd? Adding ring registration to test/ring-leak.c
> from the liburing repo would be a useful exercise.

Seems to pass that fine, but on first read-through I did miss that you
add that hook to files_cancel(), which should break that dependency.

Since I think this is a potentially big win for certain workloads, maybe
we should consider making this easier to use? I don't think we
necessarily need to tie this to the regular file registration. What if
we instead added a SETUP flag for this, and just return the internal
offset for that case? Then we don't need an enter flag, we don't need to
add register/unregister opcodes for it.

This does pose a problem when we fill the array. We can easily go beyond
16 here, that's just an arbitrary limit, but at some point we do have to
handle the case where SETUP_REGISTERED (or whatever we call it) can't
get a slot. I think we just clear the flag and set up the fd normally in
that case. The user doesn't need to know; all the application needs to
care about is that it can use the passed-back 'fd' to call the other
io_uring functions.

The only potential oddity here is that the fd passed back is not a
legitimate fd. io_uring does support poll(2) on its file descriptor, so
that could cause some confusion even if I don't think anyone actually
does poll(2) on io_uring.

What do you think?

-- 
Jens Axboe



* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-03 14:36   ` Jens Axboe
@ 2022-03-03 14:40     ` Jens Axboe
  2022-03-03 16:31       ` Jens Axboe
  2022-04-21 14:16     ` Hao Xu
  1 sibling, 1 reply; 28+ messages in thread
From: Jens Axboe @ 2022-03-03 14:40 UTC
  To: Xiaoguang Wang, io-uring; +Cc: asml.silence

On 3/3/22 7:36 AM, Jens Axboe wrote:
> The only potential oddity here is that the fd passed back is not a
> legitimate fd. io_uring does support poll(2) on its file descriptor, so
> that could cause some confusion even if I don't think anyone actually
> does poll(2) on io_uring.

Side note - the only implication here is that we then likely can't make
the optimized behavior the default, it has to be an IORING_SETUP_REG
flag which tells us that the application is aware of this limitation.
Though I guess close(2) might mess with that too... Hmm.

-- 
Jens Axboe



* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-03 14:40     ` Jens Axboe
@ 2022-03-03 16:31       ` Jens Axboe
  2022-03-03 17:18         ` Jens Axboe
                           ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: Jens Axboe @ 2022-03-03 16:31 UTC
  To: Xiaoguang Wang, io-uring; +Cc: asml.silence

On 3/3/22 7:40 AM, Jens Axboe wrote:
> On 3/3/22 7:36 AM, Jens Axboe wrote:
>> The only potential oddity here is that the fd passed back is not a
>> legitimate fd. io_uring does support poll(2) on its file descriptor, so
>> that could cause some confusion even if I don't think anyone actually
>> does poll(2) on io_uring.
> 
> Side note - the only implication here is that we then likely can't make
> the optimized behavior the default, it has to be an IORING_SETUP_REG
> flag which tells us that the application is aware of this limitation.
> Though I guess close(2) might mess with that too... Hmm.

Not sure I can find a good approach for that. Tried out your patch and
made some fixes:

- Missing free on final tctx free
- Rename registered_files to registered_rings
- Fix off-by-ones in checking max registration count
- Use kcalloc
- Rename ENTER_FIXED_FILE -> ENTER_REGISTERED_RING
- Don't pass in tctx to io_uring_unreg_ringfd()
- Get rid of forward declaration for adding tctx node
- Get rid of extra file pointer in io_uring_enter()
- Fix deadlock in io_ringfd_register()
- Use io_uring_rsrc_update rather than add a new struct type

Patch I ran below.

Ran some testing here, and on my laptop, running:

axboe@m1pro-kvm ~/g/fio (master)> t/io_uring -N1 -s1 -f0
polled=1, fixedbufs=1/0, register_files=1, buffered=0, QD=128
Engine=io_uring, sq_ring=128, cq_ring=128
submitter=0, tid=673
IOPS=6627K, IOS/call=1/1, inflight=()
IOPS=6995K, IOS/call=1/1, inflight=()
IOPS=6992K, IOS/call=1/1, inflight=()
IOPS=7005K, IOS/call=1/1, inflight=()
IOPS=6999K, IOS/call=1/1, inflight=()

and with registered ring

axboe@m1pro-kvm ~/g/fio (master)> t/io_uring -N1 -s1 -f1
polled=1, fixedbufs=1/0, register_files=1, buffered=0, QD=128
Engine=io_uring, sq_ring=128, cq_ring=128
submitter=0, tid=687
ring register 0
IOPS=7714K, IOS/call=1/1, inflight=()
IOPS=8030K, IOS/call=1/1, inflight=()
IOPS=8025K, IOS/call=1/1, inflight=()
IOPS=8015K, IOS/call=1/1, inflight=()
IOPS=8037K, IOS/call=1/1, inflight=()

which is about a 15% improvement, pretty massive...

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ad3e0b0ab3b9..8a1f97054b71 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -466,6 +466,8 @@ struct io_ring_ctx {
 	};
 };
 
+#define IO_RINGFD_REG_MAX 16
+
 struct io_uring_task {
 	/* submission side */
 	int			cached_refs;
@@ -481,6 +483,7 @@ struct io_uring_task {
 	struct io_wq_work_list	task_list;
 	struct io_wq_work_list	prior_task_list;
 	struct callback_head	task_work;
+	struct file		**registered_rings;
 	bool			task_running;
 };
 
@@ -8806,8 +8809,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
 	if (unlikely(!tctx))
 		return -ENOMEM;
 
+	tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
+					 sizeof(struct file *), GFP_KERNEL);
+	if (unlikely(!tctx->registered_rings)) {
+		kfree(tctx);
+		return -ENOMEM;
+	}
+
 	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
 	if (unlikely(ret)) {
+		kfree(tctx->registered_rings);
 		kfree(tctx);
 		return ret;
 	}
@@ -8816,6 +8827,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
 	if (IS_ERR(tctx->io_wq)) {
 		ret = PTR_ERR(tctx->io_wq);
 		percpu_counter_destroy(&tctx->inflight);
+		kfree(tctx->registered_rings);
 		kfree(tctx);
 		return ret;
 	}
@@ -8840,6 +8852,7 @@ void __io_uring_free(struct task_struct *tsk)
 	WARN_ON_ONCE(tctx->io_wq);
 	WARN_ON_ONCE(tctx->cached_refs);
 
+	kfree(tctx->registered_rings);
 	percpu_counter_destroy(&tctx->inflight);
 	kfree(tctx);
 	tsk->io_uring = NULL;
@@ -10061,6 +10074,68 @@ void __io_uring_cancel(bool cancel_all)
 	io_uring_cancel_generic(cancel_all, NULL);
 }
 
+void io_uring_unreg_ringfd(void)
+{
+	struct io_uring_task *tctx = current->io_uring;
+	int i;
+
+	for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
+		if (tctx->registered_rings[i]) {
+			fput(tctx->registered_rings[i]);
+			tctx->registered_rings[i] = NULL;
+		}
+	}
+}
+
+static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_rsrc_update reg;
+	struct io_uring_task *tctx;
+	struct file *file;
+	int ret;
+
+	if (copy_from_user(&reg, arg, sizeof(reg)))
+		return -EFAULT;
+	if (reg.offset >= IO_RINGFD_REG_MAX)
+		return -EINVAL;
+
+	mutex_unlock(&ctx->uring_lock);
+	ret = io_uring_add_tctx_node(ctx);
+	mutex_lock(&ctx->uring_lock);
+	if (unlikely(ret))
+		return ret;
+
+	tctx = current->io_uring;
+	if (tctx->registered_rings[reg.offset])
+		return -EBUSY;
+	file = fget(reg.data);
+	if (unlikely(!file))
+		return -EBADF;
+	tctx->registered_rings[reg.offset] = file;
+	return 0;
+}
+
+static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_task *tctx = current->io_uring;
+	__u32 offset;
+
+	if (!tctx)
+		return 0;
+
+	if (copy_from_user(&offset, arg, sizeof(__u32)))
+		return -EFAULT;
+	if (offset >= IO_RINGFD_REG_MAX)
+		return -EINVAL;
+
+	if (tctx->registered_rings[offset]) {
+		fput(tctx->registered_rings[offset]);
+		tctx->registered_rings[offset] = NULL;
+	}
+
+	return 0;
+}
+
 static void *io_uring_validate_mmap_request(struct file *file,
 					    loff_t pgoff, size_t sz)
 {
@@ -10191,12 +10266,23 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	io_run_task_work();
 
 	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
-			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
+			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
+			       IORING_ENTER_REGISTERED_RING)))
 		return -EINVAL;
 
-	f = fdget(fd);
-	if (unlikely(!f.file))
-		return -EBADF;
+	if (flags & IORING_ENTER_REGISTERED_RING) {
+		struct io_uring_task *tctx = current->io_uring;
+
+		if (fd >= IO_RINGFD_REG_MAX || !tctx)
+			return -EINVAL;
+		f.file = tctx->registered_rings[fd];
+		if (unlikely(!f.file))
+			return -EBADF;
+	} else {
+		f = fdget(fd);
+		if (unlikely(!f.file))
+			return -EBADF;
+	}
 
 	ret = -EOPNOTSUPP;
 	if (unlikely(f.file->f_op != &io_uring_fops))
@@ -10270,7 +10356,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 out:
 	percpu_ref_put(&ctx->refs);
 out_fput:
-	fdput(f);
+	if (!(flags & IORING_ENTER_REGISTERED_RING))
+		fdput(f);
 	return submitted ? submitted : ret;
 }
 
@@ -11160,6 +11247,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_iowq_max_workers(ctx, arg);
 		break;
+	case IORING_REGISTER_IORINGFD:
+		ret = -EINVAL;
+		if (nr_args != 1)
+			break;
+		ret = io_ringfd_register(ctx, arg);
+		break;
+	case IORING_UNREGISTER_IORINGFD:
+		ret = -EINVAL;
+		if (nr_args != 1)
+			break;
+		ret = io_ringfd_unregister(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 649a4d7c241b..1814e698d861 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -9,11 +9,14 @@
 struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_cancel(bool cancel_all);
 void __io_uring_free(struct task_struct *tsk);
+void io_uring_unreg_ringfd(void);
 
 static inline void io_uring_files_cancel(void)
 {
-	if (current->io_uring)
+	if (current->io_uring) {
+		io_uring_unreg_ringfd();
 		__io_uring_cancel(false);
+	}
 }
 static inline void io_uring_task_cancel(void)
 {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 787f491f0d2a..a36d31f2cc66 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -257,10 +257,11 @@ struct io_cqring_offsets {
 /*
  * io_uring_enter(2) flags
  */
-#define IORING_ENTER_GETEVENTS	(1U << 0)
-#define IORING_ENTER_SQ_WAKEUP	(1U << 1)
-#define IORING_ENTER_SQ_WAIT	(1U << 2)
-#define IORING_ENTER_EXT_ARG	(1U << 3)
+#define IORING_ENTER_GETEVENTS		(1U << 0)
+#define IORING_ENTER_SQ_WAKEUP		(1U << 1)
+#define IORING_ENTER_SQ_WAIT		(1U << 2)
+#define IORING_ENTER_EXT_ARG		(1U << 3)
+#define IORING_ENTER_REGISTERED_RING	(1U << 4)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -325,6 +326,10 @@ enum {
 	/* set/get max number of io-wq workers */
 	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,
 
+	/* register/unregister io_uring fd with the ring */
+	IORING_REGISTER_IORINGFD		= 20,
+	IORING_UNREGISTER_IORINGFD		= 21,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };

-- 
Jens Axboe



* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-03 16:31       ` Jens Axboe
@ 2022-03-03 17:18         ` Jens Axboe
  2022-03-03 20:41           ` Jens Axboe
  2022-03-04  1:49         ` Pavel Begunkov
  2022-03-04  1:52         ` Pavel Begunkov
  2 siblings, 1 reply; 28+ messages in thread
From: Jens Axboe @ 2022-03-03 17:18 UTC
  To: Xiaoguang Wang, io-uring; +Cc: asml.silence

On 3/3/22 9:31 AM, Jens Axboe wrote:
> On 3/3/22 7:40 AM, Jens Axboe wrote:
>> On 3/3/22 7:36 AM, Jens Axboe wrote:
>>> The only potential oddity here is that the fd passed back is not a
>>> legitimate fd. io_uring does support poll(2) on its file descriptor, so
>>> that could cause some confusion even if I don't think anyone actually
>>> does poll(2) on io_uring.
>>
>> Side note - the only implication here is that we then likely can't make
>> the optimized behavior the default, it has to be an IORING_SETUP_REG
>> flag which tells us that the application is aware of this limitation.
>> Though I guess close(2) might mess with that too... Hmm.
> 
> Not sure I can find a good approach for that. Tried out your patch and
> made some fixes:
> 
> - Missing free on final tctx free
> - Rename registered_files to registered_rings
> - Fix off-by-ones in checking max registration count
> - Use kcalloc
> - Rename ENTER_FIXED_FILE -> ENTER_REGISTERED_RING
> - Don't pass in tctx to io_uring_unreg_ringfd()
> - Get rid of forward declaration for adding tctx node
> - Get rid of extra file pointer in io_uring_enter()
> - Fix deadlock in io_ringfd_register()
> - Use io_uring_rsrc_update rather than add a new struct type

- Allow multiple register/unregister instead of enforcing just 1 at a
  time
- Check for it really being a ring fd when registering

For different batch counts, nice improvements are seen. Roughly:

Batch==1	15% faster
Batch==2	13% faster
Batch==4	11% faster

This is just in microbenchmarks where the fdget/fdput play a bigger
factor, but it will certainly help with real world situations where
batching is more limited than in benchmarks.

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ad3e0b0ab3b9..452e68b73e1f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -466,6 +466,8 @@ struct io_ring_ctx {
 	};
 };
 
+#define IO_RINGFD_REG_MAX 16
+
 struct io_uring_task {
 	/* submission side */
 	int			cached_refs;
@@ -481,6 +483,7 @@ struct io_uring_task {
 	struct io_wq_work_list	task_list;
 	struct io_wq_work_list	prior_task_list;
 	struct callback_head	task_work;
+	struct file		**registered_rings;
 	bool			task_running;
 };
 
@@ -8806,8 +8809,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
 	if (unlikely(!tctx))
 		return -ENOMEM;
 
+	tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
+					 sizeof(struct file *), GFP_KERNEL);
+	if (unlikely(!tctx->registered_rings)) {
+		kfree(tctx);
+		return -ENOMEM;
+	}
+
 	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
 	if (unlikely(ret)) {
+		kfree(tctx->registered_rings);
 		kfree(tctx);
 		return ret;
 	}
@@ -8816,6 +8827,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
 	if (IS_ERR(tctx->io_wq)) {
 		ret = PTR_ERR(tctx->io_wq);
 		percpu_counter_destroy(&tctx->inflight);
+		kfree(tctx->registered_rings);
 		kfree(tctx);
 		return ret;
 	}
@@ -8840,6 +8852,7 @@ void __io_uring_free(struct task_struct *tsk)
 	WARN_ON_ONCE(tctx->io_wq);
 	WARN_ON_ONCE(tctx->cached_refs);
 
+	kfree(tctx->registered_rings);
 	percpu_counter_destroy(&tctx->inflight);
 	kfree(tctx);
 	tsk->io_uring = NULL;
@@ -10061,6 +10074,103 @@ void __io_uring_cancel(bool cancel_all)
 	io_uring_cancel_generic(cancel_all, NULL);
 }
 
+void io_uring_unreg_ringfd(void)
+{
+	struct io_uring_task *tctx = current->io_uring;
+	int i;
+
+	for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
+		if (tctx->registered_rings[i]) {
+			fput(tctx->registered_rings[i]);
+			tctx->registered_rings[i] = NULL;
+		}
+	}
+}
+
+static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *arg,
+			      unsigned nr_args)
+{
+	struct io_uring_rsrc_update reg;
+	struct io_uring_task *tctx;
+	struct file *file;
+	int ret, nr = 0, i;
+
+	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
+		return -EINVAL;
+
+	mutex_unlock(&ctx->uring_lock);
+	ret = io_uring_add_tctx_node(ctx);
+	mutex_lock(&ctx->uring_lock);
+	if (ret)
+		return ret;
+
+	tctx = current->io_uring;
+	for (i = 0; i < nr_args; i++) {
+		if (copy_from_user(&reg, arg, sizeof(reg))) {
+			ret = -EFAULT;
+			break;
+		}
+		if (reg.offset >= IO_RINGFD_REG_MAX) {
+			ret = -EINVAL;
+			break;
+		}
+
+		if (tctx->registered_rings[reg.offset]) {
+			ret = -EBUSY;
+			break;
+		}
+		file = fget(reg.data);
+		if (!file) {
+			ret = -EBADF;
+			break;
+		} else if (file->f_op != &io_uring_fops) {
+			ret = -EOPNOTSUPP;
+			fput(file);
+			break;
+		}
+
+		tctx->registered_rings[reg.offset] = file;
+		arg += sizeof(reg);
+		nr++;
+	}
+
+	return nr ? nr : ret;
+}
+
+static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *arg,
+				unsigned nr_args)
+{
+	struct io_uring_task *tctx = current->io_uring;
+	struct io_uring_rsrc_update reg;
+	int ret, nr, i;
+
+	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
+		return -EINVAL;
+	if (!tctx)
+		return 0;
+
+	ret = nr = 0;
+	for (i = 0; i < nr_args; i++) {
+		if (copy_from_user(&reg, arg, sizeof(reg))) {
+			ret = -EFAULT;
+			break;
+		}
+		if (reg.offset >= IO_RINGFD_REG_MAX) {
+			ret = -EINVAL;
+			break;
+		}
+
+		if (tctx->registered_rings[reg.offset]) {
+			fput(tctx->registered_rings[reg.offset]);
+			tctx->registered_rings[reg.offset] = NULL;
+		}
+		arg += sizeof(reg);
+		nr++;
+	}
+
+	return nr ? nr : ret;
+}
+
 static void *io_uring_validate_mmap_request(struct file *file,
 					    loff_t pgoff, size_t sz)
 {
@@ -10191,12 +10301,23 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	io_run_task_work();
 
 	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
-			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
+			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
+			       IORING_ENTER_REGISTERED_RING)))
 		return -EINVAL;
 
-	f = fdget(fd);
-	if (unlikely(!f.file))
-		return -EBADF;
+	if (flags & IORING_ENTER_REGISTERED_RING) {
+		struct io_uring_task *tctx = current->io_uring;
+
+		if (fd >= IO_RINGFD_REG_MAX || !tctx)
+			return -EINVAL;
+		f.file = tctx->registered_rings[fd];
+		if (unlikely(!f.file))
+			return -EBADF;
+	} else {
+		f = fdget(fd);
+		if (unlikely(!f.file))
+			return -EBADF;
+	}
 
 	ret = -EOPNOTSUPP;
 	if (unlikely(f.file->f_op != &io_uring_fops))
@@ -10270,7 +10391,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 out:
 	percpu_ref_put(&ctx->refs);
 out_fput:
-	fdput(f);
+	if (!(flags & IORING_ENTER_REGISTERED_RING))
+		fdput(f);
 	return submitted ? submitted : ret;
 }
 
@@ -11160,6 +11282,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_iowq_max_workers(ctx, arg);
 		break;
+	case IORING_REGISTER_IORINGFD:
+		ret = io_ringfd_register(ctx, arg, nr_args);
+		break;
+	case IORING_UNREGISTER_IORINGFD:
+		ret = io_ringfd_unregister(ctx, arg, nr_args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 649a4d7c241b..1814e698d861 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -9,11 +9,14 @@
 struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_cancel(bool cancel_all);
 void __io_uring_free(struct task_struct *tsk);
+void io_uring_unreg_ringfd(void);
 
 static inline void io_uring_files_cancel(void)
 {
-	if (current->io_uring)
+	if (current->io_uring) {
+		io_uring_unreg_ringfd();
 		__io_uring_cancel(false);
+	}
 }
 static inline void io_uring_task_cancel(void)
 {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 787f491f0d2a..a36d31f2cc66 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -257,10 +257,11 @@ struct io_cqring_offsets {
 /*
  * io_uring_enter(2) flags
  */
-#define IORING_ENTER_GETEVENTS	(1U << 0)
-#define IORING_ENTER_SQ_WAKEUP	(1U << 1)
-#define IORING_ENTER_SQ_WAIT	(1U << 2)
-#define IORING_ENTER_EXT_ARG	(1U << 3)
+#define IORING_ENTER_GETEVENTS		(1U << 0)
+#define IORING_ENTER_SQ_WAKEUP		(1U << 1)
+#define IORING_ENTER_SQ_WAIT		(1U << 2)
+#define IORING_ENTER_EXT_ARG		(1U << 3)
+#define IORING_ENTER_REGISTERED_RING	(1U << 4)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -325,6 +326,10 @@ enum {
 	/* set/get max number of io-wq workers */
 	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,
 
+	/* register/unregister io_uring fd with the ring */
+	IORING_REGISTER_IORINGFD		= 20,
+	IORING_UNREGISTER_IORINGFD		= 21,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };

-- 
Jens Axboe



* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-03 17:18         ` Jens Axboe
@ 2022-03-03 20:41           ` Jens Axboe
  2022-03-03 21:19             ` Jens Axboe
  2022-03-03 22:24             ` Vito Caputo
  0 siblings, 2 replies; 28+ messages in thread
From: Jens Axboe @ 2022-03-03 20:41 UTC
  To: Xiaoguang Wang, io-uring; +Cc: asml.silence

On 3/3/22 10:18 AM, Jens Axboe wrote:
> On 3/3/22 9:31 AM, Jens Axboe wrote:
>> On 3/3/22 7:40 AM, Jens Axboe wrote:
>>> On 3/3/22 7:36 AM, Jens Axboe wrote:
>>>> The only potential oddity here is that the fd passed back is not a
>>>> legitimate fd. io_uring does support poll(2) on its file descriptor, so
>>>> that could cause some confusion even if I don't think anyone actually
>>>> does poll(2) on io_uring.
>>>
>>> Side note - the only implication here is that we then likely can't make
>>> the optimized behavior the default, it has to be an IORING_SETUP_REG
>>> flag which tells us that the application is aware of this limitation.
>>> Though I guess close(2) might mess with that too... Hmm.
>>
>> Not sure I can find a good approach for that. Tried out your patch and
>> made some fixes:
>>
>> - Missing free on final tctx free
>> - Rename registered_files to registered_rings
>> - Fix off-by-ones in checking max registration count
>> - Use kcalloc
>> - Rename ENTER_FIXED_FILE -> ENTER_REGISTERED_RING
>> - Don't pass in tctx to io_uring_unreg_ringfd()
>> - Get rid of forward declaration for adding tctx node
>> - Get rid of extra file pointer in io_uring_enter()
>> - Fix deadlock in io_ringfd_register()
>> - Use io_uring_rsrc_update rather than add a new struct type
> 
> - Allow multiple register/unregister instead of enforcing just 1 at a
>   time
> - Check for it really being a ring fd when registering
> 
> For different batch counts, nice improvements are seen. Roughly:
> 
> Batch==1	15% faster
> Batch==2	13% faster
> Batch==4	11% faster
> 
> This is just in microbenchmarks where the fdget/fdput play a bigger
> factor, but it will certainly help with real world situations where
> batching is more limited than in benchmarks.

In trying this out in applications, I think the better registration API
is to allow io_uring to pick the offset. The application doesn't care,
it's just a magic integer there. And if we allow io_uring to pick it,
then that makes things a lot easier to deal with.

For registration, pass in an array of io_uring_rsrc_update structs, just
filling in the ring_fd in the data field. Return value is number of ring
fds registered, and up->offset now contains the chosen offset for each
of them.

Unregister is the same struct, but just with offset filled in.
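
As a sketch, registration and use from userspace would then look roughly
like this (hypothetical snippet, raw syscalls, error handling elided):

	struct io_uring_rsrc_update up = { .data = ring_fd };
	int ret, registered_fd = -1;

	/* kernel picks the slot and writes it back to up.offset */
	ret = syscall(__NR_io_uring_register, ring_fd,
		      IORING_REGISTER_RING_FDS, &up, 1);
	if (ret == 1)
		registered_fd = up.offset;

	/* later: enter via the registered offset, not the real fd */
	syscall(__NR_io_uring_enter, registered_fd, to_submit, 0,
		IORING_ENTER_REGISTERED_RING, NULL, 0);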

For applications using io_uring, which is all of them as far as I'm
aware, we can also easily hide this. This means we can get the optimized
behavior by default.

What do you think?


diff --git a/fs/io_uring.c b/fs/io_uring.c
index ad3e0b0ab3b9..e3be96e81164 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -466,6 +466,11 @@ struct io_ring_ctx {
 	};
 };
 
+/*
+ * Arbitrary limit, can be raised if need be
+ */
+#define IO_RINGFD_REG_MAX 16
+
 struct io_uring_task {
 	/* submission side */
 	int			cached_refs;
@@ -481,6 +486,7 @@ struct io_uring_task {
 	struct io_wq_work_list	task_list;
 	struct io_wq_work_list	prior_task_list;
 	struct callback_head	task_work;
+	struct file		**registered_rings;
 	bool			task_running;
 };
 
@@ -8806,8 +8812,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
 	if (unlikely(!tctx))
 		return -ENOMEM;
 
+	tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
+					 sizeof(struct file *), GFP_KERNEL);
+	if (unlikely(!tctx->registered_rings)) {
+		kfree(tctx);
+		return -ENOMEM;
+	}
+
 	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
 	if (unlikely(ret)) {
+		kfree(tctx->registered_rings);
 		kfree(tctx);
 		return ret;
 	}
@@ -8816,6 +8830,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
 	if (IS_ERR(tctx->io_wq)) {
 		ret = PTR_ERR(tctx->io_wq);
 		percpu_counter_destroy(&tctx->inflight);
+		kfree(tctx->registered_rings);
 		kfree(tctx);
 		return ret;
 	}
@@ -8840,6 +8855,7 @@ void __io_uring_free(struct task_struct *tsk)
 	WARN_ON_ONCE(tctx->io_wq);
 	WARN_ON_ONCE(tctx->cached_refs);
 
+	kfree(tctx->registered_rings);
 	percpu_counter_destroy(&tctx->inflight);
 	kfree(tctx);
 	tsk->io_uring = NULL;
@@ -10061,6 +10077,121 @@ void __io_uring_cancel(bool cancel_all)
 	io_uring_cancel_generic(cancel_all, NULL);
 }
 
+void io_uring_unreg_ringfd(void)
+{
+	struct io_uring_task *tctx = current->io_uring;
+	int i;
+
+	for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
+		if (tctx->registered_rings[i]) {
+			fput(tctx->registered_rings[i]);
+			tctx->registered_rings[i] = NULL;
+		}
+	}
+}
+
+static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd)
+{
+	struct file *file;
+	int offset;
+
+	for (offset = 0; offset < IO_RINGFD_REG_MAX; offset++) {
+		if (tctx->registered_rings[offset])
+			continue;
+
+		file = fget(fd);
+		if (!file) {
+			return -EBADF;
+		} else if (file->f_op != &io_uring_fops) {
+			fput(file);
+			return -EOPNOTSUPP;
+		}
+		tctx->registered_rings[offset] = file;
+		return offset;
+	}
+
+	return -EBUSY;
+}
+
+static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *arg,
+			      unsigned nr_args)
+{
+	struct io_uring_rsrc_update reg;
+	struct io_uring_task *tctx;
+	int ret, nr = 0, i;
+
+	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
+		return -EINVAL;
+
+	mutex_unlock(&ctx->uring_lock);
+	ret = io_uring_add_tctx_node(ctx);
+	mutex_lock(&ctx->uring_lock);
+	if (ret)
+		return ret;
+
+	tctx = current->io_uring;
+	for (i = 0; i < nr_args; i++) {
+		if (copy_from_user(&reg, arg, sizeof(reg))) {
+			ret = -EFAULT;
+			break;
+		}
+		if (reg.offset >= IO_RINGFD_REG_MAX) {
+			ret = -EINVAL;
+			break;
+		}
+
+		ret = io_ring_add_registered_fd(tctx, reg.data);
+		if (ret < 0)
+			break;
+
+		reg.offset = ret;
+		if (copy_to_user(arg, &reg, sizeof(reg))) {
+			fput(tctx->registered_rings[reg.offset]);
+			tctx->registered_rings[reg.offset] = NULL;
+			ret = -EFAULT;
+			break;
+		}
+		arg += sizeof(reg);
+		nr++;
+	}
+
+	return nr ? nr : ret;
+}
+
+static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *arg,
+				unsigned nr_args)
+{
+	struct io_uring_task *tctx = current->io_uring;
+	struct io_uring_rsrc_update reg;
+	int ret, nr, i;
+
+	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
+		return -EINVAL;
+	if (!tctx)
+		return 0;
+
+	ret = nr = 0;
+	for (i = 0; i < nr_args; i++) {
+		if (copy_from_user(&reg, arg, sizeof(reg))) {
+			ret = -EFAULT;
+			break;
+		}
+		if (reg.offset >= IO_RINGFD_REG_MAX) {
+			ret = -EINVAL;
+			break;
+		}
+
+		if (tctx->registered_rings[reg.offset]) {
+			fput(tctx->registered_rings[reg.offset]);
+			tctx->registered_rings[reg.offset] = NULL;
+		}
+		arg += sizeof(reg);
+		nr++;
+	}
+
+	return nr ? nr : ret;
+}
+
 static void *io_uring_validate_mmap_request(struct file *file,
 					    loff_t pgoff, size_t sz)
 {
@@ -10191,12 +10322,23 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	io_run_task_work();
 
 	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
-			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
+			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
+			       IORING_ENTER_REGISTERED_RING)))
 		return -EINVAL;
 
-	f = fdget(fd);
-	if (unlikely(!f.file))
-		return -EBADF;
+	if (flags & IORING_ENTER_REGISTERED_RING) {
+		struct io_uring_task *tctx = current->io_uring;
+
+		if (fd >= IO_RINGFD_REG_MAX || !tctx)
+			return -EINVAL;
+		f.file = tctx->registered_rings[fd];
+		if (unlikely(!f.file))
+			return -EBADF;
+	} else {
+		f = fdget(fd);
+		if (unlikely(!f.file))
+			return -EBADF;
+	}
 
 	ret = -EOPNOTSUPP;
 	if (unlikely(f.file->f_op != &io_uring_fops))
@@ -10270,7 +10412,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 out:
 	percpu_ref_put(&ctx->refs);
 out_fput:
-	fdput(f);
+	if (!(flags & IORING_ENTER_REGISTERED_RING))
+		fdput(f);
 	return submitted ? submitted : ret;
 }
 
@@ -11160,6 +11303,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_iowq_max_workers(ctx, arg);
 		break;
+	case IORING_REGISTER_RING_FDS:
+		ret = io_ringfd_register(ctx, arg, nr_args);
+		break;
+	case IORING_UNREGISTER_RING_FDS:
+		ret = io_ringfd_unregister(ctx, arg, nr_args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 649a4d7c241b..1814e698d861 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -9,11 +9,14 @@
 struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_cancel(bool cancel_all);
 void __io_uring_free(struct task_struct *tsk);
+void io_uring_unreg_ringfd(void);
 
 static inline void io_uring_files_cancel(void)
 {
-	if (current->io_uring)
+	if (current->io_uring) {
+		io_uring_unreg_ringfd();
 		__io_uring_cancel(false);
+	}
 }
 static inline void io_uring_task_cancel(void)
 {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 787f491f0d2a..42b2fe84dbcd 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -257,10 +257,11 @@ struct io_cqring_offsets {
 /*
  * io_uring_enter(2) flags
  */
-#define IORING_ENTER_GETEVENTS	(1U << 0)
-#define IORING_ENTER_SQ_WAKEUP	(1U << 1)
-#define IORING_ENTER_SQ_WAIT	(1U << 2)
-#define IORING_ENTER_EXT_ARG	(1U << 3)
+#define IORING_ENTER_GETEVENTS		(1U << 0)
+#define IORING_ENTER_SQ_WAKEUP		(1U << 1)
+#define IORING_ENTER_SQ_WAIT		(1U << 2)
+#define IORING_ENTER_EXT_ARG		(1U << 3)
+#define IORING_ENTER_REGISTERED_RING	(1U << 4)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -325,6 +326,10 @@ enum {
 	/* set/get max number of io-wq workers */
 	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,
 
+	/* register/unregister io_uring fd with the ring */
+	IORING_REGISTER_RING_FDS		= 20,
+	IORING_UNREGISTER_RING_FDS		= 21,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };

-- 
Jens Axboe



* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-03 20:41           ` Jens Axboe
@ 2022-03-03 21:19             ` Jens Axboe
  2022-03-04  0:07               ` Jens Axboe
  2022-03-03 22:24             ` Vito Caputo
  1 sibling, 1 reply; 28+ messages in thread
From: Jens Axboe @ 2022-03-03 21:19 UTC
  To: Xiaoguang Wang, io-uring; +Cc: asml.silence

On 3/3/22 1:41 PM, Jens Axboe wrote:
> On 3/3/22 10:18 AM, Jens Axboe wrote:
>> On 3/3/22 9:31 AM, Jens Axboe wrote:
>>> On 3/3/22 7:40 AM, Jens Axboe wrote:
>>>> On 3/3/22 7:36 AM, Jens Axboe wrote:
>>>>> The only potential oddity here is that the fd passed back is not a
>>>>> legitimate fd. io_uring does support poll(2) on its file descriptor, so
>>>>> that could cause some confusion even if I don't think anyone actually
>>>>> does poll(2) on io_uring.
>>>>
>>>> Side note - the only implication here is that we then likely can't make
>>>> the optimized behavior the default, it has to be an IORING_SETUP_REG
>>>> flag which tells us that the application is aware of this limitation.
>>>> Though I guess close(2) might mess with that too... Hmm.
>>>
>>> Not sure I can find a good approach for that. Tried out your patch and
>>> made some fixes:
>>>
>>> - Missing free on final tctx free
>>> - Rename registered_files to registered_rings
>>> - Fix off-by-ones in checking max registration count
>>> - Use kcalloc
>>> - Rename ENTER_FIXED_FILE -> ENTER_REGISTERED_RING
>>> - Don't pass in tctx to io_uring_unreg_ringfd()
>>> - Get rid of forward declaration for adding tctx node
>>> - Get rid of extra file pointer in io_uring_enter()
>>> - Fix deadlock in io_ringfd_register()
>>> - Use io_uring_rsrc_update rather than add a new struct type
>>
>> - Allow multiple register/unregister instead of enforcing just 1 at a
>>   time
>> - Check for it really being a ring fd when registering
>>
>> For different batch counts, nice improvements are seen. Roughly:
>>
>> Batch==1	15% faster
>> Batch==2	13% faster
>> Batch==4	11% faster
>>
>> This is just in microbenchmarks where the fdget/fdput play a bigger
>> factor, but it will certainly help with real world situations where
>> batching is more limited than in benchmarks.
> 
> In trying this out in applications, I think the better registration API
> is to allow io_uring to pick the offset. The application doesn't care,
> it's just a magic integer there. And if we allow io_uring to pick it,
> then that makes things a lot easier to deal with.
> 
> For registration, pass in an array of io_uring_rsrc_update structs, just
> filling in the ring_fd in the data field. Return value is number of ring
> fds registered, and up->offset now contains the chosen offset for each
> of them.
> 
> Unregister is the same struct, but just with offset filled in.
> 
> For applications using io_uring, which is all of them as far as I'm
> aware, we can also easily hide this. This means we can get the optimized
> behavior by default.

The only complication here is if the ring is shared across threads or
processes. The thread case can be common: one thread doing submits and
one doing completions. That one is a bit harder to handle. Without
inheriting registered ring fds, I'm not sure how it can be handled by
default. We should probably just introduce a new queue init helper, but
then that requires application changes to adopt...

Ideas welcome!

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-03 20:41           ` Jens Axboe
  2022-03-03 21:19             ` Jens Axboe
@ 2022-03-03 22:24             ` Vito Caputo
  2022-03-03 22:26               ` Jens Axboe
  1 sibling, 1 reply; 28+ messages in thread
From: Vito Caputo @ 2022-03-03 22:24 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Xiaoguang Wang, io-uring, asml.silence

On Thu, Mar 03, 2022 at 01:41:50PM -0700, Jens Axboe wrote:
> On 3/3/22 10:18 AM, Jens Axboe wrote:
> > On 3/3/22 9:31 AM, Jens Axboe wrote:
> >> On 3/3/22 7:40 AM, Jens Axboe wrote:
> >>> On 3/3/22 7:36 AM, Jens Axboe wrote:
> >>>> The only potential oddity here is that the fd passed back is not a
> >>>> legitimate fd. io_uring does support poll(2) on its file descriptor, so
> >>>> that could cause some confusion even if I don't think anyone actually
> >>>> does poll(2) on io_uring.
> >>>
> >>> Side note - the only implication here is that we then likely can't make
> >>> the optimized behavior the default, it has to be an IORING_SETUP_REG
> >>> flag which tells us that the application is aware of this limitation.
> >>> Though I guess close(2) might mess with that too... Hmm.
> >>
> >> Not sure I can find a good approach for that. Tried out your patch and
> >> made some fixes:
> >>
> >> - Missing free on final tctx free
> >> - Rename registered_files to registered_rings
> >> - Fix off-by-ones in checking max registration count
> >> - Use kcalloc
> >> - Rename ENTER_FIXED_FILE -> ENTER_REGISTERED_RING
> >> - Don't pass in tctx to io_uring_unreg_ringfd()
> >> - Get rid of forward declaration for adding tctx node
> >> - Get rid of extra file pointer in io_uring_enter()
> >> - Fix deadlock in io_ringfd_register()
> >> - Use io_uring_rsrc_update rather than add a new struct type
> > 
> > - Allow multiple register/unregister instead of enforcing just 1 at the
> >   time
> > - Check for it really being a ring fd when registering
> > 
> > For different batch counts, nice improvements are seen. Roughly:
> > 
> > Batch==1	15% faster
> > Batch==2	13% faster
> > Batch==4	11% faster
> > 
> > This is just in microbenchmarks where the fdget/fdput play a bigger
> > factor, but it will certainly help with real world situations where
> > batching is more limited than in benchmarks.
> 

Certainly seems worthwhile.

> In trying this out in applications, I think the better registration API
> is to allow io_uring to pick the offset. The application doesn't care,
> it's just a magic integer there. And if we allow io_uring to pick it,
> then that makes things a lot easier to deal with.
> 
> For registration, pass in an array of io_uring_rsrc_update structs, just
> filling in the ring_fd in the data field. Return value is number of ring
> fds registered, and up->offset now contains the chosen offset for each
> of them.
> 
> Unregister is the same struct, but just with offset filled in.
> 
> For applications using io_uring, which is all of them as far as I'm
> aware, we can also easily hide this. This means we can get the optimized
> behavior by default.
> 

Did you mean s/using io_uring/using liburing/ here?

Regards,
Vito Caputo

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-03 22:24             ` Vito Caputo
@ 2022-03-03 22:26               ` Jens Axboe
  0 siblings, 0 replies; 28+ messages in thread
From: Jens Axboe @ 2022-03-03 22:26 UTC (permalink / raw)
  To: Vito Caputo; +Cc: Xiaoguang Wang, io-uring, asml.silence

>> In trying this out in applications, I think the better registration API
>> is to allow io_uring to pick the offset. The application doesn't care,
>> it's just a magic integer there. And if we allow io_uring to pick it,
>> then that makes things a lot easier to deal with.
>>
>> For registration, pass in an array of io_uring_rsrc_update structs, just
>> filling in the ring_fd in the data field. Return value is number of ring
>> fds registered, and up->offset now contains the chosen offset for each
>> of them.
>>
>> Unregister is the same struct, but just with offset filled in.
>>
>> For applications using io_uring, which is all of them as far as I'm
>> aware, we can also easily hide this. This means we can get the optimized
>> behavior by default.
>>
> 
> Did you mean s/using io_uring/using liburing/ here?

I did, applications using liburing as their io_uring interface.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-03 21:19             ` Jens Axboe
@ 2022-03-04  0:07               ` Jens Axboe
  2022-03-04 13:39                 ` Xiaoguang Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Jens Axboe @ 2022-03-04  0:07 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: asml.silence

On 3/3/22 2:19 PM, Jens Axboe wrote:
> On 3/3/22 1:41 PM, Jens Axboe wrote:
>> On 3/3/22 10:18 AM, Jens Axboe wrote:
>>> On 3/3/22 9:31 AM, Jens Axboe wrote:
>>>> On 3/3/22 7:40 AM, Jens Axboe wrote:
>>>>> On 3/3/22 7:36 AM, Jens Axboe wrote:
>>>>>> The only potential oddity here is that the fd passed back is not a
>>>>>> legitimate fd. io_uring does support poll(2) on its file descriptor, so
>>>>>> that could cause some confusion even if I don't think anyone actually
>>>>>> does poll(2) on io_uring.
>>>>>
>>>>> Side note - the only implication here is that we then likely can't make
>>>>> the optimized behavior the default, it has to be an IORING_SETUP_REG
>>>>> flag which tells us that the application is aware of this limitation.
>>>>> Though I guess close(2) might mess with that too... Hmm.
>>>>
>>>> Not sure I can find a good approach for that. Tried out your patch and
>>>> made some fixes:
>>>>
>>>> - Missing free on final tctx free
>>>> - Rename registered_files to registered_rings
>>>> - Fix off-by-ones in checking max registration count
>>>> - Use kcalloc
>>>> - Rename ENTER_FIXED_FILE -> ENTER_REGISTERED_RING
>>>> - Don't pass in tctx to io_uring_unreg_ringfd()
>>>> - Get rid of forward declaration for adding tctx node
>>>> - Get rid of extra file pointer in io_uring_enter()
>>>> - Fix deadlock in io_ringfd_register()
>>>> - Use io_uring_rsrc_update rather than add a new struct type
>>>
>>> - Allow multiple register/unregister instead of enforcing just 1 at the
>>>   time
>>> - Check for it really being a ring fd when registering
>>>
>>> For different batch counts, nice improvements are seen. Roughly:
>>>
>>> Batch==1	15% faster
>>> Batch==2	13% faster
>>> Batch==4	11% faster
>>>
>>> This is just in microbenchmarks where the fdget/fdput play a bigger
>>> factor, but it will certainly help with real world situations where
>>> batching is more limited than in benchmarks.
>>
>> In trying this out in applications, I think the better registration API
>> is to allow io_uring to pick the offset. The application doesn't care,
>> it's just a magic integer there. And if we allow io_uring to pick it,
>> then that makes things a lot easier to deal with.
>>
>> For registration, pass in an array of io_uring_rsrc_update structs, just
>> filling in the ring_fd in the data field. Return value is number of ring
>> fds registered, and up->offset now contains the chosen offset for each
>> of them.
>>
>> Unregister is the same struct, but just with offset filled in.
>>
>> For applications using io_uring, which is all of them as far as I'm
>> aware, we can also easily hide this. This means we can get the optimized
>> behavior by default.
> 
> Only complication here is if the ring is shared across threads or
> processes. The thread one can be common, one thread doing submit and one
> doing completions. That one is a bit harder to handle. Without
> inheriting registered ring fds, not sure how it can be handled by
> default. Should probably just introduce a new queue init helper, but
> then that requires application changes to adopt...
> 
> Ideas welcome!

Don't see a way to do it by default, so I think it'll have to be opt-in.
We could make it the default if you never shared a ring, which most
people don't do, but we can't easily do so if it's ever shared between
tasks or processes.

With liburing, if you share, you share the io_uring struct as well. So
it's hard to maintain a new ->enter_ring_fd in there. It'd be doable if
we could reserve real fd == registered fd. Which is of course possible
using xarray or similar, but that'll add extra overhead.

Anyway, current version below. Only real change here is allowing either
specific offset or generated offset, depending on what the
io_uring_rsrc_update->offset is set to. If set to -1U, then io_uring
will find a free offset. If set to anything else, io_uring will use that
index (as long as it's >=0 && < MAX).
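
To make the semantics concrete, here's a minimal userspace sketch of
that flow. It assumes only the uapi additions from this patch
(IORING_REGISTER_RING_FDS, IORING_ENTER_REGISTERED_RING) and made-up
helper names; there's no glibc wrapper for these syscalls, so it goes
through syscall(2) directly:

#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* Register ring_fd with the kernel picking the slot, return the index */
static int register_ring_fd(int ring_fd)
{
	struct io_uring_rsrc_update up = {
		.offset	= -1U,	/* -1U means "find a free offset for me" */
		.data	= ring_fd,
	};
	int ret;

	ret = syscall(__NR_io_uring_register, ring_fd,
		      IORING_REGISTER_RING_FDS, &up, 1);
	if (ret != 1)
		return -1;

	/* the kernel wrote the chosen index back into up.offset */
	return up.offset;
}

/* Subsequent enters pass the registered index instead of the real fd */
static int enter_registered(int reg_idx, unsigned int to_submit)
{
	return syscall(__NR_io_uring_enter, reg_idx, to_submit, 0,
		       IORING_ENTER_REGISTERED_RING, NULL, 0);
}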

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-03 16:31       ` Jens Axboe
  2022-03-03 17:18         ` Jens Axboe
@ 2022-03-04  1:49         ` Pavel Begunkov
  2022-03-04  2:18           ` Jens Axboe
  2022-03-04  1:52         ` Pavel Begunkov
  2 siblings, 1 reply; 28+ messages in thread
From: Pavel Begunkov @ 2022-03-04  1:49 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring

On 3/3/22 16:31, Jens Axboe wrote:
> On 3/3/22 7:40 AM, Jens Axboe wrote:
>> On 3/3/22 7:36 AM, Jens Axboe wrote:
>>> The only potential oddity here is that the fd passed back is not a
>>> legitimate fd. io_uring does support poll(2) on its file descriptor, so
>>> that could cause some confusion even if I don't think anyone actually
>>> does poll(2) on io_uring.
>>
>> Side note - the only implication here is that we then likely can't make
>> the optimized behavior the default, it has to be an IORING_SETUP_REG
>> flag which tells us that the application is aware of this limitation.
>> Though I guess close(2) might mess with that too... Hmm.
> 
> Not sure I can find a good approach for that. Tried out your patch and
> made some fixes:
> 
> - Missing free on final tctx free
> - Rename registered_files to registered_rings
> - Fix off-by-ones in checking max registration count
> - Use kcalloc
> - Rename ENTER_FIXED_FILE -> ENTER_REGISTERED_RING
> - Don't pass in tctx to io_uring_unreg_ringfd()
> - Get rid of forward declaration for adding tctx node
> - Get rid of extra file pointer in io_uring_enter()
> - Fix deadlock in io_ringfd_register()
> - Use io_uring_rsrc_update rather than add a new struct type
> 
> Patch I ran below.
> 
> Ran some testing here, and on my laptop, running:
> 
> axboe@m1pro-kvm ~/g/fio (master)> t/io_uring -N1 -s1 -f0
> polled=1, fixedbufs=1/0, register_files=1, buffered=0, QD=128
> Engine=io_uring, sq_ring=128, cq_ring=128
> submitter=0, tid=673
> IOPS=6627K, IOS/call=1/1, inflight=()
> IOPS=6995K, IOS/call=1/1, inflight=()
> IOPS=6992K, IOS/call=1/1, inflight=()
> IOPS=7005K, IOS/call=1/1, inflight=()
> IOPS=6999K, IOS/call=1/1, inflight=()
> 
> and with registered ring
> 
> axboe@m1pro-kvm ~/g/fio (master)> t/io_uring -N1 -s1 -f1
> polled=1, fixedbufs=1/0, register_files=1, buffered=0, QD=128
> Engine=io_uring, sq_ring=128, cq_ring=128
> submitter=0, tid=687
> ring register 0
> IOPS=7714K, IOS/call=1/1, inflight=()
> IOPS=8030K, IOS/call=1/1, inflight=()
> IOPS=8025K, IOS/call=1/1, inflight=()
> IOPS=8015K, IOS/call=1/1, inflight=()
> IOPS=8037K, IOS/call=1/1, inflight=()
> 
> which is about a 15% improvement, pretty massive...

Is the bench single threaded (including io-wq)? Because if it
is, get/put shouldn't do any atomics and I don't see where the
result comes from.

static unsigned long __fget_light(unsigned int fd, fmode_t mask)
{
	struct files_struct *files = current->files;
	struct file *file;

	if (atomic_read(&files->count) == 1) {
		file = files_lookup_fd_raw(files, fd);
		if (!file || unlikely(file->f_mode & mask))
			return 0;
		return (unsigned long)file;
	} else {
		file = __fget(fd, mask, 1);
		if (!file)
			return 0;
		return FDPUT_FPUT | (unsigned long)file;
	}
}

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-03 16:31       ` Jens Axboe
  2022-03-03 17:18         ` Jens Axboe
  2022-03-04  1:49         ` Pavel Begunkov
@ 2022-03-04  1:52         ` Pavel Begunkov
  2022-03-04  2:19           ` Jens Axboe
  2 siblings, 1 reply; 28+ messages in thread
From: Pavel Begunkov @ 2022-03-04  1:52 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring

On 3/3/22 16:31, Jens Axboe wrote:
> On 3/3/22 7:40 AM, Jens Axboe wrote:
>> On 3/3/22 7:36 AM, Jens Axboe wrote:
>>> The only potential oddity here is that the fd passed back is not a
>>> legitimate fd. io_uring does support poll(2) on its file descriptor, so
>>> that could cause some confusion even if I don't think anyone actually
>>> does poll(2) on io_uring.
>>
>> Side note - the only implication here is that we then likely can't make
>> the optimized behavior the default, it has to be an IORING_SETUP_REG
>> flag which tells us that the application is aware of this limitation.
>> Though I guess close(2) might mess with that too... Hmm.
> 
> Not sure I can find a good approach for that. Tried out your patch and
> made some fixes:
> 
> - Missing free on final tctx free
> - Rename registered_files to registered_rings
> - Fix off-by-ones in checking max registration count
> - Use kcalloc
> - Rename ENTER_FIXED_FILE -> ENTER_REGISTERED_RING
> - Don't pass in tctx to io_uring_unreg_ringfd()
> - Get rid of forward declaration for adding tctx node
> - Get rid of extra file pointer in io_uring_enter()
> - Fix deadlock in io_ringfd_register()
> - Use io_uring_rsrc_update rather than add a new struct type
> 
> Patch I ran below.
> 
> Ran some testing here, and on my laptop, running:
> 
> axboe@m1pro-kvm ~/g/fio (master)> t/io_uring -N1 -s1 -f0
> polled=1, fixedbufs=1/0, register_files=1, buffered=0, QD=128
> Engine=io_uring, sq_ring=128, cq_ring=128
> submitter=0, tid=673
> IOPS=6627K, IOS/call=1/1, inflight=()
> IOPS=6995K, IOS/call=1/1, inflight=()
> IOPS=6992K, IOS/call=1/1, inflight=()
> IOPS=7005K, IOS/call=1/1, inflight=()
> IOPS=6999K, IOS/call=1/1, inflight=()
> 
> and with registered ring
> 
> axboe@m1pro-kvm ~/g/fio (master)> t/io_uring -N1 -s1 -f1
> polled=1, fixedbufs=1/0, register_files=1, buffered=0, QD=128
> Engine=io_uring, sq_ring=128, cq_ring=128
> submitter=0, tid=687
> ring register 0
> IOPS=7714K, IOS/call=1/1, inflight=()
> IOPS=8030K, IOS/call=1/1, inflight=()
> IOPS=8025K, IOS/call=1/1, inflight=()
> IOPS=8015K, IOS/call=1/1, inflight=()
> IOPS=8037K, IOS/call=1/1, inflight=()
> 
> which is about a 15% improvement, pretty massive...
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index ad3e0b0ab3b9..8a1f97054b71 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
[...]
>   static void *io_uring_validate_mmap_request(struct file *file,
>   					    loff_t pgoff, size_t sz)
>   {
> @@ -10191,12 +10266,23 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
>   	io_run_task_work();
>   
>   	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
> -			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
> +			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
> +			       IORING_ENTER_REGISTERED_RING)))
>   		return -EINVAL;
>   
> -	f = fdget(fd);
> -	if (unlikely(!f.file))
> -		return -EBADF;
> +	if (flags & IORING_ENTER_REGISTERED_RING) {
> +		struct io_uring_task *tctx = current->io_uring;
> +
> +		if (fd >= IO_RINGFD_REG_MAX || !tctx)
> +			return -EINVAL;
> +		f.file = tctx->registered_rings[fd];

btw, array_index_nospec(), possibly not only here.

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-04  1:49         ` Pavel Begunkov
@ 2022-03-04  2:18           ` Jens Axboe
  2022-03-04  2:28             ` Pavel Begunkov
  0 siblings, 1 reply; 28+ messages in thread
From: Jens Axboe @ 2022-03-04  2:18 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring

On 3/3/22 6:49 PM, Pavel Begunkov wrote:
> On 3/3/22 16:31, Jens Axboe wrote:
>> On 3/3/22 7:40 AM, Jens Axboe wrote:
>>> On 3/3/22 7:36 AM, Jens Axboe wrote:
>>>> The only potential oddity here is that the fd passed back is not a
>>>> legitimate fd. io_uring does support poll(2) on its file descriptor, so
>>>> that could cause some confusion even if I don't think anyone actually
>>>> does poll(2) on io_uring.
>>>
>>> Side note - the only implication here is that we then likely can't make
>>> the optimized behavior the default, it has to be an IORING_SETUP_REG
>>> flag which tells us that the application is aware of this limitation.
>>> Though I guess close(2) might mess with that too... Hmm.
>>
>> Not sure I can find a good approach for that. Tried out your patch and
>> made some fixes:
>>
>> - Missing free on final tctx free
>> - Rename registered_files to registered_rings
>> - Fix off-by-ones in checking max registration count
>> - Use kcalloc
>> - Rename ENTER_FIXED_FILE -> ENTER_REGISTERED_RING
>> - Don't pass in tctx to io_uring_unreg_ringfd()
>> - Get rid of forward declaration for adding tctx node
>> - Get rid of extra file pointer in io_uring_enter()
>> - Fix deadlock in io_ringfd_register()
>> - Use io_uring_rsrc_update rather than add a new struct type
>>
>> Patch I ran below.
>>
>> Ran some testing here, and on my laptop, running:
>>
>> axboe@m1pro-kvm ~/g/fio (master)> t/io_uring -N1 -s1 -f0
>> polled=1, fixedbufs=1/0, register_files=1, buffered=0, QD=128
>> Engine=io_uring, sq_ring=128, cq_ring=128
>> submitter=0, tid=673
>> IOPS=6627K, IOS/call=1/1, inflight=()
>> IOPS=6995K, IOS/call=1/1, inflight=()
>> IOPS=6992K, IOS/call=1/1, inflight=()
>> IOPS=7005K, IOS/call=1/1, inflight=()
>> IOPS=6999K, IOS/call=1/1, inflight=()
>>
>> and with registered ring
>>
>> axboe@m1pro-kvm ~/g/fio (master)> t/io_uring -N1 -s1 -f1
>> polled=1, fixedbufs=1/0, register_files=1, buffered=0, QD=128
>> Engine=io_uring, sq_ring=128, cq_ring=128
>> submitter=0, tid=687
>> ring register 0
>> IOPS=7714K, IOS/call=1/1, inflight=()
>> IOPS=8030K, IOS/call=1/1, inflight=()
>> IOPS=8025K, IOS/call=1/1, inflight=()
>> IOPS=8015K, IOS/call=1/1, inflight=()
>> IOPS=8037K, IOS/call=1/1, inflight=()
>>
>> which is about a 15% improvement, pretty massive...
> 
> Is the bench single threaded (including io-wq)? Because if it
> is, get/put shouldn't do any atomics and I don't see where the
> result comes from.

Yes, it has a main thread and IO threads. Which is not uncommon, most
things are multithreaded these days...

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-04  1:52         ` Pavel Begunkov
@ 2022-03-04  2:19           ` Jens Axboe
  2022-03-04  2:39             ` Pavel Begunkov
  0 siblings, 1 reply; 28+ messages in thread
From: Jens Axboe @ 2022-03-04  2:19 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring

On 3/3/22 6:52 PM, Pavel Begunkov wrote:
> On 3/3/22 16:31, Jens Axboe wrote:
>> On 3/3/22 7:40 AM, Jens Axboe wrote:
>>> On 3/3/22 7:36 AM, Jens Axboe wrote:
>>>> The only potential oddity here is that the fd passed back is not a
>>>> legitimate fd. io_uring does support poll(2) on its file descriptor, so
>>>> that could cause some confusion even if I don't think anyone actually
>>>> does poll(2) on io_uring.
>>>
>>> Side note - the only implication here is that we then likely can't make
>>> the optimized behavior the default, it has to be an IORING_SETUP_REG
>>> flag which tells us that the application is aware of this limitation.
>>> Though I guess close(2) might mess with that too... Hmm.
>>
>> Not sure I can find a good approach for that. Tried out your patch and
>> made some fixes:
>>
>> - Missing free on final tctx free
>> - Rename registered_files to registered_rings
>> - Fix off-by-ones in checking max registration count
>> - Use kcalloc
>> - Rename ENTER_FIXED_FILE -> ENTER_REGISTERED_RING
>> - Don't pass in tctx to io_uring_unreg_ringfd()
>> - Get rid of forward declaration for adding tctx node
>> - Get rid of extra file pointer in io_uring_enter()
>> - Fix deadlock in io_ringfd_register()
>> - Use io_uring_rsrc_update rather than add a new struct type
>>
>> Patch I ran below.
>>
>> Ran some testing here, and on my laptop, running:
>>
>> axboe@m1pro-kvm ~/g/fio (master)> t/io_uring -N1 -s1 -f0
>> polled=1, fixedbufs=1/0, register_files=1, buffered=0, QD=128
>> Engine=io_uring, sq_ring=128, cq_ring=128
>> submitter=0, tid=673
>> IOPS=6627K, IOS/call=1/1, inflight=()
>> IOPS=6995K, IOS/call=1/1, inflight=()
>> IOPS=6992K, IOS/call=1/1, inflight=()
>> IOPS=7005K, IOS/call=1/1, inflight=()
>> IOPS=6999K, IOS/call=1/1, inflight=()
>>
>> and with registered ring
>>
>> axboe@m1pro-kvm ~/g/fio (master)> t/io_uring -N1 -s1 -f1
>> polled=1, fixedbufs=1/0, register_files=1, buffered=0, QD=128
>> Engine=io_uring, sq_ring=128, cq_ring=128
>> submitter=0, tid=687
>> ring register 0
>> IOPS=7714K, IOS/call=1/1, inflight=()
>> IOPS=8030K, IOS/call=1/1, inflight=()
>> IOPS=8025K, IOS/call=1/1, inflight=()
>> IOPS=8015K, IOS/call=1/1, inflight=()
>> IOPS=8037K, IOS/call=1/1, inflight=()
>>
>> which is about a 15% improvement, pretty massive...
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index ad3e0b0ab3b9..8a1f97054b71 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
> [...]
>>   static void *io_uring_validate_mmap_request(struct file *file,
>>                           loff_t pgoff, size_t sz)
>>   {
>> @@ -10191,12 +10266,23 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
>>       io_run_task_work();
>>         if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
>> -                   IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
>> +                   IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
>> +                   IORING_ENTER_REGISTERED_RING)))
>>           return -EINVAL;
>>   -    f = fdget(fd);
>> -    if (unlikely(!f.file))
>> -        return -EBADF;
>> +    if (flags & IORING_ENTER_REGISTERED_RING) {
>> +        struct io_uring_task *tctx = current->io_uring;
>> +
>> +        if (fd >= IO_RINGFD_REG_MAX || !tctx)
>> +            return -EINVAL;
>> +        f.file = tctx->registered_rings[fd];
> 
> btw, array_index_nospec(), possibly not only here.

Yeah, was thinking that earlier too in fact but forgot about it. Might
as well, though I don't think it's strictly required as it isn't a user
table.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-04  2:18           ` Jens Axboe
@ 2022-03-04  2:28             ` Pavel Begunkov
  2022-03-04  2:35               ` Pavel Begunkov
  2022-03-04  2:43               ` Jens Axboe
  0 siblings, 2 replies; 28+ messages in thread
From: Pavel Begunkov @ 2022-03-04  2:28 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring

On 3/4/22 02:18, Jens Axboe wrote:
> On 3/3/22 6:49 PM, Pavel Begunkov wrote:
>> On 3/3/22 16:31, Jens Axboe wrote:
>>> On 3/3/22 7:40 AM, Jens Axboe wrote:
>>>> On 3/3/22 7:36 AM, Jens Axboe wrote:
>>>>> The only potential oddity here is that the fd passed back is not a
>>>>> legitimate fd. io_uring does support poll(2) on its file descriptor, so
>>>>> that could cause some confusion even if I don't think anyone actually
>>>>> does poll(2) on io_uring.
>>>>
[...]
>>> which is about a 15% improvement, pretty massive...
>>
>> Is the bench single threaded (including io-wq)? Because if it
>> is, get/put shouldn't do any atomics and I don't see where the
>> result comes from.
> 
> Yes, it has a main thread and IO threads. Which is not uncommon, most
> things are multithreaded these days...

They definitely are, just was confused by the bench as I can't
recall t/io_uring having >1 threads for nops and/or direct bdev I/O

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-04  2:28             ` Pavel Begunkov
@ 2022-03-04  2:35               ` Pavel Begunkov
  2022-03-04  2:43               ` Jens Axboe
  1 sibling, 0 replies; 28+ messages in thread
From: Pavel Begunkov @ 2022-03-04  2:35 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring

On 3/4/22 02:28, Pavel Begunkov wrote:
> On 3/4/22 02:18, Jens Axboe wrote:
>> On 3/3/22 6:49 PM, Pavel Begunkov wrote:
>>> On 3/3/22 16:31, Jens Axboe wrote:
>>>> On 3/3/22 7:40 AM, Jens Axboe wrote:
>>>>> On 3/3/22 7:36 AM, Jens Axboe wrote:
>>>>>> The only potential oddity here is that the fd passed back is not a
>>>>>> legitimate fd. io_uring does support poll(2) on its file descriptor, so
>>>>>> that could cause some confusion even if I don't think anyone actually
>>>>>> does poll(2) on io_uring.
>>>>>
> [...]
>>>> which is about a 15% improvement, pretty massive...
>>>
>>> Is the bench single threaded (including io-wq)? Because if it
>>> is, get/put shouldn't do any atomics and I don't see where the
>>> result comes from.
>>
>> Yes, it has a main thread and IO threads. Which is not uncommon, most
>> things are multithreaded these days...
> 
> They definitely are, just was confused by the bench as I can't
> recall t/io_uring having >1 threads for nops and/or direct bdev I/O

fwiw, not trying to say it doesn't use threads

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-04  2:19           ` Jens Axboe
@ 2022-03-04  2:39             ` Pavel Begunkov
  2022-03-04  3:03               ` Jens Axboe
  0 siblings, 1 reply; 28+ messages in thread
From: Pavel Begunkov @ 2022-03-04  2:39 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring

On 3/4/22 02:19, Jens Axboe wrote:
> On 3/3/22 6:52 PM, Pavel Begunkov wrote:
>> On 3/3/22 16:31, Jens Axboe wrote:
>>> On 3/3/22 7:40 AM, Jens Axboe wrote:
>>>> On 3/3/22 7:36 AM, Jens Axboe wrote:
[...]
>>>
>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>> index ad3e0b0ab3b9..8a1f97054b71 100644
>>> --- a/fs/io_uring.c
>>> +++ b/fs/io_uring.c
>> [...]
>>>    static void *io_uring_validate_mmap_request(struct file *file,
>>>                            loff_t pgoff, size_t sz)
>>>    {
>>> @@ -10191,12 +10266,23 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
>>>        io_run_task_work();
>>>          if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
>>> -                   IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
>>> +                   IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
>>> +                   IORING_ENTER_REGISTERED_RING)))
>>>            return -EINVAL;
>>>    -    f = fdget(fd);
>>> -    if (unlikely(!f.file))
>>> -        return -EBADF;
>>> +    if (flags & IORING_ENTER_REGISTERED_RING) {
>>> +        struct io_uring_task *tctx = current->io_uring;
>>> +
>>> +        if (fd >= IO_RINGFD_REG_MAX || !tctx)
>>> +            return -EINVAL;
>>> +        f.file = tctx->registered_rings[fd];
>>
>> btw, array_index_nospec(), possibly not only here.
> 
> Yeah, was thinking that earlier too in fact but forgot about it. Might
> as well, though I don't think it's strictly required as it isn't a user
> table.

I may have missed in what cases it's used, but shouldn't it be
in all cases when we use a user passed index for array addressing?
e.g. to protect from pre-caching a chunk of memory computed from
an out-of-array malevolent index

I just don't see any relevant difference from normal file tables

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-04  2:28             ` Pavel Begunkov
  2022-03-04  2:35               ` Pavel Begunkov
@ 2022-03-04  2:43               ` Jens Axboe
  1 sibling, 0 replies; 28+ messages in thread
From: Jens Axboe @ 2022-03-04  2:43 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring

On 3/3/22 7:28 PM, Pavel Begunkov wrote:
> On 3/4/22 02:18, Jens Axboe wrote:
>> On 3/3/22 6:49 PM, Pavel Begunkov wrote:
>>> On 3/3/22 16:31, Jens Axboe wrote:
>>>> On 3/3/22 7:40 AM, Jens Axboe wrote:
>>>>> On 3/3/22 7:36 AM, Jens Axboe wrote:
>>>>>> The only potential oddity here is that the fd passed back is not a
>>>>>> legitimate fd. io_uring does support poll(2) on its file descriptor, so
>>>>>> that could cause some confusion even if I don't think anyone actually
>>>>>> does poll(2) on io_uring.
>>>>>
> [...]
>>>> which is about a 15% improvement, pretty massive...
>>>
>>> Is the bench single threaded (including io-wq)? Because if it
>>> is, get/put shouldn't do any atomics and I don't see where the
>>> result comes from.
>>
>> Yes, it has a main thread and IO threads. Which is not uncommon, most
>> things are multithreaded these days...
> 
> They definitely are, just was confused by the bench as I can't
> recall t/io_uring having >1 threads for nops and/or direct bdev I/O

It has a main task that spawns a thread for each job. Even for nops
that's true. The main task just checks progress of each IO job (thread)
and prints it, sleeping most of the time.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-04  2:39             ` Pavel Begunkov
@ 2022-03-04  3:03               ` Jens Axboe
  0 siblings, 0 replies; 28+ messages in thread
From: Jens Axboe @ 2022-03-04  3:03 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring

On 3/3/22 7:39 PM, Pavel Begunkov wrote:
> On 3/4/22 02:19, Jens Axboe wrote:
>> On 3/3/22 6:52 PM, Pavel Begunkov wrote:
>>> On 3/3/22 16:31, Jens Axboe wrote:
>>>> On 3/3/22 7:40 AM, Jens Axboe wrote:
>>>>> On 3/3/22 7:36 AM, Jens Axboe wrote:
> [...]
>>>>
>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>> index ad3e0b0ab3b9..8a1f97054b71 100644
>>>> --- a/fs/io_uring.c
>>>> +++ b/fs/io_uring.c
>>> [...]
>>>>    static void *io_uring_validate_mmap_request(struct file *file,
>>>>                            loff_t pgoff, size_t sz)
>>>>    {
>>>> @@ -10191,12 +10266,23 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
>>>>        io_run_task_work();
>>>>          if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
>>>> -                   IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
>>>> +                   IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
>>>> +                   IORING_ENTER_REGISTERED_RING)))
>>>>            return -EINVAL;
>>>>    -    f = fdget(fd);
>>>> -    if (unlikely(!f.file))
>>>> -        return -EBADF;
>>>> +    if (flags & IORING_ENTER_REGISTERED_RING) {
>>>> +        struct io_uring_task *tctx = current->io_uring;
>>>> +
>>>> +        if (fd >= IO_RINGFD_REG_MAX || !tctx)
>>>> +            return -EINVAL;
>>>> +        f.file = tctx->registered_rings[fd];
>>>
>>> btw, array_index_nospec(), possibly not only here.
>>
>> Yeah, was thinking that earlier too in fact but forgot about it. Might
>> as well, though I don't think it's strictly required as it isn't a user
>> table.
> 
> I may have missed in what cases it's used, but shouldn't it be
> in all cases when we use a user passed index for array addressing?
> e.g. to protect from pre-caching a chunk of memory computed from
> an out-of-array malevolent index
> 
> I just don't see any relevant difference from normal file tables

Indeed, I guess it's the indexing that matters, not the table itself.
I'll make the edit.
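
For reference, the pattern is just clamping the index before the
dereference, so a mispredicted bounds check can't be used to
speculatively load from an out-of-bounds slot. A minimal sketch of what
the lookup ends up looking like (the helper name is illustrative, the
logic mirrors the patch):

#include <linux/nospec.h>

static struct file *io_ringfd_lookup(struct io_uring_task *tctx,
				     unsigned int fd)
{
	if (fd >= IO_RINGFD_REG_MAX)
		return NULL;
	/*
	 * Clamp fd to [0, IO_RINGFD_REG_MAX) even under speculative
	 * execution, so the mispredicted branch above can't be used to
	 * load from an attacker-chosen out-of-bounds offset.
	 */
	fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
	return tctx->registered_rings[fd];
}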

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-04  0:07               ` Jens Axboe
@ 2022-03-04 13:39                 ` Xiaoguang Wang
  2022-03-04 13:44                   ` Jens Axboe
  0 siblings, 1 reply; 28+ messages in thread
From: Xiaoguang Wang @ 2022-03-04 13:39 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: asml.silence

hi,

> On 3/3/22 2:19 PM, Jens Axboe wrote:
>> On 3/3/22 1:41 PM, Jens Axboe wrote:
>>> On 3/3/22 10:18 AM, Jens Axboe wrote:
>>>> On 3/3/22 9:31 AM, Jens Axboe wrote:
>>>>> On 3/3/22 7:40 AM, Jens Axboe wrote:
>>>>>> On 3/3/22 7:36 AM, Jens Axboe wrote:
>>>>>>> The only potential oddity here is that the fd passed back is not a
>>>>>>> legitimate fd. io_uring does support poll(2) on its file descriptor, so
>>>>>>> that could cause some confusion even if I don't think anyone actually
>>>>>>> does poll(2) on io_uring.
>>>>>> Side note - the only implication here is that we then likely can't make
>>>>>> the optimized behavior the default, it has to be an IORING_SETUP_REG
>>>>>> flag which tells us that the application is aware of this limitation.
>>>>>> Though I guess close(2) might mess with that too... Hmm.
>>>>> Not sure I can find a good approach for that. Tried out your patch and
>>>>> made some fixes:
>>>>>
>>>>> - Missing free on final tctx free
>>>>> - Rename registered_files to registered_rings
>>>>> - Fix off-by-ones in checking max registration count
>>>>> - Use kcalloc
>>>>> - Rename ENTER_FIXED_FILE -> ENTER_REGISTERED_RING
>>>>> - Don't pass in tctx to io_uring_unreg_ringfd()
>>>>> - Get rid of forward declaration for adding tctx node
>>>>> - Get rid of extra file pointer in io_uring_enter()
>>>>> - Fix deadlock in io_ringfd_register()
>>>>> - Use io_uring_rsrc_update rather than add a new struct type
>>>> - Allow multiple register/unregister instead of enforcing just 1 at the
>>>>    time
>>>> - Check for it really being a ring fd when registering
>>>>
>>>> For different batch counts, nice improvements are seen. Roughly:
>>>>
>>>> Batch==1	15% faster
>>>> Batch==2	13% faster
>>>> Batch==4	11% faster
>>>>
>>>> This is just in microbenchmarks where the fdget/fdput play a bigger
>>>> factor, but it will certainly help with real world situations where
>>>> batching is more limited than in benchmarks.
>>> In trying this out in applications, I think the better registration API
>>> is to allow io_uring to pick the offset. The application doesn't care,
>>> it's just a magic integer there. And if we allow io_uring to pick it,
>>> then that makes things a lot easier to deal with.
>>>
>>> For registration, pass in an array of io_uring_rsrc_update structs, just
>>> filling in the ring_fd in the data field. Return value is number of ring
>>> fds registered, and up->offset now contains the chosen offset for each
>>> of them.
>>>
>>> Unregister is the same struct, but just with offset filled in.
>>>
>>> For applications using io_uring, which is all of them as far as I'm
>>> aware, we can also easily hide this. This means we can get the optimized
>>> behavior by default.
>> Only complication here is if the ring is shared across threads or
>> processes. The thread one can be common, one thread doing submit and one
>> doing completions. That one is a bit harder to handle. Without
>> inheriting registered ring fds, not sure how it can be handled by
>> default. Should probably just introduce a new queue init helper, but
>> then that requires application changes to adopt...
>>
>> Ideas welcome!
> Don't see a way to do it by default, so I think it'll have to be opt-in.
> We could make it the default if you never shared a ring, which most
> people don't do, but we can't easily do so if it's ever shared between
> tasks or processes.
Before sending this patch, I also have thought about whether we can make
this enter_ring_fd be shared between threads and be default feature, and
found that it's hard :)

   1) first we need to ensure this ring fd registration always can be
unregistered, so this cache is tied to io_uring_task, then when thread
exits, we can ensure fput called correspondingly.

   2) we may also implement a file cache shared between threads in a
process, but this may introduce extra overhead, at least need locks to
protect modifications to this cache. If this cache is per thread, it
doesn't need any synchronizations.

So I suggest to make it be just simple and opt-in, and the registration
interface seems not complicated, a thread trying to submit sqes can
adopt it easily.
>
> With liburing, if you share, you share the io_uring struct as well. So
> it's hard to maintain a new ->enter_ring_fd in there. It'd be doable if
> we could reserve real fd == registered fd. Which is of course possible
> using xarray or similar, but that'll add extra overhead.
For liburing, we may need to add a fixed-file version of the helpers
which submit sqes or reap cqes, for example, io_uring_submit_fixed(),
which passes an enter_ring_fd.

>
> Anyway, current version below. Only real change here is allowing either
> specific offset or generated offset, depending on what the
> io_uring_rsrc_update->offset is set to. If set to -1U, then io_uring
> will find a free offset. If set to anything else, io_uring will use that
> index (as long as it's >=0 && < MAX).
Seems you forgot to attach the newest version, and also don't see a 
patch attachment.
Finally, thanks for your quick response and many code improvements, 
really appreciate it.

Regards,
Xiaoguang Wang
>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-04 13:39                 ` Xiaoguang Wang
@ 2022-03-04 13:44                   ` Jens Axboe
  2022-03-04 15:16                     ` Xiaoguang Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Jens Axboe @ 2022-03-04 13:44 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: asml.silence

On 3/4/22 6:39 AM, Xiaoguang Wang wrote:
> hi,
> 
>> On 3/3/22 2:19 PM, Jens Axboe wrote:
>>> On 3/3/22 1:41 PM, Jens Axboe wrote:
>>>> On 3/3/22 10:18 AM, Jens Axboe wrote:
>>>>> On 3/3/22 9:31 AM, Jens Axboe wrote:
>>>>>> On 3/3/22 7:40 AM, Jens Axboe wrote:
>>>>>>> On 3/3/22 7:36 AM, Jens Axboe wrote:
>>>>>>>> The only potential oddity here is that the fd passed back is not a
>>>>>>>> legitimate fd. io_uring does support poll(2) on its file descriptor, so
>>>>>>>> that could cause some confusion even if I don't think anyone actually
>>>>>>>> does poll(2) on io_uring.
>>>>>>> Side note - the only implication here is that we then likely can't make
>>>>>>> the optimized behavior the default, it has to be an IORING_SETUP_REG
>>>>>>> flag which tells us that the application is aware of this limitation.
>>>>>>> Though I guess close(2) might mess with that too... Hmm.
>>>>>> Not sure I can find a good approach for that. Tried out your patch and
>>>>>> made some fixes:
>>>>>>
>>>>>> - Missing free on final tctx free
>>>>>> - Rename registered_files to registered_rings
>>>>>> - Fix off-by-ones in checking max registration count
>>>>>> - Use kcalloc
>>>>>> - Rename ENTER_FIXED_FILE -> ENTER_REGISTERED_RING
>>>>>> - Don't pass in tctx to io_uring_unreg_ringfd()
>>>>>> - Get rid of forward declaration for adding tctx node
>>>>>> - Get rid of extra file pointer in io_uring_enter()
>>>>>> - Fix deadlock in io_ringfd_register()
>>>>>> - Use io_uring_rsrc_update rather than add a new struct type
>>>>> - Allow multiple register/unregister instead of enforcing just 1 at the
>>>>>    time
>>>>> - Check for it really being a ring fd when registering
>>>>>
>>>>> For different batch counts, nice improvements are seen. Roughly:
>>>>>
>>>>> Batch==1    15% faster
>>>>> Batch==2    13% faster
>>>>> Batch==4    11% faster
>>>>>
>>>>> This is just in microbenchmarks where the fdget/fdput play a bigger
>>>>> factor, but it will certainly help with real world situations where
>>>>> batching is more limited than in benchmarks.
>>>> In trying this out in applications, I think the better registration API
>>>> is to allow io_uring to pick the offset. The application doesn't care,
>>>> it's just a magic integer there. And if we allow io_uring to pick it,
>>>> then that makes things a lot easier to deal with.
>>>>
>>>> For registration, pass in an array of io_uring_rsrc_update structs, just
>>>> filling in the ring_fd in the data field. Return value is number of ring
>>>> fds registered, and up->offset now contains the chosen offset for each
>>>> of them.
>>>>
>>>> Unregister is the same struct, but just with offset filled in.
>>>>
>>>> For applications using io_uring, which is all of them as far as I'm
>>>> aware, we can also easily hide this. This means we can get the optimized
>>>> behavior by default.
>>> Only complication here is if the ring is shared across threads or
>>> processes. The thread one can be common, one thread doing submit and one
>>> doing completions. That one is a bit harder to handle. Without
>>> inheriting registered ring fds, not sure how it can be handled by
>>> default. Should probably just introduce a new queue init helper, but
>>> then that requires application changes to adopt...
>>>
>>> Ideas welcome!
>> Don't see a way to do it by default, so I think it'll have to be opt-in.
>> We could make it the default if you never shared a ring, which most
>> people don't do, but we can't easily do so if it's ever shared between
>> tasks or processes.

> Before sending this patch, I also have thought about whether we can
> make this enter_ring_fd be shared between threads and be default
> feature, and found that it's hard :)
>
>   1) first we need to ensure this ring fd registration always can be
>   unregistered, so this cache is tied to io_uring_task, then when
>   thread exits, we can ensure fput called correspondingly.
>
>   2) we may also implement a file cache shared between threads in a
>   process, but this may introduce extra overhead, at least need locks
>   to protect modifications to this cache. If this cache is per thread,
>   it doesn't need any synchronizations.
> 
> So I suggest to make it be just simple and opt-in, and the
> registration interface seems not complicated, a thread trying to
> submit sqes can adopt it easily.

Yes, I pretty much have come to that same conclusion myself...

>> With liburing, if you share, you share the io_uring struct as well. So
>> it's hard to maintain a new ->enter_ring_fd in there. It'd be doable if
>> we could reserve real fd == registered fd. Which is of course possible
>> using xarray or similar, but that'll add extra overhead.
>
>> For liburing, we may need to add a fixed-file version of the helpers
>> which submit sqes or reap cqes, for example, io_uring_submit_fixed(),
>> which passes an enter_ring_fd.

I'll take a look at liburing and see what we need to do there. I think
the sanest thing to do here is say that using a registered ring fd means
you cannot share the ring, ever. And then just have a
ring->enter_ring_fd which is normally just set to ring_fd when the ring
is setup, and if you register the ring fd, then we set it to whatever
the registered value is. Everything calling io_uring_enter() then just
needs to be modified to use ->enter_ring_fd instead of ->ring_fd.
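
Something along these lines on the liburing side. This is just a sketch
under the assumptions above: ->enter_ring_fd and the helper are proposed
additions rather than existing API, and __sys_io_uring_register() is
liburing's internal syscall wrapper:

static int io_uring_register_ring_fd(struct io_uring *ring)
{
	struct io_uring_rsrc_update up = {
		.offset	= -1U,		/* let the kernel pick the index */
		.data	= ring->ring_fd,
	};
	int ret;

	ret = __sys_io_uring_register(ring->ring_fd,
				      IORING_REGISTER_RING_FDS, &up, 1);
	if (ret != 1)
		return ret < 0 ? ret : -EINVAL;

	/*
	 * Assumed new member: every submit/wait helper would then pass
	 * ring->enter_ring_fd with IORING_ENTER_REGISTERED_RING set.
	 */
	ring->enter_ring_fd = up.offset;
	return 0;
}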

>> Anyway, current version below. Only real change here is allowing either
>> specific offset or generated offset, depending on what the
>> io_uring_rsrc_update->offset is set to. If set to -1U, then io_uring
>> will find a free offset. If set to anything else, io_uring will use that
>> index (as long as it's >=0 && < MAX).

> Seems you forgot to attach the newest version, and also don't see a
> patch attachment. Finally, thanks for your quick response and many
> code improvements, really appreciate it.

Oops, below now. How do you want to handle this patch? It's now a bit of
a mix of both of our stuff...


diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8f26c4602384..9180f122ecd7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -466,6 +466,11 @@ struct io_ring_ctx {
 	};
 };
 
+/*
+ * Arbitrary limit, can be raised if need be
+ */
+#define IO_RINGFD_REG_MAX 16
+
 struct io_uring_task {
 	/* submission side */
 	int			cached_refs;
@@ -481,6 +486,7 @@ struct io_uring_task {
 	struct io_wq_work_list	task_list;
 	struct io_wq_work_list	prior_task_list;
 	struct callback_head	task_work;
+	struct file		**registered_rings;
 	bool			task_running;
 };
 
@@ -8779,8 +8785,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
 	if (unlikely(!tctx))
 		return -ENOMEM;
 
+	tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
+					 sizeof(struct file *), GFP_KERNEL);
+	if (unlikely(!tctx->registered_rings)) {
+		kfree(tctx);
+		return -ENOMEM;
+	}
+
 	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
 	if (unlikely(ret)) {
+		kfree(tctx->registered_rings);
 		kfree(tctx);
 		return ret;
 	}
@@ -8789,6 +8803,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
 	if (IS_ERR(tctx->io_wq)) {
 		ret = PTR_ERR(tctx->io_wq);
 		percpu_counter_destroy(&tctx->inflight);
+		kfree(tctx->registered_rings);
 		kfree(tctx);
 		return ret;
 	}
@@ -8813,6 +8828,7 @@ void __io_uring_free(struct task_struct *tsk)
 	WARN_ON_ONCE(tctx->io_wq);
 	WARN_ON_ONCE(tctx->cached_refs);
 
+	kfree(tctx->registered_rings);
 	percpu_counter_destroy(&tctx->inflight);
 	kfree(tctx);
 	tsk->io_uring = NULL;
@@ -10035,6 +10051,139 @@ void __io_uring_cancel(bool cancel_all)
 	io_uring_cancel_generic(cancel_all, NULL);
 }
 
+void io_uring_unreg_ringfd(void)
+{
+	struct io_uring_task *tctx = current->io_uring;
+	int i;
+
+	for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
+		if (tctx->registered_rings[i]) {
+			fput(tctx->registered_rings[i]);
+			tctx->registered_rings[i] = NULL;
+		}
+	}
+}
+
+static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
+				     int start, int end)
+{
+	struct file *file;
+	int offset;
+
+	for (offset = start; offset < end; offset++) {
+		offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
+		if (tctx->registered_rings[offset])
+			continue;
+
+		file = fget(fd);
+		if (!file) {
+			return -EBADF;
+		} else if (file->f_op != &io_uring_fops) {
+			fput(file);
+			return -EOPNOTSUPP;
+		}
+		tctx->registered_rings[offset] = file;
+		return offset;
+	}
+
+	return -EBUSY;
+}
+
+/*
+ * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
+ * invocation. User passes in an array of struct io_uring_rsrc_update
+ * with ->data set to the ring_fd, and ->offset given for the desired
+ * index. If no index is desired, application may set ->offset == -1U
+ * and we'll find an available index. Returns number of entries
+ * successfully processed, or < 0 on error if none were processed.
+ */
+static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
+			      unsigned nr_args)
+{
+	struct io_uring_rsrc_update __user *arg = __arg;
+	struct io_uring_rsrc_update reg;
+	struct io_uring_task *tctx;
+	int ret, i;
+
+	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
+		return -EINVAL;
+
+	mutex_unlock(&ctx->uring_lock);
+	ret = io_uring_add_tctx_node(ctx);
+	mutex_lock(&ctx->uring_lock);
+	if (ret)
+		return ret;
+
+	tctx = current->io_uring;
+	for (i = 0; i < nr_args; i++) {
+		int start, end;
+
+		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		if (reg.offset == -1U) {
+			start = 0;
+			end = IO_RINGFD_REG_MAX;
+		} else {
+			if (reg.offset >= IO_RINGFD_REG_MAX) {
+				ret = -EINVAL;
+				break;
+			}
+			start = reg.offset;
+			end = start + 1;
+		}
+
+		ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
+		if (ret < 0)
+			break;
+
+		reg.offset = ret;
+		if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
+			fput(tctx->registered_rings[reg.offset]);
+			tctx->registered_rings[reg.offset] = NULL;
+			ret = -EFAULT;
+			break;
+		}
+	}
+
+	return i ? i : ret;
+}
+
+static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
+				unsigned nr_args)
+{
+	struct io_uring_rsrc_update __user *arg = __arg;
+	struct io_uring_task *tctx = current->io_uring;
+	struct io_uring_rsrc_update reg;
+	int ret = 0, i;
+
+	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
+		return -EINVAL;
+	if (!tctx)
+		return 0;
+
+	for (i = 0; i < nr_args; i++) {
+		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
+			ret = -EFAULT;
+			break;
+		}
+		if (reg.offset >= IO_RINGFD_REG_MAX) {
+			ret = -EINVAL;
+			break;
+		}
+
+		reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
+		if (tctx->registered_rings[reg.offset]) {
+			fput(tctx->registered_rings[reg.offset]);
+			tctx->registered_rings[reg.offset] = NULL;
+		}
+	}
+
+	return i ? i : ret;
+}
+
 static void *io_uring_validate_mmap_request(struct file *file,
 					    loff_t pgoff, size_t sz)
 {
@@ -10165,12 +10314,28 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	io_run_task_work();
 
 	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
-			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
+			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
+			       IORING_ENTER_REGISTERED_RING)))
 		return -EINVAL;
 
-	f = fdget(fd);
-	if (unlikely(!f.file))
-		return -EBADF;
+	/*
+	 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
+	 * need only dereference our task private array to find it.
+	 */
+	if (flags & IORING_ENTER_REGISTERED_RING) {
+		struct io_uring_task *tctx = current->io_uring;
+
+		if (fd < 0 || fd >= IO_RINGFD_REG_MAX || !tctx)
+			return -EINVAL;
+		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
+		f.file = tctx->registered_rings[fd];
+		if (unlikely(!f.file))
+			return -EBADF;
+	} else {
+		f = fdget(fd);
+		if (unlikely(!f.file))
+			return -EBADF;
+	}
 
 	ret = -EOPNOTSUPP;
 	if (unlikely(f.file->f_op != &io_uring_fops))
@@ -10244,7 +10409,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 out:
 	percpu_ref_put(&ctx->refs);
 out_fput:
-	fdput(f);
+	if (!(flags & IORING_ENTER_REGISTERED_RING))
+		fdput(f);
 	return submitted ? submitted : ret;
 }
 
@@ -11134,6 +11300,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_iowq_max_workers(ctx, arg);
 		break;
+	case IORING_REGISTER_RING_FDS:
+		ret = io_ringfd_register(ctx, arg, nr_args);
+		break;
+	case IORING_UNREGISTER_RING_FDS:
+		ret = io_ringfd_unregister(ctx, arg, nr_args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 649a4d7c241b..1814e698d861 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -9,11 +9,14 @@
 struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_cancel(bool cancel_all);
 void __io_uring_free(struct task_struct *tsk);
+void io_uring_unreg_ringfd(void);
 
 static inline void io_uring_files_cancel(void)
 {
-	if (current->io_uring)
+	if (current->io_uring) {
+		io_uring_unreg_ringfd();
 		__io_uring_cancel(false);
+	}
 }
 static inline void io_uring_task_cancel(void)
 {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 787f491f0d2a..42b2fe84dbcd 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -257,10 +257,11 @@ struct io_cqring_offsets {
 /*
  * io_uring_enter(2) flags
  */
-#define IORING_ENTER_GETEVENTS	(1U << 0)
-#define IORING_ENTER_SQ_WAKEUP	(1U << 1)
-#define IORING_ENTER_SQ_WAIT	(1U << 2)
-#define IORING_ENTER_EXT_ARG	(1U << 3)
+#define IORING_ENTER_GETEVENTS		(1U << 0)
+#define IORING_ENTER_SQ_WAKEUP		(1U << 1)
+#define IORING_ENTER_SQ_WAIT		(1U << 2)
+#define IORING_ENTER_EXT_ARG		(1U << 3)
+#define IORING_ENTER_REGISTERED_RING	(1U << 4)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -325,6 +326,10 @@ enum {
 	/* set/get max number of io-wq workers */
 	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,
 
+	/* register/unregister io_uring fd with the ring */
+	IORING_REGISTER_RING_FDS		= 20,
+	IORING_UNREGISTER_RING_FDS		= 21,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-04 13:44                   ` Jens Axboe
@ 2022-03-04 15:16                     ` Xiaoguang Wang
  2022-03-04 15:22                       ` Jens Axboe
  0 siblings, 1 reply; 28+ messages in thread
From: Xiaoguang Wang @ 2022-03-04 15:16 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: asml.silence

hi,

> On 3/4/22 6:39 AM, Xiaoguang Wang wrote:
>> hi,
>>
>>> On 3/3/22 2:19 PM, Jens Axboe wrote:
>>>> On 3/3/22 1:41 PM, Jens Axboe wrote:
>>>>> On 3/3/22 10:18 AM, Jens Axboe wrote:
>>>>>> On 3/3/22 9:31 AM, Jens Axboe wrote:
>>>>>>> On 3/3/22 7:40 AM, Jens Axboe wrote:
>>>>>>>> On 3/3/22 7:36 AM, Jens Axboe wrote:
>>>>>>>>> The only potential oddity here is that the fd passed back is not a
>>>>>>>>> legitimate fd. io_uring does support poll(2) on its file descriptor, so
>>>>>>>>> that could cause some confusion even if I don't think anyone actually
>>>>>>>>> does poll(2) on io_uring.
>>>>>>>> Side note - the only implication here is that we then likely can't make
>>>>>>>> the optimized behavior the default, it has to be an IORING_SETUP_REG
>>>>>>>> flag which tells us that the application is aware of this limitation.
>>>>>>>> Though I guess close(2) might mess with that too... Hmm.
>>>>>>> Not sure I can find a good approach for that. Tried out your patch and
>>>>>>> made some fixes:
>>>>>>>
>>>>>>> - Missing free on final tctx free
>>>>>>> - Rename registered_files to registered_rings
>>>>>>> - Fix off-by-ones in checking max registration count
>>>>>>> - Use kcalloc
>>>>>>> - Rename ENTER_FIXED_FILE -> ENTER_REGISTERED_RING
>>>>>>> - Don't pass in tctx to io_uring_unreg_ringfd()
>>>>>>> - Get rid of forward declaration for adding tctx node
>>>>>>> - Get rid of extra file pointer in io_uring_enter()
>>>>>>> - Fix deadlock in io_ringfd_register()
>>>>>>> - Use io_uring_rsrc_update rather than add a new struct type
>>>>>> - Allow multiple register/unregister instead of enforcing just 1 at the
>>>>>>     time
>>>>>> - Check for it really being a ring fd when registering
>>>>>>
>>>>>> For different batch counts, nice improvements are seen. Roughly:
>>>>>>
>>>>>> Batch==1    15% faster
>>>>>> Batch==2    13% faster
>>>>>> Batch==4    11% faster
>>>>>>
>>>>>> This is just in microbenchmarks where the fdget/fdput play a bigger
>>>>>> factor, but it will certainly help with real world situations where
>>>>>> batching is more limited than in benchmarks.
>>>>> In trying this out in applications, I think the better registration API
>>>>> is to allow io_uring to pick the offset. The application doesn't care,
>>>>> it's just a magic integer there. And if we allow io_uring to pick it,
>>>>> then that makes things a lot easier to deal with.
>>>>>
>>>>> For registration, pass in an array of io_uring_rsrc_update structs, just
>>>>> filling in the ring_fd in the data field. Return value is number of ring
>>>>> fds registered, and up->offset now contains the chosen offset for each
>>>>> of them.
>>>>>
>>>>> Unregister is the same struct, but just with offset filled in.
>>>>>
>>>>> For applications using io_uring, which is all of them as far as I'm
>>>>> aware, we can also easily hide this. This means we can get the optimized
>>>>> behavior by default.
>>>> Only complication here is if the ring is shared across threads or
>>>> processes. The thread one can be common, one thread doing submit and one
>>>> doing completions. That one is a bit harder to handle. Without
>>>> inheriting registered ring fds, not sure how it can be handled by
>>>> default. Should probably just introduce a new queue init helper, but
>>>> then that requires application changes to adopt...
>>>>
>>>> Ideas welcome!
>>> Don't see a way to do it by default, so I think it'll have to be opt-in.
>>> We could make it the default if you never shared a ring, which most
>>> people don't do, but we can't easily do so if it's ever shared between
>>> tasks or processes.
>> Before sending this patch, I also thought about whether we can
>> make this enter_ring_fd shared between threads and a default
>> feature, and found that it's hard :)
>>
>>    1) First we need to ensure this ring fd registration can always be
>>    unregistered, so this cache is tied to io_uring_task; then when a
>>    thread exits, we can ensure fput() is called correspondingly.
>>
>>    2) We could also implement a file cache shared between threads in a
>>    process, but this would introduce extra overhead, at least needing
>>    locks to protect modifications to this cache. If this cache is
>>    per-thread, it doesn't need any synchronization.
>>
>> So I suggest making it simple and opt-in; the registration interface
>> is not complicated, and a thread trying to submit sqes can adopt it
>> easily.
> Yes, I pretty much have come to that same conclusion myself...
>
>>> With liburing, if you share, you share the io_uring struct as well. So
>>> it's hard to maintain a new ->enter_ring_fd in there. It'd be doable if
>>> we could reserve real fd == . Which is of course possible
>>> using xarray or similar, but that'll add extra overhead.
>> For liburing, we may need to add fixed-file versions of the helpers
>> which submit sqes or reap cqes, for example io_uring_submit_fixed(),
>> which passes an enter_ring_fd.
> I'll take a look at liburing and see what we need to do there. I think
> the sanest thing to do here is say that using a registered ring fd means
> you cannot share the ring, ever. And then just have a
> ring->enter_ring_fd which is normally just set to ring_fd when the ring
> is setup, and if you register the ring fd, then we set it to whatever
> the registered value is. Everything calling io_uring_enter() then just
> needs to be modified to use ->enter_ring_fd instead of ->ring_fd.
OK, looking forward to using this API.
>
>>> Anyway, current version below. Only real change here is allowing either
>>> specific offset or generated offset, depending on what the
>>> io_uring_rsrc_update->offset is set to. If set to -1U, then io_uring
>>> will find a free offset. If set to anything else, io_uring will use that
>>> index (as long as it's >=0 && < MAX).
>> It seems you forgot to attach the newest version; I also don't see a
>> patch attachment. Finally, thanks for your quick response and the many
>> code improvements, really appreciate it.
> Oops, below now. How do you want to handle this patch? It's now a bit of
> a mix of both of our stuff...
Since you have rewritten most of my original patch and it now looks
much better, I would suggest just adding my Reported-by :)

Regards,
Xiaoguang Wang
>
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 8f26c4602384..9180f122ecd7 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -466,6 +466,11 @@ struct io_ring_ctx {
>   	};
>   };
>   
> +/*
> + * Arbitrary limit, can be raised if need be
> + */
> +#define IO_RINGFD_REG_MAX 16
> +
>   struct io_uring_task {
>   	/* submission side */
>   	int			cached_refs;
> @@ -481,6 +486,7 @@ struct io_uring_task {
>   	struct io_wq_work_list	task_list;
>   	struct io_wq_work_list	prior_task_list;
>   	struct callback_head	task_work;
> +	struct file		**registered_rings;
>   	bool			task_running;
>   };
>   
> @@ -8779,8 +8785,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
>   	if (unlikely(!tctx))
>   		return -ENOMEM;
>   
> +	tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
> +					 sizeof(struct file *), GFP_KERNEL);
> +	if (unlikely(!tctx->registered_rings)) {
> +		kfree(tctx);
> +		return -ENOMEM;
> +	}
> +
>   	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
>   	if (unlikely(ret)) {
> +		kfree(tctx->registered_rings);
>   		kfree(tctx);
>   		return ret;
>   	}
> @@ -8789,6 +8803,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
>   	if (IS_ERR(tctx->io_wq)) {
>   		ret = PTR_ERR(tctx->io_wq);
>   		percpu_counter_destroy(&tctx->inflight);
> +		kfree(tctx->registered_rings);
>   		kfree(tctx);
>   		return ret;
>   	}
> @@ -8813,6 +8828,7 @@ void __io_uring_free(struct task_struct *tsk)
>   	WARN_ON_ONCE(tctx->io_wq);
>   	WARN_ON_ONCE(tctx->cached_refs);
>   
> +	kfree(tctx->registered_rings);
>   	percpu_counter_destroy(&tctx->inflight);
>   	kfree(tctx);
>   	tsk->io_uring = NULL;
> @@ -10035,6 +10051,139 @@ void __io_uring_cancel(bool cancel_all)
>   	io_uring_cancel_generic(cancel_all, NULL);
>   }
>   
> +void io_uring_unreg_ringfd(void)
> +{
> +	struct io_uring_task *tctx = current->io_uring;
> +	int i;
> +
> +	for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
> +		if (tctx->registered_rings[i]) {
> +			fput(tctx->registered_rings[i]);
> +			tctx->registered_rings[i] = NULL;
> +		}
> +	}
> +}
> +
> +static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
> +				     int start, int end)
> +{
> +	struct file *file;
> +	int offset;
> +
> +	for (offset = start; offset < end; offset++) {
> +		offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
> +		if (tctx->registered_rings[offset])
> +			continue;
> +
> +		file = fget(fd);
> +		if (!file) {
> +			return -EBADF;
> +		} else if (file->f_op != &io_uring_fops) {
> +			fput(file);
> +			return -EOPNOTSUPP;
> +		}
> +		tctx->registered_rings[offset] = file;
> +		return offset;
> +	}
> +
> +	return -EBUSY;
> +}
> +
> +/*
> + * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
> + * invocation. User passes in an array of struct io_uring_rsrc_update
> + * with ->data set to the ring_fd, and ->offset given for the desired
> + * index. If no index is desired, application may set ->offset == -1U
> + * and we'll find an available index. Returns number of entries
> + * successfully processed, or < 0 on error if none were processed.
> + */
> +static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
> +			      unsigned nr_args)
> +{
> +	struct io_uring_rsrc_update __user *arg = __arg;
> +	struct io_uring_rsrc_update reg;
> +	struct io_uring_task *tctx;
> +	int ret, i;
> +
> +	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
> +		return -EINVAL;
> +
> +	mutex_unlock(&ctx->uring_lock);
> +	ret = io_uring_add_tctx_node(ctx);
> +	mutex_lock(&ctx->uring_lock);
> +	if (ret)
> +		return ret;
> +
> +	tctx = current->io_uring;
> +	for (i = 0; i < nr_args; i++) {
> +		int start, end;
> +
> +		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +
> +		if (reg.offset == -1U) {
> +			start = 0;
> +			end = IO_RINGFD_REG_MAX;
> +		} else {
> +			if (reg.offset >= IO_RINGFD_REG_MAX) {
> +				ret = -EINVAL;
> +				break;
> +			}
> +			start = reg.offset;
> +			end = start + 1;
> +		}
> +
> +		ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
> +		if (ret < 0)
> +			break;
> +
> +		reg.offset = ret;
> +		if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
> +			fput(tctx->registered_rings[reg.offset]);
> +			tctx->registered_rings[reg.offset] = NULL;
> +			ret = -EFAULT;
> +			break;
> +		}
> +	}
> +
> +	return i ? i : ret;
> +}
> +
> +static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
> +				unsigned nr_args)
> +{
> +	struct io_uring_rsrc_update __user *arg = __arg;
> +	struct io_uring_task *tctx = current->io_uring;
> +	struct io_uring_rsrc_update reg;
> +	int ret = 0, i;
> +
> +	if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
> +		return -EINVAL;
> +	if (!tctx)
> +		return 0;
> +
> +	for (i = 0; i < nr_args; i++) {
> +		if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +		if (reg.offset >= IO_RINGFD_REG_MAX) {
> +			ret = -EINVAL;
> +			break;
> +		}
> +
> +		reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
> +		if (tctx->registered_rings[reg.offset]) {
> +			fput(tctx->registered_rings[reg.offset]);
> +			tctx->registered_rings[reg.offset] = NULL;
> +		}
> +	}
> +
> +	return i ? i : ret;
> +}
> +
>   static void *io_uring_validate_mmap_request(struct file *file,
>   					    loff_t pgoff, size_t sz)
>   {
> @@ -10165,12 +10314,28 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
>   	io_run_task_work();
>   
>   	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
> -			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
> +			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
> +			       IORING_ENTER_REGISTERED_RING)))
>   		return -EINVAL;
>   
> -	f = fdget(fd);
> -	if (unlikely(!f.file))
> -		return -EBADF;
> +	/*
> +	 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
> +	 * need only dereference our task private array to find it.
> +	 */
> +	if (flags & IORING_ENTER_REGISTERED_RING) {
> +		struct io_uring_task *tctx = current->io_uring;
> +
> +		if (fd < 0 || fd >= IO_RINGFD_REG_MAX || !tctx)
> +			return -EINVAL;
> +		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
> +		f.file = tctx->registered_rings[fd];
> +		if (unlikely(!f.file))
> +			return -EBADF;
> +	} else {
> +		f = fdget(fd);
> +		if (unlikely(!f.file))
> +			return -EBADF;
> +	}
>   
>   	ret = -EOPNOTSUPP;
>   	if (unlikely(f.file->f_op != &io_uring_fops))
> @@ -10244,7 +10409,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
>   out:
>   	percpu_ref_put(&ctx->refs);
>   out_fput:
> -	fdput(f);
> +	if (!(flags & IORING_ENTER_REGISTERED_RING))
> +		fdput(f);
>   	return submitted ? submitted : ret;
>   }
>   
> @@ -11134,6 +11300,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
>   			break;
>   		ret = io_register_iowq_max_workers(ctx, arg);
>   		break;
> +	case IORING_REGISTER_RING_FDS:
> +		ret = io_ringfd_register(ctx, arg, nr_args);
> +		break;
> +	case IORING_UNREGISTER_RING_FDS:
> +		ret = io_ringfd_unregister(ctx, arg, nr_args);
> +		break;
>   	default:
>   		ret = -EINVAL;
>   		break;
> diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
> index 649a4d7c241b..1814e698d861 100644
> --- a/include/linux/io_uring.h
> +++ b/include/linux/io_uring.h
> @@ -9,11 +9,14 @@
>   struct sock *io_uring_get_socket(struct file *file);
>   void __io_uring_cancel(bool cancel_all);
>   void __io_uring_free(struct task_struct *tsk);
> +void io_uring_unreg_ringfd(void);
>   
>   static inline void io_uring_files_cancel(void)
>   {
> -	if (current->io_uring)
> +	if (current->io_uring) {
> +		io_uring_unreg_ringfd();
>   		__io_uring_cancel(false);
> +	}
>   }
>   static inline void io_uring_task_cancel(void)
>   {
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 787f491f0d2a..42b2fe84dbcd 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -257,10 +257,11 @@ struct io_cqring_offsets {
>   /*
>    * io_uring_enter(2) flags
>    */
> -#define IORING_ENTER_GETEVENTS	(1U << 0)
> -#define IORING_ENTER_SQ_WAKEUP	(1U << 1)
> -#define IORING_ENTER_SQ_WAIT	(1U << 2)
> -#define IORING_ENTER_EXT_ARG	(1U << 3)
> +#define IORING_ENTER_GETEVENTS		(1U << 0)
> +#define IORING_ENTER_SQ_WAKEUP		(1U << 1)
> +#define IORING_ENTER_SQ_WAIT		(1U << 2)
> +#define IORING_ENTER_EXT_ARG		(1U << 3)
> +#define IORING_ENTER_REGISTERED_RING	(1U << 4)
>   
>   /*
>    * Passed in for io_uring_setup(2). Copied back with updated info on success
> @@ -325,6 +326,10 @@ enum {
>   	/* set/get max number of io-wq workers */
>   	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,
>   
> +	/* register/unregister io_uring fd with the ring */
> +	IORING_REGISTER_RING_FDS		= 20,
> +	IORING_UNREGISTER_RING_FDS		= 21,
> +
>   	/* this goes last */
>   	IORING_REGISTER_LAST
>   };
>
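For illustration, a minimal userspace sketch of the intended flow,
assuming uapi headers that carry the additions above and using raw
syscall(2) wrappers (a sketch only, not part of the patch):

/* Register the ring fd once, then enter via the returned offset
 * instead of the real fd. Assumes IORING_REGISTER_RING_FDS,
 * struct io_uring_rsrc_update and IORING_ENTER_REGISTERED_RING
 * as defined in the diff above. */
#include <errno.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int register_ring_fd(int ring_fd)
{
	struct io_uring_rsrc_update up;

	memset(&up, 0, sizeof(up));
	up.offset = -1U;	/* let the kernel pick a free slot */
	up.data = ring_fd;

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_RING_FDS, &up, 1) != 1)
		return -errno;
	return up.offset;	/* pass this as 'fd' from now on */
}

static int enter_registered(int reg_offset, unsigned int to_submit)
{
	/* the kernel skips fdget/fdput for this call */
	return syscall(__NR_io_uring_enter, reg_offset, to_submit, 0U,
		       IORING_ENTER_REGISTERED_RING, NULL, 0UL);
}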


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-04 15:16                     ` Xiaoguang Wang
@ 2022-03-04 15:22                       ` Jens Axboe
  2022-03-08  8:38                         ` Xiaoguang Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Jens Axboe @ 2022-03-04 15:22 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: asml.silence

>> I'll take a look at liburing and see what we need to do there. I think
>> the sanest thing to do here is say that using a registered ring fd means
>> you cannot share the ring, ever. And then just have a
>> ring->enter_ring_fd which is normally just set to ring_fd when the ring
>> is setup, and if you register the ring fd, then we set it to whatever
>> the registered value is. Everything calling io_uring_enter() then just
>> needs to be modified to use ->enter_ring_fd instead of ->ring_fd.
> OK, looking forward to using this API.

Can you take a look at the registered-ring branch for liburing:

https://git.kernel.dk/cgit/liburing/log/?h=registered-ring

which has the basic plumbing for it. Comments (or patches) welcome!

Few things I don't really love:

1) You need to call io_uring_register_ring_fd() after setting up the
   ring. We could provide init helpers for that, which just do queue
   init and then register ring. Maybe that'd make it more likely to get
   picked up by applications.

2) For the setup where you do share the ring between a submitter and
   reaper, we need to ensure that the registered ring fd is the same
   between both of them. We need a helper for that. It's basically the
   same as io_uring_register_ring_fd(), but we need the specific offset.
   And if that fails with -EBUSY, we should just turn off
   INT_FLAG_REG_RING for the ring and you don't get the registered fd
   for either of them. At least it can be handled transparently.
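A rough sketch of what such a helper might look like, in terms of the
uapi from the kernel patch (io_uring_register_ring_fd_at() is a made-up
name; enter_ring_fd and int_flags are the struct io_uring fields from
the branch above, and __sys_io_uring_register() is liburing's internal
syscall wrapper):

static int io_uring_register_ring_fd_at(struct io_uring *ring,
					unsigned int offset)
{
	struct io_uring_rsrc_update up = {
		.offset	= offset,	/* specific slot, not -1U */
		.data	= ring->ring_fd,
	};
	int ret;

	ret = __sys_io_uring_register(ring->ring_fd,
				      IORING_REGISTER_RING_FDS, &up, 1);
	if (ret == 1) {
		ring->enter_ring_fd = up.offset;
		ring->int_flags |= INT_FLAG_REG_RING;
		return 0;
	}
	/* e.g. -EBUSY: slot taken, fall back to the real fd for both */
	ring->enter_ring_fd = ring->ring_fd;
	ring->int_flags &= ~INT_FLAG_REG_RING;
	return ret;
}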

>>>> Anyway, current version below. Only real change here is allowing either
>>>> specific offset or generated offset, depending on what the
>>>> io_uring_rsrc_update->offset is set to. If set to -1U, then io_uring
>>>> will find a free offset. If set to anything else, io_uring will use that
>>>> index (as long as it's >=0 && < MAX).
>>> It seems you forgot to attach the newest version; I also don't see a
>>> patch attachment. Finally, thanks for your quick response and the many
>>> code improvements, really appreciate it.
>> Oops, below now. How do you want to handle this patch? It's now a bit of
>> a mix of both of our stuff...
> Since you have rewritten most of my original patch and it now looks
> much better, I would suggest just adding my Reported-by :)

OK I'll post it, but Co-developed-by is probably a better one.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-04 15:22                       ` Jens Axboe
@ 2022-03-08  8:38                         ` Xiaoguang Wang
  2022-03-08 13:10                           ` Jens Axboe
  0 siblings, 1 reply; 28+ messages in thread
From: Xiaoguang Wang @ 2022-03-08  8:38 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: asml.silence

hi,

>>> I'll take a look at liburing and see what we need to do there. I think
>>> the sanest thing to do here is say that using a registered ring fd means
>>> you cannot share the ring, ever. And then just have a
>>> ring->enter_ring_fd which is normally just set to ring_fd when the ring
>>> is setup, and if you register the ring fd, then we set it to whatever
>>> the registered value is. Everything calling io_uring_enter() then just
>>> needs to be modified to use ->enter_ring_fd instead of ->ring_fd.
>> OK, looking forward to using this API.
> Can you take a look at the registered-ring branch for liburing:
>
> https://git.kernel.dk/cgit/liburing/log/?h=registered-ring
>
> which has the basic plumbing for it. Comments (or patches) welcome!
Sorry for the late reply, I spent time reading your patch today.
Basically it looks OK; there is one minor issue in "Add preliminary
support for using a registered ring fd":
@@ -417,6 +425,10 @@ struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)

  int __io_uring_sqring_wait(struct io_uring *ring)
  {
-    return  ____sys_io_uring_enter(ring->ring_fd, 0, 0,
-                       IORING_ENTER_SQ_WAIT, NULL);
+    int flags = IORING_ENTER_SQ_WAIT;
+
+    if (ring->int_flags & INT_FLAG_REG_RING)
+        flags |= IORING_ENTER_REGISTERED_RING;
+
+    return  ____sys_io_uring_enter(ring->ring_fd, 0, 0, flags, NULL);
  }

Here it should be enter_ring_fd.
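That is, with the fix applied the helper would presumably read:

int __io_uring_sqring_wait(struct io_uring *ring)
{
	int flags = IORING_ENTER_SQ_WAIT;

	if (ring->int_flags & INT_FLAG_REG_RING)
		flags |= IORING_ENTER_REGISTERED_RING;

	/* enter_ring_fd, not ring_fd, once the ring fd is registered */
	return ____sys_io_uring_enter(ring->enter_ring_fd, 0, 0, flags, NULL);
}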

>
> Few things I don't really love:
>
> 1) You need to call io_uring_register_ring_fd() after setting up the
>     ring. We could provide init helpers for that, which just do queue
>     init and then register ring. Maybe that'd make it more likely to get
>     picked up by applications.
Agree, that'd be better in some cases, but considering that the
capacity of the ring fd cache is currently just 16, I'd suggest
letting users make their own decisions: some ring fds may fail to
allocate an empty slot, while others don't need one at all. For
example, a ring fd with sqpoll enabled may not need this feature.

>
> 2) For the setup where you do share the ring between a submitter and
>     reaper, we need to ensure that the registered ring fd is the same
>     between both of them. We need a helper for that. It's basically the
>     same as io_uring_register_ring_fd(), but we need the specific offset.
>     And if that fails with -EBUSY, we should just turn off
>     INT_FLAG_REG_RING for the ring and you don't get the registered fd
>     for either of them. At least it can be handled transparently.
Storing enter_ring_fd in struct io_uring doesn't seem good; struct
io_uring is a shared struct, and as you say, different threads that
share one ring fd may have different offsets in the ring fd cache. I
have two suggestions:
1) Threads keep their own offset in the ring fd cache and pass it to
io_uring_submit(), which may look ugly :) (see the sketch below)
2) Define enter_ring_fd in struct io_uring to be thread_local; then
your patches don't need any modifications.
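For suggestion 1, a minimal sketch of what such a helper might look
like (io_uring_submit_fixed() is a hypothetical name, and this ignores
the sq_ring_needs_enter()/sqpoll handling of the real io_uring_submit()):

static int io_uring_submit_fixed(struct io_uring *ring, int enter_ring_fd)
{
	unsigned int submitted = __io_uring_flush_sq(ring);

	/* the caller passes its own per-thread registered offset */
	return ____sys_io_uring_enter(enter_ring_fd, submitted, 0,
				      IORING_ENTER_REGISTERED_RING, NULL);
}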

Regards,
Xiaoguang Wang
>
>>>>> Anyway, current version below. Only real change here is allowing either
>>>>> specific offset or generated offset, depending on what the
>>>>> io_uring_rsrc_update->offset is set to. If set to -1U, then io_uring
>>>>> will find a free offset. If set to anything else, io_uring will use that
>>>>> index (as long as it's >=0 && < MAX).
>>>> It seems you forgot to attach the newest version; I also don't see a
>>>> patch attachment. Finally, thanks for your quick response and the many
>>>> code improvements, really appreciate it.
>>> Oops, below now. How do you want to handle this patch? It's now a bit of
>>> a mix of both of our stuff...
>> Since you have rewritten most of my original patch and it now looks
>> much better, I would suggest just adding my Reported-by :)
> OK I'll post it, but Co-developed-by is probably a better one.
>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-08  8:38                         ` Xiaoguang Wang
@ 2022-03-08 13:10                           ` Jens Axboe
  0 siblings, 0 replies; 28+ messages in thread
From: Jens Axboe @ 2022-03-08 13:10 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: asml.silence

On 3/8/22 1:38 AM, Xiaoguang Wang wrote:
> hi,
> 
>>>> I'll take a look at liburing and see what we need to do there. I think
>>>> the sanest thing to do here is say that using a registered ring fd means
>>>> you cannot share the ring, ever. And then just have a
>>>> ring->enter_ring_fd which is normally just set to ring_fd when the ring
>>>> is setup, and if you register the ring fd, then we set it to whatever
>>>> the registered value is. Everything calling io_uring_enter() then just
>>>> needs to be modified to use ->enter_ring_fd instead of ->ring_fd.
>>> OK, looking forward to using this API.
>> Can you take a look at the registered-ring branch for liburing:
>>
>> https://git.kernel.dk/cgit/liburing/log/?h=registered-ring
>>
>> which has the basic plumbing for it. Comments (or patches) welcome!
> Sorry for the late reply, I spent time reading your patch today. Basically it looks OK;
> there is one minor issue in "Add preliminary support for using a registered ring fd":
> @@ -417,6 +425,10 @@ struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
> 
>  int __io_uring_sqring_wait(struct io_uring *ring)
>  {
> -    return  ____sys_io_uring_enter(ring->ring_fd, 0, 0,
> -                       IORING_ENTER_SQ_WAIT, NULL);
> +    int flags = IORING_ENTER_SQ_WAIT;
> +
> +    if (ring->int_flags & INT_FLAG_REG_RING)
> +        flags |= IORING_ENTER_REGISTERED_RING;
> +
> +    return  ____sys_io_uring_enter(ring->ring_fd, 0, 0, flags, NULL);
>  }
> 
> Here it should be enter_ring_fd.

Ah good catch, I've fixed that up.

>> Few things I don't really love:
>>
>> 1) You need to call io_uring_register_ring_fd() after setting up the
>>     ring. We could provide init helpers for that, which just do queue
>>     init and then register ring. Maybe that'd make it more likely to get
>>     picked up by applications.
> Agree, that'd be better in some cases, but considering that the
> capacity of the ring fd cache is currently just 16, I'd suggest
> letting users make their own decisions: some ring fds may fail to
> allocate an empty slot, while others don't need one at all. For
> example, a ring fd with sqpoll enabled may not need this feature.

Agree, that's the route I ended up taking too.

>> 2) For the setup where you do share the ring between a submitter and
>>     reaper, we need to ensure that the registered ring fd is the same
>>     between both of them. We need a helper for that. It's basically the
>>     same as io_uring_register_ring_fd(), but we need the specific offset.
>>     And if that fails with -EBUSY, we should just turn off
>>     INT_FLAG_REG_RING for the ring and you don't get the registered fd
>>     for either of them. At least it can be handled transparently.
> Storing enter_ring_fd in struct io_uring doesn't seem good; struct
> io_uring is a shared struct, and as you say, different threads that
> share one ring fd may have different offsets in the ring fd cache. I
> have two suggestions:
> 1) Threads keep their own offset in the ring fd cache and pass it to
> io_uring_submit(), which may look ugly :)
> 2) Define enter_ring_fd in struct io_uring to be thread_local; then
> your patches don't need any modifications.

Maybe I'm missing something here, but I don't see how your number 2 is
possible?

I think for now I'll just keep it as-is. Yes, then you can't use a
registered ring fd if you share the ring between threads, but so be it.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] io_uring: add io_uring_enter(2) fixed file support
  2022-03-03 14:36   ` Jens Axboe
  2022-03-03 14:40     ` Jens Axboe
@ 2022-04-21 14:16     ` Hao Xu
  1 sibling, 0 replies; 28+ messages in thread
From: Hao Xu @ 2022-04-21 14:16 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring; +Cc: asml.silence

Hi all,

On 3/3/22 10:36 PM, Jens Axboe wrote:
> On 3/3/22 6:38 AM, Jens Axboe wrote:
>> On 3/2/22 10:28 PM, Xiaoguang Wang wrote:
>>> IORING_REGISTER_FILES is a good feature to reduce fget/fput overhead for
>>> each IO we do on a file, but one place is still left: io_uring_enter(2).
>>> In io_uring_enter(2), it still fget/fputs the io_uring fd. I have observed
>>> this overhead in some of our internal coroutine implementations based on
>>> io_uring with low submit batch counts. To totally remove fget/fput overhead
>>> in io_uring, we may add a small struct file cache in io_uring_task and add
>>> a new IORING_ENTER_FIXED_FILE flag. Currently the capacity of this file
>>> cache is 16, which I think may be enough; also note that this cache is
>>> per-thread.
>> Would indeed be nice to get rid of, can be a substantial amount of time
>> wasted in fdget/fdput. Does this resolve dependencies correctly if
>> someone passes the ring fd? Adding ring registration to test/ring-leak.c
>> from the liburing repo would be a useful exercise.
> Seems to pass that fine, but I did miss on first read through that you
> add that hook to files_cancel() which should break that dependency.
>
> Since I think this is a potentially big win for certain workloads, maybe
> we should consider making this easier to use? I don't think we
> necessarily need to tie this to the regular file registration. What if
> we instead added a SETUP flag for this, and just return the internal
> offset for that case? Then we don't need an enter flag, we don't need to
> add register/unregister opcodes for it.
[1]
>
> This does pose a problem when we fill the array. We can easily go beyond
> 16 here, that's just an arbitrary limit, but at some point we do have to
> handle the case where SETUP_REGISTERED (or whatever we call it) can't
> get a slot. I think we just clear the flag and setup the fd normally in
> that case. The user doesn't need to know, all the application needs to
> care about is that it can use the passed-back 'fd' to call the other
> io_uring functions.
>
> The only potential oddity here is that the fd passed back is not a
> legitimate fd. io_uring does support poll(2) on its file descriptor,
[2]
> so
> that could cause some confusion even if I don't think anyone actually
> does poll(2) on io_uring.
>
> What do you think?

Can we use something like a heap based on the registered_rings array
so that we can return the real fd to userspace at the same time? The
disadvantage is that the time cost is O(log n) for each fd
registration/search. Then we can achieve [1] and avoid [2].
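One possible reading of that idea, purely as an illustration (nothing
here is settled in the thread): keep the per-task entries sorted by the
real fd, so io_uring_enter() can keep taking the real ring fd and still
find the cached struct file in O(log n):

struct ring_reg_ent {
	int		fd;	/* real ring fd, array kept sorted by this */
	struct file	*file;
};

static struct file *lookup_registered_ring(struct ring_reg_ent *ents,
					   int nr, int fd)
{
	int lo = 0, hi = nr - 1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;

		if (ents[mid].fd == fd)
			return ents[mid].file;
		if (ents[mid].fd < fd)
			lo = mid + 1;
		else
			hi = mid - 1;
	}
	return NULL;	/* not registered, fall back to fdget() */
}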


Regards,

Hao

>

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread

Thread overview: 28+ messages
2022-03-03  5:28 [PATCH] io_uring: add io_uring_enter(2) fixed file support Xiaoguang Wang
2022-03-03  8:56 ` Hao Xu
2022-03-03 13:38 ` Jens Axboe
2022-03-03 14:36   ` Jens Axboe
2022-03-03 14:40     ` Jens Axboe
2022-03-03 16:31       ` Jens Axboe
2022-03-03 17:18         ` Jens Axboe
2022-03-03 20:41           ` Jens Axboe
2022-03-03 21:19             ` Jens Axboe
2022-03-04  0:07               ` Jens Axboe
2022-03-04 13:39                 ` Xiaoguang Wang
2022-03-04 13:44                   ` Jens Axboe
2022-03-04 15:16                     ` Xiaoguang Wang
2022-03-04 15:22                       ` Jens Axboe
2022-03-08  8:38                         ` Xiaoguang Wang
2022-03-08 13:10                           ` Jens Axboe
2022-03-03 22:24             ` Vito Caputo
2022-03-03 22:26               ` Jens Axboe
2022-03-04  1:49         ` Pavel Begunkov
2022-03-04  2:18           ` Jens Axboe
2022-03-04  2:28             ` Pavel Begunkov
2022-03-04  2:35               ` Pavel Begunkov
2022-03-04  2:43               ` Jens Axboe
2022-03-04  1:52         ` Pavel Begunkov
2022-03-04  2:19           ` Jens Axboe
2022-03-04  2:39             ` Pavel Begunkov
2022-03-04  3:03               ` Jens Axboe
2022-04-21 14:16     ` Hao Xu
