All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] io_uring: avoid ring quiesce in io_uring_register for eventfd opcodes
@ 2022-02-03 15:11 Usama Arif
  2022-02-03 15:11 ` [PATCH 1/2] io_uring: avoid ring quiesce while registering/unregistering eventfd Usama Arif
  2022-02-03 15:11 ` [PATCH 2/2] io_uring: avoid ring quiesce for IORING_REGISTER_EVENTFD_ASYNC Usama Arif
  0 siblings, 2 replies; 9+ messages in thread
From: Usama Arif @ 2022-02-03 15:11 UTC (permalink / raw)
  To: io-uring, axboe, asml.silence, linux-kernel; +Cc: fam.zheng, Usama Arif

This is done by creating a new RCU data structure (io_ev_fd) as part of
io_ring_ctx that holds the eventfd_ctx, with reads to the structure protected
by rcu_read_lock and writes (register/unregister calls) protected by a mutex.

With the above approach ring quiesce can be avoided which is much more
expensive than using RCU lock. On the system tested, io_uring_register with
IORING_REGISTER_EVENTFD takes less than 1ms with RCU lock, compared to 15ms
before with ring quiesce.

The first patch creates the RCU protected data structure and removes ring
quiesce for IORING_REGISTER_EVENTFD and IORING_UNREGISTER_EVENTFD.

The second patch builds on top of the first patch and removes ring quiesce
for IORING_REGISTER_EVENTFD_ASYNC.

Usama Arif (2):
  io_uring: avoid ring quiesce while registering/unregistering eventfd
  io_uring: avoid ring quiesce for IORING_REGISTER_EVENTFD_ASYNC

 fs/io_uring.c | 125 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 92 insertions(+), 33 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 1/2] io_uring: avoid ring quiesce while registering/unregistering eventfd
  2022-02-03 15:11 [PATCH 0/2] io_uring: avoid ring quiesce in io_uring_register for eventfd opcodes Usama Arif
@ 2022-02-03 15:11 ` Usama Arif
  2022-02-03 15:48   ` Pavel Begunkov
  2022-02-03 15:55   ` Jens Axboe
  2022-02-03 15:11 ` [PATCH 2/2] io_uring: avoid ring quiesce for IORING_REGISTER_EVENTFD_ASYNC Usama Arif
  1 sibling, 2 replies; 9+ messages in thread
From: Usama Arif @ 2022-02-03 15:11 UTC (permalink / raw)
  To: io-uring, axboe, asml.silence, linux-kernel; +Cc: fam.zheng, Usama Arif

This is done by creating a new RCU data structure (io_ev_fd) as part of
io_ring_ctx that holds the eventfd_ctx.

The function io_eventfd_signal is executed under rcu_read_lock with a
single rcu_dereference to io_ev_fd so that if another thread unregisters
the eventfd while io_eventfd_signal is still being executed, the
eventfd_signal for which io_eventfd_signal was called completes
successfully.

The process of registering/unregistering eventfd is done under a lock
so multiple threads don't enter a race condition while
registering/unregistering eventfd.

With the above approach ring quiesce can be avoided which is much more
expensive than using RCU lock. On the system tested, io_uring_register with
IORING_REGISTER_EVENTFD takes less than 1ms with RCU lock, compared to 15ms
before with ring quiesce.

Signed-off-by: Usama Arif <usama.arif@bytedance.com>
---
 fs/io_uring.c | 103 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 80 insertions(+), 23 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2e04f718319d..f07cfbb387a6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -326,6 +326,12 @@ struct io_submit_state {
 	struct blk_plug		plug;
 };
 
+struct io_ev_fd {
+	struct eventfd_ctx	*cq_ev_fd;
+	struct io_ring_ctx	*ctx;
+	struct rcu_head		rcu;
+};
+
 struct io_ring_ctx {
 	/* const or read-mostly hot data */
 	struct {
@@ -399,7 +405,8 @@ struct io_ring_ctx {
 	struct {
 		unsigned		cached_cq_tail;
 		unsigned		cq_entries;
-		struct eventfd_ctx	*cq_ev_fd;
+		struct io_ev_fd	__rcu	*io_ev_fd;
+		struct mutex		ev_fd_lock;
 		struct wait_queue_head	cq_wait;
 		unsigned		cq_extra;
 		atomic_t		cq_timeouts;
@@ -1448,6 +1455,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
 	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
 	mutex_init(&ctx->uring_lock);
+	mutex_init(&ctx->ev_fd_lock);
 	init_waitqueue_head(&ctx->cq_wait);
 	spin_lock_init(&ctx->completion_lock);
 	spin_lock_init(&ctx->timeout_lock);
@@ -1726,15 +1734,30 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
 	return &rings->cqes[tail & mask];
 }
 
-static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
+static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx, struct io_ev_fd *ev_fd)
 {
-	if (likely(!ctx->cq_ev_fd))
+	if (likely(!ev_fd))
 		return false;
 	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
 		return false;
 	return !ctx->eventfd_async || io_wq_current_is_worker();
 }
 
+static void io_eventfd_signal(struct io_ring_ctx *ctx)
+{
+	struct io_ev_fd *ev_fd;
+
+	rcu_read_lock();
+	ev_fd = rcu_dereference(ctx->io_ev_fd);
+
+	if (!io_should_trigger_evfd(ctx, ev_fd))
+		goto out;
+
+	eventfd_signal(ev_fd->cq_ev_fd, 1);
+out:
+	rcu_read_unlock();
+}
+
 /*
  * This should only get called when at least one event has been posted.
  * Some applications rely on the eventfd notification count only changing
@@ -1751,8 +1774,7 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 	 */
 	if (wq_has_sleeper(&ctx->cq_wait))
 		wake_up_all(&ctx->cq_wait);
-	if (io_should_trigger_evfd(ctx))
-		eventfd_signal(ctx->cq_ev_fd, 1);
+	io_eventfd_signal(ctx);
 }
 
 static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
@@ -1764,8 +1786,7 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
 		if (waitqueue_active(&ctx->cq_wait))
 			wake_up_all(&ctx->cq_wait);
 	}
-	if (io_should_trigger_evfd(ctx))
-		eventfd_signal(ctx->cq_ev_fd, 1);
+	io_eventfd_signal(ctx);
 }
 
 /* Returns true if there are no backlogged entries after the flush */
@@ -9353,35 +9374,67 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 
 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
 {
+	struct io_ev_fd *ev_fd;
 	__s32 __user *fds = arg;
-	int fd;
+	int fd, ret;
 
-	if (ctx->cq_ev_fd)
-		return -EBUSY;
+	mutex_lock(&ctx->ev_fd_lock);
+	ret = -EBUSY;
+	if (rcu_dereference_protected(ctx->io_ev_fd, lockdep_is_held(&ctx->ev_fd_lock)))
+		goto out;
 
+	ret = -EFAULT;
 	if (copy_from_user(&fd, fds, sizeof(*fds)))
-		return -EFAULT;
+		goto out;
 
-	ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
-	if (IS_ERR(ctx->cq_ev_fd)) {
-		int ret = PTR_ERR(ctx->cq_ev_fd);
+	ret = -ENOMEM;
+	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
+	if (!ev_fd)
+		goto out;
 
-		ctx->cq_ev_fd = NULL;
-		return ret;
+	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
+	if (IS_ERR(ev_fd->cq_ev_fd)) {
+		ret = PTR_ERR(ev_fd->cq_ev_fd);
+		kfree(ev_fd);
+		goto out;
 	}
+	ev_fd->ctx = ctx;
 
-	return 0;
+	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
+	ret = 0;
+
+out:
+	mutex_unlock(&ctx->ev_fd_lock);
+	return ret;
+}
+
+static void io_eventfd_put(struct rcu_head *rcu)
+{
+	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
+	struct io_ring_ctx *ctx = ev_fd->ctx;
+
+	eventfd_ctx_put(ev_fd->cq_ev_fd);
+	kfree(ev_fd);
+	rcu_assign_pointer(ctx->io_ev_fd, NULL);
 }
 
 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
 {
-	if (ctx->cq_ev_fd) {
-		eventfd_ctx_put(ctx->cq_ev_fd);
-		ctx->cq_ev_fd = NULL;
-		return 0;
+	struct io_ev_fd *ev_fd;
+	int ret;
+
+	mutex_lock(&ctx->ev_fd_lock);
+	ev_fd = rcu_dereference_protected(ctx->io_ev_fd, lockdep_is_held(&ctx->ev_fd_lock));
+	if (ev_fd) {
+		call_rcu(&ev_fd->rcu, io_eventfd_put);
+		ret = 0;
+		goto out;
 	}
+	ret = -ENXIO;
 
-	return -ENXIO;
+out:
+	mutex_unlock(&ctx->ev_fd_lock);
+	return ret;
 }
 
 static void io_destroy_buffers(struct io_ring_ctx *ctx)
@@ -10960,6 +11013,8 @@ static bool io_register_op_must_quiesce(int op)
 	case IORING_REGISTER_FILES:
 	case IORING_UNREGISTER_FILES:
 	case IORING_REGISTER_FILES_UPDATE:
+	case IORING_REGISTER_EVENTFD:
+	case IORING_UNREGISTER_EVENTFD:
 	case IORING_REGISTER_PROBE:
 	case IORING_REGISTER_PERSONALITY:
 	case IORING_UNREGISTER_PERSONALITY:
@@ -11171,8 +11226,10 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 	mutex_lock(&ctx->uring_lock);
 	ret = __io_uring_register(ctx, opcode, arg, nr_args);
 	mutex_unlock(&ctx->uring_lock);
+	rcu_read_lock();
 	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
-							ctx->cq_ev_fd != NULL, ret);
+				rcu_dereference(ctx->io_ev_fd) != NULL, ret);
+	rcu_read_unlock();
 out_fput:
 	fdput(f);
 	return ret;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 2/2] io_uring: avoid ring quiesce for IORING_REGISTER_EVENTFD_ASYNC
  2022-02-03 15:11 [PATCH 0/2] io_uring: avoid ring quiesce in io_uring_register for eventfd opcodes Usama Arif
  2022-02-03 15:11 ` [PATCH 1/2] io_uring: avoid ring quiesce while registering/unregistering eventfd Usama Arif
@ 2022-02-03 15:11 ` Usama Arif
  1 sibling, 0 replies; 9+ messages in thread
From: Usama Arif @ 2022-02-03 15:11 UTC (permalink / raw)
  To: io-uring, axboe, asml.silence, linux-kernel; +Cc: fam.zheng, Usama Arif

This is done using the RCU data structure (io_ev_fd). eventfd_async
is moved from io_ring_ctx to io_ev_fd which is RCU protected hence
avoiding ring quiesce which is much more expensive than an RCU lock.
io_should_trigger_evfd is already under rcu_read_lock so there is
no extra RCU read-side critical section needed.

Signed-off-by: Usama Arif <usama.arif@bytedance.com>
---
 fs/io_uring.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f07cfbb387a6..30ac08ad6810 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -329,6 +329,7 @@ struct io_submit_state {
 struct io_ev_fd {
 	struct eventfd_ctx	*cq_ev_fd;
 	struct io_ring_ctx	*ctx;
+	unsigned int		eventfd_async: 1;
 	struct rcu_head		rcu;
 };
 
@@ -341,7 +342,6 @@ struct io_ring_ctx {
 		unsigned int		flags;
 		unsigned int		compat: 1;
 		unsigned int		drain_next: 1;
-		unsigned int		eventfd_async: 1;
 		unsigned int		restricted: 1;
 		unsigned int		off_timeout_used: 1;
 		unsigned int		drain_active: 1;
@@ -1740,7 +1740,7 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx, struct io_ev_
 		return false;
 	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
 		return false;
-	return !ctx->eventfd_async || io_wq_current_is_worker();
+	return !ev_fd->eventfd_async || io_wq_current_is_worker();
 }
 
 static void io_eventfd_signal(struct io_ring_ctx *ctx)
@@ -9372,7 +9372,8 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 	return done ? done : err;
 }
 
-static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
+static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
+			       unsigned int eventfd_async)
 {
 	struct io_ev_fd *ev_fd;
 	__s32 __user *fds = arg;
@@ -9399,6 +9400,7 @@ static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
 		goto out;
 	}
 	ev_fd->ctx = ctx;
+	ev_fd->eventfd_async = eventfd_async;
 
 	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
 	ret = 0;
@@ -11014,6 +11016,7 @@ static bool io_register_op_must_quiesce(int op)
 	case IORING_UNREGISTER_FILES:
 	case IORING_REGISTER_FILES_UPDATE:
 	case IORING_REGISTER_EVENTFD:
+	case IORING_REGISTER_EVENTFD_ASYNC:
 	case IORING_UNREGISTER_EVENTFD:
 	case IORING_REGISTER_PROBE:
 	case IORING_REGISTER_PERSONALITY:
@@ -11114,17 +11117,16 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 		ret = io_register_files_update(ctx, arg, nr_args);
 		break;
 	case IORING_REGISTER_EVENTFD:
-	case IORING_REGISTER_EVENTFD_ASYNC:
 		ret = -EINVAL;
 		if (nr_args != 1)
 			break;
-		ret = io_eventfd_register(ctx, arg);
-		if (ret)
+		ret = io_eventfd_register(ctx, arg, 0);
+		break;
+	case IORING_REGISTER_EVENTFD_ASYNC:
+		ret = -EINVAL;
+		if (nr_args != 1)
 			break;
-		if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
-			ctx->eventfd_async = 1;
-		else
-			ctx->eventfd_async = 0;
+		ret = io_eventfd_register(ctx, arg, 1);
 		break;
 	case IORING_UNREGISTER_EVENTFD:
 		ret = -EINVAL;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] io_uring: avoid ring quiesce while registering/unregistering eventfd
  2022-02-03 15:11 ` [PATCH 1/2] io_uring: avoid ring quiesce while registering/unregistering eventfd Usama Arif
@ 2022-02-03 15:48   ` Pavel Begunkov
  2022-02-03 17:47     ` Usama Arif
  2022-02-03 15:55   ` Jens Axboe
  1 sibling, 1 reply; 9+ messages in thread
From: Pavel Begunkov @ 2022-02-03 15:48 UTC (permalink / raw)
  To: Usama Arif, io-uring, axboe, linux-kernel; +Cc: fam.zheng

On 2/3/22 15:11, Usama Arif wrote:
> This is done by creating a new RCU data structure (io_ev_fd) as part of
> io_ring_ctx that holds the eventfd_ctx.
> 
> The function io_eventfd_signal is executed under rcu_read_lock with a
> single rcu_dereference to io_ev_fd so that if another thread unregisters
> the eventfd while io_eventfd_signal is still being executed, the
> eventfd_signal for which io_eventfd_signal was called completes
> successfully.
> 
> The process of registering/unregistering eventfd is done under a lock
> so multiple threads don't enter a race condition while
> registering/unregistering eventfd.
> 
> With the above approach ring quiesce can be avoided which is much more
> expensive then using RCU lock. On the system tested, io_uring_reigster with
> IORING_REGISTER_EVENTFD takes less than 1ms with RCU lock, compared to 15ms
> before with ring quiesce.
> 
> Signed-off-by: Usama Arif <usama.arif@bytedance.com>
> ---
>   fs/io_uring.c | 103 +++++++++++++++++++++++++++++++++++++++-----------
>   1 file changed, 80 insertions(+), 23 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 2e04f718319d..f07cfbb387a6 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -326,6 +326,12 @@ struct io_submit_state {
>   	struct blk_plug		plug;
>   };
>   

> -static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
> +static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx, struct io_ev_fd *ev_fd)
>   {
> -	if (likely(!ctx->cq_ev_fd))
> +	if (likely(!ev_fd))
>   		return false;
>   	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
>   		return false;
>   	return !ctx->eventfd_async || io_wq_current_is_worker();
>   }
>   
> +static void io_eventfd_signal(struct io_ring_ctx *ctx)
> +{
> +	struct io_ev_fd *ev_fd;
> +
> +	rcu_read_lock();

Please always think about the fast path, which is not set eventfd.
We don't want extra overhead here.

if (ctx->ev_fd) {
	rcu_read_lock();
         ev_fd = rcu_deref(...);
         ...
         rcu_read_unlock();
}

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] io_uring: avoid ring quiesce while registering/unregistering eventfd
  2022-02-03 15:11 ` [PATCH 1/2] io_uring: avoid ring quiesce while registering/unregistering eventfd Usama Arif
  2022-02-03 15:48   ` Pavel Begunkov
@ 2022-02-03 15:55   ` Jens Axboe
  2022-02-03 16:49     ` Usama Arif
  1 sibling, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2022-02-03 15:55 UTC (permalink / raw)
  To: Usama Arif, io-uring, asml.silence, linux-kernel; +Cc: fam.zheng

On 2/3/22 8:11 AM, Usama Arif wrote:
> +static void io_eventfd_signal(struct io_ring_ctx *ctx)
> +{
> +	struct io_ev_fd *ev_fd;
> +
> +	rcu_read_lock();
> +	ev_fd = rcu_dereference(ctx->io_ev_fd);
> +
> +	if (!io_should_trigger_evfd(ctx, ev_fd))
> +		goto out;
> +
> +	eventfd_signal(ev_fd->cq_ev_fd, 1);
> +out:
> +	rcu_read_unlock();
> +}

Would be cleaner as:

static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	rcu_read_lock();
	ev_fd = rcu_dereference(ctx->io_ev_fd);

	if (io_should_trigger_evfd(ctx, ev_fd))
		eventfd_signal(ev_fd->cq_ev_fd, 1);

	rcu_read_unlock();
}

and might be worth considering pulling in the io_should_trigger_evfd()
code rather than have it be a separate helper now with just the one
caller.

> @@ -9353,35 +9374,67 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
>  
>  static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
>  {
> +	struct io_ev_fd *ev_fd;
>  	__s32 __user *fds = arg;
> -	int fd;
> +	int fd, ret;
>  
> -	if (ctx->cq_ev_fd)
> -		return -EBUSY;
> +	mutex_lock(&ctx->ev_fd_lock);
> +	ret = -EBUSY;
> +	if (rcu_dereference_protected(ctx->io_ev_fd, lockdep_is_held(&ctx->ev_fd_lock)))
> +		goto out;
>  
> +	ret = -EFAULT;
>  	if (copy_from_user(&fd, fds, sizeof(*fds)))
> -		return -EFAULT;
> +		goto out;
>  
> -	ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
> -	if (IS_ERR(ctx->cq_ev_fd)) {
> -		int ret = PTR_ERR(ctx->cq_ev_fd);
> +	ret = -ENOMEM;
> +	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
> +	if (!ev_fd)
> +		goto out;
>  
> -		ctx->cq_ev_fd = NULL;
> -		return ret;
> +	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
> +	if (IS_ERR(ev_fd->cq_ev_fd)) {
> +		ret = PTR_ERR(ev_fd->cq_ev_fd);
> +		kfree(ev_fd);
> +		goto out;
>  	}
> +	ev_fd->ctx = ctx;
>  
> -	return 0;
> +	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
> +	ret = 0;
> +
> +out:
> +	mutex_unlock(&ctx->ev_fd_lock);
> +	return ret;
> +}

One thing that both mine and your version suffers from is if someone
does an eventfd unregister, and then immediately does an eventfd
register. If the rcu grace period hasn't passed, we'll get -EBUSY on
trying to do that, when I think the right behavior there would be to
wait for the grace period to pass.

I do think we need to handle that gracefully, spurious -EBUSY is
impossible for an application to deal with.

> @@ -11171,8 +11226,10 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
>  	mutex_lock(&ctx->uring_lock);
>  	ret = __io_uring_register(ctx, opcode, arg, nr_args);
>  	mutex_unlock(&ctx->uring_lock);
> +	rcu_read_lock();
>  	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
> -							ctx->cq_ev_fd != NULL, ret);
> +				rcu_dereference(ctx->io_ev_fd) != NULL, ret);
> +	rcu_read_unlock();
>  out_fput:
>  	fdput(f);
>  	return ret;

We should probably just modify that tracepoint, kill that ev_fd argument
(it makes very little sense).

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] io_uring: avoid ring quiesce while registering/unregistering eventfd
  2022-02-03 15:55   ` Jens Axboe
@ 2022-02-03 16:49     ` Usama Arif
  2022-02-03 16:58       ` Jens Axboe
  0 siblings, 1 reply; 9+ messages in thread
From: Usama Arif @ 2022-02-03 16:49 UTC (permalink / raw)
  To: Jens Axboe, io-uring, asml.silence, linux-kernel; +Cc: fam.zheng



On 03/02/2022 15:55, Jens Axboe wrote:
> On 2/3/22 8:11 AM, Usama Arif wrote:
>> +static void io_eventfd_signal(struct io_ring_ctx *ctx)
>> +{
>> +	struct io_ev_fd *ev_fd;
>> +
>> +	rcu_read_lock();
>> +	ev_fd = rcu_dereference(ctx->io_ev_fd);
>> +
>> +	if (!io_should_trigger_evfd(ctx, ev_fd))
>> +		goto out;
>> +
>> +	eventfd_signal(ev_fd->cq_ev_fd, 1);
>> +out:
>> +	rcu_read_unlock();
>> +}
> 
> Would be cleaner as:
> 
> static void io_eventfd_signal(struct io_ring_ctx *ctx)
> {
> 	struct io_ev_fd *ev_fd;
> 
> 	rcu_read_lock();
> 	ev_fd = rcu_dereference(ctx->io_ev_fd);
> 
> 	if (io_should_trigger_evfd(ctx, ev_fd))
> 		eventfd_signal(ev_fd->cq_ev_fd, 1);
> 
> 	rcu_read_unlock();
> }
> 
> and might be worth considering pulling in the io_should_trigger_evfd()
> code rather than have it be a separate helper now with just the one
> caller.

Hi,
Thanks for the review. Have pulled in the code for 
io_should_trigger_evfd into io_eventfd_signal.
> 
>> @@ -9353,35 +9374,67 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
>>   
>>   static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
>>   {
>> +	struct io_ev_fd *ev_fd;
>>   	__s32 __user *fds = arg;
>> -	int fd;
>> +	int fd, ret;
>>   
>> -	if (ctx->cq_ev_fd)
>> -		return -EBUSY;
>> +	mutex_lock(&ctx->ev_fd_lock);
>> +	ret = -EBUSY;
>> +	if (rcu_dereference_protected(ctx->io_ev_fd, lockdep_is_held(&ctx->ev_fd_lock)))
>> +		goto out;
>>   
>> +	ret = -EFAULT;
>>   	if (copy_from_user(&fd, fds, sizeof(*fds)))
>> -		return -EFAULT;
>> +		goto out;
>>   
>> -	ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
>> -	if (IS_ERR(ctx->cq_ev_fd)) {
>> -		int ret = PTR_ERR(ctx->cq_ev_fd);
>> +	ret = -ENOMEM;
>> +	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
>> +	if (!ev_fd)
>> +		goto out;
>>   
>> -		ctx->cq_ev_fd = NULL;
>> -		return ret;
>> +	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
>> +	if (IS_ERR(ev_fd->cq_ev_fd)) {
>> +		ret = PTR_ERR(ev_fd->cq_ev_fd);
>> +		kfree(ev_fd);
>> +		goto out;
>>   	}
>> +	ev_fd->ctx = ctx;
>>   
>> -	return 0;
>> +	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
>> +	ret = 0;
>> +
>> +out:
>> +	mutex_unlock(&ctx->ev_fd_lock);
>> +	return ret;
>> +}
> 
> One thing that both mine and your version suffers from is if someone
> does an eventfd unregister, and then immediately does an eventfd
> register. If the rcu grace period hasn't passed, we'll get -EBUSY on
> trying to do that, when I think the right behavior there would be to
> wait for the grace period to pass.
> 
> I do think we need to handle that gracefully, spurious -EBUSY is
> impossible for an application to deal with.

I don't think my version would suffer from this as it's protected by 
locks? The mutex_unlock on ev_fd_lock in unregister happens only after 
the call_rcu. And the mutex is locked in io_eventfd_register at the 
start, so wouldn't get the -EBUSY if there is a register immediately 
after unregister?
> 
>> @@ -11171,8 +11226,10 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
>>   	mutex_lock(&ctx->uring_lock);
>>   	ret = __io_uring_register(ctx, opcode, arg, nr_args);
>>   	mutex_unlock(&ctx->uring_lock);
>> +	rcu_read_lock();
>>   	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
>> -							ctx->cq_ev_fd != NULL, ret);
>> +				rcu_dereference(ctx->io_ev_fd) != NULL, ret);
>> +	rcu_read_unlock();
>>   out_fput:
>>   	fdput(f);
>>   	return ret;
> 
> We should probably just modify that tracepoint, kill that ev_fd argument
> (it makes very little sense).
> 

Thanks! have added that in patch 1 in v2.

Regards,
Usama

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] io_uring: avoid ring quiesce while registering/unregistering eventfd
  2022-02-03 16:49     ` Usama Arif
@ 2022-02-03 16:58       ` Jens Axboe
  2022-02-03 17:42         ` [External] " Usama Arif
  0 siblings, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2022-02-03 16:58 UTC (permalink / raw)
  To: Usama Arif, io-uring, asml.silence, linux-kernel; +Cc: fam.zheng

On 2/3/22 9:49 AM, Usama Arif wrote:
>> One thing that both mine and your version suffers from is if someone
>> does an eventfd unregister, and then immediately does an eventfd
>> register. If the rcu grace period hasn't passed, we'll get -EBUSY on
>> trying to do that, when I think the right behavior there would be to
>> wait for the grace period to pass.
>>
>> I do think we need to handle that gracefully, spurious -EBUSY is
>> impossible for an application to deal with.
> 
> I don't think my version would suffer from this as its protected by 
> locks? The mutex_unlock on ev_fd_lock in unregister happens only after 
> the call_rcu. And the mutex is locked in io_eventfd_register at the 
> start, so wouldnt get the -EBUSY if there is a register immediately 
> after unregister?

The call_rcu() just registers it for getting the callback when the grace
period has passed, it doesn't mean it's done by the time it returns.
Hence there's definitely a window where you can enter
io_uring_register() with the callback still being pending, and you'd get
-EBUSY from that.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [External] Re: [PATCH 1/2] io_uring: avoid ring quiesce while registering/unregistering eventfd
  2022-02-03 16:58       ` Jens Axboe
@ 2022-02-03 17:42         ` Usama Arif
  0 siblings, 0 replies; 9+ messages in thread
From: Usama Arif @ 2022-02-03 17:42 UTC (permalink / raw)
  To: Jens Axboe, io-uring, asml.silence, linux-kernel; +Cc: fam.zheng



On 03/02/2022 16:58, Jens Axboe wrote:
> On 2/3/22 9:49 AM, Usama Arif wrote:
>>> One thing that both mine and your version suffers from is if someone
>>> does an eventfd unregister, and then immediately does an eventfd
>>> register. If the rcu grace period hasn't passed, we'll get -EBUSY on
>>> trying to do that, when I think the right behavior there would be to
>>> wait for the grace period to pass.
>>>
>>> I do think we need to handle that gracefully, spurious -EBUSY is
>>> impossible for an application to deal with.
>>
>> I don't think my version would suffer from this as its protected by
>> locks? The mutex_unlock on ev_fd_lock in unregister happens only after
>> the call_rcu. And the mutex is locked in io_eventfd_register at the
>> start, so wouldnt get the -EBUSY if there is a register immediately
>> after unregister?
> 
> The call_rcu() just registers it for getting the callback when the grace
> period has passed, it doesn't mean it's done by the time it returns.
> Hence there's definitely a window where you can enter
> io_uring_register() with the callback still being pending, and you'd get
> -EBUSY from that.
> 

Does using synchronize_rcu make sense? I have sent v3 with how that 
would look. I have kept cq_ev_fd under io_ev_fd as it will be useful in 
patch 3 when eventfd_async is added.

Thanks,
Usama

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] io_uring: avoid ring quiesce while registering/unregistering eventfd
  2022-02-03 15:48   ` Pavel Begunkov
@ 2022-02-03 17:47     ` Usama Arif
  0 siblings, 0 replies; 9+ messages in thread
From: Usama Arif @ 2022-02-03 17:47 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring, axboe, linux-kernel; +Cc: fam.zheng



On 03/02/2022 15:48, Pavel Begunkov wrote:
> On 2/3/22 15:11, Usama Arif wrote:
>> This is done by creating a new RCU data structure (io_ev_fd) as part of
>> io_ring_ctx that holds the eventfd_ctx.
>>
>> The function io_eventfd_signal is executed under rcu_read_lock with a
>> single rcu_dereference to io_ev_fd so that if another thread unregisters
>> the eventfd while io_eventfd_signal is still being executed, the
>> eventfd_signal for which io_eventfd_signal was called completes
>> successfully.
>>
>> The process of registering/unregistering eventfd is done under a lock
>> so multiple threads don't enter a race condition while
>> registering/unregistering eventfd.
>>
>> With the above approach ring quiesce can be avoided which is much more
>> expensive then using RCU lock. On the system tested, io_uring_reigster 
>> with
>> IORING_REGISTER_EVENTFD takes less than 1ms with RCU lock, compared to 
>> 15ms
>> before with ring quiesce.
>>
>> Signed-off-by: Usama Arif <usama.arif@bytedance.com>
>> ---
>>   fs/io_uring.c | 103 +++++++++++++++++++++++++++++++++++++++-----------
>>   1 file changed, 80 insertions(+), 23 deletions(-)
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index 2e04f718319d..f07cfbb387a6 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -326,6 +326,12 @@ struct io_submit_state {
>>       struct blk_plug        plug;
>>   };
> 
>> -static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
>> +static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx, 
>> struct io_ev_fd *ev_fd)
>>   {
>> -    if (likely(!ctx->cq_ev_fd))
>> +    if (likely(!ev_fd))
>>           return false;
>>       if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
>>           return false;
>>       return !ctx->eventfd_async || io_wq_current_is_worker();
>>   }
>> +static void io_eventfd_signal(struct io_ring_ctx *ctx)
>> +{
>> +    struct io_ev_fd *ev_fd;
>> +
>> +    rcu_read_lock();
> 
> Please always think about the fast path, which is not set eventfd.
> We don't want extra overhead here.
> 

I guess this should be ok now from v3?

Thanks,
Usama
> if (ctx->ev_fd) {
>      rcu_read_lock();
>          ev_fd = rcu_deref(...);
>          ...
>          rcu_read_unlock();
> }
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2022-02-03 17:47 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-03 15:11 [PATCH 0/2] io_uring: avoid ring quiesce in io_uring_register for eventfd opcodes Usama Arif
2022-02-03 15:11 ` [PATCH 1/2] io_uring: avoid ring quiesce while registering/unregistering eventfd Usama Arif
2022-02-03 15:48   ` Pavel Begunkov
2022-02-03 17:47     ` Usama Arif
2022-02-03 15:55   ` Jens Axboe
2022-02-03 16:49     ` Usama Arif
2022-02-03 16:58       ` Jens Axboe
2022-02-03 17:42         ` [External] " Usama Arif
2022-02-03 15:11 ` [PATCH 2/2] io_uring: avoid ring quiesce for IORING_REGISTER_EVENTFD_ASYNC Usama Arif

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.