From: Jens Axboe <axboe@kernel.dk>
To: Mark Papadakis <markuspapadakis@icloud.com>, io-uring@vger.kernel.org
Subject: Re: io_uring and spurious wake-ups from eventfd
Date: Tue, 7 Jan 2020 13:34:11 -0700	[thread overview]
Message-ID: <60360091-ffce-fc8b-50d5-1a20fecaf047@kernel.dk> (raw)
In-Reply-To: <55243723-480f-0220-2b93-74cc033c6e1d@kernel.dk>

On 1/7/20 1:26 PM, Jens Axboe wrote:
> On 1/7/20 8:55 AM, Mark Papadakis wrote:
>> This is perhaps an odd request, but if it’s trivial to implement
>> support for the feature described below, it could help others as it’d
>> help me (I’ve been experimenting with io_uring for some time now).
>>
>> Being able to register an eventfd with an io_uring context is very
>> handy if you e.g. have some sort of reactor thread multiplexing I/O
>> using epoll etc., where you want to be notified when there are pending
>> CQEs to drain. The problem, such as it is, is that this can result in
>> unnecessary/spurious wake-ups.
>>
>> If, for example, you are monitoring some non-blocking sockets for
>> EPOLLIN, then for each readiness event epoll reports you reserve an
>> SQE for preadv() to read that data, and you call io_uring_enter() to
>> submit the SQEs. Because the data is readily available, the
>> completions are there as soon as io_uring_enter() returns, and you can
>> process them right away. The “problem” is that epoll will wake up
>> again on the next reactor loop iteration because the eventfd was
>> tripped (which is reasonable but unnecessary).
>>
>> What if there was a flag for io_uring_setup() so that the eventfd
>> would only be tripped for CQEs that were processed asynchronously, or,
>> if that’s non-trivial, only for CQEs that reference file FDs?
>>
>> That’d help with that spurious wake-up.
> 
> One easy way to do that would be for the application to signal that it
> doesn't want eventfd notifications for certain requests, e.g. with an
> IOSQE_ flag. Then you could set that flag on the requests you submit
> in response to an eventfd notification.

Something like this - totally untested. Note that for some error cases I
didn't go through the trouble of wiring this up, but for all the normal
cases it should work.

Patch is against my for-5.6/io_uring branch.
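
From the application side, usage would look roughly like this (untested
sketch: it assumes this patch is applied, that the new IOSQE_NO_EVENTFD
value is visible to userspace, and that liburing handles the ring
plumbing; socket setup, NULL/error checks elided):

#include <liburing.h>
#include <sys/uio.h>

/* Reap whatever is already sitting in the CQ ring. */
static void drain_cqes(struct io_uring *ring)
{
	struct io_uring_cqe *cqe;

	while (io_uring_peek_cqe(ring, &cqe) == 0) {
		/* handle cqe->res / io_uring_cqe_get_data(cqe) here */
		io_uring_cqe_seen(ring, cqe);
	}
}

/*
 * Called when epoll reports a non-blocking socket as readable. The data
 * is already there, so the completion shows up as soon as
 * io_uring_enter() returns; suppress the eventfd for this request.
 */
static void read_ready_socket(struct io_uring *ring, int sockfd,
			      struct iovec *iov)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	io_uring_prep_readv(sqe, sockfd, iov, 1, 0);
	sqe->flags |= IOSQE_NO_EVENTFD;	/* flag added by this patch */
	io_uring_submit(ring);
	drain_cqes(ring);		/* completions are inline, reap now */
}
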

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c770c2c0eb52..5c6da1e1f29c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -512,6 +512,7 @@ struct io_kiocb {
 #define REQ_F_HARDLINK		65536	/* doesn't sever on completion < 0 */
 #define REQ_F_FORCE_ASYNC	131072	/* IOSQE_ASYNC */
 #define REQ_F_CUR_POS		262144	/* read/write uses file position */
+#define REQ_F_NO_EVFD		524288	/* don't trigger eventfd for this req */
 	u64			user_data;
 	u32			result;
 	u32			sequence;
@@ -960,13 +961,13 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 	return &rings->cqes[tail & ctx->cq_mask];
 }
 
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx, bool evfd)
 {
 	if (waitqueue_active(&ctx->wait))
 		wake_up(&ctx->wait);
 	if (waitqueue_active(&ctx->sqo_wait))
 		wake_up(&ctx->sqo_wait);
-	if (ctx->cq_ev_fd)
+	if (evfd && ctx->cq_ev_fd)
 		eventfd_signal(ctx->cq_ev_fd, 1);
 }
 
@@ -1018,7 +1019,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 		clear_bit(0, &ctx->cq_check_overflow);
 	}
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
-	io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx, true);
 
 	while (!list_empty(&list)) {
 		req = list_first_entry(&list, struct io_kiocb, list);
@@ -1070,7 +1071,7 @@ static void io_cqring_add_event(struct io_kiocb *req, long res)
 	io_commit_cqring(ctx);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-	io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx, !(req->flags & REQ_F_NO_EVFD));
 }
 
 static inline bool io_is_fallback_req(struct io_kiocb *req)
@@ -1288,7 +1289,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 
 	req->flags |= REQ_F_LINK_NEXT;
 	if (wake_ev)
-		io_cqring_ev_posted(ctx);
+		io_cqring_ev_posted(ctx, true);
 }
 
 /*
@@ -1320,7 +1321,7 @@ static void io_fail_links(struct io_kiocb *req)
 
 	io_commit_cqring(ctx);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
-	io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx, true);
 }
 
 static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
@@ -3366,7 +3367,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
 	io_poll_complete(req, mask, ret);
 	spin_unlock_irq(&ctx->completion_lock);
 
-	io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx, !(req->flags & REQ_F_NO_EVFD));
 
 	if (ret < 0)
 		req_set_fail_links(req);
@@ -3379,6 +3380,7 @@ static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes)
 {
 	struct io_kiocb *req, *tmp;
 	struct req_batch rb;
+	bool evfd = false;
 
 	rb.to_free = rb.need_iter = 0;
 	spin_lock_irq(&ctx->completion_lock);
@@ -3386,6 +3388,8 @@ static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes)
 		hash_del(&req->hash_node);
 		io_poll_complete(req, req->result, 0);
 
+		if (!evfd && !(req->flags & REQ_F_NO_EVFD))
+			evfd = true;
 		if (refcount_dec_and_test(&req->refs) &&
 		    !io_req_multi_free(&rb, req)) {
 			req->flags |= REQ_F_COMP_LOCKED;
@@ -3394,7 +3398,7 @@ static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes)
 	}
 	spin_unlock_irq(&ctx->completion_lock);
 
-	io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx, evfd);
 	io_free_req_many(ctx, &rb);
 }
 
@@ -3439,7 +3443,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 			io_put_req(req);
 			spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-			io_cqring_ev_posted(ctx);
+			io_cqring_ev_posted(ctx, !(req->flags & REQ_F_NO_EVFD));
 			req = NULL;
 		} else {
 			req->result = mask;
@@ -3557,7 +3561,7 @@ static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
 	spin_unlock_irq(&ctx->completion_lock);
 
 	if (mask) {
-		io_cqring_ev_posted(ctx);
+		io_cqring_ev_posted(ctx, !(req->flags & REQ_F_NO_EVFD));
 		io_put_req_find_next(req, nxt);
 	}
 	return ipt.error;
@@ -3597,7 +3601,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 	io_commit_cqring(ctx);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-	io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx, !(req->flags & REQ_F_NO_EVFD));
 	req_set_fail_links(req);
 	io_put_req(req);
 	return HRTIMER_NORESTART;
@@ -3659,7 +3663,7 @@ static int io_timeout_remove(struct io_kiocb *req)
 	io_cqring_fill_event(req, ret);
 	io_commit_cqring(ctx);
 	spin_unlock_irq(&ctx->completion_lock);
-	io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx, !(req->flags & REQ_F_NO_EVFD));
 	if (ret < 0)
 		req_set_fail_links(req);
 	io_put_req(req);
@@ -3829,7 +3833,7 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
 	io_cqring_fill_event(req, ret);
 	io_commit_cqring(ctx);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
-	io_cqring_ev_posted(ctx);
+	io_cqring_ev_posted(ctx, !(req->flags & REQ_F_NO_EVFD));
 
 	if (ret < 0)
 		req_set_fail_links(req);
@@ -4528,7 +4532,8 @@ static inline void io_queue_link_head(struct io_kiocb *req)
 }
 
 #define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|	\
-				IOSQE_IO_HARDLINK | IOSQE_ASYNC)
+				IOSQE_IO_HARDLINK | IOSQE_ASYNC | 	\
+				IOSQE_NO_EVENTFD)
 
 static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			  struct io_submit_state *state, struct io_kiocb **link)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index be64c9757ff1..72af96a38ed8 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -53,6 +53,7 @@ struct io_uring_sqe {
 #define IOSQE_IO_LINK		(1U << 2)	/* links next sqe */
 #define IOSQE_IO_HARDLINK	(1U << 3)	/* like LINK, but stronger */
 #define IOSQE_ASYNC		(1U << 4)	/* always go async */
+#define IOSQE_NO_EVENTFD	(1U << 5)	/* don't trigger eventfd */
 
 /*
  * io_uring_setup() flags
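
On the reactor side nothing else changes: the eventfd is still created,
registered with io_uring_register_eventfd(), and added to the epoll set
(with epoll_event.data.fd set to the fd). It now only fires for
completions that actually went async, and when it does you clear the
counter and drain the CQ ring as usual. A sketch of that loop, under the
same assumptions as (and reusing the helpers from) the snippet above:

#include <liburing.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <sys/uio.h>

static void reactor_loop(struct io_uring *ring, int epfd, int efd)
{
	static char buf[4096];	/* one shared scratch buffer, sketch only */
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct epoll_event events[16];
	eventfd_t cnt;
	int i, n;

	for (;;) {
		n = epoll_wait(epfd, events, 16, -1);
		for (i = 0; i < n; i++) {
			if (events[i].data.fd == efd) {
				/* async completions only: clear count, reap */
				eventfd_read(efd, &cnt);
				drain_cqes(ring);
			} else {
				/* readable socket: inline readv, no eventfd trip */
				read_ready_socket(ring, events[i].data.fd, &iov);
			}
		}
	}
}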

-- 
Jens Axboe


Thread overview (11+ messages):
2020-01-07 15:55 io_uring and spurious wake-ups from eventfd Mark Papadakis
2020-01-07 20:26 ` Jens Axboe
2020-01-07 20:34   ` Jens Axboe [this message]
2020-01-08  7:36     ` Mark Papadakis
2020-01-08 16:24       ` Jens Axboe
2020-01-08 16:46         ` Mark Papadakis
2020-01-08 16:50           ` Jens Axboe
2020-01-08 17:20             ` Jens Axboe
2020-01-08 18:08               ` Jens Axboe
2020-01-09  6:09         ` Daurnimator
2020-01-09 15:14           ` Jens Axboe
