[PATCH RFC] io_uring: limit inflight IO

* [PATCH RFC] io_uring: limit inflight IO
@ 2019-11-07 23:21 Jens Axboe
  2019-11-08  0:19 ` Jens Axboe
  0 siblings, 1 reply; 11+ messages in thread
From: Jens Axboe @ 2019-11-07 23:21 UTC (permalink / raw)
  To: io-uring

I'd like some feedback on this one. Even with the overflow backpressure
patch, we still have a potentially large gap where applications can
submit IO before we get any dropped events in the CQ ring. This is
especially true if the execution time of those requests is long
(unbounded).

This adds IORING_SETUP_INFLIGHT, which if set, will return -EBUSY if we
have more IO pending than we can feasibly support. This is normally the
CQ ring size, but if IORING_SETUP_CQ_NODROP is enabled, then it's twice
the CQ ring size.

This helps manage the pending queue size instead of letting it grow
indefinitely.

Note that we could potentially just make this the default behavior -
applications need to handle -EAGAIN returns already, in case we run out
of memory, and if we change this to return -EAGAIN as well, then it
doesn't introduce any new failure cases. I'm tempted to do that...

Anyway, comments solicited!

Not-yet-signed-off-by

---

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f8344f95817e..db8b7e06f36d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -203,6 +203,7 @@ struct io_ring_ctx {
 		unsigned		sq_mask;
 		unsigned		sq_thread_idle;
 		unsigned		cached_sq_dropped;
+		atomic_t		cached_cq_overflow;
 		struct io_uring_sqe	*sq_sqes;
 
 		struct list_head	defer_list;
@@ -221,13 +222,12 @@ struct io_ring_ctx {
 
 	struct {
 		unsigned		cached_cq_tail;
-		atomic_t		cached_cq_overflow;
 		unsigned		cq_entries;
 		unsigned		cq_mask;
+		atomic_t		cq_timeouts;
 		struct wait_queue_head	cq_wait;
 		struct fasync_struct	*cq_fasync;
 		struct eventfd_ctx	*cq_ev_fd;
-		atomic_t		cq_timeouts;
 	} ____cacheline_aligned_in_smp;
 
 	struct io_rings	*rings;
@@ -705,23 +705,53 @@ static void io_cqring_add_event(struct io_kiocb *req, long res)
 	io_cqring_ev_posted(ctx);
 }
 
+static bool io_req_over_limit(struct io_ring_ctx *ctx)
+{
+	unsigned limit, inflight;
+
+	if (!(ctx->flags & IORING_SETUP_INFLIGHT))
+		return false;
+	/* only do checks every once in a while */
+	if (ctx->cached_sq_head & ctx->sq_mask)
+		return false;
+
+	if (ctx->flags & IORING_SETUP_CQ_NODROP)
+		limit = 2 * ctx->cq_entries;
+	else
+		limit = ctx->cq_entries;
+
+	inflight = ctx->cached_sq_head -
+		  (ctx->cached_cq_tail + atomic_read(&ctx->cached_cq_overflow));
+	return inflight >= limit;
+}
+
 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
-				   struct io_submit_state *state)
+				   struct io_submit_state *state, bool force)
 {
 	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 	struct io_kiocb *req;
 
 	if (!percpu_ref_tryget(&ctx->refs))
-		return NULL;
+		return ERR_PTR(-ENXIO);
 
 	if (!state) {
+		if (!force && io_req_over_limit(ctx)) {
+			req = ERR_PTR(-EBUSY);
+			goto out;
+		}
 		req = kmem_cache_alloc(req_cachep, gfp);
-		if (unlikely(!req))
+		if (unlikely(!req)) {
+			req = ERR_PTR(-EAGAIN);
 			goto out;
+		}
 	} else if (!state->free_reqs) {
 		size_t sz;
 		int ret;
 
+		if (!force && io_req_over_limit(ctx)) {
+			req = ERR_PTR(-EBUSY);
+			goto out;
+		}
 		sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
 		ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
 
@@ -731,8 +761,10 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 		 */
 		if (unlikely(ret <= 0)) {
 			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
-			if (!state->reqs[0])
+			if (!state->reqs[0]) {
+				req = ERR_PTR(-EAGAIN);
 				goto out;
+			}
 			ret = 1;
 		}
 		state->free_reqs = ret - 1;
@@ -754,7 +786,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 	return req;
 out:
 	percpu_ref_put(&ctx->refs);
-	return NULL;
+	return req;
 }
 
 static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
@@ -2963,10 +2995,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 		struct io_kiocb *req;
 		unsigned int sqe_flags;
 
-		req = io_get_req(ctx, statep);
-		if (unlikely(!req)) {
+		req = io_get_req(ctx, statep, false);
+		if (unlikely(IS_ERR(req))) {
 			if (!submitted)
-				submitted = -EAGAIN;
+				submitted = PTR_ERR(req);
+			req = NULL;
 			break;
 		}
 		if (!io_get_sqring(ctx, &req->submit)) {
@@ -2986,9 +3019,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 
 		if (link && (sqe_flags & IOSQE_IO_DRAIN)) {
 			if (!shadow_req) {
-				shadow_req = io_get_req(ctx, NULL);
-				if (unlikely(!shadow_req))
+				shadow_req = io_get_req(ctx, NULL, true);
+				if (unlikely(IS_ERR(shadow_req))) {
+					shadow_req = NULL;
 					goto out;
+				}
 				shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
 				refcount_dec(&shadow_req->refs);
 			}
@@ -4501,7 +4536,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 
 	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
 			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
-			IORING_SETUP_CQ_NODROP))
+			IORING_SETUP_CQ_NODROP | IORING_SETUP_INFLIGHT))
 		return -EINVAL;
 
 	ret = io_uring_create(entries, &p);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 3d8517eb376e..e7d8e16f9e22 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -57,6 +57,7 @@ struct io_uring_sqe {
 #define IORING_SETUP_SQ_AFF	(1U << 2)	/* sq_thread_cpu is valid */
 #define IORING_SETUP_CQSIZE	(1U << 3)	/* app defines CQ size */
 #define IORING_SETUP_CQ_NODROP	(1U << 4)	/* no CQ drops */
+#define IORING_SETUP_INFLIGHT	(1U << 5)	/* reject IO over limit */
 
 #define IORING_OP_NOP		0
 #define IORING_OP_READV		1

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 11+ messages in thread