All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC] io_uring: io_kiocb alloc cache
@ 2020-05-13 16:30 Jens Axboe
  2020-05-13 17:42   ` Jann Horn
  2020-05-14  8:25 ` Xiaoguang Wang
  0 siblings, 2 replies; 22+ messages in thread
From: Jens Axboe @ 2020-05-13 16:30 UTC (permalink / raw)
  To: io-uring; +Cc: Xiaoguang Wang, joseph qi, Jiufei Xue, Pavel Begunkov

I turned the quick'n dirty from the other day into something a bit
more done. Would be great if someone else could run some performance
testing with this, I get about a 10% boost on the pure NOP benchmark
with this. But that's just on my laptop in qemu, so some real iron
testing would be awesome.

The idea here is to have a percpu alloc cache. There's two sets of
state:

1) Requests that have IRQ completion. preempt disable is not enough
   there, we need to disable local irqs. This is a lot slower in
   certain setups, so we keep this separate.

2) No IRQ completion, we can get by with just disabling preempt.

Outside of that, any freed request goes to the ce->alloc_list.
Attempting to alloc a request will check there first. When freeing
a request, if we're over some threshold, move requests to the
ce->free_list. This list can be browsed by the shrinker to free
up memory. If a CPU goes offline, all requests are reaped.

That's about it. If we go further with this, it'll be split into
a few separate patches. For now, just throwing this out there
for testing. The patch is against my for-5.8/io_uring branch.

It survives basic testing for me, with the exception being some issue
related to fixed files (the kworker will hang on exit).


diff --git a/fs/io_uring.c b/fs/io_uring.c
index d2e37215d05a..4da8f4a9a285 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -79,6 +79,7 @@
 #include <linux/fs_struct.h>
 #include <linux/splice.h>
 #include <linux/task_work.h>
+#include <linux/cpuhotplug.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
@@ -652,17 +653,10 @@ struct io_kiocb {
 };
 
 #define IO_PLUG_THRESHOLD		2
-#define IO_IOPOLL_BATCH			8
 
 struct io_submit_state {
 	struct blk_plug		plug;
 
-	/*
-	 * io_kiocb alloc cache
-	 */
-	void			*reqs[IO_IOPOLL_BATCH];
-	unsigned int		free_reqs;
-
 	/*
 	 * File reference cache
 	 */
@@ -673,6 +667,24 @@ struct io_submit_state {
 	unsigned int		ios_left;
 };
 
+struct io_kiocb_cache_entry {
+	struct list_head	alloc_list;
+	unsigned		nr_avail;
+
+	spinlock_t		free_lock;
+	struct list_head	free_list;
+	unsigned		nr_free;
+};
+
+struct io_kiocb_cache {
+	struct io_kiocb_cache_entry	caches[2];
+};
+
+#define IO_KIOCB_CACHE_MAX	256
+#define IO_KIOCB_CACHE_RECLAIM	 16
+
+static struct io_kiocb_cache *alloc_cache;
+
 struct io_op_def {
 	/* needs req->io allocated for deferral/async */
 	unsigned		async_ctx : 1;
@@ -695,6 +707,8 @@ struct io_op_def {
 	unsigned		pollout : 1;
 	/* op supports buffer selection */
 	unsigned		buffer_select : 1;
+	/* IRQ completion */
+	unsigned		irq_comp : 1;
 };
 
 static const struct io_op_def io_op_defs[] = {
@@ -706,6 +720,7 @@ static const struct io_op_def io_op_defs[] = {
 		.unbound_nonreg_file	= 1,
 		.pollin			= 1,
 		.buffer_select		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_WRITEV] = {
 		.async_ctx		= 1,
@@ -714,6 +729,7 @@ static const struct io_op_def io_op_defs[] = {
 		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_FSYNC] = {
 		.needs_file		= 1,
@@ -722,12 +738,14 @@ static const struct io_op_def io_op_defs[] = {
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollin			= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_WRITE_FIXED] = {
 		.needs_file		= 1,
 		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_POLL_ADD] = {
 		.needs_file		= 1,
@@ -803,12 +821,14 @@ static const struct io_op_def io_op_defs[] = {
 		.unbound_nonreg_file	= 1,
 		.pollin			= 1,
 		.buffer_select		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_WRITE] = {
 		.needs_mm		= 1,
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_FADVISE] = {
 		.needs_file		= 1,
@@ -1281,54 +1301,138 @@ static inline bool io_is_fallback_req(struct io_kiocb *req)
 			((unsigned long) req->ctx->fallback_req & ~1UL);
 }
 
-static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
+static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx, int op)
 {
 	struct io_kiocb *req;
 
 	req = ctx->fallback_req;
-	if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
+	if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req)) {
+		req->opcode = op;
 		return req;
+	}
 
 	return NULL;
 }
 
-static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
-				     struct io_submit_state *state)
+static struct io_kiocb *io_req_cache_alloc(int op)
 {
-	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
-	struct io_kiocb *req;
+	const bool irq_comp = io_op_defs[op].irq_comp;
+	struct io_kiocb_cache_entry *ce;
+	struct io_kiocb_cache *cache;
+	struct io_kiocb *req = NULL;
 
-	if (!state) {
-		req = kmem_cache_alloc(req_cachep, gfp);
-		if (unlikely(!req))
-			goto fallback;
-	} else if (!state->free_reqs) {
-		size_t sz;
-		int ret;
+	if (irq_comp)
+		local_irq_disable();
+	else
+		preempt_disable();
 
-		sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
-		ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
+	cache = this_cpu_ptr(alloc_cache);
+	ce = &cache->caches[irq_comp];
 
-		/*
-		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
-		 * retry single alloc to be on the safe side.
-		 */
-		if (unlikely(ret <= 0)) {
-			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
-			if (!state->reqs[0])
-				goto fallback;
-			ret = 1;
-		}
-		state->free_reqs = ret - 1;
-		req = state->reqs[ret - 1];
-	} else {
-		state->free_reqs--;
-		req = state->reqs[state->free_reqs];
+	if (!list_empty(&ce->alloc_list)) {
+		req = list_first_entry(&ce->alloc_list, struct io_kiocb, list);
+		list_del(&req->list);
+		ce->nr_avail--;
 	}
 
-	return req;
-fallback:
-	return io_get_fallback_req(ctx);
+	if (irq_comp)
+		local_irq_enable();
+	else
+		preempt_enable();
+
+	if (req)
+		return req;
+
+	return kmem_cache_alloc(req_cachep, GFP_KERNEL);
+}
+
+static void io_req_cache_reclaim(struct io_kiocb_cache_entry *ce)
+{
+	LIST_HEAD(free_list);
+	int nr = 0;
+
+	while (!list_empty(&ce->alloc_list)) {
+		struct io_kiocb *req;
+
+		req = list_last_entry(&ce->alloc_list, struct io_kiocb, list);
+		list_move(&req->list, &free_list);
+		nr++;
+	}
+
+	spin_lock(&ce->free_lock);
+	list_splice(&free_list, &ce->free_list);
+	ce->nr_free += nr;
+	ce->nr_avail -= nr;
+	spin_unlock(&ce->free_lock);
+}
+
+struct req_batch {
+	struct list_head list;
+	int to_free;
+	bool need_iter;
+	bool irq_comp;
+};
+
+static void io_req_cache_free_bulk(struct req_batch *rb)
+{
+	struct io_kiocb_cache_entry *ce;
+	struct io_kiocb_cache *cache;
+
+	if (rb->irq_comp)
+		local_irq_disable();
+	else
+		preempt_disable();
+
+	cache = this_cpu_ptr(alloc_cache);
+	ce = &cache->caches[rb->irq_comp];
+
+	list_splice_init(&rb->list, &ce->alloc_list);
+	ce->nr_avail += rb->to_free;
+	if (ce->nr_avail > IO_KIOCB_CACHE_MAX)
+		io_req_cache_reclaim(ce);
+
+	if (rb->irq_comp)
+		local_irq_enable();
+	else
+		preempt_enable();
+}
+
+static void io_req_cache_free(struct io_kiocb *req)
+{
+	const bool irq_comp = io_op_defs[req->opcode].irq_comp;
+	struct io_kiocb_cache_entry *ce;
+	struct io_kiocb_cache *cache;
+	unsigned long flags;
+
+	if (irq_comp)
+		local_irq_save(flags);
+	else
+		preempt_disable();
+
+	cache = this_cpu_ptr(alloc_cache);
+	ce = &cache->caches[irq_comp];
+
+	list_add(&req->list, &ce->alloc_list);
+	if (++ce->nr_avail > IO_KIOCB_CACHE_MAX)
+		io_req_cache_reclaim(ce);
+
+	if (irq_comp)
+		local_irq_restore(flags);
+	else
+		preempt_enable();
+}
+
+static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, int opcode)
+{
+	struct io_kiocb *req;
+
+	req = io_req_cache_alloc(opcode);
+	if (req) {
+		req->opcode = opcode;
+		return req;
+	}
+
+	return io_get_fallback_req(ctx, opcode);
 }
 
 static inline void io_put_file(struct io_kiocb *req, struct file *file,
@@ -1345,7 +1449,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
 	if (req->flags & REQ_F_NEED_CLEANUP)
 		io_cleanup_req(req);
 
-	kfree(req->io);
+	if (req->io)
+		kfree(req->io);
 	if (req->file)
 		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
 	if (req->task)
@@ -1371,28 +1476,21 @@ static void __io_free_req(struct io_kiocb *req)
 
 	percpu_ref_put(&req->ctx->refs);
 	if (likely(!io_is_fallback_req(req)))
-		kmem_cache_free(req_cachep, req);
+		io_req_cache_free(req);
 	else
 		clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
 }
 
-struct req_batch {
-	void *reqs[IO_IOPOLL_BATCH];
-	int to_free;
-	int need_iter;
-};
-
 static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
 {
 	if (!rb->to_free)
 		return;
 	if (rb->need_iter) {
-		int i, inflight = 0;
+		struct io_kiocb *req;
 		unsigned long flags;
+		int inflight = 0;
 
-		for (i = 0; i < rb->to_free; i++) {
-			struct io_kiocb *req = rb->reqs[i];
-
+		list_for_each_entry(req, &rb->list, list) {
 			if (req->flags & REQ_F_FIXED_FILE) {
 				req->file = NULL;
 				percpu_ref_put(req->fixed_file_refs);
@@ -1405,9 +1503,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
 			goto do_free;
 
 		spin_lock_irqsave(&ctx->inflight_lock, flags);
-		for (i = 0; i < rb->to_free; i++) {
-			struct io_kiocb *req = rb->reqs[i];
-
+		list_for_each_entry(req, &rb->list, list) {
 			if (req->flags & REQ_F_INFLIGHT) {
 				list_del(&req->inflight_entry);
 				if (!--inflight)
@@ -1420,9 +1516,10 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
 			wake_up(&ctx->inflight_wait);
 	}
 do_free:
-	kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
+	io_req_cache_free_bulk(rb);
 	percpu_ref_put_many(&ctx->refs, rb->to_free);
-	rb->to_free = rb->need_iter = 0;
+	rb->to_free = 0;
+	rb->need_iter = rb->irq_comp = false;
 }
 
 static bool io_link_cancel_timeout(struct io_kiocb *req)
@@ -1670,11 +1767,12 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
 		return false;
 
 	if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
-		rb->need_iter++;
+		rb->need_iter |= true;
+	if (!rb->irq_comp && io_op_defs[req->opcode].irq_comp)
+		rb->irq_comp |= true;
 
-	rb->reqs[rb->to_free++] = req;
-	if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
-		io_free_req_many(req->ctx, rb);
+	list_add(&req->list, &rb->list);
+	rb->to_free++;
 	return true;
 }
 
@@ -1697,10 +1795,14 @@ static int io_put_kbuf(struct io_kiocb *req)
 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 			       struct list_head *done)
 {
-	struct req_batch rb;
+	struct req_batch rb = {
+		.list		= LIST_HEAD_INIT(rb.list),
+		.to_free	= 0,
+		.need_iter	= false,
+		.irq_comp	= false
+	};
 	struct io_kiocb *req;
 
-	rb.to_free = rb.need_iter = 0;
 	while (!list_empty(done)) {
 		int cflags = 0;
 
@@ -5703,8 +5805,6 @@ static void io_submit_state_end(struct io_submit_state *state)
 {
 	blk_finish_plug(&state->plug);
 	io_file_put(state);
-	if (state->free_reqs)
-		kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
 }
 
 /*
@@ -5714,7 +5814,6 @@ static void io_submit_state_start(struct io_submit_state *state,
 				  unsigned int max_ios)
 {
 	blk_start_plug(&state->plug);
-	state->free_reqs = 0;
 	state->file = NULL;
 	state->ios_left = max_ios;
 }
@@ -5784,7 +5883,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	 * link list.
 	 */
 	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
-	req->opcode = READ_ONCE(sqe->opcode);
 	req->user_data = READ_ONCE(sqe->user_data);
 	req->io = NULL;
 	req->file = NULL;
@@ -5872,14 +5970,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			io_consume_sqe(ctx);
 			break;
 		}
-		req = io_alloc_req(ctx, statep);
+		req = io_alloc_req(ctx, READ_ONCE(sqe->opcode));
 		if (unlikely(!req)) {
 			if (!submitted)
 				submitted = -EAGAIN;
 			break;
 		}
 
-		err = io_init_req(ctx, req, sqe, statep, async);
+		err = io_init_req(ctx, req, sqe, NULL, async);
 		io_consume_sqe(ctx);
 		/* will complete beyond this point, count as submitted */
 		submitted++;
@@ -7626,6 +7724,17 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 					req->task->task_works != NULL);
 	}
 	spin_unlock_irq(&ctx->completion_lock);
+	seq_printf(m, "AllocCache:\n");
+	for_each_possible_cpu(i) {
+		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, i);
+		int j;
+
+		for (j = 0; j < ARRAY_SIZE(cache->caches); j++) {
+			struct io_kiocb_cache_entry *ce = &cache->caches[j];
+
+			seq_printf(m, "  cpu%d: irq=%d, nr_free=%d, nr_avail=%d\n", i, j, ce->nr_free, ce->nr_avail);
+		}
+	}
 	mutex_unlock(&ctx->uring_lock);
 }
 
@@ -8101,8 +8210,130 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 	return ret;
 }
 
+static unsigned long io_uring_cache_count(struct shrinker *shrink,
+					  struct shrink_control *sc)
+{
+	unsigned long count = 0;
+	int cpu, i;
+
+	for_each_possible_cpu(cpu) {
+		struct io_kiocb_cache *cache;
+
+		cache = per_cpu_ptr(alloc_cache, cpu);
+		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
+			struct io_kiocb_cache_entry *ce = &cache->caches[i];
+
+			count += ce->nr_free;
+		}
+	}
+
+	return count;
+}
+
+static unsigned long __io_uring_cache_shrink(struct io_kiocb_cache_entry *ce,
+					     int irq_comp, int *nr_to_scan)
+{
+	unsigned long freed = 0;
+	struct io_kiocb *req;
+	LIST_HEAD(free_list);
+
+	if (!ce->nr_free)
+		return 0;
+
+	if (irq_comp)
+		spin_lock_irq(&ce->free_lock);
+	else
+		spin_lock(&ce->free_lock);
+
+	while (!list_empty(&ce->free_list)) {
+		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
+		list_move(&req->list, &free_list);
+		freed++;
+		if (!--(*nr_to_scan))
+			break;
+	}
+
+	if (irq_comp)
+		spin_unlock_irq(&ce->free_lock);
+	else
+		spin_unlock(&ce->free_lock);
+
+	while (!list_empty(&free_list)) {
+		req = list_first_entry(&free_list, struct io_kiocb, list);
+		list_del(&req->list);
+		kmem_cache_free(req_cachep, req);
+	}
+
+	return freed;
+}
+
+static unsigned long io_uring_cache_shrink(int nr_to_scan)
+{
+	long freed = 0;
+	int cpu, i;
+
+	for_each_possible_cpu(cpu) {
+		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
+
+		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
+			struct io_kiocb_cache_entry *ce = &cache->caches[i];
+
+			freed += __io_uring_cache_shrink(ce, i, &nr_to_scan);
+			if (!nr_to_scan)
+				break;
+		}
+		if (!nr_to_scan)
+			break;
+	}
+
+	return freed ?: SHRINK_STOP;
+}
+
+static unsigned long io_uring_cache_scan(struct shrinker *shrink,
+					 struct shrink_control *sc)
+{
+	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+		return SHRINK_STOP;
+
+	return io_uring_cache_shrink(sc->nr_to_scan);
+}
+
+static struct shrinker io_uring_shrinker = {
+	.count_objects	= io_uring_cache_count,
+	.scan_objects	= io_uring_cache_scan,
+	.seeks		= DEFAULT_SEEKS,
+};
+
+static void io_uring_kill_ce(struct io_kiocb_cache_entry *ce, bool irq_comp)
+{
+	struct io_kiocb *req;
+
+	list_splice_init(&ce->alloc_list, &ce->free_list);
+
+	while (!list_empty(&ce->free_list)) {
+		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
+		list_del(&req->list);
+		kmem_cache_free(req_cachep, req);
+	}
+
+	ce->nr_free = ce->nr_avail = 0;
+}
+
+static int io_uring_notify_dead(unsigned int cpu)
+{
+	struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(cache->caches); i++)
+		io_uring_kill_ce(&cache->caches[i], i);
+
+	return 0;
+}
+
 static int __init io_uring_init(void)
 {
+	int cpu, i;
+
 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
 	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
 	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
@@ -8142,6 +8373,25 @@ static int __init io_uring_init(void)
 	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
 	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
 	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+
+	alloc_cache = alloc_percpu(struct io_kiocb_cache);
+	for_each_possible_cpu(cpu) {
+		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
+
+		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
+			struct io_kiocb_cache_entry *ce = &cache->caches[i];
+
+			INIT_LIST_HEAD(&ce->alloc_list);
+			spin_lock_init(&ce->free_lock);
+			INIT_LIST_HEAD(&ce->free_list);
+			ce->nr_free = 0;
+			ce->nr_avail = 0;
+		}
+	}
+
+	cpuhp_setup_state_nocalls(CPUHP_IOURING_DEAD, "io_uring:dead", NULL,
+					io_uring_notify_dead);
+	WARN_ON(register_shrinker(&io_uring_shrinker));
 	return 0;
 };
 __initcall(io_uring_init);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 77d70b633531..3b80556572a5 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -60,6 +60,7 @@ enum cpuhp_state {
 	CPUHP_LUSTRE_CFS_DEAD,
 	CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
 	CPUHP_PADATA_DEAD,
+	CPUHP_IOURING_DEAD,
 	CPUHP_WORKQUEUE_PREP,
 	CPUHP_POWER_NUMA_PREPARE,
 	CPUHP_HRTIMERS_PREPARE,

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-13 16:30 [PATCH RFC] io_uring: io_kiocb alloc cache Jens Axboe
@ 2020-05-13 17:42   ` Jann Horn
  2020-05-14  8:25 ` Xiaoguang Wang
  1 sibling, 0 replies; 22+ messages in thread
From: Jann Horn @ 2020-05-13 17:42 UTC (permalink / raw)
  To: Jens Axboe
  Cc: io-uring, Xiaoguang Wang, joseph qi, Jiufei Xue, Pavel Begunkov,
	Christoph Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, Linux-MM

+slab allocator people

On Wed, May 13, 2020 at 6:30 PM Jens Axboe <axboe@kernel.dk> wrote:
> I turned the quick'n dirty from the other day into something a bit
> more done. Would be great if someone else could run some performance
> testing with this, I get about a 10% boost on the pure NOP benchmark
> with this. But that's just on my laptop in qemu, so some real iron
> testing would be awesome.

10% boost compared to which allocator? Are you using CONFIG_SLUB?

> The idea here is to have a percpu alloc cache. There's two sets of
> state:
>
> 1) Requests that have IRQ completion. preempt disable is not enough
>    there, we need to disable local irqs. This is a lot slower in
>    certain setups, so we keep this separate.
>
> 2) No IRQ completion, we can get by with just disabling preempt.

The SLUB allocator has percpu caching, too, and as long as you don't
enable any SLUB debugging or ASAN or such, and you're not hitting any
slowpath processing, it doesn't even have to disable interrupts, it
gets away with cmpxchg_double.

Have you profiled what the actual problem is when using SLUB? Have you
tested with CONFIG_SLAB_FREELIST_HARDENED turned off,
CONFIG_SLUB_DEBUG turned off, CONFIG_TRACING turned off,
CONFIG_FAILSLAB turned off, and so on? As far as I know, if you
disable all hardening and debugging infrastructure, SLUB's
kmem_cache_alloc()/kmem_cache_free() on the fastpaths should be really
straightforward. And if you don't turn those off, the comparison is
kinda unfair, because your custom freelist won't respect those flags.

When you build custom allocators like this, it interferes with
infrastructure meant to catch memory safety issues and such (both pure
debugging code and safety checks meant for production use) - for
example, ASAN and memory tagging will no longer be able to detect
use-after-free issues in objects managed by your custom allocator
cache.

So please, don't implement custom one-off allocators in random
subsystems. And if you do see a way to actually improve the
performance of memory allocation, add that to the generic SLUB
infrastructure.

> Outside of that, any freed requests goes to the ce->alloc_list.
> Attempting to alloc a request will check there first. When freeing
> a request, if we're over some threshold, move requests to the
> ce->free_list. This list can be browsed by the shrinker to free
> up memory. If a CPU goes offline, all requests are reaped.
>
> That's about it. If we go further with this, it'll be split into
> a few separate patches. For now, just throwing this out there
> for testing. The patch is against my for-5.8/io_uring branch.

That branch doesn't seem to exist on
<https://git.kernel.dk/cgit/linux-block/>...

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
@ 2020-05-13 17:42   ` Jann Horn
  0 siblings, 0 replies; 22+ messages in thread
From: Jann Horn @ 2020-05-13 17:42 UTC (permalink / raw)
  To: Jens Axboe
  Cc: io-uring, Xiaoguang Wang, joseph qi, Jiufei Xue, Pavel Begunkov,
	Christoph Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, Linux-MM

+slab allocator people

On Wed, May 13, 2020 at 6:30 PM Jens Axboe <axboe@kernel.dk> wrote:
> I turned the quick'n dirty from the other day into something a bit
> more done. Would be great if someone else could run some performance
> testing with this, I get about a 10% boost on the pure NOP benchmark
> with this. But that's just on my laptop in qemu, so some real iron
> testing would be awesome.

10% boost compared to which allocator? Are you using CONFIG_SLUB?

> The idea here is to have a percpu alloc cache. There's two sets of
> state:
>
> 1) Requests that have IRQ completion. preempt disable is not enough
>    there, we need to disable local irqs. This is a lot slower in
>    certain setups, so we keep this separate.
>
> 2) No IRQ completion, we can get by with just disabling preempt.

The SLUB allocator has percpu caching, too, and as long as you don't
enable any SLUB debugging or ASAN or such, and you're not hitting any
slowpath processing, it doesn't even have to disable interrupts, it
gets away with cmpxchg_double.

Have you profiled what the actual problem is when using SLUB? Have you
tested with CONFIG_SLAB_FREELIST_HARDENED turned off,
CONFIG_SLUB_DEBUG turned off, CONFIG_TRACING turned off,
CONFIG_FAILSLAB turned off, and so on? As far as I know, if you
disable all hardening and debugging infrastructure, SLUB's
kmem_cache_alloc()/kmem_cache_free() on the fastpaths should be really
straightforward. And if you don't turn those off, the comparison is
kinda unfair, because your custom freelist won't respect those flags.

When you build custom allocators like this, it interferes with
infrastructure meant to catch memory safety issues and such (both pure
debugging code and safety checks meant for production use) - for
example, ASAN and memory tagging will no longer be able to detect
use-after-free issues in objects managed by your custom allocator
cache.

So please, don't implement custom one-off allocators in random
subsystems. And if you do see a way to actually improve the
performance of memory allocation, add that to the generic SLUB
infrastructure.

> Outside of that, any freed requests goes to the ce->alloc_list.
> Attempting to alloc a request will check there first. When freeing
> a request, if we're over some threshold, move requests to the
> ce->free_list. This list can be browsed by the shrinker to free
> up memory. If a CPU goes offline, all requests are reaped.
>
> That's about it. If we go further with this, it'll be split into
> a few separate patches. For now, just throwing this out there
> for testing. The patch is against my for-5.8/io_uring branch.

That branch doesn't seem to exist on
<https://git.kernel.dk/cgit/linux-block/>...


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-13 17:42   ` Jann Horn
  (?)
@ 2020-05-13 18:34   ` Jens Axboe
  -1 siblings, 0 replies; 22+ messages in thread
From: Jens Axboe @ 2020-05-13 18:34 UTC (permalink / raw)
  To: Jann Horn
  Cc: io-uring, Xiaoguang Wang, joseph qi, Jiufei Xue, Pavel Begunkov,
	Christoph Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, Linux-MM

On 5/13/20 11:42 AM, Jann Horn wrote:
> +slab allocator people
> 
> On Wed, May 13, 2020 at 6:30 PM Jens Axboe <axboe@kernel.dk> wrote:
>> I turned the quick'n dirty from the other day into something a bit
>> more done. Would be great if someone else could run some performance
>> testing with this, I get about a 10% boost on the pure NOP benchmark
>> with this. But that's just on my laptop in qemu, so some real iron
>> testing would be awesome.
> 
> 10% boost compared to which allocator? Are you using CONFIG_SLUB?

SLUB, yes.

>> The idea here is to have a percpu alloc cache. There's two sets of
>> state:
>>
>> 1) Requests that have IRQ completion. preempt disable is not enough
>>    there, we need to disable local irqs. This is a lot slower in
>>    certain setups, so we keep this separate.
>>
>> 2) No IRQ completion, we can get by with just disabling preempt.
> 
> The SLUB allocator has percpu caching, too, and as long as you don't
> enable any SLUB debugging or ASAN or such, and you're not hitting any
> slowpath processing, it doesn't even have to disable interrupts, it
> gets away with cmpxchg_double.
> 
> Have you profiled what the actual problem is when using SLUB? Have you
> tested with CONFIG_SLAB_FREELIST_HARDENED turned off,
> CONFIG_SLUB_DEBUG turned off, CONFIG_TRACING turned off,
> CONFIG_FAILSLAB turned off, and so on? As far as I know, if you
> disable all hardening and debugging infrastructure, SLUB's
> kmem_cache_alloc()/kmem_cache_free() on the fastpaths should be really
> straightforward. And if you don't turn those off, the comparison is
> kinda unfair, because your custom freelist won't respect those flags.

But that's sort of the point. I don't have any nasty SLUB options
enabled, just the default. And that includes CONFIG_SLUB_DEBUG. Which
all the distros have enabled, I believe.

So yes, I could compare to a bare bones SLUB, and I'll definitely do
that because I'm curious. And it also could be an artifact of qemu,
sometimes that behaves differently than a real host (locks/irq is more
expensive, for example). Not sure how much with SLUB in particular,
haven't done targeted benchmarking of that.

The patch is just tossed out there for experimentation reasons, in case
it wasn't clear. It's not like I'm proposing this for inclusion. But if
the wins are big enough over a _normal_ configuration, then it's
definitely tempting.

> When you build custom allocators like this, it interferes with
> infrastructure meant to catch memory safety issues and such (both pure
> debugging code and safety checks meant for production use) - for
> example, ASAN and memory tagging will no longer be able to detect
> use-after-free issues in objects managed by your custom allocator
> cache.
> 
> So please, don't implement custom one-off allocators in random
> subsystems. And if you do see a way to actually improve the
> performance of memory allocation, add that to the generic SLUB
> infrastructure.

I hear you. This isn't unique, fwiw. Networking has a page pool
allocator for example, which I did consider tapping into.

Anyway, I/we will be a lot wiser once this experiment progresses!

>> Outside of that, any freed requests goes to the ce->alloc_list.
>> Attempting to alloc a request will check there first. When freeing
>> a request, if we're over some threshold, move requests to the
>> ce->free_list. This list can be browsed by the shrinker to free
>> up memory. If a CPU goes offline, all requests are reaped.
>>
>> That's about it. If we go further with this, it'll be split into
>> a few separate patches. For now, just throwing this out there
>> for testing. The patch is against my for-5.8/io_uring branch.
> 
> That branch doesn't seem to exist on
> <https://git.kernel.dk/cgit/linux-block/>...

Oh oops, guess I never pushed that out. Will do so.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-13 17:42   ` Jann Horn
  (?)
  (?)
@ 2020-05-13 19:20   ` Pekka Enberg
  2020-05-13 20:09     ` Jens Axboe
  -1 siblings, 1 reply; 22+ messages in thread
From: Pekka Enberg @ 2020-05-13 19:20 UTC (permalink / raw)
  To: Jann Horn
  Cc: Jens Axboe, io-uring, Xiaoguang Wang, joseph qi, Jiufei Xue,
	Pavel Begunkov, Christoph Lameter, Pekka Enberg, David Rientjes,
	Joonsoo Kim, Andrew Morton, Linux-MM


Hi,

On Wed, May 13, 2020 at 6:30 PM Jens Axboe <axboe@kernel.dk> wrote:
> > I turned the quick'n dirty from the other day into something a bit 
> > more done. Would be great if someone else could run some
> > performance testing with this, I get about a 10% boost on the pure
> > NOP benchmark with this. But that's just on my laptop in qemu, so
> > some real iron testing would be awesome.

On 5/13/20 8:42 PM, Jann Horn wrote:> +slab allocator people
> 10% boost compared to which allocator? Are you using CONFIG_SLUB?
 
On Wed, May 13, 2020 at 6:30 PM Jens Axboe <axboe@kernel.dk> wrote:
> > The idea here is to have a percpu alloc cache. There's two sets of 
> > state:
> > 
> > 1) Requests that have IRQ completion. preempt disable is not
> > enough there, we need to disable local irqs. This is a lot slower
> > in certain setups, so we keep this separate.
> > 
> > 2) No IRQ completion, we can get by with just disabling preempt.

On 5/13/20 8:42 PM, Jann Horn wrote:> +slab allocator people
> The SLUB allocator has percpu caching, too, and as long as you don't 
> enable any SLUB debugging or ASAN or such, and you're not hitting
> any slowpath processing, it doesn't even have to disable interrupts,
> it gets away with cmpxchg_double.

The struct io_kiocb is 240 bytes. I don't see a dedicated slab for it in
/proc/slabinfo on my machine, so it likely got merged to the kmalloc-256
cache. This means that there's 32 objects in the per-CPU cache. Jens, on
the other hand, made the cache much bigger:

+#define IO_KIOCB_CACHE_MAX 256

So I assume if someone does "perf record", they will see significant
reduction in page allocator activity with Jens' patch. One possible way
around that is forcing the page allocation order to be much higher. IOW,
something like the following completely untested patch:

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 979d9f977409..c3bf7b72026d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8143,7 +8143,7 @@ static int __init io_uring_init(void)
 
 	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
 	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
-	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_LARGE_ORDER);
 	return 0;
 };
 __initcall(io_uring_init);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 6d454886bcaf..316fd821ec1f 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -39,6 +39,8 @@
 #define SLAB_STORE_USER		((slab_flags_t __force)0x00010000U)
 /* Panic if kmem_cache_create() fails */
 #define SLAB_PANIC		((slab_flags_t __force)0x00040000U)
+/* Force slab page allocation order to be as large as possible */
+#define SLAB_LARGE_ORDER	((slab_flags_t __force)0x00080000U)
 /*
  * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
  *
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 23c7500eea7d..a18bbe9472e4 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -51,7 +51,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
  */
 #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
 		SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
-		SLAB_FAILSLAB | SLAB_KASAN)
+		SLAB_FAILSLAB | SLAB_KASAN | SLAB_LARGE_ORDER)
 
 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
 			 SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
diff --git a/mm/slub.c b/mm/slub.c
index b762450fc9f0..d1d86b1279aa 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3318,12 +3318,15 @@ static inline unsigned int slab_order(unsigned int size,
 	return order;
 }
 
-static inline int calculate_order(unsigned int size)
+static inline int calculate_order(unsigned int size, gfp_t flags)
 {
 	unsigned int order;
 	unsigned int min_objects;
 	unsigned int max_objects;
 
+	if (flags & SLAB_LARGE_ORDER)
+		return slub_max_order;
+
 	/*
 	 * Attempt to find best configuration for a slab. This
 	 * works by first attempting to generate a layout with
@@ -3651,7 +3654,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 	if (forced_order >= 0)
 		order = forced_order;
 	else
-		order = calculate_order(size);
+		order = calculate_order(size, flags);
 
 	if ((int)order < 0)
 		return 0;

- Pekka

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-13 19:20   ` Pekka Enberg
@ 2020-05-13 20:09     ` Jens Axboe
  2020-05-13 20:31       ` Pekka Enberg
  0 siblings, 1 reply; 22+ messages in thread
From: Jens Axboe @ 2020-05-13 20:09 UTC (permalink / raw)
  To: Pekka Enberg, Jann Horn
  Cc: io-uring, Xiaoguang Wang, joseph qi, Jiufei Xue, Pavel Begunkov,
	Christoph Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, Linux-MM

On 5/13/20 1:20 PM, Pekka Enberg wrote:
> 
> Hi,
> 
> On Wed, May 13, 2020 at 6:30 PM Jens Axboe <axboe@kernel.dk> wrote:
>>> I turned the quick'n dirty from the other day into something a bit 
>>> more done. Would be great if someone else could run some
>>> performance testing with this, I get about a 10% boost on the pure
>>> NOP benchmark with this. But that's just on my laptop in qemu, so
>>> some real iron testing would be awesome.
> 
> On 5/13/20 8:42 PM, Jann Horn wrote:> +slab allocator people
>> 10% boost compared to which allocator? Are you using CONFIG_SLUB?
>  
> On Wed, May 13, 2020 at 6:30 PM Jens Axboe <axboe@kernel.dk> wrote:
>>> The idea here is to have a percpu alloc cache. There's two sets of 
>>> state:
>>>
>>> 1) Requests that have IRQ completion. preempt disable is not
>>> enough there, we need to disable local irqs. This is a lot slower
>>> in certain setups, so we keep this separate.
>>>
>>> 2) No IRQ completion, we can get by with just disabling preempt.
> 
> On 5/13/20 8:42 PM, Jann Horn wrote:> +slab allocator people
>> The SLUB allocator has percpu caching, too, and as long as you don't 
>> enable any SLUB debugging or ASAN or such, and you're not hitting
>> any slowpath processing, it doesn't even have to disable interrupts,
>> it gets away with cmpxchg_double.
> 
> The struct io_kiocb is 240 bytes. I don't see a dedicated slab for it in
> /proc/slabinfo on my machine, so it likely got merged to the kmalloc-256
> cache. This means that there's 32 objects in the per-CPU cache. Jens, on
> the other hand, made the cache much bigger:

Right, it gets merged with kmalloc-256 (and 5 others) in my testing.

> +#define IO_KIOCB_CACHE_MAX 256
> 
> So I assume if someone does "perf record", they will see significant
> reduction in page allocator activity with Jens' patch. One possible way
> around that is forcing the page allocation order to be much higher. IOW,
> something like the following completely untested patch:

Now tested, I gave it a shot. This seems to bring performance to
basically what the io_uring patch does, so that's great! Again, just in
the microbenchmark test case, so freshly booted and just running the
case.

Will this patch introduce latencies or non-deterministic behavior for a
fragmented system?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-13 20:09     ` Jens Axboe
@ 2020-05-13 20:31       ` Pekka Enberg
  2020-05-13 20:44         ` Jens Axboe
  0 siblings, 1 reply; 22+ messages in thread
From: Pekka Enberg @ 2020-05-13 20:31 UTC (permalink / raw)
  To: Jens Axboe, Jann Horn
  Cc: io-uring, Xiaoguang Wang, joseph qi, Jiufei Xue, Pavel Begunkov,
	Christoph Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, Linux-MM

Hi Jens,

On 5/13/20 1:20 PM, Pekka Enberg wrote:
>> So I assume if someone does "perf record", they will see significant
>> reduction in page allocator activity with Jens' patch. One possible way
>> around that is forcing the page allocation order to be much higher. IOW,
>> something like the following completely untested patch:

On 5/13/20 11:09 PM, Jens Axboe wrote:
> Now tested, I gave it a shot. This seems to bring performance to
> basically what the io_uring patch does, so that's great! Again, just in
> the microbenchmark test case, so freshly booted and just running the
> case.

Great, thanks for testing!

On 5/13/20 11:09 PM, Jens Axboe wrote:
> Will this patch introduce latencies or non-deterministic behavior for a
> fragmented system?

You have to talk to someone who is more up-to-date with how the page 
allocator operates today. But yeah, I assume people still want to avoid 
higher-order allocations as much as possible, because they make 
allocation harder when memory is fragmented.

That said, perhaps it's not going to the page allocator as much as I 
thought, but the problem is that the per-CPU cache size is just too small 
for these allocations, forcing do_slab_free() to take the slow path 
often. Would be interesting to know if CONFIG_SLAB does better here 
because the per-CPU cache size is much larger IIRC.

- Pekka

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-13 20:31       ` Pekka Enberg
@ 2020-05-13 20:44         ` Jens Axboe
  0 siblings, 0 replies; 22+ messages in thread
From: Jens Axboe @ 2020-05-13 20:44 UTC (permalink / raw)
  To: Pekka Enberg, Jann Horn
  Cc: io-uring, Xiaoguang Wang, joseph qi, Jiufei Xue, Pavel Begunkov,
	Christoph Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, Linux-MM

On 5/13/20 2:31 PM, Pekka Enberg wrote:
> Hi Jens,
> 
> On 5/13/20 1:20 PM, Pekka Enberg wrote:
>>> So I assume if someone does "perf record", they will see significant
>>> reduction in page allocator activity with Jens' patch. One possible way
>>> around that is forcing the page allocation order to be much higher. IOW,
>>> something like the following completely untested patch:
> 
> On 5/13/20 11:09 PM, Jens Axboe wrote:
>> Now tested, I gave it a shot. This seems to bring performance to
>> basically what the io_uring patch does, so that's great! Again, just in
>> the microbenchmark test case, so freshly booted and just running the
>> case.
> 
> Great, thanks for testing!
> 
> On 5/13/20 11:09 PM, Jens Axboe wrote:
>> Will this patch introduce latencies or non-deterministic behavior for a
>> fragmented system?
> 
> You have to talk to someone who is more up-to-date with how the page 
> allocator operates today. But yeah, I assume people still want to avoid 
> higher-order allocations as much as possible, because they make 
> allocation harder when memory is fragmented.

That was my thinking... I don't want a random io_kiocb allocation to
take a long time because of high order allocations.

> That said, perhaps it's not going to the page allocator as much as I 
> thought, but the problem is that the per-CPU cache size is just too small 
> for these allocations, forcing do_slab_free() to take the slow path 
> often. Would be interesting to know if CONFIG_SLAB does better here 
> because the per-CPU cache size is much larger IIRC.

Just tried with SLAB, and it's roughly 4-5% down from the baseline
(non-modified) SLUB. So not faster, at least for this case.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-13 16:30 [PATCH RFC] io_uring: io_kiocb alloc cache Jens Axboe
  2020-05-13 17:42   ` Jann Horn
@ 2020-05-14  8:25 ` Xiaoguang Wang
  2020-05-14 14:22   ` Jens Axboe
  1 sibling, 1 reply; 22+ messages in thread
From: Xiaoguang Wang @ 2020-05-14  8:25 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: joseph qi, Jiufei Xue, Pavel Begunkov

hi,

> +
> +static void io_req_cache_free_bulk(struct req_batch *rb)
> +{
> +	struct io_kiocb_cache_entry *ce;
> +	struct io_kiocb_cache *cache;
> +
> +	if (rb->irq_comp)
> +		local_irq_disable();
> +	else
> +		preempt_disable();
> +
> +	cache = this_cpu_ptr(alloc_cache);
> +	ce = &cache->caches[rb->irq_comp];
> +
> +	list_splice_init(&rb->list, &ce->alloc_list);
> +	ce->nr_avail += rb->to_free;
> +	if (ce->nr_avail > IO_KIOCB_CACHE_MAX)
> +		io_req_cache_reclaim(ce);
> +
> +	if (rb->irq_comp)
> +		local_irq_enable();
> +	else
> +		preempt_enable();
> +}
> +
> +static void io_req_cache_free(struct io_kiocb *req)
> +{
> +	const bool irq_comp = io_op_defs[req->opcode].irq_comp;
> +	struct io_kiocb_cache_entry *ce;
> +	struct io_kiocb_cache *cache;
> +	unsigned long flags;
> +
> +	if (irq_comp)
> +		local_irq_save(flags);
> +	else
> +		preempt_disable();
> +
> +	cache = this_cpu_ptr(alloc_cache);
> +	ce = &cache->caches[irq_comp];
> +
> +	list_add(&req->list, &ce->alloc_list);
> +	if (++ce->nr_avail > IO_KIOCB_CACHE_MAX)
> +		io_req_cache_reclaim(ce);
The above code seems to show that io_req_cache_reclaim() will free all reqs in alloc_list,
then we'll need to kmem_cache_alloc() again, I guess indeed you intend to reserve
IO_KIOCB_CACHE_MAX reqs in alloc_list?

I still use my previous io_uring_nop_stress tool to evaluate the improvement
in a physical machine. Memory 250GB and cpu is "Intel(R) Xeon(R) CPU E5-2682 v4 @ 2.50GHz".
Before this patch:
$sudo taskset -c 60 ./io_uring_nop_stress -r 300
total ios: 1608773840
IOPS:      5362579

With this patch:
sudo taskset -c 60 ./io_uring_nop_stress -r 300
total ios: 1676910736
IOPS:      5589702
About 4.2% improvement.

Regards,
Xiaoguang Wang
> +
> +	if (irq_comp)
> +		local_irq_restore(flags);
> +	else
> +		preempt_enable();
> +}
> +
> +static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, int opcode)
> +{
> +	struct io_kiocb *req;
> +
> +	req = io_req_cache_alloc(opcode);
> +	if (req) {
> +		req->opcode = opcode;
> +		return req;
> +	}
> +
> +	return io_get_fallback_req(ctx, opcode);
>   }
>   
>   static inline void io_put_file(struct io_kiocb *req, struct file *file,
> @@ -1345,7 +1449,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
>   	if (req->flags & REQ_F_NEED_CLEANUP)
>   		io_cleanup_req(req);
>   
> -	kfree(req->io);
> +	if (req->io)
> +		kfree(req->io);
>   	if (req->file)
>   		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
>   	if (req->task)
> @@ -1371,28 +1476,21 @@ static void __io_free_req(struct io_kiocb *req)
>   
>   	percpu_ref_put(&req->ctx->refs);
>   	if (likely(!io_is_fallback_req(req)))
> -		kmem_cache_free(req_cachep, req);
> +		io_req_cache_free(req);
>   	else
>   		clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
>   }
>   
> -struct req_batch {
> -	void *reqs[IO_IOPOLL_BATCH];
> -	int to_free;
> -	int need_iter;
> -};
> -
>   static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
>   {
>   	if (!rb->to_free)
>   		return;
>   	if (rb->need_iter) {
> -		int i, inflight = 0;
> +		struct io_kiocb *req;
>   		unsigned long flags;
> +		int inflight = 0;
>   
> -		for (i = 0; i < rb->to_free; i++) {
> -			struct io_kiocb *req = rb->reqs[i];
> -
> +		list_for_each_entry(req, &rb->list, list) {
>   			if (req->flags & REQ_F_FIXED_FILE) {
>   				req->file = NULL;
>   				percpu_ref_put(req->fixed_file_refs);
> @@ -1405,9 +1503,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
>   			goto do_free;
>   
>   		spin_lock_irqsave(&ctx->inflight_lock, flags);
> -		for (i = 0; i < rb->to_free; i++) {
> -			struct io_kiocb *req = rb->reqs[i];
> -
> +		list_for_each_entry(req, &rb->list, list) {
>   			if (req->flags & REQ_F_INFLIGHT) {
>   				list_del(&req->inflight_entry);
>   				if (!--inflight)
> @@ -1420,9 +1516,10 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
>   			wake_up(&ctx->inflight_wait);
>   	}
>   do_free:
> -	kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
> +	io_req_cache_free_bulk(rb);
>   	percpu_ref_put_many(&ctx->refs, rb->to_free);
> -	rb->to_free = rb->need_iter = 0;
> +	rb->to_free = 0;
> +	rb->need_iter = rb->irq_comp = false;
>   }
>   
>   static bool io_link_cancel_timeout(struct io_kiocb *req)
> @@ -1670,11 +1767,12 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
>   		return false;
>   
>   	if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
> -		rb->need_iter++;
> +		rb->need_iter |= true;
> +	if (!rb->irq_comp && io_op_defs[req->opcode].irq_comp)
> +		rb->irq_comp |= true;
>   
> -	rb->reqs[rb->to_free++] = req;
> -	if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
> -		io_free_req_many(req->ctx, rb);
> +	list_add(&req->list, &rb->list);
> +	rb->to_free++;
>   	return true;
>   }
>   
> @@ -1697,10 +1795,14 @@ static int io_put_kbuf(struct io_kiocb *req)
>   static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
>   			       struct list_head *done)
>   {
> -	struct req_batch rb;
> +	struct req_batch rb = {
> +		.list		= LIST_HEAD_INIT(rb.list),
> +		.to_free	= 0,
> +		.need_iter	= false,
> +		.irq_comp	= false
> +	};
>   	struct io_kiocb *req;
>   
> -	rb.to_free = rb.need_iter = 0;
>   	while (!list_empty(done)) {
>   		int cflags = 0;
>   
> @@ -5703,8 +5805,6 @@ static void io_submit_state_end(struct io_submit_state *state)
>   {
>   	blk_finish_plug(&state->plug);
>   	io_file_put(state);
> -	if (state->free_reqs)
> -		kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
>   }
>   
>   /*
> @@ -5714,7 +5814,6 @@ static void io_submit_state_start(struct io_submit_state *state,
>   				  unsigned int max_ios)
>   {
>   	blk_start_plug(&state->plug);
> -	state->free_reqs = 0;
>   	state->file = NULL;
>   	state->ios_left = max_ios;
>   }
> @@ -5784,7 +5883,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
>   	 * link list.
>   	 */
>   	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
> -	req->opcode = READ_ONCE(sqe->opcode);
>   	req->user_data = READ_ONCE(sqe->user_data);
>   	req->io = NULL;
>   	req->file = NULL;
> @@ -5872,14 +5970,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>   			io_consume_sqe(ctx);
>   			break;
>   		}
> -		req = io_alloc_req(ctx, statep);
> +		req = io_alloc_req(ctx, READ_ONCE(sqe->opcode));
>   		if (unlikely(!req)) {
>   			if (!submitted)
>   				submitted = -EAGAIN;
>   			break;
>   		}
>   
> -		err = io_init_req(ctx, req, sqe, statep, async);
> +		err = io_init_req(ctx, req, sqe, NULL, async);
>   		io_consume_sqe(ctx);
>   		/* will complete beyond this point, count as submitted */
>   		submitted++;
> @@ -7626,6 +7724,17 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
>   					req->task->task_works != NULL);
>   	}
>   	spin_unlock_irq(&ctx->completion_lock);
> +	seq_printf(m, "AllocCache:\n");
> +	for_each_possible_cpu(i) {
> +		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, i);
> +		int j;
> +
> +		for (j = 0; j < ARRAY_SIZE(cache->caches); j++) {
> +			struct io_kiocb_cache_entry *ce = &cache->caches[j];
> +
> +			seq_printf(m, "  cpu%d: irq=%d, nr_free=%d, nr_avail=%d\n", i, j, ce->nr_free, ce->nr_avail);
> +		}
> +	}
>   	mutex_unlock(&ctx->uring_lock);
>   }
>   
> @@ -8101,8 +8210,130 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
>   	return ret;
>   }
>   
> +static unsigned long io_uring_cache_count(struct shrinker *shrink,
> +					  struct shrink_control *sc)
> +{
> +	unsigned long count = 0;
> +	int cpu, i;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct io_kiocb_cache *cache;
> +
> +		cache = per_cpu_ptr(alloc_cache, cpu);
> +		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
> +			struct io_kiocb_cache_entry *ce = &cache->caches[i];
> +
> +			count += ce->nr_free;
> +		}
> +	}
> +
> +	return count;
> +}
> +
> +static unsigned long __io_uring_cache_shrink(struct io_kiocb_cache_entry *ce,
> +					     int irq_comp, int *nr_to_scan)
> +{
> +	unsigned long freed = 0;
> +	struct io_kiocb *req;
> +	LIST_HEAD(free_list);
> +
> +	if (!ce->nr_free)
> +		return 0;
> +
> +	if (irq_comp)
> +		spin_lock_irq(&ce->free_lock);
> +	else
> +		spin_lock(&ce->free_lock);
> +
> +	while (!list_empty(&ce->free_list)) {
> +		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
> +		list_move(&req->list, &free_list);
> +		freed++;
> +		if (!--(*nr_to_scan))
> +			break;
> +	}
> +
> +	if (irq_comp)
> +		spin_unlock_irq(&ce->free_lock);
> +	else
> +		spin_unlock(&ce->free_lock);
> +
> +	while (!list_empty(&free_list)) {
> +		req = list_first_entry(&free_list, struct io_kiocb, list);
> +		list_del(&req->list);
> +		kmem_cache_free(req_cachep, req);
> +	}
> +
> +	return freed;
> +}
> +
> +static unsigned long io_uring_cache_shrink(int nr_to_scan)
> +{
> +	long freed = 0;
> +	int cpu, i;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
> +
> +		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
> +			struct io_kiocb_cache_entry *ce = &cache->caches[i];
> +
> +			freed += __io_uring_cache_shrink(ce, i, &nr_to_scan);
> +			if (!nr_to_scan)
> +				break;
> +		}
> +		if (!nr_to_scan)
> +			break;
> +	}
> +
> +	return freed ?: SHRINK_STOP;
> +}
> +
> +static unsigned long io_uring_cache_scan(struct shrinker *shrink,
> +					 struct shrink_control *sc)
> +{
> +	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
> +		return SHRINK_STOP;
> +
> +	return io_uring_cache_shrink(sc->nr_to_scan);
> +}
> +
> +static struct shrinker io_uring_shrinker = {
> +	.count_objects	= io_uring_cache_count,
> +	.scan_objects	= io_uring_cache_scan,
> +	.seeks		= DEFAULT_SEEKS,
> +};
> +
> +static void io_uring_kill_ce(struct io_kiocb_cache_entry *ce, bool irq_comp)
> +{
> +	struct io_kiocb *req;
> +
> +	list_splice_init(&ce->alloc_list, &ce->free_list);
> +
> +	while (!list_empty(&ce->free_list)) {
> +		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
> +		list_del(&req->list);
> +		kmem_cache_free(req_cachep, req);
> +	}
> +
> +	ce->nr_free = ce->nr_avail = 0;
> +}
> +
> +static int io_uring_notify_dead(unsigned int cpu)
> +{
> +	struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(cache->caches); i++)
> +		io_uring_kill_ce(&cache->caches[i], i);
> +
> +	return 0;
> +}
> +
>   static int __init io_uring_init(void)
>   {
> +	int cpu, i;
> +
>   #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
>   	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
>   	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
> @@ -8142,6 +8373,25 @@ static int __init io_uring_init(void)
>   	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
>   	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
>   	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
> +
> +	alloc_cache = alloc_percpu(struct io_kiocb_cache);
> +	for_each_possible_cpu(cpu) {
> +		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
> +
> +		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
> +			struct io_kiocb_cache_entry *ce = &cache->caches[i];
> +
> +			INIT_LIST_HEAD(&ce->alloc_list);
> +			spin_lock_init(&ce->free_lock);
> +			INIT_LIST_HEAD(&ce->free_list);
> +			ce->nr_free = 0;
> +			ce->nr_avail = 0;
> +		}
> +	}
> +
> +	cpuhp_setup_state_nocalls(CPUHP_IOURING_DEAD, "io_uring:dead", NULL,
> +					io_uring_notify_dead);
> +	WARN_ON(register_shrinker(&io_uring_shrinker));
>   	return 0;
>   };
>   __initcall(io_uring_init);
> diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
> index 77d70b633531..3b80556572a5 100644
> --- a/include/linux/cpuhotplug.h
> +++ b/include/linux/cpuhotplug.h
> @@ -60,6 +60,7 @@ enum cpuhp_state {
>   	CPUHP_LUSTRE_CFS_DEAD,
>   	CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
>   	CPUHP_PADATA_DEAD,
> +	CPUHP_IOURING_DEAD,
>   	CPUHP_WORKQUEUE_PREP,
>   	CPUHP_POWER_NUMA_PREPARE,
>   	CPUHP_HRTIMERS_PREPARE,
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-14  8:25 ` Xiaoguang Wang
@ 2020-05-14 14:22   ` Jens Axboe
  2020-05-14 14:33     ` Jens Axboe
  2020-05-16 16:15     ` Xiaoguang Wang
  0 siblings, 2 replies; 22+ messages in thread
From: Jens Axboe @ 2020-05-14 14:22 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue, Pavel Begunkov

On 5/14/20 2:25 AM, Xiaoguang Wang wrote:
> hi,
> 
>> +
>> +static void io_req_cache_free_bulk(struct req_batch *rb)
>> +{
>> +	struct io_kiocb_cache_entry *ce;
>> +	struct io_kiocb_cache *cache;
>> +
>> +	if (rb->irq_comp)
>> +		local_irq_disable();
>> +	else
>> +		preempt_disable();
>> +
>> +	cache = this_cpu_ptr(alloc_cache);
>> +	ce = &cache->caches[rb->irq_comp];
>> +
>> +	list_splice_init(&rb->list, &ce->alloc_list);
>> +	ce->nr_avail += rb->to_free;
>> +	if (ce->nr_avail > IO_KIOCB_CACHE_MAX)
>> +		io_req_cache_reclaim(ce);
>> +
>> +	if (rb->irq_comp)
>> +		local_irq_enable();
>> +	else
>> +		preempt_enable();
>> +}
>> +
>> +static void io_req_cache_free(struct io_kiocb *req)
>> +{
>> +	const bool irq_comp = io_op_defs[req->opcode].irq_comp;
>> +	struct io_kiocb_cache_entry *ce;
>> +	struct io_kiocb_cache *cache;
>> +	unsigned long flags;
>> +
>> +	if (irq_comp)
>> +		local_irq_save(flags);
>> +	else
>> +		preempt_disable();
>> +
>> +	cache = this_cpu_ptr(alloc_cache);
>> +	ce = &cache->caches[irq_comp];
>> +
>> +	list_add(&req->list, &ce->alloc_list);
>> +	if (++ce->nr_avail > IO_KIOCB_CACHE_MAX)
>> +		io_req_cache_reclaim(ce);
> The above code seems to show that io_req_cache_reclaim() will free all reqs in
> alloc_list, then we'll need to kmem_cache_alloc() again, I guess
> indeed you intend to reserve IO_KIOCB_CACHE_MAX reqs in alloc_list?

Yeah a thinko in that one, actually did a v2 shortly after that, just
didn't send it out. Including it below in case you are interested, when
it hits reclaim, it reclaims IO_KIOCB_CACHE_RECLAIM of them.

> I still use my previous io_uring_nop_stress tool to evaluate the improvement
> in a physical machine. Memory 250GB and cpu is "Intel(R) Xeon(R) CPU E5-2682 v4 @ 2.50GHz".
> Before this patch:
> $sudo taskset -c 60 ./io_uring_nop_stress -r 300
> total ios: 1608773840
> IOPS:      5362579
> 
> With this patch:
> sudo taskset -c 60 ./io_uring_nop_stress -r 300
> total ios: 1676910736
> IOPS:      5589702
> About 4.2% improvement.

That's not bad. Can you try the patch from Pekka as well, just to see if
that helps for you?

I also had another idea... We basically have two types of request life
times:

1) io_kiocb can get queued up internally
2) io_kiocb completes inline

For the latter, it's totally feasible to just have the io_kiocb on
stack. The downside is if we need to go the slower path, then we need to
alloc an io_kiocb then and copy it. But maybe that's OK... I'll play
with it.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index d2e37215d05a..3be5f0e60d9f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -79,6 +79,7 @@
 #include <linux/fs_struct.h>
 #include <linux/splice.h>
 #include <linux/task_work.h>
+#include <linux/cpuhotplug.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
@@ -652,17 +653,10 @@ struct io_kiocb {
 };
 
 #define IO_PLUG_THRESHOLD		2
-#define IO_IOPOLL_BATCH			8
 
 struct io_submit_state {
 	struct blk_plug		plug;
 
-	/*
-	 * io_kiocb alloc cache
-	 */
-	void			*reqs[IO_IOPOLL_BATCH];
-	unsigned int		free_reqs;
-
 	/*
 	 * File reference cache
 	 */
@@ -673,6 +667,27 @@ struct io_submit_state {
 	unsigned int		ios_left;
 };
 
+struct io_kiocb_cache_entry {
+	/* ready for allocation */
+	struct list_head	alloc_list;
+	unsigned		nr_avail;
+
+	/* ready for shrinker reclaim */
+	spinlock_t		free_lock;
+	struct list_head	free_list;
+	unsigned		nr_free;
+};
+
+struct io_kiocb_cache {
+	/* one for requests with IRQ completion, one for no IRQ */
+	struct io_kiocb_cache_entry	caches[2];
+};
+
+#define IO_KIOCB_CACHE_MAX	256
+#define IO_KIOCB_CACHE_RECLAIM	 16
+
+static struct io_kiocb_cache *alloc_cache;
+
 struct io_op_def {
 	/* needs req->io allocated for deferral/async */
 	unsigned		async_ctx : 1;
@@ -695,6 +710,8 @@ struct io_op_def {
 	unsigned		pollout : 1;
 	/* op supports buffer selection */
 	unsigned		buffer_select : 1;
+	/* IRQ completion */
+	unsigned		irq_comp : 1;
 };
 
 static const struct io_op_def io_op_defs[] = {
@@ -706,6 +723,7 @@ static const struct io_op_def io_op_defs[] = {
 		.unbound_nonreg_file	= 1,
 		.pollin			= 1,
 		.buffer_select		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_WRITEV] = {
 		.async_ctx		= 1,
@@ -714,6 +732,7 @@ static const struct io_op_def io_op_defs[] = {
 		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_FSYNC] = {
 		.needs_file		= 1,
@@ -722,12 +741,14 @@ static const struct io_op_def io_op_defs[] = {
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollin			= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_WRITE_FIXED] = {
 		.needs_file		= 1,
 		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_POLL_ADD] = {
 		.needs_file		= 1,
@@ -803,12 +824,14 @@ static const struct io_op_def io_op_defs[] = {
 		.unbound_nonreg_file	= 1,
 		.pollin			= 1,
 		.buffer_select		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_WRITE] = {
 		.needs_mm		= 1,
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_FADVISE] = {
 		.needs_file		= 1,
@@ -1281,54 +1304,161 @@ static inline bool io_is_fallback_req(struct io_kiocb *req)
 			((unsigned long) req->ctx->fallback_req & ~1UL);
 }
 
-static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
+static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx, int op)
 {
 	struct io_kiocb *req;
 
 	req = ctx->fallback_req;
-	if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
+	if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req)) {
+		req->opcode = op;
 		return req;
+	}
 
 	return NULL;
 }
 
-static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
-				     struct io_submit_state *state)
+static bool io_req_cache_steal(struct io_kiocb_cache_entry *ce)
 {
-	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
-	struct io_kiocb *req;
+	int nr = 0;
 
-	if (!state) {
-		req = kmem_cache_alloc(req_cachep, gfp);
-		if (unlikely(!req))
-			goto fallback;
-	} else if (!state->free_reqs) {
-		size_t sz;
-		int ret;
+	if (ce->nr_free < IO_KIOCB_CACHE_RECLAIM)
+		return false;
 
-		sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
-		ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
+	spin_lock(&ce->free_lock);
+	while (!list_empty(&ce->free_list)) {
+		struct io_kiocb *req;
 
-		/*
-		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
-		 * retry single alloc to be on the safe side.
-		 */
-		if (unlikely(ret <= 0)) {
-			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
-			if (!state->reqs[0])
-				goto fallback;
-			ret = 1;
-		}
-		state->free_reqs = ret - 1;
-		req = state->reqs[ret - 1];
-	} else {
-		state->free_reqs--;
-		req = state->reqs[state->free_reqs];
+		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
+		list_move(&req->list, &ce->alloc_list);
+		if (++nr >= IO_KIOCB_CACHE_RECLAIM)
+			break;
 	}
+	ce->nr_avail += nr;
+	ce->nr_free -= nr;
+	spin_unlock(&ce->free_lock);
+	return nr > 0;
+}
+
+static struct io_kiocb *io_req_cache_alloc(int op)
+{
+	const bool irq_comp = io_op_defs[op].irq_comp;
+	struct io_kiocb_cache_entry *ce;
+	struct io_kiocb_cache *cache;
+	struct io_kiocb *req = NULL;
+
+	if (irq_comp)
+		local_irq_disable();
+	else
+		preempt_disable();
+
+	cache = this_cpu_ptr(alloc_cache);
+	ce = &cache->caches[irq_comp];
+
+	if (!list_empty(&ce->alloc_list) || io_req_cache_steal(ce)) {
+		req = list_first_entry(&ce->alloc_list, struct io_kiocb, list);
+		list_del(&req->list);
+		ce->nr_avail--;
+	}
+
+	if (irq_comp)
+		local_irq_enable();
+	else
+		preempt_enable();
+
+	if (req)
+		return req;
 
-	return req;
-fallback:
-	return io_get_fallback_req(ctx);
+	return kmem_cache_alloc(req_cachep, GFP_KERNEL);
+}
+
+static void io_req_cache_reclaim(struct io_kiocb_cache_entry *ce)
+{
+	LIST_HEAD(free_list);
+	int nr = 0;
+
+	while (!list_empty(&ce->alloc_list)) {
+		struct io_kiocb *req;
+
+		req = list_last_entry(&ce->alloc_list, struct io_kiocb, list);
+		list_move(&req->list, &free_list);
+		if (++nr >= IO_KIOCB_CACHE_RECLAIM)
+			break;
+	}
+
+	spin_lock(&ce->free_lock);
+	list_splice(&free_list, &ce->free_list);
+	ce->nr_free += nr;
+	ce->nr_avail -= nr;
+	spin_unlock(&ce->free_lock);
+}
+
+struct req_batch {
+	struct list_head list;
+	int to_free;
+	bool need_iter;
+	bool irq_comp;
+};
+
+static void io_req_cache_free_bulk(struct req_batch *rb)
+{
+	struct io_kiocb_cache_entry *ce;
+	struct io_kiocb_cache *cache;
+
+	if (rb->irq_comp)
+		local_irq_disable();
+	else
+		preempt_disable();
+
+	cache = this_cpu_ptr(alloc_cache);
+	ce = &cache->caches[rb->irq_comp];
+
+	list_splice_init(&rb->list, &ce->alloc_list);
+	ce->nr_avail += rb->to_free;
+	if (ce->nr_avail > IO_KIOCB_CACHE_MAX)
+		io_req_cache_reclaim(ce);
+
+	if (rb->irq_comp)
+		local_irq_enable();
+	else
+		preempt_enable();
+}
+
+static void io_req_cache_free(struct io_kiocb *req)
+{
+	const bool irq_comp = io_op_defs[req->opcode].irq_comp;
+	struct io_kiocb_cache_entry *ce;
+	struct io_kiocb_cache *cache;
+	unsigned long flags;
+
+	if (irq_comp)
+		local_irq_save(flags);
+	else
+		preempt_disable();
+
+	cache = this_cpu_ptr(alloc_cache);
+	ce = &cache->caches[irq_comp];
+
+	list_add(&req->list, &ce->alloc_list);
+	if (++ce->nr_avail > IO_KIOCB_CACHE_MAX)
+		io_req_cache_reclaim(ce);
+
+	if (irq_comp)
+		local_irq_restore(flags);
+	else
+		preempt_enable();
+}
+
+static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, int opcode)
+{
+	struct io_kiocb *req;
+
+	req = io_req_cache_alloc(opcode);
+	if (req) {
+		req->opcode = opcode;
+		return req;
+	}
+
+	return io_get_fallback_req(ctx, opcode);
 }
 
 static inline void io_put_file(struct io_kiocb *req, struct file *file,
@@ -1345,7 +1475,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
 	if (req->flags & REQ_F_NEED_CLEANUP)
 		io_cleanup_req(req);
 
-	kfree(req->io);
+	if (req->io)
+		kfree(req->io);
 	if (req->file)
 		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
 	if (req->task)
@@ -1371,28 +1502,21 @@ static void __io_free_req(struct io_kiocb *req)
 
 	percpu_ref_put(&req->ctx->refs);
 	if (likely(!io_is_fallback_req(req)))
-		kmem_cache_free(req_cachep, req);
+		io_req_cache_free(req);
 	else
 		clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
 }
 
-struct req_batch {
-	void *reqs[IO_IOPOLL_BATCH];
-	int to_free;
-	int need_iter;
-};
-
 static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
 {
 	if (!rb->to_free)
 		return;
 	if (rb->need_iter) {
-		int i, inflight = 0;
+		struct io_kiocb *req;
 		unsigned long flags;
+		int inflight = 0;
 
-		for (i = 0; i < rb->to_free; i++) {
-			struct io_kiocb *req = rb->reqs[i];
-
+		list_for_each_entry(req, &rb->list, list) {
 			if (req->flags & REQ_F_FIXED_FILE) {
 				req->file = NULL;
 				percpu_ref_put(req->fixed_file_refs);
@@ -1405,9 +1529,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
 			goto do_free;
 
 		spin_lock_irqsave(&ctx->inflight_lock, flags);
-		for (i = 0; i < rb->to_free; i++) {
-			struct io_kiocb *req = rb->reqs[i];
-
+		list_for_each_entry(req, &rb->list, list) {
 			if (req->flags & REQ_F_INFLIGHT) {
 				list_del(&req->inflight_entry);
 				if (!--inflight)
@@ -1420,9 +1542,8 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
 			wake_up(&ctx->inflight_wait);
 	}
 do_free:
-	kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
+	io_req_cache_free_bulk(rb);
 	percpu_ref_put_many(&ctx->refs, rb->to_free);
-	rb->to_free = rb->need_iter = 0;
 }
 
 static bool io_link_cancel_timeout(struct io_kiocb *req)
@@ -1670,11 +1791,12 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
 		return false;
 
 	if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
-		rb->need_iter++;
+		rb->need_iter |= true;
+	if (!rb->irq_comp && io_op_defs[req->opcode].irq_comp)
+		rb->irq_comp |= true;
 
-	rb->reqs[rb->to_free++] = req;
-	if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
-		io_free_req_many(req->ctx, rb);
+	list_add(&req->list, &rb->list);
+	rb->to_free++;
 	return true;
 }
 
@@ -1697,10 +1819,14 @@ static int io_put_kbuf(struct io_kiocb *req)
 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 			       struct list_head *done)
 {
-	struct req_batch rb;
+	struct req_batch rb = {
+		.list		= LIST_HEAD_INIT(rb.list),
+		.to_free	= 0,
+		.need_iter	= false,
+		.irq_comp	= false
+	};
 	struct io_kiocb *req;
 
-	rb.to_free = rb.need_iter = 0;
 	while (!list_empty(done)) {
 		int cflags = 0;
 
@@ -5703,8 +5829,6 @@ static void io_submit_state_end(struct io_submit_state *state)
 {
 	blk_finish_plug(&state->plug);
 	io_file_put(state);
-	if (state->free_reqs)
-		kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
 }
 
 /*
@@ -5714,7 +5838,6 @@ static void io_submit_state_start(struct io_submit_state *state,
 				  unsigned int max_ios)
 {
 	blk_start_plug(&state->plug);
-	state->free_reqs = 0;
 	state->file = NULL;
 	state->ios_left = max_ios;
 }
@@ -5784,7 +5907,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	 * link list.
 	 */
 	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
-	req->opcode = READ_ONCE(sqe->opcode);
 	req->user_data = READ_ONCE(sqe->user_data);
 	req->io = NULL;
 	req->file = NULL;
@@ -5872,7 +5994,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			io_consume_sqe(ctx);
 			break;
 		}
-		req = io_alloc_req(ctx, statep);
+		req = io_alloc_req(ctx, READ_ONCE(sqe->opcode));
 		if (unlikely(!req)) {
 			if (!submitted)
 				submitted = -EAGAIN;
@@ -7626,6 +7748,17 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 					req->task->task_works != NULL);
 	}
 	spin_unlock_irq(&ctx->completion_lock);
+	seq_printf(m, "AllocCache:\n");
+	for_each_possible_cpu(i) {
+		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, i);
+		int j;
+
+		for (j = 0; j < ARRAY_SIZE(cache->caches); j++) {
+			struct io_kiocb_cache_entry *ce = &cache->caches[j];
+
+			seq_printf(m, "  cpu%d: irq=%d, nr_free=%d, nr_avail=%d\n", i, j, ce->nr_free, ce->nr_avail);
+		}
+	}
 	mutex_unlock(&ctx->uring_lock);
 }
 
@@ -8101,8 +8234,131 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 	return ret;
 }
 
+static unsigned long io_uring_cache_count(struct shrinker *shrink,
+					  struct shrink_control *sc)
+{
+	unsigned long count = 0;
+	int cpu, i;
+
+	for_each_possible_cpu(cpu) {
+		struct io_kiocb_cache *cache;
+
+		cache = per_cpu_ptr(alloc_cache, cpu);
+		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
+			struct io_kiocb_cache_entry *ce = &cache->caches[i];
+
+			count += ce->nr_free;
+		}
+	}
+
+	return count;
+}
+
+static unsigned long __io_uring_cache_shrink(struct io_kiocb_cache_entry *ce,
+					     const bool irq_comp,
+					     int *nr_to_scan)
+{
+	unsigned long freed = 0;
+	struct io_kiocb *req;
+	LIST_HEAD(free_list);
+
+	if (!ce->nr_free)
+		return 0;
+
+	if (irq_comp)
+		spin_lock_irq(&ce->free_lock);
+	else
+		spin_lock(&ce->free_lock);
+
+	while (!list_empty(&ce->free_list)) {
+		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
+		list_move(&req->list, &free_list);
+		freed++;
+		if (!--(*nr_to_scan))
+			break;
+	}
+
+	if (irq_comp)
+		spin_unlock_irq(&ce->free_lock);
+	else
+		spin_unlock(&ce->free_lock);
+
+	while (!list_empty(&free_list)) {
+		req = list_first_entry(&free_list, struct io_kiocb, list);
+		list_del(&req->list);
+		kmem_cache_free(req_cachep, req);
+	}
+
+	return freed;
+}
+
+static unsigned long io_uring_cache_shrink(int nr_to_scan)
+{
+	long freed = 0;
+	int cpu, i;
+
+	for_each_possible_cpu(cpu) {
+		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
+
+		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
+			struct io_kiocb_cache_entry *ce = &cache->caches[i];
+
+			freed += __io_uring_cache_shrink(ce, i, &nr_to_scan);
+			if (!nr_to_scan)
+				break;
+		}
+		if (!nr_to_scan)
+			break;
+	}
+
+	return freed ?: SHRINK_STOP;
+}
+
+static unsigned long io_uring_cache_scan(struct shrinker *shrink,
+					 struct shrink_control *sc)
+{
+	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+		return SHRINK_STOP;
+
+	return io_uring_cache_shrink(sc->nr_to_scan);
+}
+
+static struct shrinker io_uring_shrinker = {
+	.count_objects	= io_uring_cache_count,
+	.scan_objects	= io_uring_cache_scan,
+	.seeks		= DEFAULT_SEEKS,
+};
+
+static void io_uring_kill_ce(struct io_kiocb_cache_entry *ce)
+{
+	struct io_kiocb *req;
+
+	list_splice_init(&ce->alloc_list, &ce->free_list);
+
+	while (!list_empty(&ce->free_list)) {
+		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
+		list_del(&req->list);
+		kmem_cache_free(req_cachep, req);
+	}
+
+	ce->nr_free = ce->nr_avail = 0;
+}
+
+static int io_uring_notify_dead(unsigned int cpu)
+{
+	struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(cache->caches); i++)
+		io_uring_kill_ce(&cache->caches[i]);
+
+	return 0;
+}
+
 static int __init io_uring_init(void)
 {
+	int cpu, i;
+
 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
 	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
 	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
@@ -8142,6 +8398,25 @@ static int __init io_uring_init(void)
 	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
 	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
 	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+
+	alloc_cache = alloc_percpu(struct io_kiocb_cache);
+	for_each_possible_cpu(cpu) {
+		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
+
+		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
+			struct io_kiocb_cache_entry *ce = &cache->caches[i];
+
+			INIT_LIST_HEAD(&ce->alloc_list);
+			spin_lock_init(&ce->free_lock);
+			INIT_LIST_HEAD(&ce->free_list);
+			ce->nr_free = 0;
+			ce->nr_avail = 0;
+		}
+	}
+
+	cpuhp_setup_state_nocalls(CPUHP_IOURING_DEAD, "io_uring:dead", NULL,
+					io_uring_notify_dead);
+	WARN_ON(register_shrinker(&io_uring_shrinker));
 	return 0;
 };
 __initcall(io_uring_init);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 77d70b633531..3b80556572a5 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -60,6 +60,7 @@ enum cpuhp_state {
 	CPUHP_LUSTRE_CFS_DEAD,
 	CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
 	CPUHP_PADATA_DEAD,
+	CPUHP_IOURING_DEAD,
 	CPUHP_WORKQUEUE_PREP,
 	CPUHP_POWER_NUMA_PREPARE,
 	CPUHP_HRTIMERS_PREPARE,

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-14 14:22   ` Jens Axboe
@ 2020-05-14 14:33     ` Jens Axboe
  2020-05-14 14:53       ` Pavel Begunkov
  2020-05-16  9:20       ` Xiaoguang Wang
  2020-05-16 16:15     ` Xiaoguang Wang
  1 sibling, 2 replies; 22+ messages in thread
From: Jens Axboe @ 2020-05-14 14:33 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue, Pavel Begunkov

On 5/14/20 8:22 AM, Jens Axboe wrote:
>> I still use my previous io_uring_nop_stress tool to evaluate the improvement
>> in a physical machine. Memory 250GB and cpu is "Intel(R) Xeon(R) CPU E5-2682 v4 @ 2.50GHz".
>> Before this patch:
>> $sudo taskset -c 60 ./io_uring_nop_stress -r 300
>> total ios: 1608773840
>> IOPS:      5362579
>>
>> With this patch:
>> sudo taskset -c 60 ./io_uring_nop_stress -r 300
>> total ios: 1676910736
>> IOPS:      5589702
>> About 4.2% improvement.
> 
> That's not bad. Can you try the patch from Pekka as well, just to see if
> that helps for you?
> 
> I also had another idea... We basically have two types of request life
> times:
> 
> 1) io_kiocb can get queued up internally
> 2) io_kiocb completes inline
> 
> For the latter, it's totally feasible to just have the io_kiocb on
> stack. The downside is if we need to go the slower path, then we need to
> alloc an io_kiocb then and copy it. But maybe that's OK... I'll play
> with it.

Can you try this with your microbenchmark? Just curious what it looks
like for that test case if we completely take slab alloc+free out of it.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index d2e37215d05a..4ecd6bd38f02 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -525,6 +525,7 @@ enum {
 	REQ_F_POLLED_BIT,
 	REQ_F_BUFFER_SELECTED_BIT,
 	REQ_F_NO_FILE_TABLE_BIT,
+	REQ_F_STACK_REQ_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -580,6 +581,8 @@ enum {
 	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
 	/* doesn't need file table for this request */
 	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
+	/* on-stack req */
+	REQ_F_STACK_REQ		= BIT(REQ_F_STACK_REQ_BIT),
 };
 
 struct async_poll {
@@ -695,10 +698,14 @@ struct io_op_def {
 	unsigned		pollout : 1;
 	/* op supports buffer selection */
 	unsigned		buffer_select : 1;
+	/* op can use stack req */
+	unsigned		stack_req : 1;
 };
 
 static const struct io_op_def io_op_defs[] = {
-	[IORING_OP_NOP] = {},
+	[IORING_OP_NOP] = {
+		.stack_req		= 1,
+	},
 	[IORING_OP_READV] = {
 		.async_ctx		= 1,
 		.needs_mm		= 1,
@@ -1345,7 +1352,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
 	if (req->flags & REQ_F_NEED_CLEANUP)
 		io_cleanup_req(req);
 
-	kfree(req->io);
+	if (req->io)
+		kfree(req->io);
 	if (req->file)
 		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
 	if (req->task)
@@ -1370,6 +1378,8 @@ static void __io_free_req(struct io_kiocb *req)
 	}
 
 	percpu_ref_put(&req->ctx->refs);
+	if (req->flags & REQ_F_STACK_REQ)
+		return;
 	if (likely(!io_is_fallback_req(req)))
 		kmem_cache_free(req_cachep, req);
 	else
@@ -5784,12 +5794,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	 * link list.
 	 */
 	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
-	req->opcode = READ_ONCE(sqe->opcode);
 	req->user_data = READ_ONCE(sqe->user_data);
 	req->io = NULL;
 	req->file = NULL;
 	req->ctx = ctx;
-	req->flags = 0;
 	/* one is dropped after submission, the other at completion */
 	refcount_set(&req->refs, 2);
 	req->task = NULL;
@@ -5839,6 +5847,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 {
 	struct io_submit_state state, *statep = NULL;
 	struct io_kiocb *link = NULL;
+	struct io_kiocb stack_req;
 	int i, submitted = 0;
 
 	/* if we have a backlog and couldn't flush it all, return BUSY */
@@ -5865,20 +5874,31 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 	for (i = 0; i < nr; i++) {
 		const struct io_uring_sqe *sqe;
 		struct io_kiocb *req;
-		int err;
+		int err, op;
 
 		sqe = io_get_sqe(ctx);
 		if (unlikely(!sqe)) {
 			io_consume_sqe(ctx);
 			break;
 		}
-		req = io_alloc_req(ctx, statep);
-		if (unlikely(!req)) {
-			if (!submitted)
-				submitted = -EAGAIN;
-			break;
+
+		op = READ_ONCE(sqe->opcode);
+
+		if (io_op_defs[op].stack_req) {
+			req = &stack_req;
+			req->flags = REQ_F_STACK_REQ;
+		} else {
+			req = io_alloc_req(ctx, statep);
+			if (unlikely(!req)) {
+				if (!submitted)
+					submitted = -EAGAIN;
+				break;
+			}
+			req->flags = 0;
 		}
 
+		req->opcode = op;
+
 		err = io_init_req(ctx, req, sqe, statep, async);
 		io_consume_sqe(ctx);
 		/* will complete beyond this point, count as submitted */

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-14 14:33     ` Jens Axboe
@ 2020-05-14 14:53       ` Pavel Begunkov
  2020-05-14 15:15         ` Jens Axboe
  2020-05-16  9:20       ` Xiaoguang Wang
  1 sibling, 1 reply; 22+ messages in thread
From: Pavel Begunkov @ 2020-05-14 14:53 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue

On 14/05/2020 17:33, Jens Axboe wrote:
> On 5/14/20 8:22 AM, Jens Axboe wrote:
>>> I still use my previous io_uring_nop_stress tool to evaluate the improvement
>>> in a physical machine. Memory 250GB and cpu is "Intel(R) Xeon(R) CPU E5-2682 v4 @ 2.50GHz".
>>> Before this patch:
>>> $sudo taskset -c 60 ./io_uring_nop_stress -r 300
>>> total ios: 1608773840
>>> IOPS:      5362579
>>>
>>> With this patch:
>>> sudo taskset -c 60 ./io_uring_nop_stress -r 300
>>> total ios: 1676910736
>>> IOPS:      5589702
>>> About 4.2% improvement.
>>
>> That's not bad. Can you try the patch from Pekka as well, just to see if
>> that helps for you?
>>
>> I also had another idea... We basically have two types of request life
>> times:
>>
>> 1) io_kiocb can get queued up internally
>> 2) io_kiocb completes inline
>>
>> For the latter, it's totally feasible to just have the io_kiocb on
>> stack. The downside is if we need to go the slower path, then we need to
>> alloc an io_kiocb then and copy it. But maybe that's OK... I'll play
>> with it.

Does it differ from having one pre-allocated req? Like fallback_req, but without
atomics and returned only under uring_mutex (i.e. in __io_queue_sqe()). Putting
aside its usefulness, at least it will have a chance to work with reads/writes.

> 
> Can you try this with your microbenchmark? Just curious what it looks
> like for that test case if we completely take slab alloc+free out of it.
> 
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index d2e37215d05a..4ecd6bd38f02 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -525,6 +525,7 @@ enum {
>  	REQ_F_POLLED_BIT,
>  	REQ_F_BUFFER_SELECTED_BIT,
>  	REQ_F_NO_FILE_TABLE_BIT,
> +	REQ_F_STACK_REQ_BIT,
>  
>  	/* not a real bit, just to check we're not overflowing the space */
>  	__REQ_F_LAST_BIT,
> @@ -580,6 +581,8 @@ enum {
>  	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
>  	/* doesn't need file table for this request */
>  	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
> +	/* on-stack req */
> +	REQ_F_STACK_REQ		= BIT(REQ_F_STACK_REQ_BIT),
>  };
>  
>  struct async_poll {
> @@ -695,10 +698,14 @@ struct io_op_def {
>  	unsigned		pollout : 1;
>  	/* op supports buffer selection */
>  	unsigned		buffer_select : 1;
> +	/* op can use stack req */
> +	unsigned		stack_req : 1;
>  };
>  
>  static const struct io_op_def io_op_defs[] = {
> -	[IORING_OP_NOP] = {},
> +	[IORING_OP_NOP] = {
> +		.stack_req		= 1,
> +	},
>  	[IORING_OP_READV] = {
>  		.async_ctx		= 1,
>  		.needs_mm		= 1,
> @@ -1345,7 +1352,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
>  	if (req->flags & REQ_F_NEED_CLEANUP)
>  		io_cleanup_req(req);
>  
> -	kfree(req->io);
> +	if (req->io)
> +		kfree(req->io);
>  	if (req->file)
>  		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
>  	if (req->task)
> @@ -1370,6 +1378,8 @@ static void __io_free_req(struct io_kiocb *req)
>  	}
>  
>  	percpu_ref_put(&req->ctx->refs);
> +	if (req->flags & REQ_F_STACK_REQ)
> +		return;
>  	if (likely(!io_is_fallback_req(req)))
>  		kmem_cache_free(req_cachep, req);
>  	else
> @@ -5784,12 +5794,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
>  	 * link list.
>  	 */
>  	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
> -	req->opcode = READ_ONCE(sqe->opcode);
>  	req->user_data = READ_ONCE(sqe->user_data);
>  	req->io = NULL;
>  	req->file = NULL;
>  	req->ctx = ctx;
> -	req->flags = 0;
>  	/* one is dropped after submission, the other at completion */
>  	refcount_set(&req->refs, 2);
>  	req->task = NULL;
> @@ -5839,6 +5847,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>  {
>  	struct io_submit_state state, *statep = NULL;
>  	struct io_kiocb *link = NULL;
> +	struct io_kiocb stack_req;
>  	int i, submitted = 0;
>  
>  	/* if we have a backlog and couldn't flush it all, return BUSY */
> @@ -5865,20 +5874,31 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>  	for (i = 0; i < nr; i++) {
>  		const struct io_uring_sqe *sqe;
>  		struct io_kiocb *req;
> -		int err;
> +		int err, op;
>  
>  		sqe = io_get_sqe(ctx);
>  		if (unlikely(!sqe)) {
>  			io_consume_sqe(ctx);
>  			break;
>  		}
> -		req = io_alloc_req(ctx, statep);
> -		if (unlikely(!req)) {
> -			if (!submitted)
> -				submitted = -EAGAIN;
> -			break;
> +
> +		op = READ_ONCE(sqe->opcode);
> +
> +		if (io_op_defs[op].stack_req) {
> +			req = &stack_req;
> +			req->flags = REQ_F_STACK_REQ;
> +		} else {
> +			req = io_alloc_req(ctx, statep);
> +			if (unlikely(!req)) {
> +				if (!submitted)
> +					submitted = -EAGAIN;
> +				break;
> +			}
> +			req->flags = 0;
>  		}
>  
> +		req->opcode = op;
> +
>  		err = io_init_req(ctx, req, sqe, statep, async);
>  		io_consume_sqe(ctx);
>  		/* will complete beyond this point, count as submitted */
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-14 14:53       ` Pavel Begunkov
@ 2020-05-14 15:15         ` Jens Axboe
  2020-05-14 15:37           ` Pavel Begunkov
  0 siblings, 1 reply; 22+ messages in thread
From: Jens Axboe @ 2020-05-14 15:15 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue

On 5/14/20 8:53 AM, Pavel Begunkov wrote:
> On 14/05/2020 17:33, Jens Axboe wrote:
>> On 5/14/20 8:22 AM, Jens Axboe wrote:
>>>> I still use my previous io_uring_nop_stress tool to evaluate the improvement
>>>> in a physical machine. Memory 250GB and cpu is "Intel(R) Xeon(R) CPU E5-2682 v4 @ 2.50GHz".
>>>> Before this patch:
>>>> $sudo taskset -c 60 ./io_uring_nop_stress -r 300
>>>> total ios: 1608773840
>>>> IOPS:      5362579
>>>>
>>>> With this patch:
>>>> sudo taskset -c 60 ./io_uring_nop_stress -r 300
>>>> total ios: 1676910736
>>>> IOPS:      5589702
>>>> About 4.2% improvement.
>>>
>>> That's not bad. Can you try the patch from Pekka as well, just to see if
>>> that helps for you?
>>>
>>> I also had another idea... We basically have two types of request life
>>> times:
>>>
>>> 1) io_kiocb can get queued up internally
>>> 2) io_kiocb completes inline
>>>
>>> For the latter, it's totally feasible to just have the io_kiocb on
>>> stack. The downside is if we need to go the slower path, then we need to
>>> alloc an io_kiocb then and copy it. But maybe that's OK... I'll play
>>> with it.
> 
> Does it differ from having one pre-allocated req? Like fallback_req,
> but without atomics and returned only under uring_mutex (i.e. in
> __io_queue_sqe()). Putting aside its usefulness, at least it will have
> a chance to work with reads/writes.

But then you need atomics. I actually think the bigger win here is not
having to use atomic refcounts for this particular part, since we know
the request can't get shared.

Modified below with poor-man-dec-and-test, pretty sure that part is a
bigger win than avoiding the allocator. So maybe that's a better
direction to take, for things we can't get irq completions on.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index d2e37215d05a..27c1c8f4d2f4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -525,6 +525,7 @@ enum {
 	REQ_F_POLLED_BIT,
 	REQ_F_BUFFER_SELECTED_BIT,
 	REQ_F_NO_FILE_TABLE_BIT,
+	REQ_F_STACK_REQ_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -580,6 +581,8 @@ enum {
 	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
 	/* doesn't need file table for this request */
 	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
+	/* on-stack req */
+	REQ_F_STACK_REQ		= BIT(REQ_F_STACK_REQ_BIT),
 };
 
 struct async_poll {
@@ -695,10 +698,14 @@ struct io_op_def {
 	unsigned		pollout : 1;
 	/* op supports buffer selection */
 	unsigned		buffer_select : 1;
+	/* op can use stack req */
+	unsigned		stack_req : 1;
 };
 
 static const struct io_op_def io_op_defs[] = {
-	[IORING_OP_NOP] = {},
+	[IORING_OP_NOP] = {
+		.stack_req		= 1,
+	},
 	[IORING_OP_READV] = {
 		.async_ctx		= 1,
 		.needs_mm		= 1,
@@ -1345,7 +1352,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
 	if (req->flags & REQ_F_NEED_CLEANUP)
 		io_cleanup_req(req);
 
-	kfree(req->io);
+	if (req->io)
+		kfree(req->io);
 	if (req->file)
 		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
 	if (req->task)
@@ -1370,6 +1378,8 @@ static void __io_free_req(struct io_kiocb *req)
 	}
 
 	percpu_ref_put(&req->ctx->refs);
+	if (req->flags & REQ_F_STACK_REQ)
+		return;
 	if (likely(!io_is_fallback_req(req)))
 		kmem_cache_free(req_cachep, req);
 	else
@@ -1585,7 +1595,14 @@ static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
 __attribute__((nonnull))
 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 {
-	if (refcount_dec_and_test(&req->refs)) {
+	bool free_it;
+
+	if (req->flags & REQ_F_STACK_REQ)
+		free_it = __refcount_dec_and_test(&req->refs);
+	else
+		free_it = refcount_dec_and_test(&req->refs);
+
+	if (free_it) {
 		io_req_find_next(req, nxtptr);
 		__io_free_req(req);
 	}
@@ -1593,7 +1610,13 @@ static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 
 static void io_put_req(struct io_kiocb *req)
 {
-	if (refcount_dec_and_test(&req->refs))
+	bool free_it;
+
+	if (req->flags & REQ_F_STACK_REQ)
+		free_it = __refcount_dec_and_test(&req->refs);
+	else
+		free_it = refcount_dec_and_test(&req->refs);
+	if (free_it)
 		io_free_req(req);
 }
 
@@ -5784,12 +5807,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	 * link list.
 	 */
 	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
-	req->opcode = READ_ONCE(sqe->opcode);
 	req->user_data = READ_ONCE(sqe->user_data);
 	req->io = NULL;
 	req->file = NULL;
 	req->ctx = ctx;
-	req->flags = 0;
 	/* one is dropped after submission, the other at completion */
 	refcount_set(&req->refs, 2);
 	req->task = NULL;
@@ -5839,6 +5860,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 {
 	struct io_submit_state state, *statep = NULL;
 	struct io_kiocb *link = NULL;
+	struct io_kiocb stack_req;
 	int i, submitted = 0;
 
 	/* if we have a backlog and couldn't flush it all, return BUSY */
@@ -5865,20 +5887,31 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 	for (i = 0; i < nr; i++) {
 		const struct io_uring_sqe *sqe;
 		struct io_kiocb *req;
-		int err;
+		int err, op;
 
 		sqe = io_get_sqe(ctx);
 		if (unlikely(!sqe)) {
 			io_consume_sqe(ctx);
 			break;
 		}
-		req = io_alloc_req(ctx, statep);
-		if (unlikely(!req)) {
-			if (!submitted)
-				submitted = -EAGAIN;
-			break;
+
+		op = READ_ONCE(sqe->opcode);
+
+		if (io_op_defs[op].stack_req) {
+			req = &stack_req;
+			req->flags = REQ_F_STACK_REQ;
+		} else {
+			req = io_alloc_req(ctx, statep);
+			if (unlikely(!req)) {
+				if (!submitted)
+					submitted = -EAGAIN;
+				break;
+			}
+			req->flags = 0;
 		}
 
+		req->opcode = op;
+
 		err = io_init_req(ctx, req, sqe, statep, async);
 		io_consume_sqe(ctx);
 		/* will complete beyond this point, count as submitted */
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index 0e3ee25eb156..1afaf73c0984 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -294,6 +294,13 @@ static inline __must_check bool refcount_dec_and_test(refcount_t *r)
 	return refcount_sub_and_test(1, r);
 }
 
+static inline __must_check bool __refcount_dec_and_test(refcount_t *r)
+{
+	int old = --r->refs.counter;
+
+	return old == 0;
+}
+
 /**
  * refcount_dec - decrement a refcount
  * @r: the refcount

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-14 15:15         ` Jens Axboe
@ 2020-05-14 15:37           ` Pavel Begunkov
  2020-05-14 15:53             ` Jens Axboe
  0 siblings, 1 reply; 22+ messages in thread
From: Pavel Begunkov @ 2020-05-14 15:37 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue



On 14/05/2020 18:15, Jens Axboe wrote:
> On 5/14/20 8:53 AM, Pavel Begunkov wrote:
>> On 14/05/2020 17:33, Jens Axboe wrote:
>>> On 5/14/20 8:22 AM, Jens Axboe wrote:
>>>>> I still use my previous io_uring_nop_stress tool to evaluate the improvement
>>>>> in a physical machine. Memory 250GB and cpu is "Intel(R) Xeon(R) CPU E5-2682 v4 @ 2.50GHz".
>>>>> Before this patch:
>>>>> $sudo taskset -c 60 ./io_uring_nop_stress -r 300
>>>>> total ios: 1608773840
>>>>> IOPS:      5362579
>>>>>
>>>>> With this patch:
>>>>> sudo taskset -c 60 ./io_uring_nop_stress -r 300
>>>>> total ios: 1676910736
>>>>> IOPS:      5589702
>>>>> About 4.2% improvement.
>>>>
>>>> That's not bad. Can you try the patch from Pekka as well, just to see if
>>>> that helps for you?
>>>>
>>>> I also had another idea... We basically have two types of request life
>>>> times:
>>>>
>>>> 1) io_kiocb can get queued up internally
>>>> 2) io_kiocb completes inline
>>>>
>>>> For the latter, it's totally feasible to just have the io_kiocb on
>>>> stack. The downside is if we need to go the slower path, then we need to
>>>> alloc an io_kiocb then and copy it. But maybe that's OK... I'll play
>>>> with it.
>>
>> Does it differ from having one pre-allocated req? Like fallback_req,
>> but without atomics and returned only under uring_mutex (i.e. in
>> __io_queue_sqe()). Putting aside its usefulness, at least it will have
>> a chance to work with reads/writes.
> 
> But then you need atomics. I actually think the bigger win here is not
> having to use atomic refcounts for this particular part, since we know
> the request can't get shared.

Don't think we need, see:

struct ctx {
	/* protected by uring_mtx */
	struct req *cache_req;
}

__io_queue_sqe()
{
	ret = issue_inline(req);
	if (completed(ret)) {
		// don't need req anymore, return it
		ctx->cache_req = req;
	} else if (need_async) {
		// still under uring_mtx, just replenish the cache
		// alloc()+memcpy() here for on-stack
		ctx->cache_req = alloc_req();
		punt(req);
	}

	// restored it in any case
	assert(ctx->cache_req != NULL);
}

submit_sqes() __holds(uring_mtx)
{
	while (...) {
		// already holding the mutex, no need for sync here
		// also, there is always a req there
		req = ctx->cache_req;
		ctx->cache_req = NULL;
		...
	}
}

BTW, there will be a lot of problems to make either work properly with
IORING_FEAT_SUBMIT_STABLE.

> 
> Modified below with poor-man-dec-and-test, pretty sure that part is a
> bigger win than avoiding the allocator. So maybe that's a better
> directiont to take, for things we can't get irq completions on.
> 
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index d2e37215d05a..27c1c8f4d2f4 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -525,6 +525,7 @@ enum {
>  	REQ_F_POLLED_BIT,
>  	REQ_F_BUFFER_SELECTED_BIT,
>  	REQ_F_NO_FILE_TABLE_BIT,
> +	REQ_F_STACK_REQ_BIT,
>  
>  	/* not a real bit, just to check we're not overflowing the space */
>  	__REQ_F_LAST_BIT,
> @@ -580,6 +581,8 @@ enum {
>  	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
>  	/* doesn't need file table for this request */
>  	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
> +	/* on-stack req */
> +	REQ_F_STACK_REQ		= BIT(REQ_F_STACK_REQ_BIT),
>  };
>  
>  struct async_poll {
> @@ -695,10 +698,14 @@ struct io_op_def {
>  	unsigned		pollout : 1;
>  	/* op supports buffer selection */
>  	unsigned		buffer_select : 1;
> +	/* op can use stack req */
> +	unsigned		stack_req : 1;
>  };
>  
>  static const struct io_op_def io_op_defs[] = {
> -	[IORING_OP_NOP] = {},
> +	[IORING_OP_NOP] = {
> +		.stack_req		= 1,
> +	},
>  	[IORING_OP_READV] = {
>  		.async_ctx		= 1,
>  		.needs_mm		= 1,
> @@ -1345,7 +1352,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
>  	if (req->flags & REQ_F_NEED_CLEANUP)
>  		io_cleanup_req(req);
>  
> -	kfree(req->io);
> +	if (req->io)
> +		kfree(req->io);
>  	if (req->file)
>  		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
>  	if (req->task)
> @@ -1370,6 +1378,8 @@ static void __io_free_req(struct io_kiocb *req)
>  	}
>  
>  	percpu_ref_put(&req->ctx->refs);
> +	if (req->flags & REQ_F_STACK_REQ)
> +		return;
>  	if (likely(!io_is_fallback_req(req)))
>  		kmem_cache_free(req_cachep, req);
>  	else
> @@ -1585,7 +1595,14 @@ static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
>  __attribute__((nonnull))
>  static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
>  {
> -	if (refcount_dec_and_test(&req->refs)) {
> +	bool free_it;
> +
> +	if (req->flags & REQ_F_STACK_REQ)
> +		free_it = __refcount_dec_and_test(&req->refs);
> +	else
> +		free_it = refcount_dec_and_test(&req->refs);
> +
> +	if (free_it) {
>  		io_req_find_next(req, nxtptr);
>  		__io_free_req(req);
>  	}
> @@ -1593,7 +1610,13 @@ static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
>  
>  static void io_put_req(struct io_kiocb *req)
>  {
> -	if (refcount_dec_and_test(&req->refs))
> +	bool free_it;
> +
> +	if (req->flags & REQ_F_STACK_REQ)
> +		free_it = __refcount_dec_and_test(&req->refs);
> +	else
> +		free_it = refcount_dec_and_test(&req->refs);
> +	if (free_it)
>  		io_free_req(req);
>  }
>  
> @@ -5784,12 +5807,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
>  	 * link list.
>  	 */
>  	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
> -	req->opcode = READ_ONCE(sqe->opcode);
>  	req->user_data = READ_ONCE(sqe->user_data);
>  	req->io = NULL;
>  	req->file = NULL;
>  	req->ctx = ctx;
> -	req->flags = 0;
>  	/* one is dropped after submission, the other at completion */
>  	refcount_set(&req->refs, 2);
>  	req->task = NULL;
> @@ -5839,6 +5860,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>  {
>  	struct io_submit_state state, *statep = NULL;
>  	struct io_kiocb *link = NULL;
> +	struct io_kiocb stack_req;
>  	int i, submitted = 0;
>  
>  	/* if we have a backlog and couldn't flush it all, return BUSY */
> @@ -5865,20 +5887,31 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>  	for (i = 0; i < nr; i++) {
>  		const struct io_uring_sqe *sqe;
>  		struct io_kiocb *req;
> -		int err;
> +		int err, op;
>  
>  		sqe = io_get_sqe(ctx);
>  		if (unlikely(!sqe)) {
>  			io_consume_sqe(ctx);
>  			break;
>  		}
> -		req = io_alloc_req(ctx, statep);
> -		if (unlikely(!req)) {
> -			if (!submitted)
> -				submitted = -EAGAIN;
> -			break;
> +
> +		op = READ_ONCE(sqe->opcode);
> +
> +		if (io_op_defs[op].stack_req) {
> +			req = &stack_req;
> +			req->flags = REQ_F_STACK_REQ;
> +		} else {
> +			req = io_alloc_req(ctx, statep);
> +			if (unlikely(!req)) {
> +				if (!submitted)
> +					submitted = -EAGAIN;
> +				break;
> +			}
> +			req->flags = 0;
>  		}
>  
> +		req->opcode = op;
> +
>  		err = io_init_req(ctx, req, sqe, statep, async);
>  		io_consume_sqe(ctx);
>  		/* will complete beyond this point, count as submitted */
> diff --git a/include/linux/refcount.h b/include/linux/refcount.h
> index 0e3ee25eb156..1afaf73c0984 100644
> --- a/include/linux/refcount.h
> +++ b/include/linux/refcount.h
> @@ -294,6 +294,13 @@ static inline __must_check bool refcount_dec_and_test(refcount_t *r)
>  	return refcount_sub_and_test(1, r);
>  }
>  
> +static inline __must_check bool __refcount_dec_and_test(refcount_t *r)
> +{
> +	int old = --r->refs.counter;
> +
> +	return old == 0;
> +}
> +
>  /**
>   * refcount_dec - decrement a refcount
>   * @r: the refcount
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-14 15:37           ` Pavel Begunkov
@ 2020-05-14 15:53             ` Jens Axboe
  2020-05-14 16:18               ` Pavel Begunkov
  0 siblings, 1 reply; 22+ messages in thread
From: Jens Axboe @ 2020-05-14 15:53 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue

On 5/14/20 9:37 AM, Pavel Begunkov wrote:
> 
> 
> On 14/05/2020 18:15, Jens Axboe wrote:
>> On 5/14/20 8:53 AM, Pavel Begunkov wrote:
>>> On 14/05/2020 17:33, Jens Axboe wrote:
>>>> On 5/14/20 8:22 AM, Jens Axboe wrote:
>>>>>> I still use my previous io_uring_nop_stress tool to evaluate the improvement
>>>>>> in a physical machine. Memory 250GB and cpu is "Intel(R) Xeon(R) CPU E5-2682 v4 @ 2.50GHz".
>>>>>> Before this patch:
>>>>>> $sudo taskset -c 60 ./io_uring_nop_stress -r 300
>>>>>> total ios: 1608773840
>>>>>> IOPS:      5362579
>>>>>>
>>>>>> With this patch:
>>>>>> sudo taskset -c 60 ./io_uring_nop_stress -r 300
>>>>>> total ios: 1676910736
>>>>>> IOPS:      5589702
>>>>>> About 4.2% improvement.
>>>>>
>>>>> That's not bad. Can you try the patch from Pekka as well, just to see if
>>>>> that helps for you?
>>>>>
>>>>> I also had another idea... We basically have two types of request life
>>>>> times:
>>>>>
>>>>> 1) io_kiocb can get queued up internally
>>>>> 2) io_kiocb completes inline
>>>>>
>>>>> For the latter, it's totally feasible to just have the io_kiocb on
>>>>> stack. The downside is if we need to go the slower path, then we need to
>>>>> alloc an io_kiocb then and copy it. But maybe that's OK... I'll play
>>>>> with it.
>>>
>>> Does it differ from having one pre-allocated req? Like fallback_req,
>>> but without atomics and returned only under uring_mutex (i.e. in
>>> __io_queue_sqe()). Putting aside its usefulness, at least it will have
>>> a chance to work with reads/writes.
>>
>> But then you need atomics. I actually think the bigger win here is not
>> having to use atomic refcounts for this particular part, since we know
>> the request can't get shared.
> 
> Don't think we need, see:
> 
> struct ctx {
> 	/* protected by uring_mtx */
> 	struct req *cache_req;
> }
> 
> __io_queue_sqe()
> {
> 	ret = issue_inline(req);
> 	if (completed(ret)) {
> 		// don't need req anymore, return it
> 		ctx->cache_req = req;
> 	} else if (need_async) {
> 		// still under uring_mtx, just replenish the cache
> 		// alloc()+memcpy() here for on-stack
> 		ctx->cache_req = alloc_req();
> 		punt(req);
> 	}
> 
> 	// restored it in any case
> 	assert(ctx->cache_req != NULL);
> }
> 
> submit_sqes() __holds(uring_mtx)
> {
> 	while (...) {
> 		// already holding the mutex, no need for sync here
> 		// also, there is always a req there
> 		req = ctx->cache_req;
> 		ctx->cache_req = NULL;
> 		...
> 	}
> }

Hmm yes good point, it should work pretty easily, barring the use cases
that do IRQ complete. But that was also a special case with the other
cache.

> BTW, there will be a lot of problems to make either work properly with
> IORING_FEAT_SUBMIT_STABLE.

How so? Once the request is setup, any state should be retained there.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC} io_uring: io_kiocb alloc cache
  2020-05-14 15:53             ` Jens Axboe
@ 2020-05-14 16:18               ` Pavel Begunkov
  2020-05-14 16:21                 ` Jens Axboe
  2020-05-14 16:25                 ` Pavel Begunkov
  0 siblings, 2 replies; 22+ messages in thread
From: Pavel Begunkov @ 2020-05-14 16:18 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue

On 14/05/2020 18:53, Jens Axboe wrote:
> On 5/14/20 9:37 AM, Pavel Begunkov wrote:
> Hmm yes good point, it should work pretty easily, barring the use cases
> that do IRQ complete. But that was also a special case with the other
> cache.
> 
>> BTW, there will be a lot of problems to make either work properly with
>> IORING_FEAT_SUBMIT_STABLE.
> 
> How so? Once the request is setup, any state should be retained there.

If a late alloc fails (e.g. in __io_queue_sqe()), you'd need to file a CQE with
an error. If there is no place in CQ, to postpone the completion it'd require an
allocated req. Of course it can be dropped, but I'd prefer to have strict
guarantees.

That's the same reason, I have a patch stashed, that grabs links from SQ
atomically (i.e. take all SQEs of a link or none).

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC} io_uring: io_kiocb alloc cache
  2020-05-14 16:18               ` Pavel Begunkov
@ 2020-05-14 16:21                 ` Jens Axboe
  2020-05-14 16:25                 ` Pavel Begunkov
  1 sibling, 0 replies; 22+ messages in thread
From: Jens Axboe @ 2020-05-14 16:21 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue

On 5/14/20 10:18 AM, Pavel Begunkov wrote:
> On 14/05/2020 18:53, Jens Axboe wrote:
>> On 5/14/20 9:37 AM, Pavel Begunkov wrote:
>> Hmm yes good point, it should work pretty easily, barring the use cases
>> that do IRQ complete. But that was also a special case with the other
>> cache.
>>
>>> BTW, there will be a lot of problems to make either work properly with
>>> IORING_FEAT_SUBMIT_STABLE.
>>
>> How so? Once the request is setup, any state should be retained there.
> 
> If a late alloc fails (e.g. in __io_queue_sqe()), you'd need to file a
> CQE with an error. If there is no place in CQ, to postpone the
> completion it'd require an allocated req. Of course it can be dropped,
> but I'd prefer to have strict guarantees.
> 
> That's the same reason, I have a patch stashed, that grabs links from
> SQ atomically (i.e. take all SQEs of a link or none).

OK, I see what you mean. Yeah there's definitely some quirkiness
associated with deferring the allocation. I'm currently just working off
the idea that we just need to fix the refcounts, using a relaxed
version for the cases where we don't have shared semantics. We basically
only need that for requests with out-of-line completions, like irq
completions or async completions.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC} io_uring: io_kiocb alloc cache
  2020-05-14 16:18               ` Pavel Begunkov
  2020-05-14 16:21                 ` Jens Axboe
@ 2020-05-14 16:25                 ` Pavel Begunkov
  2020-05-14 17:01                   ` Jens Axboe
  1 sibling, 1 reply; 22+ messages in thread
From: Pavel Begunkov @ 2020-05-14 16:25 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue

On 14/05/2020 19:18, Pavel Begunkov wrote:
> On 14/05/2020 18:53, Jens Axboe wrote:
>> On 5/14/20 9:37 AM, Pavel Begunkov wrote:
>> Hmm yes good point, it should work pretty easily, barring the use cases
>> that do IRQ complete. But that was also a special case with the other
>> cache.
>>
>>> BTW, there will be a lot of problems to make either work properly with
>>> IORING_FEAT_SUBMIT_STABLE.
>>
>> How so? Once the request is setup, any state should be retained there.
> 
> If a late alloc fails (e.g. in __io_queue_sqe()), you'd need to file a CQE with
> an error. If there is no place in CQ, to postpone the completion it'd require an
> allocated req. Of course it can be dropped, but I'd prefer to have strict
> guarantees.

I know how to do it right for my version.
Is it still just a for-fun thing, or do you think it'll be useful for real I/O?

> 
> That's the same reason, I have a patch stashed, that grabs links from SQ
> atomically (i.e. take all SQEs of a link or none).
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC} io_uring: io_kiocb alloc cache
  2020-05-14 16:25                 ` Pavel Begunkov
@ 2020-05-14 17:01                   ` Jens Axboe
  2020-05-14 17:41                     ` Jens Axboe
  0 siblings, 1 reply; 22+ messages in thread
From: Jens Axboe @ 2020-05-14 17:01 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue

On 5/14/20 10:25 AM, Pavel Begunkov wrote:
> On 14/05/2020 19:18, Pavel Begunkov wrote:
>> On 14/05/2020 18:53, Jens Axboe wrote:
>>> On 5/14/20 9:37 AM, Pavel Begunkov wrote:
>>> Hmm yes good point, it should work pretty easily, barring the use cases
>>> that do IRQ complete. But that was also a special case with the other
>>> cache.
>>>
>>>> BTW, there will be a lot of problems to make either work properly with
>>>> IORING_FEAT_SUBMIT_STABLE.
>>>
>>> How so? Once the request is setup, any state should be retained there.
>>
>> If a late alloc fails (e.g. in __io_queue_sqe()), you'd need to file a CQE with
>> an error. If there is no place in CQ, to postpone the completion it'd require an
>> allocated req. Of course it can be dropped, but I'd prefer to have strict
>> guarantees.
> 
> I know how to do it right for my version.
> Is it still just a for-fun thing, or do you think it'll be useful for real I/O?

We're definitely spending quite a bit of time on alloc+free and the atomics
for the refcount. Considering we're core limited on some workloads, any
cycles we can get back will ultimately increase the performance. So yeah,
definitely worth exploring and finding something that works.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC} io_uring: io_kiocb alloc cache
  2020-05-14 17:01                   ` Jens Axboe
@ 2020-05-14 17:41                     ` Jens Axboe
  0 siblings, 0 replies; 22+ messages in thread
From: Jens Axboe @ 2020-05-14 17:41 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue

On 5/14/20 11:01 AM, Jens Axboe wrote:
> On 5/14/20 10:25 AM, Pavel Begunkov wrote:
>> On 14/05/2020 19:18, Pavel Begunkov wrote:
>>> On 14/05/2020 18:53, Jens Axboe wrote:
>>>> On 5/14/20 9:37 AM, Pavel Begunkov wrote:
>>>> Hmm yes good point, it should work pretty easily, barring the use cases
>>>> that do IRQ complete. But that was also a special case with the other
>>>> cache.
>>>>
>>>>> BTW, there will be a lot of problems to make either work properly with
>>>>> IORING_FEAT_SUBMIT_STABLE.
>>>>
>>>> How so? Once the request is setup, any state should be retained there.
>>>
>>> If a late alloc fails (e.g. in __io_queue_sqe()), you'd need to file a CQE with
>>> an error. If there is no place in CQ, to postpone the completion it'd require an
>>> allocated req. Of course it can be dropped, but I'd prefer to have strict
>>> guarantees.
>>
>> I know how to do it right for my version.
>> Is it still just a for-fun thing, or do you think it'll be useful for real I/O?
> 
> We're definitely spending quite a bit of time on alloc+free and the atomics
> for the refcount. Considering we're core limited on some workloads, any
> cycles we can get back will ultimately increase the performance. So yeah,
> definitely worth exploring and finding something that works.

BTW, one oddity of the NOP microbenchmark that makes it less than useful
as a general test case is the fact that any request will complete immediately.
The default settings of that test is submitting batches of 16, which means
that we'll bulk allocate 16 io_kiocbs when we enter. But we only ever really
need one, as by the time we get to request #2, we've already freed the first
request (and so forth). 

I kind of like the idea of recycling requests. If the completion does happen
inline, then we're cache hot for the next issue. Right now we go through
a new request every time, regardless of whether or not the previous one just
got freed.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC} io_uring: io_kiocb alloc cache
  2020-05-14 14:33     ` Jens Axboe
  2020-05-14 14:53       ` Pavel Begunkov
@ 2020-05-16  9:20       ` Xiaoguang Wang
  1 sibling, 0 replies; 22+ messages in thread
From: Xiaoguang Wang @ 2020-05-16  9:20 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: joseph qi, Jiufei Xue, Pavel Begunkov

hi,

>> For the latter, it's totally feasible to just have the io_kiocb on
>> stack. The downside is if we need to go the slower path, then we need to
>> alloc an io_kiocb then and copy it. But maybe that's OK... I'll play
>> with it.
> 
> Can you try this with your microbenchmark? Just curious what it looks
> like for that test case if we completely take slab alloc+free out of it.
I run two rounds, every runs 300 seconds:
1st IOPS:      6231528, about 15% improvement
2nd IOPS:      6173959  about 14% improvement.
Looks better for nop tests :)

Regards,
Xiaoguang Wang

> 
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index d2e37215d05a..4ecd6bd38f02 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -525,6 +525,7 @@ enum {
>   	REQ_F_POLLED_BIT,
>   	REQ_F_BUFFER_SELECTED_BIT,
>   	REQ_F_NO_FILE_TABLE_BIT,
> +	REQ_F_STACK_REQ_BIT,
>   
>   	/* not a real bit, just to check we're not overflowing the space */
>   	__REQ_F_LAST_BIT,
> @@ -580,6 +581,8 @@ enum {
>   	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
>   	/* doesn't need file table for this request */
>   	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
> +	/* on-stack req */
> +	REQ_F_STACK_REQ		= BIT(REQ_F_STACK_REQ_BIT),
>   };
>   
>   struct async_poll {
> @@ -695,10 +698,14 @@ struct io_op_def {
>   	unsigned		pollout : 1;
>   	/* op supports buffer selection */
>   	unsigned		buffer_select : 1;
> +	/* op can use stack req */
> +	unsigned		stack_req : 1;
>   };
>   
>   static const struct io_op_def io_op_defs[] = {
> -	[IORING_OP_NOP] = {},
> +	[IORING_OP_NOP] = {
> +		.stack_req		= 1,
> +	},
>   	[IORING_OP_READV] = {
>   		.async_ctx		= 1,
>   		.needs_mm		= 1,
> @@ -1345,7 +1352,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
>   	if (req->flags & REQ_F_NEED_CLEANUP)
>   		io_cleanup_req(req);
>   
> -	kfree(req->io);
> +	if (req->io)
> +		kfree(req->io);
>   	if (req->file)
>   		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
>   	if (req->task)
> @@ -1370,6 +1378,8 @@ static void __io_free_req(struct io_kiocb *req)
>   	}
>   
>   	percpu_ref_put(&req->ctx->refs);
> +	if (req->flags & REQ_F_STACK_REQ)
> +		return;
>   	if (likely(!io_is_fallback_req(req)))
>   		kmem_cache_free(req_cachep, req);
>   	else
> @@ -5784,12 +5794,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
>   	 * link list.
>   	 */
>   	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
> -	req->opcode = READ_ONCE(sqe->opcode);
>   	req->user_data = READ_ONCE(sqe->user_data);
>   	req->io = NULL;
>   	req->file = NULL;
>   	req->ctx = ctx;
> -	req->flags = 0;
>   	/* one is dropped after submission, the other at completion */
>   	refcount_set(&req->refs, 2);
>   	req->task = NULL;
> @@ -5839,6 +5847,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>   {
>   	struct io_submit_state state, *statep = NULL;
>   	struct io_kiocb *link = NULL;
> +	struct io_kiocb stack_req;
>   	int i, submitted = 0;
>   
>   	/* if we have a backlog and couldn't flush it all, return BUSY */
> @@ -5865,20 +5874,31 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>   	for (i = 0; i < nr; i++) {
>   		const struct io_uring_sqe *sqe;
>   		struct io_kiocb *req;
> -		int err;
> +		int err, op;
>   
>   		sqe = io_get_sqe(ctx);
>   		if (unlikely(!sqe)) {
>   			io_consume_sqe(ctx);
>   			break;
>   		}
> -		req = io_alloc_req(ctx, statep);
> -		if (unlikely(!req)) {
> -			if (!submitted)
> -				submitted = -EAGAIN;
> -			break;
> +
> +		op = READ_ONCE(sqe->opcode);
> +
> +		if (io_op_defs[op].stack_req) {
> +			req = &stack_req;
> +			req->flags = REQ_F_STACK_REQ;
> +		} else {
> +			req = io_alloc_req(ctx, statep);
> +			if (unlikely(!req)) {
> +				if (!submitted)
> +					submitted = -EAGAIN;
> +				break;
> +			}
> +			req->flags = 0;
>   		}
>   
> +		req->opcode = op;
> +
>   		err = io_init_req(ctx, req, sqe, statep, async);
>   		io_consume_sqe(ctx);
>   		/* will complete beyond this point, count as submitted */
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC} io_uring: io_kiocb alloc cache
  2020-05-14 14:22   ` Jens Axboe
  2020-05-14 14:33     ` Jens Axboe
@ 2020-05-16 16:15     ` Xiaoguang Wang
  1 sibling, 0 replies; 22+ messages in thread
From: Xiaoguang Wang @ 2020-05-16 16:15 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: joseph qi, Jiufei Xue, Pavel Begunkov

hi,

> On 5/14/20 2:25 AM, Xiaoguang Wang wrote:
>> hi,
>>
>>> +
>>> +static void io_req_cache_free_bulk(struct req_batch *rb)
>>> +{
>>> +	struct io_kiocb_cache_entry *ce;
>>> +	struct io_kiocb_cache *cache;
>>> +
>>> +	if (rb->irq_comp)
>>> +		local_irq_disable();
>>> +	else
>>> +		preempt_disable();
>>> +
>>> +	cache = this_cpu_ptr(alloc_cache);
>>> +	ce = &cache->caches[rb->irq_comp];
>>> +
>>> +	list_splice_init(&rb->list, &ce->alloc_list);
>>> +	ce->nr_avail += rb->to_free;
>>> +	if (ce->nr_avail > IO_KIOCB_CACHE_MAX)
>>> +		io_req_cache_reclaim(ce);
>>> +
>>> +	if (rb->irq_comp)
>>> +		local_irq_enable();
>>> +	else
>>> +		preempt_enable();
>>> +}
>>> +
>>> +static void io_req_cache_free(struct io_kiocb *req)
>>> +{
>>> +	const bool irq_comp = io_op_defs[req->opcode].irq_comp;
>>> +	struct io_kiocb_cache_entry *ce;
>>> +	struct io_kiocb_cache *cache;
>>> +	unsigned long flags;
>>> +
>>> +	if (irq_comp)
>>> +		local_irq_save(flags);
>>> +	else
>>> +		preempt_disable();
>>> +
>>> +	cache = this_cpu_ptr(alloc_cache);
>>> +	ce = &cache->caches[irq_comp];
>>> +
>>> +	list_add(&req->list, &ce->alloc_list);
>>> +	if (++ce->nr_avail > IO_KIOCB_CACHE_MAX)
>>> +		io_req_cache_reclaim(ce);
>> Above codes seem that io_req_cache_reclaim() will free all reqs in
>> alloc_list, then we'll need to kmem_cache_alloc() again, I guess
>> indeed you intend to reserve IO_KIOCB_CACHE_MAX reqs in alloc_list?
> 
> Yeah a thinko in that one, actually did a v2 shortly after that, just
> didn't send it out. Including it below in case you are interested, when
> it hits reclaim, it reclaims IO_KIOCB_CACHE_RECLAIM of them.
> 
>> I still use my previous io_uring_nop_stress tool to evaluate the improvement
>> in a physical machine. Memory 250GB and cpu is "Intel(R) Xeon(R) CPU E5-2682 v4 @ 2.50GHz".
>> Before this patch:
>> $sudo taskset -c 60 ./io_uring_nop_stress -r 300
>> total ios: 1608773840
>> IOPS:      5362579
>>
>> With this patch:
>> sudo taskset -c 60 ./io_uring_nop_stress -r 300
>> total ios: 1676910736
>> IOPS:      5589702
>> About 4.2% improvement.
> 
> That's not bad. Can you try the patch from Pekka as well, just to see if
> that helps for you?
Indeed I had sent the reply below about 8 hours ago, but it seems it wasn't sent
out to the mailing list successfully, so I'm sending it again.

Sorry for being late, I was busy with internal work before.
I had tried Pekka's patch in same environment:
First without both patches:
IOPS:      5415314

With your previous patch:
IOPS:      5588999  about 3.2% improvement, in previous mail, I got 4.2%
improvement, though test result seems variable, but with your patch, I
always got obvious improvement.

With Pekka's patch:
IOPS:      5394832
and it may be even worse; sometimes I got "IOPS:      4898400".

Regards,
Xiaoguang Wang

> 
> I also had another idea... We basically have two types of request life
> times:
> 
> 1) io_kiocb can get queued up internally
> 2) io_kiocb completes inline
> 
> For the latter, it's totally feasible to just have the io_kiocb on
> stack. The downside is if we need to go the slower path, then we need to
> alloc an io_kiocb then and copy it. But maybe that's OK... I'll play
> with it.
> 
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index d2e37215d05a..3be5f0e60d9f 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -79,6 +79,7 @@
>   #include <linux/fs_struct.h>
>   #include <linux/splice.h>
>   #include <linux/task_work.h>
> +#include <linux/cpuhotplug.h>
>   
>   #define CREATE_TRACE_POINTS
>   #include <trace/events/io_uring.h>
> @@ -652,17 +653,10 @@ struct io_kiocb {
>   };
>   
>   #define IO_PLUG_THRESHOLD		2
> -#define IO_IOPOLL_BATCH			8
>   
>   struct io_submit_state {
>   	struct blk_plug		plug;
>   
> -	/*
> -	 * io_kiocb alloc cache
> -	 */
> -	void			*reqs[IO_IOPOLL_BATCH];
> -	unsigned int		free_reqs;
> -
>   	/*
>   	 * File reference cache
>   	 */
> @@ -673,6 +667,27 @@ struct io_submit_state {
>   	unsigned int		ios_left;
>   };
>   
> +struct io_kiocb_cache_entry {
> +	/* ready for allocation */
> +	struct list_head	alloc_list;
> +	unsigned		nr_avail;
> +
> +	/* ready for shrinker reclaim */
> +	spinlock_t		free_lock;
> +	struct list_head	free_list;
> +	unsigned		nr_free;
> +};
> +
> +struct io_kiocb_cache {
> +	/* one for requests with IRQ completion, one for no IRQ */
> +	struct io_kiocb_cache_entry	caches[2];
> +};
> +
> +#define IO_KIOCB_CACHE_MAX	256
> +#define IO_KIOCB_CACHE_RECLAIM	 16
> +
> +static struct io_kiocb_cache *alloc_cache;
> +
>   struct io_op_def {
>   	/* needs req->io allocated for deferral/async */
>   	unsigned		async_ctx : 1;
> @@ -695,6 +710,8 @@ struct io_op_def {
>   	unsigned		pollout : 1;
>   	/* op supports buffer selection */
>   	unsigned		buffer_select : 1;
> +	/* IRQ completion */
> +	unsigned		irq_comp : 1;
>   };
>   
>   static const struct io_op_def io_op_defs[] = {
> @@ -706,6 +723,7 @@ static const struct io_op_def io_op_defs[] = {
>   		.unbound_nonreg_file	= 1,
>   		.pollin			= 1,
>   		.buffer_select		= 1,
> +		.irq_comp		= 1,
>   	},
>   	[IORING_OP_WRITEV] = {
>   		.async_ctx		= 1,
> @@ -714,6 +732,7 @@ static const struct io_op_def io_op_defs[] = {
>   		.hash_reg_file		= 1,
>   		.unbound_nonreg_file	= 1,
>   		.pollout		= 1,
> +		.irq_comp		= 1,
>   	},
>   	[IORING_OP_FSYNC] = {
>   		.needs_file		= 1,
> @@ -722,12 +741,14 @@ static const struct io_op_def io_op_defs[] = {
>   		.needs_file		= 1,
>   		.unbound_nonreg_file	= 1,
>   		.pollin			= 1,
> +		.irq_comp		= 1,
>   	},
>   	[IORING_OP_WRITE_FIXED] = {
>   		.needs_file		= 1,
>   		.hash_reg_file		= 1,
>   		.unbound_nonreg_file	= 1,
>   		.pollout		= 1,
> +		.irq_comp		= 1,
>   	},
>   	[IORING_OP_POLL_ADD] = {
>   		.needs_file		= 1,
> @@ -803,12 +824,14 @@ static const struct io_op_def io_op_defs[] = {
>   		.unbound_nonreg_file	= 1,
>   		.pollin			= 1,
>   		.buffer_select		= 1,
> +		.irq_comp		= 1,
>   	},
>   	[IORING_OP_WRITE] = {
>   		.needs_mm		= 1,
>   		.needs_file		= 1,
>   		.unbound_nonreg_file	= 1,
>   		.pollout		= 1,
> +		.irq_comp		= 1,
>   	},
>   	[IORING_OP_FADVISE] = {
>   		.needs_file		= 1,
> @@ -1281,54 +1304,161 @@ static inline bool io_is_fallback_req(struct io_kiocb *req)
>   			((unsigned long) req->ctx->fallback_req & ~1UL);
>   }
>   
> -static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
> +static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx, int op)
>   {
>   	struct io_kiocb *req;
>   
>   	req = ctx->fallback_req;
> -	if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
> +	if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req)) {
> +		req->opcode = op;
>   		return req;
> +	}
>   
>   	return NULL;
>   }
>   
> -static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
> -				     struct io_submit_state *state)
> +static bool io_req_cache_steal(struct io_kiocb_cache_entry *ce)
>   {
> -	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
> -	struct io_kiocb *req;
> +	int nr = 0;
>   
> -	if (!state) {
> -		req = kmem_cache_alloc(req_cachep, gfp);
> -		if (unlikely(!req))
> -			goto fallback;
> -	} else if (!state->free_reqs) {
> -		size_t sz;
> -		int ret;
> +	if (ce->nr_free < IO_KIOCB_CACHE_RECLAIM)
> +		return false;
>   
> -		sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
> -		ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
> +	spin_lock(&ce->free_lock);
> +	while (!list_empty(&ce->free_list)) {
> +		struct io_kiocb *req;
>   
> -		/*
> -		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
> -		 * retry single alloc to be on the safe side.
> -		 */
> -		if (unlikely(ret <= 0)) {
> -			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
> -			if (!state->reqs[0])
> -				goto fallback;
> -			ret = 1;
> -		}
> -		state->free_reqs = ret - 1;
> -		req = state->reqs[ret - 1];
> -	} else {
> -		state->free_reqs--;
> -		req = state->reqs[state->free_reqs];
> +		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
> +		list_move(&req->list, &ce->alloc_list);
> +		if (++nr >= IO_KIOCB_CACHE_RECLAIM)
> +			break;
>   	}
> +	ce->nr_avail += nr;
> +	ce->nr_free -= nr;
> +	spin_unlock(&ce->free_lock);
> +	return nr > 0;
> +}
> +
> +static struct io_kiocb *io_req_cache_alloc(int op)
> +{
> +	const bool irq_comp = io_op_defs[op].irq_comp;
> +	struct io_kiocb_cache_entry *ce;
> +	struct io_kiocb_cache *cache;
> +	struct io_kiocb *req = NULL;
> +
> +	if (irq_comp)
> +		local_irq_disable();
> +	else
> +		preempt_disable();
> +
> +	cache = this_cpu_ptr(alloc_cache);
> +	ce = &cache->caches[irq_comp];
> +
> +	if (!list_empty(&ce->alloc_list) || io_req_cache_steal(ce)) {
> +		req = list_first_entry(&ce->alloc_list, struct io_kiocb, list);
> +		list_del(&req->list);
> +		ce->nr_avail--;
> +	}
> +
> +	if (irq_comp)
> +		local_irq_enable();
> +	else
> +		preempt_enable();
> +
> +	if (req)
> +		return req;
>   
> -	return req;
> -fallback:
> -	return io_get_fallback_req(ctx);
> +	return kmem_cache_alloc(req_cachep, GFP_KERNEL);
> +}
> +
> +static void io_req_cache_reclaim(struct io_kiocb_cache_entry *ce)
> +{
> +	LIST_HEAD(free_list);
> +	int nr = 0;
> +
> +	while (!list_empty(&ce->alloc_list)) {
> +		struct io_kiocb *req;
> +
> +		req = list_last_entry(&ce->alloc_list, struct io_kiocb, list);
> +		list_move(&req->list, &free_list);
> +		if (++nr >= IO_KIOCB_CACHE_RECLAIM)
> +			break;
> +	}
> +
> +	spin_lock(&ce->free_lock);
> +	list_splice(&free_list, &ce->free_list);
> +	ce->nr_free += nr;
> +	ce->nr_avail -= nr;
> +	spin_unlock(&ce->free_lock);
> +}
> +
> +struct req_batch {
> +	struct list_head list;
> +	int to_free;
> +	bool need_iter;
> +	bool irq_comp;
> +};
> +
> +static void io_req_cache_free_bulk(struct req_batch *rb)
> +{
> +	struct io_kiocb_cache_entry *ce;
> +	struct io_kiocb_cache *cache;
> +
> +	if (rb->irq_comp)
> +		local_irq_disable();
> +	else
> +		preempt_disable();
> +
> +	cache = this_cpu_ptr(alloc_cache);
> +	ce = &cache->caches[rb->irq_comp];
> +
> +	list_splice_init(&rb->list, &ce->alloc_list);
> +	ce->nr_avail += rb->to_free;
> +	if (ce->nr_avail > IO_KIOCB_CACHE_MAX)
> +		io_req_cache_reclaim(ce);
> +
> +	if (rb->irq_comp)
> +		local_irq_enable();
> +	else
> +		preempt_enable();
> +}
> +
> +static void io_req_cache_free(struct io_kiocb *req)
> +{
> +	const bool irq_comp = io_op_defs[req->opcode].irq_comp;
> +	struct io_kiocb_cache_entry *ce;
> +	struct io_kiocb_cache *cache;
> +	unsigned long flags;
> +
> +	if (irq_comp)
> +		local_irq_save(flags);
> +	else
> +		preempt_disable();
> +
> +	cache = this_cpu_ptr(alloc_cache);
> +	ce = &cache->caches[irq_comp];
> +
> +	list_add(&req->list, &ce->alloc_list);
> +	if (++ce->nr_avail > IO_KIOCB_CACHE_MAX)
> +		io_req_cache_reclaim(ce);
> +
> +	if (irq_comp)
> +		local_irq_restore(flags);
> +	else
> +		preempt_enable();
> +}
> +
> +static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, int opcode)
> +{
> +	struct io_kiocb *req;
> +
> +	req = io_req_cache_alloc(opcode);
> +	if (req) {
> +		req->opcode = opcode;
> +		return req;
> +	}
> +
> +	return io_get_fallback_req(ctx, opcode);
>   }
>   
>   static inline void io_put_file(struct io_kiocb *req, struct file *file,
> @@ -1345,7 +1475,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
>   	if (req->flags & REQ_F_NEED_CLEANUP)
>   		io_cleanup_req(req);
>   
> -	kfree(req->io);
> +	if (req->io)
> +		kfree(req->io);
>   	if (req->file)
>   		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
>   	if (req->task)
> @@ -1371,28 +1502,21 @@ static void __io_free_req(struct io_kiocb *req)
>   
>   	percpu_ref_put(&req->ctx->refs);
>   	if (likely(!io_is_fallback_req(req)))
> -		kmem_cache_free(req_cachep, req);
> +		io_req_cache_free(req);
>   	else
>   		clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
>   }
>   
> -struct req_batch {
> -	void *reqs[IO_IOPOLL_BATCH];
> -	int to_free;
> -	int need_iter;
> -};
> -
>   static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
>   {
>   	if (!rb->to_free)
>   		return;
>   	if (rb->need_iter) {
> -		int i, inflight = 0;
> +		struct io_kiocb *req;
>   		unsigned long flags;
> +		int inflight = 0;
>   
> -		for (i = 0; i < rb->to_free; i++) {
> -			struct io_kiocb *req = rb->reqs[i];
> -
> +		list_for_each_entry(req, &rb->list, list) {
>   			if (req->flags & REQ_F_FIXED_FILE) {
>   				req->file = NULL;
>   				percpu_ref_put(req->fixed_file_refs);
> @@ -1405,9 +1529,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
>   			goto do_free;
>   
>   		spin_lock_irqsave(&ctx->inflight_lock, flags);
> -		for (i = 0; i < rb->to_free; i++) {
> -			struct io_kiocb *req = rb->reqs[i];
> -
> +		list_for_each_entry(req, &rb->list, list) {
>   			if (req->flags & REQ_F_INFLIGHT) {
>   				list_del(&req->inflight_entry);
>   				if (!--inflight)
> @@ -1420,9 +1542,8 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
>   			wake_up(&ctx->inflight_wait);
>   	}
>   do_free:
> -	kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
> +	io_req_cache_free_bulk(rb);
>   	percpu_ref_put_many(&ctx->refs, rb->to_free);
> -	rb->to_free = rb->need_iter = 0;
>   }
>   
>   static bool io_link_cancel_timeout(struct io_kiocb *req)
> @@ -1670,11 +1791,12 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
>   		return false;
>   
>   	if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
> -		rb->need_iter++;
> +		rb->need_iter |= true;
> +	if (!rb->irq_comp && io_op_defs[req->opcode].irq_comp)
> +		rb->irq_comp |= true;
>   
> -	rb->reqs[rb->to_free++] = req;
> -	if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
> -		io_free_req_many(req->ctx, rb);
> +	list_add(&req->list, &rb->list);
> +	rb->to_free++;
>   	return true;
>   }
>   
> @@ -1697,10 +1819,14 @@ static int io_put_kbuf(struct io_kiocb *req)
>   static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
>   			       struct list_head *done)
>   {
> -	struct req_batch rb;
> +	struct req_batch rb = {
> +		.list		= LIST_HEAD_INIT(rb.list),
> +		.to_free	= 0,
> +		.need_iter	= false,
> +		.irq_comp	= false
> +	};
>   	struct io_kiocb *req;
>   
> -	rb.to_free = rb.need_iter = 0;
>   	while (!list_empty(done)) {
>   		int cflags = 0;
>   
> @@ -5703,8 +5829,6 @@ static void io_submit_state_end(struct io_submit_state *state)
>   {
>   	blk_finish_plug(&state->plug);
>   	io_file_put(state);
> -	if (state->free_reqs)
> -		kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
>   }
>   
>   /*
> @@ -5714,7 +5838,6 @@ static void io_submit_state_start(struct io_submit_state *state,
>   				  unsigned int max_ios)
>   {
>   	blk_start_plug(&state->plug);
> -	state->free_reqs = 0;
>   	state->file = NULL;
>   	state->ios_left = max_ios;
>   }
> @@ -5784,7 +5907,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
>   	 * link list.
>   	 */
>   	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
> -	req->opcode = READ_ONCE(sqe->opcode);
>   	req->user_data = READ_ONCE(sqe->user_data);
>   	req->io = NULL;
>   	req->file = NULL;
> @@ -5872,7 +5994,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>   			io_consume_sqe(ctx);
>   			break;
>   		}
> -		req = io_alloc_req(ctx, statep);
> +		req = io_alloc_req(ctx, READ_ONCE(sqe->opcode));
>   		if (unlikely(!req)) {
>   			if (!submitted)
>   				submitted = -EAGAIN;
> @@ -7626,6 +7748,17 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
>   					req->task->task_works != NULL);
>   	}
>   	spin_unlock_irq(&ctx->completion_lock);
> +	seq_printf(m, "AllocCache:\n");
> +	for_each_possible_cpu(i) {
> +		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, i);
> +		int j;
> +
> +		for (j = 0; j < ARRAY_SIZE(cache->caches); j++) {
> +			struct io_kiocb_cache_entry *ce = &cache->caches[j];
> +
> +			seq_printf(m, "  cpu%d: irq=%d, nr_free=%d, nr_avail=%d\n", i, j, ce->nr_free, ce->nr_avail);
> +		}
> +	}
>   	mutex_unlock(&ctx->uring_lock);
>   }
>   
> @@ -8101,8 +8234,131 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
>   	return ret;
>   }
>   
> +static unsigned long io_uring_cache_count(struct shrinker *shrink,
> +					  struct shrink_control *sc)
> +{
> +	unsigned long count = 0;
> +	int cpu, i;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct io_kiocb_cache *cache;
> +
> +		cache = per_cpu_ptr(alloc_cache, cpu);
> +		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
> +			struct io_kiocb_cache_entry *ce = &cache->caches[i];
> +
> +			count += ce->nr_free;
> +		}
> +	}
> +
> +	return count;
> +}
> +
> +static unsigned long __io_uring_cache_shrink(struct io_kiocb_cache_entry *ce,
> +					     const bool irq_comp,
> +					     int *nr_to_scan)
> +{
> +	unsigned long freed = 0;
> +	struct io_kiocb *req;
> +	LIST_HEAD(free_list);
> +
> +	if (!ce->nr_free)
> +		return 0;
> +
> +	if (irq_comp)
> +		spin_lock_irq(&ce->free_lock);
> +	else
> +		spin_lock(&ce->free_lock);
> +
> +	while (!list_empty(&ce->free_list)) {
> +		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
> +		list_move(&req->list, &free_list);
> +		freed++;
> +		if (!--(*nr_to_scan))
> +			break;
> +	}
> +
> +	if (irq_comp)
> +		spin_unlock_irq(&ce->free_lock);
> +	else
> +		spin_unlock(&ce->free_lock);
> +
> +	while (!list_empty(&free_list)) {
> +		req = list_first_entry(&free_list, struct io_kiocb, list);
> +		list_del(&req->list);
> +		kmem_cache_free(req_cachep, req);
> +	}
> +
> +	return freed;
> +}
> +
> +static unsigned long io_uring_cache_shrink(int nr_to_scan)
> +{
> +	long freed = 0;
> +	int cpu, i;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
> +
> +		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
> +			struct io_kiocb_cache_entry *ce = &cache->caches[i];
> +
> +			freed += __io_uring_cache_shrink(ce, i, &nr_to_scan);
> +			if (!nr_to_scan)
> +				break;
> +		}
> +		if (!nr_to_scan)
> +			break;
> +	}
> +
> +	return freed ?: SHRINK_STOP;
> +}
> +
> +static unsigned long io_uring_cache_scan(struct shrinker *shrink,
> +					 struct shrink_control *sc)
> +{
> +	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
> +		return SHRINK_STOP;
> +
> +	return io_uring_cache_shrink(sc->nr_to_scan);
> +}
> +
> +static struct shrinker io_uring_shrinker = {
> +	.count_objects	= io_uring_cache_count,
> +	.scan_objects	= io_uring_cache_scan,
> +	.seeks		= DEFAULT_SEEKS,
> +};
> +
> +static void io_uring_kill_ce(struct io_kiocb_cache_entry *ce)
> +{
> +	struct io_kiocb *req;
> +
> +	list_splice_init(&ce->alloc_list, &ce->free_list);
> +
> +	while (!list_empty(&ce->free_list)) {
> +		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
> +		list_del(&req->list);
> +		kmem_cache_free(req_cachep, req);
> +	}
> +
> +	ce->nr_free = ce->nr_avail = 0;
> +}
> +
> +static int io_uring_notify_dead(unsigned int cpu)
> +{
> +	struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(cache->caches); i++)
> +		io_uring_kill_ce(&cache->caches[i]);
> +
> +	return 0;
> +}
> +
>   static int __init io_uring_init(void)
>   {
> +	int cpu, i;
> +
>   #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
>   	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
>   	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
> @@ -8142,6 +8398,25 @@ static int __init io_uring_init(void)
>   	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
>   	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
>   	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
> +
> +	alloc_cache = alloc_percpu(struct io_kiocb_cache);
> +	for_each_possible_cpu(cpu) {
> +		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
> +
> +		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
> +			struct io_kiocb_cache_entry *ce = &cache->caches[i];
> +
> +			INIT_LIST_HEAD(&ce->alloc_list);
> +			spin_lock_init(&ce->free_lock);
> +			INIT_LIST_HEAD(&ce->free_list);
> +			ce->nr_free = 0;
> +			ce->nr_avail = 0;
> +		}
> +	}
> +
> +	cpuhp_setup_state_nocalls(CPUHP_IOURING_DEAD, "io_uring:dead", NULL,
> +					io_uring_notify_dead);
> +	WARN_ON(register_shrinker(&io_uring_shrinker));
>   	return 0;
>   };
>   __initcall(io_uring_init);
> diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
> index 77d70b633531..3b80556572a5 100644
> --- a/include/linux/cpuhotplug.h
> +++ b/include/linux/cpuhotplug.h
> @@ -60,6 +60,7 @@ enum cpuhp_state {
>   	CPUHP_LUSTRE_CFS_DEAD,
>   	CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
>   	CPUHP_PADATA_DEAD,
> +	CPUHP_IOURING_DEAD,
>   	CPUHP_WORKQUEUE_PREP,
>   	CPUHP_POWER_NUMA_PREPARE,
>   	CPUHP_HRTIMERS_PREPARE,
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2020-05-16 16:15 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-13 16:30 [PATCH RFC] io_uring: io_kiocb alloc cache Jens Axboe
2020-05-13 17:42 ` Jann Horn
2020-05-13 17:42   ` Jann Horn
2020-05-13 18:34   ` Jens Axboe
2020-05-13 19:20   ` Pekka Enberg
2020-05-13 20:09     ` Jens Axboe
2020-05-13 20:31       ` Pekka Enberg
2020-05-13 20:44         ` Jens Axboe
2020-05-14  8:25 ` Xiaoguang Wang
2020-05-14 14:22   ` Jens Axboe
2020-05-14 14:33     ` Jens Axboe
2020-05-14 14:53       ` Pavel Begunkov
2020-05-14 15:15         ` Jens Axboe
2020-05-14 15:37           ` Pavel Begunkov
2020-05-14 15:53             ` Jens Axboe
2020-05-14 16:18               ` Pavel Begunkov
2020-05-14 16:21                 ` Jens Axboe
2020-05-14 16:25                 ` Pavel Begunkov
2020-05-14 17:01                   ` Jens Axboe
2020-05-14 17:41                     ` Jens Axboe
2020-05-16  9:20       ` Xiaoguang Wang
2020-05-16 16:15     ` Xiaoguang Wang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.