All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC] io_uring: io_kiocb alloc cache
@ 2020-05-13 16:30 Jens Axboe
  2020-05-13 17:42   ` Jann Horn
  2020-05-14  8:25 ` Xiaoguang Wang
  0 siblings, 2 replies; 22+ messages in thread
From: Jens Axboe @ 2020-05-13 16:30 UTC (permalink / raw)
  To: io-uring; +Cc: Xiaoguang Wang, joseph qi, Jiufei Xue, Pavel Begunkov

I turned the quick'n dirty from the other day into something a bit
more done. Would be great if someone else could run some performance
testing with this, I get about a 10% boost on the pure NOP benchmark
with this. But that's just on my laptop in qemu, so some real iron
testing would be awesome.

The idea here is to have a percpu alloc cache. There's two sets of
state:

1) Requests that have IRQ completion. preempt disable is not enough
   there, we need to disable local irqs. This is a lot slower in
   certain setups, so we keep this separate.

2) No IRQ completion, we can get by with just disabling preempt.

Outside of that, any freed request goes to the ce->alloc_list.
Attempting to alloc a request will check there first. When freeing
a request, if we're over some threshold, move requests to the
ce->free_list. This list can be browsed by the shrinker to free
up memory. If a CPU goes offline, all requests are reaped.

That's about it. If we go further with this, it'll be split into
a few separate patches. For now, just throwing this out there
for testing. The patch is against my for-5.8/io_uring branch.

It survives basic testing for me, with the exception being some issue
related to fixed files (the kworker will hang on exit).


diff --git a/fs/io_uring.c b/fs/io_uring.c
index d2e37215d05a..4da8f4a9a285 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -79,6 +79,7 @@
 #include <linux/fs_struct.h>
 #include <linux/splice.h>
 #include <linux/task_work.h>
+#include <linux/cpuhotplug.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
@@ -652,17 +653,10 @@ struct io_kiocb {
 };
 
 #define IO_PLUG_THRESHOLD		2
-#define IO_IOPOLL_BATCH			8
 
 struct io_submit_state {
 	struct blk_plug		plug;
 
-	/*
-	 * io_kiocb alloc cache
-	 */
-	void			*reqs[IO_IOPOLL_BATCH];
-	unsigned int		free_reqs;
-
 	/*
 	 * File reference cache
 	 */
@@ -673,6 +667,24 @@ struct io_submit_state {
 	unsigned int		ios_left;
 };
 
+struct io_kiocb_cache_entry {
+	struct list_head	alloc_list;
+	unsigned		nr_avail;
+
+	spinlock_t		free_lock;
+	struct list_head	free_list;
+	unsigned		nr_free;
+};
+
+struct io_kiocb_cache {
+	struct io_kiocb_cache_entry	caches[2];
+};
+
+#define IO_KIOCB_CACHE_MAX	256
+#define IO_KIOCB_CACHE_RECLAIM	 16
+
+static struct io_kiocb_cache *alloc_cache;
+
 struct io_op_def {
 	/* needs req->io allocated for deferral/async */
 	unsigned		async_ctx : 1;
@@ -695,6 +707,8 @@ struct io_op_def {
 	unsigned		pollout : 1;
 	/* op supports buffer selection */
 	unsigned		buffer_select : 1;
+	/* IRQ completion */
+	unsigned		irq_comp : 1;
 };
 
 static const struct io_op_def io_op_defs[] = {
@@ -706,6 +720,7 @@ static const struct io_op_def io_op_defs[] = {
 		.unbound_nonreg_file	= 1,
 		.pollin			= 1,
 		.buffer_select		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_WRITEV] = {
 		.async_ctx		= 1,
@@ -714,6 +729,7 @@ static const struct io_op_def io_op_defs[] = {
 		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_FSYNC] = {
 		.needs_file		= 1,
@@ -722,12 +738,14 @@ static const struct io_op_def io_op_defs[] = {
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollin			= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_WRITE_FIXED] = {
 		.needs_file		= 1,
 		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_POLL_ADD] = {
 		.needs_file		= 1,
@@ -803,12 +821,14 @@ static const struct io_op_def io_op_defs[] = {
 		.unbound_nonreg_file	= 1,
 		.pollin			= 1,
 		.buffer_select		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_WRITE] = {
 		.needs_mm		= 1,
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_FADVISE] = {
 		.needs_file		= 1,
@@ -1281,54 +1301,138 @@ static inline bool io_is_fallback_req(struct io_kiocb *req)
 			((unsigned long) req->ctx->fallback_req & ~1UL);
 }
 
-static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
+static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx, int op)
 {
 	struct io_kiocb *req;
 
 	req = ctx->fallback_req;
-	if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
+	if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req)) {
+		req->opcode = op;
 		return req;
+	}
 
 	return NULL;
 }
 
-static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
-				     struct io_submit_state *state)
+static struct io_kiocb *io_req_cache_alloc(int op)
 {
-	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
-	struct io_kiocb *req;
+	const bool irq_comp = io_op_defs[op].irq_comp;
+	struct io_kiocb_cache_entry *ce;
+	struct io_kiocb_cache *cache;
+	struct io_kiocb *req = NULL;
 
-	if (!state) {
-		req = kmem_cache_alloc(req_cachep, gfp);
-		if (unlikely(!req))
-			goto fallback;
-	} else if (!state->free_reqs) {
-		size_t sz;
-		int ret;
+	if (irq_comp)
+		local_irq_disable();
+	else
+		preempt_disable();
 
-		sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
-		ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
+	cache = this_cpu_ptr(alloc_cache);
+	ce = &cache->caches[irq_comp];
 
-		/*
-		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
-		 * retry single alloc to be on the safe side.
-		 */
-		if (unlikely(ret <= 0)) {
-			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
-			if (!state->reqs[0])
-				goto fallback;
-			ret = 1;
-		}
-		state->free_reqs = ret - 1;
-		req = state->reqs[ret - 1];
-	} else {
-		state->free_reqs--;
-		req = state->reqs[state->free_reqs];
+	if (!list_empty(&ce->alloc_list)) {
+		req = list_first_entry(&ce->alloc_list, struct io_kiocb, list);
+		list_del(&req->list);
+		ce->nr_avail--;
 	}
 
-	return req;
-fallback:
-	return io_get_fallback_req(ctx);
+	if (irq_comp)
+		local_irq_enable();
+	else
+		preempt_enable();
+
+	if (req)
+		return req;
+
+	return kmem_cache_alloc(req_cachep, GFP_KERNEL);
+}
+
+static void io_req_cache_reclaim(struct io_kiocb_cache_entry *ce)
+{
+	LIST_HEAD(free_list);
+	int nr = 0;
+
+	while (!list_empty(&ce->alloc_list)) {
+		struct io_kiocb *req;
+
+		req = list_last_entry(&ce->alloc_list, struct io_kiocb, list);
+		list_move(&req->list, &free_list);
+		nr++;
+	}
+
+	spin_lock(&ce->free_lock);
+	list_splice(&free_list, &ce->free_list);
+	ce->nr_free += nr;
+	ce->nr_avail -= nr;
+	spin_unlock(&ce->free_lock);
+}
+
+struct req_batch {
+	struct list_head list;
+	int to_free;
+	bool need_iter;
+	bool irq_comp;
+};
+
+static void io_req_cache_free_bulk(struct req_batch *rb)
+{
+	struct io_kiocb_cache_entry *ce;
+	struct io_kiocb_cache *cache;
+
+	if (rb->irq_comp)
+		local_irq_disable();
+	else
+		preempt_disable();
+
+	cache = this_cpu_ptr(alloc_cache);
+	ce = &cache->caches[rb->irq_comp];
+
+	list_splice_init(&rb->list, &ce->alloc_list);
+	ce->nr_avail += rb->to_free;
+	if (ce->nr_avail > IO_KIOCB_CACHE_MAX)
+		io_req_cache_reclaim(ce);
+
+	if (rb->irq_comp)
+		local_irq_enable();
+	else
+		preempt_enable();
+}
+
+static void io_req_cache_free(struct io_kiocb *req)
+{
+	const bool irq_comp = io_op_defs[req->opcode].irq_comp;
+	struct io_kiocb_cache_entry *ce;
+	struct io_kiocb_cache *cache;
+	unsigned long flags;
+
+	if (irq_comp)
+		local_irq_save(flags);
+	else
+		preempt_disable();
+
+	cache = this_cpu_ptr(alloc_cache);
+	ce = &cache->caches[irq_comp];
+
+	list_add(&req->list, &ce->alloc_list);
+	if (++ce->nr_avail > IO_KIOCB_CACHE_MAX)
+		io_req_cache_reclaim(ce);
+
+	if (irq_comp)
+		local_irq_restore(flags);
+	else
+		preempt_enable();
+}
+
+static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, int opcode)
+{
+	struct io_kiocb *req;
+
+	req = io_req_cache_alloc(opcode);
+	if (req) {
+		req->opcode = opcode;
+		return req;
+	}
+
+	return io_get_fallback_req(ctx, opcode);
 }
 
 static inline void io_put_file(struct io_kiocb *req, struct file *file,
@@ -1345,7 +1449,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
 	if (req->flags & REQ_F_NEED_CLEANUP)
 		io_cleanup_req(req);
 
-	kfree(req->io);
+	if (req->io)
+		kfree(req->io);
 	if (req->file)
 		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
 	if (req->task)
@@ -1371,28 +1476,21 @@ static void __io_free_req(struct io_kiocb *req)
 
 	percpu_ref_put(&req->ctx->refs);
 	if (likely(!io_is_fallback_req(req)))
-		kmem_cache_free(req_cachep, req);
+		io_req_cache_free(req);
 	else
 		clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
 }
 
-struct req_batch {
-	void *reqs[IO_IOPOLL_BATCH];
-	int to_free;
-	int need_iter;
-};
-
 static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
 {
 	if (!rb->to_free)
 		return;
 	if (rb->need_iter) {
-		int i, inflight = 0;
+		struct io_kiocb *req;
 		unsigned long flags;
+		int inflight = 0;
 
-		for (i = 0; i < rb->to_free; i++) {
-			struct io_kiocb *req = rb->reqs[i];
-
+		list_for_each_entry(req, &rb->list, list) {
 			if (req->flags & REQ_F_FIXED_FILE) {
 				req->file = NULL;
 				percpu_ref_put(req->fixed_file_refs);
@@ -1405,9 +1503,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
 			goto do_free;
 
 		spin_lock_irqsave(&ctx->inflight_lock, flags);
-		for (i = 0; i < rb->to_free; i++) {
-			struct io_kiocb *req = rb->reqs[i];
-
+		list_for_each_entry(req, &rb->list, list) {
 			if (req->flags & REQ_F_INFLIGHT) {
 				list_del(&req->inflight_entry);
 				if (!--inflight)
@@ -1420,9 +1516,10 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
 			wake_up(&ctx->inflight_wait);
 	}
 do_free:
-	kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
+	io_req_cache_free_bulk(rb);
 	percpu_ref_put_many(&ctx->refs, rb->to_free);
-	rb->to_free = rb->need_iter = 0;
+	rb->to_free = 0;
+	rb->need_iter = rb->irq_comp = false;
 }
 
 static bool io_link_cancel_timeout(struct io_kiocb *req)
@@ -1670,11 +1767,12 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
 		return false;
 
 	if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
-		rb->need_iter++;
+		rb->need_iter |= true;
+	if (!rb->irq_comp && io_op_defs[req->opcode].irq_comp)
+		rb->irq_comp |= true;
 
-	rb->reqs[rb->to_free++] = req;
-	if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
-		io_free_req_many(req->ctx, rb);
+	list_add(&req->list, &rb->list);
+	rb->to_free++;
 	return true;
 }
 
@@ -1697,10 +1795,14 @@ static int io_put_kbuf(struct io_kiocb *req)
 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 			       struct list_head *done)
 {
-	struct req_batch rb;
+	struct req_batch rb = {
+		.list		= LIST_HEAD_INIT(rb.list),
+		.to_free	= 0,
+		.need_iter	= false,
+		.irq_comp	= false
+	};
 	struct io_kiocb *req;
 
-	rb.to_free = rb.need_iter = 0;
 	while (!list_empty(done)) {
 		int cflags = 0;
 
@@ -5703,8 +5805,6 @@ static void io_submit_state_end(struct io_submit_state *state)
 {
 	blk_finish_plug(&state->plug);
 	io_file_put(state);
-	if (state->free_reqs)
-		kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
 }
 
 /*
@@ -5714,7 +5814,6 @@ static void io_submit_state_start(struct io_submit_state *state,
 				  unsigned int max_ios)
 {
 	blk_start_plug(&state->plug);
-	state->free_reqs = 0;
 	state->file = NULL;
 	state->ios_left = max_ios;
 }
@@ -5784,7 +5883,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	 * link list.
 	 */
 	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
-	req->opcode = READ_ONCE(sqe->opcode);
 	req->user_data = READ_ONCE(sqe->user_data);
 	req->io = NULL;
 	req->file = NULL;
@@ -5872,14 +5970,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			io_consume_sqe(ctx);
 			break;
 		}
-		req = io_alloc_req(ctx, statep);
+		req = io_alloc_req(ctx, READ_ONCE(sqe->opcode));
 		if (unlikely(!req)) {
 			if (!submitted)
 				submitted = -EAGAIN;
 			break;
 		}
 
-		err = io_init_req(ctx, req, sqe, statep, async);
+		err = io_init_req(ctx, req, sqe, NULL, async);
 		io_consume_sqe(ctx);
 		/* will complete beyond this point, count as submitted */
 		submitted++;
@@ -7626,6 +7724,17 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 					req->task->task_works != NULL);
 	}
 	spin_unlock_irq(&ctx->completion_lock);
+	seq_printf(m, "AllocCache:\n");
+	for_each_possible_cpu(i) {
+		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, i);
+		int j;
+
+		for (j = 0; j < ARRAY_SIZE(cache->caches); j++) {
+			struct io_kiocb_cache_entry *ce = &cache->caches[j];
+
+			seq_printf(m, "  cpu%d: irq=%d, nr_free=%d, nr_avail=%d\n", i, j, ce->nr_free, ce->nr_avail);
+		}
+	}
 	mutex_unlock(&ctx->uring_lock);
 }
 
@@ -8101,8 +8210,130 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 	return ret;
 }
 
+static unsigned long io_uring_cache_count(struct shrinker *shrink,
+					  struct shrink_control *sc)
+{
+	unsigned long count = 0;
+	int cpu, i;
+
+	for_each_possible_cpu(cpu) {
+		struct io_kiocb_cache *cache;
+
+		cache = per_cpu_ptr(alloc_cache, cpu);
+		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
+			struct io_kiocb_cache_entry *ce = &cache->caches[i];
+
+			count += ce->nr_free;
+		}
+	}
+
+	return count;
+}
+
+static unsigned long __io_uring_cache_shrink(struct io_kiocb_cache_entry *ce,
+					     int irq_comp, int *nr_to_scan)
+{
+	unsigned long freed = 0;
+	struct io_kiocb *req;
+	LIST_HEAD(free_list);
+
+	if (!ce->nr_free)
+		return 0;
+
+	if (irq_comp)
+		spin_lock_irq(&ce->free_lock);
+	else
+		spin_lock(&ce->free_lock);
+
+	while (!list_empty(&ce->free_list)) {
+		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
+		list_move(&req->list, &free_list);
+		freed++;
+		if (!--(*nr_to_scan))
+			break;
+	}
+
+	if (irq_comp)
+		spin_unlock_irq(&ce->free_lock);
+	else
+		spin_unlock(&ce->free_lock);
+
+	while (!list_empty(&free_list)) {
+		req = list_first_entry(&free_list, struct io_kiocb, list);
+		list_del(&req->list);
+		kmem_cache_free(req_cachep, req);
+	}
+
+	return freed;
+}
+
+static unsigned long io_uring_cache_shrink(int nr_to_scan)
+{
+	long freed = 0;
+	int cpu, i;
+
+	for_each_possible_cpu(cpu) {
+		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
+
+		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
+			struct io_kiocb_cache_entry *ce = &cache->caches[i];
+
+			freed += __io_uring_cache_shrink(ce, i, &nr_to_scan);
+			if (!nr_to_scan)
+				break;
+		}
+		if (!nr_to_scan)
+			break;
+	}
+
+	return freed ?: SHRINK_STOP;
+}
+
+static unsigned long io_uring_cache_scan(struct shrinker *shrink,
+					 struct shrink_control *sc)
+{
+	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+		return SHRINK_STOP;
+
+	return io_uring_cache_shrink(sc->nr_to_scan);
+}
+
+static struct shrinker io_uring_shrinker = {
+	.count_objects	= io_uring_cache_count,
+	.scan_objects	= io_uring_cache_scan,
+	.seeks		= DEFAULT_SEEKS,
+};
+
+static void io_uring_kill_ce(struct io_kiocb_cache_entry *ce, bool irq_comp)
+{
+	struct io_kiocb *req;
+
+	list_splice_init(&ce->alloc_list, &ce->free_list);
+
+	while (!list_empty(&ce->free_list)) {
+		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
+		list_del(&req->list);
+		kmem_cache_free(req_cachep, req);
+	}
+
+	ce->nr_free = ce->nr_avail = 0;
+}
+
+static int io_uring_notify_dead(unsigned int cpu)
+{
+	struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(cache->caches); i++)
+		io_uring_kill_ce(&cache->caches[i], i);
+
+	return 0;
+}
+
 static int __init io_uring_init(void)
 {
+	int cpu, i;
+
 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
 	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
 	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
@@ -8142,6 +8373,25 @@ static int __init io_uring_init(void)
 	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
 	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
 	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+
+	alloc_cache = alloc_percpu(struct io_kiocb_cache);
+	for_each_possible_cpu(cpu) {
+		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
+
+		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
+			struct io_kiocb_cache_entry *ce = &cache->caches[i];
+
+			INIT_LIST_HEAD(&ce->alloc_list);
+			spin_lock_init(&ce->free_lock);
+			INIT_LIST_HEAD(&ce->free_list);
+			ce->nr_free = 0;
+			ce->nr_avail = 0;
+		}
+	}
+
+	cpuhp_setup_state_nocalls(CPUHP_IOURING_DEAD, "io_uring:dead", NULL,
+					io_uring_notify_dead);
+	WARN_ON(register_shrinker(&io_uring_shrinker));
 	return 0;
 };
 __initcall(io_uring_init);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 77d70b633531..3b80556572a5 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -60,6 +60,7 @@ enum cpuhp_state {
 	CPUHP_LUSTRE_CFS_DEAD,
 	CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
 	CPUHP_PADATA_DEAD,
+	CPUHP_IOURING_DEAD,
 	CPUHP_WORKQUEUE_PREP,
 	CPUHP_POWER_NUMA_PREPARE,
 	CPUHP_HRTIMERS_PREPARE,

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-13 16:30 [PATCH RFC] io_uring: io_kiocb alloc cache Jens Axboe
@ 2020-05-13 17:42   ` Jann Horn
  2020-05-14  8:25 ` Xiaoguang Wang
  1 sibling, 0 replies; 22+ messages in thread
From: Jann Horn @ 2020-05-13 17:42 UTC (permalink / raw)
  To: Jens Axboe
  Cc: io-uring, Xiaoguang Wang, joseph qi, Jiufei Xue, Pavel Begunkov,
	Christoph Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, Linux-MM

+slab allocator people

On Wed, May 13, 2020 at 6:30 PM Jens Axboe <axboe@kernel.dk> wrote:
> I turned the quick'n dirty from the other day into something a bit
> more done. Would be great if someone else could run some performance
> testing with this, I get about a 10% boost on the pure NOP benchmark
> with this. But that's just on my laptop in qemu, so some real iron
> testing would be awesome.

10% boost compared to which allocator? Are you using CONFIG_SLUB?

> The idea here is to have a percpu alloc cache. There's two sets of
> state:
>
> 1) Requests that have IRQ completion. preempt disable is not enough
>    there, we need to disable local irqs. This is a lot slower in
>    certain setups, so we keep this separate.
>
> 2) No IRQ completion, we can get by with just disabling preempt.

The SLUB allocator has percpu caching, too, and as long as you don't
enable any SLUB debugging or ASAN or such, and you're not hitting any
slowpath processing, it doesn't even have to disable interrupts, it
gets away with cmpxchg_double.

Have you profiled what the actual problem is when using SLUB? Have you
tested with CONFIG_SLAB_FREELIST_HARDENED turned off,
CONFIG_SLUB_DEBUG turned off, CONFIG_TRACING turned off,
CONFIG_FAILSLAB turned off, and so on? As far as I know, if you
disable all hardening and debugging infrastructure, SLUB's
kmem_cache_alloc()/kmem_cache_free() on the fastpaths should be really
straightforward. And if you don't turn those off, the comparison is
kinda unfair, because your custom freelist won't respect those flags.

When you build custom allocators like this, it interferes with
infrastructure meant to catch memory safety issues and such (both pure
debugging code and safety checks meant for production use) - for
example, ASAN and memory tagging will no longer be able to detect
use-after-free issues in objects managed by your custom allocator
cache.

So please, don't implement custom one-off allocators in random
subsystems. And if you do see a way to actually improve the
performance of memory allocation, add that to the generic SLUB
infrastructure.

> Outside of that, any freed requests goes to the ce->alloc_list.
> Attempting to alloc a request will check there first. When freeing
> a request, if we're over some threshold, move requests to the
> ce->free_list. This list can be browsed by the shrinker to free
> up memory. If a CPU goes offline, all requests are reaped.
>
> That's about it. If we go further with this, it'll be split into
> a few separate patches. For now, just throwing this out there
> for testing. The patch is against my for-5.8/io_uring branch.

That branch doesn't seem to exist on
<https://git.kernel.dk/cgit/linux-block/>...

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
@ 2020-05-13 17:42   ` Jann Horn
  0 siblings, 0 replies; 22+ messages in thread
From: Jann Horn @ 2020-05-13 17:42 UTC (permalink / raw)
  To: Jens Axboe
  Cc: io-uring, Xiaoguang Wang, joseph qi, Jiufei Xue, Pavel Begunkov,
	Christoph Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, Linux-MM

+slab allocator people

On Wed, May 13, 2020 at 6:30 PM Jens Axboe <axboe@kernel.dk> wrote:
> I turned the quick'n dirty from the other day into something a bit
> more done. Would be great if someone else could run some performance
> testing with this, I get about a 10% boost on the pure NOP benchmark
> with this. But that's just on my laptop in qemu, so some real iron
> testing would be awesome.

10% boost compared to which allocator? Are you using CONFIG_SLUB?

> The idea here is to have a percpu alloc cache. There's two sets of
> state:
>
> 1) Requests that have IRQ completion. preempt disable is not enough
>    there, we need to disable local irqs. This is a lot slower in
>    certain setups, so we keep this separate.
>
> 2) No IRQ completion, we can get by with just disabling preempt.

The SLUB allocator has percpu caching, too, and as long as you don't
enable any SLUB debugging or ASAN or such, and you're not hitting any
slowpath processing, it doesn't even have to disable interrupts, it
gets away with cmpxchg_double.

Have you profiled what the actual problem is when using SLUB? Have you
tested with CONFIG_SLAB_FREELIST_HARDENED turned off,
CONFIG_SLUB_DEBUG turned off, CONFIG_TRACING turned off,
CONFIG_FAILSLAB turned off, and so on? As far as I know, if you
disable all hardening and debugging infrastructure, SLUB's
kmem_cache_alloc()/kmem_cache_free() on the fastpaths should be really
straightforward. And if you don't turn those off, the comparison is
kinda unfair, because your custom freelist won't respect those flags.

When you build custom allocators like this, it interferes with
infrastructure meant to catch memory safety issues and such (both pure
debugging code and safety checks meant for production use) - for
example, ASAN and memory tagging will no longer be able to detect
use-after-free issues in objects managed by your custom allocator
cache.

So please, don't implement custom one-off allocators in random
subsystems. And if you do see a way to actually improve the
performance of memory allocation, add that to the generic SLUB
infrastructure.

> Outside of that, any freed requests goes to the ce->alloc_list.
> Attempting to alloc a request will check there first. When freeing
> a request, if we're over some threshold, move requests to the
> ce->free_list. This list can be browsed by the shrinker to free
> up memory. If a CPU goes offline, all requests are reaped.
>
> That's about it. If we go further with this, it'll be split into
> a few separate patches. For now, just throwing this out there
> for testing. The patch is against my for-5.8/io_uring branch.

That branch doesn't seem to exist on
<https://git.kernel.dk/cgit/linux-block/>...


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-13 17:42   ` Jann Horn
  (?)
@ 2020-05-13 18:34   ` Jens Axboe
  -1 siblings, 0 replies; 22+ messages in thread
From: Jens Axboe @ 2020-05-13 18:34 UTC (permalink / raw)
  To: Jann Horn
  Cc: io-uring, Xiaoguang Wang, joseph qi, Jiufei Xue, Pavel Begunkov,
	Christoph Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, Linux-MM

On 5/13/20 11:42 AM, Jann Horn wrote:
> +slab allocator people
> 
> On Wed, May 13, 2020 at 6:30 PM Jens Axboe <axboe@kernel.dk> wrote:
>> I turned the quick'n dirty from the other day into something a bit
>> more done. Would be great if someone else could run some performance
>> testing with this, I get about a 10% boost on the pure NOP benchmark
>> with this. But that's just on my laptop in qemu, so some real iron
>> testing would be awesome.
> 
> 10% boost compared to which allocator? Are you using CONFIG_SLUB?

SLUB, yes.

>> The idea here is to have a percpu alloc cache. There's two sets of
>> state:
>>
>> 1) Requests that have IRQ completion. preempt disable is not enough
>>    there, we need to disable local irqs. This is a lot slower in
>>    certain setups, so we keep this separate.
>>
>> 2) No IRQ completion, we can get by with just disabling preempt.
> 
> The SLUB allocator has percpu caching, too, and as long as you don't
> enable any SLUB debugging or ASAN or such, and you're not hitting any
> slowpath processing, it doesn't even have to disable interrupts, it
> gets away with cmpxchg_double.
> 
> Have you profiled what the actual problem is when using SLUB? Have you
> tested with CONFIG_SLAB_FREELIST_HARDENED turned off,
> CONFIG_SLUB_DEBUG turned off, CONFIG_TRACING turned off,
> CONFIG_FAILSLAB turned off, and so on? As far as I know, if you
> disable all hardening and debugging infrastructure, SLUB's
> kmem_cache_alloc()/kmem_cache_free() on the fastpaths should be really
> straightforward. And if you don't turn those off, the comparison is
> kinda unfair, because your custom freelist won't respect those flags.

But that's sort of the point. I don't have any nasty SLUB options
enabled, just the default. And that includes CONFIG_SLUB_DEBUG. Which
all the distros have enabled, I believe.

So yes, I could compare to a bare bones SLUB, and I'll definitely do
that because I'm curious. And it also could be an artifact of qemu,
sometimes that behaves differently than a real host (locks/irq is more
expensive, for example). Not sure how much with SLUB in particular,
haven't done targeted benchmarking of that.

The patch is just tossed out there for experimentation reasons, in case
it wasn't clear. It's not like I'm proposing this for inclusion. But if
the wins are big enough over a _normal_ configuration, then it's
definitely tempting.

> When you build custom allocators like this, it interferes with
> infrastructure meant to catch memory safety issues and such (both pure
> debugging code and safety checks meant for production use) - for
> example, ASAN and memory tagging will no longer be able to detect
> use-after-free issues in objects managed by your custom allocator
> cache.
> 
> So please, don't implement custom one-off allocators in random
> subsystems. And if you do see a way to actually improve the
> performance of memory allocation, add that to the generic SLUB
> infrastructure.

I hear you. This isn't unique, fwiw. Networking has a page pool
allocator for example, which I did consider tapping into.

Anyway, I/we will be a lot wiser once this experiment progresses!

>> Outside of that, any freed requests goes to the ce->alloc_list.
>> Attempting to alloc a request will check there first. When freeing
>> a request, if we're over some threshold, move requests to the
>> ce->free_list. This list can be browsed by the shrinker to free
>> up memory. If a CPU goes offline, all requests are reaped.
>>
>> That's about it. If we go further with this, it'll be split into
>> a few separate patches. For now, just throwing this out there
>> for testing. The patch is against my for-5.8/io_uring branch.
> 
> That branch doesn't seem to exist on
> <https://git.kernel.dk/cgit/linux-block/>...

Oh oops, guess I never pushed that out. Will do so.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-13 17:42   ` Jann Horn
  (?)
  (?)
@ 2020-05-13 19:20   ` Pekka Enberg
  2020-05-13 20:09     ` Jens Axboe
  -1 siblings, 1 reply; 22+ messages in thread
From: Pekka Enberg @ 2020-05-13 19:20 UTC (permalink / raw)
  To: Jann Horn
  Cc: Jens Axboe, io-uring, Xiaoguang Wang, joseph qi, Jiufei Xue,
	Pavel Begunkov, Christoph Lameter, Pekka Enberg, David Rientjes,
	Joonsoo Kim, Andrew Morton, Linux-MM


Hi,

On Wed, May 13, 2020 at 6:30 PM Jens Axboe <axboe@kernel.dk> wrote:
> > I turned the quick'n dirty from the other day into something a bit 
> > more done. Would be great if someone else could run some
> > performance testing with this, I get about a 10% boost on the pure
> > NOP benchmark with this. But that's just on my laptop in qemu, so
> > some real iron testing would be awesome.

On 5/13/20 8:42 PM, Jann Horn wrote:> +slab allocator people
> 10% boost compared to which allocator? Are you using CONFIG_SLUB?
 
On Wed, May 13, 2020 at 6:30 PM Jens Axboe <axboe@kernel.dk> wrote:
> > The idea here is to have a percpu alloc cache. There's two sets of 
> > state:
> > 
> > 1) Requests that have IRQ completion. preempt disable is not
> > enough there, we need to disable local irqs. This is a lot slower
> > in certain setups, so we keep this separate.
> > 
> > 2) No IRQ completion, we can get by with just disabling preempt.

On 5/13/20 8:42 PM, Jann Horn wrote:> +slab allocator people
> The SLUB allocator has percpu caching, too, and as long as you don't 
> enable any SLUB debugging or ASAN or such, and you're not hitting
> any slowpath processing, it doesn't even have to disable interrupts,
> it gets away with cmpxchg_double.

The struct io_kiocb is 240 bytes. I don't see a dedicated slab for it in
/proc/slabinfo on my machine, so it likely got merged to the kmalloc-256
cache. This means that there's 32 objects in the per-CPU cache. Jens, on
the other hand, made the cache much bigger:

+#define IO_KIOCB_CACHE_MAX 256

So I assume if someone does "perf record", they will see significant
reduction in page allocator activity with Jens' patch. One possible way
around that is forcing the page allocation order to be much higher. IOW,
something like the following completely untested patch:

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 979d9f977409..c3bf7b72026d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8143,7 +8143,7 @@ static int __init io_uring_init(void)
 
 	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
 	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
-	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_LARGE_ORDER);
 	return 0;
 };
 __initcall(io_uring_init);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 6d454886bcaf..316fd821ec1f 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -39,6 +39,8 @@
 #define SLAB_STORE_USER		((slab_flags_t __force)0x00010000U)
 /* Panic if kmem_cache_create() fails */
 #define SLAB_PANIC		((slab_flags_t __force)0x00040000U)
+/* Force slab page allocation order to be as large as possible */
+#define SLAB_LARGE_ORDER	((slab_flags_t __force)0x00080000U)
 /*
  * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
  *
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 23c7500eea7d..a18bbe9472e4 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -51,7 +51,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
  */
 #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
 		SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
-		SLAB_FAILSLAB | SLAB_KASAN)
+		SLAB_FAILSLAB | SLAB_KASAN | SLAB_LARGE_ORDER)
 
 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
 			 SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
diff --git a/mm/slub.c b/mm/slub.c
index b762450fc9f0..d1d86b1279aa 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3318,12 +3318,15 @@ static inline unsigned int slab_order(unsigned int size,
 	return order;
 }
 
-static inline int calculate_order(unsigned int size)
+static inline int calculate_order(unsigned int size, gfp_t flags)
 {
 	unsigned int order;
 	unsigned int min_objects;
 	unsigned int max_objects;
 
+	if (flags & SLAB_LARGE_ORDER)
+		return slub_max_order;
+
 	/*
 	 * Attempt to find best configuration for a slab. This
 	 * works by first attempting to generate a layout with
@@ -3651,7 +3654,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 	if (forced_order >= 0)
 		order = forced_order;
 	else
-		order = calculate_order(size);
+		order = calculate_order(size, flags);
 
 	if ((int)order < 0)
 		return 0;

- Pekka

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-13 19:20   ` Pekka Enberg
@ 2020-05-13 20:09     ` Jens Axboe
  2020-05-13 20:31       ` Pekka Enberg
  0 siblings, 1 reply; 22+ messages in thread
From: Jens Axboe @ 2020-05-13 20:09 UTC (permalink / raw)
  To: Pekka Enberg, Jann Horn
  Cc: io-uring, Xiaoguang Wang, joseph qi, Jiufei Xue, Pavel Begunkov,
	Christoph Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, Linux-MM

On 5/13/20 1:20 PM, Pekka Enberg wrote:
> 
> Hi,
> 
> On Wed, May 13, 2020 at 6:30 PM Jens Axboe <axboe@kernel.dk> wrote:
>>> I turned the quick'n dirty from the other day into something a bit 
>>> more done. Would be great if someone else could run some
>>> performance testing with this, I get about a 10% boost on the pure
>>> NOP benchmark with this. But that's just on my laptop in qemu, so
>>> some real iron testing would be awesome.
> 
> On 5/13/20 8:42 PM, Jann Horn wrote:> +slab allocator people
>> 10% boost compared to which allocator? Are you using CONFIG_SLUB?
>  
> On Wed, May 13, 2020 at 6:30 PM Jens Axboe <axboe@kernel.dk> wrote:
>>> The idea here is to have a percpu alloc cache. There's two sets of 
>>> state:
>>>
>>> 1) Requests that have IRQ completion. preempt disable is not
>>> enough there, we need to disable local irqs. This is a lot slower
>>> in certain setups, so we keep this separate.
>>>
>>> 2) No IRQ completion, we can get by with just disabling preempt.
> 
> On 5/13/20 8:42 PM, Jann Horn wrote:> +slab allocator people
>> The SLUB allocator has percpu caching, too, and as long as you don't 
>> enable any SLUB debugging or ASAN or such, and you're not hitting
>> any slowpath processing, it doesn't even have to disable interrupts,
>> it gets away with cmpxchg_double.
> 
> The struct io_kiocb is 240 bytes. I don't see a dedicated slab for it in
> /proc/slabinfo on my machine, so it likely got merged to the kmalloc-256
> cache. This means that there's 32 objects in the per-CPU cache. Jens, on
> the other hand, made the cache much bigger:

Right, it gets merged with kmalloc-256 (and 5 others) in my testing.

> +#define IO_KIOCB_CACHE_MAX 256
> 
> So I assume if someone does "perf record", they will see significant
> reduction in page allocator activity with Jens' patch. One possible way
> around that is forcing the page allocation order to be much higher. IOW,
> something like the following completely untested patch:

Now tested, I gave it a shot. This seems to bring performance to
basically what the io_uring patch does, so that's great! Again, just in
the microbenchmark test case, so freshly booted and just running the
case.

Will this patch introduce latencies or non-deterministic behavior for a
fragmented system?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-13 20:09     ` Jens Axboe
@ 2020-05-13 20:31       ` Pekka Enberg
  2020-05-13 20:44         ` Jens Axboe
  0 siblings, 1 reply; 22+ messages in thread
From: Pekka Enberg @ 2020-05-13 20:31 UTC (permalink / raw)
  To: Jens Axboe, Jann Horn
  Cc: io-uring, Xiaoguang Wang, joseph qi, Jiufei Xue, Pavel Begunkov,
	Christoph Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, Linux-MM

Hi Jens,

On 5/13/20 1:20 PM, Pekka Enberg wrote:
>> So I assume if someone does "perf record", they will see significant
>> reduction in page allocator activity with Jens' patch. One possible way
>> around that is forcing the page allocation order to be much higher. IOW,
>> something like the following completely untested patch:

On 5/13/20 11:09 PM, Jens Axboe wrote:
> Now tested, I gave it a shot. This seems to bring performance to
> basically what the io_uring patch does, so that's great! Again, just in
> the microbenchmark test case, so freshly booted and just running the
> case.

Great, thanks for testing!

On 5/13/20 11:09 PM, Jens Axboe wrote:
> Will this patch introduce latencies or non-deterministic behavior for a
> fragmented system?

You have to talk to someone who is more up-to-date with how the page 
allocator operates today. But yeah, I assume people still want to avoid 
higher-order allocations as much as possible, because they make 
allocation harder when memory is fragmented.

That said, perhaps it's not going to the page allocator as much as I 
thought, but the problem is that the per-CPU cache size is just too small 
for these allocations, forcing do_slab_free() to take the slow path 
often. Would be interesting to know if CONFIG_SLAB does better here 
because the per-CPU cache size is much larger IIRC.

- Pekka

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-13 20:31       ` Pekka Enberg
@ 2020-05-13 20:44         ` Jens Axboe
  0 siblings, 0 replies; 22+ messages in thread
From: Jens Axboe @ 2020-05-13 20:44 UTC (permalink / raw)
  To: Pekka Enberg, Jann Horn
  Cc: io-uring, Xiaoguang Wang, joseph qi, Jiufei Xue, Pavel Begunkov,
	Christoph Lameter, Pekka Enberg, David Rientjes, Joonsoo Kim,
	Andrew Morton, Linux-MM

On 5/13/20 2:31 PM, Pekka Enberg wrote:
> Hi Jens,
> 
> On 5/13/20 1:20 PM, Pekka Enberg wrote:
>>> So I assume if someone does "perf record", they will see significant
>>> reduction in page allocator activity with Jens' patch. One possible way
>>> around that is forcing the page allocation order to be much higher. IOW,
>>> something like the following completely untested patch:
> 
> On 5/13/20 11:09 PM, Jens Axboe wrote:
>> Now tested, I gave it a shot. This seems to bring performance to
>> basically what the io_uring patch does, so that's great! Again, just in
>> the microbenchmark test case, so freshly booted and just running the
>> case.
> 
> Great, thanks for testing!
> 
> On 5/13/20 11:09 PM, Jens Axboe wrote:
>> Will this patch introduce latencies or non-deterministic behavior for a
>> fragmented system?
> 
> You have to talk to someone who is more up-to-date with how the page 
> allocator operates today. But yeah, I assume people still want to avoid 
> higher-order allocations as much as possible, because they make 
> allocation harder when memory is fragmented.

That was my thinking... I don't want a random io_kiocb allocation to
take a long time because of high order allocations.

> That said, perhaps it's not going to the page allocator as much as I 
> thought, but the problem is that the per-CPU cache size is just too small 
> for these allocations, forcing do_slab_free() to take the slow path 
> often. Would be interesting to know if CONFIG_SLAB does better here 
> because the per-CPU cache size is much larger IIRC.

Just tried with SLAB, and it's roughly 4-5% down from the baseline
(non-modified) SLUB. So not faster, at least for this case.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-13 16:30 [PATCH RFC] io_uring: io_kiocb alloc cache Jens Axboe
  2020-05-13 17:42   ` Jann Horn
@ 2020-05-14  8:25 ` Xiaoguang Wang
  2020-05-14 14:22   ` Jens Axboe
  1 sibling, 1 reply; 22+ messages in thread
From: Xiaoguang Wang @ 2020-05-14  8:25 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: joseph qi, Jiufei Xue, Pavel Begunkov

hi,

> +
> +static void io_req_cache_free_bulk(struct req_batch *rb)
> +{
> +	struct io_kiocb_cache_entry *ce;
> +	struct io_kiocb_cache *cache;
> +
> +	if (rb->irq_comp)
> +		local_irq_disable();
> +	else
> +		preempt_disable();
> +
> +	cache = this_cpu_ptr(alloc_cache);
> +	ce = &cache->caches[rb->irq_comp];
> +
> +	list_splice_init(&rb->list, &ce->alloc_list);
> +	ce->nr_avail += rb->to_free;
> +	if (ce->nr_avail > IO_KIOCB_CACHE_MAX)
> +		io_req_cache_reclaim(ce);
> +
> +	if (rb->irq_comp)
> +		local_irq_enable();
> +	else
> +		preempt_enable();
> +}
> +
> +static void io_req_cache_free(struct io_kiocb *req)
> +{
> +	const bool irq_comp = io_op_defs[req->opcode].irq_comp;
> +	struct io_kiocb_cache_entry *ce;
> +	struct io_kiocb_cache *cache;
> +	unsigned long flags;
> +
> +	if (irq_comp)
> +		local_irq_save(flags);
> +	else
> +		preempt_disable();
> +
> +	cache = this_cpu_ptr(alloc_cache);
> +	ce = &cache->caches[irq_comp];
> +
> +	list_add(&req->list, &ce->alloc_list);
> +	if (++ce->nr_avail > IO_KIOCB_CACHE_MAX)
> +		io_req_cache_reclaim(ce);
The above code seems to show that io_req_cache_reclaim() will free all reqs in alloc_list,
then we'll need to kmem_cache_alloc() again, I guess indeed you intend to reserve
IO_KIOCB_CACHE_MAX reqs in alloc_list?

I still use my previous io_uring_nop_stress tool to evaluate the improvement
in a physical machine. Memory 250GB and cpu is "Intel(R) Xeon(R) CPU E5-2682 v4 @ 2.50GHz".
Before this patch:
$sudo taskset -c 60 ./io_uring_nop_stress -r 300
total ios: 1608773840
IOPS:      5362579

With this patch:
sudo taskset -c 60 ./io_uring_nop_stress -r 300
total ios: 1676910736
IOPS:      5589702
About 4.2% improvement.

Regards,
Xiaoguang Wang
> +
> +	if (irq_comp)
> +		local_irq_restore(flags);
> +	else
> +		preempt_enable();
> +}
> +
> +static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, int opcode)
> +{
> +	struct io_kiocb *req;
> +
> +	req = io_req_cache_alloc(opcode);
> +	if (req) {
> +		req->opcode = opcode;
> +		return req;
> +	}
> +
> +	return io_get_fallback_req(ctx, opcode);
>   }
>   
>   static inline void io_put_file(struct io_kiocb *req, struct file *file,
> @@ -1345,7 +1449,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
>   	if (req->flags & REQ_F_NEED_CLEANUP)
>   		io_cleanup_req(req);
>   
> -	kfree(req->io);
> +	if (req->io)
> +		kfree(req->io);
>   	if (req->file)
>   		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
>   	if (req->task)
> @@ -1371,28 +1476,21 @@ static void __io_free_req(struct io_kiocb *req)
>   
>   	percpu_ref_put(&req->ctx->refs);
>   	if (likely(!io_is_fallback_req(req)))
> -		kmem_cache_free(req_cachep, req);
> +		io_req_cache_free(req);
>   	else
>   		clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
>   }
>   
> -struct req_batch {
> -	void *reqs[IO_IOPOLL_BATCH];
> -	int to_free;
> -	int need_iter;
> -};
> -
>   static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
>   {
>   	if (!rb->to_free)
>   		return;
>   	if (rb->need_iter) {
> -		int i, inflight = 0;
> +		struct io_kiocb *req;
>   		unsigned long flags;
> +		int inflight = 0;
>   
> -		for (i = 0; i < rb->to_free; i++) {
> -			struct io_kiocb *req = rb->reqs[i];
> -
> +		list_for_each_entry(req, &rb->list, list) {
>   			if (req->flags & REQ_F_FIXED_FILE) {
>   				req->file = NULL;
>   				percpu_ref_put(req->fixed_file_refs);
> @@ -1405,9 +1503,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
>   			goto do_free;
>   
>   		spin_lock_irqsave(&ctx->inflight_lock, flags);
> -		for (i = 0; i < rb->to_free; i++) {
> -			struct io_kiocb *req = rb->reqs[i];
> -
> +		list_for_each_entry(req, &rb->list, list) {
>   			if (req->flags & REQ_F_INFLIGHT) {
>   				list_del(&req->inflight_entry);
>   				if (!--inflight)
> @@ -1420,9 +1516,10 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
>   			wake_up(&ctx->inflight_wait);
>   	}
>   do_free:
> -	kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
> +	io_req_cache_free_bulk(rb);
>   	percpu_ref_put_many(&ctx->refs, rb->to_free);
> -	rb->to_free = rb->need_iter = 0;
> +	rb->to_free = 0;
> +	rb->need_iter = rb->irq_comp = false;
>   }
>   
>   static bool io_link_cancel_timeout(struct io_kiocb *req)
> @@ -1670,11 +1767,12 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
>   		return false;
>   
>   	if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
> -		rb->need_iter++;
> +		rb->need_iter |= true;
> +	if (!rb->irq_comp && io_op_defs[req->opcode].irq_comp)
> +		rb->irq_comp |= true;
>   
> -	rb->reqs[rb->to_free++] = req;
> -	if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
> -		io_free_req_many(req->ctx, rb);
> +	list_add(&req->list, &rb->list);
> +	rb->to_free++;
>   	return true;
>   }
>   
> @@ -1697,10 +1795,14 @@ static int io_put_kbuf(struct io_kiocb *req)
>   static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
>   			       struct list_head *done)
>   {
> -	struct req_batch rb;
> +	struct req_batch rb = {
> +		.list		= LIST_HEAD_INIT(rb.list),
> +		.to_free	= 0,
> +		.need_iter	= false,
> +		.irq_comp	= false
> +	};
>   	struct io_kiocb *req;
>   
> -	rb.to_free = rb.need_iter = 0;
>   	while (!list_empty(done)) {
>   		int cflags = 0;
>   
> @@ -5703,8 +5805,6 @@ static void io_submit_state_end(struct io_submit_state *state)
>   {
>   	blk_finish_plug(&state->plug);
>   	io_file_put(state);
> -	if (state->free_reqs)
> -		kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
>   }
>   
>   /*
> @@ -5714,7 +5814,6 @@ static void io_submit_state_start(struct io_submit_state *state,
>   				  unsigned int max_ios)
>   {
>   	blk_start_plug(&state->plug);
> -	state->free_reqs = 0;
>   	state->file = NULL;
>   	state->ios_left = max_ios;
>   }
> @@ -5784,7 +5883,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
>   	 * link list.
>   	 */
>   	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
> -	req->opcode = READ_ONCE(sqe->opcode);
>   	req->user_data = READ_ONCE(sqe->user_data);
>   	req->io = NULL;
>   	req->file = NULL;
> @@ -5872,14 +5970,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>   			io_consume_sqe(ctx);
>   			break;
>   		}
> -		req = io_alloc_req(ctx, statep);
> +		req = io_alloc_req(ctx, READ_ONCE(sqe->opcode));
>   		if (unlikely(!req)) {
>   			if (!submitted)
>   				submitted = -EAGAIN;
>   			break;
>   		}
>   
> -		err = io_init_req(ctx, req, sqe, statep, async);
> +		err = io_init_req(ctx, req, sqe, NULL, async);
>   		io_consume_sqe(ctx);
>   		/* will complete beyond this point, count as submitted */
>   		submitted++;
> @@ -7626,6 +7724,17 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
>   					req->task->task_works != NULL);
>   	}
>   	spin_unlock_irq(&ctx->completion_lock);
> +	seq_printf(m, "AllocCache:\n");
> +	for_each_possible_cpu(i) {
> +		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, i);
> +		int j;
> +
> +		for (j = 0; j < ARRAY_SIZE(cache->caches); j++) {
> +			struct io_kiocb_cache_entry *ce = &cache->caches[j];
> +
> +			seq_printf(m, "  cpu%d: irq=%d, nr_free=%d, nr_avail=%d\n", i, j, ce->nr_free, ce->nr_avail);
> +		}
> +	}
>   	mutex_unlock(&ctx->uring_lock);
>   }
>   
> @@ -8101,8 +8210,130 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
>   	return ret;
>   }
>   
> +static unsigned long io_uring_cache_count(struct shrinker *shrink,
> +					  struct shrink_control *sc)
> +{
> +	unsigned long count = 0;
> +	int cpu, i;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct io_kiocb_cache *cache;
> +
> +		cache = per_cpu_ptr(alloc_cache, cpu);
> +		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
> +			struct io_kiocb_cache_entry *ce = &cache->caches[i];
> +
> +			count += ce->nr_free;
> +		}
> +	}
> +
> +	return count;
> +}
> +
> +static unsigned long __io_uring_cache_shrink(struct io_kiocb_cache_entry *ce,
> +					     int irq_comp, int *nr_to_scan)
> +{
> +	unsigned long freed = 0;
> +	struct io_kiocb *req;
> +	LIST_HEAD(free_list);
> +
> +	if (!ce->nr_free)
> +		return 0;
> +
> +	if (irq_comp)
> +		spin_lock_irq(&ce->free_lock);
> +	else
> +		spin_lock(&ce->free_lock);
> +
> +	while (!list_empty(&ce->free_list)) {
> +		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
> +		list_move(&req->list, &free_list);
> +		freed++;
> +		if (!--(*nr_to_scan))
> +			break;
> +	}
> +
> +	if (irq_comp)
> +		spin_unlock_irq(&ce->free_lock);
> +	else
> +		spin_unlock(&ce->free_lock);
> +
> +	while (!list_empty(&free_list)) {
> +		req = list_first_entry(&free_list, struct io_kiocb, list);
> +		list_del(&req->list);
> +		kmem_cache_free(req_cachep, req);
> +	}
> +
> +	return freed;
> +}
> +
> +static unsigned long io_uring_cache_shrink(int nr_to_scan)
> +{
> +	long freed = 0;
> +	int cpu, i;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
> +
> +		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
> +			struct io_kiocb_cache_entry *ce = &cache->caches[i];
> +
> +			freed += __io_uring_cache_shrink(ce, i, &nr_to_scan);
> +			if (!nr_to_scan)
> +				break;
> +		}
> +		if (!nr_to_scan)
> +			break;
> +	}
> +
> +	return freed ?: SHRINK_STOP;
> +}
> +
> +static unsigned long io_uring_cache_scan(struct shrinker *shrink,
> +					 struct shrink_control *sc)
> +{
> +	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
> +		return SHRINK_STOP;
> +
> +	return io_uring_cache_shrink(sc->nr_to_scan);
> +}
> +
> +static struct shrinker io_uring_shrinker = {
> +	.count_objects	= io_uring_cache_count,
> +	.scan_objects	= io_uring_cache_scan,
> +	.seeks		= DEFAULT_SEEKS,
> +};
> +
> +static void io_uring_kill_ce(struct io_kiocb_cache_entry *ce, bool irq_comp)
> +{
> +	struct io_kiocb *req;
> +
> +	list_splice_init(&ce->alloc_list, &ce->free_list);
> +
> +	while (!list_empty(&ce->free_list)) {
> +		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
> +		list_del(&req->list);
> +		kmem_cache_free(req_cachep, req);
> +	}
> +
> +	ce->nr_free = ce->nr_avail = 0;
> +}
> +
> +static int io_uring_notify_dead(unsigned int cpu)
> +{
> +	struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(cache->caches); i++)
> +		io_uring_kill_ce(&cache->caches[i], i);
> +
> +	return 0;
> +}
> +
>   static int __init io_uring_init(void)
>   {
> +	int cpu, i;
> +
>   #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
>   	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
>   	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
> @@ -8142,6 +8373,25 @@ static int __init io_uring_init(void)
>   	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
>   	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
>   	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
> +
> +	alloc_cache = alloc_percpu(struct io_kiocb_cache);
> +	for_each_possible_cpu(cpu) {
> +		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
> +
> +		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
> +			struct io_kiocb_cache_entry *ce = &cache->caches[i];
> +
> +			INIT_LIST_HEAD(&ce->alloc_list);
> +			spin_lock_init(&ce->free_lock);
> +			INIT_LIST_HEAD(&ce->free_list);
> +			ce->nr_free = 0;
> +			ce->nr_avail = 0;
> +		}
> +	}
> +
> +	cpuhp_setup_state_nocalls(CPUHP_IOURING_DEAD, "io_uring:dead", NULL,
> +					io_uring_notify_dead);
> +	WARN_ON(register_shrinker(&io_uring_shrinker));
>   	return 0;
>   };
>   __initcall(io_uring_init);
> diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
> index 77d70b633531..3b80556572a5 100644
> --- a/include/linux/cpuhotplug.h
> +++ b/include/linux/cpuhotplug.h
> @@ -60,6 +60,7 @@ enum cpuhp_state {
>   	CPUHP_LUSTRE_CFS_DEAD,
>   	CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
>   	CPUHP_PADATA_DEAD,
> +	CPUHP_IOURING_DEAD,
>   	CPUHP_WORKQUEUE_PREP,
>   	CPUHP_POWER_NUMA_PREPARE,
>   	CPUHP_HRTIMERS_PREPARE,
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-14  8:25 ` Xiaoguang Wang
@ 2020-05-14 14:22   ` Jens Axboe
  2020-05-14 14:33     ` Jens Axboe
  2020-05-16 16:15     ` Xiaoguang Wang
  0 siblings, 2 replies; 22+ messages in thread
From: Jens Axboe @ 2020-05-14 14:22 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue, Pavel Begunkov

On 5/14/20 2:25 AM, Xiaoguang Wang wrote:
> hi,
> 
>> +
>> +static void io_req_cache_free_bulk(struct req_batch *rb)
>> +{
>> +	struct io_kiocb_cache_entry *ce;
>> +	struct io_kiocb_cache *cache;
>> +
>> +	if (rb->irq_comp)
>> +		local_irq_disable();
>> +	else
>> +		preempt_disable();
>> +
>> +	cache = this_cpu_ptr(alloc_cache);
>> +	ce = &cache->caches[rb->irq_comp];
>> +
>> +	list_splice_init(&rb->list, &ce->alloc_list);
>> +	ce->nr_avail += rb->to_free;
>> +	if (ce->nr_avail > IO_KIOCB_CACHE_MAX)
>> +		io_req_cache_reclaim(ce);
>> +
>> +	if (rb->irq_comp)
>> +		local_irq_enable();
>> +	else
>> +		preempt_enable();
>> +}
>> +
>> +static void io_req_cache_free(struct io_kiocb *req)
>> +{
>> +	const bool irq_comp = io_op_defs[req->opcode].irq_comp;
>> +	struct io_kiocb_cache_entry *ce;
>> +	struct io_kiocb_cache *cache;
>> +	unsigned long flags;
>> +
>> +	if (irq_comp)
>> +		local_irq_save(flags);
>> +	else
>> +		preempt_disable();
>> +
>> +	cache = this_cpu_ptr(alloc_cache);
>> +	ce = &cache->caches[irq_comp];
>> +
>> +	list_add(&req->list, &ce->alloc_list);
>> +	if (++ce->nr_avail > IO_KIOCB_CACHE_MAX)
>> +		io_req_cache_reclaim(ce);
> The above code seems to show that io_req_cache_reclaim() will free all reqs in
> alloc_list, then we'll need to kmem_cache_alloc() again, I guess
> indeed you intend to reserve IO_KIOCB_CACHE_MAX reqs in alloc_list?

Yeah a thinko in that one, actually did a v2 shortly after that, just
didn't send it out. Including it below in case you are interested, when
it hits reclaim, it reclaims IO_KIOCB_CACHE_RECLAIM of them.

> I still use my previous io_uring_nop_stress tool to evaluate the improvement
> in a physical machine. Memory 250GB and cpu is "Intel(R) Xeon(R) CPU E5-2682 v4 @ 2.50GHz".
> Before this patch:
> $sudo taskset -c 60 ./io_uring_nop_stress -r 300
> total ios: 1608773840
> IOPS:      5362579
> 
> With this patch:
> sudo taskset -c 60 ./io_uring_nop_stress -r 300
> total ios: 1676910736
> IOPS:      5589702
> About 4.2% improvement.

That's not bad. Can you try the patch from Pekka as well, just to see if
that helps for you?

I also had another idea... We basically have two types of request life
times:

1) io_kiocb can get queued up internally
2) io_kiocb completes inline

For the latter, it's totally feasible to just have the io_kiocb on
stack. The downside is if we need to go the slower path, then we need to
alloc an io_kiocb then and copy it. But maybe that's OK... I'll play
with it.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index d2e37215d05a..3be5f0e60d9f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -79,6 +79,7 @@
 #include <linux/fs_struct.h>
 #include <linux/splice.h>
 #include <linux/task_work.h>
+#include <linux/cpuhotplug.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
@@ -652,17 +653,10 @@ struct io_kiocb {
 };
 
 #define IO_PLUG_THRESHOLD		2
-#define IO_IOPOLL_BATCH			8
 
 struct io_submit_state {
 	struct blk_plug		plug;
 
-	/*
-	 * io_kiocb alloc cache
-	 */
-	void			*reqs[IO_IOPOLL_BATCH];
-	unsigned int		free_reqs;
-
 	/*
 	 * File reference cache
 	 */
@@ -673,6 +667,27 @@ struct io_submit_state {
 	unsigned int		ios_left;
 };
 
+struct io_kiocb_cache_entry {
+	/* ready for allocation */
+	struct list_head	alloc_list;
+	unsigned		nr_avail;
+
+	/* ready for shrinker reclaim */
+	spinlock_t		free_lock;
+	struct list_head	free_list;
+	unsigned		nr_free;
+};
+
+struct io_kiocb_cache {
+	/* one for requests with IRQ completion, one for no IRQ */
+	struct io_kiocb_cache_entry	caches[2];
+};
+
+#define IO_KIOCB_CACHE_MAX	256
+#define IO_KIOCB_CACHE_RECLAIM	 16
+
+static struct io_kiocb_cache *alloc_cache;
+
 struct io_op_def {
 	/* needs req->io allocated for deferral/async */
 	unsigned		async_ctx : 1;
@@ -695,6 +710,8 @@ struct io_op_def {
 	unsigned		pollout : 1;
 	/* op supports buffer selection */
 	unsigned		buffer_select : 1;
+	/* IRQ completion */
+	unsigned		irq_comp : 1;
 };
 
 static const struct io_op_def io_op_defs[] = {
@@ -706,6 +723,7 @@ static const struct io_op_def io_op_defs[] = {
 		.unbound_nonreg_file	= 1,
 		.pollin			= 1,
 		.buffer_select		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_WRITEV] = {
 		.async_ctx		= 1,
@@ -714,6 +732,7 @@ static const struct io_op_def io_op_defs[] = {
 		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_FSYNC] = {
 		.needs_file		= 1,
@@ -722,12 +741,14 @@ static const struct io_op_def io_op_defs[] = {
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollin			= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_WRITE_FIXED] = {
 		.needs_file		= 1,
 		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_POLL_ADD] = {
 		.needs_file		= 1,
@@ -803,12 +824,14 @@ static const struct io_op_def io_op_defs[] = {
 		.unbound_nonreg_file	= 1,
 		.pollin			= 1,
 		.buffer_select		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_WRITE] = {
 		.needs_mm		= 1,
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
+		.irq_comp		= 1,
 	},
 	[IORING_OP_FADVISE] = {
 		.needs_file		= 1,
@@ -1281,54 +1304,161 @@ static inline bool io_is_fallback_req(struct io_kiocb *req)
 			((unsigned long) req->ctx->fallback_req & ~1UL);
 }
 
-static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
+static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx, int op)
 {
 	struct io_kiocb *req;
 
 	req = ctx->fallback_req;
-	if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
+	if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req)) {
+		req->opcode = op;
 		return req;
+	}
 
 	return NULL;
 }
 
-static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
-				     struct io_submit_state *state)
+static bool io_req_cache_steal(struct io_kiocb_cache_entry *ce)
 {
-	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
-	struct io_kiocb *req;
+	int nr = 0;
 
-	if (!state) {
-		req = kmem_cache_alloc(req_cachep, gfp);
-		if (unlikely(!req))
-			goto fallback;
-	} else if (!state->free_reqs) {
-		size_t sz;
-		int ret;
+	if (ce->nr_free < IO_KIOCB_CACHE_RECLAIM)
+		return false;
 
-		sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
-		ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
+	spin_lock(&ce->free_lock);
+	while (!list_empty(&ce->free_list)) {
+		struct io_kiocb *req;
 
-		/*
-		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
-		 * retry single alloc to be on the safe side.
-		 */
-		if (unlikely(ret <= 0)) {
-			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
-			if (!state->reqs[0])
-				goto fallback;
-			ret = 1;
-		}
-		state->free_reqs = ret - 1;
-		req = state->reqs[ret - 1];
-	} else {
-		state->free_reqs--;
-		req = state->reqs[state->free_reqs];
+		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
+		list_move(&req->list, &ce->alloc_list);
+		if (++nr >= IO_KIOCB_CACHE_RECLAIM)
+			break;
 	}
+	ce->nr_avail += nr;
+	ce->nr_free -= nr;
+	spin_unlock(&ce->free_lock);
+	return nr > 0;
+}
+
+static struct io_kiocb *io_req_cache_alloc(int op)
+{
+	const bool irq_comp = io_op_defs[op].irq_comp;
+	struct io_kiocb_cache_entry *ce;
+	struct io_kiocb_cache *cache;
+	struct io_kiocb *req = NULL;
+
+	if (irq_comp)
+		local_irq_disable();
+	else
+		preempt_disable();
+
+	cache = this_cpu_ptr(alloc_cache);
+	ce = &cache->caches[irq_comp];
+
+	if (!list_empty(&ce->alloc_list) || io_req_cache_steal(ce)) {
+		req = list_first_entry(&ce->alloc_list, struct io_kiocb, list);
+		list_del(&req->list);
+		ce->nr_avail--;
+	}
+
+	if (irq_comp)
+		local_irq_enable();
+	else
+		preempt_enable();
+
+	if (req)
+		return req;
 
-	return req;
-fallback:
-	return io_get_fallback_req(ctx);
+	return kmem_cache_alloc(req_cachep, GFP_KERNEL);
+}
+
+static void io_req_cache_reclaim(struct io_kiocb_cache_entry *ce)
+{
+	LIST_HEAD(free_list);
+	int nr = 0;
+
+	while (!list_empty(&ce->alloc_list)) {
+		struct io_kiocb *req;
+
+		req = list_last_entry(&ce->alloc_list, struct io_kiocb, list);
+		list_move(&req->list, &free_list);
+		if (++nr >= IO_KIOCB_CACHE_RECLAIM)
+			break;
+	}
+
+	spin_lock(&ce->free_lock);
+	list_splice(&free_list, &ce->free_list);
+	ce->nr_free += nr;
+	ce->nr_avail -= nr;
+	spin_unlock(&ce->free_lock);
+}
+
+struct req_batch {
+	struct list_head list;
+	int to_free;
+	bool need_iter;
+	bool irq_comp;
+};
+
+static void io_req_cache_free_bulk(struct req_batch *rb)
+{
+	struct io_kiocb_cache_entry *ce;
+	struct io_kiocb_cache *cache;
+
+	if (rb->irq_comp)
+		local_irq_disable();
+	else
+		preempt_disable();
+
+	cache = this_cpu_ptr(alloc_cache);
+	ce = &cache->caches[rb->irq_comp];
+
+	list_splice_init(&rb->list, &ce->alloc_list);
+	ce->nr_avail += rb->to_free;
+	if (ce->nr_avail > IO_KIOCB_CACHE_MAX)
+		io_req_cache_reclaim(ce);
+
+	if (rb->irq_comp)
+		local_irq_enable();
+	else
+		preempt_enable();
+}
+
+static void io_req_cache_free(struct io_kiocb *req)
+{
+	const bool irq_comp = io_op_defs[req->opcode].irq_comp;
+	struct io_kiocb_cache_entry *ce;
+	struct io_kiocb_cache *cache;
+	unsigned long flags;
+
+	if (irq_comp)
+		local_irq_save(flags);
+	else
+		preempt_disable();
+
+	cache = this_cpu_ptr(alloc_cache);
+	ce = &cache->caches[irq_comp];
+
+	list_add(&req->list, &ce->alloc_list);
+	if (++ce->nr_avail > IO_KIOCB_CACHE_MAX)
+		io_req_cache_reclaim(ce);
+
+	if (irq_comp)
+		local_irq_restore(flags);
+	else
+		preempt_enable();
+}
+
+static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, int opcode)
+{
+	struct io_kiocb *req;
+
+	req = io_req_cache_alloc(opcode);
+	if (req) {
+		req->opcode = opcode;
+		return req;
+	}
+
+	return io_get_fallback_req(ctx, opcode);
 }
 
 static inline void io_put_file(struct io_kiocb *req, struct file *file,
@@ -1345,7 +1475,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
 	if (req->flags & REQ_F_NEED_CLEANUP)
 		io_cleanup_req(req);
 
-	kfree(req->io);
+	if (req->io)
+		kfree(req->io);
 	if (req->file)
 		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
 	if (req->task)
@@ -1371,28 +1502,21 @@ static void __io_free_req(struct io_kiocb *req)
 
 	percpu_ref_put(&req->ctx->refs);
 	if (likely(!io_is_fallback_req(req)))
-		kmem_cache_free(req_cachep, req);
+		io_req_cache_free(req);
 	else
 		clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
 }
 
-struct req_batch {
-	void *reqs[IO_IOPOLL_BATCH];
-	int to_free;
-	int need_iter;
-};
-
 static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
 {
 	if (!rb->to_free)
 		return;
 	if (rb->need_iter) {
-		int i, inflight = 0;
+		struct io_kiocb *req;
 		unsigned long flags;
+		int inflight = 0;
 
-		for (i = 0; i < rb->to_free; i++) {
-			struct io_kiocb *req = rb->reqs[i];
-
+		list_for_each_entry(req, &rb->list, list) {
 			if (req->flags & REQ_F_FIXED_FILE) {
 				req->file = NULL;
 				percpu_ref_put(req->fixed_file_refs);
@@ -1405,9 +1529,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
 			goto do_free;
 
 		spin_lock_irqsave(&ctx->inflight_lock, flags);
-		for (i = 0; i < rb->to_free; i++) {
-			struct io_kiocb *req = rb->reqs[i];
-
+		list_for_each_entry(req, &rb->list, list) {
 			if (req->flags & REQ_F_INFLIGHT) {
 				list_del(&req->inflight_entry);
 				if (!--inflight)
@@ -1420,9 +1542,8 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
 			wake_up(&ctx->inflight_wait);
 	}
 do_free:
-	kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
+	io_req_cache_free_bulk(rb);
 	percpu_ref_put_many(&ctx->refs, rb->to_free);
-	rb->to_free = rb->need_iter = 0;
 }
 
 static bool io_link_cancel_timeout(struct io_kiocb *req)
@@ -1670,11 +1791,12 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
 		return false;
 
 	if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
-		rb->need_iter++;
+		rb->need_iter |= true;
+	if (!rb->irq_comp && io_op_defs[req->opcode].irq_comp)
+		rb->irq_comp |= true;
 
-	rb->reqs[rb->to_free++] = req;
-	if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
-		io_free_req_many(req->ctx, rb);
+	list_add(&req->list, &rb->list);
+	rb->to_free++;
 	return true;
 }
 
@@ -1697,10 +1819,14 @@ static int io_put_kbuf(struct io_kiocb *req)
 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 			       struct list_head *done)
 {
-	struct req_batch rb;
+	struct req_batch rb = {
+		.list		= LIST_HEAD_INIT(rb.list),
+		.to_free	= 0,
+		.need_iter	= false,
+		.irq_comp	= false
+	};
 	struct io_kiocb *req;
 
-	rb.to_free = rb.need_iter = 0;
 	while (!list_empty(done)) {
 		int cflags = 0;
 
@@ -5703,8 +5829,6 @@ static void io_submit_state_end(struct io_submit_state *state)
 {
 	blk_finish_plug(&state->plug);
 	io_file_put(state);
-	if (state->free_reqs)
-		kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
 }
 
 /*
@@ -5714,7 +5838,6 @@ static void io_submit_state_start(struct io_submit_state *state,
 				  unsigned int max_ios)
 {
 	blk_start_plug(&state->plug);
-	state->free_reqs = 0;
 	state->file = NULL;
 	state->ios_left = max_ios;
 }
@@ -5784,7 +5907,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	 * link list.
 	 */
 	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
-	req->opcode = READ_ONCE(sqe->opcode);
 	req->user_data = READ_ONCE(sqe->user_data);
 	req->io = NULL;
 	req->file = NULL;
@@ -5872,7 +5994,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			io_consume_sqe(ctx);
 			break;
 		}
-		req = io_alloc_req(ctx, statep);
+		req = io_alloc_req(ctx, READ_ONCE(sqe->opcode));
 		if (unlikely(!req)) {
 			if (!submitted)
 				submitted = -EAGAIN;
@@ -7626,6 +7748,17 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 					req->task->task_works != NULL);
 	}
 	spin_unlock_irq(&ctx->completion_lock);
+	seq_printf(m, "AllocCache:\n");
+	for_each_possible_cpu(i) {
+		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, i);
+		int j;
+
+		for (j = 0; j < ARRAY_SIZE(cache->caches); j++) {
+			struct io_kiocb_cache_entry *ce = &cache->caches[j];
+
+			seq_printf(m, "  cpu%d: irq=%d, nr_free=%d, nr_avail=%d\n", i, j, ce->nr_free, ce->nr_avail);
+		}
+	}
 	mutex_unlock(&ctx->uring_lock);
 }
 
@@ -8101,8 +8234,131 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 	return ret;
 }
 
+static unsigned long io_uring_cache_count(struct shrinker *shrink,
+					  struct shrink_control *sc)
+{
+	unsigned long count = 0;
+	int cpu, i;
+
+	for_each_possible_cpu(cpu) {
+		struct io_kiocb_cache *cache;
+
+		cache = per_cpu_ptr(alloc_cache, cpu);
+		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
+			struct io_kiocb_cache_entry *ce = &cache->caches[i];
+
+			count += ce->nr_free;
+		}
+	}
+
+	return count;
+}
+
+static unsigned long __io_uring_cache_shrink(struct io_kiocb_cache_entry *ce,
+					     const bool irq_comp,
+					     int *nr_to_scan)
+{
+	unsigned long freed = 0;
+	struct io_kiocb *req;
+	LIST_HEAD(free_list);
+
+	if (!ce->nr_free)
+		return 0;
+
+	if (irq_comp)
+		spin_lock_irq(&ce->free_lock);
+	else
+		spin_lock(&ce->free_lock);
+
+	while (!list_empty(&ce->free_list)) {
+		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
+		list_move(&req->list, &free_list);
+		freed++;
+		if (!--(*nr_to_scan))
+			break;
+	}
+
+	if (irq_comp)
+		spin_unlock_irq(&ce->free_lock);
+	else
+		spin_unlock(&ce->free_lock);
+
+	while (!list_empty(&free_list)) {
+		req = list_first_entry(&free_list, struct io_kiocb, list);
+		list_del(&req->list);
+		kmem_cache_free(req_cachep, req);
+	}
+
+	return freed;
+}
+
+static unsigned long io_uring_cache_shrink(int nr_to_scan)
+{
+	long freed = 0;
+	int cpu, i;
+
+	for_each_possible_cpu(cpu) {
+		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
+
+		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
+			struct io_kiocb_cache_entry *ce = &cache->caches[i];
+
+			freed += __io_uring_cache_shrink(ce, i, &nr_to_scan);
+			if (!nr_to_scan)
+				break;
+		}
+		if (!nr_to_scan)
+			break;
+	}
+
+	return freed ?: SHRINK_STOP;
+}
+
+static unsigned long io_uring_cache_scan(struct shrinker *shrink,
+					 struct shrink_control *sc)
+{
+	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+		return SHRINK_STOP;
+
+	return io_uring_cache_shrink(sc->nr_to_scan);
+}
+
+static struct shrinker io_uring_shrinker = {
+	.count_objects	= io_uring_cache_count,
+	.scan_objects	= io_uring_cache_scan,
+	.seeks		= DEFAULT_SEEKS,
+};
+
+static void io_uring_kill_ce(struct io_kiocb_cache_entry *ce)
+{
+	struct io_kiocb *req;
+
+	list_splice_init(&ce->alloc_list, &ce->free_list);
+
+	while (!list_empty(&ce->free_list)) {
+		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
+		list_del(&req->list);
+		kmem_cache_free(req_cachep, req);
+	}
+
+	ce->nr_free = ce->nr_avail = 0;
+}
+
+static int io_uring_notify_dead(unsigned int cpu)
+{
+	struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(cache->caches); i++)
+		io_uring_kill_ce(&cache->caches[i]);
+
+	return 0;
+}
+
 static int __init io_uring_init(void)
 {
+	int cpu, i;
+
 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
 	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
 	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
@@ -8142,6 +8398,25 @@ static int __init io_uring_init(void)
 	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
 	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
 	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+
+	alloc_cache = alloc_percpu(struct io_kiocb_cache);
+	for_each_possible_cpu(cpu) {
+		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
+
+		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
+			struct io_kiocb_cache_entry *ce = &cache->caches[i];
+
+			INIT_LIST_HEAD(&ce->alloc_list);
+			spin_lock_init(&ce->free_lock);
+			INIT_LIST_HEAD(&ce->free_list);
+			ce->nr_free = 0;
+			ce->nr_avail = 0;
+		}
+	}
+
+	cpuhp_setup_state_nocalls(CPUHP_IOURING_DEAD, "io_uring:dead", NULL,
+					io_uring_notify_dead);
+	WARN_ON(register_shrinker(&io_uring_shrinker));
 	return 0;
 };
 __initcall(io_uring_init);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 77d70b633531..3b80556572a5 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -60,6 +60,7 @@ enum cpuhp_state {
 	CPUHP_LUSTRE_CFS_DEAD,
 	CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
 	CPUHP_PADATA_DEAD,
+	CPUHP_IOURING_DEAD,
 	CPUHP_WORKQUEUE_PREP,
 	CPUHP_POWER_NUMA_PREPARE,
 	CPUHP_HRTIMERS_PREPARE,

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-14 14:22   ` Jens Axboe
@ 2020-05-14 14:33     ` Jens Axboe
  2020-05-14 14:53       ` Pavel Begunkov
  2020-05-16  9:20       ` Xiaoguang Wang
  2020-05-16 16:15     ` Xiaoguang Wang
  1 sibling, 2 replies; 22+ messages in thread
From: Jens Axboe @ 2020-05-14 14:33 UTC (permalink / raw)
  To: Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue, Pavel Begunkov

On 5/14/20 8:22 AM, Jens Axboe wrote:
>> I still use my previous io_uring_nop_stress tool to evaluate the improvement
>> in a physical machine. Memory 250GB and cpu is "Intel(R) Xeon(R) CPU E5-2682 v4 @ 2.50GHz".
>> Before this patch:
>> $sudo taskset -c 60 ./io_uring_nop_stress -r 300
>> total ios: 1608773840
>> IOPS:      5362579
>>
>> With this patch:
>> sudo taskset -c 60 ./io_uring_nop_stress -r 300
>> total ios: 1676910736
>> IOPS:      5589702
>> About 4.2% improvement.
> 
> That's not bad. Can you try the patch from Pekka as well, just to see if
> that helps for you?
> 
> I also had another idea... We basically have two types of request life
> times:
> 
> 1) io_kiocb can get queued up internally
> 2) io_kiocb completes inline
> 
> For the latter, it's totally feasible to just have the io_kiocb on
> stack. The downside is if we need to go the slower path, then we need to
> alloc an io_kiocb then and copy it. But maybe that's OK... I'll play
> with it.

Can you try this with your microbenchmark? Just curious what it looks
like for that test case if we completely take slab alloc+free out of it.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index d2e37215d05a..4ecd6bd38f02 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -525,6 +525,7 @@ enum {
 	REQ_F_POLLED_BIT,
 	REQ_F_BUFFER_SELECTED_BIT,
 	REQ_F_NO_FILE_TABLE_BIT,
+	REQ_F_STACK_REQ_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -580,6 +581,8 @@ enum {
 	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
 	/* doesn't need file table for this request */
 	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
+	/* on-stack req */
+	REQ_F_STACK_REQ		= BIT(REQ_F_STACK_REQ_BIT),
 };
 
 struct async_poll {
@@ -695,10 +698,14 @@ struct io_op_def {
 	unsigned		pollout : 1;
 	/* op supports buffer selection */
 	unsigned		buffer_select : 1;
+	/* op can use stack req */
+	unsigned		stack_req : 1;
 };
 
 static const struct io_op_def io_op_defs[] = {
-	[IORING_OP_NOP] = {},
+	[IORING_OP_NOP] = {
+		.stack_req		= 1,
+	},
 	[IORING_OP_READV] = {
 		.async_ctx		= 1,
 		.needs_mm		= 1,
@@ -1345,7 +1352,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
 	if (req->flags & REQ_F_NEED_CLEANUP)
 		io_cleanup_req(req);
 
-	kfree(req->io);
+	if (req->io)
+		kfree(req->io);
 	if (req->file)
 		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
 	if (req->task)
@@ -1370,6 +1378,8 @@ static void __io_free_req(struct io_kiocb *req)
 	}
 
 	percpu_ref_put(&req->ctx->refs);
+	if (req->flags & REQ_F_STACK_REQ)
+		return;
 	if (likely(!io_is_fallback_req(req)))
 		kmem_cache_free(req_cachep, req);
 	else
@@ -5784,12 +5794,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	 * link list.
 	 */
 	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
-	req->opcode = READ_ONCE(sqe->opcode);
 	req->user_data = READ_ONCE(sqe->user_data);
 	req->io = NULL;
 	req->file = NULL;
 	req->ctx = ctx;
-	req->flags = 0;
 	/* one is dropped after submission, the other at completion */
 	refcount_set(&req->refs, 2);
 	req->task = NULL;
@@ -5839,6 +5847,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 {
 	struct io_submit_state state, *statep = NULL;
 	struct io_kiocb *link = NULL;
+	struct io_kiocb stack_req;
 	int i, submitted = 0;
 
 	/* if we have a backlog and couldn't flush it all, return BUSY */
@@ -5865,20 +5874,31 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 	for (i = 0; i < nr; i++) {
 		const struct io_uring_sqe *sqe;
 		struct io_kiocb *req;
-		int err;
+		int err, op;
 
 		sqe = io_get_sqe(ctx);
 		if (unlikely(!sqe)) {
 			io_consume_sqe(ctx);
 			break;
 		}
-		req = io_alloc_req(ctx, statep);
-		if (unlikely(!req)) {
-			if (!submitted)
-				submitted = -EAGAIN;
-			break;
+
+		op = READ_ONCE(sqe->opcode);
+
+		if (io_op_defs[op].stack_req) {
+			req = &stack_req;
+			req->flags = REQ_F_STACK_REQ;
+		} else {
+			req = io_alloc_req(ctx, statep);
+			if (unlikely(!req)) {
+				if (!submitted)
+					submitted = -EAGAIN;
+				break;
+			}
+			req->flags = 0;
 		}
 
+		req->opcode = op;
+
 		err = io_init_req(ctx, req, sqe, statep, async);
 		io_consume_sqe(ctx);
 		/* will complete beyond this point, count as submitted */

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-14 14:33     ` Jens Axboe
@ 2020-05-14 14:53       ` Pavel Begunkov
  2020-05-14 15:15         ` Jens Axboe
  2020-05-16  9:20       ` Xiaoguang Wang
  1 sibling, 1 reply; 22+ messages in thread
From: Pavel Begunkov @ 2020-05-14 14:53 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue

On 14/05/2020 17:33, Jens Axboe wrote:
> On 5/14/20 8:22 AM, Jens Axboe wrote:
>>> I still use my previous io_uring_nop_stress tool to evaluate the improvement
>>> in a physical machine. Memory 250GB and cpu is "Intel(R) Xeon(R) CPU E5-2682 v4 @ 2.50GHz".
>>> Before this patch:
>>> $sudo taskset -c 60 ./io_uring_nop_stress -r 300
>>> total ios: 1608773840
>>> IOPS:      5362579
>>>
>>> With this patch:
>>> sudo taskset -c 60 ./io_uring_nop_stress -r 300
>>> total ios: 1676910736
>>> IOPS:      5589702
>>> About 4.2% improvement.
>>
>> That's not bad. Can you try the patch from Pekka as well, just to see if
>> that helps for you?
>>
>> I also had another idea... We basically have two types of request life
>> times:
>>
>> 1) io_kiocb can get queued up internally
>> 2) io_kiocb completes inline
>>
>> For the latter, it's totally feasible to just have the io_kiocb on
>> stack. The downside is if we need to go the slower path, then we need to
>> alloc an io_kiocb then and copy it. But maybe that's OK... I'll play
>> with it.

Does it differ from having one pre-allocated req? Like fallback_req, but without
atomics and returned only under uring_mutex (i.e. in __io_queue_sqe()). Putting
aside its usefulness, at least it will have a chance to work with reads/writes.

> 
> Can you try this with your microbenchmark? Just curious what it looks
> like for that test case if we completely take slab alloc+free out of it.
> 
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index d2e37215d05a..4ecd6bd38f02 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -525,6 +525,7 @@ enum {
>  	REQ_F_POLLED_BIT,
>  	REQ_F_BUFFER_SELECTED_BIT,
>  	REQ_F_NO_FILE_TABLE_BIT,
> +	REQ_F_STACK_REQ_BIT,
>  
>  	/* not a real bit, just to check we're not overflowing the space */
>  	__REQ_F_LAST_BIT,
> @@ -580,6 +581,8 @@ enum {
>  	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
>  	/* doesn't need file table for this request */
>  	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
> +	/* on-stack req */
> +	REQ_F_STACK_REQ		= BIT(REQ_F_STACK_REQ_BIT),
>  };
>  
>  struct async_poll {
> @@ -695,10 +698,14 @@ struct io_op_def {
>  	unsigned		pollout : 1;
>  	/* op supports buffer selection */
>  	unsigned		buffer_select : 1;
> +	/* op can use stack req */
> +	unsigned		stack_req : 1;
>  };
>  
>  static const struct io_op_def io_op_defs[] = {
> -	[IORING_OP_NOP] = {},
> +	[IORING_OP_NOP] = {
> +		.stack_req		= 1,
> +	},
>  	[IORING_OP_READV] = {
>  		.async_ctx		= 1,
>  		.needs_mm		= 1,
> @@ -1345,7 +1352,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
>  	if (req->flags & REQ_F_NEED_CLEANUP)
>  		io_cleanup_req(req);
>  
> -	kfree(req->io);
> +	if (req->io)
> +		kfree(req->io);
>  	if (req->file)
>  		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
>  	if (req->task)
> @@ -1370,6 +1378,8 @@ static void __io_free_req(struct io_kiocb *req)
>  	}
>  
>  	percpu_ref_put(&req->ctx->refs);
> +	if (req->flags & REQ_F_STACK_REQ)
> +		return;
>  	if (likely(!io_is_fallback_req(req)))
>  		kmem_cache_free(req_cachep, req);
>  	else
> @@ -5784,12 +5794,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
>  	 * link list.
>  	 */
>  	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
> -	req->opcode = READ_ONCE(sqe->opcode);
>  	req->user_data = READ_ONCE(sqe->user_data);
>  	req->io = NULL;
>  	req->file = NULL;
>  	req->ctx = ctx;
> -	req->flags = 0;
>  	/* one is dropped after submission, the other at completion */
>  	refcount_set(&req->refs, 2);
>  	req->task = NULL;
> @@ -5839,6 +5847,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>  {
>  	struct io_submit_state state, *statep = NULL;
>  	struct io_kiocb *link = NULL;
> +	struct io_kiocb stack_req;
>  	int i, submitted = 0;
>  
>  	/* if we have a backlog and couldn't flush it all, return BUSY */
> @@ -5865,20 +5874,31 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>  	for (i = 0; i < nr; i++) {
>  		const struct io_uring_sqe *sqe;
>  		struct io_kiocb *req;
> -		int err;
> +		int err, op;
>  
>  		sqe = io_get_sqe(ctx);
>  		if (unlikely(!sqe)) {
>  			io_consume_sqe(ctx);
>  			break;
>  		}
> -		req = io_alloc_req(ctx, statep);
> -		if (unlikely(!req)) {
> -			if (!submitted)
> -				submitted = -EAGAIN;
> -			break;
> +
> +		op = READ_ONCE(sqe->opcode);
> +
> +		if (io_op_defs[op].stack_req) {
> +			req = &stack_req;
> +			req->flags = REQ_F_STACK_REQ;
> +		} else {
> +			req = io_alloc_req(ctx, statep);
> +			if (unlikely(!req)) {
> +				if (!submitted)
> +					submitted = -EAGAIN;
> +				break;
> +			}
> +			req->flags = 0;
>  		}
>  
> +		req->opcode = op;
> +
>  		err = io_init_req(ctx, req, sqe, statep, async);
>  		io_consume_sqe(ctx);
>  		/* will complete beyond this point, count as submitted */
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-14 14:53       ` Pavel Begunkov
@ 2020-05-14 15:15         ` Jens Axboe
  2020-05-14 15:37           ` Pavel Begunkov
  0 siblings, 1 reply; 22+ messages in thread
From: Jens Axboe @ 2020-05-14 15:15 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue

On 5/14/20 8:53 AM, Pavel Begunkov wrote:
> On 14/05/2020 17:33, Jens Axboe wrote:
>> On 5/14/20 8:22 AM, Jens Axboe wrote:
>>>> I still use my previous io_uring_nop_stress tool to evaluate the improvement
>>>> in a physical machine. Memory 250GB and cpu is "Intel(R) Xeon(R) CPU E5-2682 v4 @ 2.50GHz".
>>>> Before this patch:
>>>> $sudo taskset -c 60 ./io_uring_nop_stress -r 300
>>>> total ios: 1608773840
>>>> IOPS:      5362579
>>>>
>>>> With this patch:
>>>> sudo taskset -c 60 ./io_uring_nop_stress -r 300
>>>> total ios: 1676910736
>>>> IOPS:      5589702
>>>> About 4.2% improvement.
>>>
>>> That's not bad. Can you try the patch from Pekka as well, just to see if
>>> that helps for you?
>>>
>>> I also had another idea... We basically have two types of request life
>>> times:
>>>
>>> 1) io_kiocb can get queued up internally
>>> 2) io_kiocb completes inline
>>>
>>> For the latter, it's totally feasible to just have the io_kiocb on
>>> stack. The downside is if we need to go the slower path, then we need to
>>> alloc an io_kiocb then and copy it. But maybe that's OK... I'll play
>>> with it.
> 
> Does it differ from having one pre-allocated req? Like fallback_req,
> but without atomics and returned only under uring_mutex (i.e. in
> __io_queue_sqe()). Putting aside its usefulness, at least it will have
> a chance to work with reads/writes.

But then you need atomics. I actually think the bigger win here is not
having to use atomic refcounts for this particular part, since we know
the request can't get shared.

Modified below with poor-man-dec-and-test, pretty sure that part is a
bigger win than avoiding the allocator. So maybe that's a better
direction to take, for things we can't get irq completions on.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index d2e37215d05a..27c1c8f4d2f4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -525,6 +525,7 @@ enum {
 	REQ_F_POLLED_BIT,
 	REQ_F_BUFFER_SELECTED_BIT,
 	REQ_F_NO_FILE_TABLE_BIT,
+	REQ_F_STACK_REQ_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -580,6 +581,8 @@ enum {
 	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
 	/* doesn't need file table for this request */
 	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
+	/* on-stack req */
+	REQ_F_STACK_REQ		= BIT(REQ_F_STACK_REQ_BIT),
 };
 
 struct async_poll {
@@ -695,10 +698,14 @@ struct io_op_def {
 	unsigned		pollout : 1;
 	/* op supports buffer selection */
 	unsigned		buffer_select : 1;
+	/* op can use stack req */
+	unsigned		stack_req : 1;
 };
 
 static const struct io_op_def io_op_defs[] = {
-	[IORING_OP_NOP] = {},
+	[IORING_OP_NOP] = {
+		.stack_req		= 1,
+	},
 	[IORING_OP_READV] = {
 		.async_ctx		= 1,
 		.needs_mm		= 1,
@@ -1345,7 +1352,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
 	if (req->flags & REQ_F_NEED_CLEANUP)
 		io_cleanup_req(req);
 
-	kfree(req->io);
+	if (req->io)
+		kfree(req->io);
 	if (req->file)
 		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
 	if (req->task)
@@ -1370,6 +1378,8 @@ static void __io_free_req(struct io_kiocb *req)
 	}
 
 	percpu_ref_put(&req->ctx->refs);
+	if (req->flags & REQ_F_STACK_REQ)
+		return;
 	if (likely(!io_is_fallback_req(req)))
 		kmem_cache_free(req_cachep, req);
 	else
@@ -1585,7 +1595,14 @@ static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
 __attribute__((nonnull))
 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 {
-	if (refcount_dec_and_test(&req->refs)) {
+	bool free_it;
+
+	if (req->flags & REQ_F_STACK_REQ)
+		free_it = __refcount_dec_and_test(&req->refs);
+	else
+		free_it = refcount_dec_and_test(&req->refs);
+
+	if (free_it) {
 		io_req_find_next(req, nxtptr);
 		__io_free_req(req);
 	}
@@ -1593,7 +1610,13 @@ static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 
 static void io_put_req(struct io_kiocb *req)
 {
-	if (refcount_dec_and_test(&req->refs))
+	bool free_it;
+
+	if (req->flags & REQ_F_STACK_REQ)
+		free_it = __refcount_dec_and_test(&req->refs);
+	else
+		free_it = refcount_dec_and_test(&req->refs);
+	if (free_it)
 		io_free_req(req);
 }
 
@@ -5784,12 +5807,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	 * link list.
 	 */
 	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
-	req->opcode = READ_ONCE(sqe->opcode);
 	req->user_data = READ_ONCE(sqe->user_data);
 	req->io = NULL;
 	req->file = NULL;
 	req->ctx = ctx;
-	req->flags = 0;
 	/* one is dropped after submission, the other at completion */
 	refcount_set(&req->refs, 2);
 	req->task = NULL;
@@ -5839,6 +5860,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 {
 	struct io_submit_state state, *statep = NULL;
 	struct io_kiocb *link = NULL;
+	struct io_kiocb stack_req;
 	int i, submitted = 0;
 
 	/* if we have a backlog and couldn't flush it all, return BUSY */
@@ -5865,20 +5887,31 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 	for (i = 0; i < nr; i++) {
 		const struct io_uring_sqe *sqe;
 		struct io_kiocb *req;
-		int err;
+		int err, op;
 
 		sqe = io_get_sqe(ctx);
 		if (unlikely(!sqe)) {
 			io_consume_sqe(ctx);
 			break;
 		}
-		req = io_alloc_req(ctx, statep);
-		if (unlikely(!req)) {
-			if (!submitted)
-				submitted = -EAGAIN;
-			break;
+
+		op = READ_ONCE(sqe->opcode);
+
+		if (io_op_defs[op].stack_req) {
+			req = &stack_req;
+			req->flags = REQ_F_STACK_REQ;
+		} else {
+			req = io_alloc_req(ctx, statep);
+			if (unlikely(!req)) {
+				if (!submitted)
+					submitted = -EAGAIN;
+				break;
+			}
+			req->flags = 0;
 		}
 
+		req->opcode = op;
+
 		err = io_init_req(ctx, req, sqe, statep, async);
 		io_consume_sqe(ctx);
 		/* will complete beyond this point, count as submitted */
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index 0e3ee25eb156..1afaf73c0984 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -294,6 +294,13 @@ static inline __must_check bool refcount_dec_and_test(refcount_t *r)
 	return refcount_sub_and_test(1, r);
 }
 
+static inline __must_check bool __refcount_dec_and_test(refcount_t *r)
+{
+	int old = --r->refs.counter;
+
+	return old == 0;
+}
+
 /**
  * refcount_dec - decrement a refcount
  * @r: the refcount

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-14 15:15         ` Jens Axboe
@ 2020-05-14 15:37           ` Pavel Begunkov
  2020-05-14 15:53             ` Jens Axboe
  0 siblings, 1 reply; 22+ messages in thread
From: Pavel Begunkov @ 2020-05-14 15:37 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue



On 14/05/2020 18:15, Jens Axboe wrote:
> On 5/14/20 8:53 AM, Pavel Begunkov wrote:
>> On 14/05/2020 17:33, Jens Axboe wrote:
>>> On 5/14/20 8:22 AM, Jens Axboe wrote:
>>>>> I still use my previous io_uring_nop_stress tool to evaluate the improvement
>>>>> in a physical machine. Memory 250GB and cpu is "Intel(R) Xeon(R) CPU E5-2682 v4 @ 2.50GHz".
>>>>> Before this patch:
>>>>> $sudo taskset -c 60 ./io_uring_nop_stress -r 300
>>>>> total ios: 1608773840
>>>>> IOPS:      5362579
>>>>>
>>>>> With this patch:
>>>>> sudo taskset -c 60 ./io_uring_nop_stress -r 300
>>>>> total ios: 1676910736
>>>>> IOPS:      5589702
>>>>> About 4.2% improvement.
>>>>
>>>> That's not bad. Can you try the patch from Pekka as well, just to see if
>>>> that helps for you?
>>>>
>>>> I also had another idea... We basically have two types of request life
>>>> times:
>>>>
>>>> 1) io_kiocb can get queued up internally
>>>> 2) io_kiocb completes inline
>>>>
>>>> For the latter, it's totally feasible to just have the io_kiocb on
>>>> stack. The downside is if we need to go the slower path, then we need to
>>>> alloc an io_kiocb then and copy it. But maybe that's OK... I'll play
>>>> with it.
>>
>> Does it differ from having one pre-allocated req? Like fallback_req,
>> but without atomics and returned only under uring_mutex (i.e. in
>> __io_queue_sqe()). Putting aside its usefulness, at least it will have
>> a chance to work with reads/writes.
> 
> But then you need atomics. I actually think the bigger win here is not
> having to use atomic refcounts for this particular part, since we know
> the request can't get shared.

Don't think we need, see:

struct ctx {
	/* protected by uring_mtx */
	struct req *cache_req;
}

__io_queue_sqe()
{
	ret = issue_inline(req);
	if (completed(ret)) {
		// don't need req anymore, return it
		ctx->cache_req = req;
	} else if (need_async) {
		// still under uring_mtx, just replenish the cache
		// alloc()+memcpy() here for on-stack
		ctx->cache_req = alloc_req();
		punt(req);
	}

	// restored it in any case
	assert(ctx->cache_req != NULL);
}

submit_sqes() __holds(uring_mtx)
{
	while (...) {
		// already holding the mutex, no need for sync here
		// also, there is always a req there
		req = ctx->cache_req;
		ctx->cache_req = NULL;
		...
	}
}

BTW, there will be a lot of problems to make either work properly with
IORING_FEAT_SUBMIT_STABLE.

> 
> Modified below with poor-man-dec-and-test, pretty sure that part is a
> bigger win than avoiding the allocator. So maybe that's a better
> directiont to take, for things we can't get irq completions on.
> 
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index d2e37215d05a..27c1c8f4d2f4 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -525,6 +525,7 @@ enum {
>  	REQ_F_POLLED_BIT,
>  	REQ_F_BUFFER_SELECTED_BIT,
>  	REQ_F_NO_FILE_TABLE_BIT,
> +	REQ_F_STACK_REQ_BIT,
>  
>  	/* not a real bit, just to check we're not overflowing the space */
>  	__REQ_F_LAST_BIT,
> @@ -580,6 +581,8 @@ enum {
>  	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
>  	/* doesn't need file table for this request */
>  	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
> +	/* on-stack req */
> +	REQ_F_STACK_REQ		= BIT(REQ_F_STACK_REQ_BIT),
>  };
>  
>  struct async_poll {
> @@ -695,10 +698,14 @@ struct io_op_def {
>  	unsigned		pollout : 1;
>  	/* op supports buffer selection */
>  	unsigned		buffer_select : 1;
> +	/* op can use stack req */
> +	unsigned		stack_req : 1;
>  };
>  
>  static const struct io_op_def io_op_defs[] = {
> -	[IORING_OP_NOP] = {},
> +	[IORING_OP_NOP] = {
> +		.stack_req		= 1,
> +	},
>  	[IORING_OP_READV] = {
>  		.async_ctx		= 1,
>  		.needs_mm		= 1,
> @@ -1345,7 +1352,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
>  	if (req->flags & REQ_F_NEED_CLEANUP)
>  		io_cleanup_req(req);
>  
> -	kfree(req->io);
> +	if (req->io)
> +		kfree(req->io);
>  	if (req->file)
>  		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
>  	if (req->task)
> @@ -1370,6 +1378,8 @@ static void __io_free_req(struct io_kiocb *req)
>  	}
>  
>  	percpu_ref_put(&req->ctx->refs);
> +	if (req->flags & REQ_F_STACK_REQ)
> +		return;
>  	if (likely(!io_is_fallback_req(req)))
>  		kmem_cache_free(req_cachep, req);
>  	else
> @@ -1585,7 +1595,14 @@ static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
>  __attribute__((nonnull))
>  static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
>  {
> -	if (refcount_dec_and_test(&req->refs)) {
> +	bool free_it;
> +
> +	if (req->flags & REQ_F_STACK_REQ)
> +		free_it = __refcount_dec_and_test(&req->refs);
> +	else
> +		free_it = refcount_dec_and_test(&req->refs);
> +
> +	if (free_it) {
>  		io_req_find_next(req, nxtptr);
>  		__io_free_req(req);
>  	}
> @@ -1593,7 +1610,13 @@ static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
>  
>  static void io_put_req(struct io_kiocb *req)
>  {
> -	if (refcount_dec_and_test(&req->refs))
> +	bool free_it;
> +
> +	if (req->flags & REQ_F_STACK_REQ)
> +		free_it = __refcount_dec_and_test(&req->refs);
> +	else
> +		free_it = refcount_dec_and_test(&req->refs);
> +	if (free_it)
>  		io_free_req(req);
>  }
>  
> @@ -5784,12 +5807,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
>  	 * link list.
>  	 */
>  	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
> -	req->opcode = READ_ONCE(sqe->opcode);
>  	req->user_data = READ_ONCE(sqe->user_data);
>  	req->io = NULL;
>  	req->file = NULL;
>  	req->ctx = ctx;
> -	req->flags = 0;
>  	/* one is dropped after submission, the other at completion */
>  	refcount_set(&req->refs, 2);
>  	req->task = NULL;
> @@ -5839,6 +5860,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>  {
>  	struct io_submit_state state, *statep = NULL;
>  	struct io_kiocb *link = NULL;
> +	struct io_kiocb stack_req;
>  	int i, submitted = 0;
>  
>  	/* if we have a backlog and couldn't flush it all, return BUSY */
> @@ -5865,20 +5887,31 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>  	for (i = 0; i < nr; i++) {
>  		const struct io_uring_sqe *sqe;
>  		struct io_kiocb *req;
> -		int err;
> +		int err, op;
>  
>  		sqe = io_get_sqe(ctx);
>  		if (unlikely(!sqe)) {
>  			io_consume_sqe(ctx);
>  			break;
>  		}
> -		req = io_alloc_req(ctx, statep);
> -		if (unlikely(!req)) {
> -			if (!submitted)
> -				submitted = -EAGAIN;
> -			break;
> +
> +		op = READ_ONCE(sqe->opcode);
> +
> +		if (io_op_defs[op].stack_req) {
> +			req = &stack_req;
> +			req->flags = REQ_F_STACK_REQ;
> +		} else {
> +			req = io_alloc_req(ctx, statep);
> +			if (unlikely(!req)) {
> +				if (!submitted)
> +					submitted = -EAGAIN;
> +				break;
> +			}
> +			req->flags = 0;
>  		}
>  
> +		req->opcode = op;
> +
>  		err = io_init_req(ctx, req, sqe, statep, async);
>  		io_consume_sqe(ctx);
>  		/* will complete beyond this point, count as submitted */
> diff --git a/include/linux/refcount.h b/include/linux/refcount.h
> index 0e3ee25eb156..1afaf73c0984 100644
> --- a/include/linux/refcount.h
> +++ b/include/linux/refcount.h
> @@ -294,6 +294,13 @@ static inline __must_check bool refcount_dec_and_test(refcount_t *r)
>  	return refcount_sub_and_test(1, r);
>  }
>  
> +static inline __must_check bool __refcount_dec_and_test(refcount_t *r)
> +{
> +	int old = --r->refs.counter;
> +
> +	return old == 0;
> +}
> +
>  /**
>   * refcount_dec - decrement a refcount
>   * @r: the refcount
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC] io_uring: io_kiocb alloc cache
  2020-05-14 15:37           ` Pavel Begunkov
@ 2020-05-14 15:53             ` Jens Axboe
  2020-05-14 16:18               ` Pavel Begunkov
  0 siblings, 1 reply; 22+ messages in thread
From: Jens Axboe @ 2020-05-14 15:53 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue

On 5/14/20 9:37 AM, Pavel Begunkov wrote:
> 
> 
> On 14/05/2020 18:15, Jens Axboe wrote:
>> On 5/14/20 8:53 AM, Pavel Begunkov wrote:
>>> On 14/05/2020 17:33, Jens Axboe wrote:
>>>> On 5/14/20 8:22 AM, Jens Axboe wrote:
>>>>>> I still use my previous io_uring_nop_stress tool to evaluate the improvement
>>>>>> in a physical machine. Memory 250GB and cpu is "Intel(R) Xeon(R) CPU E5-2682 v4 @ 2.50GHz".
>>>>>> Before this patch:
>>>>>> $sudo taskset -c 60 ./io_uring_nop_stress -r 300
>>>>>> total ios: 1608773840
>>>>>> IOPS:      5362579
>>>>>>
>>>>>> With this patch:
>>>>>> sudo taskset -c 60 ./io_uring_nop_stress -r 300
>>>>>> total ios: 1676910736
>>>>>> IOPS:      5589702
>>>>>> About 4.2% improvement.
>>>>>
>>>>> That's not bad. Can you try the patch from Pekka as well, just to see if
>>>>> that helps for you?
>>>>>
>>>>> I also had another idea... We basically have two types of request life
>>>>> times:
>>>>>
>>>>> 1) io_kiocb can get queued up internally
>>>>> 2) io_kiocb completes inline
>>>>>
>>>>> For the latter, it's totally feasible to just have the io_kiocb on
>>>>> stack. The downside is if we need to go the slower path, then we need to
>>>>> alloc an io_kiocb then and copy it. But maybe that's OK... I'll play
>>>>> with it.
>>>
>>> Does it differ from having one pre-allocated req? Like fallback_req,
>>> but without atomics and returned only under uring_mutex (i.e. in
>>> __io_queue_sqe()). Putting aside its usefulness, at least it will have
>>> a chance to work with reads/writes.
>>
>> But then you need atomics. I actually think the bigger win here is not
>> having to use atomic refcounts for this particular part, since we know
>> the request can't get shared.
> 
> Don't think we need, see:
> 
> struct ctx {
> 	/* protected by uring_mtx */
> 	struct req *cache_req;
> }
> 
> __io_queue_sqe()
> {
> 	ret = issue_inline(req);
> 	if (completed(ret)) {
> 		// don't need req anymore, return it
> 		ctx->cache_req = req;
> 	} else if (need_async) {
> 		// still under uring_mtx, just replenish the cache
> 		// alloc()+memcpy() here for on-stack
> 		ctx->cache_req = alloc_req();
> 		punt(req);
> 	}
> 
> 	// restored it in any case
> 	assert(ctx->cache_req != NULL);
> }
> 
> submit_sqes() __holds(uring_mtx)
> {
> 	while (...) {
> 		// already holding the mutex, no need for sync here
> 		// also, there is always a req there
> 		req = ctx->cache_req;
> 		ctx->cache_req = NULL;
> 		...
> 	}
> }

Hmm yes good point, it should work pretty easily, barring the use cases
that do IRQ complete. But that was also a special case with the other
cache.

> BTW, there will be a lot of problems to make either work properly with
> IORING_FEAT_SUBMIT_STABLE.

How so? Once the request is setup, any state should be retained there.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC} io_uring: io_kiocb alloc cache
  2020-05-14 15:53             ` Jens Axboe
@ 2020-05-14 16:18               ` Pavel Begunkov
  2020-05-14 16:21                 ` Jens Axboe
  2020-05-14 16:25                 ` Pavel Begunkov
  0 siblings, 2 replies; 22+ messages in thread
From: Pavel Begunkov @ 2020-05-14 16:18 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue

On 14/05/2020 18:53, Jens Axboe wrote:
> On 5/14/20 9:37 AM, Pavel Begunkov wrote:
> Hmm yes good point, it should work pretty easily, barring the use cases
> that do IRQ complete. But that was also a special case with the other
> cache.
> 
>> BTW, there will be a lot of problems to make either work properly with
>> IORING_FEAT_SUBMIT_STABLE.
> 
> How so? Once the request is setup, any state should be retained there.

If a late alloc fails (e.g. in __io_queue_sqe()), you'd need to file a CQE with
an error. If there is no place in CQ, to postpone the completion it'd require an
allocated req. Of course it can be dropped, but I'd prefer to have strict
guarantees.

That's the same reason, I have a patch stashed, that grabs links from SQ
atomically (i.e. take all SQEs of a link or none).

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC} io_uring: io_kiocb alloc cache
  2020-05-14 16:18               ` Pavel Begunkov
@ 2020-05-14 16:21                 ` Jens Axboe
  2020-05-14 16:25                 ` Pavel Begunkov
  1 sibling, 0 replies; 22+ messages in thread
From: Jens Axboe @ 2020-05-14 16:21 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue

On 5/14/20 10:18 AM, Pavel Begunkov wrote:
> On 14/05/2020 18:53, Jens Axboe wrote:
>> On 5/14/20 9:37 AM, Pavel Begunkov wrote:
>> Hmm yes good point, it should work pretty easily, barring the use cases
>> that do IRQ complete. But that was also a special case with the other
>> cache.
>>
>>> BTW, there will be a lot of problems to make either work properly with
>>> IORING_FEAT_SUBMIT_STABLE.
>>
>> How so? Once the request is setup, any state should be retained there.
> 
> If a late alloc fails (e.g. in __io_queue_sqe()), you'd need to file a
> CQE with an error. If there is no place in CQ, to postpone the
> completion it'd require an allocated req. Of course it can be dropped,
> but I'd prefer to have strict guarantees.
> 
> That's the same reason, I have a patch stashed, that grabs links from
> SQ atomically (i.e. take all SQEs of a link or none).

OK, I see what you mean. Yeah there's definitely some quirkiness
associated with deferring the allocation. I'm currently just working off
the idea that we just need to fix the refcounts, using a relaxed
version for the cases where we don't have shared semantics. We basically
only need that for requests with out-of-line completions, like irq
completions or async completions.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC} io_uring: io_kiocb alloc cache
  2020-05-14 16:18               ` Pavel Begunkov
  2020-05-14 16:21                 ` Jens Axboe
@ 2020-05-14 16:25                 ` Pavel Begunkov
  2020-05-14 17:01                   ` Jens Axboe
  1 sibling, 1 reply; 22+ messages in thread
From: Pavel Begunkov @ 2020-05-14 16:25 UTC (permalink / raw)
  To: Jens Axboe, Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue

On 14/05/2020 19:18, Pavel Begunkov wrote:
> On 14/05/2020 18:53, Jens Axboe wrote:
>> On 5/14/20 9:37 AM, Pavel Begunkov wrote:
>> Hmm yes good point, it should work pretty easily, barring the use cases
>> that do IRQ complete. But that was also a special case with the other
>> cache.
>>
>>> BTW, there will be a lot of problems to make either work properly with
>>> IORING_FEAT_SUBMIT_STABLE.
>>
>> How so? Once the request is setup, any state should be retained there.
> 
> If a late alloc fails (e.g. in __io_queue_sqe()), you'd need to file a CQE with
> an error. If there is no place in CQ, to postpone the completion it'd require an
> allocated req. Of course it can be dropped, but I'd prefer to have strict
> guarantees.

I know how to do it right for my version.
Is it still just a for-fun thing, or do you think it'll be useful for real I/O?

> 
> That's the same reason, I have a patch stashed, that grabs links from SQ
> atomically (i.e. take all SQEs of a link or none).
> 

-- 
Pavel Begunkov

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC} io_uring: io_kiocb alloc cache
  2020-05-14 16:25                 ` Pavel Begunkov
@ 2020-05-14 17:01                   ` Jens Axboe
  2020-05-14 17:41                     ` Jens Axboe
  0 siblings, 1 reply; 22+ messages in thread
From: Jens Axboe @ 2020-05-14 17:01 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue

On 5/14/20 10:25 AM, Pavel Begunkov wrote:
> On 14/05/2020 19:18, Pavel Begunkov wrote:
>> On 14/05/2020 18:53, Jens Axboe wrote:
>>> On 5/14/20 9:37 AM, Pavel Begunkov wrote:
>>> Hmm yes good point, it should work pretty easily, barring the use cases
>>> that do IRQ complete. But that was also a special case with the other
>>> cache.
>>>
>>>> BTW, there will be a lot of problems to make either work properly with
>>>> IORING_FEAT_SUBMIT_STABLE.
>>>
>>> How so? Once the request is setup, any state should be retained there.
>>
>> If a late alloc fails (e.g. in __io_queue_sqe()), you'd need to file a CQE with
>> an error. If there is no place in CQ, to postpone the completion it'd require an
>> allocated req. Of course it can be dropped, but I'd prefer to have strict
>> guarantees.
> 
> I know how to do it right for my version.
> Is it still just a for-fun thing, or do you think it'll be useful for real I/O?

We're definitely spending quite a bit of time on alloc+free and the atomics
for the refcount. Considering we're core limited on some workloads, any
cycles we can get back will ultimately increase the performance. So yeah,
definitely worth exploring and finding something that works.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC} io_uring: io_kiocb alloc cache
  2020-05-14 17:01                   ` Jens Axboe
@ 2020-05-14 17:41                     ` Jens Axboe
  0 siblings, 0 replies; 22+ messages in thread
From: Jens Axboe @ 2020-05-14 17:41 UTC (permalink / raw)
  To: Pavel Begunkov, Xiaoguang Wang, io-uring; +Cc: joseph qi, Jiufei Xue

On 5/14/20 11:01 AM, Jens Axboe wrote:
> On 5/14/20 10:25 AM, Pavel Begunkov wrote:
>> On 14/05/2020 19:18, Pavel Begunkov wrote:
>>> On 14/05/2020 18:53, Jens Axboe wrote:
>>>> On 5/14/20 9:37 AM, Pavel Begunkov wrote:
>>>> Hmm yes good point, it should work pretty easily, barring the use cases
>>>> that do IRQ complete. But that was also a special case with the other
>>>> cache.
>>>>
>>>>> BTW, there will be a lot of problems to make either work properly with
>>>>> IORING_FEAT_SUBMIT_STABLE.
>>>>
>>>> How so? Once the request is setup, any state should be retained there.
>>>
>>> If a late alloc fails (e.g. in __io_queue_sqe()), you'd need to file a CQE with
>>> an error. If there is no place in CQ, to postpone the completion it'd require an
>>> allocated req. Of course it can be dropped, but I'd prefer to have strict
>>> guarantees.
>>
>> I know how to do it right for my version.
>> Is it still just a for-fun thing, or do you think it'll be useful for real I/O?
> 
> We're definitely spending quite a bit of time on alloc+free and the atomics
> for the refcount. Considering we're core limited on some workloads, any
> cycles we can get back will ultimately increase the performance. So yeah,
> definitely worth exploring and finding something that works.

BTW, one oddity of the NOP microbenchmark that makes it less than useful
as a general test case is the fact that any request will complete immediately.
The default settings of that test is submitting batches of 16, which means
that we'll bulk allocate 16 io_kiocbs when we enter. But we only ever really
need one, as by the time we get to request #2, we've already freed the first
request (and so forth). 

I kind of like the idea of recycling requests. If the completion does happen
inline, then we're cache hot for the next issue. Right now we go through
a new request every time, regardless of whether or not the previous one just
got freed.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC} io_uring: io_kiocb alloc cache
  2020-05-14 14:33     ` Jens Axboe
  2020-05-14 14:53       ` Pavel Begunkov
@ 2020-05-16  9:20       ` Xiaoguang Wang
  1 sibling, 0 replies; 22+ messages in thread
From: Xiaoguang Wang @ 2020-05-16  9:20 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: joseph qi, Jiufei Xue, Pavel Begunkov

hi,

>> For the latter, it's totally feasible to just have the io_kiocb on
>> stack. The downside is if we need to go the slower path, then we need to
>> alloc an io_kiocb then and copy it. But maybe that's OK... I'll play
>> with it.
> 
> Can you try this with your microbenchmark? Just curious what it looks
> like for that test case if we completely take slab alloc+free out of it.
I run two rounds, every runs 300 seconds:
1st IOPS:      6231528, about 15% improvement
2nd IOPS:      6173959  about 14% improvement.
Looks better for nop tests :)

Regards,
Xiaoguang Wang

> 
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index d2e37215d05a..4ecd6bd38f02 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -525,6 +525,7 @@ enum {
>   	REQ_F_POLLED_BIT,
>   	REQ_F_BUFFER_SELECTED_BIT,
>   	REQ_F_NO_FILE_TABLE_BIT,
> +	REQ_F_STACK_REQ_BIT,
>   
>   	/* not a real bit, just to check we're not overflowing the space */
>   	__REQ_F_LAST_BIT,
> @@ -580,6 +581,8 @@ enum {
>   	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
>   	/* doesn't need file table for this request */
>   	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
> +	/* on-stack req */
> +	REQ_F_STACK_REQ		= BIT(REQ_F_STACK_REQ_BIT),
>   };
>   
>   struct async_poll {
> @@ -695,10 +698,14 @@ struct io_op_def {
>   	unsigned		pollout : 1;
>   	/* op supports buffer selection */
>   	unsigned		buffer_select : 1;
> +	/* op can use stack req */
> +	unsigned		stack_req : 1;
>   };
>   
>   static const struct io_op_def io_op_defs[] = {
> -	[IORING_OP_NOP] = {},
> +	[IORING_OP_NOP] = {
> +		.stack_req		= 1,
> +	},
>   	[IORING_OP_READV] = {
>   		.async_ctx		= 1,
>   		.needs_mm		= 1,
> @@ -1345,7 +1352,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
>   	if (req->flags & REQ_F_NEED_CLEANUP)
>   		io_cleanup_req(req);
>   
> -	kfree(req->io);
> +	if (req->io)
> +		kfree(req->io);
>   	if (req->file)
>   		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
>   	if (req->task)
> @@ -1370,6 +1378,8 @@ static void __io_free_req(struct io_kiocb *req)
>   	}
>   
>   	percpu_ref_put(&req->ctx->refs);
> +	if (req->flags & REQ_F_STACK_REQ)
> +		return;
>   	if (likely(!io_is_fallback_req(req)))
>   		kmem_cache_free(req_cachep, req);
>   	else
> @@ -5784,12 +5794,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
>   	 * link list.
>   	 */
>   	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
> -	req->opcode = READ_ONCE(sqe->opcode);
>   	req->user_data = READ_ONCE(sqe->user_data);
>   	req->io = NULL;
>   	req->file = NULL;
>   	req->ctx = ctx;
> -	req->flags = 0;
>   	/* one is dropped after submission, the other at completion */
>   	refcount_set(&req->refs, 2);
>   	req->task = NULL;
> @@ -5839,6 +5847,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>   {
>   	struct io_submit_state state, *statep = NULL;
>   	struct io_kiocb *link = NULL;
> +	struct io_kiocb stack_req;
>   	int i, submitted = 0;
>   
>   	/* if we have a backlog and couldn't flush it all, return BUSY */
> @@ -5865,20 +5874,31 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>   	for (i = 0; i < nr; i++) {
>   		const struct io_uring_sqe *sqe;
>   		struct io_kiocb *req;
> -		int err;
> +		int err, op;
>   
>   		sqe = io_get_sqe(ctx);
>   		if (unlikely(!sqe)) {
>   			io_consume_sqe(ctx);
>   			break;
>   		}
> -		req = io_alloc_req(ctx, statep);
> -		if (unlikely(!req)) {
> -			if (!submitted)
> -				submitted = -EAGAIN;
> -			break;
> +
> +		op = READ_ONCE(sqe->opcode);
> +
> +		if (io_op_defs[op].stack_req) {
> +			req = &stack_req;
> +			req->flags = REQ_F_STACK_REQ;
> +		} else {
> +			req = io_alloc_req(ctx, statep);
> +			if (unlikely(!req)) {
> +				if (!submitted)
> +					submitted = -EAGAIN;
> +				break;
> +			}
> +			req->flags = 0;
>   		}
>   
> +		req->opcode = op;
> +
>   		err = io_init_req(ctx, req, sqe, statep, async);
>   		io_consume_sqe(ctx);
>   		/* will complete beyond this point, count as submitted */
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH RFC} io_uring: io_kiocb alloc cache
  2020-05-14 14:22   ` Jens Axboe
  2020-05-14 14:33     ` Jens Axboe
@ 2020-05-16 16:15     ` Xiaoguang Wang
  1 sibling, 0 replies; 22+ messages in thread
From: Xiaoguang Wang @ 2020-05-16 16:15 UTC (permalink / raw)
  To: Jens Axboe, io-uring; +Cc: joseph qi, Jiufei Xue, Pavel Begunkov

hi,

> On 5/14/20 2:25 AM, Xiaoguang Wang wrote:
>> hi,
>>
>>> +
>>> +static void io_req_cache_free_bulk(struct req_batch *rb)
>>> +{
>>> +	struct io_kiocb_cache_entry *ce;
>>> +	struct io_kiocb_cache *cache;
>>> +
>>> +	if (rb->irq_comp)
>>> +		local_irq_disable();
>>> +	else
>>> +		preempt_disable();
>>> +
>>> +	cache = this_cpu_ptr(alloc_cache);
>>> +	ce = &cache->caches[rb->irq_comp];
>>> +
>>> +	list_splice_init(&rb->list, &ce->alloc_list);
>>> +	ce->nr_avail += rb->to_free;
>>> +	if (ce->nr_avail > IO_KIOCB_CACHE_MAX)
>>> +		io_req_cache_reclaim(ce);
>>> +
>>> +	if (rb->irq_comp)
>>> +		local_irq_enable();
>>> +	else
>>> +		preempt_enable();
>>> +}
>>> +
>>> +static void io_req_cache_free(struct io_kiocb *req)
>>> +{
>>> +	const bool irq_comp = io_op_defs[req->opcode].irq_comp;
>>> +	struct io_kiocb_cache_entry *ce;
>>> +	struct io_kiocb_cache *cache;
>>> +	unsigned long flags;
>>> +
>>> +	if (irq_comp)
>>> +		local_irq_save(flags);
>>> +	else
>>> +		preempt_disable();
>>> +
>>> +	cache = this_cpu_ptr(alloc_cache);
>>> +	ce = &cache->caches[irq_comp];
>>> +
>>> +	list_add(&req->list, &ce->alloc_list);
>>> +	if (++ce->nr_avail > IO_KIOCB_CACHE_MAX)
>>> +		io_req_cache_reclaim(ce);
>> Above codes seem that io_req_cache_reclaim() will free all reqs in
>> alloc_list, then we'll need to kmem_cache_alloc() again, I guess
>> indeed you intend to reserve IO_KIOCB_CACHE_MAX reqs in alloc_list?
> 
> Yeah a thinko in that one, actually did a v2 shortly after that, just
> didn't send it out. Including it below in case you are interested, when
> it hits reclaim, it reclaims IO_KIOCB_CACHE_RECLAIM of them.
> 
>> I still use my previous io_uring_nop_stress tool to evaluate the improvement
>> in a physical machine. Memory 250GB and cpu is "Intel(R) Xeon(R) CPU E5-2682 v4 @ 2.50GHz".
>> Before this patch:
>> $sudo taskset -c 60 ./io_uring_nop_stress -r 300
>> total ios: 1608773840
>> IOPS:      5362579
>>
>> With this patch:
>> sudo taskset -c 60 ./io_uring_nop_stress -r 300
>> total ios: 1676910736
>> IOPS:      5589702
>> About 4.2% improvement.
> 
> That's not bad. Can you try the patch from Pekka as well, just to see if
> that helps for you?
Indeed I had sent the reply below about 8 hours ago, but it seems it wasn't sent
out to the mailing list successfully, so I'm sending it again.

Sorry for being late, I was busy with internal work before.
I had tried Pekka's patch in same environment:
First without both patches:
IOPS:      5415314

With your previous patch:
IOPS:      5588999  about 3.2% improvement, in previous mail, I got 4.2%
improvement, though test result seems variable, but with your patch, I
always got obvious improvement.

With Pekka's patch:
IOPS:      5394832
and it may be even worse; sometimes I got "IOPS:      4898400".

Regards,
Xiaoguang Wang

> 
> I also had another idea... We basically have two types of request life
> times:
> 
> 1) io_kiocb can get queued up internally
> 2) io_kiocb completes inline
> 
> For the latter, it's totally feasible to just have the io_kiocb on
> stack. The downside is if we need to go the slower path, then we need to
> alloc an io_kiocb then and copy it. But maybe that's OK... I'll play
> with it.
> 
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index d2e37215d05a..3be5f0e60d9f 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -79,6 +79,7 @@
>   #include <linux/fs_struct.h>
>   #include <linux/splice.h>
>   #include <linux/task_work.h>
> +#include <linux/cpuhotplug.h>
>   
>   #define CREATE_TRACE_POINTS
>   #include <trace/events/io_uring.h>
> @@ -652,17 +653,10 @@ struct io_kiocb {
>   };
>   
>   #define IO_PLUG_THRESHOLD		2
> -#define IO_IOPOLL_BATCH			8
>   
>   struct io_submit_state {
>   	struct blk_plug		plug;
>   
> -	/*
> -	 * io_kiocb alloc cache
> -	 */
> -	void			*reqs[IO_IOPOLL_BATCH];
> -	unsigned int		free_reqs;
> -
>   	/*
>   	 * File reference cache
>   	 */
> @@ -673,6 +667,27 @@ struct io_submit_state {
>   	unsigned int		ios_left;
>   };
>   
> +struct io_kiocb_cache_entry {
> +	/* ready for allocation */
> +	struct list_head	alloc_list;
> +	unsigned		nr_avail;
> +
> +	/* ready for shrinker reclaim */
> +	spinlock_t		free_lock;
> +	struct list_head	free_list;
> +	unsigned		nr_free;
> +};
> +
> +struct io_kiocb_cache {
> +	/* one for requests with IRQ completion, one for no IRQ */
> +	struct io_kiocb_cache_entry	caches[2];
> +};
> +
> +#define IO_KIOCB_CACHE_MAX	256
> +#define IO_KIOCB_CACHE_RECLAIM	 16
> +
> +static struct io_kiocb_cache *alloc_cache;
> +
>   struct io_op_def {
>   	/* needs req->io allocated for deferral/async */
>   	unsigned		async_ctx : 1;
> @@ -695,6 +710,8 @@ struct io_op_def {
>   	unsigned		pollout : 1;
>   	/* op supports buffer selection */
>   	unsigned		buffer_select : 1;
> +	/* IRQ completion */
> +	unsigned		irq_comp : 1;
>   };
>   
>   static const struct io_op_def io_op_defs[] = {
> @@ -706,6 +723,7 @@ static const struct io_op_def io_op_defs[] = {
>   		.unbound_nonreg_file	= 1,
>   		.pollin			= 1,
>   		.buffer_select		= 1,
> +		.irq_comp		= 1,
>   	},
>   	[IORING_OP_WRITEV] = {
>   		.async_ctx		= 1,
> @@ -714,6 +732,7 @@ static const struct io_op_def io_op_defs[] = {
>   		.hash_reg_file		= 1,
>   		.unbound_nonreg_file	= 1,
>   		.pollout		= 1,
> +		.irq_comp		= 1,
>   	},
>   	[IORING_OP_FSYNC] = {
>   		.needs_file		= 1,
> @@ -722,12 +741,14 @@ static const struct io_op_def io_op_defs[] = {
>   		.needs_file		= 1,
>   		.unbound_nonreg_file	= 1,
>   		.pollin			= 1,
> +		.irq_comp		= 1,
>   	},
>   	[IORING_OP_WRITE_FIXED] = {
>   		.needs_file		= 1,
>   		.hash_reg_file		= 1,
>   		.unbound_nonreg_file	= 1,
>   		.pollout		= 1,
> +		.irq_comp		= 1,
>   	},
>   	[IORING_OP_POLL_ADD] = {
>   		.needs_file		= 1,
> @@ -803,12 +824,14 @@ static const struct io_op_def io_op_defs[] = {
>   		.unbound_nonreg_file	= 1,
>   		.pollin			= 1,
>   		.buffer_select		= 1,
> +		.irq_comp		= 1,
>   	},
>   	[IORING_OP_WRITE] = {
>   		.needs_mm		= 1,
>   		.needs_file		= 1,
>   		.unbound_nonreg_file	= 1,
>   		.pollout		= 1,
> +		.irq_comp		= 1,
>   	},
>   	[IORING_OP_FADVISE] = {
>   		.needs_file		= 1,
> @@ -1281,54 +1304,161 @@ static inline bool io_is_fallback_req(struct io_kiocb *req)
>   			((unsigned long) req->ctx->fallback_req & ~1UL);
>   }
>   
> -static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
> +static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx, int op)
>   {
>   	struct io_kiocb *req;
>   
>   	req = ctx->fallback_req;
> -	if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
> +	if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req)) {
> +		req->opcode = op;
>   		return req;
> +	}
>   
>   	return NULL;
>   }
>   
> -static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
> -				     struct io_submit_state *state)
> +static bool io_req_cache_steal(struct io_kiocb_cache_entry *ce)
>   {
> -	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
> -	struct io_kiocb *req;
> +	int nr = 0;
>   
> -	if (!state) {
> -		req = kmem_cache_alloc(req_cachep, gfp);
> -		if (unlikely(!req))
> -			goto fallback;
> -	} else if (!state->free_reqs) {
> -		size_t sz;
> -		int ret;
> +	if (ce->nr_free < IO_KIOCB_CACHE_RECLAIM)
> +		return false;
>   
> -		sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
> -		ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
> +	spin_lock(&ce->free_lock);
> +	while (!list_empty(&ce->free_list)) {
> +		struct io_kiocb *req;
>   
> -		/*
> -		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
> -		 * retry single alloc to be on the safe side.
> -		 */
> -		if (unlikely(ret <= 0)) {
> -			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
> -			if (!state->reqs[0])
> -				goto fallback;
> -			ret = 1;
> -		}
> -		state->free_reqs = ret - 1;
> -		req = state->reqs[ret - 1];
> -	} else {
> -		state->free_reqs--;
> -		req = state->reqs[state->free_reqs];
> +		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
> +		list_move(&req->list, &ce->alloc_list);
> +		if (++nr >= IO_KIOCB_CACHE_RECLAIM)
> +			break;
>   	}
> +	ce->nr_avail += nr;
> +	ce->nr_free -= nr;
> +	spin_unlock(&ce->free_lock);
> +	return nr > 0;
> +}
> +
> +static struct io_kiocb *io_req_cache_alloc(int op)
> +{
> +	const bool irq_comp = io_op_defs[op].irq_comp;
> +	struct io_kiocb_cache_entry *ce;
> +	struct io_kiocb_cache *cache;
> +	struct io_kiocb *req = NULL;
> +
> +	if (irq_comp)
> +		local_irq_disable();
> +	else
> +		preempt_disable();
> +
> +	cache = this_cpu_ptr(alloc_cache);
> +	ce = &cache->caches[irq_comp];
> +
> +	if (!list_empty(&ce->alloc_list) || io_req_cache_steal(ce)) {
> +		req = list_first_entry(&ce->alloc_list, struct io_kiocb, list);
> +		list_del(&req->list);
> +		ce->nr_avail--;
> +	}
> +
> +	if (irq_comp)
> +		local_irq_enable();
> +	else
> +		preempt_enable();
> +
> +	if (req)
> +		return req;
>   
> -	return req;
> -fallback:
> -	return io_get_fallback_req(ctx);
> +	return kmem_cache_alloc(req_cachep, GFP_KERNEL);
> +}
> +
> +static void io_req_cache_reclaim(struct io_kiocb_cache_entry *ce)
> +{
> +	LIST_HEAD(free_list);
> +	int nr = 0;
> +
> +	while (!list_empty(&ce->alloc_list)) {
> +		struct io_kiocb *req;
> +
> +		req = list_last_entry(&ce->alloc_list, struct io_kiocb, list);
> +		list_move(&req->list, &free_list);
> +		if (++nr >= IO_KIOCB_CACHE_RECLAIM)
> +			break;
> +	}
> +
> +	spin_lock(&ce->free_lock);
> +	list_splice(&free_list, &ce->free_list);
> +	ce->nr_free += nr;
> +	ce->nr_avail -= nr;
> +	spin_unlock(&ce->free_lock);
> +}
> +
> +struct req_batch {
> +	struct list_head list;
> +	int to_free;
> +	bool need_iter;
> +	bool irq_comp;
> +};
> +
> +static void io_req_cache_free_bulk(struct req_batch *rb)
> +{
> +	struct io_kiocb_cache_entry *ce;
> +	struct io_kiocb_cache *cache;
> +
> +	if (rb->irq_comp)
> +		local_irq_disable();
> +	else
> +		preempt_disable();
> +
> +	cache = this_cpu_ptr(alloc_cache);
> +	ce = &cache->caches[rb->irq_comp];
> +
> +	list_splice_init(&rb->list, &ce->alloc_list);
> +	ce->nr_avail += rb->to_free;
> +	if (ce->nr_avail > IO_KIOCB_CACHE_MAX)
> +		io_req_cache_reclaim(ce);
> +
> +	if (rb->irq_comp)
> +		local_irq_enable();
> +	else
> +		preempt_enable();
> +}
> +
> +static void io_req_cache_free(struct io_kiocb *req)
> +{
> +	const bool irq_comp = io_op_defs[req->opcode].irq_comp;
> +	struct io_kiocb_cache_entry *ce;
> +	struct io_kiocb_cache *cache;
> +	unsigned long flags;
> +
> +	if (irq_comp)
> +		local_irq_save(flags);
> +	else
> +		preempt_disable();
> +
> +	cache = this_cpu_ptr(alloc_cache);
> +	ce = &cache->caches[irq_comp];
> +
> +	list_add(&req->list, &ce->alloc_list);
> +	if (++ce->nr_avail > IO_KIOCB_CACHE_MAX)
> +		io_req_cache_reclaim(ce);
> +
> +	if (irq_comp)
> +		local_irq_restore(flags);
> +	else
> +		preempt_enable();
> +}
> +
> +static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, int opcode)
> +{
> +	struct io_kiocb *req;
> +
> +	req = io_req_cache_alloc(opcode);
> +	if (req) {
> +		req->opcode = opcode;
> +		return req;
> +	}
> +
> +	return io_get_fallback_req(ctx, opcode);
>   }
>   
>   static inline void io_put_file(struct io_kiocb *req, struct file *file,
> @@ -1345,7 +1475,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
>   	if (req->flags & REQ_F_NEED_CLEANUP)
>   		io_cleanup_req(req);
>   
> -	kfree(req->io);
> +	if (req->io)
> +		kfree(req->io);
>   	if (req->file)
>   		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
>   	if (req->task)
> @@ -1371,28 +1502,21 @@ static void __io_free_req(struct io_kiocb *req)
>   
>   	percpu_ref_put(&req->ctx->refs);
>   	if (likely(!io_is_fallback_req(req)))
> -		kmem_cache_free(req_cachep, req);
> +		io_req_cache_free(req);
>   	else
>   		clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
>   }
>   
> -struct req_batch {
> -	void *reqs[IO_IOPOLL_BATCH];
> -	int to_free;
> -	int need_iter;
> -};
> -
>   static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
>   {
>   	if (!rb->to_free)
>   		return;
>   	if (rb->need_iter) {
> -		int i, inflight = 0;
> +		struct io_kiocb *req;
>   		unsigned long flags;
> +		int inflight = 0;
>   
> -		for (i = 0; i < rb->to_free; i++) {
> -			struct io_kiocb *req = rb->reqs[i];
> -
> +		list_for_each_entry(req, &rb->list, list) {
>   			if (req->flags & REQ_F_FIXED_FILE) {
>   				req->file = NULL;
>   				percpu_ref_put(req->fixed_file_refs);
> @@ -1405,9 +1529,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
>   			goto do_free;
>   
>   		spin_lock_irqsave(&ctx->inflight_lock, flags);
> -		for (i = 0; i < rb->to_free; i++) {
> -			struct io_kiocb *req = rb->reqs[i];
> -
> +		list_for_each_entry(req, &rb->list, list) {
>   			if (req->flags & REQ_F_INFLIGHT) {
>   				list_del(&req->inflight_entry);
>   				if (!--inflight)
> @@ -1420,9 +1542,8 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
>   			wake_up(&ctx->inflight_wait);
>   	}
>   do_free:
> -	kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
> +	io_req_cache_free_bulk(rb);
>   	percpu_ref_put_many(&ctx->refs, rb->to_free);
> -	rb->to_free = rb->need_iter = 0;
>   }
>   
>   static bool io_link_cancel_timeout(struct io_kiocb *req)
> @@ -1670,11 +1791,12 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
>   		return false;
>   
>   	if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
> -		rb->need_iter++;
> +		rb->need_iter |= true;
> +	if (!rb->irq_comp && io_op_defs[req->opcode].irq_comp)
> +		rb->irq_comp |= true;
>   
> -	rb->reqs[rb->to_free++] = req;
> -	if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
> -		io_free_req_many(req->ctx, rb);
> +	list_add(&req->list, &rb->list);
> +	rb->to_free++;
>   	return true;
>   }
>   
> @@ -1697,10 +1819,14 @@ static int io_put_kbuf(struct io_kiocb *req)
>   static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
>   			       struct list_head *done)
>   {
> -	struct req_batch rb;
> +	struct req_batch rb = {
> +		.list		= LIST_HEAD_INIT(rb.list),
> +		.to_free	= 0,
> +		.need_iter	= false,
> +		.irq_comp	= false
> +	};
>   	struct io_kiocb *req;
>   
> -	rb.to_free = rb.need_iter = 0;
>   	while (!list_empty(done)) {
>   		int cflags = 0;
>   
> @@ -5703,8 +5829,6 @@ static void io_submit_state_end(struct io_submit_state *state)
>   {
>   	blk_finish_plug(&state->plug);
>   	io_file_put(state);
> -	if (state->free_reqs)
> -		kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
>   }
>   
>   /*
> @@ -5714,7 +5838,6 @@ static void io_submit_state_start(struct io_submit_state *state,
>   				  unsigned int max_ios)
>   {
>   	blk_start_plug(&state->plug);
> -	state->free_reqs = 0;
>   	state->file = NULL;
>   	state->ios_left = max_ios;
>   }
> @@ -5784,7 +5907,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
>   	 * link list.
>   	 */
>   	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
> -	req->opcode = READ_ONCE(sqe->opcode);
>   	req->user_data = READ_ONCE(sqe->user_data);
>   	req->io = NULL;
>   	req->file = NULL;
> @@ -5872,7 +5994,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
>   			io_consume_sqe(ctx);
>   			break;
>   		}
> -		req = io_alloc_req(ctx, statep);
> +		req = io_alloc_req(ctx, READ_ONCE(sqe->opcode));
>   		if (unlikely(!req)) {
>   			if (!submitted)
>   				submitted = -EAGAIN;
> @@ -7626,6 +7748,17 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
>   					req->task->task_works != NULL);
>   	}
>   	spin_unlock_irq(&ctx->completion_lock);
> +	seq_printf(m, "AllocCache:\n");
> +	for_each_possible_cpu(i) {
> +		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, i);
> +		int j;
> +
> +		for (j = 0; j < ARRAY_SIZE(cache->caches); j++) {
> +			struct io_kiocb_cache_entry *ce = &cache->caches[j];
> +
> +			seq_printf(m, "  cpu%d: irq=%d, nr_free=%d, nr_avail=%d\n", i, j, ce->nr_free, ce->nr_avail);
> +		}
> +	}
>   	mutex_unlock(&ctx->uring_lock);
>   }
>   
> @@ -8101,8 +8234,131 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
>   	return ret;
>   }
>   
> +static unsigned long io_uring_cache_count(struct shrinker *shrink,
> +					  struct shrink_control *sc)
> +{
> +	unsigned long count = 0;
> +	int cpu, i;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct io_kiocb_cache *cache;
> +
> +		cache = per_cpu_ptr(alloc_cache, cpu);
> +		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
> +			struct io_kiocb_cache_entry *ce = &cache->caches[i];
> +
> +			count += ce->nr_free;
> +		}
> +	}
> +
> +	return count;
> +}
> +
> +static unsigned long __io_uring_cache_shrink(struct io_kiocb_cache_entry *ce,
> +					     const bool irq_comp,
> +					     int *nr_to_scan)
> +{
> +	unsigned long freed = 0;
> +	struct io_kiocb *req;
> +	LIST_HEAD(free_list);
> +
> +	if (!ce->nr_free)
> +		return 0;
> +
> +	if (irq_comp)
> +		spin_lock_irq(&ce->free_lock);
> +	else
> +		spin_lock(&ce->free_lock);
> +
> +	while (!list_empty(&ce->free_list)) {
> +		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
> +		list_move(&req->list, &free_list);
> +		freed++;
> +		if (!--(*nr_to_scan))
> +			break;
> +	}
> +
> +	if (irq_comp)
> +		spin_unlock_irq(&ce->free_lock);
> +	else
> +		spin_unlock(&ce->free_lock);
> +
> +	while (!list_empty(&free_list)) {
> +		req = list_first_entry(&free_list, struct io_kiocb, list);
> +		list_del(&req->list);
> +		kmem_cache_free(req_cachep, req);
> +	}
> +
> +	return freed;
> +}
> +
> +static unsigned long io_uring_cache_shrink(int nr_to_scan)
> +{
> +	long freed = 0;
> +	int cpu, i;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
> +
> +		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
> +			struct io_kiocb_cache_entry *ce = &cache->caches[i];
> +
> +			freed += __io_uring_cache_shrink(ce, i, &nr_to_scan);
> +			if (!nr_to_scan)
> +				break;
> +		}
> +		if (!nr_to_scan)
> +			break;
> +	}
> +
> +	return freed ?: SHRINK_STOP;
> +}
> +
> +static unsigned long io_uring_cache_scan(struct shrinker *shrink,
> +					 struct shrink_control *sc)
> +{
> +	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
> +		return SHRINK_STOP;
> +
> +	return io_uring_cache_shrink(sc->nr_to_scan);
> +}
> +
> +static struct shrinker io_uring_shrinker = {
> +	.count_objects	= io_uring_cache_count,
> +	.scan_objects	= io_uring_cache_scan,
> +	.seeks		= DEFAULT_SEEKS,
> +};
> +
> +static void io_uring_kill_ce(struct io_kiocb_cache_entry *ce)
> +{
> +	struct io_kiocb *req;
> +
> +	list_splice_init(&ce->alloc_list, &ce->free_list);
> +
> +	while (!list_empty(&ce->free_list)) {
> +		req = list_first_entry(&ce->free_list, struct io_kiocb, list);
> +		list_del(&req->list);
> +		kmem_cache_free(req_cachep, req);
> +	}
> +
> +	ce->nr_free = ce->nr_avail = 0;
> +}
> +
> +static int io_uring_notify_dead(unsigned int cpu)
> +{
> +	struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(cache->caches); i++)
> +		io_uring_kill_ce(&cache->caches[i]);
> +
> +	return 0;
> +}
> +
>   static int __init io_uring_init(void)
>   {
> +	int cpu, i;
> +
>   #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
>   	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
>   	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
> @@ -8142,6 +8398,25 @@ static int __init io_uring_init(void)
>   	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
>   	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
>   	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
> +
> +	alloc_cache = alloc_percpu(struct io_kiocb_cache);
> +	for_each_possible_cpu(cpu) {
> +		struct io_kiocb_cache *cache = per_cpu_ptr(alloc_cache, cpu);
> +
> +		for (i = 0; i < ARRAY_SIZE(cache->caches); i++) {
> +			struct io_kiocb_cache_entry *ce = &cache->caches[i];
> +
> +			INIT_LIST_HEAD(&ce->alloc_list);
> +			spin_lock_init(&ce->free_lock);
> +			INIT_LIST_HEAD(&ce->free_list);
> +			ce->nr_free = 0;
> +			ce->nr_avail = 0;
> +		}
> +	}
> +
> +	cpuhp_setup_state_nocalls(CPUHP_IOURING_DEAD, "io_uring:dead", NULL,
> +					io_uring_notify_dead);
> +	WARN_ON(register_shrinker(&io_uring_shrinker));
>   	return 0;
>   };
>   __initcall(io_uring_init);
> diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
> index 77d70b633531..3b80556572a5 100644
> --- a/include/linux/cpuhotplug.h
> +++ b/include/linux/cpuhotplug.h
> @@ -60,6 +60,7 @@ enum cpuhp_state {
>   	CPUHP_LUSTRE_CFS_DEAD,
>   	CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
>   	CPUHP_PADATA_DEAD,
> +	CPUHP_IOURING_DEAD,
>   	CPUHP_WORKQUEUE_PREP,
>   	CPUHP_POWER_NUMA_PREPARE,
>   	CPUHP_HRTIMERS_PREPARE,
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2020-05-16 16:15 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-13 16:30 [PATCH RFC] io_uring: io_kiocb alloc cache Jens Axboe
2020-05-13 17:42 ` Jann Horn
2020-05-13 17:42   ` Jann Horn
2020-05-13 18:34   ` Jens Axboe
2020-05-13 19:20   ` Pekka Enberg
2020-05-13 20:09     ` Jens Axboe
2020-05-13 20:31       ` Pekka Enberg
2020-05-13 20:44         ` Jens Axboe
2020-05-14  8:25 ` Xiaoguang Wang
2020-05-14 14:22   ` Jens Axboe
2020-05-14 14:33     ` Jens Axboe
2020-05-14 14:53       ` Pavel Begunkov
2020-05-14 15:15         ` Jens Axboe
2020-05-14 15:37           ` Pavel Begunkov
2020-05-14 15:53             ` Jens Axboe
2020-05-14 16:18               ` Pavel Begunkov
2020-05-14 16:21                 ` Jens Axboe
2020-05-14 16:25                 ` Pavel Begunkov
2020-05-14 17:01                   ` Jens Axboe
2020-05-14 17:41                     ` Jens Axboe
2020-05-16  9:20       ` Xiaoguang Wang
2020-05-16 16:15     ` Xiaoguang Wang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.