Linux-Block Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH] io_uring: allocate the two rings together
@ 2019-08-26 17:23 Hristo Venev
  2019-08-27 15:50 ` Jens Axboe
  0 siblings, 1 reply; 9+ messages in thread
From: Hristo Venev @ 2019-08-26 17:23 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, Hristo Venev

Both the sq and the cq rings have sizes just over a power of two, and
the sq ring is significantly smaller. By bundling them in a single
alllocation, we get the sq ring for free.

This also means that IORING_OFF_SQ_RING and IORING_OFF_CQ_RING now mean
the same thing. If we indicate this to userspace, we can save a mmap
call.

Signed-off-by: Hristo Venev <hristo@venev.name>
---
 fs/io_uring.c | 255 +++++++++++++++++++++++++-------------------------
 1 file changed, 128 insertions(+), 127 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index cfb48bd088e1..1b28b4f2989f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -84,27 +84,29 @@ struct io_uring {
 };
 
 /*
- * This data is shared with the application through the mmap at offset
- * IORING_OFF_SQ_RING.
+ * This data is shared with the application through the mmap at offsets
+ * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
  *
  * The offsets to the member fields are published through struct
  * io_sqring_offsets when calling io_uring_setup.
  */
-struct io_sq_ring {
+struct io_rings {
 	/*
 	 * Head and tail offsets into the ring; the offsets need to be
 	 * masked to get valid indices.
 	 *
-	 * The kernel controls head and the application controls tail.
+	 * The kernel controls head of the sq ring and the tail of the cq ring,
+	 * and the application controls tail of the sq ring and the head of the
+	 * cq ring.
 	 */
-	struct io_uring		r;
+	struct io_uring		sq, cq;
 	/*
-	 * Bitmask to apply to head and tail offsets (constant, equals
+	 * Bitmasks to apply to head and tail offsets (constant, equals
 	 * ring_entries - 1)
 	 */
-	u32			ring_mask;
-	/* Ring size (constant, power of 2) */
-	u32			ring_entries;
+	u32			sq_ring_mask, cq_ring_mask;
+	/* Ring sizes (constant, power of 2) */
+	u32			sq_ring_entries, cq_ring_entries;
 	/*
 	 * Number of invalid entries dropped by the kernel due to
 	 * invalid index stored in array
@@ -117,7 +119,7 @@ struct io_sq_ring {
 	 * counter includes all submissions that were dropped reaching
 	 * the new SQ head (and possibly more).
 	 */
-	u32			dropped;
+	u32			sq_dropped;
 	/*
 	 * Runtime flags
 	 *
@@ -127,43 +129,7 @@ struct io_sq_ring {
 	 * The application needs a full memory barrier before checking
 	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 	 */
-	u32			flags;
-	/*
-	 * Ring buffer of indices into array of io_uring_sqe, which is
-	 * mmapped by the application using the IORING_OFF_SQES offset.
-	 *
-	 * This indirection could e.g. be used to assign fixed
-	 * io_uring_sqe entries to operations and only submit them to
-	 * the queue when needed.
-	 *
-	 * The kernel modifies neither the indices array nor the entries
-	 * array.
-	 */
-	u32			array[];
-};
-
-/*
- * This data is shared with the application through the mmap at offset
- * IORING_OFF_CQ_RING.
- *
- * The offsets to the member fields are published through struct
- * io_cqring_offsets when calling io_uring_setup.
- */
-struct io_cq_ring {
-	/*
-	 * Head and tail offsets into the ring; the offsets need to be
-	 * masked to get valid indices.
-	 *
-	 * The application controls head and the kernel tail.
-	 */
-	struct io_uring		r;
-	/*
-	 * Bitmask to apply to head and tail offsets (constant, equals
-	 * ring_entries - 1)
-	 */
-	u32			ring_mask;
-	/* Ring size (constant, power of 2) */
-	u32			ring_entries;
+	u32			sq_flags;
 	/*
 	 * Number of completion events lost because the queue was full;
 	 * this should be avoided by the application by making sure
@@ -177,7 +143,7 @@ struct io_cq_ring {
 	 * As completion events come in out of order this counter is not
 	 * ordered with any other data.
 	 */
-	u32			overflow;
+	u32			cq_overflow;
 	/*
 	 * Ring buffer of completion events.
 	 *
@@ -185,7 +151,7 @@ struct io_cq_ring {
 	 * produced, so the application is allowed to modify pending
 	 * entries.
 	 */
-	struct io_uring_cqe	cqes[];
+	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
 };
 
 struct io_mapped_ubuf {
@@ -215,8 +181,18 @@ struct io_ring_ctx {
 		bool			compat;
 		bool			account_mem;
 
-		/* SQ ring */
-		struct io_sq_ring	*sq_ring;
+		/*
+		 * Ring buffer of indices into array of io_uring_sqe, which is
+		 * mmapped by the application using the IORING_OFF_SQES offset.
+		 *
+		 * This indirection could e.g. be used to assign fixed
+		 * io_uring_sqe entries to operations and only submit them to
+		 * the queue when needed.
+		 *
+		 * The kernel modifies neither the indices array nor the entries
+		 * array.
+		 */
+		u32			*sq_array;
 		unsigned		cached_sq_head;
 		unsigned		sq_entries;
 		unsigned		sq_mask;
@@ -234,8 +210,6 @@ struct io_ring_ctx {
 	struct completion	sqo_thread_started;
 
 	struct {
-		/* CQ ring */
-		struct io_cq_ring	*cq_ring;
 		unsigned		cached_cq_tail;
 		unsigned		cq_entries;
 		unsigned		cq_mask;
@@ -244,6 +218,8 @@ struct io_ring_ctx {
 		struct eventfd_ctx	*cq_ev_fd;
 	} ____cacheline_aligned_in_smp;
 
+	struct io_rings	*rings;
+
 	/*
 	 * If used, fixed file set. Writers must ensure that ->refs is dead,
 	 * readers must ensure that ->refs is alive as long as the file* is
@@ -430,7 +406,7 @@ static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
 	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
 		return false;
 
-	return req->sequence != ctx->cached_cq_tail + ctx->sq_ring->dropped;
+	return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped;
 }
 
 static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
@@ -451,11 +427,11 @@ static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
 
 static void __io_commit_cqring(struct io_ring_ctx *ctx)
 {
-	struct io_cq_ring *ring = ctx->cq_ring;
+	struct io_rings *rings = ctx->rings;
 
-	if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
+	if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
 		/* order cqe stores with ring update */
-		smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
+		smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
 
 		if (wq_has_sleeper(&ctx->cq_wait)) {
 			wake_up_interruptible(&ctx->cq_wait);
@@ -478,7 +454,7 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
 
 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 {
-	struct io_cq_ring *ring = ctx->cq_ring;
+	struct io_rings *rings = ctx->rings;
 	unsigned tail;
 
 	tail = ctx->cached_cq_tail;
@@ -487,11 +463,11 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 	 * control dependency is enough as we're using WRITE_ONCE to
 	 * fill the cq entry
 	 */
-	if (tail - READ_ONCE(ring->r.head) == ring->ring_entries)
+	if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
 		return NULL;
 
 	ctx->cached_cq_tail++;
-	return &ring->cqes[tail & ctx->cq_mask];
+	return &rings->cqes[tail & ctx->cq_mask];
 }
 
 static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
@@ -510,9 +486,9 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 		WRITE_ONCE(cqe->res, res);
 		WRITE_ONCE(cqe->flags, 0);
 	} else {
-		unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
+		unsigned overflow = READ_ONCE(ctx->rings->cq_overflow);
 
-		WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
+		WRITE_ONCE(ctx->rings->cq_overflow, overflow + 1);
 	}
 }
 
@@ -679,11 +655,11 @@ static void io_put_req(struct io_kiocb *req)
 		io_free_req(req);
 }
 
-static unsigned io_cqring_events(struct io_cq_ring *ring)
+static unsigned io_cqring_events(struct io_rings *rings)
 {
 	/* See comment at the top of this file */
 	smp_rmb();
-	return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
+	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
 }
 
 /*
@@ -836,7 +812,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 		 * If we do, we can potentially be spinning for commands that
 		 * already triggered a CQE (eg in error).
 		 */
-		if (io_cqring_events(ctx->cq_ring))
+		if (io_cqring_events(ctx->rings))
 			break;
 
 		/*
@@ -2205,15 +2181,15 @@ static void io_submit_state_start(struct io_submit_state *state,
 
 static void io_commit_sqring(struct io_ring_ctx *ctx)
 {
-	struct io_sq_ring *ring = ctx->sq_ring;
+	struct io_rings *rings = ctx->rings;
 
-	if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
+	if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) {
 		/*
 		 * Ensure any loads from the SQEs are done at this point,
 		 * since once we write the new head, the application could
 		 * write new data to them.
 		 */
-		smp_store_release(&ring->r.head, ctx->cached_sq_head);
+		smp_store_release(&rings->sq.head, ctx->cached_sq_head);
 	}
 }
 
@@ -2227,7 +2203,8 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
  */
 static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
 {
-	struct io_sq_ring *ring = ctx->sq_ring;
+	struct io_rings *rings = ctx->rings;
+	u32 *sq_array = ctx->sq_array;
 	unsigned head;
 
 	/*
@@ -2240,10 +2217,10 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
 	 */
 	head = ctx->cached_sq_head;
 	/* make sure SQ entry isn't read before tail */
-	if (head == smp_load_acquire(&ring->r.tail))
+	if (head == smp_load_acquire(&rings->sq.tail))
 		return false;
 
-	head = READ_ONCE(ring->array[head & ctx->sq_mask]);
+	head = READ_ONCE(sq_array[head & ctx->sq_mask]);
 	if (head < ctx->sq_entries) {
 		s->index = head;
 		s->sqe = &ctx->sq_sqes[head];
@@ -2253,7 +2230,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
 
 	/* drop invalid entries */
 	ctx->cached_sq_head++;
-	ring->dropped++;
+	rings->sq_dropped++;
 	return false;
 }
 
@@ -2366,7 +2343,7 @@ static int io_sq_thread(void *data)
 						TASK_INTERRUPTIBLE);
 
 			/* Tell userspace we may need a wakeup call */
-			ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP;
+			ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
 			/* make sure to read SQ tail after writing flags */
 			smp_mb();
 
@@ -2380,12 +2357,12 @@ static int io_sq_thread(void *data)
 				schedule();
 				finish_wait(&ctx->sqo_wait, &wait);
 
-				ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
+				ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
 				continue;
 			}
 			finish_wait(&ctx->sqo_wait, &wait);
 
-			ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
+			ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
 		}
 
 		i = 0;
@@ -2477,10 +2454,10 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			  const sigset_t __user *sig, size_t sigsz)
 {
-	struct io_cq_ring *ring = ctx->cq_ring;
+	struct io_rings *rings = ctx->rings;
 	int ret;
 
-	if (io_cqring_events(ring) >= min_events)
+	if (io_cqring_events(rings) >= min_events)
 		return 0;
 
 	if (sig) {
@@ -2496,12 +2473,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			return ret;
 	}
 
-	ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events);
+	ret = wait_event_interruptible(ctx->wait, io_cqring_events(rings) >= min_events);
 	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
 
-	return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
+	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
 }
 
 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
@@ -2821,17 +2798,45 @@ static void *io_mem_alloc(size_t size)
 	return (void *) __get_free_pages(gfp_flags, get_order(size));
 }
 
+static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
+				size_t *sq_offset)
+{
+	struct io_rings *rings;
+	size_t off, sq_array_size;
+
+	off = struct_size(rings, cqes, cq_entries);
+	if (off == SIZE_MAX)
+		return SIZE_MAX;
+
+#ifdef CONFIG_SMP
+	off = ALIGN(off, SMP_CACHE_BYTES);
+	if (off == 0)
+		return SIZE_MAX;
+#endif
+
+	sq_array_size = array_size(sizeof(u32), sq_entries);
+	if (sq_array_size == SIZE_MAX)
+		return SIZE_MAX;
+
+	if (check_add_overflow(off, sq_array_size, &off))
+		return SIZE_MAX;
+
+	if (sq_offset)
+		*sq_offset = off;
+
+	return off;
+}
+
 static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
 {
-	struct io_sq_ring *sq_ring;
-	struct io_cq_ring *cq_ring;
-	size_t bytes;
+	size_t pages;
 
-	bytes = struct_size(sq_ring, array, sq_entries);
-	bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
-	bytes += struct_size(cq_ring, cqes, cq_entries);
+	pages = (size_t)1 << get_order(
+		rings_size(sq_entries, cq_entries, NULL));
+	pages += (size_t)1 << get_order(
+		array_size(sizeof(struct io_uring_sqe), sq_entries));
 
-	return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+	return pages;
 }
 
 static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
@@ -3078,9 +3083,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	}
 #endif
 
-	io_mem_free(ctx->sq_ring);
+	io_mem_free(ctx->rings);
 	io_mem_free(ctx->sq_sqes);
-	io_mem_free(ctx->cq_ring);
 
 	percpu_ref_exit(&ctx->refs);
 	if (ctx->account_mem)
@@ -3101,10 +3105,10 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
 	 * io_commit_cqring
 	 */
 	smp_rmb();
-	if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head !=
-	    ctx->sq_ring->ring_entries)
+	if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
+	    ctx->rings->sq_ring_entries)
 		mask |= EPOLLOUT | EPOLLWRNORM;
-	if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
+	if (READ_ONCE(ctx->rings->sq.head) != ctx->cached_cq_tail)
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	return mask;
@@ -3149,14 +3153,12 @@ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 
 	switch (offset) {
 	case IORING_OFF_SQ_RING:
-		ptr = ctx->sq_ring;
+	case IORING_OFF_CQ_RING:
+		ptr = ctx->rings;
 		break;
 	case IORING_OFF_SQES:
 		ptr = ctx->sq_sqes;
 		break;
-	case IORING_OFF_CQ_RING:
-		ptr = ctx->cq_ring;
-		break;
 	default:
 		return -EINVAL;
 	}
@@ -3243,19 +3245,27 @@ static const struct file_operations io_uring_fops = {
 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 				  struct io_uring_params *p)
 {
-	struct io_sq_ring *sq_ring;
-	struct io_cq_ring *cq_ring;
-	size_t size;
+	struct io_rings *rings;
+	size_t size, sq_array_offset;
 
-	sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
-	if (!sq_ring)
+	size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
+	if (size == SIZE_MAX)
+		return -EOVERFLOW;
+
+	rings = io_mem_alloc(size);
+	if (!rings)
 		return -ENOMEM;
 
-	ctx->sq_ring = sq_ring;
-	sq_ring->ring_mask = p->sq_entries - 1;
-	sq_ring->ring_entries = p->sq_entries;
-	ctx->sq_mask = sq_ring->ring_mask;
-	ctx->sq_entries = sq_ring->ring_entries;
+	ctx->rings = rings;
+	ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
+	rings->sq_ring_mask = p->sq_entries - 1;
+	rings->cq_ring_mask = p->cq_entries - 1;
+	rings->sq_ring_entries = p->sq_entries;
+	rings->cq_ring_entries = p->cq_entries;
+	ctx->sq_mask = rings->sq_ring_mask;
+	ctx->cq_mask = rings->cq_ring_mask;
+	ctx->sq_entries = rings->sq_ring_entries;
+	ctx->cq_entries = rings->cq_ring_entries;
 
 	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
 	if (size == SIZE_MAX)
@@ -3265,15 +3275,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 	if (!ctx->sq_sqes)
 		return -ENOMEM;
 
-	cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
-	if (!cq_ring)
-		return -ENOMEM;
-
-	ctx->cq_ring = cq_ring;
-	cq_ring->ring_mask = p->cq_entries - 1;
-	cq_ring->ring_entries = p->cq_entries;
-	ctx->cq_mask = cq_ring->ring_mask;
-	ctx->cq_entries = cq_ring->ring_entries;
 	return 0;
 }
 
@@ -3377,21 +3378,21 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
 		goto err;
 
 	memset(&p->sq_off, 0, sizeof(p->sq_off));
-	p->sq_off.head = offsetof(struct io_sq_ring, r.head);
-	p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
-	p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
-	p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
-	p->sq_off.flags = offsetof(struct io_sq_ring, flags);
-	p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
-	p->sq_off.array = offsetof(struct io_sq_ring, array);
+	p->sq_off.head = offsetof(struct io_rings, sq.head);
+	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
+	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
+	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
+	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
+	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
+	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
 
 	memset(&p->cq_off, 0, sizeof(p->cq_off));
-	p->cq_off.head = offsetof(struct io_cq_ring, r.head);
-	p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
-	p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
-	p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
-	p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
-	p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
+	p->cq_off.head = offsetof(struct io_rings, cq.head);
+	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
+	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
+	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
+	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
+	p->cq_off.cqes = offsetof(struct io_rings, cqes);
 	return ret;
 err:
 	io_ring_ctx_wait_and_kill(ctx);
-- 
2.21.0


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] io_uring: allocate the two rings together
  2019-08-26 17:23 [PATCH] io_uring: allocate the two rings together Hristo Venev
@ 2019-08-27 15:50 ` Jens Axboe
  2019-08-27 19:35   ` Hristo Venev
  0 siblings, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2019-08-27 15:50 UTC (permalink / raw)
  To: Hristo Venev; +Cc: linux-block

On 8/26/19 11:23 AM, Hristo Venev wrote:
> Both the sq and the cq rings have sizes just over a power of two, and
> the sq ring is significantly smaller. By bundling them in a single
> alllocation, we get the sq ring for free.
> 
> This also means that IORING_OFF_SQ_RING and IORING_OFF_CQ_RING now mean
> the same thing. If we indicate this to userspace, we can save a mmap
> call.

This looks pretty good to me. My only worry was ending up with sq and
cq cacheline sharing, but the alignment of the io_uring struct itself
should prevent that nicely.

Outside of going for a cleanup, have you observed any wins from this
change?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] io_uring: allocate the two rings together
  2019-08-27 15:50 ` Jens Axboe
@ 2019-08-27 19:35   ` Hristo Venev
  2019-08-29 15:04     ` Jens Axboe
  0 siblings, 1 reply; 9+ messages in thread
From: Hristo Venev @ 2019-08-27 19:35 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block

[-- Attachment #1: Type: text/plain, Size: 495 bytes --]

Sorry for the duplicate reply, I forgot to CC the mailing list.

On Tue, 2019-08-27 at 09:50 -0600, Jens Axboe wrote:
> Outside of going for a cleanup, have you observed any wins from this
> change?

I haven't ran any interesting benchmarks. The cp examples in liburing
are running as fast as before, at least on x86_64.

Do you think it makes sense to tell userspace that the sq and cq mmap
offsets now mean the same thing? We could add a flag set by the kernel
to io_uring_params.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] io_uring: allocate the two rings together
  2019-08-27 19:35   ` Hristo Venev
@ 2019-08-29 15:04     ` Jens Axboe
  2019-09-06 16:26       ` Jens Axboe
  0 siblings, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2019-08-29 15:04 UTC (permalink / raw)
  To: Hristo Venev; +Cc: linux-block

On 8/27/19 1:35 PM, Hristo Venev wrote:
> Sorry for the duplicate reply, I forgot to CC the mailing list.
> 
> On Tue, 2019-08-27 at 09:50 -0600, Jens Axboe wrote:
>> Outside of going for a cleanup, have you observed any wins from this
>> change?
> 
> I haven't ran any interesting benchmarks. The cp examples in liburing
> are running as fast as before, at least on x86_64.
> 
> Do you think it makes sense to tell userspace that the sq and cq mmap
> offsets now mean the same thing? We could add a flag set by the kernel
> to io_uring_params.

Not sure, there isn't a whole lot of overhead associated with having
to do two mmaps vs just one.

If you wanted to, the best approach would be to change one of the
io_uring_params reserved fields to be a feature field or something
like that. As features get added, we could flag them there. Then
the app could do:

io_uring_setup(depth, &params);
if (params.features & IORING_FEAT_SINGLE_MMAP)
	....
else
	mmap rings individually

and so forth.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] io_uring: allocate the two rings together
  2019-08-29 15:04     ` Jens Axboe
@ 2019-09-06 16:26       ` Jens Axboe
  2019-09-06 19:12         ` [PATCH 1/2] liburing/test: There are now 4 reserved fields Hristo Venev
  0 siblings, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2019-09-06 16:26 UTC (permalink / raw)
  To: Hristo Venev; +Cc: linux-block

On 8/29/19 9:04 AM, Jens Axboe wrote:
> On 8/27/19 1:35 PM, Hristo Venev wrote:
>> Sorry for the duplicate reply, I forgot to CC the mailing list.
>>
>> On Tue, 2019-08-27 at 09:50 -0600, Jens Axboe wrote:
>>> Outside of going for a cleanup, have you observed any wins from this
>>> change?
>>
>> I haven't ran any interesting benchmarks. The cp examples in liburing
>> are running as fast as before, at least on x86_64.
>>
>> Do you think it makes sense to tell userspace that the sq and cq mmap
>> offsets now mean the same thing? We could add a flag set by the kernel
>> to io_uring_params.
> 
> Not sure, there isn't a whole lot of overhead associated with having
> to do two mmaps vs just one.
> 
> If you wanted to, the best approach would be to change one of the
> io_uring_params reserved fields to be a feature field or something
> like that. As features get added, we could flag them there. Then
> the app could do:
> 
> io_uring_setup(depth, &params);
> if (params.features & IORING_FEAT_SINGLE_MMAP)
> 	....
> else
> 	mmap rings individually
> 
> and so forth.

This would do it for the kernel side. I'd suggest integrating the
feature check into liburing, which means that applications get
the optimization for free.

Do you want to do it?


diff --git a/fs/io_uring.c b/fs/io_uring.c
index 17dfe57c57f8..be24596e90d7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3391,6 +3391,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
 	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
 	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
 	p->cq_off.cqes = offsetof(struct io_rings, cqes);
+
+	p->features = IORING_FEAT_SINGLE_MMAP;
 	return ret;
 err:
 	io_ring_ctx_wait_and_kill(ctx);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1e1652f25cc1..96ee9d94b73e 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -128,11 +128,17 @@ struct io_uring_params {
 	__u32 flags;
 	__u32 sq_thread_cpu;
 	__u32 sq_thread_idle;
-	__u32 resv[5];
+	__u32 features;
+	__u32 resv[4];
 	struct io_sqring_offsets sq_off;
 	struct io_cqring_offsets cq_off;
 };
 
+/*
+ * io_uring_params->features flags
+ */
+#define IORING_FEAT_SINGLE_MMAP		(1U << 0)
+
 /*
  * io_uring_register(2) opcodes and arguments
  */

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 1/2] liburing/test: There are now 4 reserved fields
  2019-09-06 16:26       ` Jens Axboe
@ 2019-09-06 19:12         ` Hristo Venev
  2019-09-06 19:12           ` [PATCH 2/2] liburing: Use the single mmap feature Hristo Venev
  2019-09-06 19:24           ` [PATCH 1/2] liburing/test: There are now 4 reserved fields Jens Axboe
  0 siblings, 2 replies; 9+ messages in thread
From: Hristo Venev @ 2019-09-06 19:12 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, Hristo Venev

Signed-off-by: Hristo Venev <hristo@venev.name>
---
 test/io_uring_setup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/io_uring_setup.c b/test/io_uring_setup.c
index 2dd3763..73b0398 100644
--- a/test/io_uring_setup.c
+++ b/test/io_uring_setup.c
@@ -67,8 +67,8 @@ dump_resv(struct io_uring_params *p)
 	if (!p)
 		return "";
 
-	sprintf(resvstr, "0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x", p->resv[0],
-		p->resv[1], p->resv[2], p->resv[3], p->resv[4]);
+	sprintf(resvstr, "0x%.8x 0x%.8x 0x%.8x 0x%.8x", p->resv[0],
+		p->resv[1], p->resv[2], p->resv[3]);
 
 	return resvstr;
 }
@@ -118,7 +118,7 @@ main(int argc, char **argv)
 
 	/* resv array is non-zero */
 	memset(&p, 0, sizeof(p));
-	p.resv[0] = p.resv[1] = p.resv[2] = p.resv[3] = p.resv[4] = 1;
+	p.resv[0] = p.resv[1] = p.resv[2] = p.resv[3] = 1;
 	status |= try_io_uring_setup(1, &p, -1, EINVAL);
 
 	/* invalid flags */
-- 
2.21.0


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 2/2] liburing: Use the single mmap feature
  2019-09-06 19:12         ` [PATCH 1/2] liburing/test: There are now 4 reserved fields Hristo Venev
@ 2019-09-06 19:12           ` Hristo Venev
  2019-09-06 19:30             ` Jens Axboe
  2019-09-06 19:24           ` [PATCH 1/2] liburing/test: There are now 4 reserved fields Jens Axboe
  1 sibling, 1 reply; 9+ messages in thread
From: Hristo Venev @ 2019-09-06 19:12 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, Hristo Venev

Signed-off-by: Hristo Venev <hristo@venev.name>
---
 src/setup.c | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/src/setup.c b/src/setup.c
index 47b0deb..48c96a0 100644
--- a/src/setup.c
+++ b/src/setup.c
@@ -16,10 +16,30 @@ static int io_uring_mmap(int fd, struct io_uring_params *p,
 	int ret;
 
 	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
+	cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
+
+	if (p->features & IORING_FEAT_SINGLE_MMAP) {
+		if (cq->ring_sz > sq->ring_sz) {
+			sq->ring_sz = cq->ring_sz;
+		}
+		cq->ring_sz = sq->ring_sz;
+	}
 	sq->ring_ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
 			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
 	if (sq->ring_ptr == MAP_FAILED)
 		return -errno;
+
+	if (p->features & IORING_FEAT_SINGLE_MMAP) {
+		cq->ring_ptr = sq->ring_ptr;
+	} else {
+		cq->ring_ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
+				MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
+		if (cq->ring_ptr == MAP_FAILED) {
+			ret = -errno;
+			goto err;
+		}
+	}
+
 	sq->khead = sq->ring_ptr + p->sq_off.head;
 	sq->ktail = sq->ring_ptr + p->sq_off.tail;
 	sq->kring_mask = sq->ring_ptr + p->sq_off.ring_mask;
@@ -34,19 +54,14 @@ static int io_uring_mmap(int fd, struct io_uring_params *p,
 				IORING_OFF_SQES);
 	if (sq->sqes == MAP_FAILED) {
 		ret = -errno;
+		if (cq->ring_ptr != sq->ring_ptr) {
+			munmap(cq->ring_ptr, cq->ring_sz);
+		}
 err:
 		munmap(sq->ring_ptr, sq->ring_sz);
 		return ret;
 	}
 
-	cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
-	cq->ring_ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
-			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
-	if (cq->ring_ptr == MAP_FAILED) {
-		ret = -errno;
-		munmap(sq->sqes, *sq->kring_entries * sizeof(struct io_uring_sqe));
-		goto err;
-	}
 	cq->khead = cq->ring_ptr + p->cq_off.head;
 	cq->ktail = cq->ring_ptr + p->cq_off.tail;
 	cq->kring_mask = cq->ring_ptr + p->cq_off.ring_mask;
@@ -105,6 +120,8 @@ void io_uring_queue_exit(struct io_uring *ring)
 
 	munmap(sq->sqes, *sq->kring_entries * sizeof(struct io_uring_sqe));
 	munmap(sq->ring_ptr, sq->ring_sz);
-	munmap(cq->ring_ptr, cq->ring_sz);
+	if (cq->ring_ptr != sq->ring_ptr) {
+		munmap(cq->ring_ptr, cq->ring_sz);
+	}
 	close(ring->ring_fd);
 }
-- 
2.21.0


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] liburing/test: There are now 4 reserved fields
  2019-09-06 19:12         ` [PATCH 1/2] liburing/test: There are now 4 reserved fields Hristo Venev
  2019-09-06 19:12           ` [PATCH 2/2] liburing: Use the single mmap feature Hristo Venev
@ 2019-09-06 19:24           ` Jens Axboe
  1 sibling, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2019-09-06 19:24 UTC (permalink / raw)
  To: Hristo Venev; +Cc: linux-block

On 9/6/19 1:12 PM, Hristo Venev wrote:
> Signed-off-by: Hristo Venev <hristo@venev.name>> ---
>   test/io_uring_setup.c | 6 +++---
>   1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/test/io_uring_setup.c b/test/io_uring_setup.c
> index 2dd3763..73b0398 100644
> --- a/test/io_uring_setup.c
> +++ b/test/io_uring_setup.c
> @@ -67,8 +67,8 @@ dump_resv(struct io_uring_params *p)
>   	if (!p)
>   		return "";
>   
> -	sprintf(resvstr, "0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x", p->resv[0],
> -		p->resv[1], p->resv[2], p->resv[3], p->resv[4]);
> +	sprintf(resvstr, "0x%.8x 0x%.8x 0x%.8x 0x%.8x", p->resv[0],
> +		p->resv[1], p->resv[2], p->resv[3]);
>   
>   	return resvstr;
>   }
> @@ -118,7 +118,7 @@ main(int argc, char **argv)
>   
>   	/* resv array is non-zero */
>   	memset(&p, 0, sizeof(p));
> -	p.resv[0] = p.resv[1] = p.resv[2] = p.resv[3] = p.resv[4] = 1;
> +	p.resv[0] = p.resv[1] = p.resv[2] = p.resv[3] = 1;
>   	status |= try_io_uring_setup(1, &p, -1, EINVAL);
>   
>   	/* invalid flags */

Forgot to push this one out, but I already fixed that up by adding
support for printing ->features as well.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/2] liburing: Use the single mmap feature
  2019-09-06 19:12           ` [PATCH 2/2] liburing: Use the single mmap feature Hristo Venev
@ 2019-09-06 19:30             ` Jens Axboe
  0 siblings, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2019-09-06 19:30 UTC (permalink / raw)
  To: Hristo Venev; +Cc: linux-block

On 9/6/19 1:12 PM, Hristo Venev wrote:
> Signed-off-by: Hristo Venev <hristo@venev.name>
> ---
>   src/setup.c | 35 ++++++++++++++++++++++++++---------
>   1 file changed, 26 insertions(+), 9 deletions(-)
> 
> diff --git a/src/setup.c b/src/setup.c
> index 47b0deb..48c96a0 100644
> --- a/src/setup.c
> +++ b/src/setup.c
> @@ -16,10 +16,30 @@ static int io_uring_mmap(int fd, struct io_uring_params *p,
>   	int ret;
>   
>   	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
> +	cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
> +
> +	if (p->features & IORING_FEAT_SINGLE_MMAP) {
> +		if (cq->ring_sz > sq->ring_sz) {
> +			sq->ring_sz = cq->ring_sz;
> +		}
> +		cq->ring_sz = sq->ring_sz;
> +	}
>   	sq->ring_ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
>   			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
>   	if (sq->ring_ptr == MAP_FAILED)
>   		return -errno;
> +
> +	if (p->features & IORING_FEAT_SINGLE_MMAP) {
> +		cq->ring_ptr = sq->ring_ptr;
> +	} else {
> +		cq->ring_ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
> +				MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
> +		if (cq->ring_ptr == MAP_FAILED) {
> +			ret = -errno;
> +			goto err;
> +		}
> +	}
> +
>   	sq->khead = sq->ring_ptr + p->sq_off.head;
>   	sq->ktail = sq->ring_ptr + p->sq_off.tail;
>   	sq->kring_mask = sq->ring_ptr + p->sq_off.ring_mask;
> @@ -34,19 +54,14 @@ static int io_uring_mmap(int fd, struct io_uring_params *p,
>   				IORING_OFF_SQES);
>   	if (sq->sqes == MAP_FAILED) {
>   		ret = -errno;
> +		if (cq->ring_ptr != sq->ring_ptr) {
> +			munmap(cq->ring_ptr, cq->ring_sz);
> +		}
>   err:
>   		munmap(sq->ring_ptr, sq->ring_sz);
>   		return ret;
>   	}
>   
> -	cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
> -	cq->ring_ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
> -			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
> -	if (cq->ring_ptr == MAP_FAILED) {
> -		ret = -errno;
> -		munmap(sq->sqes, *sq->kring_entries * sizeof(struct io_uring_sqe));
> -		goto err;
> -	}
>   	cq->khead = cq->ring_ptr + p->cq_off.head;
>   	cq->ktail = cq->ring_ptr + p->cq_off.tail;
>   	cq->kring_mask = cq->ring_ptr + p->cq_off.ring_mask;
> @@ -105,6 +120,8 @@ void io_uring_queue_exit(struct io_uring *ring)
>   
>   	munmap(sq->sqes, *sq->kring_entries * sizeof(struct io_uring_sqe));
>   	munmap(sq->ring_ptr, sq->ring_sz);
> -	munmap(cq->ring_ptr, cq->ring_sz);
> +	if (cq->ring_ptr != sq->ring_ptr) {
> +		munmap(cq->ring_ptr, cq->ring_sz);
> +	}
>   	close(ring->ring_fd);
>   }

Thanks, applied.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, back to index

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-26 17:23 [PATCH] io_uring: allocate the two rings together Hristo Venev
2019-08-27 15:50 ` Jens Axboe
2019-08-27 19:35   ` Hristo Venev
2019-08-29 15:04     ` Jens Axboe
2019-09-06 16:26       ` Jens Axboe
2019-09-06 19:12         ` [PATCH 1/2] liburing/test: There are now 4 reserved fields Hristo Venev
2019-09-06 19:12           ` [PATCH 2/2] liburing: Use the single mmap feature Hristo Venev
2019-09-06 19:30             ` Jens Axboe
2019-09-06 19:24           ` [PATCH 1/2] liburing/test: There are now 4 reserved fields Jens Axboe

Linux-Block Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-block/0 linux-block/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-block linux-block/ https://lore.kernel.org/linux-block \
		linux-block@vger.kernel.org linux-block@archiver.kernel.org
	public-inbox-index linux-block


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-block


AGPL code for this site: git clone https://public-inbox.org/ public-inbox