IO-Uring Archive on lore.kernel.org
 help / color / Atom feed
* [PATCHSET 0/9] cleanups, improvements, additions
@ 2019-12-28 19:21 Jens Axboe
  2019-12-28 19:21 ` [PATCH 1/9] io_uring: remove two unnecessary function declarations Jens Axboe
                   ` (8 more replies)
  0 siblings, 9 replies; 10+ messages in thread
From: Jens Axboe @ 2019-12-28 19:21 UTC (permalink / raw)
  To: io-uring

This is a grab bag of stuff:

- Removal of forward declarations we don't need
- Lookup table for opcodes, making that part saner
- Optimization of the overflow handling
- Poll performance improvements
- Addition of non-vectored read/writes
- Allow current file position offset reads/writes
- Add fadvise/madvise opcodes

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 1/9] io_uring: remove two unnecessary function declarations
  2019-12-28 19:21 [PATCHSET 0/9] cleanups, improvements, additions Jens Axboe
@ 2019-12-28 19:21 ` Jens Axboe
  2019-12-28 19:21 ` [PATCH 2/9] io_uring: add lookup table for various opcode needs Jens Axboe
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Jens Axboe @ 2019-12-28 19:21 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

__io_free_req() and io_double_put_req() aren't used before they are
defined, so we can kill these two forwards.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d787a97febf8..40735ecc09c9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -518,9 +518,7 @@ struct io_submit_state {
 
 static void io_wq_submit_work(struct io_wq_work **workptr);
 static void io_cqring_fill_event(struct io_kiocb *req, long res);
-static void __io_free_req(struct io_kiocb *req);
 static void io_put_req(struct io_kiocb *req);
-static void io_double_put_req(struct io_kiocb *req);
 static void __io_double_put_req(struct io_kiocb *req);
 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 static void io_queue_linked_timeout(struct io_kiocb *req);
-- 
2.24.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 2/9] io_uring: add lookup table for various opcode needs
  2019-12-28 19:21 [PATCHSET 0/9] cleanups, improvements, additions Jens Axboe
  2019-12-28 19:21 ` [PATCH 1/9] io_uring: remove two unnecessary function declarations Jens Axboe
@ 2019-12-28 19:21 ` Jens Axboe
  2019-12-28 19:21 ` [PATCH 3/9] io_uring: split overflow state into SQ and CQ side Jens Axboe
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Jens Axboe @ 2019-12-28 19:21 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

We currently have various switch statements that check if an opcode needs
a file, mm, etc. These are hard to keep in sync as opcodes are added. Add
a struct io_op_def that holds all of this information, so we have just
one spot to update when opcodes are added.

This also enables us to NOT allocate req->io if a deferred command
doesn't need it, and corrects some mistakes we had in terms of what
commands need mm context.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 209 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 156 insertions(+), 53 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 40735ecc09c9..0ee6b3057895 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -516,6 +516,137 @@ struct io_submit_state {
 	unsigned int		ios_left;
 };
 
+struct io_op_def {
+	/* needs req->io allocated for deferral/async */
+	unsigned		async_ctx : 1;
+	/* needs current->mm setup, does mm access */
+	unsigned		needs_mm : 1;
+	/* needs req->file assigned */
+	unsigned		needs_file : 1;
+	/* needs req->file assigned IFF fd is >= 0 */
+	unsigned		fd_non_neg : 1;
+	/* hash wq insertion if file is a regular file */
+	unsigned		hash_reg_file : 1;
+	/* unbound wq insertion if file is a non-regular file */
+	unsigned		unbound_nonreg_file : 1;
+};
+
+static const struct io_op_def io_op_defs[IORING_OP_LAST] = {
+	{
+		/* IORING_OP_NOP */
+	},
+	{
+		/* IORING_OP_READV */
+		.async_ctx		= 1,
+		.needs_mm		= 1,
+		.needs_file		= 1,
+		.unbound_nonreg_file	= 1,
+	},
+	{
+		/* IORING_OP_WRITEV */
+		.async_ctx		= 1,
+		.needs_mm		= 1,
+		.needs_file		= 1,
+		.hash_reg_file		= 1,
+		.unbound_nonreg_file	= 1,
+	},
+	{
+		/* IORING_OP_FSYNC */
+		.needs_file		= 1,
+	},
+	{
+		/* IORING_OP_READ_FIXED */
+		.async_ctx		= 1,
+		.needs_file		= 1,
+		.unbound_nonreg_file	= 1,
+	},
+	{
+		/* IORING_OP_WRITE_FIXED */
+		.async_ctx		= 1,
+		.needs_file		= 1,
+		.hash_reg_file		= 1,
+		.unbound_nonreg_file	= 1,
+	},
+	{
+		/* IORING_OP_POLL_ADD */
+		.needs_file		= 1,
+		.unbound_nonreg_file	= 1,
+	},
+	{
+		/* IORING_OP_POLL_REMOVE */
+	},
+	{
+		/* IORING_OP_SYNC_FILE_RANGE */
+		.needs_file		= 1,
+	},
+	{
+		/* IORING_OP_SENDMSG */
+		.async_ctx		= 1,
+		.needs_mm		= 1,
+		.needs_file		= 1,
+		.unbound_nonreg_file	= 1,
+	},
+	{
+		/* IORING_OP_RECVMSG */
+		.async_ctx		= 1,
+		.needs_mm		= 1,
+		.needs_file		= 1,
+		.unbound_nonreg_file	= 1,
+	},
+	{
+		/* IORING_OP_TIMEOUT */
+		.async_ctx		= 1,
+		.needs_mm		= 1,
+	},
+	{
+		/* IORING_OP_TIMEOUT_REMOVE */
+	},
+	{
+		/* IORING_OP_ACCEPT */
+		.needs_mm		= 1,
+		.needs_file		= 1,
+		.unbound_nonreg_file	= 1,
+	},
+	{
+		/* IORING_OP_ASYNC_CANCEL */
+	},
+	{
+		/* IORING_OP_LINK_TIMEOUT */
+		.async_ctx		= 1,
+		.needs_mm		= 1,
+	},
+	{
+		/* IORING_OP_CONNECT */
+		.async_ctx		= 1,
+		.needs_mm		= 1,
+		.needs_file		= 1,
+		.unbound_nonreg_file	= 1,
+	},
+	{
+		/* IORING_OP_FALLOCATE */
+		.needs_file		= 1,
+	},
+	{
+		/* IORING_OP_OPENAT */
+		.needs_file		= 1,
+		.fd_non_neg		= 1,
+	},
+	{
+		/* IORING_OP_CLOSE */
+		.needs_file		= 1,
+	},
+	{
+		/* IORING_OP_FILES_UPDATE */
+		.needs_mm		= 1,
+	},
+	{
+		/* IORING_OP_STATX */
+		.needs_mm		= 1,
+		.needs_file		= 1,
+		.fd_non_neg		= 1,
+	},
+};
+
 static void io_wq_submit_work(struct io_wq_work **workptr);
 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 static void io_put_req(struct io_kiocb *req);
@@ -671,41 +802,20 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
 	}
 }
 
-static inline bool io_req_needs_user(struct io_kiocb *req)
-{
-	return !(req->opcode == IORING_OP_READ_FIXED ||
-		 req->opcode == IORING_OP_WRITE_FIXED);
-}
-
 static inline bool io_prep_async_work(struct io_kiocb *req,
 				      struct io_kiocb **link)
 {
+	const struct io_op_def *def = &io_op_defs[req->opcode];
 	bool do_hashed = false;
 
-	switch (req->opcode) {
-	case IORING_OP_WRITEV:
-	case IORING_OP_WRITE_FIXED:
-		/* only regular files should be hashed for writes */
-		if (req->flags & REQ_F_ISREG)
+	if (req->flags & REQ_F_ISREG) {
+		if (def->hash_reg_file)
 			do_hashed = true;
-		/* fall-through */
-	case IORING_OP_READV:
-	case IORING_OP_READ_FIXED:
-	case IORING_OP_SENDMSG:
-	case IORING_OP_RECVMSG:
-	case IORING_OP_ACCEPT:
-	case IORING_OP_POLL_ADD:
-	case IORING_OP_CONNECT:
-		/*
-		 * We know REQ_F_ISREG is not set on some of these
-		 * opcodes, but this enables us to keep the check in
-		 * just one place.
-		 */
-		if (!(req->flags & REQ_F_ISREG))
+	} else {
+		if (def->unbound_nonreg_file)
 			req->work.flags |= IO_WQ_WORK_UNBOUND;
-		break;
 	}
-	if (io_req_needs_user(req))
+	if (def->needs_mm)
 		req->work.flags |= IO_WQ_WORK_NEEDS_USER;
 
 	*link = io_prep_linked_timeout(req);
@@ -1826,6 +1936,8 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
 
 static int io_alloc_async_ctx(struct io_kiocb *req)
 {
+	if (!io_op_defs[req->opcode].async_ctx)
+		return 0;
 	req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
 	return req->io == NULL;
 }
@@ -3773,29 +3885,13 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 	}
 }
 
-static bool io_req_op_valid(int op)
-{
-	return op >= IORING_OP_NOP && op < IORING_OP_LAST;
-}
-
 static int io_req_needs_file(struct io_kiocb *req, int fd)
 {
-	switch (req->opcode) {
-	case IORING_OP_NOP:
-	case IORING_OP_POLL_REMOVE:
-	case IORING_OP_TIMEOUT:
-	case IORING_OP_TIMEOUT_REMOVE:
-	case IORING_OP_ASYNC_CANCEL:
-	case IORING_OP_LINK_TIMEOUT:
+	if (!io_op_defs[req->opcode].needs_file)
 		return 0;
-	case IORING_OP_OPENAT:
-	case IORING_OP_STATX:
-		return fd != -1;
-	default:
-		if (io_req_op_valid(req->opcode))
-			return 1;
-		return -EINVAL;
-	}
+	if (fd == -1 && io_op_defs[req->opcode].fd_non_neg)
+		return 0;
+	return 1;
 }
 
 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
@@ -3812,7 +3908,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned flags;
-	int fd, ret;
+	int fd;
 
 	flags = READ_ONCE(sqe->flags);
 	fd = READ_ONCE(sqe->fd);
@@ -3820,9 +3916,8 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
 	if (flags & IOSQE_IO_DRAIN)
 		req->flags |= REQ_F_IO_DRAIN;
 
-	ret = io_req_needs_file(req, fd);
-	if (ret <= 0)
-		return ret;
+	if (!io_req_needs_file(req, fd))
+		return 0;
 
 	if (flags & IOSQE_FIXED_FILE) {
 		if (unlikely(!ctx->file_data ||
@@ -4248,7 +4343,16 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			break;
 		}
 
-		if (io_req_needs_user(req) && !*mm) {
+		/* will complete beyond this point, count as submitted */
+		submitted++;
+
+		if (unlikely(req->opcode >= IORING_OP_LAST)) {
+			io_cqring_add_event(req, -EINVAL);
+			io_double_put_req(req);
+			break;
+		}
+
+		if (io_op_defs[req->opcode].needs_mm && !*mm) {
 			mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
 			if (!mm_fault) {
 				use_mm(ctx->sqo_mm);
@@ -4256,7 +4360,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			}
 		}
 
-		submitted++;
 		req->ring_file = ring_file;
 		req->ring_fd = ring_fd;
 		req->has_user = *mm != NULL;
-- 
2.24.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 3/9] io_uring: split overflow state into SQ and CQ side
  2019-12-28 19:21 [PATCHSET 0/9] cleanups, improvements, additions Jens Axboe
  2019-12-28 19:21 ` [PATCH 1/9] io_uring: remove two unnecessary function declarations Jens Axboe
  2019-12-28 19:21 ` [PATCH 2/9] io_uring: add lookup table for various opcode needs Jens Axboe
@ 2019-12-28 19:21 ` Jens Axboe
  2019-12-28 19:21 ` [PATCH 4/9] io_uring: improve poll completion performance Jens Axboe
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Jens Axboe @ 2019-12-28 19:21 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

We currently check ->cq_overflow_list from both SQ and CQ context, which
causes some bouncing of that cache line. Add separate bits of state for
this instead, so that the SQ side can check using its own state, and
likewise for the CQ side.

This adds ->sq_check_overflow with the SQ state, and ->cq_check_overflow
with the CQ state. If we hit an overflow condition, both of these bits
are set. Likewise for overflow flush clear, we clear both bits. For the
fast path of just checking if there's an overflow condition on either
the SQ or CQ side, we can use our own private bit for this.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 40 +++++++++++++++++++++++++++-------------
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0ee6b3057895..6f99a52c350c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -224,13 +224,14 @@ struct io_ring_ctx {
 		unsigned		sq_thread_idle;
 		unsigned		cached_sq_dropped;
 		atomic_t		cached_cq_overflow;
-		struct io_uring_sqe	*sq_sqes;
+		unsigned long		sq_check_overflow;
 
 		struct list_head	defer_list;
 		struct list_head	timeout_list;
 		struct list_head	cq_overflow_list;
 
 		wait_queue_head_t	inflight_wait;
+		struct io_uring_sqe	*sq_sqes;
 	} ____cacheline_aligned_in_smp;
 
 	struct io_rings	*rings;
@@ -272,6 +273,7 @@ struct io_ring_ctx {
 		unsigned		cq_entries;
 		unsigned		cq_mask;
 		atomic_t		cq_timeouts;
+		unsigned long		cq_check_overflow;
 		struct wait_queue_head	cq_wait;
 		struct fasync_struct	*cq_fasync;
 		struct eventfd_ctx	*cq_ev_fd;
@@ -952,6 +954,10 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	}
 
 	io_commit_cqring(ctx);
+	if (cqe) {
+		clear_bit(0, &ctx->sq_check_overflow);
+		clear_bit(0, &ctx->cq_check_overflow);
+	}
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 	io_cqring_ev_posted(ctx);
 
@@ -985,6 +991,10 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
 		WRITE_ONCE(ctx->rings->cq_overflow,
 				atomic_inc_return(&ctx->cached_cq_overflow));
 	} else {
+		if (list_empty(&ctx->cq_overflow_list)) {
+			set_bit(0, &ctx->sq_check_overflow);
+			set_bit(0, &ctx->cq_check_overflow);
+		}
 		refcount_inc(&req->refs);
 		req->result = res;
 		list_add_tail(&req->list, &ctx->cq_overflow_list);
@@ -1287,19 +1297,21 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
 {
 	struct io_rings *rings = ctx->rings;
 
-	/*
-	 * noflush == true is from the waitqueue handler, just ensure we wake
-	 * up the task, and the next invocation will flush the entries. We
-	 * cannot safely to it from here.
-	 */
-	if (noflush && !list_empty(&ctx->cq_overflow_list))
-		return -1U;
+	if (test_bit(0, &ctx->cq_check_overflow)) {
+		/*
+		 * noflush == true is from the waitqueue handler, just ensure
+		 * we wake up the task, and the next invocation will flush the
+		 * entries. We cannot safely to it from here.
+		 */
+		if (noflush && !list_empty(&ctx->cq_overflow_list))
+			return -1U;
 
-	io_cqring_overflow_flush(ctx, false);
+		io_cqring_overflow_flush(ctx, false);
+	}
 
 	/* See comment at the top of this file */
 	smp_rmb();
-	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
+	return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
 }
 
 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
@@ -4319,9 +4331,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 	bool mm_fault = false;
 
 	/* if we have a backlog and couldn't flush it all, return BUSY */
-	if (!list_empty(&ctx->cq_overflow_list) &&
-	    !io_cqring_overflow_flush(ctx, false))
-		return -EBUSY;
+	if (test_bit(0, &ctx->sq_check_overflow)) {
+		if (!list_empty(&ctx->cq_overflow_list) &&
+		    !io_cqring_overflow_flush(ctx, false))
+			return -EBUSY;
+	}
 
 	if (nr > IO_PLUG_THRESHOLD) {
 		io_submit_state_start(&state, nr);
-- 
2.24.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 4/9] io_uring: improve poll completion performance
  2019-12-28 19:21 [PATCHSET 0/9] cleanups, improvements, additions Jens Axboe
                   ` (2 preceding siblings ...)
  2019-12-28 19:21 ` [PATCH 3/9] io_uring: split overflow state into SQ and CQ side Jens Axboe
@ 2019-12-28 19:21 ` Jens Axboe
  2019-12-28 19:21 ` [PATCH 5/9] io_uring: add non-vectored read/write commands Jens Axboe
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Jens Axboe @ 2019-12-28 19:21 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

For busy IORING_OP_POLL_ADD workloads, we can have enough contention
on the completion lock that we fail the inline completion path quite
often as we fail the trylock on that lock. Add a list for deferred
completions that we can use in that case. This helps reduce the number
of async offloads we have to do, as if we get multiple completions in
a row, we'll piggy back on to the poll_llist instead of having to queue
our own offload.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 108 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 88 insertions(+), 20 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6f99a52c350c..0ee9115a599d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -286,7 +286,8 @@ struct io_ring_ctx {
 
 	struct {
 		spinlock_t		completion_lock;
-		bool			poll_multi_file;
+		struct llist_head	poll_llist;
+
 		/*
 		 * ->poll_list is protected by the ctx->uring_lock for
 		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
@@ -296,6 +297,7 @@ struct io_ring_ctx {
 		struct list_head	poll_list;
 		struct hlist_head	*cancel_hash;
 		unsigned		cancel_hash_bits;
+		bool			poll_multi_file;
 
 		spinlock_t		inflight_lock;
 		struct list_head	inflight_list;
@@ -453,7 +455,14 @@ struct io_kiocb {
 	};
 
 	struct io_async_ctx		*io;
-	struct file			*ring_file;
+	union {
+		/*
+		 * ring_file is only used in the submission path, and
+		 * llist_node is only used for poll deferred completions
+		 */
+		struct file		*ring_file;
+		struct llist_node	llist_node;
+	};
 	int				ring_fd;
 	bool				has_user;
 	bool				in_async;
@@ -727,6 +736,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	mutex_init(&ctx->uring_lock);
 	init_waitqueue_head(&ctx->wait);
 	spin_lock_init(&ctx->completion_lock);
+	init_llist_head(&ctx->poll_llist);
 	INIT_LIST_HEAD(&ctx->poll_list);
 	INIT_LIST_HEAD(&ctx->defer_list);
 	INIT_LIST_HEAD(&ctx->timeout_list);
@@ -1322,6 +1332,20 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
 	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
 }
 
+static inline bool io_req_multi_free(struct io_kiocb *req)
+{
+	/*
+	 * If we're not using fixed files, we have to pair the completion part
+	 * with the file put. Use regular completions for those, only batch
+	 * free for fixed file and non-linked commands.
+	 */
+	if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == REQ_F_FIXED_FILE)
+	    && !io_is_fallback_req(req) && !req->io)
+		return true;
+
+	return false;
+}
+
 /*
  * Find and free completed poll iocbs
  */
@@ -1341,14 +1365,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		(*nr_events)++;
 
 		if (refcount_dec_and_test(&req->refs)) {
-			/* If we're not using fixed files, we have to pair the
-			 * completion part with the file put. Use regular
-			 * completions for those, only batch free for fixed
-			 * file and non-linked commands.
-			 */
-			if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
-			    REQ_F_FIXED_FILE) && !io_is_fallback_req(req) &&
-			    !req->io) {
+			if (io_req_multi_free(req)) {
 				reqs[to_free++] = req;
 				if (to_free == ARRAY_SIZE(reqs))
 					io_free_req_many(ctx, reqs, &to_free);
@@ -3082,6 +3099,44 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
 		*workptr = &nxt->work;
 }
 
+static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes)
+{
+	void *reqs[IO_IOPOLL_BATCH];
+	struct io_kiocb *req, *tmp;
+	int to_free = 0;
+
+	spin_lock_irq(&ctx->completion_lock);
+	llist_for_each_entry_safe(req, tmp, nodes, llist_node) {
+		hash_del(&req->hash_node);
+		io_poll_complete(req, req->result, 0);
+
+		if (refcount_dec_and_test(&req->refs)) {
+			if (io_req_multi_free(req)) {
+				reqs[to_free++] = req;
+				if (to_free == ARRAY_SIZE(reqs))
+					io_free_req_many(ctx, reqs, &to_free);
+			} else {
+				req->flags |= REQ_F_COMP_LOCKED;
+				io_free_req(req);
+			}
+		}
+	}
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_cqring_ev_posted(ctx);
+	io_free_req_many(ctx, reqs, &to_free);
+}
+
+static void io_poll_flush(struct io_wq_work **workptr)
+{
+	struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+	struct llist_node *nodes;
+
+	nodes = llist_del_all(&req->ctx->poll_llist);
+	if (nodes)
+		__io_poll_flush(req->ctx, nodes);
+}
+
 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 			void *key)
 {
@@ -3089,7 +3144,6 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
 	struct io_ring_ctx *ctx = req->ctx;
 	__poll_t mask = key_to_poll(key);
-	unsigned long flags;
 
 	/* for instances that support it check for an event match first: */
 	if (mask && !(mask & poll->events))
@@ -3103,17 +3157,31 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	 * If we have a link timeout we're going to need the completion_lock
 	 * for finalizing the request, mark us as having grabbed that already.
 	 */
-	if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
-		hash_del(&req->hash_node);
-		io_poll_complete(req, mask, 0);
-		req->flags |= REQ_F_COMP_LOCKED;
-		io_put_req(req);
-		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	if (mask) {
+		unsigned long flags;
 
-		io_cqring_ev_posted(ctx);
-	} else {
-		io_queue_async_work(req);
+		if (llist_empty(&ctx->poll_llist) &&
+		    spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+			hash_del(&req->hash_node);
+			io_poll_complete(req, mask, 0);
+			req->flags |= REQ_F_COMP_LOCKED;
+			io_put_req(req);
+			spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+			io_cqring_ev_posted(ctx);
+			req = NULL;
+		} else {
+			req->result = mask;
+			req->llist_node.next = NULL;
+			/* if the list wasn't empty, we're done */
+			if (!llist_add(&req->llist_node, &ctx->poll_llist))
+				req = NULL;
+			else
+				req->work.func = io_poll_flush;
+		}
 	}
+	if (req)
+		io_queue_async_work(req);
 
 	return 1;
 }
-- 
2.24.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 5/9] io_uring: add non-vectored read/write commands
  2019-12-28 19:21 [PATCHSET 0/9] cleanups, improvements, additions Jens Axboe
                   ` (3 preceding siblings ...)
  2019-12-28 19:21 ` [PATCH 4/9] io_uring: improve poll completion performance Jens Axboe
@ 2019-12-28 19:21 ` Jens Axboe
  2019-12-28 19:21 ` [PATCH 6/9] io_uring: allow use of offset == -1 to mean file position Jens Axboe
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Jens Axboe @ 2019-12-28 19:21 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

For uses cases that don't already naturally have an iovec, it's easier
(or more convenient) to just use a buffer address + length. This is
particular true if the use case is from languages that want to create
a memory safe abstraction on top of io_uring, and where introducing
the need for the iovec may impose an ownership issue. For those cases,
they currently need an indirection buffer, which means allocating data
just for this purpose.

Add basic read/write that don't require the iovec.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 23 +++++++++++++++++++++++
 include/uapi/linux/io_uring.h |  2 ++
 2 files changed, 25 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0ee9115a599d..464ca73f2dd3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -656,6 +656,18 @@ static const struct io_op_def io_op_defs[IORING_OP_LAST] = {
 		.needs_file		= 1,
 		.fd_non_neg		= 1,
 	},
+	{
+		/* IORING_OP_READ */
+		.needs_mm		= 1,
+		.needs_file		= 1,
+		.unbound_nonreg_file	= 1,
+	},
+	{
+		/* IORING_OP_WRITE */
+		.needs_mm		= 1,
+		.needs_file		= 1,
+		.unbound_nonreg_file	= 1,
+	},
 };
 
 static void io_wq_submit_work(struct io_wq_work **workptr);
@@ -1869,6 +1881,13 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 	if (req->rw.kiocb.private)
 		return -EINVAL;
 
+	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
+		ssize_t ret;
+		ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
+		*iovec = NULL;
+		return ret;
+	}
+
 	if (req->io) {
 		struct io_async_rw *iorw = &req->io->rw;
 
@@ -3635,10 +3654,12 @@ static int io_req_defer_prep(struct io_kiocb *req,
 		break;
 	case IORING_OP_READV:
 	case IORING_OP_READ_FIXED:
+	case IORING_OP_READ:
 		ret = io_read_prep(req, sqe, true);
 		break;
 	case IORING_OP_WRITEV:
 	case IORING_OP_WRITE_FIXED:
+	case IORING_OP_WRITE:
 		ret = io_write_prep(req, sqe, true);
 		break;
 	case IORING_OP_POLL_ADD:
@@ -3742,6 +3763,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		break;
 	case IORING_OP_READV:
 	case IORING_OP_READ_FIXED:
+	case IORING_OP_READ:
 		if (sqe) {
 			ret = io_read_prep(req, sqe, force_nonblock);
 			if (ret < 0)
@@ -3751,6 +3773,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		break;
 	case IORING_OP_WRITEV:
 	case IORING_OP_WRITE_FIXED:
+	case IORING_OP_WRITE:
 		if (sqe) {
 			ret = io_write_prep(req, sqe, force_nonblock);
 			if (ret < 0)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 762a63a49af1..03d2dde46152 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -84,6 +84,8 @@ enum {
 	IORING_OP_CLOSE,
 	IORING_OP_FILES_UPDATE,
 	IORING_OP_STATX,
+	IORING_OP_READ,
+	IORING_OP_WRITE,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
-- 
2.24.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 6/9] io_uring: allow use of offset == -1 to mean file position
  2019-12-28 19:21 [PATCHSET 0/9] cleanups, improvements, additions Jens Axboe
                   ` (4 preceding siblings ...)
  2019-12-28 19:21 ` [PATCH 5/9] io_uring: add non-vectored read/write commands Jens Axboe
@ 2019-12-28 19:21 ` Jens Axboe
  2019-12-28 19:21 ` [PATCH 7/9] io_uring: add IORING_OP_FADVISE Jens Axboe
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Jens Axboe @ 2019-12-28 19:21 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe, 李通洲

This behaves like preadv2/pwritev2 with offset == -1, it'll use (and
update) the current file position. This obviously comes with the caveat
that if the application has multiple read/writes in flight, then the
end result will not be as expected. This is similar to threads sharing
a file descriptor and doing IO using the current file position.

Since this feature isn't easily detectable by doing a read or write,
add a feature flags, IORING_FEAT_RW_CUR_POS, to allow applications to
detect presence of this feature.

Reported-by: 李通洲 <carter.li@eoitek.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 11 ++++++++++-
 include/uapi/linux/io_uring.h |  1 +
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 464ca73f2dd3..b24fcb4272be 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -495,6 +495,7 @@ struct io_kiocb {
 #define REQ_F_COMP_LOCKED	32768	/* completion under lock */
 #define REQ_F_HARDLINK		65536	/* doesn't sever on completion < 0 */
 #define REQ_F_FORCE_ASYNC	131072	/* IOSQE_ASYNC */
+#define REQ_F_CUR_POS		262144	/* read/write uses file position */
 	u64			user_data;
 	u32			result;
 	u32			sequence;
@@ -1713,6 +1714,10 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		req->flags |= REQ_F_ISREG;
 
 	kiocb->ki_pos = READ_ONCE(sqe->off);
+	if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
+		req->flags |= REQ_F_CUR_POS;
+		kiocb->ki_pos = req->file->f_pos;
+	}
 	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
 	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
 
@@ -1784,6 +1789,10 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
 static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
 		       bool in_async)
 {
+	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
+
+	if (req->flags & REQ_F_CUR_POS)
+		req->file->f_pos = kiocb->ki_pos;
 	if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
 		*nxt = __io_complete_rw(kiocb, ret);
 	else
@@ -6153,7 +6162,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
 		goto err;
 
 	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
-			IORING_FEAT_SUBMIT_STABLE;
+			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS;
 	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
 	return ret;
 err:
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 03d2dde46152..80f892628e66 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -174,6 +174,7 @@ struct io_uring_params {
 #define IORING_FEAT_SINGLE_MMAP		(1U << 0)
 #define IORING_FEAT_NODROP		(1U << 1)
 #define IORING_FEAT_SUBMIT_STABLE	(1U << 2)
+#define IORING_FEAT_RW_CUR_POS		(1U << 3)
 
 /*
  * io_uring_register(2) opcodes and arguments
-- 
2.24.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 7/9] io_uring: add IORING_OP_FADVISE
  2019-12-28 19:21 [PATCHSET 0/9] cleanups, improvements, additions Jens Axboe
                   ` (5 preceding siblings ...)
  2019-12-28 19:21 ` [PATCH 6/9] io_uring: allow use of offset == -1 to mean file position Jens Axboe
@ 2019-12-28 19:21 ` Jens Axboe
  2019-12-28 19:21 ` [PATCH 8/9] mm: make do_madvise() available internally Jens Axboe
  2019-12-28 19:21 ` [PATCH 9/9] io_uring: add IORING_OP_MADVISE Jens Axboe
  8 siblings, 0 replies; 10+ messages in thread
From: Jens Axboe @ 2019-12-28 19:21 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

This adds support for doing fadvise through io_uring. We assume that
WILLNEED doesn't block, but that DONTNEED may block.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 61 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/io_uring.h |  2 ++
 2 files changed, 63 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b24fcb4272be..8d99b9c5a568 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -72,6 +72,7 @@
 #include <linux/highmem.h>
 #include <linux/namei.h>
 #include <linux/fsnotify.h>
+#include <linux/fadvise.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
@@ -400,6 +401,13 @@ struct io_files_update {
 	u32				offset;
 };
 
+struct io_fadvise {
+	struct file			*file;
+	u64				offset;
+	u32				len;
+	u32				advice;
+};
+
 struct io_async_connect {
 	struct sockaddr_storage		address;
 };
@@ -452,6 +460,7 @@ struct io_kiocb {
 		struct io_open		open;
 		struct io_close		close;
 		struct io_files_update	files_update;
+		struct io_fadvise	fadvise;
 	};
 
 	struct io_async_ctx		*io;
@@ -669,6 +678,10 @@ static const struct io_op_def io_op_defs[IORING_OP_LAST] = {
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
 	},
+	{
+		/* IORING_OP_FADVISE */
+		.needs_file		= 1,
+	},
 };
 
 static void io_wq_submit_work(struct io_wq_work **workptr);
@@ -2433,6 +2446,43 @@ static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt,
 	return 0;
 }
 
+static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+#if defined(CONFIG_ADVISE_SYSCALLS)
+	if (sqe->ioprio || sqe->buf_index || sqe->addr)
+		return -EINVAL;
+
+	req->fadvise.offset = READ_ONCE(sqe->off);
+	req->fadvise.len = READ_ONCE(sqe->len);
+	req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt,
+		      bool force_nonblock)
+{
+#if defined(CONFIG_ADVISE_SYSCALLS)
+	struct io_fadvise *fa = &req->fadvise;
+	int ret;
+
+	/* DONTNEED may block, others _should_ not */
+	if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock)
+		return -EAGAIN;
+
+	ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
+	if (ret < 0)
+		req_set_fail_links(req);
+	io_cqring_add_event(req, ret);
+	io_put_req_find_next(req, nxt);
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	unsigned lookup_flags;
@@ -3722,6 +3772,9 @@ static int io_req_defer_prep(struct io_kiocb *req,
 	case IORING_OP_STATX:
 		ret = io_statx_prep(req, sqe);
 		break;
+	case IORING_OP_FADVISE:
+		ret = io_fadvise_prep(req, sqe);
+		break;
 	default:
 		printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
 				req->opcode);
@@ -3918,6 +3971,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		}
 		ret = io_statx(req, nxt, force_nonblock);
 		break;
+	case IORING_OP_FADVISE:
+		if (sqe) {
+			ret = io_fadvise_prep(req, sqe);
+			if (ret)
+				break;
+		}
+		ret = io_fadvise(req, nxt, force_nonblock);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 80f892628e66..f87d8fb42916 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -36,6 +36,7 @@ struct io_uring_sqe {
 		__u32		cancel_flags;
 		__u32		open_flags;
 		__u32		statx_flags;
+		__u32		fadvise_advice;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	union {
@@ -86,6 +87,7 @@ enum {
 	IORING_OP_STATX,
 	IORING_OP_READ,
 	IORING_OP_WRITE,
+	IORING_OP_FADVISE,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
-- 
2.24.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 8/9] mm: make do_madvise() available internally
  2019-12-28 19:21 [PATCHSET 0/9] cleanups, improvements, additions Jens Axboe
                   ` (6 preceding siblings ...)
  2019-12-28 19:21 ` [PATCH 7/9] io_uring: add IORING_OP_FADVISE Jens Axboe
@ 2019-12-28 19:21 ` Jens Axboe
  2019-12-28 19:21 ` [PATCH 9/9] io_uring: add IORING_OP_MADVISE Jens Axboe
  8 siblings, 0 replies; 10+ messages in thread
From: Jens Axboe @ 2019-12-28 19:21 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

This is in preparation for enabling this functionality through io_uring.
Add a helper that is just exporting what sys_madvise() does, and have the
system call use it.

No functional changes in this patch.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/mm.h | 1 +
 mm/madvise.c       | 7 ++++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c97ea3b694e6..3eb156c3f016 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2328,6 +2328,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
 		       struct list_head *uf, bool downgrade);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t,
 		     struct list_head *uf);
+extern int do_madvise(unsigned long start, size_t len_in, int behavior);
 
 static inline unsigned long
 do_mmap_pgoff(struct file *file, unsigned long addr,
diff --git a/mm/madvise.c b/mm/madvise.c
index bcdb6a042787..43b47d3fae02 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1044,7 +1044,7 @@ madvise_behavior_valid(int behavior)
  *  -EBADF  - map exists, but area maps something that isn't a file.
  *  -EAGAIN - a kernel resource was temporarily unavailable.
  */
-SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
+int do_madvise(unsigned long start, size_t len_in, int behavior)
 {
 	unsigned long end, tmp;
 	struct vm_area_struct *vma, *prev;
@@ -1141,3 +1141,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 
 	return error;
 }
+
+SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
+{
+	return do_madvise(start, len_in, behavior);
+}
-- 
2.24.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 9/9] io_uring: add IORING_OP_MADVISE
  2019-12-28 19:21 [PATCHSET 0/9] cleanups, improvements, additions Jens Axboe
                   ` (7 preceding siblings ...)
  2019-12-28 19:21 ` [PATCH 8/9] mm: make do_madvise() available internally Jens Axboe
@ 2019-12-28 19:21 ` Jens Axboe
  8 siblings, 0 replies; 10+ messages in thread
From: Jens Axboe @ 2019-12-28 19:21 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe

This adds support for doing madvise(2) through io_uring. We assume that
any operation can block, and hence punt everything async. This could be
improved, but hard to make bullet proof. The async punt ensures it's
safe.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 56 ++++++++++++++++++++++++++++++++++-
 include/uapi/linux/io_uring.h |  1 +
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8d99b9c5a568..b1f28037aa74 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -403,7 +403,10 @@ struct io_files_update {
 
 struct io_fadvise {
 	struct file			*file;
-	u64				offset;
+	union {
+		u64			offset;
+		u64			addr;
+	};
 	u32				len;
 	u32				advice;
 };
@@ -682,6 +685,10 @@ static const struct io_op_def io_op_defs[IORING_OP_LAST] = {
 		/* IORING_OP_FADVISE */
 		.needs_file		= 1,
 	},
+	{
+		/* IORING_OP_MADVISE */
+		.needs_mm		= 1,
+	},
 };
 
 static void io_wq_submit_work(struct io_wq_work **workptr);
@@ -2446,6 +2453,42 @@ static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt,
 	return 0;
 }
 
+static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+#if defined(CONFIG_ADVISE_SYSCALLS)
+	if (sqe->ioprio || sqe->buf_index || sqe->off)
+		return -EINVAL;
+
+	req->fadvise.addr = READ_ONCE(sqe->addr);
+	req->fadvise.len = READ_ONCE(sqe->len);
+	req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt,
+		      bool force_nonblock)
+{
+#if defined(CONFIG_ADVISE_SYSCALLS)
+	struct io_fadvise *fa = &req->fadvise;
+	int ret;
+
+	if (force_nonblock)
+		return -EAGAIN;
+
+	ret = do_madvise(fa->addr, fa->len, fa->advice);
+	if (ret < 0)
+		req_set_fail_links(req);
+	io_cqring_add_event(req, ret);
+	io_put_req_find_next(req, nxt);
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 #if defined(CONFIG_ADVISE_SYSCALLS)
@@ -3775,6 +3818,9 @@ static int io_req_defer_prep(struct io_kiocb *req,
 	case IORING_OP_FADVISE:
 		ret = io_fadvise_prep(req, sqe);
 		break;
+	case IORING_OP_MADVISE:
+		ret = io_madvise_prep(req, sqe);
+		break;
 	default:
 		printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
 				req->opcode);
@@ -3979,6 +4025,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		}
 		ret = io_fadvise(req, nxt, force_nonblock);
 		break;
+	case IORING_OP_MADVISE:
+		if (sqe) {
+			ret = io_madvise_prep(req, sqe);
+			if (ret)
+				break;
+		}
+		ret = io_madvise(req, nxt, force_nonblock);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index f87d8fb42916..7cb6fe0fccd7 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -88,6 +88,7 @@ enum {
 	IORING_OP_READ,
 	IORING_OP_WRITE,
 	IORING_OP_FADVISE,
+	IORING_OP_MADVISE,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
-- 
2.24.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, back to index

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-28 19:21 [PATCHSET 0/9] cleanups, improvements, additions Jens Axboe
2019-12-28 19:21 ` [PATCH 1/9] io_uring: remove two unnecessary function declarations Jens Axboe
2019-12-28 19:21 ` [PATCH 2/9] io_uring: add lookup table for various opcode needs Jens Axboe
2019-12-28 19:21 ` [PATCH 3/9] io_uring: split overflow state into SQ and CQ side Jens Axboe
2019-12-28 19:21 ` [PATCH 4/9] io_uring: improve poll completion performance Jens Axboe
2019-12-28 19:21 ` [PATCH 5/9] io_uring: add non-vectored read/write commands Jens Axboe
2019-12-28 19:21 ` [PATCH 6/9] io_uring: allow use of offset == -1 to mean file position Jens Axboe
2019-12-28 19:21 ` [PATCH 7/9] io_uring: add IORING_OP_FADVISE Jens Axboe
2019-12-28 19:21 ` [PATCH 8/9] mm: make do_madvise() available internally Jens Axboe
2019-12-28 19:21 ` [PATCH 9/9] io_uring: add IORING_OP_MADVISE Jens Axboe

IO-Uring Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/io-uring/0 io-uring/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 io-uring io-uring/ https://lore.kernel.org/io-uring \
		io-uring@vger.kernel.org
	public-inbox-index io-uring

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.io-uring


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git