* [PATCHSET 0/8] io_uring fixes
From: Jens Axboe @ 2019-03-14 20:44 UTC
  To: linux-fsdevel, linux-block; +Cc: viro

Here's a collection of fixes/changes that should go into this revision
(at least the first 5). The first 5 are based on Al's fixes for aio, the
next two are optimizations that allow us to bring back no-page-refs for
BVEC iterators, and the last one adds the cache hit hint.

Also find the patches here:

http://git.kernel.dk/cgit/linux-block/log/?h=io_uring-next

 block/bio.c                   |  43 ++---
 fs/block_dev.c                |  12 +-
 fs/io_uring.c                 | 388 ++++++++++++++++++------------------------
 fs/iomap.c                    |  12 +-
 include/linux/blk_types.h     |   1 +
 include/linux/uio.h           |  24 ++-
 include/uapi/linux/io_uring.h |   5 +
 7 files changed, 232 insertions(+), 253 deletions(-)

-- 
Jens Axboe




* [PATCH 1/8] io_uring: use regular request ref counts
From: Jens Axboe @ 2019-03-14 20:44 UTC
  To: linux-fsdevel, linux-block; +Cc: viro, Jens Axboe

Get rid of the special casing of "normal" requests not having
any references to the io_kiocb. We initialize the ref count to 2,
one for the submission side and one for the completion side.
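
As a simplified sketch of the resulting lifecycle (condensed from the
hunks below, not the exact code):

	struct io_kiocb *req = io_get_req(ctx, state);	/* refs == 2 */

	/* submission path, once the sqe has been issued: */
	io_put_req(req);		/* drop the submission reference */

	/* completion path, e.g. io_complete_rw(): */
	io_cqring_add_event(req->ctx, req->user_data, res, 0);
	io_put_req(req);		/* count hits zero -> io_free_req() */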

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 48 ++++++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5d99376d2369..c54af70c72fd 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -411,7 +411,8 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 
 	req->ctx = ctx;
 	req->flags = 0;
-	refcount_set(&req->refs, 0);
+	/* one is dropped after submission, the other at completion */
+	refcount_set(&req->refs, 2);
 	return req;
 out:
 	io_ring_drop_ctx_refs(ctx, 1);
@@ -429,10 +430,14 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 
 static void io_free_req(struct io_kiocb *req)
 {
-	if (!refcount_read(&req->refs) || refcount_dec_and_test(&req->refs)) {
-		io_ring_drop_ctx_refs(req->ctx, 1);
-		kmem_cache_free(req_cachep, req);
-	}
+	io_ring_drop_ctx_refs(req->ctx, 1);
+	kmem_cache_free(req_cachep, req);
+}
+
+static void io_put_req(struct io_kiocb *req)
+{
+	if (refcount_dec_and_test(&req->refs))
+		io_free_req(req);
 }
 
 /*
@@ -453,7 +458,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 
 		io_cqring_fill_event(ctx, req->user_data, req->error, 0);
 
-		reqs[to_free++] = req;
+		if (refcount_dec_and_test(&req->refs))
+			reqs[to_free++] = req;
 		(*nr_events)++;
 
 		/*
@@ -616,7 +622,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 
 	io_fput(req);
 	io_cqring_add_event(req->ctx, req->user_data, res, 0);
-	io_free_req(req);
+	io_put_req(req);
 }
 
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -1083,7 +1089,7 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
 		io_fput(req);
 	}
 	io_cqring_add_event(ctx, user_data, err, 0);
-	io_free_req(req);
+	io_put_req(req);
 	return 0;
 }
 
@@ -1146,7 +1152,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 	io_fput(req);
 	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
-	io_free_req(req);
+	io_put_req(req);
 	return 0;
 }
 
@@ -1204,7 +1210,7 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	spin_unlock_irq(&ctx->completion_lock);
 
 	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
-	io_free_req(req);
+	io_put_req(req);
 	return 0;
 }
 
@@ -1212,7 +1218,7 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
 {
 	io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
 	io_fput(req);
-	io_free_req(req);
+	io_put_req(req);
 }
 
 static void io_poll_complete_work(struct work_struct *work)
@@ -1346,9 +1352,6 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	INIT_LIST_HEAD(&poll->wait.entry);
 	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 
-	/* one for removal from waitqueue, one for this function */
-	refcount_set(&req->refs, 2);
-
 	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
 	if (unlikely(!poll->head)) {
 		/* we did not manage to set up a waitqueue, done */
@@ -1380,13 +1383,12 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		 * Drop one of our refs to this req, __io_submit_sqe() will
 		 * drop the other one since we're returning an error.
 		 */
-		io_free_req(req);
+		io_put_req(req);
 		return ipt.error;
 	}
 
 	if (mask)
 		io_poll_complete(req, mask);
-	io_free_req(req);
 	return 0;
 }
 
@@ -1524,10 +1526,13 @@ static void io_sq_wq_submit_work(struct work_struct *work)
 					break;
 				cond_resched();
 			} while (1);
+
+			/* drop submission reference */
+			io_put_req(req);
 		}
 		if (ret) {
 			io_cqring_add_event(ctx, sqe->user_data, ret, 0);
-			io_free_req(req);
+			io_put_req(req);
 		}
 
 		/* async context always use a copy of the sqe */
@@ -1617,6 +1622,7 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
 static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 			 struct io_submit_state *state)
 {
+	bool did_submit = true;
 	struct io_kiocb *req;
 	ssize_t ret;
 
@@ -1650,10 +1656,16 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 				queue_work(ctx->sqo_wq, &req->work);
 			}
 			ret = 0;
+			did_submit = false;
 		}
 	}
+
+	/* drop submission reference, if we did the submit */
+	if (did_submit)
+		io_put_req(req);
+
 	if (ret)
-		io_free_req(req);
+		io_put_req(req);
 
 	return ret;
 }
-- 
2.17.1



* [PATCH 2/8] io_uring: make io_read/write return an integer
From: Jens Axboe @ 2019-03-14 20:44 UTC
  To: linux-fsdevel, linux-block; +Cc: viro, Jens Axboe

The callers all convert to an integer, and we only return 0/-ERROR
anyway.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c54af70c72fd..ecb93bb9d84e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -893,7 +893,7 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
 	opcode = READ_ONCE(sqe->opcode);
 	if (opcode == IORING_OP_READ_FIXED ||
 	    opcode == IORING_OP_WRITE_FIXED) {
-		ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+		int ret = io_import_fixed(ctx, rw, sqe, iter);
 		*iovec = NULL;
 		return ret;
 	}
@@ -951,15 +951,15 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
 	async_list->io_end = io_end;
 }
 
-static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
-		       bool force_nonblock, struct io_submit_state *state)
+static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
+		   bool force_nonblock, struct io_submit_state *state)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw;
 	struct iov_iter iter;
 	struct file *file;
 	size_t iov_count;
-	ssize_t ret;
+	int ret;
 
 	ret = io_prep_rw(req, s, force_nonblock, state);
 	if (ret)
@@ -1004,15 +1004,15 @@ static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
 	return ret;
 }
 
-static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
-			bool force_nonblock, struct io_submit_state *state)
+static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
+		    bool force_nonblock, struct io_submit_state *state)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw;
 	struct iov_iter iter;
 	struct file *file;
 	size_t iov_count;
-	ssize_t ret;
+	int ret;
 
 	ret = io_prep_rw(req, s, force_nonblock, state);
 	if (ret)
@@ -1396,8 +1396,7 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			   const struct sqe_submit *s, bool force_nonblock,
 			   struct io_submit_state *state)
 {
-	ssize_t ret;
-	int opcode;
+	int ret, opcode;
 
 	if (unlikely(s->index >= ctx->sq_entries))
 		return -EINVAL;
@@ -1624,7 +1623,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 {
 	bool did_submit = true;
 	struct io_kiocb *req;
-	ssize_t ret;
+	int ret;
 
 	/* enforce forwards compatibility on users */
 	if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
-- 
2.17.1



* [PATCH 3/8] io_uring: add prepped flag
From: Jens Axboe @ 2019-03-14 20:44 UTC
  To: linux-fsdevel, linux-block; +Cc: viro, Jens Axboe

We currently use the fact that if ->ki_filp is already set, then we've
done the prep. In preparation for moving the file assignment earlier,
use a separate flag to tell whether the request has been prepped for
IO or not.
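
In sketch form, every prep helper now follows the same guard (a
simplified illustration; io_prep_foo() is a made-up name standing in
for io_prep_rw()/io_prep_fsync() below):

	static int io_prep_foo(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	{
		/* For -EAGAIN retry, everything is already prepped */
		if (req->flags & REQ_F_PREPPED)
			return 0;

		/* ... validate sqe fields and set up the request ... */

		req->flags |= REQ_F_PREPPED;
		return 0;
	}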

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ecb93bb9d84e..d8468594a483 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -214,6 +214,7 @@ struct io_kiocb {
 #define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
 #define REQ_F_FIXED_FILE	4	/* ctx owns file */
 #define REQ_F_SEQ_PREV		8	/* sequential with previous */
+#define REQ_F_PREPPED		16	/* prep already done */
 	u64			user_data;
 	u64			error;
 
@@ -741,7 +742,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 	int fd, ret;
 
 	/* For -EAGAIN retry, everything is already prepped */
-	if (kiocb->ki_filp)
+	if (req->flags & REQ_F_PREPPED)
 		return 0;
 
 	flags = READ_ONCE(sqe->flags);
@@ -799,6 +800,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 		}
 		kiocb->ki_complete = io_complete_rw;
 	}
+	req->flags |= REQ_F_PREPPED;
 	return 0;
 out_fput:
 	if (!(flags & IOSQE_FIXED_FILE)) {
@@ -1099,8 +1101,8 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	unsigned flags;
 	int fd;
 
-	/* Prep already done */
-	if (req->rw.ki_filp)
+	/* Prep already done (EAGAIN retry) */
+	if (req->flags & REQ_F_PREPPED)
 		return 0;
 
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
@@ -1122,6 +1124,7 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 			return -EBADF;
 	}
 
+	req->flags |= REQ_F_PREPPED;
 	return 0;
 }
 
@@ -1633,8 +1636,6 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 	if (unlikely(!req))
 		return -EAGAIN;
 
-	req->rw.ki_filp = NULL;
-
 	ret = __io_submit_sqe(ctx, req, s, true, state);
 	if (ret == -EAGAIN) {
 		struct io_uring_sqe *sqe_copy;
-- 
2.17.1



* [PATCH 4/8] io_uring: fix fget/fput handling
From: Jens Axboe @ 2019-03-14 20:44 UTC
  To: linux-fsdevel, linux-block; +Cc: viro, Jens Axboe

This isn't a straight port of commit 84c4e1f89fef for aio.c, since
io_uring doesn't use files in exactly the same way. But it's pretty
close. See the commit message for that commit.

This essentially fixes a use-after-free with the poll command
handling, but it takes a cue from Linus's approach of just simplifying
the file handling. We move the setup of the file into a higher-level
location, so the individual commands don't have to deal with it. We
then release the reference when we free the associated io_kiocb.
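
Condensed from the hunks below, the new model is roughly (a sketch,
with error handling trimmed):

	/* io_submit_sqe(): resolve the file once, up front */
	if (flags & IOSQE_FIXED_FILE) {
		req->file = ctx->user_files[fd];	/* ctx holds the ref */
		req->flags |= REQ_F_FIXED_FILE;
	} else {
		req->file = io_file_get(state, fd);	/* takes a ref */
	}

	/* io_free_req(): the last io_put_req() releases it again */
	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
		fput(req->file);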

Fixes: 221c5eb23382 ("io_uring: add support for IORING_OP_POLL")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 201 +++++++++++++++++---------------------------------
 1 file changed, 67 insertions(+), 134 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d8468594a483..f4fe9dce38ee 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -189,6 +189,10 @@ struct sqe_submit {
 	bool				needs_fixed_file;
 };
 
+/*
+ * First field must be the file pointer in all the
+ * iocb unions! See also 'struct kiocb' in <linux/fs.h>
+ */
 struct io_poll_iocb {
 	struct file			*file;
 	struct wait_queue_head		*head;
@@ -198,8 +202,15 @@ struct io_poll_iocb {
 	struct wait_queue_entry		wait;
 };
 
+/*
+ * NOTE! Each of the iocb union members has the file pointer
+ * as the first entry in their struct definition. So you can
+ * access the file pointer through any of the sub-structs,
+ * or directly as just 'ki_filp' in this struct.
+ */
 struct io_kiocb {
 	union {
+		struct file		*file;
 		struct kiocb		rw;
 		struct io_poll_iocb	poll;
 	};
@@ -429,8 +440,16 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 	}
 }
 
+static void io_fput(struct io_kiocb *req)
+{
+	if (!(req->flags & REQ_F_FIXED_FILE))
+		fput(req->file);
+}
+
 static void io_free_req(struct io_kiocb *req)
 {
+	if (req->file)
+		io_fput(req);
 	io_ring_drop_ctx_refs(req->ctx, 1);
 	kmem_cache_free(req_cachep, req);
 }
@@ -448,45 +467,34 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 			       struct list_head *done)
 {
 	void *reqs[IO_IOPOLL_BATCH];
-	int file_count, to_free;
-	struct file *file = NULL;
 	struct io_kiocb *req;
+	int to_free;
 
-	file_count = to_free = 0;
+	to_free = 0;
 	while (!list_empty(done)) {
 		req = list_first_entry(done, struct io_kiocb, list);
 		list_del(&req->list);
 
 		io_cqring_fill_event(ctx, req->user_data, req->error, 0);
-
-		if (refcount_dec_and_test(&req->refs))
-			reqs[to_free++] = req;
 		(*nr_events)++;
 
-		/*
-		 * Batched puts of the same file, to avoid dirtying the
-		 * file usage count multiple times, if avoidable.
-		 */
-		if (!(req->flags & REQ_F_FIXED_FILE)) {
-			if (!file) {
-				file = req->rw.ki_filp;
-				file_count = 1;
-			} else if (file == req->rw.ki_filp) {
-				file_count++;
+		if (refcount_dec_and_test(&req->refs)) {
+			/* If we're not using fixed files, we have to pair the
+			 * completion part with the file put. Use regular
+			 * completions for those, only batch free for fixed
+			 * file.
+			 */
+			if (req->flags & REQ_F_FIXED_FILE) {
+				reqs[to_free++] = req;
+				if (to_free == ARRAY_SIZE(reqs))
+					io_free_req_many(ctx, reqs, &to_free);
 			} else {
-				fput_many(file, file_count);
-				file = req->rw.ki_filp;
-				file_count = 1;
+				io_free_req(req);
 			}
 		}
-
-		if (to_free == ARRAY_SIZE(reqs))
-			io_free_req_many(ctx, reqs, &to_free);
 	}
-	io_commit_cqring(ctx);
 
-	if (file)
-		fput_many(file, file_count);
+	io_commit_cqring(ctx);
 	io_free_req_many(ctx, reqs, &to_free);
 }
 
@@ -609,19 +617,12 @@ static void kiocb_end_write(struct kiocb *kiocb)
 	}
 }
 
-static void io_fput(struct io_kiocb *req)
-{
-	if (!(req->flags & REQ_F_FIXED_FILE))
-		fput(req->rw.ki_filp);
-}
-
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 
 	kiocb_end_write(kiocb);
 
-	io_fput(req);
 	io_cqring_add_event(req->ctx, req->user_data, res, 0);
 	io_put_req(req);
 }
@@ -738,31 +739,16 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 	const struct io_uring_sqe *sqe = s->sqe;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct kiocb *kiocb = &req->rw;
-	unsigned ioprio, flags;
-	int fd, ret;
+	unsigned ioprio;
+	int ret;
 
 	/* For -EAGAIN retry, everything is already prepped */
 	if (req->flags & REQ_F_PREPPED)
 		return 0;
 
-	flags = READ_ONCE(sqe->flags);
-	fd = READ_ONCE(sqe->fd);
+	if (force_nonblock && !io_file_supports_async(req->file))
+		force_nonblock = false;
 
-	if (flags & IOSQE_FIXED_FILE) {
-		if (unlikely(!ctx->user_files ||
-		    (unsigned) fd >= ctx->nr_user_files))
-			return -EBADF;
-		kiocb->ki_filp = ctx->user_files[fd];
-		req->flags |= REQ_F_FIXED_FILE;
-	} else {
-		if (s->needs_fixed_file)
-			return -EBADF;
-		kiocb->ki_filp = io_file_get(state, fd);
-		if (unlikely(!kiocb->ki_filp))
-			return -EBADF;
-		if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
-			force_nonblock = false;
-	}
 	kiocb->ki_pos = READ_ONCE(sqe->off);
 	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
 	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
@@ -771,7 +757,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 	if (ioprio) {
 		ret = ioprio_check_cap(ioprio);
 		if (ret)
-			goto out_fput;
+			return ret;
 
 		kiocb->ki_ioprio = ioprio;
 	} else
@@ -779,39 +765,26 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 
 	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
 	if (unlikely(ret))
-		goto out_fput;
+		return ret;
 	if (force_nonblock) {
 		kiocb->ki_flags |= IOCB_NOWAIT;
 		req->flags |= REQ_F_FORCE_NONBLOCK;
 	}
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
-		ret = -EOPNOTSUPP;
 		if (!(kiocb->ki_flags & IOCB_DIRECT) ||
 		    !kiocb->ki_filp->f_op->iopoll)
-			goto out_fput;
+			return -EOPNOTSUPP;
 
 		req->error = 0;
 		kiocb->ki_flags |= IOCB_HIPRI;
 		kiocb->ki_complete = io_complete_rw_iopoll;
 	} else {
-		if (kiocb->ki_flags & IOCB_HIPRI) {
-			ret = -EINVAL;
-			goto out_fput;
-		}
+		if (kiocb->ki_flags & IOCB_HIPRI)
+			return -EINVAL;
 		kiocb->ki_complete = io_complete_rw;
 	}
 	req->flags |= REQ_F_PREPPED;
 	return 0;
-out_fput:
-	if (!(flags & IOSQE_FIXED_FILE)) {
-		/*
-		 * in case of error, we didn't use this file reference. drop it.
-		 */
-		if (state)
-			state->used_refs--;
-		io_file_put(state, kiocb->ki_filp);
-	}
-	return ret;
 }
 
 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
@@ -968,16 +941,14 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 		return ret;
 	file = kiocb->ki_filp;
 
-	ret = -EBADF;
 	if (unlikely(!(file->f_mode & FMODE_READ)))
-		goto out_fput;
-	ret = -EINVAL;
+		return -EBADF;
 	if (unlikely(!file->f_op->read_iter))
-		goto out_fput;
+		return -EINVAL;
 
 	ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
 	if (ret)
-		goto out_fput;
+		return ret;
 
 	iov_count = iov_iter_count(&iter);
 	ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
@@ -999,10 +970,6 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 		}
 	}
 	kfree(iovec);
-out_fput:
-	/* Hold on to the file for -EAGAIN */
-	if (unlikely(ret && ret != -EAGAIN))
-		io_fput(req);
 	return ret;
 }
 
@@ -1020,17 +987,15 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	if (ret)
 		return ret;
 
-	ret = -EBADF;
 	file = kiocb->ki_filp;
 	if (unlikely(!(file->f_mode & FMODE_WRITE)))
-		goto out_fput;
-	ret = -EINVAL;
+		return -EBADF;
 	if (unlikely(!file->f_op->write_iter))
-		goto out_fput;
+		return -EINVAL;
 
 	ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
 	if (ret)
-		goto out_fput;
+		return ret;
 
 	iov_count = iov_iter_count(&iter);
 
@@ -1062,10 +1027,6 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	}
 out_free:
 	kfree(iovec);
-out_fput:
-	/* Hold on to the file for -EAGAIN */
-	if (unlikely(ret && ret != -EAGAIN))
-		io_fput(req);
 	return ret;
 }
 
@@ -1080,16 +1041,6 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 
-	/*
-	 * Twilight zone - it's possible that someone issued an opcode that
-	 * has a file attached, then got -EAGAIN on submission, and changed
-	 * the sqe before we retried it from async context. Avoid dropping
-	 * a file reference for this malicious case, and flag the error.
-	 */
-	if (req->rw.ki_filp) {
-		err = -EBADF;
-		io_fput(req);
-	}
 	io_cqring_add_event(ctx, user_data, err, 0);
 	io_put_req(req);
 	return 0;
@@ -1098,8 +1049,6 @@ static int io_nop(struct io_kiocb *req, u64 user_data)
 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	unsigned flags;
-	int fd;
 
 	/* Prep already done (EAGAIN retry) */
 	if (req->flags & REQ_F_PREPPED)
@@ -1110,20 +1059,6 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
 		return -EINVAL;
 
-	fd = READ_ONCE(sqe->fd);
-	flags = READ_ONCE(sqe->flags);
-
-	if (flags & IOSQE_FIXED_FILE) {
-		if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
-			return -EBADF;
-		req->rw.ki_filp = ctx->user_files[fd];
-		req->flags |= REQ_F_FIXED_FILE;
-	} else {
-		req->rw.ki_filp = fget(fd);
-		if (unlikely(!req->rw.ki_filp))
-			return -EBADF;
-	}
-
 	req->flags |= REQ_F_PREPPED;
 	return 0;
 }
@@ -1153,7 +1088,6 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 				end > 0 ? end : LLONG_MAX,
 				fsync_flags & IORING_FSYNC_DATASYNC);
 
-	io_fput(req);
 	io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
 	io_put_req(req);
 	return 0;
@@ -1220,7 +1154,6 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
 {
 	io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
-	io_fput(req);
 	io_put_req(req);
 }
 
@@ -1314,10 +1247,8 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	struct io_poll_iocb *poll = &req->poll;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_poll_table ipt;
-	unsigned flags;
 	__poll_t mask;
 	u16 events;
-	int fd;
 
 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
@@ -1328,20 +1259,6 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	events = READ_ONCE(sqe->poll_events);
 	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
 
-	flags = READ_ONCE(sqe->flags);
-	fd = READ_ONCE(sqe->fd);
-
-	if (flags & IOSQE_FIXED_FILE) {
-		if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
-			return -EBADF;
-		poll->file = ctx->user_files[fd];
-		req->flags |= REQ_F_FIXED_FILE;
-	} else {
-		poll->file = fget(fd);
-	}
-	if (unlikely(!poll->file))
-		return -EBADF;
-
 	poll->head = NULL;
 	poll->woken = false;
 	poll->canceled = false;
@@ -1380,8 +1297,6 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 out:
 	if (unlikely(ipt.error)) {
-		if (!(flags & IOSQE_FIXED_FILE))
-			fput(poll->file);
 		/*
 		 * Drop one of our refs to this req, __io_submit_sqe() will
 		 * drop the other one since we're returning an error.
@@ -1626,7 +1541,8 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 {
 	bool did_submit = true;
 	struct io_kiocb *req;
-	int ret;
+	unsigned flags;
+	int ret, fd;
 
 	/* enforce forwards compatibility on users */
 	if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
@@ -1636,6 +1552,23 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 	if (unlikely(!req))
 		return -EAGAIN;
 
+	flags = READ_ONCE(s->sqe->flags);
+	fd = READ_ONCE(s->sqe->fd);
+
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files ||
+		    (unsigned) fd >= ctx->nr_user_files))
+			return -EBADF;
+		req->file = ctx->user_files[fd];
+		req->flags |= REQ_F_FIXED_FILE;
+	} else {
+		if (s->needs_fixed_file)
+			return -EBADF;
+		req->file = io_file_get(state, fd);
+		if (unlikely(!req->file))
+			return -EBADF;
+	}
+
 	ret = __io_submit_sqe(ctx, req, s, true, state);
 	if (ret == -EAGAIN) {
 		struct io_uring_sqe *sqe_copy;
-- 
2.17.1



* [PATCH 5/8] io_uring: fix poll races
From: Jens Axboe @ 2019-03-14 20:44 UTC
  To: linux-fsdevel, linux-block; +Cc: viro, Jens Axboe

This is a straight port of Al's fix for the aio poll implementation,
since the io_uring version is heavily based on that. The below
description is almost straight from that patch, just modified to
fit the io_uring situation.

io_poll_add() has to cope with several unpleasant problems:
	* requests that might stay around indefinitely need to
be made visible for poll removal (IORING_OP_POLL_REMOVE); that must
not be done to a request already completed, though.
	* in cases where ->poll() has placed us on a waitqueue, the
wakeup might have happened (and the request completed) before ->poll()
returns.
	* worse, in some early wakeup cases the request might end
up re-added to the queue later - we can't treat "woken up and
currently not in the queue" as "it's not going to stick around
indefinitely"
	* ... moreover, ->poll() might have decided not to
put it on any queues to start with, and that needs to be distinguished
from the previous case
	* ->poll() might have tried to put us on more than one queue.
Only the first will succeed for io_uring poll, so we might end up
missing wakeups.  OTOH, we might very well notice that only after the
wakeup hits and the request gets completed (all before ->poll() gets
around to the second poll_wait()).  In that case it's too late to
decide that we have an error.

req->woken was an attempt to deal with that.  Unfortunately, it was
broken.  What we need to keep track of is not that a wakeup has
happened - the thing might come back after that.  It's that the async
reference is already gone and won't come back, so we can't (and
needn't) put the request on the list of cancellables.

The easiest case is "request hadn't been put on any waitqueues"; we
can tell by seeing a NULL poll->head, and in that case there won't be
anything async.  We should either complete the request ourselves
(if vfs_poll() reports anything of interest) or return an error.

In all other cases we get exclusion with wakeups by grabbing the
queue lock.

If the request is currently on the queue and we have something
interesting from vfs_poll(), we can steal it and complete the request
ourselves.

If it's on the queue and vfs_poll() has not reported anything
interesting, we either put it on the cancellable list, or, if we know
that it hadn't been put on all queues ->poll() wanted it on, we steal
it and return an error.

If it's _not_ on the queue, it's either already been dealt with (in
which case we do nothing), or there's io_poll_complete_work() about to
be executed.  In that case we either put it on the cancellable list,
or, if we know it hadn't been put on all queues ->poll() wanted it on,
simulate what cancel would've done.
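
As a compact summary of the above (a sketch of the decisions made in
the rewritten io_poll_add() below, after vfs_poll() returns and with
ctx->completion_lock held):

	/*
	 *  poll->head == NULL        -> never queued: complete inline if
	 *                               mask is set, else return the error
	 *  queued, mask or error     -> steal the entry and finish inline
	 *  already off the queue     -> async ref is gone; if ->poll()
	 *                               also failed, simulate a cancel
	 *  still queued, nothing yet -> add to ctx->cancel_list so poll
	 *                               removal can find the request
	 */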

Fixes: 221c5eb23382 ("io_uring: add support for IORING_OP_POLL")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 110 +++++++++++++++++++++++++-------------------------
 1 file changed, 54 insertions(+), 56 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f4fe9dce38ee..46cf38b8d863 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -197,7 +197,7 @@ struct io_poll_iocb {
 	struct file			*file;
 	struct wait_queue_head		*head;
 	__poll_t			events;
-	bool				woken;
+	bool				done;
 	bool				canceled;
 	struct wait_queue_entry		wait;
 };
@@ -367,20 +367,25 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 	}
 }
 
-static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data,
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+	if (waitqueue_active(&ctx->wait))
+		wake_up(&ctx->wait);
+	if (waitqueue_active(&ctx->sqo_wait))
+		wake_up(&ctx->sqo_wait);
+}
+
+static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
 				long res, unsigned ev_flags)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&ctx->completion_lock, flags);
-	io_cqring_fill_event(ctx, ki_user_data, res, ev_flags);
+	io_cqring_fill_event(ctx, user_data, res, ev_flags);
 	io_commit_cqring(ctx);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-	if (waitqueue_active(&ctx->wait))
-		wake_up(&ctx->wait);
-	if (waitqueue_active(&ctx->sqo_wait))
-		wake_up(&ctx->sqo_wait);
+	io_cqring_ev_posted(ctx);
 }
 
 static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
@@ -1151,10 +1156,13 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			     __poll_t mask)
 {
-	io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0);
-	io_put_req(req);
+	req->poll.done = true;
+	io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask), 0);
+	io_commit_cqring(ctx);
+	io_cqring_ev_posted(ctx);
 }
 
 static void io_poll_complete_work(struct work_struct *work)
@@ -1182,9 +1190,10 @@ static void io_poll_complete_work(struct work_struct *work)
 		return;
 	}
 	list_del_init(&req->list);
+	io_poll_complete(ctx, req, mask);
 	spin_unlock_irq(&ctx->completion_lock);
 
-	io_poll_complete(req, mask);
+	io_put_req(req);
 }
 
 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -1195,29 +1204,23 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
 	struct io_ring_ctx *ctx = req->ctx;
 	__poll_t mask = key_to_poll(key);
-
-	poll->woken = true;
+	unsigned long flags;
 
 	/* for instances that support it check for an event match first: */
-	if (mask) {
-		unsigned long flags;
-
-		if (!(mask & poll->events))
-			return 0;
+	if (mask && !(mask & poll->events))
+		return 0;
 
-		/* try to complete the iocb inline if we can: */
-		if (spin_trylock_irqsave(&ctx->completion_lock, flags)) {
-			list_del(&req->list);
-			spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	list_del_init(&poll->wait.entry);
 
-			list_del_init(&poll->wait.entry);
-			io_poll_complete(req, mask);
-			return 1;
-		}
+	if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+		list_del(&req->list);
+		io_poll_complete(ctx, req, mask);
+		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+		io_put_req(req);
+	} else {
+		queue_work(ctx->sqo_wq, &req->work);
 	}
 
-	list_del_init(&poll->wait.entry);
-	queue_work(ctx->sqo_wq, &req->work);
 	return 1;
 }
 
@@ -1247,6 +1250,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	struct io_poll_iocb *poll = &req->poll;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_poll_table ipt;
+	bool cancel = false;
 	__poll_t mask;
 	u16 events;
 
@@ -1260,7 +1264,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
 
 	poll->head = NULL;
-	poll->woken = false;
+	poll->done = false;
 	poll->canceled = false;
 
 	ipt.pt._qproc = io_poll_queue_proc;
@@ -1273,41 +1277,35 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 
 	mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
-	if (unlikely(!poll->head)) {
-		/* we did not manage to set up a waitqueue, done */
-		goto out;
-	}
 
 	spin_lock_irq(&ctx->completion_lock);
-	spin_lock(&poll->head->lock);
-	if (poll->woken) {
-		/* wake_up context handles the rest */
-		mask = 0;
+	if (likely(poll->head)) {
+		spin_lock(&poll->head->lock);
+		if (unlikely(list_empty(&poll->wait.entry))) {
+			if (ipt.error)
+				cancel = true;
+			ipt.error = 0;
+			mask = 0;
+		}
+		if (mask || ipt.error)
+			list_del_init(&poll->wait.entry);
+		else if (cancel)
+			WRITE_ONCE(poll->canceled, true);
+		else if (!poll->done) /* actually waiting for an event */
+			list_add_tail(&req->list, &ctx->cancel_list);
+		spin_unlock(&poll->head->lock);
+	}
+	if (mask) { /* no async, we'd stolen it */
+		req->error = mangle_poll(mask);
 		ipt.error = 0;
-	} else if (mask || ipt.error) {
-		/* if we get an error or a mask we are done */
-		WARN_ON_ONCE(list_empty(&poll->wait.entry));
-		list_del_init(&poll->wait.entry);
-	} else {
-		/* actually waiting for an event */
-		list_add_tail(&req->list, &ctx->cancel_list);
 	}
-	spin_unlock(&poll->head->lock);
 	spin_unlock_irq(&ctx->completion_lock);
 
-out:
-	if (unlikely(ipt.error)) {
-		/*
-		 * Drop one of our refs to this req, __io_submit_sqe() will
-		 * drop the other one since we're returning an error.
-		 */
+	if (mask) {
+		io_poll_complete(ctx, req, mask);
 		io_put_req(req);
-		return ipt.error;
 	}
-
-	if (mask)
-		io_poll_complete(req, mask);
-	return 0;
+	return ipt.error;
 }
 
 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
-- 
2.17.1



* [PATCH 6/8] iov_iter: add ITER_BVEC_FLAG_NO_REF flag
From: Jens Axboe @ 2019-03-14 20:44 UTC
  To: linux-fsdevel, linux-block; +Cc: viro, Jens Axboe

For ITER_BVEC, if we're holding on to kernel pages, the caller
doesn't need to grab a reference to the bvec pages and drop that
same reference on IO completion. This is essentially safe for any
ITER_BVEC, but some use cases end up reusing pages and unconditionally
dropping a page reference on completion. An example of that is
sendfile(2), which ends up being a splice_in + splice_out on the
pipe pages.

Add a flag that tells us it's fine to not grab a page reference
to the bvec pages, since that caller knows not to drop a reference
when it's done with the pages.
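
A minimal sketch of a caller opting in (condensed from the
io_import_fixed() hunk below; the bvec pages are assumed to be pinned
by the caller for the duration of the IO):

	struct iov_iter iter;

	iov_iter_bvec(&iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
	/* we hold our own pins on these pages, don't take/drop extra refs */
	iter.type |= ITER_BVEC_FLAG_NO_REF;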

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c       |  3 +++
 include/linux/uio.h | 24 +++++++++++++++++++-----
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 46cf38b8d863..b89f91ab6e32 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -850,6 +850,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
 	if (offset)
 		iov_iter_advance(iter, offset);
+
+	/* don't drop a reference to these pages */
+	iter->type |= ITER_BVEC_FLAG_NO_REF;
 	return 0;
 }
 
diff --git a/include/linux/uio.h b/include/linux/uio.h
index ecf584f6b82d..4e926641fa80 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -23,14 +23,23 @@ struct kvec {
 };
 
 enum iter_type {
-	ITER_IOVEC = 0,
-	ITER_KVEC = 2,
-	ITER_BVEC = 4,
-	ITER_PIPE = 8,
-	ITER_DISCARD = 16,
+	/* set if ITER_BVEC doesn't hold a bv_page ref */
+	ITER_BVEC_FLAG_NO_REF = 2,
+
+	/* iter types */
+	ITER_IOVEC = 4,
+	ITER_KVEC = 8,
+	ITER_BVEC = 16,
+	ITER_PIPE = 32,
+	ITER_DISCARD = 64,
 };
 
 struct iov_iter {
+	/*
+	 * Bit 0 is the read/write bit, set if we're writing.
+	 * Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and
+	 * the caller isn't expecting to drop a page reference when done.
+	 */
 	unsigned int type;
 	size_t iov_offset;
 	size_t count;
@@ -84,6 +93,11 @@ static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 	return i->type & (READ | WRITE);
 }
 
+static inline bool iov_iter_bvec_no_ref(const struct iov_iter *i)
+{
+	return (i->type & ITER_BVEC_FLAG_NO_REF) != 0;
+}
+
 /*
  * Total number of bytes covered by an iovec.
  *
-- 
2.17.1



* [PATCH 7/8] block: add BIO_NO_PAGE_REF flag
From: Jens Axboe @ 2019-03-14 20:44 UTC
  To: linux-fsdevel, linux-block; +Cc: viro, Jens Axboe

If bio_iov_iter_get_pages() is called on an iov_iter that is flagged
with NO_REF, then we don't need to add a page reference for the pages
that we add.

Add BIO_NO_PAGE_REF to track this in the bio, so IO completion knows
not to drop a reference to these pages.
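
The resulting pairing, in sketch form (condensed from the hunks
below):

	/* submission side: propagate the iter flag into the bio */
	if (iov_iter_is_bvec(iter) && iov_iter_bvec_no_ref(iter))
		bio_set_flag(bio, BIO_NO_PAGE_REF);

	/* completion side: only drop page references we actually took */
	if (!bio_flagged(bio, BIO_NO_PAGE_REF))
		bio_release_pages(bio);
	bio_put(bio);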

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c               | 43 ++++++++++++++++++++++-----------------
 fs/block_dev.c            | 12 ++++++-----
 fs/iomap.c                | 12 ++++++-----
 include/linux/blk_types.h |  1 +
 4 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 71a78d9fb8b7..b64cedc7f87c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -849,20 +849,14 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
 	size = bio_add_page(bio, bv->bv_page, len,
 				bv->bv_offset + iter->iov_offset);
 	if (size == len) {
-		struct page *page;
-		int i;
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+			struct page *page;
+			int i;
+
+			mp_bvec_for_each_page(page, bv, i)
+				get_page(page);
+		}
 
-		/*
-		 * For the normal O_DIRECT case, we could skip grabbing this
-		 * reference and then not have to put them again when IO
-		 * completes. But this breaks some in-kernel users, like
-		 * splicing to/from a loop device, where we release the pipe
-		 * pages unconditionally. If we can fix that case, we can
-		 * get rid of the get here and the need to call
-		 * bio_release_pages() at IO completion time.
-		 */
-		mp_bvec_for_each_page(page, bv, i)
-			get_page(page);
 		iov_iter_advance(iter, size);
 		return 0;
 	}
@@ -925,10 +919,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
  * This takes either an iterator pointing to user memory, or one pointing to
  * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
  * map them into the kernel. On IO completion, the caller should put those
- * pages. For now, when adding kernel pages, we still grab a reference to the
- * page. This isn't strictly needed for the common case, but some call paths
- * end up releasing pages from eg a pipe and we can't easily control these.
- * See comment in __bio_iov_bvec_add_pages().
+ * pages. If we're adding kernel pages, and the caller told us it's safe to
+ * do so, we just have to add the pages to the bio directly. We don't grab an
+ * extra reference to those pages (the user should already have that), and we
+ * don't put the page on IO completion. The caller needs to check if the bio is
+ * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be
+ * released.
  *
  * The function tries, but does not guarantee, to pin as many pages as
  * fit into the bio, or are requested in *iter, whatever is smaller. If
@@ -940,6 +936,13 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	const bool is_bvec = iov_iter_is_bvec(iter);
 	unsigned short orig_vcnt = bio->bi_vcnt;
 
+	/*
+	 * If this is a BVEC iter, then the pages are kernel pages. Don't
+	 * release them on IO completion, if the caller asked us to.
+	 */
+	if (is_bvec && iov_iter_bvec_no_ref(iter))
+		bio_set_flag(bio, BIO_NO_PAGE_REF);
+
 	do {
 		int ret;
 
@@ -1696,7 +1699,8 @@ static void bio_dirty_fn(struct work_struct *work)
 		next = bio->bi_private;
 
 		bio_set_pages_dirty(bio);
-		bio_release_pages(bio);
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+			bio_release_pages(bio);
 		bio_put(bio);
 	}
 }
@@ -1713,7 +1717,8 @@ void bio_check_pages_dirty(struct bio *bio)
 			goto defer;
 	}
 
-	bio_release_pages(bio);
+	if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+		bio_release_pages(bio);
 	bio_put(bio);
 	return;
 defer:
diff --git a/fs/block_dev.c b/fs/block_dev.c
index e9faa52bb489..78d3257435c0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -336,12 +336,14 @@ static void blkdev_bio_end_io(struct bio *bio)
 	if (should_dirty) {
 		bio_check_pages_dirty(bio);
 	} else {
-		struct bio_vec *bvec;
-		int i;
-		struct bvec_iter_all iter_all;
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+			struct bvec_iter_all iter_all;
+			struct bio_vec *bvec;
+			int i;
 
-		bio_for_each_segment_all(bvec, bio, i, iter_all)
-			put_page(bvec->bv_page);
+			bio_for_each_segment_all(bvec, bio, i, iter_all)
+				put_page(bvec->bv_page);
+		}
 		bio_put(bio);
 	}
 }
diff --git a/fs/iomap.c b/fs/iomap.c
index 97cb9d486a7d..abdd18e404f8 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1589,12 +1589,14 @@ static void iomap_dio_bio_end_io(struct bio *bio)
 	if (should_dirty) {
 		bio_check_pages_dirty(bio);
 	} else {
-		struct bio_vec *bvec;
-		int i;
-		struct bvec_iter_all iter_all;
+		if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
+			struct bvec_iter_all iter_all;
+			struct bio_vec *bvec;
+			int i;
 
-		bio_for_each_segment_all(bvec, bio, i, iter_all)
-			put_page(bvec->bv_page);
+			bio_for_each_segment_all(bvec, bio, i, iter_all)
+				put_page(bvec->bv_page);
+		}
 		bio_put(bio);
 	}
 }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d66bf5f32610..791fee35df88 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -215,6 +215,7 @@ struct bio {
 /*
  * bio flags
  */
+#define BIO_NO_PAGE_REF	0	/* don't put/release bvec pages */
 #define BIO_SEG_VALID	1	/* bi_phys_segments valid */
 #define BIO_CLONED	2	/* doesn't own data */
 #define BIO_BOUNCED	3	/* bio is a bounce bio */
-- 
2.17.1



* [PATCH 8/8] io_uring: add io_uring_event cache hit information
From: Jens Axboe @ 2019-03-14 20:44 UTC
  To: linux-fsdevel, linux-block; +Cc: viro, Jens Axboe

Add a hint on whether a read was served out of the page cache, or
whether it hit the media. This is useful for buffered async IO;
O_DIRECT reads would never have this set (for obvious reasons).

If the read hit the page cache, cqe->flags will have IOCQE_FLAG_CACHEHIT
set.
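
On the application side, checking the hint might look like this (a
sketch assuming liburing-style helpers; cache_hits is just a local
counter for illustration):

	struct io_uring_cqe *cqe;

	io_uring_wait_cqe(&ring, &cqe);
	if (cqe->res > 0 && (cqe->flags & IOCQE_FLAG_CACHEHIT))
		cache_hits++;	/* read was served from the page cache */
	io_uring_cqe_seen(&ring, cqe);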

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 6 +++++-
 include/uapi/linux/io_uring.h | 5 +++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b89f91ab6e32..3437f538b377 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -625,10 +625,14 @@ static void kiocb_end_write(struct kiocb *kiocb)
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+	unsigned ev_flags = 0;
 
 	kiocb_end_write(kiocb);
 
-	io_cqring_add_event(req->ctx, req->user_data, res, 0);
+	if (res > 0 && (req->flags & REQ_F_FORCE_NONBLOCK))
+		ev_flags = IOCQE_FLAG_CACHEHIT;
+
+	io_cqring_add_event(req->ctx, req->user_data, res, ev_flags);
 	io_put_req(req);
 }
 
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index e23408692118..24906e99fdc7 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -69,6 +69,11 @@ struct io_uring_cqe {
 	__u32	flags;
 };
 
+/*
+ * io_uring_event->flags
+ */
+#define IOCQE_FLAG_CACHEHIT	(1U << 0)	/* IO did not hit media */
+
 /*
  * Magic offsets for the application to mmap the data it needs
  */
-- 
2.17.1

