From: Jens Axboe <axboe@kernel.dk>
To: Jann Horn <jannh@google.com>
Cc: linux-aio@kvack.org, linux-block@vger.kernel.org,
	linux-man <linux-man@vger.kernel.org>,
	Linux API <linux-api@vger.kernel.org>,
	hch@lst.de, jmoyer@redhat.com, Avi Kivity <avi@scylladb.com>
Subject: Re: [PATCH 05/18] Add io_uring IO interface
Date: Mon, 28 Jan 2019 20:46:35 -0700
Message-ID: <b5902860-30a9-3dc9-b513-3d892afdc51b@kernel.dk>
In-Reply-To: <CAG48ez1Ms+NPTXPj_UiQyv=2aMaMR3akCdp5SdDL3x7x7gd_ig@mail.gmail.com>

On 1/28/19 7:21 PM, Jann Horn wrote:
> Please create a local copy of the request before parsing it to keep
> the data from changing under you. Additionally, it might make sense to
> annotate every pointer to shared memory with a comment, or something
> like that, to ensure that anyone looking at the code can immediately
> see for which pointers special caution is required on access.

I took a look at the viability of NOT having to make a local copy of
the data, and I don't think it's too bad. A local copy has a noticeable
impact on performance, hence I'd really (REALLY) like to avoid it.
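
To make the tradeoff concrete, here's a rough sketch of the two options
(illustration only, not from the patch below):

	/* Option 1: full local copy. Safe against any concurrent
	 * userspace writes, but the copy is extra work on the
	 * per-request fast path. */
	struct io_uring_sqe local_sqe;

	memcpy(&local_sqe, sqe, sizeof(local_sqe));
	/* ... parse local_sqe, which userspace can't touch ... */

	/* Option 2: snapshot only the fields we depend on, exactly
	 * once each, and never read the shared sqe again. */
	unsigned flags = READ_ONCE(sqe->flags);
	int fd = READ_ONCE(sqe->fd);
	/* ... validate and use 'flags' and 'fd' only ... */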

Here's something on top of the current git branch. I think I even went
a bit too far in some areas, but it should hopefully catch the cases
where we might end up double-evaluating the parts of the sqe that we
depend on. For most of the sqe reads we don't really care too much.
Take sqe->user_data, for instance: if the app changes this field, it
just gets that value passed back in cqe->user_data. That's not a
kernel issue.
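
Where it does matter is the check-then-use pattern. Here's a distilled
version of the hazard, and the fix the diff applies throughout:

	/* Racy: two loads from the shared ring. Userspace can change
	 * sqe->fd between the bounds check and the array index. */
	if (sqe->fd >= ctx->nr_user_files)
		return -EBADF;
	file = ctx->user_files[sqe->fd];

	/* Snapshot once, so the check and the use see the same value. */
	int fd = READ_ONCE(sqe->fd);

	if (fd >= ctx->nr_user_files)
		return -EBADF;
	file = ctx->user_files[fd];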

For cases like the addr/len validation, it should be sound. I'll double
check this in the morning as well; obviously it would need to be folded
in along the way.
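
For reference, the fixed buffer validation boils down to this pattern
(sketch only; note that buf_addr + len is unsigned arithmetic and could
in principle wrap, which may want a separate guard):

	u64 buf_addr = READ_ONCE(sqe->addr);
	size_t len = READ_ONCE(sqe->len);

	/* The range check and the later iov_iter setup both operate on
	 * the same snapshots, so userspace can't swap the values in
	 * between. */
	if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
		return -EFAULT;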

I'd appreciate your opinion on this approach: whether you see any major
issues with it, or whether I missed something.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index e8760ad02e82..64d090300990 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -668,31 +668,35 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct kiocb *kiocb = &req->rw;
-	int ret;
+	unsigned flags, ioprio;
+	int fd, ret;
 
-	if (sqe->flags & IOSQE_FIXED_FILE) {
-		if (unlikely(!ctx->user_files || sqe->fd >= ctx->nr_user_files))
+	flags = READ_ONCE(sqe->flags);
+	fd = READ_ONCE(sqe->fd);
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
 			return -EBADF;
-		kiocb->ki_filp = ctx->user_files[sqe->fd];
+		kiocb->ki_filp = ctx->user_files[fd];
 		req->flags |= REQ_F_FIXED_FILE;
 	} else {
-		kiocb->ki_filp = io_file_get(state, sqe->fd);
+		kiocb->ki_filp = io_file_get(state, fd);
 	}
 	if (unlikely(!kiocb->ki_filp))
 		return -EBADF;
-	kiocb->ki_pos = sqe->off;
+	kiocb->ki_pos = READ_ONCE(sqe->off);
 	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
 	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
-	if (sqe->ioprio) {
-		ret = ioprio_check_cap(sqe->ioprio);
+	ioprio = READ_ONCE(sqe->ioprio);
+	if (ioprio) {
+		ret = ioprio_check_cap(ioprio);
 		if (ret)
 			goto out_fput;
 
-		kiocb->ki_ioprio = sqe->ioprio;
+		kiocb->ki_ioprio = ioprio;
 	} else
 		kiocb->ki_ioprio = get_current_ioprio();
 
-	ret = kiocb_set_rw_flags(kiocb, sqe->rw_flags);
+	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
 	if (unlikely(ret))
 		goto out_fput;
 	if (force_nonblock) {
@@ -716,7 +720,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	}
 	return 0;
 out_fput:
-	if (!(sqe->flags & IOSQE_FIXED_FILE))
+	if (!(flags & IOSQE_FIXED_FILE))
 		io_file_put(state, kiocb->ki_filp);
 	return ret;
 }
@@ -746,28 +750,31 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 			   const struct io_uring_sqe *sqe,
 			   struct iov_iter *iter)
 {
+	size_t len = READ_ONCE(sqe->len);
 	struct io_mapped_ubuf *imu;
-	size_t len = sqe->len;
+	int buf_index, index;
 	size_t offset;
-	int index;
+	u64 buf_addr;
 
 	/* attempt to use fixed buffers without having provided iovecs */
 	if (unlikely(!ctx->user_bufs))
 		return -EFAULT;
-	if (unlikely(sqe->buf_index >= ctx->nr_user_bufs))
+
+	buf_index = READ_ONCE(sqe->buf_index);
+	if (unlikely(buf_index >= ctx->nr_user_bufs))
 		return -EFAULT;
 
-	index = array_index_nospec(sqe->buf_index, ctx->sq_entries);
+	index = array_index_nospec(buf_index, ctx->nr_user_bufs);
 	imu = &ctx->user_bufs[index];
-	if ((unsigned long) sqe->addr < imu->ubuf ||
-	    (unsigned long) sqe->addr + len > imu->ubuf + imu->len)
+	buf_addr = READ_ONCE(sqe->addr);
+	if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
 		return -EFAULT;
 
 	/*
 	 * May not be a start of buffer, set size appropriately
 	 * and advance us to the beginning.
 	 */
-	offset = (unsigned long) sqe->addr - imu->ubuf;
+	offset = buf_addr - imu->ubuf;
 	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
 	if (offset)
 		iov_iter_advance(iter, offset);
@@ -778,10 +785,12 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
 			   const struct io_uring_sqe *sqe,
 			   struct iovec **iovec, struct iov_iter *iter)
 {
-	void __user *buf = u64_to_user_ptr(sqe->addr);
+	void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	int opcode;
 
-	if (sqe->opcode == IORING_OP_READ_FIXED ||
-	    sqe->opcode == IORING_OP_WRITE_FIXED) {
+	opcode = READ_ONCE(sqe->opcode);
+	if (opcode == IORING_OP_READ_FIXED ||
+	    opcode == IORING_OP_WRITE_FIXED) {
 		ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
 		*iovec = NULL;
 		return ret;
@@ -789,11 +798,12 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
 
 #ifdef CONFIG_COMPAT
 	if (in_compat_syscall())
-		return compat_import_iovec(rw, buf, sqe->len, UIO_FASTIOV,
-						iovec, iter);
+		return compat_import_iovec(rw, buf, READ_ONCE(sqe->len),
+						UIO_FASTIOV, iovec, iter);
 #endif
 
-	return import_iovec(rw, buf, sqe->len, UIO_FASTIOV, iovec, iter);
+	return import_iovec(rw, buf, READ_ONCE(sqe->len), UIO_FASTIOV, iovec,
+				iter);
 }
 
 static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
@@ -939,14 +949,14 @@ static ssize_t io_write(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 /*
  * IORING_OP_NOP just posts a completion event, nothing else.
  */
-static int io_nop(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_nop(struct io_kiocb *req, u64 user_data)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 
-	io_cqring_add_event(ctx, sqe->user_data, 0, 0);
+	io_cqring_add_event(ctx, user_data, 0, 0);
 	io_free_req(req);
 	return 0;
 }
@@ -955,9 +965,12 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		    bool force_nonblock)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	loff_t end = sqe->off + sqe->len;
+	loff_t sqe_off = READ_ONCE(sqe->off);
+	loff_t sqe_len = READ_ONCE(sqe->len);
+	loff_t end = sqe_off + sqe_len;
 	struct file *file;
-	int ret;
+	unsigned flags;
+	int ret, fd;
 
 	/* fsync always requires a blocking context */
 	if (force_nonblock)
@@ -970,21 +983,23 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (unlikely(sqe->fsync_flags & ~IORING_FSYNC_DATASYNC))
 		return -EINVAL;
 
-	if (sqe->flags & IOSQE_FIXED_FILE) {
-		if (unlikely(!ctx->user_files || sqe->fd >= ctx->nr_user_files))
+	fd = READ_ONCE(sqe->fd);
+	flags = READ_ONCE(sqe->flags);
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
 			return -EBADF;
-		file = ctx->user_files[sqe->fd];
+		file = ctx->user_files[fd];
 	} else {
-		file = fget(sqe->fd);
+		file = fget(fd);
 	}
 
 	if (unlikely(!file))
 		return -EBADF;
 
-	ret = vfs_fsync_range(file, sqe->off, end > 0 ? end : LLONG_MAX,
+	ret = vfs_fsync_range(file, sqe_off, end > 0 ? end : LLONG_MAX,
 			sqe->fsync_flags & IORING_FSYNC_DATASYNC);
 
-	if (!(sqe->flags & IOSQE_FIXED_FILE))
+	if (!(flags & IOSQE_FIXED_FILE))
 		fput(file);
 
 	io_cqring_add_event(ctx, sqe->user_data, ret, 0);
@@ -1037,7 +1052,7 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	spin_lock_irq(&ctx->completion_lock);
 	list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
-		if (sqe->addr == poll_req->user_data) {
+		if (READ_ONCE(sqe->addr) == poll_req->user_data) {
 			io_poll_remove_one(poll_req);
 			ret = 0;
 			break;
@@ -1145,7 +1160,10 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	struct io_poll_iocb *poll = &req->poll;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_poll_table ipt;
+	unsigned flags;
 	__poll_t mask;
+	u16 events;
+	int fd;
 
 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
@@ -1153,15 +1171,18 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -EINVAL;
 
 	INIT_WORK(&req->work, io_poll_complete_work);
-	poll->events = demangle_poll(sqe->poll_events) | EPOLLERR | EPOLLHUP;
+	events = READ_ONCE(sqe->poll_events);
+	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
 
-	if (sqe->flags & IOSQE_FIXED_FILE) {
-		if (unlikely(!ctx->user_files || sqe->fd >= ctx->nr_user_files))
+	flags = READ_ONCE(sqe->flags);
+	fd = READ_ONCE(sqe->fd);
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files))
 			return -EBADF;
-		poll->file = ctx->user_files[sqe->fd];
+		poll->file = ctx->user_files[fd];
 		req->flags |= REQ_F_FIXED_FILE;
 	} else {
-		poll->file = fget(sqe->fd);
+		poll->file = fget(fd);
 	}
 	if (unlikely(!poll->file))
 		return -EBADF;
@@ -1207,7 +1228,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 out:
 	if (unlikely(ipt.error)) {
-		if (!(sqe->flags & IOSQE_FIXED_FILE))
+		if (!(flags & IOSQE_FIXED_FILE))
 			fput(poll->file);
 		return ipt.error;
 	}
@@ -1224,15 +1245,17 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 {
 	const struct io_uring_sqe *sqe = s->sqe;
 	ssize_t ret;
+	int opcode;
 
 	if (unlikely(s->index >= ctx->sq_entries))
 		return -EINVAL;
-	req->user_data = sqe->user_data;
+	req->user_data = READ_ONCE(sqe->user_data);
 
 	ret = -EINVAL;
-	switch (sqe->opcode) {
+	opcode = READ_ONCE(sqe->opcode);
+	switch (opcode) {
 	case IORING_OP_NOP:
-		ret = io_nop(req, sqe);
+		ret = io_nop(req, req->user_data);
 		break;
 	case IORING_OP_READV:
 		if (unlikely(sqe->buf_index))
@@ -1317,7 +1340,7 @@ static void io_sq_wq_submit_work(struct work_struct *work)
 restart:
 	do {
 		struct sqe_submit *s = &req->submit;
-		u64 user_data = s->sqe->user_data;
+		u64 user_data = READ_ONCE(s->sqe->user_data);
 
 		/* Ensure we clear previously set forced non-block flag */
 		req->flags &= ~REQ_F_FORCE_NONBLOCK;

-- 
Jens Axboe
