IO-Uring Archive on lore.kernel.org
 help / color / Atom feed
* [PATCHSET 0/10] io_uring items for 5.6
@ 2019-12-13 18:36 Jens Axboe
  2019-12-13 18:36 ` [PATCH 01/10] io_uring: add support for fallocate() Jens Axboe
                   ` (9 more replies)
  0 siblings, 10 replies; 18+ messages in thread
From: Jens Axboe @ 2019-12-13 18:36 UTC (permalink / raw)
  To: io-uring; +Cc: linux-fsdevel

Mixed bag of stuff here, highlights:

- Support for fallocate(2)
- Support for openat(2)
- Support for close(2)
- Much faster file updates/registration
- A few cleanups

Against the io_uring-5.5 branch that I shipped off to Linus yesterday
and can also be found in my for-5.6/io_uring branch.

 drivers/android/binder.c      |   6 +-
 fs/file.c                     |   2 +-
 fs/internal.h                 |   1 +
 fs/io-wq.c                    |   8 +-
 fs/io-wq.h                    |   1 +
 fs/io_uring.c                 | 704 ++++++++++++++++++++++++++--------
 fs/namei.c                    |   2 +
 fs/open.c                     |   2 +-
 include/linux/namei.h         |   1 +
 include/uapi/linux/io_uring.h |   5 +
 10 files changed, 573 insertions(+), 159 deletions(-)

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 01/10] io_uring: add support for fallocate()
  2019-12-13 18:36 [PATCHSET 0/10] io_uring items for 5.6 Jens Axboe
@ 2019-12-13 18:36 ` Jens Axboe
  2019-12-13 18:36 ` [PATCH 02/10] io_uring: remove 'sqe' parameter to the OP helpers that take it Jens Axboe
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2019-12-13 18:36 UTC (permalink / raw)
  To: io-uring; +Cc: linux-fsdevel, Jens Axboe

This exposes fallocate(2) through io_uring.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 29 +++++++++++++++++++++++++++++
 include/uapi/linux/io_uring.h |  1 +
 2 files changed, 30 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9b1833fedc5c..d780477b9a56 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1974,6 +1974,32 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
+static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt,
+			bool force_nonblock)
+{
+	const struct io_uring_sqe *sqe = req->sqe;
+	loff_t offset, len;
+	int ret, mode;
+
+	if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
+		return -EINVAL;
+
+	/* fallocate always requiring blocking context */
+	if (force_nonblock)
+		return -EAGAIN;
+
+	offset = READ_ONCE(sqe->off);
+	len = READ_ONCE(sqe->addr);
+	mode = READ_ONCE(sqe->len);
+
+	ret = vfs_fallocate(req->file, mode, offset, len);
+	if (ret < 0)
+		req_set_fail_links(req);
+	io_cqring_add_event(req, ret);
+	io_put_req_find_next(req, nxt);
+	return 0;
+}
+
 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -2983,6 +3009,9 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
 	case IORING_OP_ASYNC_CANCEL:
 		ret = io_async_cancel(req, req->sqe, nxt);
 		break;
+	case IORING_OP_FALLOCATE:
+		ret = io_fallocate(req, nxt, force_nonblock);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index a3300e1b9a01..bdbe2b130179 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -76,6 +76,7 @@ enum {
 	IORING_OP_ASYNC_CANCEL,
 	IORING_OP_LINK_TIMEOUT,
 	IORING_OP_CONNECT,
+	IORING_OP_FALLOCATE,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
-- 
2.24.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 02/10] io_uring: remove 'sqe' parameter to the OP helpers that take it
  2019-12-13 18:36 [PATCHSET 0/10] io_uring items for 5.6 Jens Axboe
  2019-12-13 18:36 ` [PATCH 01/10] io_uring: add support for fallocate() Jens Axboe
@ 2019-12-13 18:36 ` Jens Axboe
  2019-12-13 18:36 ` [PATCH 03/10] fs: add namei support for doing a non-blocking path lookup Jens Axboe
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2019-12-13 18:36 UTC (permalink / raw)
  To: io-uring; +Cc: linux-fsdevel, Jens Axboe

We pass in req->sqe for all of them, no need to pass it in as the
request is always passed in.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 70 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 38 insertions(+), 32 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d780477b9a56..93a967cf3f9f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1942,9 +1942,10 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-		    struct io_kiocb **nxt, bool force_nonblock)
+static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt,
+		    bool force_nonblock)
 {
+	const struct io_uring_sqe *sqe = req->sqe;
 	loff_t sqe_off = READ_ONCE(sqe->off);
 	loff_t sqe_len = READ_ONCE(sqe->len);
 	loff_t end = sqe_off + sqe_len;
@@ -2016,11 +2017,10 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return ret;
 }
 
-static int io_sync_file_range(struct io_kiocb *req,
-			      const struct io_uring_sqe *sqe,
-			      struct io_kiocb **nxt,
+static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
 			      bool force_nonblock)
 {
+	const struct io_uring_sqe *sqe = req->sqe;
 	loff_t sqe_off;
 	loff_t sqe_len;
 	unsigned flags;
@@ -2063,10 +2063,11 @@ static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io)
 #endif
 }
 
-static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-		      struct io_kiocb **nxt, bool force_nonblock)
+static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
+		      bool force_nonblock)
 {
 #if defined(CONFIG_NET)
+	const struct io_uring_sqe *sqe = req->sqe;
 	struct socket *sock;
 	int ret;
 
@@ -2142,10 +2143,11 @@ static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io)
 #endif
 }
 
-static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-		      struct io_kiocb **nxt, bool force_nonblock)
+static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
+		      bool force_nonblock)
 {
 #if defined(CONFIG_NET)
+	const struct io_uring_sqe *sqe = req->sqe;
 	struct socket *sock;
 	int ret;
 
@@ -2207,10 +2209,11 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 #endif
 }
 
-static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-		     struct io_kiocb **nxt, bool force_nonblock)
+static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
+		     bool force_nonblock)
 {
 #if defined(CONFIG_NET)
+	const struct io_uring_sqe *sqe = req->sqe;
 	struct sockaddr __user *addr;
 	int __user *addr_len;
 	unsigned file_flags;
@@ -2258,10 +2261,11 @@ static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io)
 #endif
 }
 
-static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-		      struct io_kiocb **nxt, bool force_nonblock)
+static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt,
+		      bool force_nonblock)
 {
 #if defined(CONFIG_NET)
+	const struct io_uring_sqe *sqe = req->sqe;
 	struct io_async_ctx __io, *io;
 	unsigned file_flags;
 	int addr_len, ret;
@@ -2361,8 +2365,9 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
  * Find a running poll command that matches one specified in sqe->addr,
  * and remove it if found.
  */
-static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_poll_remove(struct io_kiocb *req)
 {
+	const struct io_uring_sqe *sqe = req->sqe;
 	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
 
@@ -2508,9 +2513,9 @@ static void io_poll_req_insert(struct io_kiocb *req)
 	hlist_add_head(&req->hash_node, list);
 }
 
-static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-		       struct io_kiocb **nxt)
+static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
 {
+	const struct io_uring_sqe *sqe = req->sqe;
 	struct io_poll_iocb *poll = &req->poll;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_poll_table ipt;
@@ -2648,9 +2653,9 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
 /*
  * Remove or update an existing timeout command
  */
-static int io_timeout_remove(struct io_kiocb *req,
-			     const struct io_uring_sqe *sqe)
+static int io_timeout_remove(struct io_kiocb *req)
 {
+	const struct io_uring_sqe *sqe = req->sqe;
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned flags;
 	int ret;
@@ -2710,8 +2715,9 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io,
 	return 0;
 }
 
-static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_timeout(struct io_kiocb *req)
 {
+	const struct io_uring_sqe *sqe = req->sqe;
 	unsigned count;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_timeout_data *data;
@@ -2857,9 +2863,9 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
 	io_put_req_find_next(req, nxt);
 }
 
-static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-			   struct io_kiocb **nxt)
+static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt)
 {
+	const struct io_uring_sqe *sqe = req->sqe;
 	struct io_ring_ctx *ctx = req->ctx;
 
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
@@ -2977,37 +2983,37 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
 		ret = io_write(req, nxt, force_nonblock);
 		break;
 	case IORING_OP_FSYNC:
-		ret = io_fsync(req, req->sqe, nxt, force_nonblock);
+		ret = io_fsync(req, nxt, force_nonblock);
 		break;
 	case IORING_OP_POLL_ADD:
-		ret = io_poll_add(req, req->sqe, nxt);
+		ret = io_poll_add(req, nxt);
 		break;
 	case IORING_OP_POLL_REMOVE:
-		ret = io_poll_remove(req, req->sqe);
+		ret = io_poll_remove(req);
 		break;
 	case IORING_OP_SYNC_FILE_RANGE:
-		ret = io_sync_file_range(req, req->sqe, nxt, force_nonblock);
+		ret = io_sync_file_range(req, nxt, force_nonblock);
 		break;
 	case IORING_OP_SENDMSG:
-		ret = io_sendmsg(req, req->sqe, nxt, force_nonblock);
+		ret = io_sendmsg(req, nxt, force_nonblock);
 		break;
 	case IORING_OP_RECVMSG:
-		ret = io_recvmsg(req, req->sqe, nxt, force_nonblock);
+		ret = io_recvmsg(req, nxt, force_nonblock);
 		break;
 	case IORING_OP_TIMEOUT:
-		ret = io_timeout(req, req->sqe);
+		ret = io_timeout(req);
 		break;
 	case IORING_OP_TIMEOUT_REMOVE:
-		ret = io_timeout_remove(req, req->sqe);
+		ret = io_timeout_remove(req);
 		break;
 	case IORING_OP_ACCEPT:
-		ret = io_accept(req, req->sqe, nxt, force_nonblock);
+		ret = io_accept(req, nxt, force_nonblock);
 		break;
 	case IORING_OP_CONNECT:
-		ret = io_connect(req, req->sqe, nxt, force_nonblock);
+		ret = io_connect(req, nxt, force_nonblock);
 		break;
 	case IORING_OP_ASYNC_CANCEL:
-		ret = io_async_cancel(req, req->sqe, nxt);
+		ret = io_async_cancel(req, nxt);
 		break;
 	case IORING_OP_FALLOCATE:
 		ret = io_fallocate(req, nxt, force_nonblock);
-- 
2.24.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 03/10] fs: add namei support for doing a non-blocking path lookup
  2019-12-13 18:36 [PATCHSET 0/10] io_uring items for 5.6 Jens Axboe
  2019-12-13 18:36 ` [PATCH 01/10] io_uring: add support for fallocate() Jens Axboe
  2019-12-13 18:36 ` [PATCH 02/10] io_uring: remove 'sqe' parameter to the OP helpers that take it Jens Axboe
@ 2019-12-13 18:36 ` Jens Axboe
  2019-12-27  0:42   ` Al Viro
  2019-12-13 18:36 ` [PATCH 04/10] fs: make build_open_flags() available internally Jens Axboe
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2019-12-13 18:36 UTC (permalink / raw)
  To: io-uring; +Cc: linux-fsdevel, Jens Axboe

If the fast lookup fails, then return -EAGAIN to have the caller retry
the path lookup. This is in preparation for supporting non-blocking
open.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/namei.c            | 2 ++
 include/linux/namei.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/fs/namei.c b/fs/namei.c
index 2dda552bcf7a..50899721f699 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1801,6 +1801,8 @@ static int walk_component(struct nameidata *nd, int flags)
 	if (unlikely(err <= 0)) {
 		if (err < 0)
 			return err;
+		if (nd->flags & LOOKUP_NONBLOCK)
+			return -EAGAIN;
 		path.dentry = lookup_slow(&nd->last, nd->path.dentry,
 					  nd->flags);
 		if (IS_ERR(path.dentry))
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 397a08ade6a2..a50ad21e3457 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -38,6 +38,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_JUMPED		0x1000
 #define LOOKUP_ROOT		0x2000
 #define LOOKUP_ROOT_GRABBED	0x0008
+#define LOOKUP_NONBLOCK		0x10000	/* don't block for lookup */
 
 extern int path_pts(struct path *path);
 
-- 
2.24.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 04/10] fs: make build_open_flags() available internally
  2019-12-13 18:36 [PATCHSET 0/10] io_uring items for 5.6 Jens Axboe
                   ` (2 preceding siblings ...)
  2019-12-13 18:36 ` [PATCH 03/10] fs: add namei support for doing a non-blocking path lookup Jens Axboe
@ 2019-12-13 18:36 ` Jens Axboe
  2019-12-13 18:36 ` [PATCH 05/10] io_uring: add support for IORING_OP_OPENAT Jens Axboe
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2019-12-13 18:36 UTC (permalink / raw)
  To: io-uring; +Cc: linux-fsdevel, Jens Axboe

This is a prep patch for supporting non-blocking open from io_uring.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/internal.h | 1 +
 fs/open.c     | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/internal.h b/fs/internal.h
index 315fcd8d237c..001f17815984 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -124,6 +124,7 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname,
 		const struct open_flags *op);
 extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
 		const char *, const struct open_flags *);
+extern int build_open_flags(int flags, umode_t mode, struct open_flags *op);
 
 long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
 long do_faccessat(int dfd, const char __user *filename, int mode);
diff --git a/fs/open.c b/fs/open.c
index b62f5c0923a8..24cb5d58bbda 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -955,7 +955,7 @@ struct file *open_with_fake_path(const struct path *path, int flags,
 }
 EXPORT_SYMBOL(open_with_fake_path);
 
-static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
+inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
 {
 	int lookup_flags = 0;
 	int acc_mode = ACC_MODE(flags);
-- 
2.24.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 05/10] io_uring: add support for IORING_OP_OPENAT
  2019-12-13 18:36 [PATCHSET 0/10] io_uring items for 5.6 Jens Axboe
                   ` (3 preceding siblings ...)
  2019-12-13 18:36 ` [PATCH 04/10] fs: make build_open_flags() available internally Jens Axboe
@ 2019-12-13 18:36 ` Jens Axboe
  2019-12-13 18:36 ` [PATCH 06/10] fs: move filp_close() outside of __close_fd_get_file() Jens Axboe
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2019-12-13 18:36 UTC (permalink / raw)
  To: io-uring; +Cc: linux-fsdevel, Jens Axboe

This works just like openat(2), except it can be performed async. For
the normal case of a non-blocking path lookup this will complete
inline. If we have to do IO to perform the open, it'll be done from
async context.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 106 +++++++++++++++++++++++++++++++++-
 include/uapi/linux/io_uring.h |   2 +
 2 files changed, 105 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 93a967cf3f9f..db79ac79d80e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -70,6 +70,8 @@
 #include <linux/sizes.h>
 #include <linux/hugetlb.h>
 #include <linux/highmem.h>
+#include <linux/namei.h>
+#include <linux/fsnotify.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
@@ -322,6 +324,10 @@ struct io_async_rw {
 	ssize_t				size;
 };
 
+struct io_async_open {
+	struct filename			*filename;
+};
+
 struct io_async_ctx {
 	struct io_uring_sqe		sqe;
 	union {
@@ -329,6 +335,7 @@ struct io_async_ctx {
 		struct io_async_msghdr	msg;
 		struct io_async_connect	connect;
 		struct io_timeout_data	timeout;
+		struct io_async_open	open;
 	};
 };
 
@@ -879,8 +886,11 @@ static void __io_free_req(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
-	if (req->io)
+	if (req->io) {
+		if (req->io->sqe.opcode == IORING_OP_OPENAT)
+			putname(req->io->open.filename);
 		kfree(req->io);
+	}
 	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
 		fput(req->file);
 	if (req->flags & REQ_F_INFLIGHT) {
@@ -2001,6 +2011,88 @@ static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt,
 	return 0;
 }
 
+static int io_openat_prep(struct io_kiocb *req, struct io_async_ctx *io)
+{
+	const struct io_uring_sqe *sqe = req->sqe;
+	const char __user *fname;
+	int ret;
+
+	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	io->open.filename = getname(fname);
+	if (!IS_ERR(io->open.filename))
+		return 0;
+
+	ret = PTR_ERR(io->open.filename);
+	io->open.filename = NULL;
+	return ret;
+}
+
+static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt,
+		     bool force_nonblock)
+{
+	const struct io_uring_sqe *sqe = req->sqe;
+	struct filename *filename;
+	const char __user *fname;
+	struct open_flags op;
+	int flags, ret, dfd;
+	struct file *file;
+	umode_t mode;
+
+	if (sqe->ioprio || sqe->buf_index)
+		return -EINVAL;
+
+	dfd = READ_ONCE(sqe->fd);
+	mode = READ_ONCE(sqe->len);
+	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	flags = READ_ONCE(sqe->open_flags);
+
+	ret = build_open_flags(flags, mode, &op);
+	if (ret)
+		goto err;
+	if (force_nonblock)
+		op.lookup_flags |= LOOKUP_NONBLOCK;
+	if (req->io) {
+		filename = req->io->open.filename;
+	} else {
+		filename = getname(fname);
+		if (IS_ERR(filename)) {
+			ret = PTR_ERR(filename);
+			goto err;
+		}
+	}
+
+	ret = get_unused_fd_flags(flags);
+	if (ret < 0)
+		goto err;
+
+	file = do_filp_open(dfd, filename, &op);
+	if (IS_ERR(file)) {
+		put_unused_fd(ret);
+		ret = PTR_ERR(file);
+		if (ret == -EAGAIN) {
+			req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
+			if (!req->io) {
+				ret = -ENOMEM;
+				goto err;
+			}
+			req->io->open.filename = filename;
+			req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
+			return -EAGAIN;
+		}
+		putname(filename);
+	} else {
+		fsnotify_open(file);
+		fd_install(ret, file);
+		putname(filename);
+	}
+err:
+	if (ret < 0)
+		req_set_fail_links(req);
+	io_cqring_add_event(req, ret);
+	io_put_req_find_next(req, nxt);
+	return 0;
+}
+
 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -2909,6 +3001,9 @@ static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io)
 		return io_timeout_prep(req, io, false);
 	case IORING_OP_LINK_TIMEOUT:
 		return io_timeout_prep(req, io, true);
+	case IORING_OP_OPENAT:
+		ret = io_openat_prep(req, io);
+		break;
 	default:
 		req->io = io;
 		return 0;
@@ -3018,6 +3113,9 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
 	case IORING_OP_FALLOCATE:
 		ret = io_fallocate(req, nxt, force_nonblock);
 		break;
+	case IORING_OP_OPENAT:
+		ret = io_openat(req, nxt, force_nonblock);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -3102,7 +3200,7 @@ static bool io_req_op_valid(int op)
 	return op >= IORING_OP_NOP && op < IORING_OP_LAST;
 }
 
-static int io_op_needs_file(const struct io_uring_sqe *sqe)
+static int io_op_needs_file(const struct io_uring_sqe *sqe, int fd)
 {
 	int op = READ_ONCE(sqe->opcode);
 
@@ -3114,6 +3212,8 @@ static int io_op_needs_file(const struct io_uring_sqe *sqe)
 	case IORING_OP_ASYNC_CANCEL:
 	case IORING_OP_LINK_TIMEOUT:
 		return 0;
+	case IORING_OP_OPENAT:
+		return fd != -1;
 	default:
 		if (io_req_op_valid(op))
 			return 1;
@@ -3142,7 +3242,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
 	if (flags & IOSQE_IO_DRAIN)
 		req->flags |= REQ_F_IO_DRAIN;
 
-	ret = io_op_needs_file(req->sqe);
+	ret = io_op_needs_file(req->sqe, fd);
 	if (ret <= 0)
 		return ret;
 
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index bdbe2b130179..02af580754ce 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -34,6 +34,7 @@ struct io_uring_sqe {
 		__u32		timeout_flags;
 		__u32		accept_flags;
 		__u32		cancel_flags;
+		__u32		open_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	union {
@@ -77,6 +78,7 @@ enum {
 	IORING_OP_LINK_TIMEOUT,
 	IORING_OP_CONNECT,
 	IORING_OP_FALLOCATE,
+	IORING_OP_OPENAT,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
-- 
2.24.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 06/10] fs: move filp_close() outside of __close_fd_get_file()
  2019-12-13 18:36 [PATCHSET 0/10] io_uring items for 5.6 Jens Axboe
                   ` (4 preceding siblings ...)
  2019-12-13 18:36 ` [PATCH 05/10] io_uring: add support for IORING_OP_OPENAT Jens Axboe
@ 2019-12-13 18:36 ` Jens Axboe
  2019-12-16 19:27   ` Jann Horn
  2019-12-13 18:36 ` [PATCH 07/10] io-wq: add support for uncancellable work Jens Axboe
                   ` (3 subsequent siblings)
  9 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2019-12-13 18:36 UTC (permalink / raw)
  To: io-uring; +Cc: linux-fsdevel, Jens Axboe

Just one caller of this, and just use filp_close() there manually.
This is important to allow async close/removal of the fd.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/android/binder.c | 6 ++++--
 fs/file.c                | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index e9bc9fcc7ea5..e8b435870d6b 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2249,10 +2249,12 @@ static void binder_deferred_fd_close(int fd)
 		return;
 	init_task_work(&twcb->twork, binder_do_fd_close);
 	__close_fd_get_file(fd, &twcb->file);
-	if (twcb->file)
+	if (twcb->file) {
+		filp_close(twcb->file, current->files);
 		task_work_add(current, &twcb->twork, true);
-	else
+	} else {
 		kfree(twcb);
+	}
 }
 
 static void binder_transaction_buffer_release(struct binder_proc *proc,
diff --git a/fs/file.c b/fs/file.c
index 3da91a112bab..a250d291c71b 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -662,7 +662,7 @@ int __close_fd_get_file(unsigned int fd, struct file **res)
 	spin_unlock(&files->file_lock);
 	get_file(file);
 	*res = file;
-	return filp_close(file, files);
+	return 0;
 
 out_unlock:
 	spin_unlock(&files->file_lock);
-- 
2.24.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 07/10] io-wq: add support for uncancellable work
  2019-12-13 18:36 [PATCHSET 0/10] io_uring items for 5.6 Jens Axboe
                   ` (5 preceding siblings ...)
  2019-12-13 18:36 ` [PATCH 06/10] fs: move filp_close() outside of __close_fd_get_file() Jens Axboe
@ 2019-12-13 18:36 ` Jens Axboe
  2019-12-13 18:36 ` [PATCH 08/10] io_uring: add support for IORING_OP_CLOSE Jens Axboe
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2019-12-13 18:36 UTC (permalink / raw)
  To: io-uring; +Cc: linux-fsdevel, Jens Axboe

Not all work can be cancelled, some of it we may need to guarantee
that it runs to completion. Allow the caller to set IO_WQ_WORK_NO_CANCEL
on work that must not be cancelled. Note that the caller work function
must also check for IO_WQ_WORK_NO_CANCEL on work that is marked
IO_WQ_WORK_CANCEL.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c    | 8 +++++++-
 fs/io-wq.h    | 1 +
 fs/io_uring.c | 5 ++++-
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 90c4978781fb..d0303ad17347 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -453,6 +453,10 @@ static void io_worker_handle_work(struct io_worker *worker)
 		}
 		if (!worker->creds)
 			worker->creds = override_creds(wq->creds);
+		/*
+		 * OK to set IO_WQ_WORK_CANCEL even for uncancellable work,
+		 * the worker function will do the right thing.
+		 */
 		if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
 			work->flags |= IO_WQ_WORK_CANCEL;
 		if (worker->mm)
@@ -829,6 +833,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
 	 */
 	spin_lock_irqsave(&worker->lock, flags);
 	if (worker->cur_work &&
+	    !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
 	    data->cancel(worker->cur_work, data->caller_data)) {
 		send_sig(SIGINT, worker->task, 1);
 		ret = true;
@@ -903,7 +908,8 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 		return false;
 
 	spin_lock_irqsave(&worker->lock, flags);
-	if (worker->cur_work == work) {
+	if (worker->cur_work == work &&
+	    !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) {
 		send_sig(SIGINT, worker->task, 1);
 		ret = true;
 	}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index fb993b2bd0ef..f0a016c4ee9c 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -12,6 +12,7 @@ enum {
 	IO_WQ_WORK_UNBOUND	= 32,
 	IO_WQ_WORK_INTERNAL	= 64,
 	IO_WQ_WORK_CB		= 128,
+	IO_WQ_WORK_NO_CANCEL	= 256,
 
 	IO_WQ_HASH_SHIFT	= 24,	/* upper 8 bits are used for hash key */
 };
diff --git a/fs/io_uring.c b/fs/io_uring.c
index db79ac79d80e..132f887ef18d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3153,8 +3153,11 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 	/* Ensure we clear previously set non-block flag */
 	req->rw.ki_flags &= ~IOCB_NOWAIT;
 
-	if (work->flags & IO_WQ_WORK_CANCEL)
+	/* if NO_CANCEL is set, we must still run the work */
+	if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
+				IO_WQ_WORK_CANCEL) {
 		ret = -ECANCELED;
+	}
 
 	if (!ret) {
 		req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
-- 
2.24.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 08/10] io_uring: add support for IORING_OP_CLOSE
  2019-12-13 18:36 [PATCHSET 0/10] io_uring items for 5.6 Jens Axboe
                   ` (6 preceding siblings ...)
  2019-12-13 18:36 ` [PATCH 07/10] io-wq: add support for uncancellable work Jens Axboe
@ 2019-12-13 18:36 ` Jens Axboe
  2019-12-13 18:36 ` [PATCH 09/10] io_uring: use u64_to_user_ptr() consistently Jens Axboe
  2019-12-13 18:36 ` [PATCH 10/10] io_uring: avoid ring quiesce for fixed file set unregister and update Jens Axboe
  9 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2019-12-13 18:36 UTC (permalink / raw)
  To: io-uring; +Cc: linux-fsdevel, Jens Axboe

This works just like close(2), unsurprisingly. We remove the file
descriptor and post the completion inline, then offload the actual
(potential) last file put to async context.

Mark the async part of this work as uncancellable, as we really must
guarantee that the latter part of the close is run.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 70 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/io_uring.h |  1 +
 2 files changed, 71 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 132f887ef18d..927f28112f0e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -298,6 +298,11 @@ struct io_poll_iocb {
 	struct wait_queue_entry		wait;
 };
 
+struct io_close {
+	struct file			*file;
+	struct file			*put_file;
+};
+
 struct io_timeout_data {
 	struct io_kiocb			*req;
 	struct hrtimer			timer;
@@ -350,6 +355,7 @@ struct io_kiocb {
 		struct file		*file;
 		struct kiocb		rw;
 		struct io_poll_iocb	poll;
+		struct io_close		close;
 	};
 
 	const struct io_uring_sqe	*sqe;
@@ -2093,6 +2099,64 @@ static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt,
 	return 0;
 }
 
+static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
+		    bool force_nonblock)
+{
+	const struct io_uring_sqe *sqe = req->sqe;
+	int ret, fd;
+
+	if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
+	    sqe->rw_flags || sqe->buf_index)
+		return -EINVAL;
+
+	fd = READ_ONCE(sqe->fd);
+	if (req->file->f_op == &io_uring_fops || fd == req->ring_fd)
+		return -EBADF;
+
+	/*
+	 * If we queue this for async, it must not be cancellable. That would
+	 * leave the 'file' in an undeterminate state.
+	 */
+	req->work.flags |= IO_WQ_WORK_NO_CANCEL;
+
+	ret = 0;
+	if (force_nonblock) {
+		req->close.put_file = NULL;
+		ret = __close_fd_get_file(fd, &req->close.put_file);
+		if (ret < 0)
+			return ret;
+
+		/* if the file has a flush method, be safe and punt to async */
+		if (req->close.put_file->f_op->flush) {
+			req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
+			return -EAGAIN;
+		}
+
+		/*
+		 * No ->flush(), safely close from here and just punt the
+		 * fput() to async context.
+		 */
+		ret = filp_close(req->close.put_file, current->files);
+		if (ret < 0)
+			req_set_fail_links(req);
+
+		io_cqring_add_event(req, ret);
+		return -EAGAIN;
+	} else {
+		/* Invoked with files, we need to do the close */
+		if (req->work.files) {
+			ret = filp_close(req->close.put_file, req->work.files);
+			if (ret < 0)
+				req_set_fail_links(req);
+			io_cqring_add_event(req, ret);
+		}
+		fput(req->close.put_file);
+	}
+
+	io_put_req_find_next(req, nxt);
+	return ret;
+}
+
 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -3116,6 +3180,9 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
 	case IORING_OP_OPENAT:
 		ret = io_openat(req, nxt, force_nonblock);
 		break;
+	case IORING_OP_CLOSE:
+		ret = io_close(req, nxt, force_nonblock);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -3275,6 +3342,9 @@ static int io_grab_files(struct io_kiocb *req)
 	int ret = -EBADF;
 	struct io_ring_ctx *ctx = req->ctx;
 
+	if (!req->ring_file)
+		return -EBADF;
+
 	rcu_read_lock();
 	spin_lock_irq(&ctx->inflight_lock);
 	/*
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 02af580754ce..42a7f0e8dee3 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -79,6 +79,7 @@ enum {
 	IORING_OP_CONNECT,
 	IORING_OP_FALLOCATE,
 	IORING_OP_OPENAT,
+	IORING_OP_CLOSE,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
-- 
2.24.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 09/10] io_uring: use u64_to_user_ptr() consistently
  2019-12-13 18:36 [PATCHSET 0/10] io_uring items for 5.6 Jens Axboe
                   ` (7 preceding siblings ...)
  2019-12-13 18:36 ` [PATCH 08/10] io_uring: add support for IORING_OP_CLOSE Jens Axboe
@ 2019-12-13 18:36 ` Jens Axboe
  2019-12-13 18:36 ` [PATCH 10/10] io_uring: avoid ring quiesce for fixed file set unregister and update Jens Axboe
  9 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2019-12-13 18:36 UTC (permalink / raw)
  To: io-uring; +Cc: linux-fsdevel, Jens Axboe

We use it in some spots, but not consistently. Convert the rest over,
makes it easier to read as well.

No functional changes in this patch.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 927f28112f0e..6f169b5a6494 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2211,7 +2211,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io)
 	unsigned flags;
 
 	flags = READ_ONCE(sqe->msg_flags);
-	msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr);
+	msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	io->msg.iov = io->msg.fast_iov;
 	return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov);
 #else
@@ -2290,7 +2290,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io)
 	unsigned flags;
 
 	flags = READ_ONCE(sqe->msg_flags);
-	msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr);
+	msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	io->msg.iov = io->msg.fast_iov;
 	return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr,
 					&io->msg.iov);
@@ -2324,8 +2324,7 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
 		else if (force_nonblock)
 			flags |= MSG_DONTWAIT;
 
-		msg = (struct user_msghdr __user *) (unsigned long)
-			READ_ONCE(sqe->addr);
+		msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 		if (req->io) {
 			kmsg = &req->io->msg.msg;
 			kmsg->msg_name = &addr;
@@ -2380,8 +2379,8 @@ static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
 	if (sqe->ioprio || sqe->len || sqe->buf_index)
 		return -EINVAL;
 
-	addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
-	addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2);
+	addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 	flags = READ_ONCE(sqe->accept_flags);
 	file_flags = force_nonblock ? O_NONBLOCK : 0;
 
@@ -2409,7 +2408,7 @@ static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io)
 	struct sockaddr __user *addr;
 	int addr_len;
 
-	addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
+	addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	addr_len = READ_ONCE(sqe->addr2);
 	return move_addr_to_kernel(addr, addr_len, &io->connect.address);
 #else
@@ -4654,7 +4653,7 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
 		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
 			return -EFAULT;
 
-		dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
+		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
 		dst->iov_len = ciov.iov_len;
 		return 0;
 	}
-- 
2.24.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 10/10] io_uring: avoid ring quiesce for fixed file set unregister and update
  2019-12-13 18:36 [PATCHSET 0/10] io_uring items for 5.6 Jens Axboe
                   ` (8 preceding siblings ...)
  2019-12-13 18:36 ` [PATCH 09/10] io_uring: use u64_to_user_ptr() consistently Jens Axboe
@ 2019-12-13 18:36 ` Jens Axboe
  9 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2019-12-13 18:36 UTC (permalink / raw)
  To: io-uring; +Cc: linux-fsdevel, Jens Axboe

We currently fully quiesce the ring before an unregister or update of
the fixed fileset. This is very expensive, and we can be a bit smarter
about this.

Add a percpu refcount for the file tables as a whole. Grab a percpu ref
when we use a registered file, and put it on completion. This is cheap
to do. Upon removal of a file from a set, switch the ref count to atomic
mode. When we hit zero ref on the completion side, then we know we can
drop the previously registered files. When the old files have been
dropped, switch the ref back to percpu mode for normal operation.

Since there's a period between doing the update and the kernel being
done with it, add a IORING_OP_FILES_UPDATE opcode that can perform the
same action. The application knows the update has completed when it gets
the CQE for it. Between doing the update and receiving this completion,
the application must continue to use the unregistered fd if submitting
IO on this particular file.

This takes the runtime of test/file-register from liburing from 14s to
about 0.7s.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 409 +++++++++++++++++++++++++---------
 include/uapi/linux/io_uring.h |   1 +
 2 files changed, 300 insertions(+), 110 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6f169b5a6494..81219a631a6d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -179,6 +179,21 @@ struct fixed_file_table {
 	struct file		**files;
 };
 
+enum {
+	FFD_F_ATOMIC,
+};
+
+struct fixed_file_data {
+	struct fixed_file_table		*table;
+	struct io_ring_ctx		*ctx;
+
+	struct percpu_ref		refs;
+	struct llist_head		put_llist;
+	unsigned long			state;
+	struct work_struct		ref_work;
+	struct completion		done;
+};
+
 struct io_ring_ctx {
 	struct {
 		struct percpu_ref	refs;
@@ -231,7 +246,7 @@ struct io_ring_ctx {
 	 * readers must ensure that ->refs is alive as long as the file* is
 	 * used. Only updated through io_uring_register(2).
 	 */
-	struct fixed_file_table	*file_table;
+	struct fixed_file_data	*file_data;
 	unsigned		nr_user_files;
 
 	/* if used, fixed mapped user buffers */
@@ -431,6 +446,8 @@ static void io_double_put_req(struct io_kiocb *req);
 static void __io_double_put_req(struct io_kiocb *req);
 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 static void io_queue_linked_timeout(struct io_kiocb *req);
+static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
+			       unsigned nr_args);
 
 static struct kmem_cache *req_cachep;
 
@@ -897,8 +914,12 @@ static void __io_free_req(struct io_kiocb *req)
 			putname(req->io->open.filename);
 		kfree(req->io);
 	}
-	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
-		fput(req->file);
+	if (req->file) {
+		if (req->flags & REQ_F_FIXED_FILE)
+			percpu_ref_put(&ctx->file_data->refs);
+		else
+			fput(req->file);
+	}
 	if (req->flags & REQ_F_INFLIGHT) {
 		unsigned long flags;
 
@@ -3033,6 +3054,35 @@ static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt)
 	return 0;
 }
 
+static int io_files_update(struct io_kiocb *req, bool force_nonblock)
+{
+	const struct io_uring_sqe *sqe = req->sqe;
+	struct io_ring_ctx *ctx = req->ctx;
+	void __user *arg;
+	unsigned nr_args;
+	int ret;
+
+	if (sqe->flags || sqe->ioprio || sqe->fd || sqe->off || sqe->rw_flags)
+		return -EINVAL;
+	if (force_nonblock) {
+		req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
+		return -EAGAIN;
+	}
+
+	nr_args = READ_ONCE(sqe->len);
+	arg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+
+	mutex_lock(&ctx->uring_lock);
+	ret = io_sqe_files_update(ctx, arg, nr_args);
+	mutex_unlock(&ctx->uring_lock);
+
+	if (ret < 0)
+		req_set_fail_links(req);
+	io_cqring_add_event(req, ret);
+	io_put_req(req);
+	return 0;
+}
+
 static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
@@ -3067,6 +3117,8 @@ static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io)
 	case IORING_OP_OPENAT:
 		ret = io_openat_prep(req, io);
 		break;
+	case IORING_OP_FILES_UPDATE:
+		return -EINVAL;
 	default:
 		req->io = io;
 		return 0;
@@ -3182,6 +3234,9 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
 	case IORING_OP_CLOSE:
 		ret = io_close(req, nxt, force_nonblock);
 		break;
+	case IORING_OP_FILES_UPDATE:
+		ret = io_files_update(req, force_nonblock);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -3295,8 +3350,8 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
 {
 	struct fixed_file_table *table;
 
-	table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT];
-	return table->files[index & IORING_FILE_TABLE_MASK];
+	table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
+	return table->files[index & IORING_FILE_TABLE_MASK];;
 }
 
 static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
@@ -3316,7 +3371,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
 		return ret;
 
 	if (flags & IOSQE_FIXED_FILE) {
-		if (unlikely(!ctx->file_table ||
+		if (unlikely(!ctx->file_data ||
 		    (unsigned) fd >= ctx->nr_user_files))
 			return -EBADF;
 		fd = array_index_nospec(fd, ctx->nr_user_files);
@@ -3324,6 +3379,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
 		if (!req->file)
 			return -EBADF;
 		req->flags |= REQ_F_FIXED_FILE;
+		percpu_ref_get(&ctx->file_data->refs);
 	} else {
 		if (req->needs_fixed_file)
 			return -EBADF;
@@ -4000,19 +4056,33 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 #endif
 }
 
+static void io_file_ref_kill(struct percpu_ref *ref)
+{
+	struct fixed_file_data *data;
+
+	data = container_of(ref, struct fixed_file_data, refs);
+	complete(&data->done);
+}
+
 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
+	struct fixed_file_data *data = ctx->file_data;
 	unsigned nr_tables, i;
 
-	if (!ctx->file_table)
+	if (!data)
 		return -ENXIO;
 
+	percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
+	wait_for_completion(&data->done);
+	percpu_ref_exit(&data->refs);
+
 	__io_sqe_files_unregister(ctx);
 	nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
 	for (i = 0; i < nr_tables; i++)
-		kfree(ctx->file_table[i].files);
-	kfree(ctx->file_table);
-	ctx->file_table = NULL;
+		kfree(data->table[i].files);
+	kfree(data->table);
+	kfree(data);
+	ctx->file_data = NULL;
 	ctx->nr_user_files = 0;
 	return 0;
 }
@@ -4162,7 +4232,7 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
 	int i;
 
 	for (i = 0; i < nr_tables; i++) {
-		struct fixed_file_table *table = &ctx->file_table[i];
+		struct fixed_file_table *table = &ctx->file_data->table[i];
 		unsigned this_files;
 
 		this_files = min(nr_files, IORING_MAX_FILES_TABLE);
@@ -4177,36 +4247,158 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
 		return 0;
 
 	for (i = 0; i < nr_tables; i++) {
-		struct fixed_file_table *table = &ctx->file_table[i];
+		struct fixed_file_table *table = &ctx->file_data->table[i];
 		kfree(table->files);
 	}
 	return 1;
 }
 
+static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
+{
+#if defined(CONFIG_UNIX)
+	struct sock *sock = ctx->ring_sock->sk;
+	struct sk_buff_head list, *head = &sock->sk_receive_queue;
+	struct sk_buff *skb;
+	int i;
+
+	__skb_queue_head_init(&list);
+
+	/*
+	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
+	 * remove this entry and rearrange the file array.
+	 */
+	skb = skb_dequeue(head);
+	while (skb) {
+		struct scm_fp_list *fp;
+
+		fp = UNIXCB(skb).fp;
+		for (i = 0; i < fp->count; i++) {
+			int left;
+
+			if (fp->fp[i] != file)
+				continue;
+
+			unix_notinflight(fp->user, fp->fp[i]);
+			left = fp->count - 1 - i;
+			if (left) {
+				memmove(&fp->fp[i], &fp->fp[i + 1],
+						left * sizeof(struct file *));
+			}
+			fp->count--;
+			if (!fp->count) {
+				kfree_skb(skb);
+				skb = NULL;
+			} else {
+				__skb_queue_tail(&list, skb);
+			}
+			fput(file);
+			file = NULL;
+			break;
+		}
+
+		if (!file)
+			break;
+
+		__skb_queue_tail(&list, skb);
+
+		skb = skb_dequeue(head);
+	}
+
+	if (skb_peek(&list)) {
+		spin_lock_irq(&head->lock);
+		while ((skb = __skb_dequeue(&list)) != NULL)
+			__skb_queue_tail(head, skb);
+		spin_unlock_irq(&head->lock);
+	}
+#else
+	fput(file);
+#endif
+}
+
+struct io_file_put {
+	struct llist_node llist;
+	struct file *file;
+	struct completion *done;
+};
+
+static void io_ring_file_ref_switch(struct work_struct *work)
+{
+	struct io_file_put *pfile, *tmp;
+	struct fixed_file_data *data;
+	struct llist_node *node;
+
+	data = container_of(work, struct fixed_file_data, ref_work);
+
+	while ((node = llist_del_all(&data->put_llist)) != NULL) {
+		llist_for_each_entry_safe(pfile, tmp, node, llist) {
+			io_ring_file_put(data->ctx, pfile->file);
+			if (pfile->done)
+				complete(pfile->done);
+			else
+				kfree(pfile);
+		}
+	}
+
+	percpu_ref_get(&data->refs);
+	percpu_ref_switch_to_percpu(&data->refs);
+}
+
+static void io_file_data_ref_zero(struct percpu_ref *ref)
+{
+	struct fixed_file_data *data;
+
+	data = container_of(ref, struct fixed_file_data, refs);
+
+	/* we can't safely switch from inside this context, punt to wq */
+	queue_work(system_wq, &data->ref_work);
+}
+
 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 				 unsigned nr_args)
 {
 	__s32 __user *fds = (__s32 __user *) arg;
 	unsigned nr_tables;
+	struct file *file;
 	int fd, ret = 0;
 	unsigned i;
 
-	if (ctx->file_table)
+	if (ctx->file_data)
 		return -EBUSY;
 	if (!nr_args)
 		return -EINVAL;
 	if (nr_args > IORING_MAX_FIXED_FILES)
 		return -EMFILE;
 
+	ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
+	if (!ctx->file_data)
+		return -ENOMEM;
+	ctx->file_data->ctx = ctx;
+	init_completion(&ctx->file_data->done);
+
 	nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
-	ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table),
+	ctx->file_data->table = kcalloc(nr_tables,
+					sizeof(struct fixed_file_table),
 					GFP_KERNEL);
-	if (!ctx->file_table)
+	if (!ctx->file_data->table) {
+		kfree(ctx->file_data);
+		ctx->file_data = NULL;
 		return -ENOMEM;
+	}
+
+	if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
+				PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
+		kfree(ctx->file_data->table);
+		kfree(ctx->file_data);
+		ctx->file_data = NULL;
+	}
+	ctx->file_data->put_llist.first = NULL;
+	INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);
 
 	if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
-		kfree(ctx->file_table);
-		ctx->file_table = NULL;
+		percpu_ref_exit(&ctx->file_data->refs);
+		kfree(ctx->file_data->table);
+		kfree(ctx->file_data);
+		ctx->file_data = NULL;
 		return -ENOMEM;
 	}
 
@@ -4223,13 +4415,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			continue;
 		}
 
-		table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
+		table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
 		index = i & IORING_FILE_TABLE_MASK;
-		table->files[index] = fget(fd);
+		file = fget(fd);
 
 		ret = -EBADF;
-		if (!table->files[index])
+		if (!file)
 			break;
+
 		/*
 		 * Don't allow io_uring instances to be registered. If UNIX
 		 * isn't enabled, then this causes a reference cycle and this
@@ -4237,26 +4430,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		 * handle it just fine, but there's still no point in allowing
 		 * a ring fd as it doesn't support regular read/write anyway.
 		 */
-		if (table->files[index]->f_op == &io_uring_fops) {
-			fput(table->files[index]);
+		if (file->f_op == &io_uring_fops) {
+			fput(file);
 			break;
 		}
 		ret = 0;
+		table->files[index] = file;
 	}
 
 	if (ret) {
 		for (i = 0; i < ctx->nr_user_files; i++) {
-			struct file *file;
-
 			file = io_file_from_index(ctx, i);
 			if (file)
 				fput(file);
 		}
 		for (i = 0; i < nr_tables; i++)
-			kfree(ctx->file_table[i].files);
+			kfree(ctx->file_data->table[i].files);
 
-		kfree(ctx->file_table);
-		ctx->file_table = NULL;
+		kfree(ctx->file_data->table);
+		kfree(ctx->file_data);
+		ctx->file_data = NULL;
 		ctx->nr_user_files = 0;
 		return ret;
 	}
@@ -4268,69 +4461,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	return ret;
 }
 
-static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index)
-{
-#if defined(CONFIG_UNIX)
-	struct file *file = io_file_from_index(ctx, index);
-	struct sock *sock = ctx->ring_sock->sk;
-	struct sk_buff_head list, *head = &sock->sk_receive_queue;
-	struct sk_buff *skb;
-	int i;
-
-	__skb_queue_head_init(&list);
-
-	/*
-	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
-	 * remove this entry and rearrange the file array.
-	 */
-	skb = skb_dequeue(head);
-	while (skb) {
-		struct scm_fp_list *fp;
-
-		fp = UNIXCB(skb).fp;
-		for (i = 0; i < fp->count; i++) {
-			int left;
-
-			if (fp->fp[i] != file)
-				continue;
-
-			unix_notinflight(fp->user, fp->fp[i]);
-			left = fp->count - 1 - i;
-			if (left) {
-				memmove(&fp->fp[i], &fp->fp[i + 1],
-						left * sizeof(struct file *));
-			}
-			fp->count--;
-			if (!fp->count) {
-				kfree_skb(skb);
-				skb = NULL;
-			} else {
-				__skb_queue_tail(&list, skb);
-			}
-			fput(file);
-			file = NULL;
-			break;
-		}
-
-		if (!file)
-			break;
-
-		__skb_queue_tail(&list, skb);
-
-		skb = skb_dequeue(head);
-	}
-
-	if (skb_peek(&list)) {
-		spin_lock_irq(&head->lock);
-		while ((skb = __skb_dequeue(&list)) != NULL)
-			__skb_queue_tail(head, skb);
-		spin_unlock_irq(&head->lock);
-	}
-#else
-	fput(io_file_from_index(ctx, index));
-#endif
-}
-
 static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
 				int index)
 {
@@ -4374,15 +4504,59 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
 #endif
 }
 
+static void io_atomic_switch(struct percpu_ref *ref)
+{
+	struct fixed_file_data *data;
+
+	data = container_of(ref, struct fixed_file_data, refs);
+	clear_bit(FFD_F_ATOMIC, &data->state);
+}
+
+static bool io_queue_file_removal(struct fixed_file_data *data,
+				  struct file *file)
+{
+	struct io_file_put *pfile, pfile_stack;
+	DECLARE_COMPLETION_ONSTACK(done);
+
+	/*
+	 * If we fail allocating the struct we need for doing async reomval
+	 * of this file, just punt to sync and wait for it.
+	 */
+	pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
+	if (!pfile) {
+		pfile = &pfile_stack;
+		pfile->done = &done;
+	}
+
+	pfile->file = file;
+	llist_add(&pfile->llist, &data->put_llist);
+
+	if (pfile == &pfile_stack) {
+		if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
+			percpu_ref_put(&data->refs);
+			percpu_ref_switch_to_atomic(&data->refs,
+							io_atomic_switch);
+		}
+		wait_for_completion(&done);
+		flush_work(&data->ref_work);
+		return false;
+	}
+
+	return true;
+}
+
 static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
 			       unsigned nr_args)
 {
+	struct fixed_file_data *data = ctx->file_data;
 	struct io_uring_files_update up;
+	bool ref_switch = false;
+	struct file *file;
 	__s32 __user *fds;
 	int fd, i, err;
 	__u32 done;
 
-	if (!ctx->file_table)
+	if (!data)
 		return -ENXIO;
 	if (!nr_args)
 		return -EINVAL;
@@ -4405,15 +4579,15 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
 			break;
 		}
 		i = array_index_nospec(up.offset, ctx->nr_user_files);
-		table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
+		table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
 		index = i & IORING_FILE_TABLE_MASK;
 		if (table->files[index]) {
-			io_sqe_file_unregister(ctx, i);
+			file = io_file_from_index(ctx, index);
 			table->files[index] = NULL;
+			if (io_queue_file_removal(data, file))
+				ref_switch = true;
 		}
 		if (fd != -1) {
-			struct file *file;
-
 			file = fget(fd);
 			if (!file) {
 				err = -EBADF;
@@ -4442,6 +4616,11 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
 		up.offset++;
 	}
 
+	if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
+		percpu_ref_put(&data->refs);
+		percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
+	}
+
 	return done ? done : err;
 }
 
@@ -4840,13 +5019,15 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
 
 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
+	/* do this before async shutdown, as we may need to queue work */
+	io_sqe_files_unregister(ctx);
+
 	io_finish_async(ctx);
 	if (ctx->sqo_mm)
 		mmdrop(ctx->sqo_mm);
 
 	io_iopoll_reap_events(ctx);
 	io_sqe_buffer_unregister(ctx);
-	io_sqe_files_unregister(ctx);
 	io_eventfd_unregister(ctx);
 
 #if defined(CONFIG_UNIX)
@@ -5360,18 +5541,22 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	if (percpu_ref_is_dying(&ctx->refs))
 		return -ENXIO;
 
-	percpu_ref_kill(&ctx->refs);
+	if (opcode != IORING_UNREGISTER_FILES &&
+	    opcode != IORING_REGISTER_FILES_UPDATE) {
+		percpu_ref_kill(&ctx->refs);
 
-	/*
-	 * Drop uring mutex before waiting for references to exit. If another
-	 * thread is currently inside io_uring_enter() it might need to grab
-	 * the uring_lock to make progress. If we hold it here across the drain
-	 * wait, then we can deadlock. It's safe to drop the mutex here, since
-	 * no new references will come in after we've killed the percpu ref.
-	 */
-	mutex_unlock(&ctx->uring_lock);
-	wait_for_completion(&ctx->completions[0]);
-	mutex_lock(&ctx->uring_lock);
+		/*
+		 * Drop uring mutex before waiting for references to exit. If
+		 * another thread is currently inside io_uring_enter() it might
+		 * need to grab the uring_lock to make progress. If we hold it
+		 * here across the drain wait, then we can deadlock. It's safe
+		 * to drop the mutex here, since no new references will come in
+		 * after we've killed the percpu ref.
+		 */
+		mutex_unlock(&ctx->uring_lock);
+		wait_for_completion(&ctx->completions[0]);
+		mutex_lock(&ctx->uring_lock);
+	}
 
 	switch (opcode) {
 	case IORING_REGISTER_BUFFERS:
@@ -5412,9 +5597,13 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 		break;
 	}
 
-	/* bring the ctx back to life */
-	reinit_completion(&ctx->completions[0]);
-	percpu_ref_reinit(&ctx->refs);
+
+	if (opcode != IORING_UNREGISTER_FILES &&
+	    opcode != IORING_REGISTER_FILES_UPDATE) {
+		/* bring the ctx back to life */
+		reinit_completion(&ctx->completions[0]);
+		percpu_ref_reinit(&ctx->refs);
+	}
 	return ret;
 }
 
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 42a7f0e8dee3..cafee41efbe5 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -80,6 +80,7 @@ enum {
 	IORING_OP_FALLOCATE,
 	IORING_OP_OPENAT,
 	IORING_OP_CLOSE,
+	IORING_OP_FILES_UPDATE,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
-- 
2.24.1


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 06/10] fs: move filp_close() outside of __close_fd_get_file()
  2019-12-13 18:36 ` [PATCH 06/10] fs: move filp_close() outside of __close_fd_get_file() Jens Axboe
@ 2019-12-16 19:27   ` Jann Horn
  2019-12-16 19:39     ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Jann Horn @ 2019-12-16 19:27 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, linux-fsdevel

On Fri, Dec 13, 2019 at 7:36 PM Jens Axboe <axboe@kernel.dk> wrote:
> Just one caller of this, and just use filp_close() there manually.
> This is important to allow async close/removal of the fd.
[...]
> index 3da91a112bab..a250d291c71b 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -662,7 +662,7 @@ int __close_fd_get_file(unsigned int fd, struct file **res)
>         spin_unlock(&files->file_lock);
>         get_file(file);
>         *res = file;
> -       return filp_close(file, files);
> +       return 0;

Above this function is a comment saying "variant of __close_fd that
gets a ref on the file for later fput"; that should probably be
changed to point out that you also still need to filp_close().

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 06/10] fs: move filp_close() outside of __close_fd_get_file()
  2019-12-16 19:27   ` Jann Horn
@ 2019-12-16 19:39     ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2019-12-16 19:39 UTC (permalink / raw)
  To: Jann Horn; +Cc: io-uring, linux-fsdevel

On 12/16/19 12:27 PM, Jann Horn wrote:
> On Fri, Dec 13, 2019 at 7:36 PM Jens Axboe <axboe@kernel.dk> wrote:
>> Just one caller of this, and just use filp_close() there manually.
>> This is important to allow async close/removal of the fd.
> [...]
>> index 3da91a112bab..a250d291c71b 100644
>> --- a/fs/file.c
>> +++ b/fs/file.c
>> @@ -662,7 +662,7 @@ int __close_fd_get_file(unsigned int fd, struct file **res)
>>         spin_unlock(&files->file_lock);
>>         get_file(file);
>>         *res = file;
>> -       return filp_close(file, files);
>> +       return 0;
> 
> Above this function is a comment saying "variant of __close_fd that
> gets a ref on the file for later fput"; that should probably be
> changed to point out that you also still need to filp_close().

Good point, I'll make the comment edit.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 03/10] fs: add namei support for doing a non-blocking path lookup
  2019-12-13 18:36 ` [PATCH 03/10] fs: add namei support for doing a non-blocking path lookup Jens Axboe
@ 2019-12-27  0:42   ` Al Viro
  2019-12-27  5:05     ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Al Viro @ 2019-12-27  0:42 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, linux-fsdevel

On Fri, Dec 13, 2019 at 11:36:25AM -0700, Jens Axboe wrote:
> If the fast lookup fails, then return -EAGAIN to have the caller retry
> the path lookup. This is in preparation for supporting non-blocking
> open.

NAK.  We are not littering fs/namei.c with incremental broken bits
and pieces with uncertain eventual use.

And it's broken - lookup_slow() is *NOT* the only place that can and
does block.  For starters, ->d_revalidate() can very well block and
it is called outside of lookup_slow().  So does ->d_automount().
So does ->d_manage().

I'm rather sceptical about the usefulness of non-blocking open, to be
honest, but in any case, one thing that is absolutely not going to
happen is piecewise introduction of such stuff without a discussion
of the entire design.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 03/10] fs: add namei support for doing a non-blocking path lookup
  2019-12-27  0:42   ` Al Viro
@ 2019-12-27  5:05     ` Jens Axboe
  2019-12-27  5:25       ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2019-12-27  5:05 UTC (permalink / raw)
  To: Al Viro; +Cc: io-uring, linux-fsdevel

On 12/26/19 5:42 PM, Al Viro wrote:
> On Fri, Dec 13, 2019 at 11:36:25AM -0700, Jens Axboe wrote:
>> If the fast lookup fails, then return -EAGAIN to have the caller retry
>> the path lookup. This is in preparation for supporting non-blocking
>> open.
> 
> NAK.  We are not littering fs/namei.c with incremental broken bits
> and pieces with uncertain eventual use.

To be fair, the "eventual use" is just the next patch or two...

> And it's broken - lookup_slow() is *NOT* the only place that can and
> does block.  For starters, ->d_revalidate() can very well block and
> it is called outside of lookup_slow().  So does ->d_automount().
> So does ->d_manage().

Fair enough, so it's not complete. I'd love to get it there, though!

> I'm rather sceptical about the usefulness of non-blocking open, to be
> honest, but in any case, one thing that is absolutely not going to
> happen is piecewise introduction of such stuff without a discussion
> of the entire design.

It's a necessity for io_uring, otherwise _any_ open needs to happen
out-of-line. But I get your objection, I'd like to get this moving in a
productive way though.

What do you want it to look like? I'd be totally fine with knowing if
the fs has ->d_revalidate(), and always doing those out-of-line.  If I
know the open will be slow, that's preferable. Ditto for ->d_automount()
and ->d_manage(), all of that looks like cases that would be fine to
punt. I honestly care mostly about the cached local case _not_ needing
out-of-line handling, that needs to happen inline.

Still seems to me like the LOOKUP_NONBLOCK is the way to go, and just
have lookup_fast() -EAGAIN if we need to call any of the potentially
problematic dentry ops. Yes, they _may_ not block, but they could. I
don't think we need to propagate this information further.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 03/10] fs: add namei support for doing a non-blocking path lookup
  2019-12-27  5:05     ` Jens Axboe
@ 2019-12-27  5:25       ` Jens Axboe
  2019-12-27 15:45         ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2019-12-27  5:25 UTC (permalink / raw)
  To: Al Viro; +Cc: io-uring, linux-fsdevel

On 12/26/19 10:05 PM, Jens Axboe wrote:
> On 12/26/19 5:42 PM, Al Viro wrote:
>> On Fri, Dec 13, 2019 at 11:36:25AM -0700, Jens Axboe wrote:
>>> If the fast lookup fails, then return -EAGAIN to have the caller retry
>>> the path lookup. This is in preparation for supporting non-blocking
>>> open.
>>
>> NAK.  We are not littering fs/namei.c with incremental broken bits
>> and pieces with uncertain eventual use.
> 
> To be fair, the "eventual use" is just the next patch or two...
> 
>> And it's broken - lookup_slow() is *NOT* the only place that can and
>> does block.  For starters, ->d_revalidate() can very well block and
>> it is called outside of lookup_slow().  So does ->d_automount().
>> So does ->d_manage().
> 
> Fair enough, so it's not complete. I'd love to get it there, though!
> 
>> I'm rather sceptical about the usefulness of non-blocking open, to be
>> honest, but in any case, one thing that is absolutely not going to
>> happen is piecewise introduction of such stuff without a discussion
>> of the entire design.
> 
> It's a necessity for io_uring, otherwise _any_ open needs to happen
> out-of-line. But I get your objection, I'd like to get this moving in a
> productive way though.
> 
> What do you want it to look like? I'd be totally fine with knowing if
> the fs has ->d_revalidate(), and always doing those out-of-line.  If I
> know the open will be slow, that's preferable. Ditto for ->d_automount()
> and ->d_manage(), all of that looks like cases that would be fine to
> punt. I honestly care mostly about the cached local case _not_ needing
> out-of-line handling, that needs to happen inline.
> 
> Still seems to me like the LOOKUP_NONBLOCK is the way to go, and just
> have lookup_fast() -EAGAIN if we need to call any of the potentially
> problematic dentry ops. Yes, they _may_ not block, but they could. I
> don't think we need to propagate this information further.

Incremental here - just check for potentially problematic dentry ops,
and have the open redone from a path where it doesn't matter.


diff --git a/fs/namei.c b/fs/namei.c
index ebd05ed14b0a..9c46b1e04fac 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1549,6 +1549,14 @@ static struct dentry *__lookup_hash(const struct qstr *name,
 	return dentry;
 }
 
+static inline bool lookup_may_block(struct dentry *dentry)
+{
+	const struct dentry_operations *ops = dentry->d_op;
+
+	/* assume these dentry ops may block */
+	return ops->d_revalidate || ops->d_automount || ops->d_manage;
+}
+
 static int lookup_fast(struct nameidata *nd,
 		       struct path *path, struct inode **inode,
 		       unsigned *seqp)
@@ -1573,6 +1581,9 @@ static int lookup_fast(struct nameidata *nd,
 			return 0;
 		}
 
+		if ((nd->flags & LOOKUP_NONBLOCK) && lookup_may_block(dentry))
+			return -EAGAIN;
+
 		/*
 		 * This sequence count validates that the inode matches
 		 * the dentry name information from lookup.
@@ -1615,7 +1626,10 @@ static int lookup_fast(struct nameidata *nd,
 		dentry = __d_lookup(parent, &nd->last);
 		if (unlikely(!dentry))
 			return 0;
-		status = d_revalidate(dentry, nd->flags);
+		if ((nd->flags & LOOKUP_NONBLOCK) && lookup_may_block(dentry))
+			status = -EAGAIN;
+		else
+			status = d_revalidate(dentry, nd->flags);
 	}
 	if (unlikely(status <= 0)) {
 		if (!status)

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 03/10] fs: add namei support for doing a non-blocking path lookup
  2019-12-27  5:25       ` Jens Axboe
@ 2019-12-27 15:45         ` Jens Axboe
  2019-12-28 19:03           ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2019-12-27 15:45 UTC (permalink / raw)
  To: Al Viro; +Cc: io-uring, linux-fsdevel

On 12/26/19 10:25 PM, Jens Axboe wrote:
> On 12/26/19 10:05 PM, Jens Axboe wrote:
>> On 12/26/19 5:42 PM, Al Viro wrote:
>>> On Fri, Dec 13, 2019 at 11:36:25AM -0700, Jens Axboe wrote:
>>>> If the fast lookup fails, then return -EAGAIN to have the caller retry
>>>> the path lookup. This is in preparation for supporting non-blocking
>>>> open.
>>>
>>> NAK.  We are not littering fs/namei.c with incremental broken bits
>>> and pieces with uncertain eventual use.
>>
>> To be fair, the "eventual use" is just the next patch or two...
>>
>>> And it's broken - lookup_slow() is *NOT* the only place that can and
>>> does block.  For starters, ->d_revalidate() can very well block and
>>> it is called outside of lookup_slow().  So does ->d_automount().
>>> So does ->d_manage().
>>
>> Fair enough, so it's not complete. I'd love to get it there, though!
>>
>>> I'm rather sceptical about the usefulness of non-blocking open, to be
>>> honest, but in any case, one thing that is absolutely not going to
>>> happen is piecewise introduction of such stuff without a discussion
>>> of the entire design.
>>
>> It's a necessity for io_uring, otherwise _any_ open needs to happen
>> out-of-line. But I get your objection, I'd like to get this moving in a
>> productive way though.
>>
>> What do you want it to look like? I'd be totally fine with knowing if
>> the fs has ->d_revalidate(), and always doing those out-of-line.  If I
>> know the open will be slow, that's preferable. Ditto for ->d_automount()
>> and ->d_manage(), all of that looks like cases that would be fine to
>> punt. I honestly care mostly about the cached local case _not_ needing
>> out-of-line handling, that needs to happen inline.
>>
>> Still seems to me like the LOOKUP_NONBLOCK is the way to go, and just
>> have lookup_fast() -EAGAIN if we need to call any of the potentially
>> problematic dentry ops. Yes, they _may_ not block, but they could. I
>> don't think we need to propagate this information further.
> 
> Incremental here - just check for potentially problematic dentry ops,
> and have the open redone from a path where it doesn't matter.

Here's the (updated) full patch, with the bits cleaned up a bit. Would
this be more agreeable to you?


commit ac605d1d6ca445ba7e2990e0afe0e28ad831a663
Author: Jens Axboe <axboe@kernel.dk>
Date:   Fri Dec 13 11:09:26 2019 -0700

    fs: add namei support for doing a non-blocking path lookup
    
    If the fast lookup fails, then return -EAGAIN to have the caller retry
    the path lookup. Assume that a dentry having any of:
    
    ->d_revalidate()
    ->d_automount()
    ->d_manage()
    
    could block in those callbacks. Preemptively return -EAGAIN if any of
    these are present.
    
    This is in preparation for supporting non-blocking open.
    
    Signed-off-by: Jens Axboe <axboe@kernel.dk>

diff --git a/fs/namei.c b/fs/namei.c
index d6c91d1e88cb..2bfdb932f2f2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1549,6 +1549,17 @@ static struct dentry *__lookup_hash(const struct qstr *name,
 	return dentry;
 }
 
+static inline bool lookup_could_block(struct dentry *dentry, unsigned int flags)
+{
+	const struct dentry_operations *ops = dentry->d_op;
+
+	if (!(flags & LOOKUP_NONBLOCK))
+		return 0;
+
+	/* assume these dentry ops may block */
+	return ops->d_revalidate || ops->d_automount || ops->d_manage;
+}
+
 static int lookup_fast(struct nameidata *nd,
 		       struct path *path, struct inode **inode,
 		       unsigned *seqp)
@@ -1573,6 +1584,9 @@ static int lookup_fast(struct nameidata *nd,
 			return 0;
 		}
 
+		if (unlikely(lookup_could_block(dentry, nd->flags)))
+			return -EAGAIN;
+
 		/*
 		 * This sequence count validates that the inode matches
 		 * the dentry name information from lookup.
@@ -1615,7 +1629,10 @@ static int lookup_fast(struct nameidata *nd,
 		dentry = __d_lookup(parent, &nd->last);
 		if (unlikely(!dentry))
 			return 0;
-		status = d_revalidate(dentry, nd->flags);
+		if (unlikely(lookup_could_block(dentry, nd->flags)))
+			status = -EAGAIN;
+		else
+			status = d_revalidate(dentry, nd->flags);
 	}
 	if (unlikely(status <= 0)) {
 		if (!status)
@@ -1799,6 +1816,8 @@ static int walk_component(struct nameidata *nd, int flags)
 	if (unlikely(err <= 0)) {
 		if (err < 0)
 			return err;
+		if (nd->flags & LOOKUP_NONBLOCK)
+			return -EAGAIN;
 		path.dentry = lookup_slow(&nd->last, nd->path.dentry,
 					  nd->flags);
 		if (IS_ERR(path.dentry))
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 7fe7b87a3ded..935a1bf0caca 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -38,6 +38,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_JUMPED		0x1000
 #define LOOKUP_ROOT		0x2000
 #define LOOKUP_ROOT_GRABBED	0x0008
+#define LOOKUP_NONBLOCK		0x10000	/* don't block for lookup */
 
 extern int path_pts(struct path *path);

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 03/10] fs: add namei support for doing a non-blocking path lookup
  2019-12-27 15:45         ` Jens Axboe
@ 2019-12-28 19:03           ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2019-12-28 19:03 UTC (permalink / raw)
  To: Al Viro; +Cc: io-uring, linux-fsdevel

On 12/27/19 8:45 AM, Jens Axboe wrote:
> On 12/26/19 10:25 PM, Jens Axboe wrote:
>> On 12/26/19 10:05 PM, Jens Axboe wrote:
>>> On 12/26/19 5:42 PM, Al Viro wrote:
>>>> On Fri, Dec 13, 2019 at 11:36:25AM -0700, Jens Axboe wrote:
>>>>> If the fast lookup fails, then return -EAGAIN to have the caller retry
>>>>> the path lookup. This is in preparation for supporting non-blocking
>>>>> open.
>>>>
>>>> NAK.  We are not littering fs/namei.c with incremental broken bits
>>>> and pieces with uncertain eventual use.
>>>
>>> To be fair, the "eventual use" is just the next patch or two...
>>>
>>>> And it's broken - lookup_slow() is *NOT* the only place that can and
>>>> does block.  For starters, ->d_revalidate() can very well block and
>>>> it is called outside of lookup_slow().  So does ->d_automount().
>>>> So does ->d_manage().
>>>
>>> Fair enough, so it's not complete. I'd love to get it there, though!
>>>
>>>> I'm rather sceptical about the usefulness of non-blocking open, to be
>>>> honest, but in any case, one thing that is absolutely not going to
>>>> happen is piecewise introduction of such stuff without a discussion
>>>> of the entire design.
>>>
>>> It's a necessity for io_uring, otherwise _any_ open needs to happen
>>> out-of-line. But I get your objection, I'd like to get this moving in a
>>> productive way though.
>>>
>>> What do you want it to look like? I'd be totally fine with knowing if
>>> the fs has ->d_revalidate(), and always doing those out-of-line.  If I
>>> know the open will be slow, that's preferable. Ditto for ->d_automount()
>>> and ->d_manage(), all of that looks like cases that would be fine to
>>> punt. I honestly care mostly about the cached local case _not_ needing
>>> out-of-line handling, that needs to happen inline.
>>>
>>> Still seems to me like the LOOKUP_NONBLOCK is the way to go, and just
>>> have lookup_fast() -EAGAIN if we need to call any of the potentially
>>> problematic dentry ops. Yes, they _may_ not block, but they could. I
>>> don't think we need to propagate this information further.
>>
>> Incremental here - just check for potentially problematic dentry ops,
>> and have the open redone from a path where it doesn't matter.
> 
> Here's the (updated) full patch, with the bits cleaned up a bit. Would
> this be more agreeable to you?

Needs a !ops check as well, tmpfs hits that:


commit 8a4dbfbbcd675492cf9e8cbcb203386a1ce10916
Author: Jens Axboe <axboe@kernel.dk>
Date:   Fri Dec 13 11:09:26 2019 -0700

    fs: add namei support for doing a non-blocking path lookup
    
    If the fast lookup fails, then return -EAGAIN to have the caller retry
    the path lookup. Assume that a dentry having any of:
    
    ->d_revalidate()
    ->d_automount()
    ->d_manage()
    
    could block in those callbacks. Preemptively return -EAGAIN if any of
    these are present.
    
    This is in preparation for supporting non-blocking open.
    
    Signed-off-by: Jens Axboe <axboe@kernel.dk>

diff --git a/fs/namei.c b/fs/namei.c
index d6c91d1e88cb..42e71e5a69f1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1549,6 +1549,17 @@ static struct dentry *__lookup_hash(const struct qstr *name,
 	return dentry;
 }
 
+static inline bool lookup_could_block(struct dentry *dentry, unsigned int flags)
+{
+	const struct dentry_operations *ops = dentry->d_op;
+
+	if (!ops || !(flags & LOOKUP_NONBLOCK))
+		return 0;
+
+	/* assume these dentry ops may block */
+	return ops->d_revalidate || ops->d_automount || ops->d_manage;
+}
+
 static int lookup_fast(struct nameidata *nd,
 		       struct path *path, struct inode **inode,
 		       unsigned *seqp)
@@ -1573,6 +1584,9 @@ static int lookup_fast(struct nameidata *nd,
 			return 0;
 		}
 
+		if (unlikely(lookup_could_block(dentry, nd->flags)))
+			return -EAGAIN;
+
 		/*
 		 * This sequence count validates that the inode matches
 		 * the dentry name information from lookup.
@@ -1615,7 +1629,10 @@ static int lookup_fast(struct nameidata *nd,
 		dentry = __d_lookup(parent, &nd->last);
 		if (unlikely(!dentry))
 			return 0;
-		status = d_revalidate(dentry, nd->flags);
+		if (unlikely(lookup_could_block(dentry, nd->flags)))
+			status = -EAGAIN;
+		else
+			status = d_revalidate(dentry, nd->flags);
 	}
 	if (unlikely(status <= 0)) {
 		if (!status)
@@ -1799,6 +1816,8 @@ static int walk_component(struct nameidata *nd, int flags)
 	if (unlikely(err <= 0)) {
 		if (err < 0)
 			return err;
+		if (nd->flags & LOOKUP_NONBLOCK)
+			return -EAGAIN;
 		path.dentry = lookup_slow(&nd->last, nd->path.dentry,
 					  nd->flags);
 		if (IS_ERR(path.dentry))
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 7fe7b87a3ded..935a1bf0caca 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -38,6 +38,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_JUMPED		0x1000
 #define LOOKUP_ROOT		0x2000
 #define LOOKUP_ROOT_GRABBED	0x0008
+#define LOOKUP_NONBLOCK		0x10000	/* don't block for lookup */
 
 extern int path_pts(struct path *path);
 
-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, back to index

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-13 18:36 [PATCHSET 0/10] io_uring items for 5.6 Jens Axboe
2019-12-13 18:36 ` [PATCH 01/10] io_uring: add support for fallocate() Jens Axboe
2019-12-13 18:36 ` [PATCH 02/10] io_uring: remove 'sqe' parameter to the OP helpers that take it Jens Axboe
2019-12-13 18:36 ` [PATCH 03/10] fs: add namei support for doing a non-blocking path lookup Jens Axboe
2019-12-27  0:42   ` Al Viro
2019-12-27  5:05     ` Jens Axboe
2019-12-27  5:25       ` Jens Axboe
2019-12-27 15:45         ` Jens Axboe
2019-12-28 19:03           ` Jens Axboe
2019-12-13 18:36 ` [PATCH 04/10] fs: make build_open_flags() available internally Jens Axboe
2019-12-13 18:36 ` [PATCH 05/10] io_uring: add support for IORING_OP_OPENAT Jens Axboe
2019-12-13 18:36 ` [PATCH 06/10] fs: move filp_close() outside of __close_fd_get_file() Jens Axboe
2019-12-16 19:27   ` Jann Horn
2019-12-16 19:39     ` Jens Axboe
2019-12-13 18:36 ` [PATCH 07/10] io-wq: add support for uncancellable work Jens Axboe
2019-12-13 18:36 ` [PATCH 08/10] io_uring: add support for IORING_OP_CLOSE Jens Axboe
2019-12-13 18:36 ` [PATCH 09/10] io_uring: use u64_to_user_ptr() consistently Jens Axboe
2019-12-13 18:36 ` [PATCH 10/10] io_uring: avoid ring quiesce for fixed file set unregister and update Jens Axboe

IO-Uring Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/io-uring/0 io-uring/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 io-uring io-uring/ https://lore.kernel.org/io-uring \
		io-uring@vger.kernel.org
	public-inbox-index io-uring

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.io-uring


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git