All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK
@ 2020-12-14 19:13 Jens Axboe
  2020-12-14 19:13 ` [PATCH 1/4] fs: make unlazy_walk() error handling consistent Jens Axboe
                   ` (7 more replies)
  0 siblings, 8 replies; 46+ messages in thread
From: Jens Axboe @ 2020-12-14 19:13 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: torvalds, viro

Hi,

Wanted to throw out what the current state of this is, as we keep
getting closer to something palatable.

This time I've included the io_uring change too. I've tested this through
both openat2, and through io_uring as well.

I'm pretty happy with this at this point. The core change is very simple,
and the users end up being trivial too.

Also available here:

https://git.kernel.dk/cgit/linux-block/log/?h=nonblock-path-lookup

Changes since v2:

- Simplify the LOOKUP_NONBLOCK to _just_ cover LOOKUP_RCU and
  lookup_fast(), as per Linus's suggestion. I verified fast path
  operations are indeed just that, and it avoids having to mess with
  the inode lock and mnt_want_write() completely.

- Make O_TMPFILE return -EAGAIN for LOOKUP_NONBLOCK.

- Improve/add a few comments.

- Folded what was left of the open part of LOOKUP_NONBLOCK into the
  main patch.

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 46+ messages in thread

* [PATCH 1/4] fs: make unlazy_walk() error handling consistent
  2020-12-14 19:13 [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK Jens Axboe
@ 2020-12-14 19:13 ` Jens Axboe
  2020-12-14 19:13 ` [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK Jens Axboe
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 46+ messages in thread
From: Jens Axboe @ 2020-12-14 19:13 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: torvalds, viro, Jens Axboe

Most callers check for non-zero return, and assume it's -ECHILD (which
it always will be). One caller uses the actual error return. Clean this
up and make it fully consistent, by having unlazy_walk() return a bool
instead. Rename it to try_to_unlazy() and return true on success, and
failure on error. That's easier to read.

No functional changes in this patch.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/namei.c | 43 +++++++++++++++++--------------------------
 1 file changed, 17 insertions(+), 26 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 03d0e11e4f36..7eb7830da298 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -669,17 +669,17 @@ static bool legitimize_root(struct nameidata *nd)
  */
 
 /**
- * unlazy_walk - try to switch to ref-walk mode.
+ * try_to_unlazy - try to switch to ref-walk mode.
  * @nd: nameidata pathwalk data
- * Returns: 0 on success, -ECHILD on failure
+ * Returns: true on success, false on failure
  *
- * unlazy_walk attempts to legitimize the current nd->path and nd->root
+ * try_to_unlazyk attempts to legitimize the current nd->path and nd->root
  * for ref-walk mode.
  * Must be called from rcu-walk context.
- * Nothing should touch nameidata between unlazy_walk() failure and
+ * Nothing should touch nameidata between try_to_unlazy() failure and
  * terminate_walk().
  */
-static int unlazy_walk(struct nameidata *nd)
+static bool try_to_unlazy(struct nameidata *nd)
 {
 	struct dentry *parent = nd->path.dentry;
 
@@ -694,14 +694,14 @@ static int unlazy_walk(struct nameidata *nd)
 		goto out;
 	rcu_read_unlock();
 	BUG_ON(nd->inode != parent->d_inode);
-	return 0;
+	return true;
 
 out1:
 	nd->path.mnt = NULL;
 	nd->path.dentry = NULL;
 out:
 	rcu_read_unlock();
-	return -ECHILD;
+	return false;
 }
 
 /**
@@ -792,7 +792,7 @@ static int complete_walk(struct nameidata *nd)
 		 */
 		if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED)))
 			nd->root.mnt = NULL;
-		if (unlikely(unlazy_walk(nd)))
+		if (!try_to_unlazy(nd))
 			return -ECHILD;
 	}
 
@@ -1466,7 +1466,7 @@ static struct dentry *lookup_fast(struct nameidata *nd,
 		unsigned seq;
 		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
 		if (unlikely(!dentry)) {
-			if (unlazy_walk(nd))
+			if (!try_to_unlazy(nd))
 				return ERR_PTR(-ECHILD);
 			return NULL;
 		}
@@ -1567,10 +1567,8 @@ static inline int may_lookup(struct nameidata *nd)
 {
 	if (nd->flags & LOOKUP_RCU) {
 		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
-		if (err != -ECHILD)
+		if (err != -ECHILD || !try_to_unlazy(nd))
 			return err;
-		if (unlazy_walk(nd))
-			return -ECHILD;
 	}
 	return inode_permission(nd->inode, MAY_EXEC);
 }
@@ -1592,7 +1590,7 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
 		// unlazy even if we fail to grab the link - cleanup needs it
 		bool grabbed_link = legitimize_path(nd, link, seq);
 
-		if (unlazy_walk(nd) != 0 || !grabbed_link)
+		if (!try_to_unlazy(nd) != 0 || !grabbed_link)
 			return -ECHILD;
 
 		if (nd_alloc_stack(nd))
@@ -1634,7 +1632,7 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
 		touch_atime(&last->link);
 		cond_resched();
 	} else if (atime_needs_update(&last->link, inode)) {
-		if (unlikely(unlazy_walk(nd)))
+		if (!try_to_unlazy(nd))
 			return ERR_PTR(-ECHILD);
 		touch_atime(&last->link);
 	}
@@ -1651,11 +1649,8 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
 		get = inode->i_op->get_link;
 		if (nd->flags & LOOKUP_RCU) {
 			res = get(NULL, inode, &last->done);
-			if (res == ERR_PTR(-ECHILD)) {
-				if (unlikely(unlazy_walk(nd)))
-					return ERR_PTR(-ECHILD);
+			if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
 				res = get(link->dentry, inode, &last->done);
-			}
 		} else {
 			res = get(link->dentry, inode, &last->done);
 		}
@@ -2193,7 +2188,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		}
 		if (unlikely(!d_can_lookup(nd->path.dentry))) {
 			if (nd->flags & LOOKUP_RCU) {
-				if (unlazy_walk(nd))
+				if (!try_to_unlazy(nd))
 					return -ECHILD;
 			}
 			return -ENOTDIR;
@@ -3127,7 +3122,6 @@ static const char *open_last_lookups(struct nameidata *nd,
 	struct inode *inode;
 	struct dentry *dentry;
 	const char *res;
-	int error;
 
 	nd->flags |= op->intent;
 
@@ -3151,9 +3145,8 @@ static const char *open_last_lookups(struct nameidata *nd,
 	} else {
 		/* create side of things */
 		if (nd->flags & LOOKUP_RCU) {
-			error = unlazy_walk(nd);
-			if (unlikely(error))
-				return ERR_PTR(error);
+			if (!try_to_unlazy(nd))
+				return ERR_PTR(-ECHILD);
 		}
 		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
 		/* trailing slashes? */
@@ -3162,9 +3155,7 @@ static const char *open_last_lookups(struct nameidata *nd,
 	}
 
 	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
-		error = mnt_want_write(nd->path.mnt);
-		if (!error)
-			got_write = true;
+		got_write = !mnt_want_write(nd->path.mnt);
 		/*
 		 * do _not_ fail yet - we might not need that or fail with
 		 * a different error; let lookup_open() decide; we'll be
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK
  2020-12-14 19:13 [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK Jens Axboe
  2020-12-14 19:13 ` [PATCH 1/4] fs: make unlazy_walk() error handling consistent Jens Axboe
@ 2020-12-14 19:13 ` Jens Axboe
  2020-12-15 12:24   ` Matthew Wilcox
                     ` (2 more replies)
  2020-12-14 19:13 ` [PATCH 3/4] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK Jens Axboe
                   ` (5 subsequent siblings)
  7 siblings, 3 replies; 46+ messages in thread
From: Jens Axboe @ 2020-12-14 19:13 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: torvalds, viro, Jens Axboe

io_uring always punts opens to async context, since there's no control
over whether the lookup blocks or not. Add LOOKUP_NONBLOCK to support
just doing the fast RCU based lookups, which we know will not block. If
we can do a cached path resolution of the filename, then we don't have
to always punt lookups for a worker.

We explicitly disallow O_CREAT | O_TRUNC opens, as those will require
blocking, and O_TMPFILE as that requires filesystem interactions and
there's currently no way to pass down an attempt to do nonblocking
operations there. This basically boils down to whether or not we can
do the fast path of open or not. If we can't, then return -EAGAIN and
let the caller retry from an appropriate context that can handle
blocking.

During path resolution, we always do LOOKUP_RCU first. If that fails and
we terminate LOOKUP_RCU, then fail a LOOKUP_NONBLOCK attempt as well.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/namei.c            | 27 ++++++++++++++++++++++++++-
 include/linux/namei.h |  1 +
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/fs/namei.c b/fs/namei.c
index 7eb7830da298..83a7f7866232 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -686,6 +686,8 @@ static bool try_to_unlazy(struct nameidata *nd)
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
 
 	nd->flags &= ~LOOKUP_RCU;
+	if (nd->flags & LOOKUP_NONBLOCK)
+		goto out1;
 	if (unlikely(!legitimize_links(nd)))
 		goto out1;
 	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
@@ -722,6 +724,8 @@ static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned se
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
 
 	nd->flags &= ~LOOKUP_RCU;
+	if (nd->flags & LOOKUP_NONBLOCK)
+		goto out2;
 	if (unlikely(!legitimize_links(nd)))
 		goto out2;
 	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
@@ -792,6 +796,7 @@ static int complete_walk(struct nameidata *nd)
 		 */
 		if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED)))
 			nd->root.mnt = NULL;
+		nd->flags &= ~LOOKUP_NONBLOCK;
 		if (!try_to_unlazy(nd))
 			return -ECHILD;
 	}
@@ -2202,6 +2207,10 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 	int error;
 	const char *s = nd->name->name;
 
+	/* LOOKUP_NONBLOCK requires RCU, ask caller to retry */
+	if ((flags & (LOOKUP_RCU | LOOKUP_NONBLOCK)) == LOOKUP_NONBLOCK)
+		return ERR_PTR(-EAGAIN);
+
 	if (!*s)
 		flags &= ~LOOKUP_RCU;
 	if (flags & LOOKUP_RCU)
@@ -3140,6 +3149,12 @@ static const char *open_last_lookups(struct nameidata *nd,
 			return ERR_CAST(dentry);
 		if (likely(dentry))
 			goto finish_lookup;
+		/*
+		 * We can't guarantee nonblocking semantics beyond this, if
+		 * the fast lookup fails.
+		 */
+		if (nd->flags & LOOKUP_NONBLOCK)
+			return ERR_PTR(-EAGAIN);
 
 		BUG_ON(nd->flags & LOOKUP_RCU);
 	} else {
@@ -3233,6 +3248,7 @@ static int do_open(struct nameidata *nd,
 		open_flag &= ~O_TRUNC;
 		acc_mode = 0;
 	} else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
+		WARN_ON_ONCE(nd->flags & LOOKUP_NONBLOCK);
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
 			return error;
@@ -3299,7 +3315,16 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
 {
 	struct dentry *child;
 	struct path path;
-	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
+	int error;
+
+	/*
+	 * We can't guarantee that the fs doesn't block further down, so
+	 * just disallow nonblock attempts at O_TMPFILE for now.
+	 */
+	if (flags & LOOKUP_NONBLOCK)
+		return -EAGAIN;
+
+	error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
 	if (unlikely(error))
 		return error;
 	error = mnt_want_write(path.mnt);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index a4bb992623c4..c36c4e0805fc 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -46,6 +46,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT};
 #define LOOKUP_NO_XDEV		0x040000 /* No mountpoint crossing. */
 #define LOOKUP_BENEATH		0x080000 /* No escaping from starting point. */
 #define LOOKUP_IN_ROOT		0x100000 /* Treat dirfd as fs root. */
+#define LOOKUP_NONBLOCK		0x200000 /* don't block for lookup */
 /* LOOKUP_* flags which do scope-related checks based on the dirfd. */
 #define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT)
 
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [PATCH 3/4] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK
  2020-12-14 19:13 [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK Jens Axboe
  2020-12-14 19:13 ` [PATCH 1/4] fs: make unlazy_walk() error handling consistent Jens Axboe
  2020-12-14 19:13 ` [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK Jens Axboe
@ 2020-12-14 19:13 ` Jens Axboe
  2020-12-15 22:25   ` Dave Chinner
  2020-12-16  2:37   ` Al Viro
  2020-12-14 19:13 ` [PATCH 4/4] io_uring: enable LOOKUP_NONBLOCK path resolution for filename lookups Jens Axboe
                   ` (4 subsequent siblings)
  7 siblings, 2 replies; 46+ messages in thread
From: Jens Axboe @ 2020-12-14 19:13 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: torvalds, viro, Jens Axboe

Now that we support non-blocking path resolution internally, expose it
via openat2() in the struct open_how ->resolve flags. This allows
applications using openat2() to limit path resolution to the extent that
it is already cached.

If the lookup cannot be satisfied in a non-blocking manner, openat2(2)
will return -1/-EAGAIN.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/open.c                    | 6 ++++++
 include/linux/fcntl.h        | 2 +-
 include/uapi/linux/openat2.h | 4 ++++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/open.c b/fs/open.c
index 9af548fb841b..a83434cfe01c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1087,6 +1087,12 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
 		lookup_flags |= LOOKUP_BENEATH;
 	if (how->resolve & RESOLVE_IN_ROOT)
 		lookup_flags |= LOOKUP_IN_ROOT;
+	if (how->resolve & RESOLVE_NONBLOCK) {
+		/* Don't bother even trying for create/truncate open */
+		if (flags & (O_TRUNC | O_CREAT))
+			return -EAGAIN;
+		lookup_flags |= LOOKUP_NONBLOCK;
+	}
 
 	op->lookup_flags = lookup_flags;
 	return 0;
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index 921e750843e6..919a13c9317c 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -19,7 +19,7 @@
 /* List of all valid flags for the how->resolve argument: */
 #define VALID_RESOLVE_FLAGS \
 	(RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
-	 RESOLVE_BENEATH | RESOLVE_IN_ROOT)
+	 RESOLVE_BENEATH | RESOLVE_IN_ROOT | RESOLVE_NONBLOCK)
 
 /* List of all open_how "versions". */
 #define OPEN_HOW_SIZE_VER0	24 /* sizeof first published struct */
diff --git a/include/uapi/linux/openat2.h b/include/uapi/linux/openat2.h
index 58b1eb711360..7bc1d0c35108 100644
--- a/include/uapi/linux/openat2.h
+++ b/include/uapi/linux/openat2.h
@@ -35,5 +35,9 @@ struct open_how {
 #define RESOLVE_IN_ROOT		0x10 /* Make all jumps to "/" and ".."
 					be scoped inside the dirfd
 					(similar to chroot(2)). */
+#define RESOLVE_NONBLOCK	0x20 /* Only complete if resolution can be
+					completed through cached lookup. May
+					return -EAGAIN if that's not
+					possible. */
 
 #endif /* _UAPI_LINUX_OPENAT2_H */
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [PATCH 4/4] io_uring: enable LOOKUP_NONBLOCK path resolution for filename lookups
  2020-12-14 19:13 [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK Jens Axboe
                   ` (2 preceding siblings ...)
  2020-12-14 19:13 ` [PATCH 3/4] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK Jens Axboe
@ 2020-12-14 19:13 ` Jens Axboe
  2020-12-15  3:06 ` [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK Linus Torvalds
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 46+ messages in thread
From: Jens Axboe @ 2020-12-14 19:13 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: torvalds, viro, Jens Axboe

Instead of being pessimistic and assume that path lookup will block, use
LOOKUP_NONBLOCK to attempt just a cached lookup. This ensures that the
fast path is always done inline, and we only punt to async context if
IO is needed to satisfy the lookup.

For forced nonblock open attempts, mark the file O_NONBLOCK over the
actual ->open() call as well. We can safely clear this again before
doing fd_install(), so it'll never be user visible that we fiddled with
it.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 44 ++++++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9d7baf8ba77a..6734a2616990 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -487,7 +487,6 @@ struct io_sr_msg {
 struct io_open {
 	struct file			*file;
 	int				dfd;
-	bool				ignore_nonblock;
 	struct filename			*filename;
 	struct open_how			how;
 	unsigned long			nofile;
@@ -3998,7 +3997,6 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 		return ret;
 	}
 	req->open.nofile = rlimit(RLIMIT_NOFILE);
-	req->open.ignore_nonblock = false;
 	req->flags |= REQ_F_NEED_CLEANUP;
 	return 0;
 }
@@ -4040,39 +4038,45 @@ static int io_openat2(struct io_kiocb *req, bool force_nonblock)
 {
 	struct open_flags op;
 	struct file *file;
+	bool nonblock_set;
 	int ret;
 
-	if (force_nonblock && !req->open.ignore_nonblock)
-		return -EAGAIN;
-
 	ret = build_open_flags(&req->open.how, &op);
 	if (ret)
 		goto err;
+	nonblock_set = op.open_flag & O_NONBLOCK;
+	if (force_nonblock) {
+		/*
+		 * Don't bother trying for O_TRUNC or O_CREAT open, it'll
+		 * always -EAGAIN
+		 */
+		if (req->open.how.flags & (O_TRUNC | O_CREAT))
+			return -EAGAIN;
+		op.lookup_flags |= LOOKUP_NONBLOCK;
+		op.open_flag |= O_NONBLOCK;
+	}
 
 	ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
 	if (ret < 0)
 		goto err;
 
 	file = do_filp_open(req->open.dfd, req->open.filename, &op);
+	if (force_nonblock && file == ERR_PTR(-EAGAIN)) {
+		/*
+		 * We could hang on to this 'fd', but seems like marginal
+		 * gain for something that is now known to be a slower path.
+		 * So just put it, and we'll get a new one when we retry.
+		 */
+		put_unused_fd(ret);
+		return -EAGAIN;
+	}
+
 	if (IS_ERR(file)) {
 		put_unused_fd(ret);
 		ret = PTR_ERR(file);
-		/*
-		 * A work-around to ensure that /proc/self works that way
-		 * that it should - if we get -EOPNOTSUPP back, then assume
-		 * that proc_self_get_link() failed us because we're in async
-		 * context. We should be safe to retry this from the task
-		 * itself with force_nonblock == false set, as it should not
-		 * block on lookup. Would be nice to know this upfront and
-		 * avoid the async dance, but doesn't seem feasible.
-		 */
-		if (ret == -EOPNOTSUPP && io_wq_current_is_worker()) {
-			req->open.ignore_nonblock = true;
-			refcount_inc(&req->refs);
-			io_req_task_queue(req);
-			return 0;
-		}
 	} else {
+		if (force_nonblock && !nonblock_set)
+			file->f_flags &= ~O_NONBLOCK;
 		fsnotify_open(file);
 		fd_install(ret, file);
 	}
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK
  2020-12-14 19:13 [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK Jens Axboe
                   ` (3 preceding siblings ...)
  2020-12-14 19:13 ` [PATCH 4/4] io_uring: enable LOOKUP_NONBLOCK path resolution for filename lookups Jens Axboe
@ 2020-12-15  3:06 ` Linus Torvalds
  2020-12-15  3:18   ` Jens Axboe
  2020-12-15  6:11 ` Al Viro
                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 46+ messages in thread
From: Linus Torvalds @ 2020-12-15  3:06 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-fsdevel, Al Viro

On Mon, Dec 14, 2020 at 11:13 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> I'm pretty happy with this at this point. The core change is very simple,
> and the users end up being trivial too.

It does look very simple.

It strikes me that io_statx would be another user of this. But it
obviously depends on what the actual io_uring users do..

             Linus

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK
  2020-12-15  3:06 ` [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK Linus Torvalds
@ 2020-12-15  3:18   ` Jens Axboe
  0 siblings, 0 replies; 46+ messages in thread
From: Jens Axboe @ 2020-12-15  3:18 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Al Viro

On 12/14/20 8:06 PM, Linus Torvalds wrote:
> On Mon, Dec 14, 2020 at 11:13 AM Jens Axboe <axboe@kernel.dk> wrote:
>>
>> I'm pretty happy with this at this point. The core change is very simple,
>> and the users end up being trivial too.
> 
> It does look very simple.

Agree, hard to imagine it being much simpler than this.

> It strikes me that io_statx would be another user of this. But it
> obviously depends on what the actual io_uring users do..

Indeed, I did mention that in my email from the other thread earlier
today, and I think it's potentially an even bigger deal than nonblock
open. Lots of workloads are very stat intensive. So I did do that:

https://git.kernel.dk/cgit/linux-block/log/?h=nonblock-path-lookup

it's the top three patches there. Basically all local file systems are
fine with AT_STATX_LOOKUP just basically mapping to LOOKUP_NONBLOCK,
various (mostly) networked file systems like to do revalidation and
other kinds of communication as part of getattr. Hence the second patch
there is needed to have some:

if (query_flags & AT_STATX_NONBLOCK)
	return -EAGAIN;

constructs in there to avoid it in the nonblock path. Outside of that,
it's very trivial and just works. I didn't include it in this posting to
avoid detracting from the core work, but I definitely think we should be
doing that as well.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK
  2020-12-14 19:13 [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK Jens Axboe
                   ` (4 preceding siblings ...)
  2020-12-15  3:06 ` [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK Linus Torvalds
@ 2020-12-15  6:11 ` Al Viro
  2020-12-15 15:29   ` Jens Axboe
  2021-01-04  5:31 ` Al Viro
       [not found] ` <m1lfbrwrgq.fsf@fess.ebiederm.org>
  7 siblings, 1 reply; 46+ messages in thread
From: Al Viro @ 2020-12-15  6:11 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-fsdevel, torvalds

On Mon, Dec 14, 2020 at 12:13:20PM -0700, Jens Axboe wrote:
> Hi,
> 
> Wanted to throw out what the current state of this is, as we keep
> getting closer to something palatable.
> 
> This time I've included the io_uring change too. I've tested this through
> both openat2, and through io_uring as well.
> 
> I'm pretty happy with this at this point. The core change is very simple,
> and the users end up being trivial too.

I'll review tomorrow morning (sorry, half-asleep right now)

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK
  2020-12-14 19:13 ` [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK Jens Axboe
@ 2020-12-15 12:24   ` Matthew Wilcox
  2020-12-15 15:29     ` Jens Axboe
  2020-12-16  2:36   ` Al Viro
  2020-12-16  2:43   ` Al Viro
  2 siblings, 1 reply; 46+ messages in thread
From: Matthew Wilcox @ 2020-12-15 12:24 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-fsdevel, torvalds, viro

On Mon, Dec 14, 2020 at 12:13:22PM -0700, Jens Axboe wrote:
> +++ b/fs/namei.c
> @@ -686,6 +686,8 @@ static bool try_to_unlazy(struct nameidata *nd)
>  	BUG_ON(!(nd->flags & LOOKUP_RCU));
>  
>  	nd->flags &= ~LOOKUP_RCU;
> +	if (nd->flags & LOOKUP_NONBLOCK)
> +		goto out1;

If we try a walk in a non-blocking context, it fails, then we punt to
a thread, do we want to prohibit that thread trying an RCU walk first?
I can see arguments both ways -- this may only be a temporary RCU walk
failure, or we may never be able to RCU walk this path.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK
  2020-12-15 12:24   ` Matthew Wilcox
@ 2020-12-15 15:29     ` Jens Axboe
  2020-12-15 15:33       ` Matthew Wilcox
  0 siblings, 1 reply; 46+ messages in thread
From: Jens Axboe @ 2020-12-15 15:29 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-fsdevel, torvalds, viro

On 12/15/20 5:24 AM, Matthew Wilcox wrote:
> On Mon, Dec 14, 2020 at 12:13:22PM -0700, Jens Axboe wrote:
>> +++ b/fs/namei.c
>> @@ -686,6 +686,8 @@ static bool try_to_unlazy(struct nameidata *nd)
>>  	BUG_ON(!(nd->flags & LOOKUP_RCU));
>>  
>>  	nd->flags &= ~LOOKUP_RCU;
>> +	if (nd->flags & LOOKUP_NONBLOCK)
>> +		goto out1;
> 
> If we try a walk in a non-blocking context, it fails, then we punt to
> a thread, do we want to prohibit that thread trying an RCU walk first?
> I can see arguments both ways -- this may only be a temporary RCU walk
> failure, or we may never be able to RCU walk this path.

In my opinion, it's not worth it trying to over complicate matters by
handling the retry side differently. Better to just keep them the
same. We'd need a lookup anyway to avoid aliasing.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK
  2020-12-15  6:11 ` Al Viro
@ 2020-12-15 15:29   ` Jens Axboe
  0 siblings, 0 replies; 46+ messages in thread
From: Jens Axboe @ 2020-12-15 15:29 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, torvalds

On 12/14/20 11:11 PM, Al Viro wrote:
> On Mon, Dec 14, 2020 at 12:13:20PM -0700, Jens Axboe wrote:
>> Hi,
>>
>> Wanted to throw out what the current state of this is, as we keep
>> getting closer to something palatable.
>>
>> This time I've included the io_uring change too. I've tested this through
>> both openat2, and through io_uring as well.
>>
>> I'm pretty happy with this at this point. The core change is very simple,
>> and the users end up being trivial too.
> 
> I'll review tomorrow morning (sorry, half-asleep right now)

Thanks Al, appreciate it.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK
  2020-12-15 15:29     ` Jens Axboe
@ 2020-12-15 15:33       ` Matthew Wilcox
  2020-12-15 15:37         ` Jens Axboe
  0 siblings, 1 reply; 46+ messages in thread
From: Matthew Wilcox @ 2020-12-15 15:33 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-fsdevel, torvalds, viro

On Tue, Dec 15, 2020 at 08:29:40AM -0700, Jens Axboe wrote:
> On 12/15/20 5:24 AM, Matthew Wilcox wrote:
> > On Mon, Dec 14, 2020 at 12:13:22PM -0700, Jens Axboe wrote:
> >> +++ b/fs/namei.c
> >> @@ -686,6 +686,8 @@ static bool try_to_unlazy(struct nameidata *nd)
> >>  	BUG_ON(!(nd->flags & LOOKUP_RCU));
> >>  
> >>  	nd->flags &= ~LOOKUP_RCU;
> >> +	if (nd->flags & LOOKUP_NONBLOCK)
> >> +		goto out1;
> > 
> > If we try a walk in a non-blocking context, it fails, then we punt to
> > a thread, do we want to prohibit that thread trying an RCU walk first?
> > I can see arguments both ways -- this may only be a temporary RCU walk
> > failure, or we may never be able to RCU walk this path.
> 
> In my opinion, it's not worth it trying to over complicate matters by
> handling the retry side differently. Better to just keep them the
> same. We'd need a lookup anyway to avoid aliasing.

but by clearing LOOKUP_RCU here, aren't you making the retry handle
things differently?  maybe i got lost.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK
  2020-12-15 15:33       ` Matthew Wilcox
@ 2020-12-15 15:37         ` Jens Axboe
  2020-12-15 16:08           ` Jens Axboe
  0 siblings, 1 reply; 46+ messages in thread
From: Jens Axboe @ 2020-12-15 15:37 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-fsdevel, torvalds, viro

On 12/15/20 8:33 AM, Matthew Wilcox wrote:
> On Tue, Dec 15, 2020 at 08:29:40AM -0700, Jens Axboe wrote:
>> On 12/15/20 5:24 AM, Matthew Wilcox wrote:
>>> On Mon, Dec 14, 2020 at 12:13:22PM -0700, Jens Axboe wrote:
>>>> +++ b/fs/namei.c
>>>> @@ -686,6 +686,8 @@ static bool try_to_unlazy(struct nameidata *nd)
>>>>  	BUG_ON(!(nd->flags & LOOKUP_RCU));
>>>>  
>>>>  	nd->flags &= ~LOOKUP_RCU;
>>>> +	if (nd->flags & LOOKUP_NONBLOCK)
>>>> +		goto out1;
>>>
>>> If we try a walk in a non-blocking context, it fails, then we punt to
>>> a thread, do we want to prohibit that thread trying an RCU walk first?
>>> I can see arguments both ways -- this may only be a temporary RCU walk
>>> failure, or we may never be able to RCU walk this path.
>>
>> In my opinion, it's not worth it trying to over complicate matters by
>> handling the retry side differently. Better to just keep them the
>> same. We'd need a lookup anyway to avoid aliasing.
> 
> but by clearing LOOKUP_RCU here, aren't you making the retry handle
> things differently?  maybe i got lost.

That's already how it works, I'm just clearing LOOKUP_NONBLOCK (which
relies on LOOKUP_RCU) when we're clearing LOOKUP_RCU. I can try and
benchmark skipping LOOKUP_RCU when we do the blocking retry, but my gut
tells me it'll be noise.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK
  2020-12-15 15:37         ` Jens Axboe
@ 2020-12-15 16:08           ` Jens Axboe
  2020-12-15 16:14             ` Jens Axboe
  2020-12-15 18:29             ` Linus Torvalds
  0 siblings, 2 replies; 46+ messages in thread
From: Jens Axboe @ 2020-12-15 16:08 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-fsdevel, torvalds, viro

On 12/15/20 8:37 AM, Jens Axboe wrote:
> On 12/15/20 8:33 AM, Matthew Wilcox wrote:
>> On Tue, Dec 15, 2020 at 08:29:40AM -0700, Jens Axboe wrote:
>>> On 12/15/20 5:24 AM, Matthew Wilcox wrote:
>>>> On Mon, Dec 14, 2020 at 12:13:22PM -0700, Jens Axboe wrote:
>>>>> +++ b/fs/namei.c
>>>>> @@ -686,6 +686,8 @@ static bool try_to_unlazy(struct nameidata *nd)
>>>>>  	BUG_ON(!(nd->flags & LOOKUP_RCU));
>>>>>  
>>>>>  	nd->flags &= ~LOOKUP_RCU;
>>>>> +	if (nd->flags & LOOKUP_NONBLOCK)
>>>>> +		goto out1;
>>>>
>>>> If we try a walk in a non-blocking context, it fails, then we punt to
>>>> a thread, do we want to prohibit that thread trying an RCU walk first?
>>>> I can see arguments both ways -- this may only be a temporary RCU walk
>>>> failure, or we may never be able to RCU walk this path.
>>>
>>> In my opinion, it's not worth it trying to over complicate matters by
>>> handling the retry side differently. Better to just keep them the
>>> same. We'd need a lookup anyway to avoid aliasing.
>>
>> but by clearing LOOKUP_RCU here, aren't you making the retry handle
>> things differently?  maybe i got lost.
> 
> That's already how it works, I'm just clearing LOOKUP_NONBLOCK (which
> relies on LOOKUP_RCU) when we're clearing LOOKUP_RCU. I can try and
> benchmark skipping LOOKUP_RCU when we do the blocking retry, but my gut
> tells me it'll be noise.

OK, ran some numbers. The test app benchmarks opening X files, I just
used /usr on my test box. That's 182677 files. To mimic real worldy
kind of setups, 33% of the files can be looked up hot, so LOOKUP_NONBLOCK
will succeed.

Patchset as posted:

Method		Time (usec)
---------------------------
openat		2,268,930
openat		2,274,256
openat		2,274,256
io_uring	  917,813
io_uring	  921,448 
io_uring	  915,233

And with a LOOKUP_NO_RCU flag, which io_uring sets when it has to do
retry, and which will make namei skip the first LOOKUP_RCU for path
resolution:

Method		Time (usec)
---------------------------
io_uring	  902,410
io_uring	  902,725
io_uring	  896,289

Definitely not faster - whether that's just reboot noise, or if it's
significant, I'd need to look deeper to figure out.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK
  2020-12-15 16:08           ` Jens Axboe
@ 2020-12-15 16:14             ` Jens Axboe
  2020-12-15 18:29             ` Linus Torvalds
  1 sibling, 0 replies; 46+ messages in thread
From: Jens Axboe @ 2020-12-15 16:14 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-fsdevel, torvalds, viro

On 12/15/20 9:08 AM, Jens Axboe wrote:
> On 12/15/20 8:37 AM, Jens Axboe wrote:
>> On 12/15/20 8:33 AM, Matthew Wilcox wrote:
>>> On Tue, Dec 15, 2020 at 08:29:40AM -0700, Jens Axboe wrote:
>>>> On 12/15/20 5:24 AM, Matthew Wilcox wrote:
>>>>> On Mon, Dec 14, 2020 at 12:13:22PM -0700, Jens Axboe wrote:
>>>>>> +++ b/fs/namei.c
>>>>>> @@ -686,6 +686,8 @@ static bool try_to_unlazy(struct nameidata *nd)
>>>>>>  	BUG_ON(!(nd->flags & LOOKUP_RCU));
>>>>>>  
>>>>>>  	nd->flags &= ~LOOKUP_RCU;
>>>>>> +	if (nd->flags & LOOKUP_NONBLOCK)
>>>>>> +		goto out1;
>>>>>
>>>>> If we try a walk in a non-blocking context, it fails, then we punt to
>>>>> a thread, do we want to prohibit that thread trying an RCU walk first?
>>>>> I can see arguments both ways -- this may only be a temporary RCU walk
>>>>> failure, or we may never be able to RCU walk this path.
>>>>
>>>> In my opinion, it's not worth it trying to over complicate matters by
>>>> handling the retry side differently. Better to just keep them the
>>>> same. We'd need a lookup anyway to avoid aliasing.
>>>
>>> but by clearing LOOKUP_RCU here, aren't you making the retry handle
>>> things differently?  maybe i got lost.
>>
>> That's already how it works, I'm just clearing LOOKUP_NONBLOCK (which
>> relies on LOOKUP_RCU) when we're clearing LOOKUP_RCU. I can try and
>> benchmark skipping LOOKUP_RCU when we do the blocking retry, but my gut
>> tells me it'll be noise.
> 
> OK, ran some numbers. The test app benchmarks opening X files, I just
> used /usr on my test box. That's 182677 files. To mimic real worldy
> kind of setups, 33% of the files can be looked up hot, so LOOKUP_NONBLOCK
> will succeed.
> 
> Patchset as posted:
> 
> Method		Time (usec)
> ---------------------------
> openat		2,268,930
> openat		2,274,256
> openat		2,274,256
> io_uring	  917,813
> io_uring	  921,448 
> io_uring	  915,233
> 
> And with a LOOKUP_NO_RCU flag, which io_uring sets when it has to do
> retry, and which will make namei skip the first LOOKUP_RCU for path
> resolution:
> 
> Method		Time (usec)
> ---------------------------
> io_uring	  902,410
> io_uring	  902,725
> io_uring	  896,289
> 
> Definitely not faster - whether that's just reboot noise, or if it's
> significant, I'd need to look deeper to figure out.

If you're puzzled by the conclusion based on the numbers, there's a good
reason. The first table is io_uring + LOOKUP_NO_RCU for retry, second
table is io_uring as posted. I mistakenly swapped the numbers around...

So conclusion still stands, I just pasted in the wrong set for the
table.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK
  2020-12-15 16:08           ` Jens Axboe
  2020-12-15 16:14             ` Jens Axboe
@ 2020-12-15 18:29             ` Linus Torvalds
  2020-12-15 18:44               ` Jens Axboe
  1 sibling, 1 reply; 46+ messages in thread
From: Linus Torvalds @ 2020-12-15 18:29 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Matthew Wilcox, linux-fsdevel, Al Viro

On Tue, Dec 15, 2020 at 8:08 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> OK, ran some numbers. The test app benchmarks opening X files, I just
> used /usr on my test box. That's 182677 files. To mimic real worldy
> kind of setups, 33% of the files can be looked up hot, so LOOKUP_NONBLOCK
> will succeed.

Perhaps more interestingly, what's the difference between the patchset
as posted for just io_uring?

IOW, does the synchronous LOOKUP_NONBLOCK actually help?

I'm obviously a big believer in the whole "avoid thread setup costs if
not necessary", so I'd _expect_ it to help, but maybe the possible
extra parallelism is enough to overcome the thread setup and
synchronization costs even for a fast cached RCU lookup.

(I also suspect the reality is often much closer to 100% cached
lookups than just 33%, but who knows - there are things like just
concurrent renames that can cause the RCU lookup to fail even if it
_was_ cached, so it's not purely about whether things are in the
dcache or not).

              Linus

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK
  2020-12-15 18:29             ` Linus Torvalds
@ 2020-12-15 18:44               ` Jens Axboe
  2020-12-15 18:47                 ` Linus Torvalds
  0 siblings, 1 reply; 46+ messages in thread
From: Jens Axboe @ 2020-12-15 18:44 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Matthew Wilcox, linux-fsdevel, Al Viro

On 12/15/20 11:29 AM, Linus Torvalds wrote:
> On Tue, Dec 15, 2020 at 8:08 AM Jens Axboe <axboe@kernel.dk> wrote:
>>
>> OK, ran some numbers. The test app benchmarks opening X files, I just
>> used /usr on my test box. That's 182677 files. To mimic real worldy
>> kind of setups, 33% of the files can be looked up hot, so LOOKUP_NONBLOCK
>> will succeed.
> 
> Perhaps more interestingly, what's the difference between the patchset
> as posted for just io_uring?
> 
> IOW, does the synchronous LOOKUP_NONBLOCK actually help?
> 
> I'm obviously a big believer in the whole "avoid thread setup costs if
> not necessary", so I'd _expect_ it to help, but maybe the possible
> extra parallelism is enough to overcome the thread setup and
> synchronization costs even for a fast cached RCU lookup.

For basically all cases on the io_uring side where I've ended up being
able to do the hot/fast path inline, it's been a nice win. The only real
exception to that rule is buffered reads that are fully cached, and
having multiple async workers copy the data is obviously always going to
be faster at some point due to the extra parallelism and memory
bandwidth. So yes, I too am a big believer in being able to perform
operations inline if at all possible, even if for some thing it turns
into a full retry when we fail. The hot path more than makes up for it.

> (I also suspect the reality is often much closer to 100% cached
> lookups than just 33%, but who knows - there are things like just
> concurrent renames that can cause the RCU lookup to fail even if it
> _was_ cached, so it's not purely about whether things are in the
> dcache or not).

In usecs again, same test, this time just using io_uring:

Cached		5.10-git	5.10-git+LOOKUP_NONBLOCK
--------------------------------------------------------
33%		1,014,975	900,474
100%		435,636		151,475

As expected, the closer we get to fully cached, the better off we are
with using LOOKUP_NONBLOCK. It's a nice win even at just 33% cached.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK
  2020-12-15 18:44               ` Jens Axboe
@ 2020-12-15 18:47                 ` Linus Torvalds
  2020-12-15 19:03                   ` Jens Axboe
  0 siblings, 1 reply; 46+ messages in thread
From: Linus Torvalds @ 2020-12-15 18:47 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Matthew Wilcox, linux-fsdevel, Al Viro

On Tue, Dec 15, 2020 at 10:44 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> In usecs again, same test, this time just using io_uring:
>
> Cached          5.10-git        5.10-git+LOOKUP_NONBLOCK
> --------------------------------------------------------
> 33%             1,014,975       900,474
> 100%            435,636         151,475

Ok, that's quite convincing. Thanks,

            Linus

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK
  2020-12-15 18:47                 ` Linus Torvalds
@ 2020-12-15 19:03                   ` Jens Axboe
  2020-12-15 19:32                     ` Linus Torvalds
  0 siblings, 1 reply; 46+ messages in thread
From: Jens Axboe @ 2020-12-15 19:03 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Matthew Wilcox, linux-fsdevel, Al Viro

On 12/15/20 11:47 AM, Linus Torvalds wrote:
> On Tue, Dec 15, 2020 at 10:44 AM Jens Axboe <axboe@kernel.dk> wrote:
>>
>> In usecs again, same test, this time just using io_uring:
>>
>> Cached          5.10-git        5.10-git+LOOKUP_NONBLOCK
>> --------------------------------------------------------
>> 33%             1,014,975       900,474
>> 100%            435,636         151,475
> 
> Ok, that's quite convincing. Thanks,

Just for completeness, here's 89% (tool is pretty basic, closest I can
get to 90%):

Cached          5.10-git        5.10-git+LOOKUP_NONBLOCK
--------------------------------------------------------
89%             545,466		292,937

which is about an 1.8x win, where 100% is a 2.8x win.

And for comparison, doing the same thing with the regular openat2()
system call takes 867,897 usec.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK
  2020-12-15 19:03                   ` Jens Axboe
@ 2020-12-15 19:32                     ` Linus Torvalds
  2020-12-15 19:38                       ` Jens Axboe
  0 siblings, 1 reply; 46+ messages in thread
From: Linus Torvalds @ 2020-12-15 19:32 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Matthew Wilcox, linux-fsdevel, Al Viro

On Tue, Dec 15, 2020 at 11:03 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> And for comparison, doing the same thing with the regular openat2()
> system call takes 867,897 usec.

Well, you could always thread it. That's what git does for its
'stat()' loop for 'git diff' and friends (the whole "check that the
index is up-to-date with all the files in the working tree" is
basically just a lot of lstat() calls).

           Linus

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK
  2020-12-15 19:32                     ` Linus Torvalds
@ 2020-12-15 19:38                       ` Jens Axboe
  0 siblings, 0 replies; 46+ messages in thread
From: Jens Axboe @ 2020-12-15 19:38 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Matthew Wilcox, linux-fsdevel, Al Viro

On 12/15/20 12:32 PM, Linus Torvalds wrote:
> On Tue, Dec 15, 2020 at 11:03 AM Jens Axboe <axboe@kernel.dk> wrote:
>>
>> And for comparison, doing the same thing with the regular openat2()
>> system call takes 867,897 usec.
> 
> Well, you could always thread it. That's what git does for its
> 'stat()' loop for 'git diff' and friends (the whole "check that the
> index is up-to-date with all the files in the working tree" is
> basically just a lot of lstat() calls).

Sure, but imho the whole point of LOOKUP_NONBLOCK + io_uring is that you
don't have to worry about (or deal with) threading. As the offload
numbers prove, offloading when you don't have to is always going to be a
big loss. I like the model where the fast path is just done the right
way, and the slow path is handled automatically, a whole lot more than
chunking it off like that.

Granted that's less of an issue if the workload is "stat this crap ton
of files", it's more difficult when you're doing it in a more piecemeal
fashion as needed.

Actually been on my list to use io_uring stat with git, but needed to
get the LOOKUP_NONBLOCK done first...

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 3/4] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK
  2020-12-14 19:13 ` [PATCH 3/4] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK Jens Axboe
@ 2020-12-15 22:25   ` Dave Chinner
  2020-12-15 22:31     ` Linus Torvalds
  2020-12-16  2:37   ` Al Viro
  1 sibling, 1 reply; 46+ messages in thread
From: Dave Chinner @ 2020-12-15 22:25 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-fsdevel, torvalds, viro

On Mon, Dec 14, 2020 at 12:13:23PM -0700, Jens Axboe wrote:
> Now that we support non-blocking path resolution internally, expose it
> via openat2() in the struct open_how ->resolve flags. This allows
> applications using openat2() to limit path resolution to the extent that
> it is already cached.
> 
> If the lookup cannot be satisfied in a non-blocking manner, openat2(2)
> will return -1/-EAGAIN.
> 
> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  fs/open.c                    | 6 ++++++
>  include/linux/fcntl.h        | 2 +-
>  include/uapi/linux/openat2.h | 4 ++++

What text are you going to add to the man page to describe how this
flag behaves to developers?

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 3/4] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK
  2020-12-15 22:25   ` Dave Chinner
@ 2020-12-15 22:31     ` Linus Torvalds
  2020-12-15 23:25       ` Jens Axboe
  0 siblings, 1 reply; 46+ messages in thread
From: Linus Torvalds @ 2020-12-15 22:31 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Jens Axboe, linux-fsdevel, Al Viro

On Tue, Dec 15, 2020 at 2:25 PM Dave Chinner <david@fromorbit.com> wrote:
>
> What text are you going to add to the man page to describe how this
> flag behaves to developers?

I think it was you or Jens who suggested renaming it to RESOLVE_CACHED
(and LOOKUP_CACHED), and I think that would be a good idea.

Make it explicit that this isn't primarily about avoiding blocking,
but about only doing name lookups that can be resolved directly from
the dcache.

Even in some other contexts, the traditional NONBLOCK naming isn't
necessarily about not blocking. Lots of operations tend to block even
for things that are requested to be "nonblocking", and in fact they
can even cause IO (ie a "nonblocking" read can and will cause IO when
it copies to user space, because _that_ part isn't nonblocking).

        Linus

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 3/4] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK
  2020-12-15 22:31     ` Linus Torvalds
@ 2020-12-15 23:25       ` Jens Axboe
  0 siblings, 0 replies; 46+ messages in thread
From: Jens Axboe @ 2020-12-15 23:25 UTC (permalink / raw)
  To: Linus Torvalds, Dave Chinner; +Cc: linux-fsdevel, Al Viro

On 12/15/20 3:31 PM, Linus Torvalds wrote:
> On Tue, Dec 15, 2020 at 2:25 PM Dave Chinner <david@fromorbit.com> wrote:
>>
>> What text are you going to add to the man page to describe how this
>> flag behaves to developers?
> 
> I think it was you or Jens who suggested renaming it to RESOLVE_CACHED
> (and LOOKUP_CACHED), and I think that would be a good idea.

Yeah that was me, and I think it helps both internally in terms of the
code being easier/better to read, and when exposed as a user API as
well. It makes it readily apparent what it does, which is much better
than requring lengthy descriptions of it...

I'll go ahead and make the edit.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK
  2020-12-14 19:13 ` [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK Jens Axboe
  2020-12-15 12:24   ` Matthew Wilcox
@ 2020-12-16  2:36   ` Al Viro
  2020-12-16  3:30     ` Jens Axboe
  2020-12-16  2:43   ` Al Viro
  2 siblings, 1 reply; 46+ messages in thread
From: Al Viro @ 2020-12-16  2:36 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-fsdevel, torvalds

On Mon, Dec 14, 2020 at 12:13:22PM -0700, Jens Axboe wrote:
> io_uring always punts opens to async context, since there's no control
> over whether the lookup blocks or not. Add LOOKUP_NONBLOCK to support
> just doing the fast RCU based lookups, which we know will not block. If
> we can do a cached path resolution of the filename, then we don't have
> to always punt lookups for a worker.
> 
> We explicitly disallow O_CREAT | O_TRUNC opens, as those will require
> blocking, and O_TMPFILE as that requires filesystem interactions and
> there's currently no way to pass down an attempt to do nonblocking
> operations there. This basically boils down to whether or not we can
> do the fast path of open or not. If we can't, then return -EAGAIN and
> let the caller retry from an appropriate context that can handle
> blocking.
> 
> During path resolution, we always do LOOKUP_RCU first. If that fails and
> we terminate LOOKUP_RCU, then fail a LOOKUP_NONBLOCK attempt as well.

Ho-hum...  FWIW, I'm tempted to do the same change of calling conventions
for unlazy_child() (try_to_unlazy_child(), true on success).  OTOH, the
call site is right next to removal of unlikely(status == -ECHILD) suggested
a few days ago...

Mind if I take your first commit + that removal of unlikely + change of calling
conventions for unlazy_child() into #work.namei (based at 5.10), so that
the rest of your series got rebased on top of that?

> @@ -3299,7 +3315,16 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
>  {
>  	struct dentry *child;
>  	struct path path;
> -	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
> +	int error;
> +
> +	/*
> +	 * We can't guarantee that the fs doesn't block further down, so
> +	 * just disallow nonblock attempts at O_TMPFILE for now.
> +	 */
> +	if (flags & LOOKUP_NONBLOCK)
> +		return -EAGAIN;

Not sure I like it here, TBH...

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 3/4] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK
  2020-12-14 19:13 ` [PATCH 3/4] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK Jens Axboe
  2020-12-15 22:25   ` Dave Chinner
@ 2020-12-16  2:37   ` Al Viro
  2020-12-16  3:39     ` Linus Torvalds
  1 sibling, 1 reply; 46+ messages in thread
From: Al Viro @ 2020-12-16  2:37 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-fsdevel, torvalds

On Mon, Dec 14, 2020 at 12:13:23PM -0700, Jens Axboe wrote:
> Now that we support non-blocking path resolution internally, expose it
> via openat2() in the struct open_how ->resolve flags. This allows
> applications using openat2() to limit path resolution to the extent that
> it is already cached.
> 
> If the lookup cannot be satisfied in a non-blocking manner, openat2(2)
> will return -1/-EAGAIN.
> 
> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  fs/open.c                    | 6 ++++++
>  include/linux/fcntl.h        | 2 +-
>  include/uapi/linux/openat2.h | 4 ++++
>  3 files changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/open.c b/fs/open.c
> index 9af548fb841b..a83434cfe01c 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -1087,6 +1087,12 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
>  		lookup_flags |= LOOKUP_BENEATH;
>  	if (how->resolve & RESOLVE_IN_ROOT)
>  		lookup_flags |= LOOKUP_IN_ROOT;
> +	if (how->resolve & RESOLVE_NONBLOCK) {
> +		/* Don't bother even trying for create/truncate open */
> +		if (flags & (O_TRUNC | O_CREAT))
> +			return -EAGAIN;

Why not O_TMPFILE here as well?

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK
  2020-12-14 19:13 ` [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK Jens Axboe
  2020-12-15 12:24   ` Matthew Wilcox
  2020-12-16  2:36   ` Al Viro
@ 2020-12-16  2:43   ` Al Viro
  2020-12-16  3:32     ` Jens Axboe
  2 siblings, 1 reply; 46+ messages in thread
From: Al Viro @ 2020-12-16  2:43 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-fsdevel, torvalds

On Mon, Dec 14, 2020 at 12:13:22PM -0700, Jens Axboe wrote:
> @@ -3140,6 +3149,12 @@ static const char *open_last_lookups(struct nameidata *nd,
>  			return ERR_CAST(dentry);
>  		if (likely(dentry))
>  			goto finish_lookup;
> +		/*
> +		 * We can't guarantee nonblocking semantics beyond this, if
> +		 * the fast lookup fails.
> +		 */
> +		if (nd->flags & LOOKUP_NONBLOCK)
> +			return ERR_PTR(-EAGAIN);
>  
>  		BUG_ON(nd->flags & LOOKUP_RCU);

That can't be right - we already must have removed LOOKUP_RCU here
(see BUG_ON() right after that point).  What is that test supposed
to catch?

What am I missing here?

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK
  2020-12-16  2:36   ` Al Viro
@ 2020-12-16  3:30     ` Jens Axboe
  0 siblings, 0 replies; 46+ messages in thread
From: Jens Axboe @ 2020-12-16  3:30 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, torvalds

On 12/15/20 7:36 PM, Al Viro wrote:
> On Mon, Dec 14, 2020 at 12:13:22PM -0700, Jens Axboe wrote:
>> io_uring always punts opens to async context, since there's no control
>> over whether the lookup blocks or not. Add LOOKUP_NONBLOCK to support
>> just doing the fast RCU based lookups, which we know will not block. If
>> we can do a cached path resolution of the filename, then we don't have
>> to always punt lookups for a worker.
>>
>> We explicitly disallow O_CREAT | O_TRUNC opens, as those will require
>> blocking, and O_TMPFILE as that requires filesystem interactions and
>> there's currently no way to pass down an attempt to do nonblocking
>> operations there. This basically boils down to whether or not we can
>> do the fast path of open or not. If we can't, then return -EAGAIN and
>> let the caller retry from an appropriate context that can handle
>> blocking.
>>
>> During path resolution, we always do LOOKUP_RCU first. If that fails and
>> we terminate LOOKUP_RCU, then fail a LOOKUP_NONBLOCK attempt as well.
> 
> Ho-hum...  FWIW, I'm tempted to do the same change of calling
> conventions for unlazy_child() (try_to_unlazy_child(), true on
> success).  OTOH, the call site is right next to removal of
> unlikely(status == -ECHILD) suggested a few days ago...
> 
> Mind if I take your first commit + that removal of unlikely + change
> of calling conventions for unlazy_child() into #work.namei (based at
> 5.10), so that the rest of your series got rebased on top of that?

Of course, go ahead.

>> @@ -3299,7 +3315,16 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
>>  {
>>  	struct dentry *child;
>>  	struct path path;
>> -	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
>> +	int error;
>> +
>> +	/*
>> +	 * We can't guarantee that the fs doesn't block further down, so
>> +	 * just disallow nonblock attempts at O_TMPFILE for now.
>> +	 */
>> +	if (flags & LOOKUP_NONBLOCK)
>> +		return -EAGAIN;
> 
> Not sure I like it here, TBH...

This ties in with the later email, so you'd prefer to gate this upfront
instead of putting it in here? I'm fine with that.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK
  2020-12-16  2:43   ` Al Viro
@ 2020-12-16  3:32     ` Jens Axboe
  0 siblings, 0 replies; 46+ messages in thread
From: Jens Axboe @ 2020-12-16  3:32 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, torvalds

On 12/15/20 7:43 PM, Al Viro wrote:
> On Mon, Dec 14, 2020 at 12:13:22PM -0700, Jens Axboe wrote:
>> @@ -3140,6 +3149,12 @@ static const char *open_last_lookups(struct nameidata *nd,
>>  			return ERR_CAST(dentry);
>>  		if (likely(dentry))
>>  			goto finish_lookup;
>> +		/*
>> +		 * We can't guarantee nonblocking semantics beyond this, if
>> +		 * the fast lookup fails.
>> +		 */
>> +		if (nd->flags & LOOKUP_NONBLOCK)
>> +			return ERR_PTR(-EAGAIN);
>>  
>>  		BUG_ON(nd->flags & LOOKUP_RCU);
> 
> That can't be right - we already must have removed LOOKUP_RCU here
> (see BUG_ON() right after that point).  What is that test supposed
> to catch?
> 
> What am I missing here?

Nothing I think, doesn't look like it's needed. If we don't return
a valid dentry under LOOKUP_RCU, we will indeed have unlazied at
this point. So this hunk can go.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 3/4] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK
  2020-12-16  2:37   ` Al Viro
@ 2020-12-16  3:39     ` Linus Torvalds
  0 siblings, 0 replies; 46+ messages in thread
From: Linus Torvalds @ 2020-12-16  3:39 UTC (permalink / raw)
  To: Al Viro; +Cc: Jens Axboe, linux-fsdevel

On Tue, Dec 15, 2020 at 6:37 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> > +     if (how->resolve & RESOLVE_NONBLOCK) {
> > +             /* Don't bother even trying for create/truncate open */
> > +             if (flags & (O_TRUNC | O_CREAT))
> > +                     return -EAGAIN;
>
> Why not O_TMPFILE here as well?

Yup, I think that's just missing.

           Linus

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK
  2020-12-14 19:13 [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK Jens Axboe
                   ` (5 preceding siblings ...)
  2020-12-15  6:11 ` Al Viro
@ 2021-01-04  5:31 ` Al Viro
  2021-01-04 14:43   ` Jens Axboe
       [not found] ` <m1lfbrwrgq.fsf@fess.ebiederm.org>
  7 siblings, 1 reply; 46+ messages in thread
From: Al Viro @ 2021-01-04  5:31 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-fsdevel, torvalds

On Mon, Dec 14, 2020 at 12:13:20PM -0700, Jens Axboe wrote:
> Hi,
> 
> Wanted to throw out what the current state of this is, as we keep
> getting closer to something palatable.
> 
> This time I've included the io_uring change too. I've tested this through
> both openat2, and through io_uring as well.
> 
> I'm pretty happy with this at this point. The core change is very simple,
> and the users end up being trivial too.
> 
> Also available here:
> 
> https://git.kernel.dk/cgit/linux-block/log/?h=nonblock-path-lookup

OK, pushed with modifications into vfs.git #work.namei

Changes: dropped a couple of pointless pieces in open_last_lookups()/do_open(),
moved O_TMPFILE rejection into build_open_flags() (i.e. in the third of your
commits).  And no io_uring stuff in there - your #4 is absent.

I've not put it into #for-next yet; yell if you see any problems with that
branch, or it'll end up there ;-)

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK
  2021-01-04  5:31 ` Al Viro
@ 2021-01-04 14:43   ` Jens Axboe
  2021-01-04 16:54     ` Al Viro
  0 siblings, 1 reply; 46+ messages in thread
From: Jens Axboe @ 2021-01-04 14:43 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, torvalds

On 1/3/21 10:31 PM, Al Viro wrote:
> On Mon, Dec 14, 2020 at 12:13:20PM -0700, Jens Axboe wrote:
>> Hi,
>>
>> Wanted to throw out what the current state of this is, as we keep
>> getting closer to something palatable.
>>
>> This time I've included the io_uring change too. I've tested this through
>> both openat2, and through io_uring as well.
>>
>> I'm pretty happy with this at this point. The core change is very simple,
>> and the users end up being trivial too.
>>
>> Also available here:
>>
>> https://git.kernel.dk/cgit/linux-block/log/?h=nonblock-path-lookup
> 
> OK, pushed with modifications into vfs.git #work.namei
> 
> Changes: dropped a couple of pointless pieces in open_last_lookups()/do_open(),
> moved O_TMPFILE rejection into build_open_flags() (i.e. in the third of your
> commits).  And no io_uring stuff in there - your #4 is absent.
> 
> I've not put it into #for-next yet; yell if you see any problems with that
> branch, or it'll end up there ;-)

Thanks Al - but you picked out of v3, not v4. Not that there are huge
changes between the two, from the posting of v4:

- Rename LOOKUP_NONBLOCK -> LOOKUP_CACHED, and ditto for the RESOLVE_
  flag. This better explains what the feature does, making it more self
  explanatory in terms of both code readability and for the user visible
  part.

- Remove dead LOOKUP_NONBLOCK check after we've dropped LOOKUP_RCU
  already, spotted by Al.

- Add O_TMPFILE to the checks upfront, so we can drop the checking in
  do_tmpfile().

and it sounds like you did the last two when merging yourself. I do like
LOOKUP_CACHED better than LOOKUP_NONBLOCK, mostly for the externally
self-documenting feature of it. What do you think?

Here's the v4 posting, fwiw:

https://lore.kernel.org/linux-fsdevel/20201217161911.743222-1-axboe@kernel.dk/

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK
  2021-01-04 14:43   ` Jens Axboe
@ 2021-01-04 16:54     ` Al Viro
  2021-01-04 17:03       ` Jens Axboe
  0 siblings, 1 reply; 46+ messages in thread
From: Al Viro @ 2021-01-04 16:54 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-fsdevel, torvalds

On Mon, Jan 04, 2021 at 07:43:17AM -0700, Jens Axboe wrote:

> > I've not put it into #for-next yet; yell if you see any problems with that
> > branch, or it'll end up there ;-)
> 
> Thanks Al - but you picked out of v3, not v4. Not that there are huge
> changes between the two, from the posting of v4:
> 
> - Rename LOOKUP_NONBLOCK -> LOOKUP_CACHED, and ditto for the RESOLVE_
>   flag. This better explains what the feature does, making it more self
>   explanatory in terms of both code readability and for the user visible
>   part.
> 
> - Remove dead LOOKUP_NONBLOCK check after we've dropped LOOKUP_RCU
>   already, spotted by Al.
> 
> - Add O_TMPFILE to the checks upfront, so we can drop the checking in
>   do_tmpfile().
> 
> and it sounds like you did the last two when merging yourself.

Yes - back when I'd posted that review.

> I do like
> LOOKUP_CACHED better than LOOKUP_NONBLOCK, mostly for the externally
> self-documenting feature of it. What do you think?

Agreed, especially since _NONBLOCK would confuse users into assumption
that operation is actually non-blocking...

> Here's the v4 posting, fwiw:
> 
> https://lore.kernel.org/linux-fsdevel/20201217161911.743222-1-axboe@kernel.dk/

Sorry, picked from the local branch that sat around since Mid-December ;-/
Fixed.  Another change: ..._child part in unlazy_child() is misleading -
it might as well be used for .. traversal, where dentry is usually the
_parent_ of nd->path.dentry.  The real constraint here is that dentry/seq pair
had been valid next position at some point during the RCU walk.  Renamed to
try_to_unlazy_next(), (hopefully) fixed the comment...

Updated variant force-pushed.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK
  2021-01-04 16:54     ` Al Viro
@ 2021-01-04 17:03       ` Jens Axboe
  0 siblings, 0 replies; 46+ messages in thread
From: Jens Axboe @ 2021-01-04 17:03 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, torvalds

On 1/4/21 9:54 AM, Al Viro wrote:
> On Mon, Jan 04, 2021 at 07:43:17AM -0700, Jens Axboe wrote:
> 
>>> I've not put it into #for-next yet; yell if you see any problems with that
>>> branch, or it'll end up there ;-)
>>
>> Thanks Al - but you picked out of v3, not v4. Not that there are huge
>> changes between the two, from the posting of v4:
>>
>> - Rename LOOKUP_NONBLOCK -> LOOKUP_CACHED, and ditto for the RESOLVE_
>>   flag. This better explains what the feature does, making it more self
>>   explanatory in terms of both code readability and for the user visible
>>   part.
>>
>> - Remove dead LOOKUP_NONBLOCK check after we've dropped LOOKUP_RCU
>>   already, spotted by Al.
>>
>> - Add O_TMPFILE to the checks upfront, so we can drop the checking in
>>   do_tmpfile().
>>
>> and it sounds like you did the last two when merging yourself.
> 
> Yes - back when I'd posted that review.

Gotcha

>> I do like
>> LOOKUP_CACHED better than LOOKUP_NONBLOCK, mostly for the externally
>> self-documenting feature of it. What do you think?
> 
> Agreed, especially since _NONBLOCK would confuse users into assumption
> that operation is actually non-blocking...
> 
>> Here's the v4 posting, fwiw:
>>
>> https://lore.kernel.org/linux-fsdevel/20201217161911.743222-1-axboe@kernel.dk/
> 
> Sorry, picked from the local branch that sat around since Mid-December ;-/
> Fixed.  Another change: ..._child part in unlazy_child() is misleading -
> it might as well be used for .. traversal, where dentry is usually the
> _parent_ of nd->path.dentry.  The real constraint here is that dentry/seq pair
> had been valid next position at some point during the RCU walk.  Renamed to
> try_to_unlazy_next(), (hopefully) fixed the comment...
> 
> Updated variant force-pushed.

All good, looks good to me from a quick look. A diff between the two
bases just show seems sane. I'll pull it in and rebase on top of that,
and re-run the testing just to make sure that no ill effects snuck in
there with the changes.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK (Insufficiently faking current?)
       [not found] ` <m1lfbrwrgq.fsf@fess.ebiederm.org>
@ 2021-02-14 16:38   ` Jens Axboe
  2021-02-14 20:30     ` Linus Torvalds
  2021-02-15 17:56     ` Eric W. Biederman
  0 siblings, 2 replies; 46+ messages in thread
From: Jens Axboe @ 2021-02-14 16:38 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: linux-fsdevel, torvalds, viro

On 2/13/21 3:08 PM, Eric W. Biederman wrote:
> Jens Axboe <axboe@kernel.dk> writes:
> 
>> Hi,
>>
>> Wanted to throw out what the current state of this is, as we keep
>> getting closer to something palatable.
>>
>> This time I've included the io_uring change too. I've tested this through
>> both openat2, and through io_uring as well.
>>
>> I'm pretty happy with this at this point. The core change is very simple,
>> and the users end up being trivial too.
>>
>> Also available here:
>>
>> https://git.kernel.dk/cgit/linux-block/log/?h=nonblock-path-lookup
>>
>> Changes since v2:
>>
>> - Simplify the LOOKUP_NONBLOCK to _just_ cover LOOKUP_RCU and
>>   lookup_fast(), as per Linus's suggestion. I verified fast path
>>   operations are indeed just that, and it avoids having to mess with
>>   the inode lock and mnt_want_write() completely.
>>
>> - Make O_TMPFILE return -EAGAIN for LOOKUP_NONBLOCK.
>>
>> - Improve/add a few comments.
>>
>> - Folded what was left of the open part of LOOKUP_NONBLOCK into the
>>   main patch.
> 
> I have to ask.  Are you doing work to verify that performing
> path traversals and opening of files yields the same results
> when passed to io_uring as it does when performed by the caller?
> 
> Looking at v5.11-rc7 it appears I can arbitrarily access the
> io-wq thread in proc by traversing "/proc/thread-self/".

Yes that looks like a bug, it needs similar treatment to /proc/self:

diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index a553273fbd41..e8ca19371a36 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -17,6 +17,13 @@ static const char *proc_thread_self_get_link(struct dentry *dentry,
 	pid_t pid = task_pid_nr_ns(current, ns);
 	char *name;
 
+	/*
+	 * Not currently supported. Once we can inherit all of struct pid,
+	 * we can allow this.
+	 */
+	if (current->flags & PF_KTHREAD)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	if (!pid)
 		return ERR_PTR(-ENOENT);
 	name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);

as was done in this commit:

commit 8d4c3e76e3be11a64df95ddee52e99092d42fc19
Author: Jens Axboe <axboe@kernel.dk>
Date:   Fri Nov 13 16:47:52 2020 -0700

    proc: don't allow async path resolution of /proc/self components

> Similarly it looks like opening of "/dev/tty" fails to
> return the tty of the caller but instead fails because
> io-wq threads don't have a tty.
> 
> 
> There are a non-trivial number of places where current is used in the
> kernel both in path traversal and in opening files, and I may be blind
> but I have not see any work to find all of those places and make certain
> they are safe when called from io_uring.  That worries me as that using
> a kernel thread instead of a user thread could potential lead to
> privilege escalation.

I've got a patch queued up for 5.12 that clears ->fs and ->files for the
thread if not explicitly inherited, and I'm working on similarly
proactively catching these cases that could potentially be problematic.

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK (Insufficiently faking current?)
  2021-02-14 16:38   ` [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK (Insufficiently faking current?) Jens Axboe
@ 2021-02-14 20:30     ` Linus Torvalds
  2021-02-14 21:24       ` Al Viro
  2021-02-15 18:07       ` Eric W. Biederman
  2021-02-15 17:56     ` Eric W. Biederman
  1 sibling, 2 replies; 46+ messages in thread
From: Linus Torvalds @ 2021-02-14 20:30 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Eric W. Biederman, linux-fsdevel, Al Viro

On Sun, Feb 14, 2021 at 8:38 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> > Similarly it looks like opening of "/dev/tty" fails to
> > return the tty of the caller but instead fails because
> > io-wq threads don't have a tty.
>
> I've got a patch queued up for 5.12 that clears ->fs and ->files for the
> thread if not explicitly inherited, and I'm working on similarly
> proactively catching these cases that could potentially be problematic.

Well, the /dev/tty case still needs fixing somehow.

Opening /dev/tty actually depends on current->signal, and if it is
NULL it will fall back on the first VT console instead (I think).

I wonder if it should do the same thing /proc/self does..

           Linus

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK (Insufficiently faking current?)
  2021-02-14 20:30     ` Linus Torvalds
@ 2021-02-14 21:24       ` Al Viro
  2021-02-15 18:07       ` Eric W. Biederman
  1 sibling, 0 replies; 46+ messages in thread
From: Al Viro @ 2021-02-14 21:24 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jens Axboe, Eric W. Biederman, linux-fsdevel

On Sun, Feb 14, 2021 at 12:30:07PM -0800, Linus Torvalds wrote:
> On Sun, Feb 14, 2021 at 8:38 AM Jens Axboe <axboe@kernel.dk> wrote:
> >
> > > Similarly it looks like opening of "/dev/tty" fails to
> > > return the tty of the caller but instead fails because
> > > io-wq threads don't have a tty.
> >
> > I've got a patch queued up for 5.12 that clears ->fs and ->files for the
> > thread if not explicitly inherited, and I'm working on similarly
> > proactively catching these cases that could potentially be problematic.
> 
> Well, the /dev/tty case still needs fixing somehow.
> 
> Opening /dev/tty actually depends on current->signal, and if it is
> NULL it will fall back on the first VT console instead (I think).
> 
> I wonder if it should do the same thing /proc/self does..

	I still think that entire "offload pathname resolution to helper
threads" thing is bollocks.  ->fs, ->files and ->signal is still nowhere
near enough - look at /proc/net, for example.  Or netns-sensitive parts
of sysfs, for that matter...  And that's not going into really weird crap
where opener is very special and assumed to be doing all IO.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK (Insufficiently faking current?)
  2021-02-14 16:38   ` [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK (Insufficiently faking current?) Jens Axboe
  2021-02-14 20:30     ` Linus Torvalds
@ 2021-02-15 17:56     ` Eric W. Biederman
  1 sibling, 0 replies; 46+ messages in thread
From: Eric W. Biederman @ 2021-02-15 17:56 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-fsdevel, torvalds, viro

Jens Axboe <axboe@kernel.dk> writes:

> On 2/13/21 3:08 PM, Eric W. Biederman wrote:
>> 
>> I have to ask.  Are you doing work to verify that performing
>> path traversals and opening of files yields the same results
>> when passed to io_uring as it does when performed by the caller?
>> 
>> Looking at v5.11-rc7 it appears I can arbitrarily access the
>> io-wq thread in proc by traversing "/proc/thread-self/".
>
> Yes that looks like a bug, it needs similar treatment to /proc/self:
Agreed.


> diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
> index a553273fbd41..e8ca19371a36 100644
> --- a/fs/proc/thread_self.c
> +++ b/fs/proc/thread_self.c
> @@ -17,6 +17,13 @@ static const char *proc_thread_self_get_link(struct dentry *dentry,
>  	pid_t pid = task_pid_nr_ns(current, ns);
>  	char *name;
>  
> +	/*
> +	 * Not currently supported. Once we can inherit all of struct pid,
> +	 * we can allow this.
> +	 */
> +	if (current->flags & PF_KTHREAD)
> +		return ERR_PTR(-EOPNOTSUPP);

I suspect simply testing for PF_IO_WORKER is a better test.  As it is
the delegation to the io_worker not the fact that it is a kernel thread
that causes a problem.

I have a memory of that point being made when the smack_privileged test
and the tomoyo_kernel_service test and how to fix them was being
discussed.

>  	if (!pid)
>  		return ERR_PTR(-ENOENT);
>  	name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
>
> as was done in this commit:
>
> commit 8d4c3e76e3be11a64df95ddee52e99092d42fc19
> Author: Jens Axboe <axboe@kernel.dk>
> Date:   Fri Nov 13 16:47:52 2020 -0700
>
>     proc: don't allow async path resolution of /proc/self components

I suspect that would be better as PF_IO_WORKER as well.

>> Similarly it looks like opening of "/dev/tty" fails to
>> return the tty of the caller but instead fails because
>> io-wq threads don't have a tty.
>> 
>> 
>> There are a non-trivial number of places where current is used in the
>> kernel both in path traversal and in opening files, and I may be blind
>> but I have not see any work to find all of those places and make certain
>> they are safe when called from io_uring.  That worries me as that using
>> a kernel thread instead of a user thread could potential lead to
>> privilege escalation.
>
> I've got a patch queued up for 5.12 that clears ->fs and ->files for the
> thread if not explicitly inherited, and I'm working on similarly
> proactively catching these cases that could potentially be
> problematic.

Any pointers or is this all in a private tree for now?

It is difficult to follow because many of the fixes have not CC'd the
reporters or even the maintainers of the subsystems who have been
affected by this kind of issue.

Eric


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK (Insufficiently faking current?)
  2021-02-14 20:30     ` Linus Torvalds
  2021-02-14 21:24       ` Al Viro
@ 2021-02-15 18:07       ` Eric W. Biederman
  2021-02-15 18:24         ` Jens Axboe
  1 sibling, 1 reply; 46+ messages in thread
From: Eric W. Biederman @ 2021-02-15 18:07 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jens Axboe, linux-fsdevel, Al Viro

Linus Torvalds <torvalds@linux-foundation.org> writes:

> On Sun, Feb 14, 2021 at 8:38 AM Jens Axboe <axboe@kernel.dk> wrote:
>>
>> > Similarly it looks like opening of "/dev/tty" fails to
>> > return the tty of the caller but instead fails because
>> > io-wq threads don't have a tty.
>>
>> I've got a patch queued up for 5.12 that clears ->fs and ->files for the
>> thread if not explicitly inherited, and I'm working on similarly
>> proactively catching these cases that could potentially be problematic.
>
> Well, the /dev/tty case still needs fixing somehow.
>
> Opening /dev/tty actually depends on current->signal, and if it is
> NULL it will fall back on the first VT console instead (I think).
>
> I wonder if it should do the same thing /proc/self does..

Would there be any downside of making the io-wq kernel threads be per
process instead of per user?

I can see a lower probability of a thread already existing.  Are there
other downsides I am missing?

The upside would be that all of the issues of have we copied enough
should go away, as the io-wq thread would then behave like another user
space thread.  To handle posix setresuid() and friends it looks like
current_cred would need to be copied but I can't think of anything else.

Right I am with Al and I don't have any idea how many special cases we
need to play whack-a-mole with.

Eric

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK (Insufficiently faking current?)
  2021-02-15 18:07       ` Eric W. Biederman
@ 2021-02-15 18:24         ` Jens Axboe
  2021-02-15 21:09           ` Jens Axboe
  0 siblings, 1 reply; 46+ messages in thread
From: Jens Axboe @ 2021-02-15 18:24 UTC (permalink / raw)
  To: Eric W. Biederman, Linus Torvalds; +Cc: linux-fsdevel, Al Viro

On 2/15/21 11:07 AM, Eric W. Biederman wrote:
> Linus Torvalds <torvalds@linux-foundation.org> writes:
> 
>> On Sun, Feb 14, 2021 at 8:38 AM Jens Axboe <axboe@kernel.dk> wrote:
>>>
>>>> Similarly it looks like opening of "/dev/tty" fails to
>>>> return the tty of the caller but instead fails because
>>>> io-wq threads don't have a tty.
>>>
>>> I've got a patch queued up for 5.12 that clears ->fs and ->files for the
>>> thread if not explicitly inherited, and I'm working on similarly
>>> proactively catching these cases that could potentially be problematic.
>>
>> Well, the /dev/tty case still needs fixing somehow.
>>
>> Opening /dev/tty actually depends on current->signal, and if it is
>> NULL it will fall back on the first VT console instead (I think).
>>
>> I wonder if it should do the same thing /proc/self does..
> 
> Would there be any downside of making the io-wq kernel threads be per
> process instead of per user?
> 
> I can see a lower probability of a thread already existing.  Are there
> other downsides I am missing?
> 
> The upside would be that all of the issues of have we copied enough
> should go away, as the io-wq thread would then behave like another user
> space thread.  To handle posix setresuid() and friends it looks like
> current_cred would need to be copied but I can't think of anything else.

I really like that idea. Do we currently have a way of creating a thread
internally, akin to what would happen if the same task did pthread_create?
That'd ensure that we have everything we need, without actively needing to
map the request types, or find future issues of "we also need this bit".
It'd work fine for the 'need new worker' case too, if one goes to sleep.
We'd just 'fork' off that child.

Would require some restructuring of io-wq, but at the end of it, it'd
be a simpler solution.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK (Insufficiently faking current?)
  2021-02-15 18:24         ` Jens Axboe
@ 2021-02-15 21:09           ` Jens Axboe
  2021-02-15 22:41             ` Eric W. Biederman
  0 siblings, 1 reply; 46+ messages in thread
From: Jens Axboe @ 2021-02-15 21:09 UTC (permalink / raw)
  To: Eric W. Biederman, Linus Torvalds; +Cc: linux-fsdevel, Al Viro

On 2/15/21 11:24 AM, Jens Axboe wrote:
> On 2/15/21 11:07 AM, Eric W. Biederman wrote:
>> Linus Torvalds <torvalds@linux-foundation.org> writes:
>>
>>> On Sun, Feb 14, 2021 at 8:38 AM Jens Axboe <axboe@kernel.dk> wrote:
>>>>
>>>>> Similarly it looks like opening of "/dev/tty" fails to
>>>>> return the tty of the caller but instead fails because
>>>>> io-wq threads don't have a tty.
>>>>
>>>> I've got a patch queued up for 5.12 that clears ->fs and ->files for the
>>>> thread if not explicitly inherited, and I'm working on similarly
>>>> proactively catching these cases that could potentially be problematic.
>>>
>>> Well, the /dev/tty case still needs fixing somehow.
>>>
>>> Opening /dev/tty actually depends on current->signal, and if it is
>>> NULL it will fall back on the first VT console instead (I think).
>>>
>>> I wonder if it should do the same thing /proc/self does..
>>
>> Would there be any downside of making the io-wq kernel threads be per
>> process instead of per user?
>>
>> I can see a lower probability of a thread already existing.  Are there
>> other downsides I am missing?
>>
>> The upside would be that all of the issues of have we copied enough
>> should go away, as the io-wq thread would then behave like another user
>> space thread.  To handle posix setresuid() and friends it looks like
>> current_cred would need to be copied but I can't think of anything else.
> 
> I really like that idea. Do we currently have a way of creating a thread
> internally, akin to what would happen if the same task did pthread_create?
> That'd ensure that we have everything we need, without actively needing to
> map the request types, or find future issues of "we also need this bit".
> It'd work fine for the 'need new worker' case too, if one goes to sleep.
> We'd just 'fork' off that child.
> 
> Would require some restructuring of io-wq, but at the end of it, it'd
> be a simpler solution.

I was intrigued enough that I tried to wire this up. If we can pull this
off, then it would take a great weight off my shoulders as there would
be no more worries on identity.

Here's a branch that's got a set of patches that actually work, though
it's a bit of a hack in spots. Notes:

- Forked worker initially crashed, since it's an actual user thread and
  bombed on deref of kernel structures. Expectedly. That's what the
  horrible kernel_clone_args->io_wq hack is working around for now.
  Obviously not the final solution, but helped move things along so
  I could actually test this.

- Shared io-wq helpers need indexing for task, right now this isn't
  done. But that's not hard to do.

- Idle thread reaping isn't done yet, so they persist until the
  context goes away.

- task_work fallback needs a bit of love. Currently we fallback to
  the io-wq manager thread for handling that, but a) manager is gone,
  and b) the new workers are now threads and go away as well when
  the original task goes away. None of the three fallback sites need
  task context, so likely solution here is just punt it to system_wq.
  Not the hot path, obviously, we're exiting.

- Personality registration is broken, it's just Good Enough to compile.

Probably a few more items that escape me right now. As long as you
don't hit the fallback cases, it appears to work fine for me. And
the diffstat is pretty good to:

 fs/io-wq.c                 | 418 +++++++++++--------------------------
 fs/io-wq.h                 |  10 +-
 fs/io_uring.c              | 314 +++-------------------------
 fs/proc/self.c             |   7 -
 fs/proc/thread_self.c      |   7 -
 include/linux/io_uring.h   |  19 --
 include/linux/sched.h      |   3 +
 include/linux/sched/task.h |   1 +
 kernel/fork.c              |   2 +
 9 files changed, 161 insertions(+), 620 deletions(-)

as it gets rid of _all_ the 'grab this or that piece' that we're
tracking.

WIP series here:

https://git.kernel.dk/cgit/linux-block/log/?h=io_uring-worker

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK (Insufficiently faking current?)
  2021-02-15 21:09           ` Jens Axboe
@ 2021-02-15 22:41             ` Eric W. Biederman
  2021-02-16  2:41               ` Jens Axboe
  0 siblings, 1 reply; 46+ messages in thread
From: Eric W. Biederman @ 2021-02-15 22:41 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Linus Torvalds, linux-fsdevel, Al Viro

Jens Axboe <axboe@kernel.dk> writes:

> On 2/15/21 11:24 AM, Jens Axboe wrote:
>> On 2/15/21 11:07 AM, Eric W. Biederman wrote:
>>> Linus Torvalds <torvalds@linux-foundation.org> writes:
>>>
>>>> On Sun, Feb 14, 2021 at 8:38 AM Jens Axboe <axboe@kernel.dk> wrote:
>>>>>
>>>>>> Similarly it looks like opening of "/dev/tty" fails to
>>>>>> return the tty of the caller but instead fails because
>>>>>> io-wq threads don't have a tty.
>>>>>
>>>>> I've got a patch queued up for 5.12 that clears ->fs and ->files for the
>>>>> thread if not explicitly inherited, and I'm working on similarly
>>>>> proactively catching these cases that could potentially be problematic.
>>>>
>>>> Well, the /dev/tty case still needs fixing somehow.
>>>>
>>>> Opening /dev/tty actually depends on current->signal, and if it is
>>>> NULL it will fall back on the first VT console instead (I think).
>>>>
>>>> I wonder if it should do the same thing /proc/self does..
>>>
>>> Would there be any downside of making the io-wq kernel threads be per
>>> process instead of per user?
>>>
>>> I can see a lower probability of a thread already existing.  Are there
>>> other downsides I am missing?
>>>
>>> The upside would be that all of the issues of have we copied enough
>>> should go away, as the io-wq thread would then behave like another user
>>> space thread.  To handle posix setresuid() and friends it looks like
>>> current_cred would need to be copied but I can't think of anything else.
>> 
>> I really like that idea. Do we currently have a way of creating a thread
>> internally, akin to what would happen if the same task did pthread_create?
>> That'd ensure that we have everything we need, without actively needing to
>> map the request types, or find future issues of "we also need this bit".
>> It'd work fine for the 'need new worker' case too, if one goes to sleep.
>> We'd just 'fork' off that child.
>> 
>> Would require some restructuring of io-wq, but at the end of it, it'd
>> be a simpler solution.
>
> I was intrigued enough that I tried to wire this up. If we can pull this
> off, then it would take a great weight off my shoulders as there would
> be no more worries on identity.
>
> Here's a branch that's got a set of patches that actually work, though
> it's a bit of a hack in spots. Notes:
>
> - Forked worker initially crashed, since it's an actual user thread and
>   bombed on deref of kernel structures. Expectedly. That's what the
>   horrible kernel_clone_args->io_wq hack is working around for now.
>   Obviously not the final solution, but helped move things along so
>   I could actually test this.
>
> - Shared io-wq helpers need indexing for task, right now this isn't
>   done. But that's not hard to do.
>
> - Idle thread reaping isn't done yet, so they persist until the
>   context goes away.
>
> - task_work fallback needs a bit of love. Currently we fallback to
>   the io-wq manager thread for handling that, but a) manager is gone,
>   and b) the new workers are now threads and go away as well when
>   the original task goes away. None of the three fallback sites need
>   task context, so likely solution here is just punt it to system_wq.
>   Not the hot path, obviously, we're exiting.
>
> - Personality registration is broken, it's just Good Enough to compile.
>
> Probably a few more items that escape me right now. As long as you
> don't hit the fallback cases, it appears to work fine for me. And
> the diffstat is pretty good to:
>
>  fs/io-wq.c                 | 418 +++++++++++--------------------------
>  fs/io-wq.h                 |  10 +-
>  fs/io_uring.c              | 314 +++-------------------------
>  fs/proc/self.c             |   7 -
>  fs/proc/thread_self.c      |   7 -
>  include/linux/io_uring.h   |  19 --
>  include/linux/sched.h      |   3 +
>  include/linux/sched/task.h |   1 +
>  kernel/fork.c              |   2 +
>  9 files changed, 161 insertions(+), 620 deletions(-)
>
> as it gets rid of _all_ the 'grab this or that piece' that we're
> tracking.
>
> WIP series here:
>
> https://git.kernel.dk/cgit/linux-block/log/?h=io_uring-worker

I took a quick look through the code and in general it seems reasonable.

Can the io_uring_task_cancel in begin_new_exec go away as well?
Today it happens after de_thread and so presumably all of the io_uring
threads are already gone.

Eric

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK (Insufficiently faking current?)
  2021-02-15 22:41             ` Eric W. Biederman
@ 2021-02-16  2:41               ` Jens Axboe
  2021-02-17  1:18                 ` Jens Axboe
  0 siblings, 1 reply; 46+ messages in thread
From: Jens Axboe @ 2021-02-16  2:41 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Linus Torvalds, linux-fsdevel, Al Viro

On 2/15/21 3:41 PM, Eric W. Biederman wrote:
> Jens Axboe <axboe@kernel.dk> writes:
> 
>> On 2/15/21 11:24 AM, Jens Axboe wrote:
>>> On 2/15/21 11:07 AM, Eric W. Biederman wrote:
>>>> Linus Torvalds <torvalds@linux-foundation.org> writes:
>>>>
>>>>> On Sun, Feb 14, 2021 at 8:38 AM Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>
>>>>>>> Similarly it looks like opening of "/dev/tty" fails to
>>>>>>> return the tty of the caller but instead fails because
>>>>>>> io-wq threads don't have a tty.
>>>>>>
>>>>>> I've got a patch queued up for 5.12 that clears ->fs and ->files for the
>>>>>> thread if not explicitly inherited, and I'm working on similarly
>>>>>> proactively catching these cases that could potentially be problematic.
>>>>>
>>>>> Well, the /dev/tty case still needs fixing somehow.
>>>>>
>>>>> Opening /dev/tty actually depends on current->signal, and if it is
>>>>> NULL it will fall back on the first VT console instead (I think).
>>>>>
>>>>> I wonder if it should do the same thing /proc/self does..
>>>>
>>>> Would there be any downside of making the io-wq kernel threads be per
>>>> process instead of per user?
>>>>
>>>> I can see a lower probability of a thread already existing.  Are there
>>>> other downsides I am missing?
>>>>
>>>> The upside would be that all of the issues of have we copied enough
>>>> should go away, as the io-wq thread would then behave like another user
>>>> space thread.  To handle posix setresuid() and friends it looks like
>>>> current_cred would need to be copied but I can't think of anything else.
>>>
>>> I really like that idea. Do we currently have a way of creating a thread
>>> internally, akin to what would happen if the same task did pthread_create?
>>> That'd ensure that we have everything we need, without actively needing to
>>> map the request types, or find future issues of "we also need this bit".
>>> It'd work fine for the 'need new worker' case too, if one goes to sleep.
>>> We'd just 'fork' off that child.
>>>
>>> Would require some restructuring of io-wq, but at the end of it, it'd
>>> be a simpler solution.
>>
>> I was intrigued enough that I tried to wire this up. If we can pull this
>> off, then it would take a great weight off my shoulders as there would
>> be no more worries on identity.
>>
>> Here's a branch that's got a set of patches that actually work, though
>> it's a bit of a hack in spots. Notes:
>>
>> - Forked worker initially crashed, since it's an actual user thread and
>>   bombed on deref of kernel structures. Expectedly. That's what the
>>   horrible kernel_clone_args->io_wq hack is working around for now.
>>   Obviously not the final solution, but helped move things along so
>>   I could actually test this.
>>
>> - Shared io-wq helpers need indexing for task, right now this isn't
>>   done. But that's not hard to do.
>>
>> - Idle thread reaping isn't done yet, so they persist until the
>>   context goes away.
>>
>> - task_work fallback needs a bit of love. Currently we fallback to
>>   the io-wq manager thread for handling that, but a) manager is gone,
>>   and b) the new workers are now threads and go away as well when
>>   the original task goes away. None of the three fallback sites need
>>   task context, so likely solution here is just punt it to system_wq.
>>   Not the hot path, obviously, we're exiting.
>>
>> - Personality registration is broken, it's just Good Enough to compile.
>>
>> Probably a few more items that escape me right now. As long as you
>> don't hit the fallback cases, it appears to work fine for me. And
>> the diffstat is pretty good to:
>>
>>  fs/io-wq.c                 | 418 +++++++++++--------------------------
>>  fs/io-wq.h                 |  10 +-
>>  fs/io_uring.c              | 314 +++-------------------------
>>  fs/proc/self.c             |   7 -
>>  fs/proc/thread_self.c      |   7 -
>>  include/linux/io_uring.h   |  19 --
>>  include/linux/sched.h      |   3 +
>>  include/linux/sched/task.h |   1 +
>>  kernel/fork.c              |   2 +
>>  9 files changed, 161 insertions(+), 620 deletions(-)
>>
>> as it gets rid of _all_ the 'grab this or that piece' that we're
>> tracking.
>>
>> WIP series here:
>>
>> https://git.kernel.dk/cgit/linux-block/log/?h=io_uring-worker
> 
> I took a quick look through the code and in general it seems reasonable.

Great, thanks for checking.

> Can the io_uring_task_cancel in begin_new_exec go away as well?
> Today it happens after de_thread and so presumably all of the io_uring
> threads are already gone.

I don't think so, async offload is only the slower path of async. We
can have poll based waiting, and various others. If we want to guarantee
that no requests escape across exec, then we'll need to retain that.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK (Insufficiently faking current?)
  2021-02-16  2:41               ` Jens Axboe
@ 2021-02-17  1:18                 ` Jens Axboe
  2021-02-17  1:26                   ` Jens Axboe
  0 siblings, 1 reply; 46+ messages in thread
From: Jens Axboe @ 2021-02-17  1:18 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Linus Torvalds, linux-fsdevel, Al Viro

On 2/15/21 7:41 PM, Jens Axboe wrote:
> On 2/15/21 3:41 PM, Eric W. Biederman wrote:
>> Jens Axboe <axboe@kernel.dk> writes:
>>
>>> On 2/15/21 11:24 AM, Jens Axboe wrote:
>>>> On 2/15/21 11:07 AM, Eric W. Biederman wrote:
>>>>> Linus Torvalds <torvalds@linux-foundation.org> writes:
>>>>>
>>>>>> On Sun, Feb 14, 2021 at 8:38 AM Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>>
>>>>>>>> Similarly it looks like opening of "/dev/tty" fails to
>>>>>>>> return the tty of the caller but instead fails because
>>>>>>>> io-wq threads don't have a tty.
>>>>>>>
>>>>>>> I've got a patch queued up for 5.12 that clears ->fs and ->files for the
>>>>>>> thread if not explicitly inherited, and I'm working on similarly
>>>>>>> proactively catching these cases that could potentially be problematic.
>>>>>>
>>>>>> Well, the /dev/tty case still needs fixing somehow.
>>>>>>
>>>>>> Opening /dev/tty actually depends on current->signal, and if it is
>>>>>> NULL it will fall back on the first VT console instead (I think).
>>>>>>
>>>>>> I wonder if it should do the same thing /proc/self does..
>>>>>
>>>>> Would there be any downside of making the io-wq kernel threads be per
>>>>> process instead of per user?
>>>>>
>>>>> I can see a lower probability of a thread already existing.  Are there
>>>>> other downsides I am missing?
>>>>>
>>>>> The upside would be that all of the issues of have we copied enough
>>>>> should go away, as the io-wq thread would then behave like another user
>>>>> space thread.  To handle posix setresuid() and friends it looks like
>>>>> current_cred would need to be copied but I can't think of anything else.
>>>>
>>>> I really like that idea. Do we currently have a way of creating a thread
>>>> internally, akin to what would happen if the same task did pthread_create?
>>>> That'd ensure that we have everything we need, without actively needing to
>>>> map the request types, or find future issues of "we also need this bit".
>>>> It'd work fine for the 'need new worker' case too, if one goes to sleep.
>>>> We'd just 'fork' off that child.
>>>>
>>>> Would require some restructuring of io-wq, but at the end of it, it'd
>>>> be a simpler solution.
>>>
>>> I was intrigued enough that I tried to wire this up. If we can pull this
>>> off, then it would take a great weight off my shoulders as there would
>>> be no more worries on identity.
>>>
>>> Here's a branch that's got a set of patches that actually work, though
>>> it's a bit of a hack in spots. Notes:
>>>
>>> - Forked worker initially crashed, since it's an actual user thread and
>>>   bombed on deref of kernel structures. Expectedly. That's what the
>>>   horrible kernel_clone_args->io_wq hack is working around for now.
>>>   Obviously not the final solution, but helped move things along so
>>>   I could actually test this.
>>>
>>> - Shared io-wq helpers need indexing for task, right now this isn't
>>>   done. But that's not hard to do.
>>>
>>> - Idle thread reaping isn't done yet, so they persist until the
>>>   context goes away.
>>>
>>> - task_work fallback needs a bit of love. Currently we fallback to
>>>   the io-wq manager thread for handling that, but a) manager is gone,
>>>   and b) the new workers are now threads and go away as well when
>>>   the original task goes away. None of the three fallback sites need
>>>   task context, so likely solution here is just punt it to system_wq.
>>>   Not the hot path, obviously, we're exiting.
>>>
>>> - Personality registration is broken, it's just Good Enough to compile.
>>>
>>> Probably a few more items that escape me right now. As long as you
>>> don't hit the fallback cases, it appears to work fine for me. And
>>> the diffstat is pretty good to:
>>>
>>>  fs/io-wq.c                 | 418 +++++++++++--------------------------
>>>  fs/io-wq.h                 |  10 +-
>>>  fs/io_uring.c              | 314 +++-------------------------
>>>  fs/proc/self.c             |   7 -
>>>  fs/proc/thread_self.c      |   7 -
>>>  include/linux/io_uring.h   |  19 --
>>>  include/linux/sched.h      |   3 +
>>>  include/linux/sched/task.h |   1 +
>>>  kernel/fork.c              |   2 +
>>>  9 files changed, 161 insertions(+), 620 deletions(-)
>>>
>>> as it gets rid of _all_ the 'grab this or that piece' that we're
>>> tracking.
>>>
>>> WIP series here:
>>>
>>> https://git.kernel.dk/cgit/linux-block/log/?h=io_uring-worker
>>
>> I took a quick look through the code and in general it seems reasonable.
> 
> Great, thanks for checking.

Cleaner series here:

https://git.kernel.dk/cgit/linux-block/log/?h=io_uring-worker.v2

One question, since I'm a bit stumped. The very top most debug patch:

https://git.kernel.dk/cgit/linux-block/commit/?h=io_uring-worker.v2&id=8a422f030b9630d16d5ec1ff97842a265f88485e

any idea what is going on here? For some reason, it only happens for
the 'manager' thread. That one doesn't do any work by itself, it's just
tasked with forking a new worker, if we need one.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK (Insufficiently faking current?)
  2021-02-17  1:18                 ` Jens Axboe
@ 2021-02-17  1:26                   ` Jens Axboe
  2021-02-17  3:11                     ` Jens Axboe
  0 siblings, 1 reply; 46+ messages in thread
From: Jens Axboe @ 2021-02-17  1:26 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Linus Torvalds, linux-fsdevel, Al Viro

On 2/16/21 6:18 PM, Jens Axboe wrote:
> On 2/15/21 7:41 PM, Jens Axboe wrote:
>> On 2/15/21 3:41 PM, Eric W. Biederman wrote:
>>> Jens Axboe <axboe@kernel.dk> writes:
>>>
>>>> On 2/15/21 11:24 AM, Jens Axboe wrote:
>>>>> On 2/15/21 11:07 AM, Eric W. Biederman wrote:
>>>>>> Linus Torvalds <torvalds@linux-foundation.org> writes:
>>>>>>
>>>>>>> On Sun, Feb 14, 2021 at 8:38 AM Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>>>
>>>>>>>>> Similarly it looks like opening of "/dev/tty" fails to
>>>>>>>>> return the tty of the caller but instead fails because
>>>>>>>>> io-wq threads don't have a tty.
>>>>>>>>
>>>>>>>> I've got a patch queued up for 5.12 that clears ->fs and ->files for the
>>>>>>>> thread if not explicitly inherited, and I'm working on similarly
>>>>>>>> proactively catching these cases that could potentially be problematic.
>>>>>>>
>>>>>>> Well, the /dev/tty case still needs fixing somehow.
>>>>>>>
>>>>>>> Opening /dev/tty actually depends on current->signal, and if it is
>>>>>>> NULL it will fall back on the first VT console instead (I think).
>>>>>>>
>>>>>>> I wonder if it should do the same thing /proc/self does..
>>>>>>
>>>>>> Would there be any downside of making the io-wq kernel threads be per
>>>>>> process instead of per user?
>>>>>>
>>>>>> I can see a lower probability of a thread already existing.  Are there
>>>>>> other downsides I am missing?
>>>>>>
>>>>>> The upside would be that all of the issues of have we copied enough
>>>>>> should go away, as the io-wq thread would then behave like another user
>>>>>> space thread.  To handle posix setresuid() and friends it looks like
>>>>>> current_cred would need to be copied but I can't think of anything else.
>>>>>
>>>>> I really like that idea. Do we currently have a way of creating a thread
>>>>> internally, akin to what would happen if the same task did pthread_create?
>>>>> That'd ensure that we have everything we need, without actively needing to
>>>>> map the request types, or find future issues of "we also need this bit".
>>>>> It'd work fine for the 'need new worker' case too, if one goes to sleep.
>>>>> We'd just 'fork' off that child.
>>>>>
>>>>> Would require some restructuring of io-wq, but at the end of it, it'd
>>>>> be a simpler solution.
>>>>
>>>> I was intrigued enough that I tried to wire this up. If we can pull this
>>>> off, then it would take a great weight off my shoulders as there would
>>>> be no more worries on identity.
>>>>
>>>> Here's a branch that's got a set of patches that actually work, though
>>>> it's a bit of a hack in spots. Notes:
>>>>
>>>> - Forked worker initially crashed, since it's an actual user thread and
>>>>   bombed on deref of kernel structures. Expectedly. That's what the
>>>>   horrible kernel_clone_args->io_wq hack is working around for now.
>>>>   Obviously not the final solution, but helped move things along so
>>>>   I could actually test this.
>>>>
>>>> - Shared io-wq helpers need indexing for task, right now this isn't
>>>>   done. But that's not hard to do.
>>>>
>>>> - Idle thread reaping isn't done yet, so they persist until the
>>>>   context goes away.
>>>>
>>>> - task_work fallback needs a bit of love. Currently we fallback to
>>>>   the io-wq manager thread for handling that, but a) manager is gone,
>>>>   and b) the new workers are now threads and go away as well when
>>>>   the original task goes away. None of the three fallback sites need
>>>>   task context, so likely solution here is just punt it to system_wq.
>>>>   Not the hot path, obviously, we're exiting.
>>>>
>>>> - Personality registration is broken, it's just Good Enough to compile.
>>>>
>>>> Probably a few more items that escape me right now. As long as you
>>>> don't hit the fallback cases, it appears to work fine for me. And
>>>> the diffstat is pretty good to:
>>>>
>>>>  fs/io-wq.c                 | 418 +++++++++++--------------------------
>>>>  fs/io-wq.h                 |  10 +-
>>>>  fs/io_uring.c              | 314 +++-------------------------
>>>>  fs/proc/self.c             |   7 -
>>>>  fs/proc/thread_self.c      |   7 -
>>>>  include/linux/io_uring.h   |  19 --
>>>>  include/linux/sched.h      |   3 +
>>>>  include/linux/sched/task.h |   1 +
>>>>  kernel/fork.c              |   2 +
>>>>  9 files changed, 161 insertions(+), 620 deletions(-)
>>>>
>>>> as it gets rid of _all_ the 'grab this or that piece' that we're
>>>> tracking.
>>>>
>>>> WIP series here:
>>>>
>>>> https://git.kernel.dk/cgit/linux-block/log/?h=io_uring-worker
>>>
>>> I took a quick look through the code and in general it seems reasonable.
>>
>> Great, thanks for checking.
> 
> Cleaner series here:
> 
> https://git.kernel.dk/cgit/linux-block/log/?h=io_uring-worker.v2
> 
> One question, since I'm a bit stumped. The very top most debug patch:
> 
> https://git.kernel.dk/cgit/linux-block/commit/?h=io_uring-worker.v2&id=8a422f030b9630d16d5ec1ff97842a265f88485e
> 
> any idea what is going on here? For some reason, it only happens for
> the 'manager' thread. That one doesn't do any work by itself, it's just
> tasked with forking a new worker, if we need one.

Seems to trigger for all cases with a pthread in the app. This reproduces
it:


#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
#include <liburing.h>

static void *fn(void *data)
{
	struct io_uring ring;

	io_uring_queue_init(1, &ring, 0);
	sleep(1);
	return NULL;
}

int main(int argc, char *argv[])
{
	pthread_t t;
	void *ret;

	pthread_create(&t, NULL, fn, NULL);
	pthread_join(t, &ret);

	return 0;
}

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK (Insufficiently faking current?)
  2021-02-17  1:26                   ` Jens Axboe
@ 2021-02-17  3:11                     ` Jens Axboe
  0 siblings, 0 replies; 46+ messages in thread
From: Jens Axboe @ 2021-02-17  3:11 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Linus Torvalds, linux-fsdevel, Al Viro

On 2/16/21 6:26 PM, Jens Axboe wrote:
> On 2/16/21 6:18 PM, Jens Axboe wrote:
>> On 2/15/21 7:41 PM, Jens Axboe wrote:
>>> On 2/15/21 3:41 PM, Eric W. Biederman wrote:
>>>> Jens Axboe <axboe@kernel.dk> writes:
>>>>
>>>>> On 2/15/21 11:24 AM, Jens Axboe wrote:
>>>>>> On 2/15/21 11:07 AM, Eric W. Biederman wrote:
>>>>>>> Linus Torvalds <torvalds@linux-foundation.org> writes:
>>>>>>>
>>>>>>>> On Sun, Feb 14, 2021 at 8:38 AM Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>>>>
>>>>>>>>>> Similarly it looks like opening of "/dev/tty" fails to
>>>>>>>>>> return the tty of the caller but instead fails because
>>>>>>>>>> io-wq threads don't have a tty.
>>>>>>>>>
>>>>>>>>> I've got a patch queued up for 5.12 that clears ->fs and ->files for the
>>>>>>>>> thread if not explicitly inherited, and I'm working on similarly
>>>>>>>>> proactively catching these cases that could potentially be problematic.
>>>>>>>>
>>>>>>>> Well, the /dev/tty case still needs fixing somehow.
>>>>>>>>
>>>>>>>> Opening /dev/tty actually depends on current->signal, and if it is
>>>>>>>> NULL it will fall back on the first VT console instead (I think).
>>>>>>>>
>>>>>>>> I wonder if it should do the same thing /proc/self does..
>>>>>>>
>>>>>>> Would there be any downside of making the io-wq kernel threads be per
>>>>>>> process instead of per user?
>>>>>>>
>>>>>>> I can see a lower probability of a thread already existing.  Are there
>>>>>>> other downsides I am missing?
>>>>>>>
>>>>>>> The upside would be that all of the issues of have we copied enough
>>>>>>> should go away, as the io-wq thread would then behave like another user
>>>>>>> space thread.  To handle posix setresuid() and friends it looks like
>>>>>>> current_cred would need to be copied but I can't think of anything else.
>>>>>>
>>>>>> I really like that idea. Do we currently have a way of creating a thread
>>>>>> internally, akin to what would happen if the same task did pthread_create?
>>>>>> That'd ensure that we have everything we need, without actively needing to
>>>>>> map the request types, or find future issues of "we also need this bit".
>>>>>> It'd work fine for the 'need new worker' case too, if one goes to sleep.
>>>>>> We'd just 'fork' off that child.
>>>>>>
>>>>>> Would require some restructuring of io-wq, but at the end of it, it'd
>>>>>> be a simpler solution.
>>>>>
>>>>> I was intrigued enough that I tried to wire this up. If we can pull this
>>>>> off, then it would take a great weight off my shoulders as there would
>>>>> be no more worries on identity.
>>>>>
>>>>> Here's a branch that's got a set of patches that actually work, though
>>>>> it's a bit of a hack in spots. Notes:
>>>>>
>>>>> - Forked worker initially crashed, since it's an actual user thread and
>>>>>   bombed on deref of kernel structures. Expectedly. That's what the
>>>>>   horrible kernel_clone_args->io_wq hack is working around for now.
>>>>>   Obviously not the final solution, but helped move things along so
>>>>>   I could actually test this.
>>>>>
>>>>> - Shared io-wq helpers need indexing for task, right now this isn't
>>>>>   done. But that's not hard to do.
>>>>>
>>>>> - Idle thread reaping isn't done yet, so they persist until the
>>>>>   context goes away.
>>>>>
>>>>> - task_work fallback needs a bit of love. Currently we fallback to
>>>>>   the io-wq manager thread for handling that, but a) manager is gone,
>>>>>   and b) the new workers are now threads and go away as well when
>>>>>   the original task goes away. None of the three fallback sites need
>>>>>   task context, so likely solution here is just punt it to system_wq.
>>>>>   Not the hot path, obviously, we're exiting.
>>>>>
>>>>> - Personality registration is broken, it's just Good Enough to compile.
>>>>>
>>>>> Probably a few more items that escape me right now. As long as you
>>>>> don't hit the fallback cases, it appears to work fine for me. And
>>>>> the diffstat is pretty good to:
>>>>>
>>>>>  fs/io-wq.c                 | 418 +++++++++++--------------------------
>>>>>  fs/io-wq.h                 |  10 +-
>>>>>  fs/io_uring.c              | 314 +++-------------------------
>>>>>  fs/proc/self.c             |   7 -
>>>>>  fs/proc/thread_self.c      |   7 -
>>>>>  include/linux/io_uring.h   |  19 --
>>>>>  include/linux/sched.h      |   3 +
>>>>>  include/linux/sched/task.h |   1 +
>>>>>  kernel/fork.c              |   2 +
>>>>>  9 files changed, 161 insertions(+), 620 deletions(-)
>>>>>
>>>>> as it gets rid of _all_ the 'grab this or that piece' that we're
>>>>> tracking.
>>>>>
>>>>> WIP series here:
>>>>>
>>>>> https://git.kernel.dk/cgit/linux-block/log/?h=io_uring-worker
>>>>
>>>> I took a quick look through the code and in general it seems reasonable.
>>>
>>> Great, thanks for checking.
>>
>> Cleaner series here:
>>
>> https://git.kernel.dk/cgit/linux-block/log/?h=io_uring-worker.v2
>>
>> One question, since I'm a bit stumped. The very top most debug patch:
>>
>> https://git.kernel.dk/cgit/linux-block/commit/?h=io_uring-worker.v2&id=8a422f030b9630d16d5ec1ff97842a265f88485e
>>
>> any idea what is going on here? For some reason, it only happens for
>> the 'manager' thread. That one doesn't do any work by itself, it's just
>> tasked with forking a new worker, if we need one.
> 
> Seems to trigger for all cases with a pthread in the app. This reproduces
> it:

Nevermind, it was me being an idiot. I had a case in the manager thread
that did return 0 instead of do_exit()...

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 46+ messages in thread

end of thread, other threads:[~2021-02-17  3:12 UTC | newest]

Thread overview: 46+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-14 19:13 [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK Jens Axboe
2020-12-14 19:13 ` [PATCH 1/4] fs: make unlazy_walk() error handling consistent Jens Axboe
2020-12-14 19:13 ` [PATCH 2/4] fs: add support for LOOKUP_NONBLOCK Jens Axboe
2020-12-15 12:24   ` Matthew Wilcox
2020-12-15 15:29     ` Jens Axboe
2020-12-15 15:33       ` Matthew Wilcox
2020-12-15 15:37         ` Jens Axboe
2020-12-15 16:08           ` Jens Axboe
2020-12-15 16:14             ` Jens Axboe
2020-12-15 18:29             ` Linus Torvalds
2020-12-15 18:44               ` Jens Axboe
2020-12-15 18:47                 ` Linus Torvalds
2020-12-15 19:03                   ` Jens Axboe
2020-12-15 19:32                     ` Linus Torvalds
2020-12-15 19:38                       ` Jens Axboe
2020-12-16  2:36   ` Al Viro
2020-12-16  3:30     ` Jens Axboe
2020-12-16  2:43   ` Al Viro
2020-12-16  3:32     ` Jens Axboe
2020-12-14 19:13 ` [PATCH 3/4] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK Jens Axboe
2020-12-15 22:25   ` Dave Chinner
2020-12-15 22:31     ` Linus Torvalds
2020-12-15 23:25       ` Jens Axboe
2020-12-16  2:37   ` Al Viro
2020-12-16  3:39     ` Linus Torvalds
2020-12-14 19:13 ` [PATCH 4/4] io_uring: enable LOOKUP_NONBLOCK path resolution for filename lookups Jens Axboe
2020-12-15  3:06 ` [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK Linus Torvalds
2020-12-15  3:18   ` Jens Axboe
2020-12-15  6:11 ` Al Viro
2020-12-15 15:29   ` Jens Axboe
2021-01-04  5:31 ` Al Viro
2021-01-04 14:43   ` Jens Axboe
2021-01-04 16:54     ` Al Viro
2021-01-04 17:03       ` Jens Axboe
     [not found] ` <m1lfbrwrgq.fsf@fess.ebiederm.org>
2021-02-14 16:38   ` [PATCHSET v3 0/4] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK (Insufficiently faking current?) Jens Axboe
2021-02-14 20:30     ` Linus Torvalds
2021-02-14 21:24       ` Al Viro
2021-02-15 18:07       ` Eric W. Biederman
2021-02-15 18:24         ` Jens Axboe
2021-02-15 21:09           ` Jens Axboe
2021-02-15 22:41             ` Eric W. Biederman
2021-02-16  2:41               ` Jens Axboe
2021-02-17  1:18                 ` Jens Axboe
2021-02-17  1:26                   ` Jens Axboe
2021-02-17  3:11                     ` Jens Axboe
2021-02-15 17:56     ` Eric W. Biederman

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.