All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCHSET 0/2] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK
@ 2020-12-10 20:01 Jens Axboe
  2020-12-10 20:01 ` [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK Jens Axboe
  2020-12-10 20:01 ` [PATCH 2/2] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK Jens Axboe
  0 siblings, 2 replies; 28+ messages in thread
From: Jens Axboe @ 2020-12-10 20:01 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: torvalds, viro

Hi,

This adds support for just doing the RCU based (and non-blocking) part
of path resolution. The main motivation is for io_uring to be able to
do saner/faster open, so we don't always have to go async, particularly
for the fast path of the data already being cached.

Internally that is presented as LOOKUP_NONBLOCK, which depends on
LOOKUP_RCU for doing the right thing. If we terminate the RCU part of the
lookup, then we return -EAGAIN if LOOKUP_NONBLOCK is also set.

The second patch is enabling this through openat2() as well, by adding
a RESOLVE_NONBLOCK that can be passed in struct open_how ->resolve as
well. Basic test case:

[root@archlinux liburing]# echo 3 > /proc/sys/vm/drop_caches 
[root@archlinux liburing]# test/do-open2 /etc/nanorc
open: -1
openat2: Resource temporarily unavailable
[root@archlinux liburing]# touch /etc/nanorc
[root@archlinux liburing]# test/do-open2 /etc/nanorc
open: 3

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-10 20:01 [PATCHSET 0/2] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK Jens Axboe
@ 2020-12-10 20:01 ` Jens Axboe
  2020-12-10 20:53   ` Linus Torvalds
  2020-12-11  2:35   ` Al Viro
  2020-12-10 20:01 ` [PATCH 2/2] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK Jens Axboe
  1 sibling, 2 replies; 28+ messages in thread
From: Jens Axboe @ 2020-12-10 20:01 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: torvalds, viro, Jens Axboe

io_uring always punts opens to async context, since there's no control
over whether the lookup blocks or not. Add LOOKUP_NONBLOCK to support
just doing the fast RCU based lookups, which we know will not block. If
we can do a cached path resolution of the filename, then we don't have
to always punt lookups for a worker.

During path resolution, we always do LOOKUP_RCU first. If that fails and
we terminate LOOKUP_RCU, then fail a LOOKUP_NONBLOCK attempt as well.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/namei.c            | 60 +++++++++++++++++++++++++++++++------------
 include/linux/namei.h |  1 +
 2 files changed, 44 insertions(+), 17 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 03d0e11e4f36..3d86915568fa 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -679,7 +679,7 @@ static bool legitimize_root(struct nameidata *nd)
  * Nothing should touch nameidata between unlazy_walk() failure and
  * terminate_walk().
  */
-static int unlazy_walk(struct nameidata *nd)
+static int complete_walk_rcu(struct nameidata *nd)
 {
 	struct dentry *parent = nd->path.dentry;
 
@@ -704,6 +704,18 @@ static int unlazy_walk(struct nameidata *nd)
 	return -ECHILD;
 }
 
+static int unlazy_walk(struct nameidata *nd)
+{
+	int ret;
+
+	ret = complete_walk_rcu(nd);
+	/* If caller is asking for NONBLOCK lookup, assume we can't satisfy it */
+	if (!ret && (nd->flags & LOOKUP_NONBLOCK))
+		ret = -EAGAIN;
+
+	return ret;
+}
+
 /**
  * unlazy_child - try to switch to ref-walk mode.
  * @nd: nameidata pathwalk data
@@ -764,10 +776,13 @@ static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned se
 
 static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
 {
-	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
+	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
+		if ((flags & (LOOKUP_RCU | LOOKUP_NONBLOCK)) == LOOKUP_NONBLOCK)
+			return -EAGAIN;
 		return dentry->d_op->d_revalidate(dentry, flags);
-	else
-		return 1;
+	}
+
+	return 1;
 }
 
 /**
@@ -792,7 +807,7 @@ static int complete_walk(struct nameidata *nd)
 		 */
 		if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED)))
 			nd->root.mnt = NULL;
-		if (unlikely(unlazy_walk(nd)))
+		if (unlikely(complete_walk_rcu(nd)))
 			return -ECHILD;
 	}
 
@@ -1466,8 +1481,9 @@ static struct dentry *lookup_fast(struct nameidata *nd,
 		unsigned seq;
 		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
 		if (unlikely(!dentry)) {
-			if (unlazy_walk(nd))
-				return ERR_PTR(-ECHILD);
+			int ret = unlazy_walk(nd);
+			if (ret)
+				return ERR_PTR(ret);
 			return NULL;
 		}
 
@@ -1569,8 +1585,9 @@ static inline int may_lookup(struct nameidata *nd)
 		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
 		if (err != -ECHILD)
 			return err;
-		if (unlazy_walk(nd))
-			return -ECHILD;
+		err = unlazy_walk(nd);
+		if (err)
+			return err;
 	}
 	return inode_permission(nd->inode, MAY_EXEC);
 }
@@ -1591,9 +1608,11 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
 		// we need to grab link before we do unlazy.  And we can't skip
 		// unlazy even if we fail to grab the link - cleanup needs it
 		bool grabbed_link = legitimize_path(nd, link, seq);
+		int ret;
 
-		if (unlazy_walk(nd) != 0 || !grabbed_link)
-			return -ECHILD;
+		ret = unlazy_walk(nd);
+		if (ret && !grabbed_link)
+			return ret;
 
 		if (nd_alloc_stack(nd))
 			return 0;
@@ -1634,8 +1653,9 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
 		touch_atime(&last->link);
 		cond_resched();
 	} else if (atime_needs_update(&last->link, inode)) {
-		if (unlikely(unlazy_walk(nd)))
-			return ERR_PTR(-ECHILD);
+		error = unlazy_walk(nd);
+		if (unlikely(error))
+			return ERR_PTR(error);
 		touch_atime(&last->link);
 	}
 
@@ -1652,8 +1672,9 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
 		if (nd->flags & LOOKUP_RCU) {
 			res = get(NULL, inode, &last->done);
 			if (res == ERR_PTR(-ECHILD)) {
-				if (unlikely(unlazy_walk(nd)))
-					return ERR_PTR(-ECHILD);
+				error = unlazy_walk(nd);
+				if (unlikely(error))
+					return ERR_PTR(error);
 				res = get(link->dentry, inode, &last->done);
 			}
 		} else {
@@ -2193,8 +2214,9 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		}
 		if (unlikely(!d_can_lookup(nd->path.dentry))) {
 			if (nd->flags & LOOKUP_RCU) {
-				if (unlazy_walk(nd))
-					return -ECHILD;
+				err = unlazy_walk(nd);
+				if (err)
+					return err;
 			}
 			return -ENOTDIR;
 		}
@@ -3394,10 +3416,14 @@ struct file *do_filp_open(int dfd, struct filename *pathname,
 
 	set_nameidata(&nd, dfd, pathname);
 	filp = path_openat(&nd, op, flags | LOOKUP_RCU);
+	/* If we fail RCU lookup, assume NONBLOCK cannot be honored */
+	if (flags & LOOKUP_NONBLOCK)
+		goto out;
 	if (unlikely(filp == ERR_PTR(-ECHILD)))
 		filp = path_openat(&nd, op, flags);
 	if (unlikely(filp == ERR_PTR(-ESTALE)))
 		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
+out:
 	restore_nameidata();
 	return filp;
 }
diff --git a/include/linux/namei.h b/include/linux/namei.h
index a4bb992623c4..c36c4e0805fc 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -46,6 +46,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT};
 #define LOOKUP_NO_XDEV		0x040000 /* No mountpoint crossing. */
 #define LOOKUP_BENEATH		0x080000 /* No escaping from starting point. */
 #define LOOKUP_IN_ROOT		0x100000 /* Treat dirfd as fs root. */
+#define LOOKUP_NONBLOCK		0x200000 /* don't block for lookup */
 /* LOOKUP_* flags which do scope-related checks based on the dirfd. */
 #define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT)
 
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 2/2] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK
  2020-12-10 20:01 [PATCHSET 0/2] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK Jens Axboe
  2020-12-10 20:01 ` [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK Jens Axboe
@ 2020-12-10 20:01 ` Jens Axboe
  2020-12-10 22:29   ` Dave Chinner
  1 sibling, 1 reply; 28+ messages in thread
From: Jens Axboe @ 2020-12-10 20:01 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: torvalds, viro, Jens Axboe

Now that we support non-blocking path resolution internally, expose it
via openat2() in the struct open_how ->resolve flags. This allows
applications using openat2() to limit path resolution to the extent that
it is already cached.

If the lookup cannot be satisfied in a non-blocking manner, openat2(2)
will return -1/-EAGAIN.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/open.c                    | 2 ++
 include/linux/fcntl.h        | 2 +-
 include/uapi/linux/openat2.h | 2 ++
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/open.c b/fs/open.c
index 9af548fb841b..07dc9f3d1628 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1087,6 +1087,8 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
 		lookup_flags |= LOOKUP_BENEATH;
 	if (how->resolve & RESOLVE_IN_ROOT)
 		lookup_flags |= LOOKUP_IN_ROOT;
+	if (how->resolve & RESOLVE_NONBLOCK)
+		lookup_flags |= LOOKUP_NONBLOCK;
 
 	op->lookup_flags = lookup_flags;
 	return 0;
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index 921e750843e6..919a13c9317c 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -19,7 +19,7 @@
 /* List of all valid flags for the how->resolve argument: */
 #define VALID_RESOLVE_FLAGS \
 	(RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
-	 RESOLVE_BENEATH | RESOLVE_IN_ROOT)
+	 RESOLVE_BENEATH | RESOLVE_IN_ROOT | RESOLVE_NONBLOCK)
 
 /* List of all open_how "versions". */
 #define OPEN_HOW_SIZE_VER0	24 /* sizeof first published struct */
diff --git a/include/uapi/linux/openat2.h b/include/uapi/linux/openat2.h
index 58b1eb711360..ddbf0796841a 100644
--- a/include/uapi/linux/openat2.h
+++ b/include/uapi/linux/openat2.h
@@ -35,5 +35,7 @@ struct open_how {
 #define RESOLVE_IN_ROOT		0x10 /* Make all jumps to "/" and ".."
 					be scoped inside the dirfd
 					(similar to chroot(2)). */
+#define RESOLVE_NONBLOCK	0x20 /* Only complete if resolution can be
+					done without IO */
 
 #endif /* _UAPI_LINUX_OPENAT2_H */
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-10 20:01 ` [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK Jens Axboe
@ 2020-12-10 20:53   ` Linus Torvalds
  2020-12-10 21:06     ` Jens Axboe
  2020-12-11  2:35   ` Al Viro
  1 sibling, 1 reply; 28+ messages in thread
From: Linus Torvalds @ 2020-12-10 20:53 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-fsdevel, Al Viro

On Thu, Dec 10, 2020 at 12:01 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> io_uring always punts opens to async context, since there's no control
> over whether the lookup blocks or not. Add LOOKUP_NONBLOCK to support
> just doing the fast RCU based lookups, which we know will not block. If
> we can do a cached path resolution of the filename, then we don't have
> to always punt lookups for a worker.

Ok, this looks much better to me just from the name change.

Half of the patch is admittedly just to make sure it now returns the
right error from unlazy_walk (rather than knowing it's always
-ECHILD), and that could be its own thing, but I'm not sure it's even
worth splitting up. The only reason to do it would be to perhaps make
it really clear which part is the actual change, and which is just
that error handling cleanup.

So it looks fine to me, but I will leave this all to Al.

           Linus

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-10 20:53   ` Linus Torvalds
@ 2020-12-10 21:06     ` Jens Axboe
  2020-12-11  2:45       ` Al Viro
  0 siblings, 1 reply; 28+ messages in thread
From: Jens Axboe @ 2020-12-10 21:06 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-fsdevel, Al Viro

On 12/10/20 1:53 PM, Linus Torvalds wrote:
> On Thu, Dec 10, 2020 at 12:01 PM Jens Axboe <axboe@kernel.dk> wrote:
>>
>> io_uring always punts opens to async context, since there's no control
>> over whether the lookup blocks or not. Add LOOKUP_NONBLOCK to support
>> just doing the fast RCU based lookups, which we know will not block. If
>> we can do a cached path resolution of the filename, then we don't have
>> to always punt lookups for a worker.
> 
> Ok, this looks much better to me just from the name change.
> 
> Half of the patch is admittedly just to make sure it now returns the
> right error from unlazy_walk (rather than knowing it's always
> -ECHILD), and that could be its own thing, but I'm not sure it's even
> worth splitting up. The only reason to do it would be to perhaps make
> it really clear which part is the actual change, and which is just
> that error handling cleanup.
> 
> So it looks fine to me, but I will leave this all to Al.

I did consider doing a prep patch just making the error handling clearer
and get rid of the -ECHILD assumption, since it's pretty odd and not
even something I'd expect to see in there. Al, do you want a prep patch
to do that to make the change simpler/cleaner?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK
  2020-12-10 20:01 ` [PATCH 2/2] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK Jens Axboe
@ 2020-12-10 22:29   ` Dave Chinner
  2020-12-10 23:12     ` Jens Axboe
  2020-12-10 23:29     ` Linus Torvalds
  0 siblings, 2 replies; 28+ messages in thread
From: Dave Chinner @ 2020-12-10 22:29 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-fsdevel, torvalds, viro

On Thu, Dec 10, 2020 at 01:01:14PM -0700, Jens Axboe wrote:
> Now that we support non-blocking path resolution internally, expose it
> via openat2() in the struct open_how ->resolve flags. This allows
> applications using openat2() to limit path resolution to the extent that
> it is already cached.
> 
> If the lookup cannot be satisfied in a non-blocking manner, openat2(2)
> will return -1/-EAGAIN.
> 
> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  fs/open.c                    | 2 ++
>  include/linux/fcntl.h        | 2 +-
>  include/uapi/linux/openat2.h | 2 ++
>  3 files changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/open.c b/fs/open.c
> index 9af548fb841b..07dc9f3d1628 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -1087,6 +1087,8 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
>  		lookup_flags |= LOOKUP_BENEATH;
>  	if (how->resolve & RESOLVE_IN_ROOT)
>  		lookup_flags |= LOOKUP_IN_ROOT;
> +	if (how->resolve & RESOLVE_NONBLOCK)
> +		lookup_flags |= LOOKUP_NONBLOCK;
>  
>  	op->lookup_flags = lookup_flags;
>  	return 0;
> diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
> index 921e750843e6..919a13c9317c 100644
> --- a/include/linux/fcntl.h
> +++ b/include/linux/fcntl.h
> @@ -19,7 +19,7 @@
>  /* List of all valid flags for the how->resolve argument: */
>  #define VALID_RESOLVE_FLAGS \
>  	(RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
> -	 RESOLVE_BENEATH | RESOLVE_IN_ROOT)
> +	 RESOLVE_BENEATH | RESOLVE_IN_ROOT | RESOLVE_NONBLOCK)
>  
>  /* List of all open_how "versions". */
>  #define OPEN_HOW_SIZE_VER0	24 /* sizeof first published struct */
> diff --git a/include/uapi/linux/openat2.h b/include/uapi/linux/openat2.h
> index 58b1eb711360..ddbf0796841a 100644
> --- a/include/uapi/linux/openat2.h
> +++ b/include/uapi/linux/openat2.h
> @@ -35,5 +35,7 @@ struct open_how {
>  #define RESOLVE_IN_ROOT		0x10 /* Make all jumps to "/" and ".."
>  					be scoped inside the dirfd
>  					(similar to chroot(2)). */
> +#define RESOLVE_NONBLOCK	0x20 /* Only complete if resolution can be
> +					done without IO */

I don't think this describes the implementation correctly - it has
nothing to actually do with whether IO is needed, just whether the
lookup can be done without taking blocking locks. The slow path can
complete without doing IO - it might miss the dentry cache but hit
the filesystem buffer cache on lookup and the inode cache when
retrieving the inode. And it may not even block anywhere doing this.

So, really, this isn't avoiding IO at all - it's avoiding the
possibility of running a lookup path that might blocking on
something.

This also needs a openat2(2) man page update explaining exactly what
behaviour/semantics this flag provides and that userspace can rely
on when this flag is set...

We've been failing to define the behaviour of our interfaces clearly,
especially around non-blocking IO behaviour in recent times. We need
to fix that, not make matters worse by adding new, poorly defined
non-blocking behaviours...

I'd also like to know how we actually test this is working- a
reliable regression test for fstests would be very useful for
ensuring that the behaviour as defined by the man page is not broken
accidentally by future changes...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK
  2020-12-10 22:29   ` Dave Chinner
@ 2020-12-10 23:12     ` Jens Axboe
  2020-12-10 23:29     ` Linus Torvalds
  1 sibling, 0 replies; 28+ messages in thread
From: Jens Axboe @ 2020-12-10 23:12 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, torvalds, viro

On 12/10/20 3:29 PM, Dave Chinner wrote:
> On Thu, Dec 10, 2020 at 01:01:14PM -0700, Jens Axboe wrote:
>> Now that we support non-blocking path resolution internally, expose it
>> via openat2() in the struct open_how ->resolve flags. This allows
>> applications using openat2() to limit path resolution to the extent that
>> it is already cached.
>>
>> If the lookup cannot be satisfied in a non-blocking manner, openat2(2)
>> will return -1/-EAGAIN.
>>
>> Cc: Al Viro <viro@zeniv.linux.org.uk>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
>>  fs/open.c                    | 2 ++
>>  include/linux/fcntl.h        | 2 +-
>>  include/uapi/linux/openat2.h | 2 ++
>>  3 files changed, 5 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/open.c b/fs/open.c
>> index 9af548fb841b..07dc9f3d1628 100644
>> --- a/fs/open.c
>> +++ b/fs/open.c
>> @@ -1087,6 +1087,8 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
>>  		lookup_flags |= LOOKUP_BENEATH;
>>  	if (how->resolve & RESOLVE_IN_ROOT)
>>  		lookup_flags |= LOOKUP_IN_ROOT;
>> +	if (how->resolve & RESOLVE_NONBLOCK)
>> +		lookup_flags |= LOOKUP_NONBLOCK;
>>  
>>  	op->lookup_flags = lookup_flags;
>>  	return 0;
>> diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
>> index 921e750843e6..919a13c9317c 100644
>> --- a/include/linux/fcntl.h
>> +++ b/include/linux/fcntl.h
>> @@ -19,7 +19,7 @@
>>  /* List of all valid flags for the how->resolve argument: */
>>  #define VALID_RESOLVE_FLAGS \
>>  	(RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
>> -	 RESOLVE_BENEATH | RESOLVE_IN_ROOT)
>> +	 RESOLVE_BENEATH | RESOLVE_IN_ROOT | RESOLVE_NONBLOCK)
>>  
>>  /* List of all open_how "versions". */
>>  #define OPEN_HOW_SIZE_VER0	24 /* sizeof first published struct */
>> diff --git a/include/uapi/linux/openat2.h b/include/uapi/linux/openat2.h
>> index 58b1eb711360..ddbf0796841a 100644
>> --- a/include/uapi/linux/openat2.h
>> +++ b/include/uapi/linux/openat2.h
>> @@ -35,5 +35,7 @@ struct open_how {
>>  #define RESOLVE_IN_ROOT		0x10 /* Make all jumps to "/" and ".."
>>  					be scoped inside the dirfd
>>  					(similar to chroot(2)). */
>> +#define RESOLVE_NONBLOCK	0x20 /* Only complete if resolution can be
>> +					done without IO */
> 
> I don't think this describes the implementation correctly - it has
> nothing to actually do with whether IO is needed, just whether the
> lookup can be done without taking blocking locks. The slow path can
> complete without doing IO - it might miss the dentry cache but hit
> the filesystem buffer cache on lookup and the inode cache when
> retrieving the inode. And it may not even block anywhere doing this.
> 
> So, really, this isn't avoiding IO at all - it's avoiding the
> possibility of running a lookup path that might blocking on
> something.

Right, it's about not blocking, as the commit message says. That could
be IO, either directly or indirectly, or it could be just locking. I'll
update this comment.

> This also needs a openat2(2) man page update explaining exactly what
> behaviour/semantics this flag provides and that userspace can rely
> on when this flag is set...

Agree, I'll add that.

> We've been failing to define the behaviour of our interfaces clearly,
> especially around non-blocking IO behaviour in recent times. We need
> to fix that, not make matters worse by adding new, poorly defined
> non-blocking behaviours...

Also agree on that! It doesn't help that different folks have different
criteria of what nowait/nonblock means...

> I'd also like to know how we actually test this is working- a
> reliable regression test for fstests would be very useful for
> ensuring that the behaviour as defined by the man page is not broken
> accidentally by future changes...

Definitely. On the io_uring side, I generally run with a debug patch
that triggers if anything blocks off the submission path. That won't
really work for this one, however.

FWIW, I'm quite fine deferring this patch, I obviously care more about
the io_uring side of things. It seems like a no-brainer to support for
openat2 though, as it would allow applications to decide when to punt
open to another thread. Since this one ties in very closely with
LOOKUP_RCU, I'm not _too_ worried about this one in particular. But it
would be great to have something that we could use with eg RWF_NOWAIT as
well, and similar constructs. I'll give it some thought.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK
  2020-12-10 22:29   ` Dave Chinner
  2020-12-10 23:12     ` Jens Axboe
@ 2020-12-10 23:29     ` Linus Torvalds
  2020-12-11  0:58       ` Dave Chinner
  1 sibling, 1 reply; 28+ messages in thread
From: Linus Torvalds @ 2020-12-10 23:29 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Jens Axboe, linux-fsdevel, Al Viro

On Thu, Dec 10, 2020 at 2:29 PM Dave Chinner <david@fromorbit.com> wrote:
>
> So, really, this isn't avoiding IO at all - it's avoiding the
> possibility of running a lookup path that might blocking on
> something.

For pathname lookup, the only case that matters is the RCU lockless lookup.

That cache hits basically 100% of the time except for the first
lookup, or under memory pressure.

And honestly, from a performance perspective, it's the lockless path
lookup that matters most. By the time you have to go to the
filesystem, take the directory locks etc, you've already lost.

So we're never going to bother with some kind of "lockless lookup for
actual filesystems", because it's only extra work for no actual gain.

End result: LOOKUP_NONBLOCK is about not just avoiding IO, but about
avoiding the filesystem and the inevitable locking that causes.

              Linus

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK
  2020-12-10 23:29     ` Linus Torvalds
@ 2020-12-11  0:58       ` Dave Chinner
  2020-12-11  1:01         ` Linus Torvalds
  0 siblings, 1 reply; 28+ messages in thread
From: Dave Chinner @ 2020-12-11  0:58 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jens Axboe, linux-fsdevel, Al Viro

On Thu, Dec 10, 2020 at 03:29:23PM -0800, Linus Torvalds wrote:
> On Thu, Dec 10, 2020 at 2:29 PM Dave Chinner <david@fromorbit.com> wrote:
> >
> > So, really, this isn't avoiding IO at all - it's avoiding the
> > possibility of running a lookup path that might blocking on
> > something.
> 
> For pathname lookup, the only case that matters is the RCU lockless lookup.
> 
> That cache hits basically 100% of the time except for the first
> lookup, or under memory pressure.
> 
> And honestly, from a performance perspective, it's the lockless path
> lookup that matters most. By the time you have to go to the
> filesystem, take the directory locks etc, you've already lost.
> 
> So we're never going to bother with some kind of "lockless lookup for
> actual filesystems", because it's only extra work for no actual gain.
> 
> End result: LOOKUP_NONBLOCK is about not just avoiding IO, but about
> avoiding the filesystem and the inevitable locking that causes.

Umm, yes, that is _exactly_ what I just said. :/

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK
  2020-12-11  0:58       ` Dave Chinner
@ 2020-12-11  1:01         ` Linus Torvalds
  2020-12-11  3:45           ` Dave Chinner
  0 siblings, 1 reply; 28+ messages in thread
From: Linus Torvalds @ 2020-12-11  1:01 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Jens Axboe, linux-fsdevel, Al Viro

On Thu, Dec 10, 2020 at 4:58 PM Dave Chinner <david@fromorbit.com> wrote:
>
> Umm, yes, that is _exactly_ what I just said. :/

.,. but it _sounded_ like you would actually want to do the whole
filesystem thing, since why would you have piped up otherwise. I just
wanted to clarify that the onle sane model is the one that patch
actually implements.

Otherwise, your email was just nit-picking about a single word in a
comment in a header file.

Was that really what you wanted to do?

            Linus

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-10 20:01 ` [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK Jens Axboe
  2020-12-10 20:53   ` Linus Torvalds
@ 2020-12-11  2:35   ` Al Viro
  2020-12-11 15:57     ` Jens Axboe
  1 sibling, 1 reply; 28+ messages in thread
From: Al Viro @ 2020-12-11  2:35 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-fsdevel, torvalds

On Thu, Dec 10, 2020 at 01:01:13PM -0700, Jens Axboe wrote:
> io_uring always punts opens to async context, since there's no control
> over whether the lookup blocks or not. Add LOOKUP_NONBLOCK to support
> just doing the fast RCU based lookups, which we know will not block. If
> we can do a cached path resolution of the filename, then we don't have
> to always punt lookups for a worker.
> 
> During path resolution, we always do LOOKUP_RCU first. If that fails and
> we terminate LOOKUP_RCU, then fail a LOOKUP_NONBLOCK attempt as well.

In effect you are adding a mode where
	* unlazy would fail, except when done from complete_walk()
	* ->d_revalidate() wouldn't be attempted at all (not even with LOOKUP_RCU)
	* ... but ->get_link() in RCU mode would
	* ... and so would everything done after complete_walk() in
do_open(), very much including the joys like mnt_want_write() (i.e. waiting for
frozen fs to thaw), handling O_TRUNC, calling ->open() itself...

So this "not punting lookups for a worker" looks fishy as hell - if you care
about blocking operations, you haven't really won anything.

And why exactly is the RCU case of ->d_revalidate() worth buggering off (it
really can't block - it's called under rcu_read_lock() and it does *not*
drop it)?

_IF_ for some theoretical exercise you want to do "lookup without dropping
out of RCU", just add a flag that has unlazy_walk() fail.  With -ECHILD.
Strip it away in complete_walk() and have path_init() with that flag
and without LOOKUP_RCU fail with -EAGAIN.  All there is to it.

It still leaves you with fuckloads of blocking operations (and that's
"blocking" with "until admin thaws the damn filesystem several hours
down the road") after complete_walk(), though.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-10 21:06     ` Jens Axboe
@ 2020-12-11  2:45       ` Al Viro
  2020-12-11 16:05         ` Jens Axboe
  0 siblings, 1 reply; 28+ messages in thread
From: Al Viro @ 2020-12-11  2:45 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Linus Torvalds, linux-fsdevel

On Thu, Dec 10, 2020 at 02:06:39PM -0700, Jens Axboe wrote:
> On 12/10/20 1:53 PM, Linus Torvalds wrote:
> > On Thu, Dec 10, 2020 at 12:01 PM Jens Axboe <axboe@kernel.dk> wrote:
> >>
> >> io_uring always punts opens to async context, since there's no control
> >> over whether the lookup blocks or not. Add LOOKUP_NONBLOCK to support
> >> just doing the fast RCU based lookups, which we know will not block. If
> >> we can do a cached path resolution of the filename, then we don't have
> >> to always punt lookups for a worker.
> > 
> > Ok, this looks much better to me just from the name change.
> > 
> > Half of the patch is admittedly just to make sure it now returns the
> > right error from unlazy_walk (rather than knowing it's always
> > -ECHILD), and that could be its own thing, but I'm not sure it's even
> > worth splitting up. The only reason to do it would be to perhaps make
> > it really clear which part is the actual change, and which is just
> > that error handling cleanup.
> > 
> > So it looks fine to me, but I will leave this all to Al.
> 
> I did consider doing a prep patch just making the error handling clearer
> and get rid of the -ECHILD assumption, since it's pretty odd and not
> even something I'd expect to see in there. Al, do you want a prep patch
> to do that to make the change simpler/cleaner?

No, I do not.  Why bother returning anything other than -ECHILD, when
you can just have path_init() treat you flag sans LOOKUP_RCU as "fail
with -EAGAIN now" and be done with that?

What's the point propagating that thing when we are going to call the
non-RCU variant next if we get -ECHILD?

And that still doesn't answer the questions about the difference between
->d_revalidate() and ->get_link() (for the latter you keep the call in
RCU mode, for the former you generate that -EAGAIN crap).  Or between
->d_revalidate() and ->permission(), for that matter.

Finally, I really wonder what is that for; if you are in conditions when
you really don't want to risk going to sleep, you do *NOT* want to
do mnt_want_write().  Or ->open().  Or truncate().  Or, for Cthulhu
sake, IMA hash calculation.

So how hard are your "we don't want to block here" requirements?  Because
the stuff you do after complete_walk() can easily be much longer than
everything else.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK
  2020-12-11  1:01         ` Linus Torvalds
@ 2020-12-11  3:45           ` Dave Chinner
  2020-12-11 18:07             ` Linus Torvalds
  0 siblings, 1 reply; 28+ messages in thread
From: Dave Chinner @ 2020-12-11  3:45 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jens Axboe, linux-fsdevel, Al Viro

On Thu, Dec 10, 2020 at 05:01:44PM -0800, Linus Torvalds wrote:
> On Thu, Dec 10, 2020 at 4:58 PM Dave Chinner <david@fromorbit.com> wrote:
> >
> > Umm, yes, that is _exactly_ what I just said. :/
> 
> .,. but it _sounded_ like you would actually want to do the whole
> filesystem thing, since why would you have piped up otherwise. I just
> wanted to clarify that the onle sane model is the one that patch
> actually implements.

<sigh>

Is that really what you think motivates me, Linus? It sounds like
you've created an Evil Dave strawman and you simply cannot see past
the taint Evil Dave has created in your head. :/

I commented because Jens has recently found several issues with
inconsistencies in "non-blocking" APIs that we have in the IO path.
He's triggered bugs in the non-blocking behaviour in filesystem code
through io_uring that we've had to fix (e.g. see commit 883a790a8440
("xfs: don't allow NOWAIT DIO across extent boundaries"). Then there
are the active discussions about the limited ability to use
IOCB_NOWAIT for full stack non-blocking IO behaviour w/ REQ_NOWAIT
in the block layer because the semantics of IOCB_NOWAIT are directly
tied to the requirements of the RWF_NOWAIT preadv2/pwritev2 flag and
using REQ_NOWAIT in the block layer will break them.

Part of the problem we have with the non-blocking behaviour is that
the user interfaces have been under specified, poorly reviewed and
targetted a single specific use case on a single filesystem rather
than generic behaviour. And mostly they lack the necessary test
coverage to ensure all filesystems behave the same way and to inform
us of a regression that *would break userspace applications*.

Yes, I recognise and accept that some of the problems are partially
my fault. I also have a habit of trying to learn from the mistakes
I've made and then take steps to ensure that *we do not make those
same mistakes again*.

> Otherwise, your email was just nit-picking about a single word in a
> comment in a header file.
> 
> Was that really what you wanted to do?

So for all your talk about "don't break userspace", you think that
actively taking steps during reviews to avoid a poor userspace API
is "nit-picking"? FYI, having a reviewer ask for a userspace API
modification to:

	- have clearly specified and documented behaviour,
	- be provided with user documentation, and
	- be submitted with regression tests

is not at all unusual or unexpected. Asking for these things during
review on -fsdevel and various filesystem lists is a normal part of
the process for getting changes to user APIs reviewed and merged.
The fact that Jens replied with "yep, no problems, let's make sure
we nail down the semantics" and Al has replied "what does
RESOLVE_NONBLOCK actually mean for all the blocking stuff that open
does /after/ the pathwalk?" shows that these semantics really do
matter Hence they need to be defined, specified, documented and
carefully exercised by regression tests. i.e. the patch that
introduces the RESOLVE_NONBLOCK flag is the -easy part-, filling in
the rest of the blanks is where all the hard work is...

Hence calling these requests "nit picking" sets entirely the wrong
tone for the wider community. You may not care about things like
properly documenting interfaces, but application developers and
users sure do and hence it's something we need to pay attention to
and encourage.

Leaders are supposed to encourage and support good development
practices, not be arseholes to the people who ask for good practices
to be followed.

Please start behaving more like a leader should when I'm around,
Linus.

-Dave.
(Not really Evil.)
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-11  2:35   ` Al Viro
@ 2020-12-11 15:57     ` Jens Axboe
  2020-12-11 17:21       ` Linus Torvalds
  0 siblings, 1 reply; 28+ messages in thread
From: Jens Axboe @ 2020-12-11 15:57 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, torvalds

On 12/10/20 7:35 PM, Al Viro wrote:
> On Thu, Dec 10, 2020 at 01:01:13PM -0700, Jens Axboe wrote:
>> io_uring always punts opens to async context, since there's no control
>> over whether the lookup blocks or not. Add LOOKUP_NONBLOCK to support
>> just doing the fast RCU based lookups, which we know will not block. If
>> we can do a cached path resolution of the filename, then we don't have
>> to always punt lookups for a worker.
>>
>> During path resolution, we always do LOOKUP_RCU first. If that fails and
>> we terminate LOOKUP_RCU, then fail a LOOKUP_NONBLOCK attempt as well.
> 
> In effect you are adding a mode where
> 	* unlazy would fail, except when done from complete_walk()
> 	* ->d_revalidate() wouldn't be attempted at all (not even with LOOKUP_RCU)
> 	* ... but ->get_link() in RCU mode would
> 	* ... and so would everything done after complete_walk() in
> do_open(), very much including the joys like mnt_want_write() (i.e. waiting for
> frozen fs to thaw), handling O_TRUNC, calling ->open() itself...
> 
> So this "not punting lookups for a worker" looks fishy as hell - if you care
> about blocking operations, you haven't really won anything.
> 
> And why exactly is the RCU case of ->d_revalidate() worth buggering off (it
> really can't block - it's called under rcu_read_lock() and it does *not*
> drop it)?
> 
> _IF_ for some theoretical exercise you want to do "lookup without dropping
> out of RCU", just add a flag that has unlazy_walk() fail.  With -ECHILD.
> Strip it away in complete_walk() and have path_init() with that flag
> and without LOOKUP_RCU fail with -EAGAIN.  All there is to it.

Thanks Al, that makes for an easier implementation. I like that suggestion,
boils it down to just three hunks (see below).

For io_uring, the concept is just to perform the fast path inline. The
RCU lookup serves that purpose nicely - if we fail that, then it's expected
to take the latency hit of going async.

> It still leaves you with fuckloads of blocking operations (and that's
> "blocking" with "until admin thaws the damn filesystem several hours
> down the road") after complete_walk(), though.

But that's true (and expected) for any open that isn't non-blocking.


diff --git a/fs/namei.c b/fs/namei.c
index d7952f863e79..d49c72e34c6e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -686,6 +686,8 @@ static bool unlazy_walk(struct nameidata *nd)
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
 
 	nd->flags &= ~LOOKUP_RCU;
+	if (nd->flags & LOOKUP_NONBLOCK)
+		goto out1;
 	if (unlikely(!legitimize_links(nd)))
 		goto out1;
 	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
@@ -792,6 +794,7 @@ static int complete_walk(struct nameidata *nd)
 		 */
 		if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED)))
 			nd->root.mnt = NULL;
+		nd->flags &= ~LOOKUP_NONBLOCK;
 		if (unlikely(unlazy_walk(nd)))
 			return -ECHILD;
 	}
@@ -2209,6 +2212,9 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 
 	if (!*s)
 		flags &= ~LOOKUP_RCU;
+	/* LOOKUP_NONBLOCK requires RCU, ask caller to retry */
+	if ((flags & (LOOKUP_RCU | LOOKUP_NONBLOCK)) == LOOKUP_NONBLOCK)
+		return ERR_PTR(-EAGAIN);
 	if (flags & LOOKUP_RCU)
 		rcu_read_lock();
 

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-11  2:45       ` Al Viro
@ 2020-12-11 16:05         ` Jens Axboe
  2020-12-11 17:20           ` Al Viro
  2020-12-11 17:33           ` Matthew Wilcox
  0 siblings, 2 replies; 28+ messages in thread
From: Jens Axboe @ 2020-12-11 16:05 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On 12/10/20 7:45 PM, Al Viro wrote:
> On Thu, Dec 10, 2020 at 02:06:39PM -0700, Jens Axboe wrote:
>> On 12/10/20 1:53 PM, Linus Torvalds wrote:
>>> On Thu, Dec 10, 2020 at 12:01 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>
>>>> io_uring always punts opens to async context, since there's no control
>>>> over whether the lookup blocks or not. Add LOOKUP_NONBLOCK to support
>>>> just doing the fast RCU based lookups, which we know will not block. If
>>>> we can do a cached path resolution of the filename, then we don't have
>>>> to always punt lookups for a worker.
>>>
>>> Ok, this looks much better to me just from the name change.
>>>
>>> Half of the patch is admittedly just to make sure it now returns the
>>> right error from unlazy_walk (rather than knowing it's always
>>> -ECHILD), and that could be its own thing, but I'm not sure it's even
>>> worth splitting up. The only reason to do it would be to perhaps make
>>> it really clear which part is the actual change, and which is just
>>> that error handling cleanup.
>>>
>>> So it looks fine to me, but I will leave this all to Al.
>>
>> I did consider doing a prep patch just making the error handling clearer
>> and get rid of the -ECHILD assumption, since it's pretty odd and not
>> even something I'd expect to see in there. Al, do you want a prep patch
>> to do that to make the change simpler/cleaner?
> 
> No, I do not.  Why bother returning anything other than -ECHILD, when
> you can just have path_init() treat you flag sans LOOKUP_RCU as "fail
> with -EAGAIN now" and be done with that?
> 
> What's the point propagating that thing when we are going to call the
> non-RCU variant next if we get -ECHILD?

Let's at least make it consistent - there is already one spot in there
that passes the return value back (see below).

> And that still doesn't answer the questions about the difference between
> ->d_revalidate() and ->get_link() (for the latter you keep the call in
> RCU mode, for the former you generate that -EAGAIN crap).  Or between
> ->d_revalidate() and ->permission(), for that matter.

I believe these are moot with the updated patch from the other email.

> Finally, I really wonder what is that for; if you are in conditions when
> you really don't want to risk going to sleep, you do *NOT* want to
> do mnt_want_write().  Or ->open().  Or truncate().  Or, for Cthulhu
> sake, IMA hash calculation.

I just want to do the RCU side lookup, that is all. That's my fast path.
If that doesn't work, then we'll go through the motions of pushing this
to a context that allows blocking open.

> So how hard are your "we don't want to block here" requirements?  Because
> the stuff you do after complete_walk() can easily be much longer than
> everything else.

Ideally it'd extend a bit beyond the RCU lookup, as things like proc
resolution will still fail with the proposed patch. But that's not a
huge deal to me, I consider the dentry lookup to be Good Enough.


commit bbfc4b98da8c5d9a64ae202952aa52ae6bb54dbd
Author: Jens Axboe <axboe@kernel.dk>
Date:   Thu Dec 10 14:10:37 2020 -0700

    fs: make unlazy_walk() error handling consistent
    
    Most callers check for non-zero return, and assume it's -ECHILD (which
    it always will be). One caller uses the actual error return. Clean this
    up and make it fully consistent, by having unlazy_walk() return a bool
    instead.
    
    No functional changes in this patch.
    
    Cc: Al Viro <viro@zeniv.linux.org.uk>
    Signed-off-by: Jens Axboe <axboe@kernel.dk>

diff --git a/fs/namei.c b/fs/namei.c
index 03d0e11e4f36..d7952f863e79 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -679,7 +679,7 @@ static bool legitimize_root(struct nameidata *nd)
  * Nothing should touch nameidata between unlazy_walk() failure and
  * terminate_walk().
  */
-static int unlazy_walk(struct nameidata *nd)
+static bool unlazy_walk(struct nameidata *nd)
 {
 	struct dentry *parent = nd->path.dentry;
 
@@ -694,14 +694,14 @@ static int unlazy_walk(struct nameidata *nd)
 		goto out;
 	rcu_read_unlock();
 	BUG_ON(nd->inode != parent->d_inode);
-	return 0;
+	return false;
 
 out1:
 	nd->path.mnt = NULL;
 	nd->path.dentry = NULL;
 out:
 	rcu_read_unlock();
-	return -ECHILD;
+	return true;
 }
 
 /**
@@ -3151,9 +3151,8 @@ static const char *open_last_lookups(struct nameidata *nd,
 	} else {
 		/* create side of things */
 		if (nd->flags & LOOKUP_RCU) {
-			error = unlazy_walk(nd);
-			if (unlikely(error))
-				return ERR_PTR(error);
+			if (unlazy_walk(nd))
+				return ERR_PTR(-ECHILD);
 		}
 		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
 		/* trailing slashes? */


-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-11 16:05         ` Jens Axboe
@ 2020-12-11 17:20           ` Al Viro
  2020-12-11 17:35             ` Linus Torvalds
  2020-12-11 18:50             ` Jens Axboe
  2020-12-11 17:33           ` Matthew Wilcox
  1 sibling, 2 replies; 28+ messages in thread
From: Al Viro @ 2020-12-11 17:20 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Linus Torvalds, linux-fsdevel

On Fri, Dec 11, 2020 at 09:05:26AM -0700, Jens Axboe wrote:

> > Finally, I really wonder what is that for; if you are in conditions when
> > you really don't want to risk going to sleep, you do *NOT* want to
> > do mnt_want_write().  Or ->open().  Or truncate().  Or, for Cthulhu
> > sake, IMA hash calculation.
> 
> I just want to do the RCU side lookup, that is all. That's my fast path.
> If that doesn't work, then we'll go through the motions of pushing this
> to a context that allows blocking open.

Explain, please.  What's the difference between blocking in a lookup and
blocking in truncate?  Either your call site is fine with a potentially
long sleep, or it is not; I don't understand what makes one source of
that behaviour different from another.

"Fast path" in context like "we can't sleep here, but often enough we
won't need to; here's a function that will bail out rather than blocking,
let's call that and go through offload to helper thread in rare case
when it does bail out" does make sense; what you are proposing to do
here is rather different and AFAICS saying "that's my fast path" is
meaningless here.

I really do not understand what it is that you are trying to achieve;
fastpath lookup part would be usable on its own, but mixed with
the rest of do_open() (as well as the open_last_lookups(), BTW)
it does not give the caller any useful warranties.

Theoretically it could be amended into something usable, but you
would need to make do_open(), open_last_lookups() (as well as
do_tmpfile()) honour your flag, with similar warranties provided
to caller.

AFAICS, without that part it is pretty much worthless.  And details
of what you are going to do in the missing bits *do* matter - unlike the
pathwalk side (which is trivial) it has potential for being very
messy.  I want to see _that_ before we commit to going there, and
a user-visible flag to openat2() makes a very strong commitment.

PS: just to make sure we are on the same page - O_NDELAY will *NOT*
suffice here.  I apologize if that's obvious to you, but I think
it's worth spelling out explicitly.

PPS: regarding unlazy_walk() change...  If we go that way, I would
probably changed the name to "try_to_unlazy" and inverted the meaning
of return value.  0 for success, -E... for failure is fine, but
false for success, true for failure is asking for recurring confusion.
IOW, I would rather do something like (completely untested)
diff --git a/fs/namei.c b/fs/namei.c
index d4a6dd772303..5abd1de11306 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -669,17 +669,17 @@ static bool legitimize_root(struct nameidata *nd)
  */
 
 /**
- * unlazy_walk - try to switch to ref-walk mode.
+ * try_to_unlazy - try to switch to ref-walk mode.
  * @nd: nameidata pathwalk data
- * Returns: 0 on success, -ECHILD on failure
+ * Returns: true on success, false on failure
  *
- * unlazy_walk attempts to legitimize the current nd->path and nd->root
+ * try_to_unlazy attempts to legitimize the current nd->path and nd->root
  * for ref-walk mode.
  * Must be called from rcu-walk context.
- * Nothing should touch nameidata between unlazy_walk() failure and
+ * Nothing should touch nameidata between try_to_unlazy() failure and
  * terminate_walk().
  */
-static int unlazy_walk(struct nameidata *nd)
+static bool try_to_unlazy(struct nameidata *nd)
 {
 	struct dentry *parent = nd->path.dentry;
 
@@ -694,14 +694,14 @@ static int unlazy_walk(struct nameidata *nd)
 		goto out;
 	rcu_read_unlock();
 	BUG_ON(nd->inode != parent->d_inode);
-	return 0;
+	return true;
 
 out1:
 	nd->path.mnt = NULL;
 	nd->path.dentry = NULL;
 out:
 	rcu_read_unlock();
-	return -ECHILD;
+	return false;
 }
 
 /**
@@ -792,7 +792,7 @@ static int complete_walk(struct nameidata *nd)
 		 */
 		if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED)))
 			nd->root.mnt = NULL;
-		if (unlikely(unlazy_walk(nd)))
+		if (unlikely(!try_to_unlazy(nd)))
 			return -ECHILD;
 	}
 
@@ -1466,7 +1466,7 @@ static struct dentry *lookup_fast(struct nameidata *nd,
 		unsigned seq;
 		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
 		if (unlikely(!dentry)) {
-			if (unlazy_walk(nd))
+			if (!try_to_unlazy(nd))
 				return ERR_PTR(-ECHILD);
 			return NULL;
 		}
@@ -1567,10 +1567,8 @@ static inline int may_lookup(struct nameidata *nd)
 {
 	if (nd->flags & LOOKUP_RCU) {
 		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
-		if (err != -ECHILD)
+		if (err != -ECHILD || !try_to_unlazy(nd))
 			return err;
-		if (unlazy_walk(nd))
-			return -ECHILD;
 	}
 	return inode_permission(nd->inode, MAY_EXEC);
 }
@@ -1592,7 +1590,7 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
 		// unlazy even if we fail to grab the link - cleanup needs it
 		bool grabbed_link = legitimize_path(nd, link, seq);
 
-		if (unlazy_walk(nd) != 0 || !grabbed_link)
+		if (!try_to_unlazy(nd) || !grabbed_link)
 			return -ECHILD;
 
 		if (nd_alloc_stack(nd))
@@ -1634,7 +1632,7 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
 		touch_atime(&last->link);
 		cond_resched();
 	} else if (atime_needs_update(&last->link, inode)) {
-		if (unlikely(unlazy_walk(nd)))
+		if (unlikely(!try_to_unlazy(nd)))
 			return ERR_PTR(-ECHILD);
 		touch_atime(&last->link);
 	}
@@ -1651,11 +1649,8 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
 		get = inode->i_op->get_link;
 		if (nd->flags & LOOKUP_RCU) {
 			res = get(NULL, inode, &last->done);
-			if (res == ERR_PTR(-ECHILD)) {
-				if (unlikely(unlazy_walk(nd)))
-					return ERR_PTR(-ECHILD);
+			if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
 				res = get(link->dentry, inode, &last->done);
-			}
 		} else {
 			res = get(link->dentry, inode, &last->done);
 		}
@@ -2193,7 +2188,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		}
 		if (unlikely(!d_can_lookup(nd->path.dentry))) {
 			if (nd->flags & LOOKUP_RCU) {
-				if (unlazy_walk(nd))
+				if (!try_to_unlazy(nd))
 					return -ECHILD;
 			}
 			return -ENOTDIR;
@@ -3127,7 +3122,6 @@ static const char *open_last_lookups(struct nameidata *nd,
 	struct inode *inode;
 	struct dentry *dentry;
 	const char *res;
-	int error;
 
 	nd->flags |= op->intent;
 
@@ -3151,9 +3145,8 @@ static const char *open_last_lookups(struct nameidata *nd,
 	} else {
 		/* create side of things */
 		if (nd->flags & LOOKUP_RCU) {
-			error = unlazy_walk(nd);
-			if (unlikely(error))
-				return ERR_PTR(error);
+			if (unlikely(!try_to_unlazy(nd)))
+				return ERR_PTR(-ECHILD);
 		}
 		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
 		/* trailing slashes? */
@@ -3162,8 +3155,7 @@ static const char *open_last_lookups(struct nameidata *nd,
 	}
 
 	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
-		error = mnt_want_write(nd->path.mnt);
-		if (!error)
+		if (!mnt_want_write(nd->path.mnt))
 			got_write = true;
 		/*
 		 * do _not_ fail yet - we might not need that or fail with

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-11 15:57     ` Jens Axboe
@ 2020-12-11 17:21       ` Linus Torvalds
  2020-12-11 17:29         ` Al Viro
  0 siblings, 1 reply; 28+ messages in thread
From: Linus Torvalds @ 2020-12-11 17:21 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Al Viro, linux-fsdevel

On Fri, Dec 11, 2020 at 7:57 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 12/10/20 7:35 PM, Al Viro wrote:
> > _IF_ for some theoretical exercise you want to do "lookup without dropping
> > out of RCU", just add a flag that has unlazy_walk() fail.  With -ECHILD.
> > Strip it away in complete_walk() and have path_init() with that flag
> > and without LOOKUP_RCU fail with -EAGAIN.  All there is to it.
>
> Thanks Al, that makes for an easier implementation. I like that suggestion,
> boils it down to just three hunks (see below).

Ooh. Yes, very nice.

        Linus

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-11 17:21       ` Linus Torvalds
@ 2020-12-11 17:29         ` Al Viro
  2020-12-11 17:38           ` Al Viro
                             ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: Al Viro @ 2020-12-11 17:29 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jens Axboe, linux-fsdevel

On Fri, Dec 11, 2020 at 09:21:20AM -0800, Linus Torvalds wrote:
> On Fri, Dec 11, 2020 at 7:57 AM Jens Axboe <axboe@kernel.dk> wrote:
> >
> > On 12/10/20 7:35 PM, Al Viro wrote:
> > > _IF_ for some theoretical exercise you want to do "lookup without dropping
> > > out of RCU", just add a flag that has unlazy_walk() fail.  With -ECHILD.
> > > Strip it away in complete_walk() and have path_init() with that flag
> > > and without LOOKUP_RCU fail with -EAGAIN.  All there is to it.
> >
> > Thanks Al, that makes for an easier implementation. I like that suggestion,
> > boils it down to just three hunks (see below).
> 
> Ooh. Yes, very nice.

Except for the wrong order in path_init() - the check should go _before_
        if (!*s)
                flags &= ~LOOKUP_RCU;
for obvious reasons.

Again, that part is trivial - what to do with do_open()/open_last_lookups()
is where the nastiness will be.  Basically, it makes sure we bail out in
cases when lookup itself would've blocked, but it does *not* bail out
when equally heavy (and unlikely) blocking sources hit past the complete_walk().
Which makes it rather useless for the caller, unless we get logics added to
that part as well.  And _that_ I want to see before we commit to anything.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-11 16:05         ` Jens Axboe
  2020-12-11 17:20           ` Al Viro
@ 2020-12-11 17:33           ` Matthew Wilcox
  2020-12-11 18:55             ` Jens Axboe
  1 sibling, 1 reply; 28+ messages in thread
From: Matthew Wilcox @ 2020-12-11 17:33 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Al Viro, Linus Torvalds, linux-fsdevel

On Fri, Dec 11, 2020 at 09:05:26AM -0700, Jens Axboe wrote:
> On 12/10/20 7:45 PM, Al Viro wrote:
> > So how hard are your "we don't want to block here" requirements?  Because
> > the stuff you do after complete_walk() can easily be much longer than
> > everything else.
> 
> Ideally it'd extend a bit beyond the RCU lookup, as things like proc
> resolution will still fail with the proposed patch. But that's not a
> huge deal to me, I consider the dentry lookup to be Good Enough.

FWIW, /proc/$pid always falls back to REF walks.  Here's a patch from
one of my colleagues that aims to fix that.

https://lore.kernel.org/linux-fsdevel/20201204000212.773032-1-stephen.s.brennan@oracle.com/

Maybe you had one of the other parts of /proc in mind?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-11 17:20           ` Al Viro
@ 2020-12-11 17:35             ` Linus Torvalds
  2020-12-11 18:50             ` Jens Axboe
  1 sibling, 0 replies; 28+ messages in thread
From: Linus Torvalds @ 2020-12-11 17:35 UTC (permalink / raw)
  To: Al Viro; +Cc: Jens Axboe, linux-fsdevel

On Fri, Dec 11, 2020 at 9:21 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> Explain, please.  What's the difference between blocking in a lookup and
> blocking in truncate?  Either your call site is fine with a potentially
> long sleep, or it is not; I don't understand what makes one source of
> that behaviour different from another.

So I'm not Jens, and I don't know exactly what io_uring loads he's
looking at, but the reason I'm interested in this is that this is very
much not the first time this has come up.

The big difference between filename lookup and truncate is that one is
very common indeed, and the other isn't.

Sure, something like truncate happens. And it might even be a huge
deal and very critical for some load. But realistically, I don't think
I've ever seen a load where if it's important, and you can do it
asynchronously, you couldn't just start a thread for it (particularly
a kthread).

> "Fast path" in context like "we can't sleep here, but often enough we
> won't need to; here's a function that will bail out rather than blocking,
> let's call that and go through offload to helper thread in rare case
> when it does bail out" does make sense; what you are proposing to do
> here is rather different and AFAICS saying "that's my fast path" is
> meaningless here.

The fast path context here is not "we can't sleep here".

No, the fast-path context here is "we want highest performance here",
with the understanding that there are other things to be done. The
existing code already simply starts a kernel thread for the open - not
because it "can't sleep", but because of that "I want to get this
operation started, but there are other things I want to start too".

And in that context, it's not about "can't sleep". It's about "if we
already have the data in a very fast cache, then doing this
asynchronously with a thread is SLOWER than just doing it directly".

In particular it's not about correctness: doing it synchronously or
asynchronously are both "equally correct". You get the same answer in
the end. It's purely about that "if we can do it really quickly, it's
better to just do it".

Which gets me back to the first part: this has come up before. Tux2
used to want to do _exactly_ this same thing.

But what has happened is that
 (a) we now have a RCU lookup that is an almost exact match for this
and
 (b) we now have a generic interface for user space to use it in the
form of io_uring

So this is not about "you have to get it right".  In fact, if it was,
the RCU lookup model would be the wrong thing, because the RCU name
lookup is optimistic, and will fail for a variety of reasons.

Bo, this is literally about "threads and synchronization is a real
overhead, so if you care about performance, you don't actually want to
use them if you can do the operation so fast that the thread and
synchronization overhead is a real issue".

Which is why LOOKUP_RCU is such a good match.

And while Tux was never very successful because it was so limited and
so special, io_uring really looks like it could be the interface to
make a lot of performance-sensitive people happy. And getting that
"low-latency cached behaviour vs bigger operations that might need
lots of locks or IO" balance right would be a very good thing, I
suspect.

            Linus

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-11 17:29         ` Al Viro
@ 2020-12-11 17:38           ` Al Viro
  2020-12-11 17:44           ` Linus Torvalds
  2020-12-11 21:46           ` Jens Axboe
  2 siblings, 0 replies; 28+ messages in thread
From: Al Viro @ 2020-12-11 17:38 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jens Axboe, linux-fsdevel

On Fri, Dec 11, 2020 at 05:29:31PM +0000, Al Viro wrote:
> On Fri, Dec 11, 2020 at 09:21:20AM -0800, Linus Torvalds wrote:
> > On Fri, Dec 11, 2020 at 7:57 AM Jens Axboe <axboe@kernel.dk> wrote:
> > >
> > > On 12/10/20 7:35 PM, Al Viro wrote:
> > > > _IF_ for some theoretical exercise you want to do "lookup without dropping
> > > > out of RCU", just add a flag that has unlazy_walk() fail.  With -ECHILD.
> > > > Strip it away in complete_walk() and have path_init() with that flag
> > > > and without LOOKUP_RCU fail with -EAGAIN.  All there is to it.
> > >
> > > Thanks Al, that makes for an easier implementation. I like that suggestion,
> > > boils it down to just three hunks (see below).
> > 
> > Ooh. Yes, very nice.
> 
> Except for the wrong order in path_init() - the check should go _before_
>         if (!*s)
>                 flags &= ~LOOKUP_RCU;
> for obvious reasons.
> 
> Again, that part is trivial - what to do with do_open()/open_last_lookups()
> is where the nastiness will be.  Basically, it makes sure we bail out in
> cases when lookup itself would've blocked, but it does *not* bail out
> when equally heavy (and unlikely) blocking sources hit past the complete_walk().
> Which makes it rather useless for the caller, unless we get logics added to
> that part as well.  And _that_ I want to see before we commit to anything.

BTW, to reiterate - "any open that isn't non-blocking" is misleading; it's
*NOT* just a matter of passing O_NDELAY in flags.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-11 17:29         ` Al Viro
  2020-12-11 17:38           ` Al Viro
@ 2020-12-11 17:44           ` Linus Torvalds
  2020-12-11 21:46           ` Jens Axboe
  2 siblings, 0 replies; 28+ messages in thread
From: Linus Torvalds @ 2020-12-11 17:44 UTC (permalink / raw)
  To: Al Viro; +Cc: Jens Axboe, linux-fsdevel

On Fri, Dec 11, 2020 at 9:29 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> Again, that part is trivial - what to do with do_open()/open_last_lookups()
> is where the nastiness will be.  Basically, it makes sure we bail out in
> cases when lookup itself would've blocked, but it does *not* bail out
> when equally heavy (and unlikely) blocking sources hit past the complete_walk().

See my other email - while you are obviously correct from a "must
never sleep" standpoint, and from a general standpoint, from a
practical standpoint I suspect it really doesn't matter.

Exactly because it's not primarily a correctness issue, but a
performance issue - and because people wouldn't use this for things
like "open a named pipe that then blocks on the open side" use.

I do agree that maybe that LOOKUP_NONBLOCK might then want to be also
coupled with extra logic to also just fail if it turns out it's a
special device file or whatever - I just think that ends up being a
separate issue.

For example, in user space, we already have alternate methods for that
(ie O_NONBLOCK for fifo etc), and maybe io_uring wants that too.

               Linus

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK
  2020-12-11  3:45           ` Dave Chinner
@ 2020-12-11 18:07             ` Linus Torvalds
  0 siblings, 0 replies; 28+ messages in thread
From: Linus Torvalds @ 2020-12-11 18:07 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Jens Axboe, linux-fsdevel, Al Viro

On Thu, Dec 10, 2020 at 7:45 PM Dave Chinner <david@fromorbit.com> wrote:
>
> Part of the problem we have with the non-blocking behaviour is that
> the user interfaces have been under specified, poorly reviewed and
> targetted a single specific use case on a single filesystem rather
> than generic behaviour. And mostly they lack the necessary test
> coverage to ensure all filesystems behave the same way and to inform
> us of a regression that *would break userspace applications*.

Fair enough. I just didn't really see much a concern here, exactly
because this ends up not being a hard guarantee in the first place
(but the reason I suggested then adding that RESOLVE_NONBLOCK is so
that it could be tested without having to rely on timing and io_uring
that _should_ get the same result regardless in the end).

But the second reason I don't see much concern is exactly because it
wouldn't affect individual filesystems. There's nothing new going on
as far as  filesystem is concerned.

> Yes, I recognise and accept that some of the problems are partially
> my fault. I also have a habit of trying to learn from the mistakes
> I've made and then take steps to ensure that *we do not make those
> same mistakes again*.

So the third reason I reacted was because we have a history, and you
have traditionally not ever really cared unless it's about xfs and IO.
Which this thing would very explicitly not be about. The low-level
filesystem would never see the semantics at all, and could never get
it wrong.

Well, a filesystem could "get it wrong" in the same sense that it can
get the current LOOKUP_RCU wrong, of course.

But that would be either an outright bug and a correctness problem -
sleeping in RCU context - or be a performance problem - returning
ECHILD very easily due to other reasons. And it would be entirely
unrelated to the nonblocking path open, because it would be a
performance issue even _normally_, just not visible as semantics.

And that's the second reason I like this, and would like to see this,
and see RESOLVE_NONBLOCK: exactly because we have _had_ those subtle
issues that aren't actually correctness issues, but only "oh, this
situation always takes us out of RCU lookup, and it's a very subtle
performance problem".

For example, it used to be that whenever we saw a symlink, we'd throw
up our hands and say "we can't do this in RCU lookup" and give up.
That wasn't a low-level filesystem problem, it was literally at the
VFS layer, because the RCU lookup was fairly limited.

Or some of the security models (well, _all_ of them) used to just say
"oh, I can't do this check in RCU mode" and forced the slow path.

It was very noticeable from a performance angle under certain loads,
because RCU lookup is just _so_ much faster when you otherwise get
locking and reference counting cache conflicts. Yes, yes, Al fixed the
symlinks long ago, but we know RCU lookup still gives up fairly easily
in other circumstances.

And RCU lookup giving up eagerly is fine, of course. The whole point
is that it's an optimistic "let's see if we can do this really
quickly" interface that just works _most_ of the time.

But that also makes it easy to miss things, because it's so hard to
test when it's purely about latency and all the operations will retry
and get the right result in the end. The "noticeable performance
issues" are generally not very noticeable at the level of an
individual operation - you need to do a lot of them, and often in
parallel, to see the _big_ benefits.

So RESOLVE_NONBLOCK would be a nice way to perhaps add automated
testing for "did we screw up RCU pathname lookup", in addition to
perhaps making it easier to find the cases where we currently give up
too quickly just because it was _fairly_ rare and nobody had much
visibility into that case.

And we have had that "oh, RCU lookup broke" a couple of times by
mistake - and then it takes a while to notice, because it's purely
visible as a performance bug and not necessarily all _that_ obvious -
exactly because it's purely about performance, and the semantic end
result is the same.

           Linus

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-11 17:20           ` Al Viro
  2020-12-11 17:35             ` Linus Torvalds
@ 2020-12-11 18:50             ` Jens Axboe
  2020-12-11 21:51               ` Al Viro
  1 sibling, 1 reply; 28+ messages in thread
From: Jens Axboe @ 2020-12-11 18:50 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On 12/11/20 10:20 AM, Al Viro wrote:
> On Fri, Dec 11, 2020 at 09:05:26AM -0700, Jens Axboe wrote:
> 
>>> Finally, I really wonder what is that for; if you are in conditions when
>>> you really don't want to risk going to sleep, you do *NOT* want to
>>> do mnt_want_write().  Or ->open().  Or truncate().  Or, for Cthulhu
>>> sake, IMA hash calculation.
>>
>> I just want to do the RCU side lookup, that is all. That's my fast path.
>> If that doesn't work, then we'll go through the motions of pushing this
>> to a context that allows blocking open.
> 
> Explain, please.  What's the difference between blocking in a lookup and
> blocking in truncate?  Either your call site is fine with a potentially
> long sleep, or it is not; I don't understand what makes one source of
> that behaviour different from another.

I could filter on O_TRUNC (and O_CREAT) in the caller from the io_uring
side, and in fact we may want to do that in general for RESOLVE_LOOKUP
as well. Or handle it in do_open(), which probably makes a lot more
sense. In reality, io_uring would check this upfront and just not bother
with an inline attempt if O_TRUNC is set, as we know we'll wind up with
-EAGAIN at the end of it.  I don't think the combined semantics make any
sense, as you very well may block if you're doing truncate on the file
as part of open. So that should get added to the patch adding
RESOLVE_LOOKUP.

> "Fast path" in context like "we can't sleep here, but often enough we
> won't need to; here's a function that will bail out rather than blocking,
> let's call that and go through offload to helper thread in rare case
> when it does bail out" does make sense; what you are proposing to do
> here is rather different and AFAICS saying "that's my fast path" is
> meaningless here.

What you're describing is exactly what it is - and in my terminology,
O_TRUNC is not part of my fast path. It may be for the application, but
I cannot support it as we don't know if it'll block. We just have to
assume that it might, and that means it'll be handled from a different
context.

> I really do not understand what it is that you are trying to achieve;
> fastpath lookup part would be usable on its own, but mixed with
> the rest of do_open() (as well as the open_last_lookups(), BTW)
> it does not give the caller any useful warranties.

open_last_lookups() will end up bailing us out early, as we end the RCU
lookup side of things and hence would terminate a LOOKUP_NONBLOCK with
-EAGAIN at that point anyway.

> Theoretically it could be amended into something usable, but you
> would need to make do_open(), open_last_lookups() (as well as
> do_tmpfile()) honour your flag, with similar warranties provided
> to caller.
> 
> AFAICS, without that part it is pretty much worthless.  And details
> of what you are going to do in the missing bits *do* matter - unlike the
> pathwalk side (which is trivial) it has potential for being very
> messy.  I want to see _that_ before we commit to going there, and
> a user-visible flag to openat2() makes a very strong commitment.

Fair enough. In terms of patch flow, do you want that as an addon before
we do RESOLVE_NONBLOCK, or do you want it as part of the core
LOOKUP_NONBLOCK patch?

> PS: just to make sure we are on the same page - O_NDELAY will *NOT*
> suffice here.  I apologize if that's obvious to you, but I think
> it's worth spelling out explicitly.
> 
> PPS: regarding unlazy_walk() change...  If we go that way, I would
> probably changed the name to "try_to_unlazy" and inverted the meaning
> of return value.  0 for success, -E... for failure is fine, but
> false for success, true for failure is asking for recurring confusion.
> IOW, I would rather do something like (completely untested)

Agree, if we're going bool, we should make it the more usually followed
success-on-false instead. And I'm happy to see you drop those
likely/unlikely as well, not a huge fan. I'll fold this into what I had
for that and include your naming change.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-11 17:33           ` Matthew Wilcox
@ 2020-12-11 18:55             ` Jens Axboe
  0 siblings, 0 replies; 28+ messages in thread
From: Jens Axboe @ 2020-12-11 18:55 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: Al Viro, Linus Torvalds, linux-fsdevel

On 12/11/20 10:33 AM, Matthew Wilcox wrote:
> On Fri, Dec 11, 2020 at 09:05:26AM -0700, Jens Axboe wrote:
>> On 12/10/20 7:45 PM, Al Viro wrote:
>>> So how hard are your "we don't want to block here" requirements?  Because
>>> the stuff you do after complete_walk() can easily be much longer than
>>> everything else.
>>
>> Ideally it'd extend a bit beyond the RCU lookup, as things like proc
>> resolution will still fail with the proposed patch. But that's not a
>> huge deal to me, I consider the dentry lookup to be Good Enough.
> 
> FWIW, /proc/$pid always falls back to REF walks.  Here's a patch from
> one of my colleagues that aims to fix that.
> 
> https://lore.kernel.org/linux-fsdevel/20201204000212.773032-1-stephen.s.brennan@oracle.com/
> 
> Maybe you had one of the other parts of /proc in mind?

Yes, it is/was /proc/self/ specifically, which the patch won't really
help. But it's not like it's a huge issue, and I'm quite happy (for
now) to just have that -EOPNOTSUPP on open as it does.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-11 17:29         ` Al Viro
  2020-12-11 17:38           ` Al Viro
  2020-12-11 17:44           ` Linus Torvalds
@ 2020-12-11 21:46           ` Jens Axboe
  2 siblings, 0 replies; 28+ messages in thread
From: Jens Axboe @ 2020-12-11 21:46 UTC (permalink / raw)
  To: Al Viro, Linus Torvalds; +Cc: linux-fsdevel

On 12/11/20 10:29 AM, Al Viro wrote:
> On Fri, Dec 11, 2020 at 09:21:20AM -0800, Linus Torvalds wrote:
>> On Fri, Dec 11, 2020 at 7:57 AM Jens Axboe <axboe@kernel.dk> wrote:
>>>
>>> On 12/10/20 7:35 PM, Al Viro wrote:
>>>> _IF_ for some theoretical exercise you want to do "lookup without dropping
>>>> out of RCU", just add a flag that has unlazy_walk() fail.  With -ECHILD.
>>>> Strip it away in complete_walk() and have path_init() with that flag
>>>> and without LOOKUP_RCU fail with -EAGAIN.  All there is to it.
>>>
>>> Thanks Al, that makes for an easier implementation. I like that suggestion,
>>> boils it down to just three hunks (see below).
>>
>> Ooh. Yes, very nice.
> 
> Except for the wrong order in path_init() - the check should go _before_
>         if (!*s)
>                 flags &= ~LOOKUP_RCU;
> for obvious reasons.

Oops yes, I fixed that up.

> Again, that part is trivial - what to do with
> do_open()/open_last_lookups() is where the nastiness will be.
> Basically, it makes sure we bail out in cases when lookup itself
> would've blocked, but it does *not* bail out when equally heavy (and
> unlikely) blocking sources hit past the complete_walk(). Which makes
> it rather useless for the caller, unless we get logics added to that
> part as well.  And _that_ I want to see before we commit to anything.

A few items can be handled by just disallowing them upfront - like
O_TRUNC with RESOLVE_NONBLOCK. I'd prefer to do that instead of
sprinkling trylock variants in places where they kind of end up being
nonsensical anyway (eg O_TRUNC with RESOLVE_NONBLOCK just doesn't make
sense). Outside of that, doesn't look like to me that do_open() needs
any further massaging. open_last_lookups() needs to deal with
non-blocking mnt_want_write(), but that looks pretty trivial, and
trylock on the inode.

So all seems pretty doable. Which makes me think I must be missing
something?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-11 18:50             ` Jens Axboe
@ 2020-12-11 21:51               ` Al Viro
  2020-12-11 23:47                 ` Jens Axboe
  0 siblings, 1 reply; 28+ messages in thread
From: Al Viro @ 2020-12-11 21:51 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Linus Torvalds, linux-fsdevel

On Fri, Dec 11, 2020 at 11:50:12AM -0700, Jens Axboe wrote:

> I could filter on O_TRUNC (and O_CREAT) in the caller from the io_uring
> side, and in fact we may want to do that in general for RESOLVE_LOOKUP
> as well.

You do realize that it covers O_RDWR as well, right?  If the object is on
a frozen filesystem, mnt_want_write() will block until the thing gets thawed.

> > AFAICS, without that part it is pretty much worthless.  And details
> > of what you are going to do in the missing bits *do* matter - unlike the
> > pathwalk side (which is trivial) it has potential for being very
> > messy.  I want to see _that_ before we commit to going there, and
> > a user-visible flag to openat2() makes a very strong commitment.
> 
> Fair enough. In terms of patch flow, do you want that as an addon before
> we do RESOLVE_NONBLOCK, or do you want it as part of the core
> LOOKUP_NONBLOCK patch?

I want to understand how it will be done.

> Agree, if we're going bool, we should make it the more usually followed
> success-on-false instead. And I'm happy to see you drop those
> likely/unlikely as well, not a huge fan. I'll fold this into what I had
> for that and include your naming change.

BTW, I wonder if the compiler is able to figure out that

bool f(void)
{
	if (unlikely(foo))
		return false;
	if (unlikely(bar))
		return false;
	return true;
}

is unlikely to return false.  We can force that, obviously (provide an inlined
wrapper and slap likely() there), but...

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK
  2020-12-11 21:51               ` Al Viro
@ 2020-12-11 23:47                 ` Jens Axboe
  0 siblings, 0 replies; 28+ messages in thread
From: Jens Axboe @ 2020-12-11 23:47 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-fsdevel

On 12/11/20 2:51 PM, Al Viro wrote:
> On Fri, Dec 11, 2020 at 11:50:12AM -0700, Jens Axboe wrote:
> 
>> I could filter on O_TRUNC (and O_CREAT) in the caller from the io_uring
>> side, and in fact we may want to do that in general for RESOLVE_LOOKUP
>> as well.
> 
> You do realize that it covers O_RDWR as well, right?  If the object is on
> a frozen filesystem, mnt_want_write() will block until the thing gets thawed.

I do, current patch does have that handled. I was only referring to the
fact that I don't consider O_TRUNC interesting enough to fold in non-block
support for it, I'm quite happy just letting that be as it is and just
disallow it in the flags directly.

>>> AFAICS, without that part it is pretty much worthless.  And details
>>> of what you are going to do in the missing bits *do* matter - unlike the
>>> pathwalk side (which is trivial) it has potential for being very
>>> messy.  I want to see _that_ before we commit to going there, and
>>> a user-visible flag to openat2() makes a very strong commitment.
>>
>> Fair enough. In terms of patch flow, do you want that as an addon before
>> we do RESOLVE_NONBLOCK, or do you want it as part of the core
>> LOOKUP_NONBLOCK patch?
> 
> I want to understand how it will be done.

Of course. I'll post what I have later, easier to discuss an actual
series of patches.

>> Agree, if we're going bool, we should make it the more usually followed
>> success-on-false instead. And I'm happy to see you drop those
>> likely/unlikely as well, not a huge fan. I'll fold this into what I had
>> for that and include your naming change.
> 
> BTW, I wonder if the compiler is able to figure out that
> 
> bool f(void)
> {
> 	if (unlikely(foo))
> 		return false;
> 	if (unlikely(bar))
> 		return false;
> 	return true;
> }
> 
> is unlikely to return false.  We can force that, obviously (provide an inlined
> wrapper and slap likely() there), but...

Not sure, it _should_, but reality may differ with that guess.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2020-12-11 23:59 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-10 20:01 [PATCHSET 0/2] fs: Support for LOOKUP_NONBLOCK / RESOLVE_NONBLOCK Jens Axboe
2020-12-10 20:01 ` [PATCH 1/2] fs: add support for LOOKUP_NONBLOCK Jens Axboe
2020-12-10 20:53   ` Linus Torvalds
2020-12-10 21:06     ` Jens Axboe
2020-12-11  2:45       ` Al Viro
2020-12-11 16:05         ` Jens Axboe
2020-12-11 17:20           ` Al Viro
2020-12-11 17:35             ` Linus Torvalds
2020-12-11 18:50             ` Jens Axboe
2020-12-11 21:51               ` Al Viro
2020-12-11 23:47                 ` Jens Axboe
2020-12-11 17:33           ` Matthew Wilcox
2020-12-11 18:55             ` Jens Axboe
2020-12-11  2:35   ` Al Viro
2020-12-11 15:57     ` Jens Axboe
2020-12-11 17:21       ` Linus Torvalds
2020-12-11 17:29         ` Al Viro
2020-12-11 17:38           ` Al Viro
2020-12-11 17:44           ` Linus Torvalds
2020-12-11 21:46           ` Jens Axboe
2020-12-10 20:01 ` [PATCH 2/2] fs: expose LOOKUP_NONBLOCK through openat2() RESOLVE_NONBLOCK Jens Axboe
2020-12-10 22:29   ` Dave Chinner
2020-12-10 23:12     ` Jens Axboe
2020-12-10 23:29     ` Linus Torvalds
2020-12-11  0:58       ` Dave Chinner
2020-12-11  1:01         ` Linus Torvalds
2020-12-11  3:45           ` Dave Chinner
2020-12-11 18:07             ` Linus Torvalds

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.