linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* new ...at() flag: AT_NO_JUMPS
@ 2017-04-29 22:04 Al Viro
  2017-04-29 23:17 ` Andy Lutomirski
  2017-05-01 17:36 ` Jann Horn
  0 siblings, 2 replies; 26+ messages in thread
From: Al Viro @ 2017-04-29 22:04 UTC (permalink / raw)
  To: Linux API; +Cc: linux-kernel, linux-fsdevel

New AT_... flag - AT_NO_JUMPS

Semantics: pathname resolution must not involve
	* traversals of absolute symlinks
	* traversals of procfs-style symlinks
	* traversals of mountpoints (including bindings, referrals, etc.)
	* traversal of .. in the starting point of pathname resolution.

All of those lead to failure with -ELOOP.  Relative symlinks are fine,
as long as their resolution does not end up stepping into the conditions
above.

It guarantees that result of successful pathname resolution will be on the
same filesystem as its starting point and within the subtree rooted at
the starting point.

Right now I have it hooked only for fstatat() and friends; it could be
easily extended to any ...at() syscalls.  Objections?

commit 2765f14b0cbb4240a6a3dda353d7014b6de19db9
Author: Al Viro <viro@zeniv.linux.org.uk>
Date:   Sat Mar 18 16:27:55 2017 -0400

    namei: new flag (LOOKUP_NO_JUMPS)
    
    semantics: fail with -ELOOP upon
            * attempt to cross mountpoint (including bindings)
            * attempt to traverse a non-relative symlink
            * attempt to cross the starting point by ".." traversal
    
    Matching AT_... flag: AT_NO_JUMPS introduced, fstatat(2) (and
    corresponding statx/stat64 variants) taught about it.
    
    Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index d41fab78798b..de1f07ec8ccd 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -874,6 +874,8 @@ static int nd_jump_root(struct nameidata *nd)
 		path_get(&nd->path);
 		nd->inode = nd->path.dentry->d_inode;
 	}
+	if (unlikely(nd->flags & LOOKUP_NO_JUMPS))
+		return -ELOOP;
 	nd->flags |= LOOKUP_JUMPED;
 	return 0;
 }
@@ -1054,14 +1056,18 @@ const char *get_link(struct nameidata *nd)
 		} else {
 			res = get(dentry, inode, &last->done);
 		}
+		if (unlikely(nd->flags & LOOKUP_NO_JUMPS) &&
+		    unlikely(nd->flags & LOOKUP_JUMPED))
+			return ERR_PTR(-ELOOP);
 		if (IS_ERR_OR_NULL(res))
 			return res;
 	}
 	if (*res == '/') {
 		if (!nd->root.mnt)
 			set_root(nd);
-		if (unlikely(nd_jump_root(nd)))
-			return ERR_PTR(-ECHILD);
+		error = nd_jump_root(nd);
+		if (unlikely(error))
+			return ERR_PTR(error);
 		while (unlikely(*++res == '/'))
 			;
 	}
@@ -1245,12 +1251,16 @@ static int follow_managed(struct path *path, struct nameidata *nd)
 		break;
 	}
 
-	if (need_mntput && path->mnt == mnt)
-		mntput(path->mnt);
+	if (need_mntput) {
+		if (path->mnt == mnt)
+			mntput(path->mnt);
+		if (unlikely(nd->flags & LOOKUP_NO_JUMPS))
+			ret = -ELOOP;
+		else
+			nd->flags |= LOOKUP_JUMPED;
+	}
 	if (ret == -EISDIR || !ret)
 		ret = 1;
-	if (need_mntput)
-		nd->flags |= LOOKUP_JUMPED;
 	if (unlikely(ret < 0))
 		path_put_conditional(path, nd);
 	return ret;
@@ -1307,6 +1317,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 		mounted = __lookup_mnt(path->mnt, path->dentry);
 		if (!mounted)
 			break;
+		if (unlikely(nd->flags & LOOKUP_NO_JUMPS))
+			return false;
 		path->mnt = &mounted->mnt;
 		path->dentry = mounted->mnt.mnt_root;
 		nd->flags |= LOOKUP_JUMPED;
@@ -1327,8 +1339,11 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 	struct inode *inode = nd->inode;
 
 	while (1) {
-		if (path_equal(&nd->path, &nd->root))
+		if (unlikely(path_equal(&nd->path, &nd->root))) {
+			if (nd->flags & LOOKUP_NO_JUMPS)
+				return -ELOOP;
 			break;
+		}
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			struct dentry *old = nd->path.dentry;
 			struct dentry *parent = old->d_parent;
@@ -1455,8 +1470,9 @@ static int path_parent_directory(struct path *path)
 static int follow_dotdot(struct nameidata *nd)
 {
 	while(1) {
-		if (nd->path.dentry == nd->root.dentry &&
-		    nd->path.mnt == nd->root.mnt) {
+		if (unlikely(path_equal(&nd->path, &nd->root))) {
+			if (nd->flags & LOOKUP_NO_JUMPS)
+				return -ELOOP;
 			break;
 		}
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
@@ -2177,14 +2193,16 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 
 	nd->m_seq = read_seqbegin(&mount_lock);
 	if (*s == '/') {
+		int error;
 		if (flags & LOOKUP_RCU)
 			rcu_read_lock();
 		set_root(nd);
-		if (likely(!nd_jump_root(nd)))
-			return s;
-		nd->root.mnt = NULL;
-		rcu_read_unlock();
-		return ERR_PTR(-ECHILD);
+		error = nd_jump_root(nd);
+		if (unlikely(error)) {
+			terminate_walk(nd);
+			s = ERR_PTR(error);
+		}
+		return s;
 	} else if (nd->dfd == AT_FDCWD) {
 		if (flags & LOOKUP_RCU) {
 			struct fs_struct *fs = current->fs;
@@ -2202,6 +2220,11 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 			get_fs_pwd(current->fs, &nd->path);
 			nd->inode = nd->path.dentry->d_inode;
 		}
+		if (unlikely(flags & LOOKUP_NO_JUMPS)) {
+			nd->root = nd->path;
+			if (!(flags & LOOKUP_RCU))
+				path_get(&nd->root);
+		}
 		return s;
 	} else {
 		/* Caller must check execute permissions on the starting path component */
@@ -2229,6 +2252,11 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 			path_get(&nd->path);
 			nd->inode = nd->path.dentry->d_inode;
 		}
+		if (unlikely(flags & LOOKUP_NO_JUMPS)) {
+			nd->root = nd->path;
+			if (!(flags & LOOKUP_RCU))
+				path_get(&nd->root);
+		}
 		fdput(f);
 		return s;
 	}
diff --git a/fs/stat.c b/fs/stat.c
index fa0be59340cc..1999ce5f77c9 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -168,7 +168,7 @@ int vfs_statx(int dfd, const char __user *filename, int flags,
 	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
 
 	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
-		       AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0)
+		       AT_EMPTY_PATH | KSTAT_QUERY_FLAGS | AT_NO_JUMPS)) != 0)
 		return -EINVAL;
 
 	if (flags & AT_SYMLINK_NOFOLLOW)
@@ -177,6 +177,8 @@ int vfs_statx(int dfd, const char __user *filename, int flags,
 		lookup_flags &= ~LOOKUP_AUTOMOUNT;
 	if (flags & AT_EMPTY_PATH)
 		lookup_flags |= LOOKUP_EMPTY;
+	if (flags & AT_NO_JUMPS)
+		lookup_flags |= LOOKUP_NO_JUMPS;
 
 retry:
 	error = user_path_at(dfd, filename, lookup_flags, &path);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index f29abda31e6d..3cefb90f38ca 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -45,6 +45,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_ROOT		0x2000
 #define LOOKUP_EMPTY		0x4000
 
+#define LOOKUP_NO_JUMPS		0x10000
+
 extern int path_pts(struct path *path);
 
 extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty);
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 813afd6eee71..ca35ef523e40 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -68,5 +68,6 @@
 #define AT_STATX_FORCE_SYNC	0x2000	/* - Force the attributes to be sync'd with the server */
 #define AT_STATX_DONT_SYNC	0x4000	/* - Don't sync attributes with the server */
 
+#define AT_NO_JUMPS		0x8000	/* No mountpoint crossing, no abs symlinks */
 
 #endif /* _UAPI_LINUX_FCNTL_H */

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-04-29 22:04 new ...at() flag: AT_NO_JUMPS Al Viro
@ 2017-04-29 23:17 ` Andy Lutomirski
  2017-04-29 23:25   ` Al Viro
  2017-05-01 17:36 ` Jann Horn
  1 sibling, 1 reply; 26+ messages in thread
From: Andy Lutomirski @ 2017-04-29 23:17 UTC (permalink / raw)
  To: Al Viro; +Cc: Linux API, linux-kernel, Linux FS Devel

On Sat, Apr 29, 2017 at 3:04 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> New AT_... flag - AT_NO_JUMPS
>
> Semantics: pathname resolution must not involve
>         * traversals of absolute symlinks
>         * traversals of procfs-style symlinks
>         * traversals of mountpoints (including bindings, referrals, etc.)
>         * traversal of .. in the starting point of pathname resolution.

Can you clarify this last one?  I assume that ".." will be rejected,
but what about "a/../.."?  How about "b" if b is a symlink to ".."?
How about "a/b" if a is a directory and b is a symlink to "../.."?

> Right now I have it hooked only for fstatat() and friends; it could be
> easily extended to any ...at() syscalls.  Objections?

I like it, assuming the answers to all the questions above are that
they will be rejected.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-04-29 23:17 ` Andy Lutomirski
@ 2017-04-29 23:25   ` Al Viro
  2017-04-30  1:13     ` Andy Lutomirski
  2017-04-30  4:38     ` Matthew Wilcox
  0 siblings, 2 replies; 26+ messages in thread
From: Al Viro @ 2017-04-29 23:25 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: Linux API, linux-kernel, Linux FS Devel

On Sat, Apr 29, 2017 at 04:17:18PM -0700, Andy Lutomirski wrote:
> On Sat, Apr 29, 2017 at 3:04 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > New AT_... flag - AT_NO_JUMPS
> >
> > Semantics: pathname resolution must not involve
> >         * traversals of absolute symlinks
> >         * traversals of procfs-style symlinks
> >         * traversals of mountpoints (including bindings, referrals, etc.)
> >         * traversal of .. in the starting point of pathname resolution.
> 
> Can you clarify this last one?  I assume that ".." will be rejected,
> but what about "a/../.."?  How about "b" if b is a symlink to ".."?
> How about "a/b" if a is a directory and b is a symlink to "../.."?

All of those will be rejected - in each of those cases pathname traversal
leads back into the starting point with .. being the next component to
handle.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-04-29 23:25   ` Al Viro
@ 2017-04-30  1:13     ` Andy Lutomirski
  2017-04-30  4:38     ` Matthew Wilcox
  1 sibling, 0 replies; 26+ messages in thread
From: Andy Lutomirski @ 2017-04-30  1:13 UTC (permalink / raw)
  To: Al Viro; +Cc: Andy Lutomirski, Linux API, linux-kernel, Linux FS Devel

On Sat, Apr 29, 2017 at 4:25 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Sat, Apr 29, 2017 at 04:17:18PM -0700, Andy Lutomirski wrote:
>> On Sat, Apr 29, 2017 at 3:04 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>> > New AT_... flag - AT_NO_JUMPS
>> >
>> > Semantics: pathname resolution must not involve
>> >         * traversals of absolute symlinks
>> >         * traversals of procfs-style symlinks
>> >         * traversals of mountpoints (including bindings, referrals, etc.)
>> >         * traversal of .. in the starting point of pathname resolution.
>>
>> Can you clarify this last one?  I assume that ".." will be rejected,
>> but what about "a/../.."?  How about "b" if b is a symlink to ".."?
>> How about "a/b" if a is a directory and b is a symlink to "../.."?
>
> All of those will be rejected - in each of those cases pathname traversal
> leads back into the starting point with .. being the next component to
> handle.

Sounds good.

Might it make sense to split it into two flags, one to prevent moving
between mounts and one for everything else?  I can imagine webservers
and such that are fine with traversing mount points but don't want to
escape their home directory.

--Andy

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-04-29 23:25   ` Al Viro
  2017-04-30  1:13     ` Andy Lutomirski
@ 2017-04-30  4:38     ` Matthew Wilcox
  2017-04-30 16:10       ` Al Viro
  1 sibling, 1 reply; 26+ messages in thread
From: Matthew Wilcox @ 2017-04-30  4:38 UTC (permalink / raw)
  To: Al Viro; +Cc: Andy Lutomirski, Linux API, linux-kernel, Linux FS Devel

On Sun, Apr 30, 2017 at 12:25:04AM +0100, Al Viro wrote:
> On Sat, Apr 29, 2017 at 04:17:18PM -0700, Andy Lutomirski wrote:
> > On Sat, Apr 29, 2017 at 3:04 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > > New AT_... flag - AT_NO_JUMPS
> > >
> > > Semantics: pathname resolution must not involve
> > >         * traversals of absolute symlinks
> > >         * traversals of procfs-style symlinks
> > >         * traversals of mountpoints (including bindings, referrals, etc.)
> > >         * traversal of .. in the starting point of pathname resolution.
> > 
> > Can you clarify this last one?  I assume that ".." will be rejected,
> > but what about "a/../.."?  How about "b" if b is a symlink to ".."?
> > How about "a/b" if a is a directory and b is a symlink to "../.."?
> 
> All of those will be rejected - in each of those cases pathname traversal
> leads back into the starting point with .. being the next component to
> handle.

It sounds more like AT_NO_ESCAPE ... or AT_BELOW, or something.  Perhaps
some example usages in the changelog?

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-04-30  4:38     ` Matthew Wilcox
@ 2017-04-30 16:10       ` Al Viro
  2017-05-01  4:52         ` Andy Lutomirski
  0 siblings, 1 reply; 26+ messages in thread
From: Al Viro @ 2017-04-30 16:10 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: Andy Lutomirski, Linux API, linux-kernel, Linux FS Devel

On Sat, Apr 29, 2017 at 09:38:22PM -0700, Matthew Wilcox wrote:

> It sounds more like AT_NO_ESCAPE ... or AT_BELOW, or something.

I considered AT_ROACH_MOTEL at one point...  Another interesting
question is whether EXDEV would've been better than ELOOP.
Opinions?

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-04-30 16:10       ` Al Viro
@ 2017-05-01  4:52         ` Andy Lutomirski
  2017-05-01  5:15           ` Al Viro
  0 siblings, 1 reply; 26+ messages in thread
From: Andy Lutomirski @ 2017-05-01  4:52 UTC (permalink / raw)
  To: Al Viro
  Cc: Matthew Wilcox, Andy Lutomirski, Linux API, linux-kernel, Linux FS Devel

On Sun, Apr 30, 2017 at 9:10 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Sat, Apr 29, 2017 at 09:38:22PM -0700, Matthew Wilcox wrote:
>
>> It sounds more like AT_NO_ESCAPE ... or AT_BELOW, or something.
>
> I considered AT_ROACH_MOTEL at one point...  Another interesting
> question is whether EXDEV would've been better than ELOOP.
> Opinions?

In support of my homeland, I propose AT_HOTEL_CALIFORNIA.

How about EXDEV for crossing a mountpoint and ELOOP for absolute
symlinks or invalid ..?  (Is there a technical reason why the same AT_
flag should trigger both cases?)

--Andy

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-05-01  4:52         ` Andy Lutomirski
@ 2017-05-01  5:15           ` Al Viro
  0 siblings, 0 replies; 26+ messages in thread
From: Al Viro @ 2017-05-01  5:15 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: Matthew Wilcox, Linux API, linux-kernel, Linux FS Devel

On Sun, Apr 30, 2017 at 09:52:37PM -0700, Andy Lutomirski wrote:
> On Sun, Apr 30, 2017 at 9:10 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > On Sat, Apr 29, 2017 at 09:38:22PM -0700, Matthew Wilcox wrote:
> >
> >> It sounds more like AT_NO_ESCAPE ... or AT_BELOW, or something.
> >
> > I considered AT_ROACH_MOTEL at one point...  Another interesting
> > question is whether EXDEV would've been better than ELOOP.
> > Opinions?
> 
> In support of my homeland, I propose AT_HOTEL_CALIFORNIA.
> 
> How about EXDEV for crossing a mountpoint and ELOOP for absolute
> symlinks or invalid ..?  (Is there a technical reason why the same AT_
> flag should trigger both cases?)

You do realize that mount --bind can do everything absolute symlinks could,
right?  And absolute symlinks most likely do lead to (or at least through)
a different fs...

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-04-29 22:04 new ...at() flag: AT_NO_JUMPS Al Viro
  2017-04-29 23:17 ` Andy Lutomirski
@ 2017-05-01 17:36 ` Jann Horn
  2017-05-01 19:37   ` Andy Lutomirski
  2017-05-05  0:30   ` Al Viro
  1 sibling, 2 replies; 26+ messages in thread
From: Jann Horn @ 2017-05-01 17:36 UTC (permalink / raw)
  To: Al Viro; +Cc: Linux API, linux-kernel, linux-fsdevel

On Sun, Apr 30, 2017 at 12:04 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> New AT_... flag - AT_NO_JUMPS
>
> Semantics: pathname resolution must not involve
>         * traversals of absolute symlinks
>         * traversals of procfs-style symlinks
>         * traversals of mountpoints (including bindings, referrals, etc.)
>         * traversal of .. in the starting point of pathname resolution.
>
> All of those lead to failure with -ELOOP.  Relative symlinks are fine,
> as long as their resolution does not end up stepping into the conditions
> above.
>
> It guarantees that result of successful pathname resolution will be on the
> same filesystem as its starting point and within the subtree rooted at
> the starting point.
>
> Right now I have it hooked only for fstatat() and friends; it could be
> easily extended to any ...at() syscalls.  Objections?

Oh, nice!

It looks like this is somewhat similar to the old O_BENEATH proposal,
but because the intentions behind the proposals are different
(application sandboxing versus permitting an application to restrict its
own filesystem accesses), the semantics differ: AT_NO_JUMPS
doesn't prevent starting the path with "/", but does prevent mountpoint
traversal. Is that correct?

I think that, as Andy mentioned, it might make sense to split out (or
even remove?) the prevention of mountpoint traversal. A user who
can create visible mountpoints needs to have capabilities over the
mount namespace the file descriptor refers to already.

I suspect that if this lands, it would be pretty straightforward to add
another flag AT_NO_ABSOLUTE or so that, combined with
AT_NO_JUMPS, has the same semantics as O_BENEATH?

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-05-01 17:36 ` Jann Horn
@ 2017-05-01 19:37   ` Andy Lutomirski
  2017-05-05  0:30   ` Al Viro
  1 sibling, 0 replies; 26+ messages in thread
From: Andy Lutomirski @ 2017-05-01 19:37 UTC (permalink / raw)
  To: Jann Horn; +Cc: Al Viro, Linux API, linux-kernel, Linux FS Devel

On Mon, May 1, 2017 at 10:36 AM, Jann Horn <jannh@google.com> wrote:
> On Sun, Apr 30, 2017 at 12:04 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>> New AT_... flag - AT_NO_JUMPS
>>
>> Semantics: pathname resolution must not involve
>>         * traversals of absolute symlinks
>>         * traversals of procfs-style symlinks
>>         * traversals of mountpoints (including bindings, referrals, etc.)
>>         * traversal of .. in the starting point of pathname resolution.
>>
>> All of those lead to failure with -ELOOP.  Relative symlinks are fine,
>> as long as their resolution does not end up stepping into the conditions
>> above.
>>
>> It guarantees that result of successful pathname resolution will be on the
>> same filesystem as its starting point and within the subtree rooted at
>> the starting point.
>>
>> Right now I have it hooked only for fstatat() and friends; it could be
>> easily extended to any ...at() syscalls.  Objections?
>
> Oh, nice!
>
> It looks like this is somewhat similar to the old O_BENEATH proposal,
> but because the intentions behind the proposals are different
> (application sandboxing versus permitting an application to restrict its
> own filesystem accesses), the semantics differ: AT_NO_JUMPS
> doesn't prevent starting the path with "/", but does prevent mountpoint
> traversal. Is that correct?
>

I missed that.  I think that AT_HOTEL_CALIFORNIA or whatever we call
it should disallow even explicit absolute paths.  If I do:

openat([fd to /var/www], "possibly untrusted path here",
AT_HOTEL_CALIFORNIA, O_WHATEVER);

I should not have to separately verify that the path doesn't start
with "/" to make sure that I don't escape.  There's a big added
advantage of this approach, too: I could write a seccomp rule that
only lets me call openat() with this new flag set, and now I can't
escape.


> I think that, as Andy mentioned, it might make sense to split out (or
> even remove?) the prevention of mountpoint traversal. A user who
> can create visible mountpoints needs to have capabilities over the
> mount namespace the file descriptor refers to already.

Agreed.  There's a big difference between the admin bind-mounting /etc
into /var/www and some web app putting a symlink to /etc into
/var/www.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-05-01 17:36 ` Jann Horn
  2017-05-01 19:37   ` Andy Lutomirski
@ 2017-05-05  0:30   ` Al Viro
  2017-05-05  0:44     ` Andy Lutomirski
                       ` (3 more replies)
  1 sibling, 4 replies; 26+ messages in thread
From: Al Viro @ 2017-05-05  0:30 UTC (permalink / raw)
  To: Jann Horn; +Cc: Linux API, linux-kernel, linux-fsdevel, Linus Torvalds

On Mon, May 01, 2017 at 07:36:52PM +0200, Jann Horn wrote:

> Oh, nice!
> 
> It looks like this is somewhat similar to the old O_BENEATH proposal,
> but because the intentions behind the proposals are different
> (application sandboxing versus permitting an application to restrict its
> own filesystem accesses), the semantics differ: AT_NO_JUMPS
> doesn't prevent starting the path with "/", but does prevent mountpoint
> traversal. Is that correct?

It prevents both, actually - I missed that in description, but this
        if (unlikely(nd->flags & LOOKUP_NO_JUMPS))
                return -ELOOP;
in nd_jump_root() affects absolute pathnames same way as it affects
absolute symlinks.

It's not quite O_BENEATH, and IMO it's saner that way - a/b/c/../d is
bloody well allowed, and so are relative symlinks that do not lead out of
the subtree.  If somebody has a good argument in favour of flat-out
ban on .. (_other_ than "other guys do it that way, and it doesn't need
to make sense 'cuz security!!1!!!", please), I'd be glad to hear it.

As for mountpoint crossing...  it might make sense to split those.
O_BENEATH allowed it, and if we want AT_BENEATH to match that - let's
do it.  Then this one would become AT_BENEATH | AT_XDEV (the latter named
after find(1) option, obviously).

So how about this:

AT_BENEATH:
	* no absolute pathnames
	* no absolute symlinks
	* no procfs-style symlinks
	* no traversal of .. when we are at the same place where we'd started
(dir/../file is allowed, dir/../.. isn't)

AT_XDEV:
	* no mountpoint crossing allowed

For the latter I would prefer -EXDEV, for obvious reasons.  For the former...
not sure.  I'm not too happy about -ELOOP, but -EPERM (as with O_BENEATH)
is an atrocity - it's even more overloaded.

Suggestions?

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-05-05  0:30   ` Al Viro
@ 2017-05-05  0:44     ` Andy Lutomirski
  2017-05-05  1:06       ` Al Viro
  2017-05-05  1:27     ` Linus Torvalds
                       ` (2 subsequent siblings)
  3 siblings, 1 reply; 26+ messages in thread
From: Andy Lutomirski @ 2017-05-05  0:44 UTC (permalink / raw)
  To: Al Viro
  Cc: Jann Horn, Linux API, linux-kernel, Linux FS Devel, Linus Torvalds

On Thu, May 4, 2017 at 5:30 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Mon, May 01, 2017 at 07:36:52PM +0200, Jann Horn wrote:
>
>> Oh, nice!
>>
>> It looks like this is somewhat similar to the old O_BENEATH proposal,
>> but because the intentions behind the proposals are different
>> (application sandboxing versus permitting an application to restrict its
>> own filesystem accesses), the semantics differ: AT_NO_JUMPS
>> doesn't prevent starting the path with "/", but does prevent mountpoint
>> traversal. Is that correct?
>
> It prevents both, actually - I missed that in description, but this
>         if (unlikely(nd->flags & LOOKUP_NO_JUMPS))
>                 return -ELOOP;
> in nd_jump_root() affects absolute pathnames same way as it affects
> absolute symlinks.
>
> It's not quite O_BENEATH, and IMO it's saner that way - a/b/c/../d is
> bloody well allowed, and so are relative symlinks that do not lead out of
> the subtree.  If somebody has a good argument in favour of flat-out
> ban on .. (_other_ than "other guys do it that way, and it doesn't need
> to make sense 'cuz security!!1!!!", please), I'd be glad to hear it.

I don't have an argument for allowing '..'.  I think it would be okay
to disallow it, but I don't think it matters all that much either way.

>
> As for mountpoint crossing...  it might make sense to split those.
> O_BENEATH allowed it, and if we want AT_BENEATH to match that - let's
> do it.  Then this one would become AT_BENEATH | AT_XDEV (the latter named
> after find(1) option, obviously).
>
> So how about this:
>
> AT_BENEATH:
>         * no absolute pathnames
>         * no absolute symlinks
>         * no procfs-style symlinks
>         * no traversal of .. when we are at the same place where we'd started
> (dir/../file is allowed, dir/../.. isn't)
>
> AT_XDEV:
>         * no mountpoint crossing allowed
>
> For the latter I would prefer -EXDEV, for obvious reasons.  For the former...
> not sure.  I'm not too happy about -ELOOP, but -EPERM (as with O_BENEATH)
> is an atrocity - it's even more overloaded.
>
> Suggestions?

-EDOTDOT would be amusing.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-05-05  0:44     ` Andy Lutomirski
@ 2017-05-05  1:06       ` Al Viro
  0 siblings, 0 replies; 26+ messages in thread
From: Al Viro @ 2017-05-05  1:06 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Jann Horn, Linux API, linux-kernel, Linux FS Devel, Linus Torvalds

On Thu, May 04, 2017 at 05:44:19PM -0700, Andy Lutomirski wrote:
> > It's not quite O_BENEATH, and IMO it's saner that way - a/b/c/../d is
> > bloody well allowed, and so are relative symlinks that do not lead out of
> > the subtree.  If somebody has a good argument in favour of flat-out
> > ban on .. (_other_ than "other guys do it that way, and it doesn't need
> > to make sense 'cuz security!!1!!!", please), I'd be glad to hear it.
> 
> I don't have an argument for allowing '..'.  I think it would be okay
> to disallow it, but I don't think it matters all that much either way.

Relative symlinks as argument in favour of allowing .. _when_ _it_ _stays_
_in_ _subtree_.

> > For the latter I would prefer -EXDEV, for obvious reasons.  For the former...
> > not sure.  I'm not too happy about -ELOOP, but -EPERM (as with O_BENEATH)
> > is an atrocity - it's even more overloaded.
> >
> > Suggestions?
> 
> -EDOTDOT would be amusing.

For ln -s /tmp foo/bar, lookup for foo/bar/baz?  Seriously?  Hell, even
-EXDEV would make more sense...

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-05-05  0:30   ` Al Viro
  2017-05-05  0:44     ` Andy Lutomirski
@ 2017-05-05  1:27     ` Linus Torvalds
  2017-05-05  3:00       ` Al Viro
  2017-05-05  2:47     ` Jann Horn
  2017-05-18  8:50     ` David Drysdale
  3 siblings, 1 reply; 26+ messages in thread
From: Linus Torvalds @ 2017-05-05  1:27 UTC (permalink / raw)
  To: Al Viro; +Cc: Jann Horn, Linux API, Linux Kernel Mailing List, linux-fsdevel

On Thu, May 4, 2017 at 5:30 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> As for mountpoint crossing...  it might make sense to split those.
> O_BENEATH allowed it, and if we want AT_BENEATH to match that - let's
> do it.  Then this one would become AT_BENEATH | AT_XDEV (the latter named
> after find(1) option, obviously).

So I would still like to split that NO_JUMP flag even more.

I like the AT_BENEATH | AT_XDEV split, but I think XDEV should be
split further, and I think the symlink avoidance should be split more
too.

As mentioned last time, at least for the git usage, even relative
symlinks are a no-no - not because they'd escape, but simply because
git wants to see the *unique* name, and resolve relative symlinks to
either the symlink, or to the actual file it points to.

So I think that we'd want an additional flag that says "no symlinks at all".

And I think the "no mountpoint" traversal might be splittable too.

Yes, sometimes you'd probably want to say "stay exactly inside this
filesystem" (like find -xdev). So no arguments against AT_XDEV that
refuses any mount traversal (kind of like my "no symlink traversal"
thing).

But at other points you might want to just guarantee that the walk
stays below a certain starting point and doesn't escape.

That could still allow crossing mount-points, but only if they are
non-bind mounts and cannot let us escape.

I'm not sure if that's testable, though.

                  Linus

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-05-05  0:30   ` Al Viro
  2017-05-05  0:44     ` Andy Lutomirski
  2017-05-05  1:27     ` Linus Torvalds
@ 2017-05-05  2:47     ` Jann Horn
  2017-05-05  3:46       ` Linus Torvalds
  2017-05-18  8:50     ` David Drysdale
  3 siblings, 1 reply; 26+ messages in thread
From: Jann Horn @ 2017-05-05  2:47 UTC (permalink / raw)
  To: Al Viro
  Cc: Linux API, linux-kernel, linux-fsdevel, Linus Torvalds, David Drysdale

+CC drysdale in case he has thoughts on this

On Fri, May 5, 2017 at 2:30 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Mon, May 01, 2017 at 07:36:52PM +0200, Jann Horn wrote:
>
>> Oh, nice!
>>
>> It looks like this is somewhat similar to the old O_BENEATH proposal,
>> but because the intentions behind the proposals are different
>> (application sandboxing versus permitting an application to restrict its
>> own filesystem accesses), the semantics differ: AT_NO_JUMPS
>> doesn't prevent starting the path with "/", but does prevent mountpoint
>> traversal. Is that correct?
>
> It prevents both, actually - I missed that in description, but this
>         if (unlikely(nd->flags & LOOKUP_NO_JUMPS))
>                 return -ELOOP;
> in nd_jump_root() affects absolute pathnames same way as it affects
> absolute symlinks.
>
> It's not quite O_BENEATH, and IMO it's saner that way - a/b/c/../d is
> bloody well allowed, and so are relative symlinks that do not lead out of
> the subtree.  If somebody has a good argument in favour of flat-out
> ban on .. (_other_ than "other guys do it that way, and it doesn't need
> to make sense 'cuz security!!1!!!", please), I'd be glad to hear it.

One annoying edgecase might be what happens when one thread
does an AT_BENEATH walk while another thread mutates the directory
structure. If some directory that is currently being traversed by the walk
is moved out of the directory at which the walk started, would that be
detected somehow, or could the walk then follow ".." path
components up to the root directory of the current process / of the
namespace the fd is referring to? As in:

Thread 1 starts an AT_BENEATH path walk using an O_PATH fd
pointing to /srv/www/example.org/foo; the path given to the syscall is
"bar/../../../../etc/passwd". The path walk enters the "bar" directory.
Thread 2 moves /srv/www/example.org/foo/bar to
/srv/www/example.org/bar.
Thread 1 processes the rest of the path ("../../../../etc/passwd"), never
hitting /srv/www/example.org/foo in the process.

I'm not really familiar with the VFS internals, but from a coarse look
at the patch, it seems like it wouldn't block this?

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-05-05  1:27     ` Linus Torvalds
@ 2017-05-05  3:00       ` Al Viro
  2017-05-05  4:01         ` Linus Torvalds
  0 siblings, 1 reply; 26+ messages in thread
From: Al Viro @ 2017-05-05  3:00 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jann Horn, Linux API, Linux Kernel Mailing List, linux-fsdevel

On Thu, May 04, 2017 at 06:27:10PM -0700, Linus Torvalds wrote:

> As mentioned last time, at least for the git usage, even relative
> symlinks are a no-no - not because they'd escape, but simply because
> git wants to see the *unique* name, and resolve relative symlinks to
> either the symlink, or to the actual file it points to.
> 
> So I think that we'd want an additional flag that says "no symlinks at all".

OK, that's easily done.

> And I think the "no mountpoint" traversal might be splittable too.
> 
> Yes, sometimes you'd probably want to say "stay exactly inside this
> filesystem" (like find -xdev). So no arguments against AT_XDEV that
> refuses any mount traversal (kind of like my "no symlink traversal"
> thing).
> 
> But at other points you might want to just guarantee that the walk
> stays below a certain starting point and doesn't escape.
> 
> That could still allow crossing mount-points, but only if they are
> non-bind mounts and cannot let us escape.
> 
> I'm not sure if that's testable, though.

This one isn't, unfortunately - there is no difference between bind and
no-bind; vfsmounts form a tree and both normal mount and bind add leaves
to it.  Moreover, mount -t ext2 /dev/sdc7 /mnt; mount -t ext2 /dev/sdc7 /tmp/a
yield the same state as mount -t ext2 /dev/sdc7; mount --bind /mnt /tmp/a.
There is no way to tell the difference, simply because there *is* no
difference.  Moreover, either can be followed by umount /mnt and you'll get
the same state as you would have after a solitary mount of the same fs on
/tmp/a.

Ho-hum...  So:

			AT_BENEATH	AT_XDEV		AT_NO_SYMLINKS
absolute pathname:	EXDEV
non-relative symlink:	EXDEV		?		ELOOP
relative symlink:					ELOOP
.. from starting point:	EXDEV
.. crossing mountpoint:			EXDEV
crossing into mountpoint:		EXDEV

1) What should AT_XDEV do about absolute symlinks?  Nothing special?  EXDEV?
EXDEV if we are not on root?
2) What should AT_BENEATH | AT_NO_SYMLINKS do on absolute symlinks?  My
preference would be "AT_NO_SYMLINKS wins, ELOOP for you", but that's based
mostly upon the convenience of implementation.
3) What effect should AT_NO_SYMLINKS have upon the final component?  Same
as AT_SYMLINK_NOFOLLOW?

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-05-05  2:47     ` Jann Horn
@ 2017-05-05  3:46       ` Linus Torvalds
  2017-05-05  4:39         ` Al Viro
  0 siblings, 1 reply; 26+ messages in thread
From: Linus Torvalds @ 2017-05-05  3:46 UTC (permalink / raw)
  To: Jann Horn
  Cc: Al Viro, Linux API, Linux Kernel Mailing List, linux-fsdevel,
	David Drysdale

On Thu, May 4, 2017 at 7:47 PM, Jann Horn <jannh@google.com> wrote:
>
> Thread 1 starts an AT_BENEATH path walk using an O_PATH fd
> pointing to /srv/www/example.org/foo; the path given to the syscall is
> "bar/../../../../etc/passwd". The path walk enters the "bar" directory.
> Thread 2 moves /srv/www/example.org/foo/bar to
> /srv/www/example.org/bar.
> Thread 1 processes the rest of the path ("../../../../etc/passwd"), never
> hitting /srv/www/example.org/foo in the process.
>
> I'm not really familiar with the VFS internals, but from a coarse look
> at the patch, it seems like it wouldn't block this?

I think you're right.

I guess it would be safe for the RCU case due to the sequence number
check, but not the non-RCU case.

Al?

                 Linus

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-05-05  3:00       ` Al Viro
@ 2017-05-05  4:01         ` Linus Torvalds
  2017-05-05  4:31           ` Andy Lutomirski
  0 siblings, 1 reply; 26+ messages in thread
From: Linus Torvalds @ 2017-05-05  4:01 UTC (permalink / raw)
  To: Al Viro; +Cc: Jann Horn, Linux API, Linux Kernel Mailing List, linux-fsdevel

On Thu, May 4, 2017 at 8:00 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>>
>> That could still allow crossing mount-points, but only if they are
>> non-bind mounts and cannot let us escape.
>>
>> I'm not sure if that's testable, though.
>
> This one isn't, unfortunately - there is no difference between bind and
> no-bind; vfsmounts form a tree and both normal mount and bind add leaves
> to it.  Moreover, mount -t ext2 /dev/sdc7 /mnt; mount -t ext2 /dev/sdc7 /tmp/a
> yield the same state as mount -t ext2 /dev/sdc7; mount --bind /mnt /tmp/a.
> There is no way to tell the difference, simply because there *is* no
> difference.  Moreover, either can be followed by umount /mnt and you'll get
> the same state as you would have after a solitary mount of the same fs on
> /tmp/a.

Fair enough.

> Ho-hum...  So:
>
>                         AT_BENEATH      AT_XDEV         AT_NO_SYMLINKS
> absolute pathname:      EXDEV
> non-relative symlink:   EXDEV           ?               ELOOP
> relative symlink:                                       ELOOP
> .. from starting point: EXDEV
> .. crossing mountpoint:                 EXDEV
> crossing into mountpoint:               EXDEV
>
> 1) What should AT_XDEV do about absolute symlinks?  Nothing special?  EXDEV?
> EXDEV if we are not on root?

My mental model would say that AT_XDEV without AT_BENEATH would
_logically_ result in "EXDEV if / is a different vfsmount", accept the
absolute path otherwise.

But honestly, just returning EXDEV unconditionally for an absolute
symlink might just be the simpler and more straightforward thing to
do.

Because testing the particular vfsmount of / simply doesn't seem to be
a very useful operation.  I dunno.

> 2) What should AT_BENEATH | AT_NO_SYMLINKS do on absolute symlinks?  My
> preference would be "AT_NO_SYMLINKS wins, ELOOP for you", but that's based
> mostly upon the convenience of implementation.

I think either is fine, and convenience wins.

> 3) What effect should AT_NO_SYMLINKS have upon the final component?  Same
> as AT_SYMLINK_NOFOLLOW?

I actually would suggest "error if it's followed".

So if you use AT_SYMLINK_NOFOLLOW | AT_NO_SYMLINKS, then you do *not*
get an error if the last component (but nothing before it) is a
symlink, and the end result is the symlink itself.

If you use just AT_NO_SYMLINKS, then the lack of NOFOLLOW implies that
you'd follow the symlink to look it up, and then AT_NO_SYMLINKS means
that you get an error (ELOOP).

So the user gets to choose, and gets to basically indicate whether
it's fine to end at a dangling symlink or not. Which is exactly what
AT_SYMLINK_NOFOLLOW is all about.

No?

                   Linus

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-05-05  4:01         ` Linus Torvalds
@ 2017-05-05  4:31           ` Andy Lutomirski
  0 siblings, 0 replies; 26+ messages in thread
From: Andy Lutomirski @ 2017-05-05  4:31 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Al Viro, Jann Horn, Linux API, Linux Kernel Mailing List, linux-fsdevel

On Thu, May 4, 2017 at 9:01 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Thu, May 4, 2017 at 8:00 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>>>
>>> That could still allow crossing mount-points, but only if they are
>>> non-bind mounts and cannot let us escape.
>>>
>>> I'm not sure if that's testable, though.
>>
>> This one isn't, unfortunately - there is no difference between bind and
>> no-bind; vfsmounts form a tree and both normal mount and bind add leaves
>> to it.  Moreover, mount -t ext2 /dev/sdc7 /mnt; mount -t ext2 /dev/sdc7 /tmp/a
>> yield the same state as mount -t ext2 /dev/sdc7; mount --bind /mnt /tmp/a.
>> There is no way to tell the difference, simply because there *is* no
>> difference.  Moreover, either can be followed by umount /mnt and you'll get
>> the same state as you would have after a solitary mount of the same fs on
>> /tmp/a.
>
> Fair enough.
>
>> Ho-hum...  So:
>>
>>                         AT_BENEATH      AT_XDEV         AT_NO_SYMLINKS
>> absolute pathname:      EXDEV
>> non-relative symlink:   EXDEV           ?               ELOOP
>> relative symlink:                                       ELOOP
>> .. from starting point: EXDEV
>> .. crossing mountpoint:                 EXDEV
>> crossing into mountpoint:               EXDEV
>>
>> 1) What should AT_XDEV do about absolute symlinks?  Nothing special?  EXDEV?
>> EXDEV if we are not on root?
>
> My mental model would say that AT_XDEV without AT_BENEATH would
> _logically_ result in "EXDEV if / is a different vfsmount", accept the
> absolute path otherwise.
>
> But honestly, just returning EXDEV unconditionally for an absolute
> symlink might just be the simpler and more straightforward thing to
> do.
>
> Because testing the particular vfsmount of / simply doesn't seem to be
> a very useful operation.  I dunno.

My intuition is that, regardless of whether it's obviously useful to
test the vfsmount, we should allow / if it's the same mount for
orthogonality and because it seems more likely to be the expected
behavior.

>
>> 3) What effect should AT_NO_SYMLINKS have upon the final component?  Same
>> as AT_SYMLINK_NOFOLLOW?
>
> I actually would suggest "error if it's followed".
>
> So if you use AT_SYMLINK_NOFOLLOW | AT_NO_SYMLINKS, then you do *not*
> get an error if the last component (but nothing before it) is a
> symlink, and the end result is the symlink itself.
>
> If you use just AT_NO_SYMLINKS, then the lack of NOFOLLOW implies that
> you'd follow the symlink to look it up, and then AT_NO_SYMLINKS means
> that you get an error (ELOOP).
>
> So the user gets to choose, and gets to basically indicate whether
> it's fine to end at a dangling symlink or not. Which is exactly what
> AT_SYMLINK_NOFOLLOW is all about.

Sounds reasonable to me.

--Andy

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-05-05  3:46       ` Linus Torvalds
@ 2017-05-05  4:39         ` Al Viro
  2017-05-05  4:44           ` Andy Lutomirski
  2017-05-05 20:28           ` Eric W. Biederman
  0 siblings, 2 replies; 26+ messages in thread
From: Al Viro @ 2017-05-05  4:39 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jann Horn, Linux API, Linux Kernel Mailing List, linux-fsdevel,
	David Drysdale

On Thu, May 04, 2017 at 08:46:49PM -0700, Linus Torvalds wrote:
> On Thu, May 4, 2017 at 7:47 PM, Jann Horn <jannh@google.com> wrote:
> >
> > Thread 1 starts an AT_BENEATH path walk using an O_PATH fd
> > pointing to /srv/www/example.org/foo; the path given to the syscall is
> > "bar/../../../../etc/passwd". The path walk enters the "bar" directory.
> > Thread 2 moves /srv/www/example.org/foo/bar to
> > /srv/www/example.org/bar.
> > Thread 1 processes the rest of the path ("../../../../etc/passwd"), never
> > hitting /srv/www/example.org/foo in the process.
> >
> > I'm not really familiar with the VFS internals, but from a coarse look
> > at the patch, it seems like it wouldn't block this?
> 
> I think you're right.
> 
> I guess it would be safe for the RCU case due to the sequence number
> check, but not the non-RCU case.

	Yes and no...  FWIW, to exclude that it would suffice to have
mount --rbind /src/www/example.org/foo /srv/www/example.org/foo done first.
Then this kind of race will end up with -ENOENT due to path_connected()
logics in follow_dotdot_rcu()/follow_dotdot().  I'm not sure about the
intended applications, though - is that thing supposed to be used along with
some horror like seccomp, or...?

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-05-05  4:39         ` Al Viro
@ 2017-05-05  4:44           ` Andy Lutomirski
  2017-05-05 20:04             ` Eric W. Biederman
  2017-05-05 20:28           ` Eric W. Biederman
  1 sibling, 1 reply; 26+ messages in thread
From: Andy Lutomirski @ 2017-05-05  4:44 UTC (permalink / raw)
  To: Al Viro
  Cc: Linus Torvalds, Jann Horn, Linux API, Linux Kernel Mailing List,
	linux-fsdevel, David Drysdale

On Thu, May 4, 2017 at 9:39 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Thu, May 04, 2017 at 08:46:49PM -0700, Linus Torvalds wrote:
>> On Thu, May 4, 2017 at 7:47 PM, Jann Horn <jannh@google.com> wrote:
>> >
>> > Thread 1 starts an AT_BENEATH path walk using an O_PATH fd
>> > pointing to /srv/www/example.org/foo; the path given to the syscall is
>> > "bar/../../../../etc/passwd". The path walk enters the "bar" directory.
>> > Thread 2 moves /srv/www/example.org/foo/bar to
>> > /srv/www/example.org/bar.
>> > Thread 1 processes the rest of the path ("../../../../etc/passwd"), never
>> > hitting /srv/www/example.org/foo in the process.
>> >
>> > I'm not really familiar with the VFS internals, but from a coarse look
>> > at the patch, it seems like it wouldn't block this?
>>
>> I think you're right.
>>
>> I guess it would be safe for the RCU case due to the sequence number
>> check, but not the non-RCU case.
>
>         Yes and no...  FWIW, to exclude that it would suffice to have
> mount --rbind /src/www/example.org/foo /srv/www/example.org/foo done first.
> Then this kind of race will end up with -ENOENT due to path_connected()
> logics in follow_dotdot_rcu()/follow_dotdot().  I'm not sure about the
> intended applications, though - is that thing supposed to be used along with
> some horror like seccomp, or...?

How hard would it be for the kernel to prevent this on its own?
Asking users to do the mount --rbind seems like it's asking for users
to forget to do it.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-05-05  4:44           ` Andy Lutomirski
@ 2017-05-05 20:04             ` Eric W. Biederman
  0 siblings, 0 replies; 26+ messages in thread
From: Eric W. Biederman @ 2017-05-05 20:04 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Al Viro, Linus Torvalds, Jann Horn, Linux API,
	Linux Kernel Mailing List, linux-fsdevel, David Drysdale

Andy Lutomirski <luto@kernel.org> writes:

> On Thu, May 4, 2017 at 9:39 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>> On Thu, May 04, 2017 at 08:46:49PM -0700, Linus Torvalds wrote:
>>> On Thu, May 4, 2017 at 7:47 PM, Jann Horn <jannh@google.com> wrote:
>>> >
>>> > Thread 1 starts an AT_BENEATH path walk using an O_PATH fd
>>> > pointing to /srv/www/example.org/foo; the path given to the syscall is
>>> > "bar/../../../../etc/passwd". The path walk enters the "bar" directory.
>>> > Thread 2 moves /srv/www/example.org/foo/bar to
>>> > /srv/www/example.org/bar.
>>> > Thread 1 processes the rest of the path ("../../../../etc/passwd"), never
>>> > hitting /srv/www/example.org/foo in the process.
>>> >
>>> > I'm not really familiar with the VFS internals, but from a coarse look
>>> > at the patch, it seems like it wouldn't block this?
>>>
>>> I think you're right.
>>>
>>> I guess it would be safe for the RCU case due to the sequence number
>>> check, but not the non-RCU case.
>>
>>         Yes and no...  FWIW, to exclude that it would suffice to have
>> mount --rbind /src/www/example.org/foo /srv/www/example.org/foo done first.
>> Then this kind of race will end up with -ENOENT due to path_connected()
>> logics in follow_dotdot_rcu()/follow_dotdot().  I'm not sure about the
>> intended applications, though - is that thing supposed to be used along with
>> some horror like seccomp, or...?
>
> How hard would it be for the kernel to prevent this on its own?
> Asking users to do the mount --rbind seems like it's asking for users
> to forget to do it.

The logic of path_connected checks every time you follow .. if the
parent directory you find is below your starting directory.
Mostly path_connected is optimized by noticing non-bind mounts
and doing nothing.  In this case we would need the is_subdir check
every time we follow ..

So it might just be cheaper not to follow ..

Which leads to something else we need to be careful with.

Suppose for whatever insane reason . is on a directory that has
another directory mounted on top.  Then following "some_dir/.."
would result in a different directory than we were on.

Eric

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-05-05  4:39         ` Al Viro
  2017-05-05  4:44           ` Andy Lutomirski
@ 2017-05-05 20:28           ` Eric W. Biederman
  2017-05-08 19:34             ` Mickaël Salaün
  1 sibling, 1 reply; 26+ messages in thread
From: Eric W. Biederman @ 2017-05-05 20:28 UTC (permalink / raw)
  To: Al Viro
  Cc: Linus Torvalds, Jann Horn, Linux API, Linux Kernel Mailing List,
	linux-fsdevel, David Drysdale

Al Viro <viro@ZenIV.linux.org.uk> writes:

> On Thu, May 04, 2017 at 08:46:49PM -0700, Linus Torvalds wrote:
>> On Thu, May 4, 2017 at 7:47 PM, Jann Horn <jannh@google.com> wrote:
>> >
>> > Thread 1 starts an AT_BENEATH path walk using an O_PATH fd
>> > pointing to /srv/www/example.org/foo; the path given to the syscall is
>> > "bar/../../../../etc/passwd". The path walk enters the "bar" directory.
>> > Thread 2 moves /srv/www/example.org/foo/bar to
>> > /srv/www/example.org/bar.
>> > Thread 1 processes the rest of the path ("../../../../etc/passwd"), never
>> > hitting /srv/www/example.org/foo in the process.
>> >
>> > I'm not really familiar with the VFS internals, but from a coarse look
>> > at the patch, it seems like it wouldn't block this?
>> 
>> I think you're right.
>> 
>> I guess it would be safe for the RCU case due to the sequence number
>> check, but not the non-RCU case.
>
> 	Yes and no...  FWIW, to exclude that it would suffice to have
> mount --rbind /src/www/example.org/foo /srv/www/example.org/foo done first.
> Then this kind of race will end up with -ENOENT due to path_connected()
> logics in follow_dotdot_rcu()/follow_dotdot().  I'm not sure about the
> intended applications, though - is that thing supposed to be used along with
> some horror like seccomp, or...?

As I recall the general idea is that if you have an application like a
tftp server or a web server that gets a path from a possibly dubious
source.  Instead of implementing an error prone validation logic in
userspace you can use AT_BENEATH and be certain the path resolution
stays in bounds.

As you can do stronger things as root this seems mostly targeted at
non-root applications.

I seem to recall part of the idea was to sometimes pair this to seccomp
to be certain your application can't escape a sandbox.  That plays to
seccomp limitations that it can inspect flags as they reside in
registers but seccomp can't follow pointers.

Which all suggests that we would want something similar to is_subdir
when AT_BENEATH is specified that we check every time we follow ..
that would verify that on the same filesystem we stay below and
that we also stay on a mount that is below.  mount --move has
all of the same challenges for enforcing you stay within bounds
as rename does.

Eric

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-05-05 20:28           ` Eric W. Biederman
@ 2017-05-08 19:34             ` Mickaël Salaün
  0 siblings, 0 replies; 26+ messages in thread
From: Mickaël Salaün @ 2017-05-08 19:34 UTC (permalink / raw)
  To: Eric W. Biederman, Al Viro, David Drysdale
  Cc: Linus Torvalds, Jann Horn, Linux API, Linux Kernel Mailing List,
	linux-fsdevel, linux-security-module, Kees Cook


[-- Attachment #1.1: Type: text/plain, Size: 3359 bytes --]


On 05/05/2017 22:28, Eric W. Biederman wrote:
> Al Viro <viro@ZenIV.linux.org.uk> writes:
> 
>> On Thu, May 04, 2017 at 08:46:49PM -0700, Linus Torvalds wrote:
>>> On Thu, May 4, 2017 at 7:47 PM, Jann Horn <jannh@google.com> wrote:
>>>>
>>>> Thread 1 starts an AT_BENEATH path walk using an O_PATH fd
>>>> pointing to /srv/www/example.org/foo; the path given to the syscall is
>>>> "bar/../../../../etc/passwd". The path walk enters the "bar" directory.
>>>> Thread 2 moves /srv/www/example.org/foo/bar to
>>>> /srv/www/example.org/bar.
>>>> Thread 1 processes the rest of the path ("../../../../etc/passwd"), never
>>>> hitting /srv/www/example.org/foo in the process.
>>>>
>>>> I'm not really familiar with the VFS internals, but from a coarse look
>>>> at the patch, it seems like it wouldn't block this?
>>>
>>> I think you're right.
>>>
>>> I guess it would be safe for the RCU case due to the sequence number
>>> check, but not the non-RCU case.
>>
>> 	Yes and no...  FWIW, to exclude that it would suffice to have
>> mount --rbind /src/www/example.org/foo /srv/www/example.org/foo done first.
>> Then this kind of race will end up with -ENOENT due to path_connected()
>> logics in follow_dotdot_rcu()/follow_dotdot().  I'm not sure about the
>> intended applications, though - is that thing supposed to be used along with
>> some horror like seccomp, or...?
> 
> As I recall the general idea is that if you have an application like a
> tftp server or a web server that gets a path from a possibly dubious
> source.  Instead of implementing an error prone validation logic in
> userspace you can use AT_BENEATH and be certain the path resolution
> stays in bounds.
> 
> As you can do stronger things as root this seems mostly targeted at
> non-root applications.
> 
> I seem to recall part of the idea was to sometimes pair this to seccomp
> to be certain your application can't escape a sandbox.  That plays to
> seccomp limitations that it can inspect flags as they reside in
> registers but seccomp can't follow pointers.

Here is the code and tests from David Drysdale:
https://github.com/google/capsicum-linux/commits/openat-v2
...and the latest patch: https://lkml.org/lkml/2015/3/9/407
The O_BENEATH flag have also been discussed for FreeBSD to support Capsicum.

> 
> Which all suggests that we would want something similar to is_subdir
> when AT_BENEATH is specified that we check every time we follow ..
> that would verify that on the same filesystem we stay below and
> that we also stay on a mount that is below.  mount --move has
> all of the same challenges for enforcing you stay within bounds
> as rename does.

FYI, I'm working on a new LSM [1] to work around the limitations of
seccomp-bpf, especially the pointer checks. The idea is to enable some
filtering as seccomp-bpf can do but instead of checking at the syscall
level, Landlock take advantage of LSM hooks. I had a first PoC of an
eBPF function and map type to check if a file was beneath another [2]. I
plan to create a new one that record a "snapshot" of the current mount
tree into an eBPF map to be able to check if a file is beneath or a
parent of another one.

[1] https://lkml.kernel.org/r/20170328234650.19695-1-mic@digikod.net
[2] https://lkml.kernel.org/r/20161026065654.19166-9-mic@digikod.net


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
  2017-05-05  0:30   ` Al Viro
                       ` (2 preceding siblings ...)
  2017-05-05  2:47     ` Jann Horn
@ 2017-05-18  8:50     ` David Drysdale
  3 siblings, 0 replies; 26+ messages in thread
From: David Drysdale @ 2017-05-18  8:50 UTC (permalink / raw)
  To: Al Viro
  Cc: Jann Horn, Linux API, linux-kernel, Linux FS Devel, Linus Torvalds

On Fri, May 5, 2017 at 1:30 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Mon, May 01, 2017 at 07:36:52PM +0200, Jann Horn wrote:
>
>> Oh, nice!
>>
>> It looks like this is somewhat similar to the old O_BENEATH proposal,
>> but because the intentions behind the proposals are different
>> (application sandboxing versus permitting an application to restrict its
>> own filesystem accesses), the semantics differ: AT_NO_JUMPS
>> doesn't prevent starting the path with "/", but does prevent mountpoint
>> traversal. Is that correct?
>
> It prevents both, actually - I missed that in description, but this
>         if (unlikely(nd->flags & LOOKUP_NO_JUMPS))
>                 return -ELOOP;
> in nd_jump_root() affects absolute pathnames same way as it affects
> absolute symlinks.
>
> It's not quite O_BENEATH, and IMO it's saner that way - a/b/c/../d is
> bloody well allowed, and so are relative symlinks that do not lead out of
> the subtree.  If somebody has a good argument in favour of flat-out
> ban on .. (_other_ than "other guys do it that way, and it doesn't need
> to make sense 'cuz security!!1!!!", please), I'd be glad to hear it.

BTW, FreeBSD head now allows .. if it stays in subtree:
https://svnweb.freebsd.org/base?view=revision&revision=308212

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: new ...at() flag: AT_NO_JUMPS
@ 2017-09-10 20:26 Jürg Billeter
  0 siblings, 0 replies; 26+ messages in thread
From: Jürg Billeter @ 2017-09-10 20:26 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, linux-kernel

Hi Al,

Might it make sense to specify these lookup restrictions when opening
the directory (O_ROOT?) instead of specifying it for each lookup with
AT_* (or supporting both)? This might make it more useful when passing
directory fds between processes that do not use seccomp (where
AT_BENEATH could be enforced).

For my sandboxing use case, I'd be happy with either solution, though.
Is there anything I can do to help move this forward?

Best regards,
Jürg

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2017-09-10 20:26 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-29 22:04 new ...at() flag: AT_NO_JUMPS Al Viro
2017-04-29 23:17 ` Andy Lutomirski
2017-04-29 23:25   ` Al Viro
2017-04-30  1:13     ` Andy Lutomirski
2017-04-30  4:38     ` Matthew Wilcox
2017-04-30 16:10       ` Al Viro
2017-05-01  4:52         ` Andy Lutomirski
2017-05-01  5:15           ` Al Viro
2017-05-01 17:36 ` Jann Horn
2017-05-01 19:37   ` Andy Lutomirski
2017-05-05  0:30   ` Al Viro
2017-05-05  0:44     ` Andy Lutomirski
2017-05-05  1:06       ` Al Viro
2017-05-05  1:27     ` Linus Torvalds
2017-05-05  3:00       ` Al Viro
2017-05-05  4:01         ` Linus Torvalds
2017-05-05  4:31           ` Andy Lutomirski
2017-05-05  2:47     ` Jann Horn
2017-05-05  3:46       ` Linus Torvalds
2017-05-05  4:39         ` Al Viro
2017-05-05  4:44           ` Andy Lutomirski
2017-05-05 20:04             ` Eric W. Biederman
2017-05-05 20:28           ` Eric W. Biederman
2017-05-08 19:34             ` Mickaël Salaün
2017-05-18  8:50     ` David Drysdale
2017-09-10 20:26 Jürg Billeter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).