From: David Howells <dhowells@redhat.com>
To: viro@zeniv.linux.org.uk
Cc: linux-api@vger.kernel.org, torvalds@linux-foundation.org,
dhowells@redhat.com, linux-fsdevel@vger.kernel.org,
linux-kernel@vger.kernel.org
Subject: [PATCH 01/33] vfs: syscall: Add open_tree(2) to reference or clone a mount [ver #11]
Date: Wed, 01 Aug 2018 16:24:11 +0100 [thread overview]
Message-ID: <153313705165.13253.4602180607294286849.stgit@warthog.procyon.org.uk> (raw)
In-Reply-To: <153313703562.13253.5766498657900728120.stgit@warthog.procyon.org.uk>
From: Al Viro <viro@zeniv.linux.org.uk>
open_tree(dfd, pathname, flags)
Returns an O_PATH-opened file descriptor or an error.
dfd and pathname specify the location to open, in usual
fashion (see e.g. fstatat(2)). flags should be an OR of
some of the following:
* AT_PATH_EMPTY, AT_NO_AUTOMOUNT, AT_SYMLINK_NOFOLLOW -
same meanings as usual
* OPEN_TREE_CLOEXEC - make the resulting descriptor
close-on-exec
* OPEN_TREE_CLONE or OPEN_TREE_CLONE | AT_RECURSIVE -
instead of opening the location in question, create a detached
mount tree matching the subtree rooted at location specified by
dfd/pathname. With AT_RECURSIVE the entire subtree is cloned,
without it - only the part within in the mount containing the
location in question. In other words, the same as mount --rbind
or mount --bind would've taken. The detached tree will be
dissolved on the final close of obtained file. Creation of such
detached trees requires the same capabilities as doing mount --bind.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-api@vger.kernel.org
---
arch/x86/entry/syscalls/syscall_32.tbl | 1
arch/x86/entry/syscalls/syscall_64.tbl | 1
fs/file_table.c | 9 +-
fs/internal.h | 1
fs/namespace.c | 132 +++++++++++++++++++++++++++-----
include/linux/fs.h | 3 +
include/linux/syscalls.h | 1
include/uapi/linux/fcntl.h | 2
include/uapi/linux/mount.h | 10 ++
9 files changed, 135 insertions(+), 25 deletions(-)
create mode 100644 include/uapi/linux/mount.h
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 3cf7b533b3d1..ea1b413afd47 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -398,3 +398,4 @@
384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents
386 i386 rseq sys_rseq __ia32_sys_rseq
+387 i386 open_tree sys_open_tree __ia32_sys_open_tree
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f0b1709a5ffb..0545bed581dc 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,7 @@
332 common statx __x64_sys_statx
333 common io_pgetevents __x64_sys_io_pgetevents
334 common rseq __x64_sys_rseq
+335 common open_tree __x64_sys_open_tree
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/file_table.c b/fs/file_table.c
index 7ec0b3e5f05d..7480271a0d21 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -189,6 +189,7 @@ static void __fput(struct file *file)
struct dentry *dentry = file->f_path.dentry;
struct vfsmount *mnt = file->f_path.mnt;
struct inode *inode = file->f_inode;
+ fmode_t mode = file->f_mode;
might_sleep();
@@ -209,14 +210,14 @@ static void __fput(struct file *file)
file->f_op->release(inode, file);
security_file_free(file);
if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
- !(file->f_mode & FMODE_PATH))) {
+ !(mode & FMODE_PATH))) {
cdev_put(inode->i_cdev);
}
fops_put(file->f_op);
put_pid(file->f_owner.pid);
- if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+ if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_dec(inode);
- if (file->f_mode & FMODE_WRITER) {
+ if (mode & FMODE_WRITER) {
put_write_access(inode);
__mnt_drop_write(mnt);
}
@@ -224,6 +225,8 @@ static void __fput(struct file *file)
file->f_path.mnt = NULL;
file->f_inode = NULL;
file_free(file);
+ if (unlikely(mode & FMODE_NEED_UNMOUNT))
+ dissolve_on_fput(mnt);
dput(dentry);
mntput(mnt);
}
diff --git a/fs/internal.h b/fs/internal.h
index 56533b08532e..383ee4724f77 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -85,6 +85,7 @@ extern void __mnt_drop_write(struct vfsmount *);
extern void __mnt_drop_write_file(struct file *);
extern void mnt_drop_write_file_path(struct file *);
+extern void dissolve_on_fput(struct vfsmount *);
/*
* fs_struct.c
*/
diff --git a/fs/namespace.c b/fs/namespace.c
index 03cc3b5bcf00..a4a01ecbcacd 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -20,12 +20,14 @@
#include <linux/init.h> /* init_rootfs */
#include <linux/fs_struct.h> /* get_fs_root et.al. */
#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
+#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/bootmem.h>
#include <linux/task_work.h>
#include <linux/sched/task.h>
+#include <uapi/linux/mount.h>
#include "pnode.h"
#include "internal.h"
@@ -1840,6 +1842,16 @@ struct vfsmount *collect_mounts(const struct path *path)
return &tree->mnt;
}
+void dissolve_on_fput(struct vfsmount *mnt)
+{
+ namespace_lock();
+ lock_mount_hash();
+ mntget(mnt);
+ umount_tree(real_mount(mnt), UMOUNT_SYNC);
+ unlock_mount_hash();
+ namespace_unlock();
+}
+
void drop_collected_mounts(struct vfsmount *mnt)
{
namespace_lock();
@@ -2199,6 +2211,30 @@ static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
return false;
}
+static struct mount *__do_loopback(struct path *old_path, int recurse)
+{
+ struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
+
+ if (IS_MNT_UNBINDABLE(old))
+ return mnt;
+
+ if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
+ return mnt;
+
+ if (!recurse && has_locked_children(old, old_path->dentry))
+ return mnt;
+
+ if (recurse)
+ mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
+ else
+ mnt = clone_mnt(old, old_path->dentry, 0);
+
+ if (!IS_ERR(mnt))
+ mnt->mnt.mnt_flags &= ~MNT_LOCKED;
+
+ return mnt;
+}
+
/*
* do loopback mount.
*/
@@ -2206,7 +2242,7 @@ static int do_loopback(struct path *path, const char *old_name,
int recurse)
{
struct path old_path;
- struct mount *mnt = NULL, *old, *parent;
+ struct mount *mnt = NULL, *parent;
struct mountpoint *mp;
int err;
if (!old_name || !*old_name)
@@ -2220,38 +2256,21 @@ static int do_loopback(struct path *path, const char *old_name,
goto out;
mp = lock_mount(path);
- err = PTR_ERR(mp);
- if (IS_ERR(mp))
+ if (IS_ERR(mp)) {
+ err = PTR_ERR(mp);
goto out;
+ }
- old = real_mount(old_path.mnt);
parent = real_mount(path->mnt);
-
- err = -EINVAL;
- if (IS_MNT_UNBINDABLE(old))
- goto out2;
-
if (!check_mnt(parent))
goto out2;
- if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations)
- goto out2;
-
- if (!recurse && has_locked_children(old, old_path.dentry))
- goto out2;
-
- if (recurse)
- mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
- else
- mnt = clone_mnt(old, old_path.dentry, 0);
-
+ mnt = __do_loopback(&old_path, recurse);
if (IS_ERR(mnt)) {
err = PTR_ERR(mnt);
goto out2;
}
- mnt->mnt.mnt_flags &= ~MNT_LOCKED;
-
err = graft_tree(mnt, parent, mp);
if (err) {
lock_mount_hash();
@@ -2265,6 +2284,75 @@ static int do_loopback(struct path *path, const char *old_name,
return err;
}
+SYSCALL_DEFINE3(open_tree, int, dfd, const char *, filename, unsigned, flags)
+{
+ struct file *file;
+ struct path path;
+ int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
+ bool detached = flags & OPEN_TREE_CLONE;
+ int error;
+ int fd;
+
+ BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
+
+ if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
+ AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
+ OPEN_TREE_CLOEXEC))
+ return -EINVAL;
+
+ if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
+ return -EINVAL;
+
+ if (flags & AT_NO_AUTOMOUNT)
+ lookup_flags &= ~LOOKUP_AUTOMOUNT;
+ if (flags & AT_SYMLINK_NOFOLLOW)
+ lookup_flags &= ~LOOKUP_FOLLOW;
+ if (flags & AT_EMPTY_PATH)
+ lookup_flags |= LOOKUP_EMPTY;
+
+ if (detached && !may_mount())
+ return -EPERM;
+
+ fd = get_unused_fd_flags(flags & O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ error = user_path_at(dfd, filename, lookup_flags, &path);
+ if (error)
+ goto out;
+
+ if (detached) {
+ struct mount *mnt = __do_loopback(&path, flags & AT_RECURSIVE);
+ if (IS_ERR(mnt)) {
+ error = PTR_ERR(mnt);
+ goto out2;
+ }
+ mntput(path.mnt);
+ path.mnt = &mnt->mnt;
+ }
+
+ file = dentry_open(&path, O_PATH, current_cred());
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto out3;
+ }
+
+ if (detached)
+ file->f_mode |= FMODE_NEED_UNMOUNT;
+ path_put(&path);
+ fd_install(fd, file);
+ return fd;
+
+out3:
+ if (detached)
+ dissolve_on_fput(path.mnt);
+out2:
+ path_put(&path);
+out:
+ put_unused_fd(fd);
+ return error;
+}
+
static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
{
int error = 0;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e3a18cddb74e..067f0e31aec7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -154,6 +154,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* File is capable of returning -EAGAIN if I/O will block */
#define FMODE_NOWAIT ((__force fmode_t)0x8000000)
+/* File represents mount that needs unmounting */
+#define FMODE_NEED_UNMOUNT ((__force fmode_t)0x10000000)
+
/*
* Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
* that indicates that they should check the contents of the iovec are
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 73810808cdf2..3cc6b8f8bd2f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -900,6 +900,7 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
unsigned mask, struct statx __user *buffer);
asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
int flags, uint32_t sig);
+asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags);
/*
* Architecture-specific system calls
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6448cdd9a350..594b85f7cb86 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -90,5 +90,7 @@
#define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */
#define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */
+#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
+
#endif /* _UAPI_LINUX_FCNTL_H */
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
new file mode 100644
index 000000000000..e8db2911adca
--- /dev/null
+++ b/include/uapi/linux/mount.h
@@ -0,0 +1,10 @@
+#ifndef _UAPI_LINUX_MOUNT_H
+#define _UAPI_LINUX_MOUNT_H
+
+/*
+ * open_tree() flags.
+ */
+#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */
+#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */
+
+#endif /* _UAPI_LINUX_MOUNT_H */
next prev parent reply other threads:[~2018-08-01 15:24 UTC|newest]
Thread overview: 116+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-08-01 15:23 [PATCH 00/33] VFS: Introduce filesystem context [ver #11] David Howells
2018-08-01 15:24 ` David Howells [this message]
2018-08-02 17:31 ` [PATCH 01/33] vfs: syscall: Add open_tree(2) to reference or clone a mount " Alan Jenkins
2018-08-02 21:29 ` Al Viro
2018-08-02 21:51 ` David Howells
2018-08-02 23:46 ` Alan Jenkins
2018-08-01 15:24 ` [PATCH 02/33] vfs: syscall: Add move_mount(2) to move mounts around " David Howells
2018-08-01 15:24 ` [PATCH 03/33] teach move_mount(2) to work with OPEN_TREE_CLONE " David Howells
2018-10-12 14:25 ` Alan Jenkins
2018-08-01 15:24 ` [PATCH 04/33] vfs: Suppress MS_* flag defs within the kernel unless explicitly enabled " David Howells
2018-08-01 15:24 ` [PATCH 05/33] vfs: Introduce the basic header for the new mount API's filesystem context " David Howells
2018-08-01 15:24 ` [PATCH 06/33] vfs: Introduce logging functions " David Howells
2018-08-01 15:24 ` [PATCH 07/33] vfs: Add configuration parser helpers " David Howells
2018-08-01 15:24 ` [PATCH 08/33] vfs: Add LSM hooks for the new mount API " David Howells
2018-08-01 20:50 ` James Morris
2018-08-01 22:53 ` David Howells
2018-08-01 15:25 ` [PATCH 09/33] selinux: Implement the new mount API LSM hooks " David Howells
2018-08-01 15:25 ` [PATCH 10/33] smack: Implement filesystem context security " David Howells
2018-08-01 15:25 ` [PATCH 11/33] apparmor: Implement security hooks for the new mount API " David Howells
2018-08-01 15:25 ` [PATCH 12/33] tomoyo: " David Howells
2018-08-01 15:25 ` [PATCH 13/33] vfs: Separate changing mount flags full remount " David Howells
2018-08-01 15:25 ` [PATCH 14/33] vfs: Implement a filesystem superblock creation/configuration context " David Howells
2018-09-11 17:46 ` Guenter Roeck
2018-09-11 21:52 ` David Howells
2018-09-11 22:07 ` Guenter Roeck
2018-09-11 23:17 ` David Howells
2018-09-11 23:54 ` Guenter Roeck
2018-09-18 9:07 ` Sergey Senozhatsky
2018-09-18 9:40 ` Sergey Senozhatsky
2018-09-18 14:06 ` Guenter Roeck
2018-09-19 1:12 ` Sergey Senozhatsky
2018-09-19 1:26 ` Sergey Senozhatsky
2018-09-18 15:34 ` David Howells
2018-09-18 16:39 ` David Howells
2018-09-19 1:15 ` Sergey Senozhatsky
2018-09-18 17:43 ` David Howells
2018-09-18 9:54 ` Sergey Senozhatsky
2018-09-18 15:28 ` David Howells
2018-08-01 15:25 ` [PATCH 15/33] vfs: Remove unused code after filesystem context changes " David Howells
2018-08-01 15:25 ` [PATCH 16/33] procfs: Move proc_fill_super() to fs/proc/root.c " David Howells
2018-08-01 15:26 ` [PATCH 17/33] proc: Add fs_context support to procfs " David Howells
2018-08-01 15:26 ` [PATCH 18/33] ipc: Convert mqueue fs to fs_context " David Howells
2018-08-01 15:26 ` [PATCH 19/33] cpuset: Use " David Howells
2018-08-01 15:26 ` [PATCH 20/33] kernfs, sysfs, cgroup, intel_rdt: Support " David Howells
2018-08-01 15:26 ` [PATCH 21/33] hugetlbfs: Convert to " David Howells
2018-08-01 15:26 ` [PATCH 22/33] vfs: Remove kern_mount_data() " David Howells
2018-08-01 15:26 ` [PATCH 23/33] vfs: Provide documentation for new mount API " David Howells
2018-08-01 15:26 ` [PATCH 24/33] Make anon_inodes unconditional " David Howells
2018-08-01 15:26 ` [PATCH 25/33] vfs: syscall: Add fsopen() to prepare for superblock creation " David Howells
2018-08-01 15:27 ` [PATCH 26/33] vfs: Implement logging through fs_context " David Howells
2018-08-01 15:27 ` [PATCH 27/33] vfs: Add some logging to the core users of the fs_context log " David Howells
2018-08-01 15:27 ` [PATCH 28/33] vfs: syscall: Add fsconfig() for configuring and managing a context " David Howells
2018-08-06 17:28 ` Eric W. Biederman
2018-08-09 14:14 ` David Howells
2018-08-09 14:24 ` David Howells
2018-08-09 14:35 ` Miklos Szeredi
2018-08-09 15:32 ` Eric W. Biederman
2018-08-09 16:33 ` David Howells
2018-08-11 20:20 ` David Howells
2018-08-11 23:26 ` Andy Lutomirski
2018-08-01 15:27 ` [PATCH 29/33] vfs: syscall: Add fsmount() to create a mount for a superblock " David Howells
2018-08-01 15:27 ` [PATCH 30/33] vfs: syscall: Add fspick() to select a superblock for reconfiguration " David Howells
2018-08-24 14:51 ` Miklos Szeredi
2018-08-24 14:54 ` Andy Lutomirski
2018-08-01 15:27 ` [PATCH 31/33] afs: Add fs_context support " David Howells
2018-08-01 15:27 ` [PATCH 32/33] afs: Use fs_context to pass parameters over automount " David Howells
2018-08-01 15:27 ` [PATCH 33/33] vfs: Add a sample program for the new mount API " David Howells
2018-08-10 14:05 ` BUG: Mount ignores mount options Eric W. Biederman
2018-08-10 14:36 ` Andy Lutomirski
2018-08-10 15:17 ` Eric W. Biederman
2018-08-10 15:24 ` Al Viro
2018-08-10 15:11 ` Tetsuo Handa
2018-08-10 15:13 ` David Howells
2018-08-10 15:16 ` Al Viro
2018-08-11 1:05 ` Eric W. Biederman
2018-08-11 1:46 ` Theodore Y. Ts'o
2018-08-11 4:48 ` Eric W. Biederman
2018-08-11 17:47 ` Casey Schaufler
2018-08-15 4:03 ` Eric W. Biederman
2018-08-11 1:58 ` Al Viro
2018-08-11 2:17 ` Al Viro
2018-08-11 4:43 ` Eric W. Biederman
2018-08-13 12:54 ` Miklos Szeredi
2018-08-10 15:11 ` David Howells
2018-08-10 15:39 ` Theodore Y. Ts'o
2018-08-10 15:55 ` Casey Schaufler
2018-08-10 16:11 ` David Howells
2018-08-10 18:00 ` Eric W. Biederman
2018-08-10 15:53 ` David Howells
2018-08-10 16:14 ` Theodore Y. Ts'o
2018-08-10 20:06 ` Andy Lutomirski
2018-08-10 20:46 ` Theodore Y. Ts'o
2018-08-10 22:12 ` Darrick J. Wong
2018-08-10 23:54 ` Theodore Y. Ts'o
2018-08-11 0:38 ` Darrick J. Wong
2018-08-11 1:32 ` Eric W. Biederman
2018-08-13 16:35 ` Alan Cox
2018-08-13 16:48 ` Andy Lutomirski
2018-08-13 17:29 ` Al Viro
2018-08-13 19:00 ` James Morris
2018-08-13 19:20 ` Casey Schaufler
2018-08-15 23:29 ` Serge E. Hallyn
2018-08-11 0:28 ` Eric W. Biederman
2018-08-11 1:19 ` Eric W. Biederman
2018-08-11 7:29 ` David Howells
2018-08-11 16:31 ` Andy Lutomirski
2018-08-11 16:51 ` Al Viro
2018-08-15 16:31 ` Should we split the network filesystem setup into two phases? David Howells
2018-08-15 16:51 ` Andy Lutomirski
2018-08-16 3:51 ` Steve French
2018-08-16 5:06 ` Eric W. Biederman
2018-08-16 16:24 ` Steve French
2018-08-16 17:21 ` Eric W. Biederman
2018-08-16 17:23 ` Aurélien Aptel
2018-08-16 18:36 ` Steve French
2018-08-17 23:11 ` Al Viro
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=153313705165.13253.4602180607294286849.stgit@warthog.procyon.org.uk \
--to=dhowells@redhat.com \
--cc=linux-api@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=torvalds@linux-foundation.org \
--cc=viro@zeniv.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).