linux-api.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 01/32] vfs: syscall: Add open_tree(2) to reference or clone a mount [ver #9]
       [not found] <153126248868.14533.9751473662727327569.stgit@warthog.procyon.org.uk>
@ 2018-07-10 22:41 ` David Howells
  2018-07-10 22:41 ` [PATCH 02/32] vfs: syscall: Add move_mount(2) to move mounts around " David Howells
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 75+ messages in thread
From: David Howells @ 2018-07-10 22:41 UTC (permalink / raw)
  To: viro; +Cc: dhowells, linux-api, linux-fsdevel, torvalds, linux-kernel

From: Al Viro <viro@zeniv.linux.org.uk>

open_tree(dfd, pathname, flags)

Returns an O_PATH-opened file descriptor or an error.
dfd and pathname specify the location to open, in usual
fashion (see e.g. fstatat(2)).  flags should be an OR of
some of the following:
	* AT_PATH_EMPTY, AT_NO_AUTOMOUNT, AT_SYMLINK_NOFOLLOW -
same meanings as usual
	* OPEN_TREE_CLOEXEC - make the resulting descriptor
close-on-exec
	* OPEN_TREE_CLONE or OPEN_TREE_CLONE | AT_RECURSIVE -
instead of opening the location in question, create a detached
mount tree matching the subtree rooted at location specified by
dfd/pathname.  With AT_RECURSIVE the entire subtree is cloned,
without it - only the part within in the mount containing the
location in question.  In other words, the same as mount --rbind
or mount --bind would've taken.  The detached tree will be
dissolved on the final close of obtained file.  Creation of such
detached trees requires the same capabilities as doing mount --bind.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
cc: linux-api@vger.kernel.org
---

 arch/x86/entry/syscalls/syscall_32.tbl |    1 
 arch/x86/entry/syscalls/syscall_64.tbl |    1 
 fs/file_table.c                        |    9 +-
 fs/internal.h                          |    1 
 fs/namespace.c                         |  132 +++++++++++++++++++++++++++-----
 include/linux/fs.h                     |    3 +
 include/linux/syscalls.h               |    1 
 include/uapi/linux/fcntl.h             |    2 
 include/uapi/linux/mount.h             |   10 ++
 9 files changed, 135 insertions(+), 25 deletions(-)
 create mode 100644 include/uapi/linux/mount.h

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 3cf7b533b3d1..ea1b413afd47 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -398,3 +398,4 @@
 384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl
 385	i386	io_pgetevents		sys_io_pgetevents		__ia32_compat_sys_io_pgetevents
 386	i386	rseq			sys_rseq			__ia32_sys_rseq
+387	i386	open_tree		sys_open_tree			__ia32_sys_open_tree
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f0b1709a5ffb..0545bed581dc 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,7 @@
 332	common	statx			__x64_sys_statx
 333	common	io_pgetevents		__x64_sys_io_pgetevents
 334	common	rseq			__x64_sys_rseq
+335	common	open_tree		__x64_sys_open_tree
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/file_table.c b/fs/file_table.c
index 7ec0b3e5f05d..7480271a0d21 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -189,6 +189,7 @@ static void __fput(struct file *file)
 	struct dentry *dentry = file->f_path.dentry;
 	struct vfsmount *mnt = file->f_path.mnt;
 	struct inode *inode = file->f_inode;
+	fmode_t mode = file->f_mode;
 
 	might_sleep();
 
@@ -209,14 +210,14 @@ static void __fput(struct file *file)
 		file->f_op->release(inode, file);
 	security_file_free(file);
 	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
-		     !(file->f_mode & FMODE_PATH))) {
+		     !(mode & FMODE_PATH))) {
 		cdev_put(inode->i_cdev);
 	}
 	fops_put(file->f_op);
 	put_pid(file->f_owner.pid);
-	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+	if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
 		i_readcount_dec(inode);
-	if (file->f_mode & FMODE_WRITER) {
+	if (mode & FMODE_WRITER) {
 		put_write_access(inode);
 		__mnt_drop_write(mnt);
 	}
@@ -224,6 +225,8 @@ static void __fput(struct file *file)
 	file->f_path.mnt = NULL;
 	file->f_inode = NULL;
 	file_free(file);
+	if (unlikely(mode & FMODE_NEED_UNMOUNT))
+		dissolve_on_fput(mnt);
 	dput(dentry);
 	mntput(mnt);
 }
diff --git a/fs/internal.h b/fs/internal.h
index 980d005b21b4..b55575b9b55c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -85,6 +85,7 @@ extern void __mnt_drop_write(struct vfsmount *);
 extern void __mnt_drop_write_file(struct file *);
 extern void mnt_drop_write_file_path(struct file *);
 
+extern void dissolve_on_fput(struct vfsmount *);
 /*
  * fs_struct.c
  */
diff --git a/fs/namespace.c b/fs/namespace.c
index 8ddd14806799..b355a555b4db 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -20,12 +20,14 @@
 #include <linux/init.h>		/* init_rootfs */
 #include <linux/fs_struct.h>	/* get_fs_root et.al. */
 #include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
+#include <linux/file.h>
 #include <linux/uaccess.h>
 #include <linux/proc_ns.h>
 #include <linux/magic.h>
 #include <linux/bootmem.h>
 #include <linux/task_work.h>
 #include <linux/sched/task.h>
+#include <uapi/linux/mount.h>
 
 #include "pnode.h"
 #include "internal.h"
@@ -1839,6 +1841,16 @@ struct vfsmount *collect_mounts(const struct path *path)
 	return &tree->mnt;
 }
 
+void dissolve_on_fput(struct vfsmount *mnt)
+{
+	namespace_lock();
+	lock_mount_hash();
+	mntget(mnt);
+	umount_tree(real_mount(mnt), UMOUNT_SYNC);
+	unlock_mount_hash();
+	namespace_unlock();
+}
+
 void drop_collected_mounts(struct vfsmount *mnt)
 {
 	namespace_lock();
@@ -2198,6 +2210,30 @@ static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
 	return false;
 }
 
+static struct mount *__do_loopback(struct path *old_path, int recurse)
+{
+	struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
+
+	if (IS_MNT_UNBINDABLE(old))
+		return mnt;
+
+	if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
+		return mnt;
+
+	if (!recurse && has_locked_children(old, old_path->dentry))
+		return mnt;
+
+	if (recurse)
+		mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
+	else
+		mnt = clone_mnt(old, old_path->dentry, 0);
+
+	if (!IS_ERR(mnt))
+		mnt->mnt.mnt_flags &= ~MNT_LOCKED;
+
+	return mnt;
+}
+
 /*
  * do loopback mount.
  */
@@ -2205,7 +2241,7 @@ static int do_loopback(struct path *path, const char *old_name,
 				int recurse)
 {
 	struct path old_path;
-	struct mount *mnt = NULL, *old, *parent;
+	struct mount *mnt = NULL, *parent;
 	struct mountpoint *mp;
 	int err;
 	if (!old_name || !*old_name)
@@ -2219,38 +2255,21 @@ static int do_loopback(struct path *path, const char *old_name,
 		goto out;
 
 	mp = lock_mount(path);
-	err = PTR_ERR(mp);
-	if (IS_ERR(mp))
+	if (IS_ERR(mp)) {
+		err = PTR_ERR(mp);
 		goto out;
+	}
 
-	old = real_mount(old_path.mnt);
 	parent = real_mount(path->mnt);
-
-	err = -EINVAL;
-	if (IS_MNT_UNBINDABLE(old))
-		goto out2;
-
 	if (!check_mnt(parent))
 		goto out2;
 
-	if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations)
-		goto out2;
-
-	if (!recurse && has_locked_children(old, old_path.dentry))
-		goto out2;
-
-	if (recurse)
-		mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
-	else
-		mnt = clone_mnt(old, old_path.dentry, 0);
-
+	mnt = __do_loopback(&old_path, recurse);
 	if (IS_ERR(mnt)) {
 		err = PTR_ERR(mnt);
 		goto out2;
 	}
 
-	mnt->mnt.mnt_flags &= ~MNT_LOCKED;
-
 	err = graft_tree(mnt, parent, mp);
 	if (err) {
 		lock_mount_hash();
@@ -2264,6 +2283,75 @@ static int do_loopback(struct path *path, const char *old_name,
 	return err;
 }
 
+SYSCALL_DEFINE3(open_tree, int, dfd, const char *, filename, unsigned, flags)
+{
+	struct file *file;
+	struct path path;
+	int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
+	bool detached = flags & OPEN_TREE_CLONE;
+	int error;
+	int fd;
+
+	BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
+
+	if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
+		      AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
+		      OPEN_TREE_CLOEXEC))
+		return -EINVAL;
+
+	if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
+		return -EINVAL;
+
+	if (flags & AT_NO_AUTOMOUNT)
+		lookup_flags &= ~LOOKUP_AUTOMOUNT;
+	if (flags & AT_SYMLINK_NOFOLLOW)
+		lookup_flags &= ~LOOKUP_FOLLOW;
+	if (flags & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+
+	if (detached && !may_mount())
+		return -EPERM;
+
+	fd = get_unused_fd_flags(flags & O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	error = user_path_at(dfd, filename, lookup_flags, &path);
+	if (error)
+		goto out;
+
+	if (detached) {
+		struct mount *mnt = __do_loopback(&path, flags & AT_RECURSIVE);
+		if (IS_ERR(mnt)) {
+			error = PTR_ERR(mnt);
+			goto out2;
+		}
+		mntput(path.mnt);
+		path.mnt = &mnt->mnt;
+	}
+
+	file = dentry_open(&path, O_PATH, current_cred());
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		goto out3;
+	}
+
+	if (detached)
+		file->f_mode |= FMODE_NEED_UNMOUNT;
+	path_put(&path);
+	fd_install(fd, file);
+	return fd;
+
+out3:
+	if (detached)
+		dissolve_on_fput(path.mnt);
+out2:
+	path_put(&path);
+out:
+	put_unused_fd(fd);
+	return error;
+}
+
 static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
 {
 	int error = 0;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5c91108846db..00e255c195f2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -154,6 +154,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File is capable of returning -EAGAIN if I/O will block */
 #define FMODE_NOWAIT	((__force fmode_t)0x8000000)
 
+/* File represents mount that needs unmounting */
+#define FMODE_NEED_UNMOUNT     ((__force fmode_t)0x10000000)
+
 /*
  * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
  * that indicates that they should check the contents of the iovec are
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 73810808cdf2..3cc6b8f8bd2f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -900,6 +900,7 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 			  unsigned mask, struct statx __user *buffer);
 asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
 			 int flags, uint32_t sig);
+asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6448cdd9a350..594b85f7cb86 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -90,5 +90,7 @@
 #define AT_STATX_FORCE_SYNC	0x2000	/* - Force the attributes to be sync'd with the server */
 #define AT_STATX_DONT_SYNC	0x4000	/* - Don't sync attributes with the server */
 
+#define AT_RECURSIVE		0x8000	/* Apply to the entire subtree */
+
 
 #endif /* _UAPI_LINUX_FCNTL_H */
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
new file mode 100644
index 000000000000..e8db2911adca
--- /dev/null
+++ b/include/uapi/linux/mount.h
@@ -0,0 +1,10 @@
+#ifndef _UAPI_LINUX_MOUNT_H
+#define _UAPI_LINUX_MOUNT_H
+
+/*
+ * open_tree() flags.
+ */
+#define OPEN_TREE_CLONE		1		/* Clone the target tree and attach the clone */
+#define OPEN_TREE_CLOEXEC	O_CLOEXEC	/* Close the file on execve() */
+
+#endif /* _UAPI_LINUX_MOUNT_H */

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [PATCH 02/32] vfs: syscall: Add move_mount(2) to move mounts around [ver #9]
       [not found] <153126248868.14533.9751473662727327569.stgit@warthog.procyon.org.uk>
  2018-07-10 22:41 ` [PATCH 01/32] vfs: syscall: Add open_tree(2) to reference or clone a mount [ver #9] David Howells
@ 2018-07-10 22:41 ` David Howells
  2018-07-10 22:44 ` [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation " David Howells
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 75+ messages in thread
From: David Howells @ 2018-07-10 22:41 UTC (permalink / raw)
  To: viro; +Cc: dhowells, linux-api, linux-fsdevel, torvalds, linux-kernel

Add a move_mount() system call that will move a mount from one place to
another and, in the next commit, allow to attach an unattached mount tree.

The new system call looks like the following:

	int move_mount(int from_dfd, const char *from_path,
		       int to_dfd, const char *to_path,
		       unsigned int flags);

Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-api@vger.kernel.org
---

 arch/x86/entry/syscalls/syscall_32.tbl |    1 
 arch/x86/entry/syscalls/syscall_64.tbl |    1 
 fs/namespace.c                         |  102 ++++++++++++++++++++++++++------
 include/linux/lsm_hooks.h              |    6 ++
 include/linux/security.h               |    7 ++
 include/linux/syscalls.h               |    3 +
 include/uapi/linux/mount.h             |   11 +++
 security/security.c                    |    5 ++
 8 files changed, 118 insertions(+), 18 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ea1b413afd47..76d092b7d1b0 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -399,3 +399,4 @@
 385	i386	io_pgetevents		sys_io_pgetevents		__ia32_compat_sys_io_pgetevents
 386	i386	rseq			sys_rseq			__ia32_sys_rseq
 387	i386	open_tree		sys_open_tree			__ia32_sys_open_tree
+388	i386	move_mount		sys_move_mount			__ia32_sys_move_mount
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 0545bed581dc..37ba4e65eee6 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -344,6 +344,7 @@
 333	common	io_pgetevents		__x64_sys_io_pgetevents
 334	common	rseq			__x64_sys_rseq
 335	common	open_tree		__x64_sys_open_tree
+336	common	move_mount		__x64_sys_move_mount
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/namespace.c b/fs/namespace.c
index b355a555b4db..e95b2bc8fcfe 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2446,43 +2446,37 @@ static inline int tree_contains_unbindable(struct mount *mnt)
 	return 0;
 }
 
-static int do_move_mount(struct path *path, const char *old_name)
+static int do_move_mount(struct path *old_path, struct path *new_path)
 {
-	struct path old_path, parent_path;
+	struct path parent_path = {.mnt = NULL, .dentry = NULL};
 	struct mount *p;
 	struct mount *old;
 	struct mountpoint *mp;
 	int err;
-	if (!old_name || !*old_name)
-		return -EINVAL;
-	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
-	if (err)
-		return err;
 
-	mp = lock_mount(path);
+	mp = lock_mount(new_path);
 	err = PTR_ERR(mp);
 	if (IS_ERR(mp))
 		goto out;
 
-	old = real_mount(old_path.mnt);
-	p = real_mount(path->mnt);
+	old = real_mount(old_path->mnt);
+	p = real_mount(new_path->mnt);
 
 	err = -EINVAL;
 	if (!check_mnt(p) || !check_mnt(old))
 		goto out1;
 
-	if (old->mnt.mnt_flags & MNT_LOCKED)
+	if (!mnt_has_parent(old))
 		goto out1;
 
-	err = -EINVAL;
-	if (old_path.dentry != old_path.mnt->mnt_root)
+	if (old->mnt.mnt_flags & MNT_LOCKED)
 		goto out1;
 
-	if (!mnt_has_parent(old))
+	if (old_path->dentry != old_path->mnt->mnt_root)
 		goto out1;
 
-	if (d_is_dir(path->dentry) !=
-	      d_is_dir(old_path.dentry))
+	if (d_is_dir(new_path->dentry) !=
+	    d_is_dir(old_path->dentry))
 		goto out1;
 	/*
 	 * Don't move a mount residing in a shared parent.
@@ -2500,7 +2494,8 @@ static int do_move_mount(struct path *path, const char *old_name)
 		if (p == old)
 			goto out1;
 
-	err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
+	err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
+				   &parent_path);
 	if (err)
 		goto out1;
 
@@ -2512,6 +2507,22 @@ static int do_move_mount(struct path *path, const char *old_name)
 out:
 	if (!err)
 		path_put(&parent_path);
+	return err;
+}
+
+static int do_move_mount_old(struct path *path, const char *old_name)
+{
+	struct path old_path;
+	int err;
+
+	if (!old_name || !*old_name)
+		return -EINVAL;
+
+	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
+	if (err)
+		return err;
+
+	err = do_move_mount(&old_path, path);
 	path_put(&old_path);
 	return err;
 }
@@ -2931,7 +2942,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
 	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
 		retval = do_change_type(&path, flags);
 	else if (flags & MS_MOVE)
-		retval = do_move_mount(&path, dev_name);
+		retval = do_move_mount_old(&path, dev_name);
 	else
 		retval = do_new_mount(&path, type_page, sb_flags, mnt_flags,
 				      dev_name, data_page);
@@ -3166,6 +3177,61 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
 	return ksys_mount(dev_name, dir_name, type, flags, data);
 }
 
+/*
+ * Move a mount from one place to another.
+ *
+ * Note the flags value is a combination of MOVE_MOUNT_* flags.
+ */
+SYSCALL_DEFINE5(move_mount,
+		int, from_dfd, const char *, from_pathname,
+		int, to_dfd, const char *, to_pathname,
+		unsigned int, flags)
+{
+	struct path from_path, to_path;
+	unsigned int lflags;
+	int ret = 0;
+
+	if (!may_mount())
+		return -EPERM;
+
+	if (flags & ~MOVE_MOUNT__MASK)
+		return -EINVAL;
+
+	/* If someone gives a pathname, they aren't permitted to move
+	 * from an fd that requires unmount as we can't get at the flag
+	 * to clear it afterwards.
+	 */
+	lflags = 0;
+	if (flags & MOVE_MOUNT_F_SYMLINKS)	lflags |= LOOKUP_FOLLOW;
+	if (flags & MOVE_MOUNT_F_AUTOMOUNTS)	lflags |= LOOKUP_AUTOMOUNT;
+	if (flags & MOVE_MOUNT_F_EMPTY_PATH)	lflags |= LOOKUP_EMPTY;
+
+	ret = user_path_at(from_dfd, from_pathname, lflags, &from_path);
+	if (ret < 0)
+		return ret;
+
+	lflags = 0;
+	if (flags & MOVE_MOUNT_T_SYMLINKS)	lflags |= LOOKUP_FOLLOW;
+	if (flags & MOVE_MOUNT_T_AUTOMOUNTS)	lflags |= LOOKUP_AUTOMOUNT;
+	if (flags & MOVE_MOUNT_T_EMPTY_PATH)	lflags |= LOOKUP_EMPTY;
+
+	ret = user_path_at(to_dfd, to_pathname, lflags, &to_path);
+	if (ret < 0)
+		goto out_from;
+
+	ret = security_move_mount(&from_path, &to_path);
+	if (ret < 0)
+		goto out_to;
+
+	ret = do_move_mount(&from_path, &to_path);
+
+out_to:
+	path_put(&to_path);
+out_from:
+	path_put(&from_path);
+	return ret;
+}
+
 /*
  * Return true if path is reachable from root
  *
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 8f1131c8dd54..926607defd83 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -144,6 +144,10 @@
  *	Parse a string of security data filling in the opts structure
  *	@options string containing all mount options known by the LSM
  *	@opts binary data structure usable by the LSM
+ * @move_mount:
+ *	Check permission before a mount is moved.
+ *	@from_path indicates the mount that is going to be moved.
+ *	@to_path indicates the mountpoint that will be mounted upon.
  * @dentry_init_security:
  *	Compute a context for a dentry as the inode is not yet available
  *	since NFSv4 has no label backed by an EA anyway.
@@ -1475,6 +1479,7 @@ union security_list_options {
 					unsigned long kern_flags,
 					unsigned long *set_kern_flags);
 	int (*sb_parse_opts_str)(char *options, struct security_mnt_opts *opts);
+	int (*move_mount)(const struct path *from_path, const struct path *to_path);
 	int (*dentry_init_security)(struct dentry *dentry, int mode,
 					const struct qstr *name, void **ctx,
 					u32 *ctxlen);
@@ -1806,6 +1811,7 @@ struct security_hook_heads {
 	struct hlist_head sb_set_mnt_opts;
 	struct hlist_head sb_clone_mnt_opts;
 	struct hlist_head sb_parse_opts_str;
+	struct hlist_head move_mount;
 	struct hlist_head dentry_init_security;
 	struct hlist_head dentry_create_files_as;
 #ifdef CONFIG_SECURITY_PATH
diff --git a/include/linux/security.h b/include/linux/security.h
index 63030c85ee19..15d121f156b3 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -245,6 +245,7 @@ int security_sb_clone_mnt_opts(const struct super_block *oldsb,
 				unsigned long kern_flags,
 				unsigned long *set_kern_flags);
 int security_sb_parse_opts_str(char *options, struct security_mnt_opts *opts);
+int security_move_mount(const struct path *from_path, const struct path *to_path);
 int security_dentry_init_security(struct dentry *dentry, int mode,
 					const struct qstr *name, void **ctx,
 					u32 *ctxlen);
@@ -598,6 +599,12 @@ static inline int security_sb_parse_opts_str(char *options, struct security_mnt_
 	return 0;
 }
 
+static inline int security_move_mount(const struct path *from_path,
+				      const struct path *to_path)
+{
+	return 0;
+}
+
 static inline int security_inode_alloc(struct inode *inode)
 {
 	return 0;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3cc6b8f8bd2f..3c0855d9b105 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -901,6 +901,9 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
 			 int flags, uint32_t sig);
 asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags);
+asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path,
+			       int to_dfd, const char __user *to_path,
+			       unsigned int ms_flags);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index e8db2911adca..89adf0d731ab 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -7,4 +7,15 @@
 #define OPEN_TREE_CLONE		1		/* Clone the target tree and attach the clone */
 #define OPEN_TREE_CLOEXEC	O_CLOEXEC	/* Close the file on execve() */
 
+/*
+ * move_mount() flags.
+ */
+#define MOVE_MOUNT_F_SYMLINKS		0x00000001 /* Follow symlinks on from path */
+#define MOVE_MOUNT_F_AUTOMOUNTS		0x00000002 /* Follow automounts on from path */
+#define MOVE_MOUNT_F_EMPTY_PATH		0x00000004 /* Empty from path permitted */
+#define MOVE_MOUNT_T_SYMLINKS		0x00000010 /* Follow symlinks on to path */
+#define MOVE_MOUNT_T_AUTOMOUNTS		0x00000020 /* Follow automounts on to path */
+#define MOVE_MOUNT_T_EMPTY_PATH		0x00000040 /* Empty to path permitted */
+#define MOVE_MOUNT__MASK		0x00000077
+
 #endif /* _UAPI_LINUX_MOUNT_H */
diff --git a/security/security.c b/security/security.c
index 68f46d849abe..c4cbdb7d3a5f 100644
--- a/security/security.c
+++ b/security/security.c
@@ -437,6 +437,11 @@ int security_sb_parse_opts_str(char *options, struct security_mnt_opts *opts)
 }
 EXPORT_SYMBOL(security_sb_parse_opts_str);
 
+int security_move_mount(const struct path *from_path, const struct path *to_path)
+{
+	return call_int_hook(move_mount, 0, from_path, to_path);
+}
+
 int security_inode_alloc(struct inode *inode)
 {
 	inode->i_security = NULL;

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
       [not found] <153126248868.14533.9751473662727327569.stgit@warthog.procyon.org.uk>
  2018-07-10 22:41 ` [PATCH 01/32] vfs: syscall: Add open_tree(2) to reference or clone a mount [ver #9] David Howells
  2018-07-10 22:41 ` [PATCH 02/32] vfs: syscall: Add move_mount(2) to move mounts around " David Howells
@ 2018-07-10 22:44 ` David Howells
  2018-07-10 23:59   ` Andy Lutomirski
                     ` (4 more replies)
  2018-07-10 22:44 ` [PATCH 25/32] vfs: syscall: Add fsmount() to create a mount for a superblock " David Howells
                   ` (5 subsequent siblings)
  8 siblings, 5 replies; 75+ messages in thread
From: David Howells @ 2018-07-10 22:44 UTC (permalink / raw)
  To: viro; +Cc: dhowells, linux-api, linux-fsdevel, torvalds, linux-kernel

Provide an fsopen() system call that starts the process of preparing to
create a superblock that will then be mountable, using an fd as a context
handle.  fsopen() is given the name of the filesystem that will be used:

	int mfd = fsopen(const char *fsname, unsigned int flags);

where flags can be 0 or FSOPEN_CLOEXEC.

For example:

	sfd = fsopen("ext4", FSOPEN_CLOEXEC);
	write(sfd, "s /dev/sdb1"); // note I'm ignoring write's length arg
	write(sfd, "o noatime");
	write(sfd, "o acl");
	write(sfd, "o user_attr");
	write(sfd, "o iversion");
	write(sfd, "o ");
	write(sfd, "r /my/container"); // root inside the fs
	write(sfd, "x create"); // create the superblock
	fsinfo(sfd, NULL, ...); // query new superblock attributes
	mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_RELATIME);
	move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);

	sfd = fsopen("afs", -1);
	write(sfd, "s %grand.central.org:root.cell");
	write(sfd, "o cell=grand.central.org");
	write(sfd, "r /");
	write(sfd, "x create");
	mfd = fsmount(sfd, 0, MS_NODEV);
	move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);

If an error is reported at any step, an error message may be available to be
read() back (ENODATA will be reported if there isn't an error available) in
the form:

	"e <subsys>:<problem>"
	"e SELinux:Mount on mountpoint not permitted"

Once fsmount() has been called, further write() calls will incur EBUSY,
even if the fsmount() fails.  read() is still possible to retrieve error
information.

The fsopen() syscall creates a mount context and hangs it of the fd that it
returns.

Netlink is not used because it is optional and would make the core VFS
dependent on the networking layer and also potentially add network
namespace issues.

Note that, for the moment, the caller must have SYS_CAP_ADMIN to use
fsopen().

Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-api@vger.kernel.org
---

 arch/x86/entry/syscalls/syscall_32.tbl |    1 
 arch/x86/entry/syscalls/syscall_64.tbl |    1 
 fs/Makefile                            |    2 
 fs/fs_context.c                        |    4 +
 fs/fsopen.c                            |  209 ++++++++++++++++++++++++++++++++
 include/linux/fs_context.h             |    2 
 include/linux/syscalls.h               |    1 
 include/uapi/linux/fs.h                |    5 +
 8 files changed, 224 insertions(+), 1 deletion(-)
 create mode 100644 fs/fsopen.c

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 76d092b7d1b0..1647fefd2969 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -400,3 +400,4 @@
 386	i386	rseq			sys_rseq			__ia32_sys_rseq
 387	i386	open_tree		sys_open_tree			__ia32_sys_open_tree
 388	i386	move_mount		sys_move_mount			__ia32_sys_move_mount
+389	i386	fsopen			sys_fsopen			__ia32_sys_fsopen
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 37ba4e65eee6..235d33dbccb2 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -345,6 +345,7 @@
 334	common	rseq			__x64_sys_rseq
 335	common	open_tree		__x64_sys_open_tree
 336	common	move_mount		__x64_sys_move_mount
+337	common	fsopen			__x64_sys_fsopen
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/Makefile b/fs/Makefile
index 7e9ca59ac3a7..d3b33798998e 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -13,7 +13,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o splice.o sync.o utimes.o d_path.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
-		fs_context.o
+		fs_context.o fsopen.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
diff --git a/fs/fs_context.c b/fs/fs_context.c
index b7c84e0aa2f9..a2d745e6d356 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -251,6 +251,8 @@ struct fs_context *vfs_new_fs_context(struct file_system_type *fs_type,
 	fc->fs_type	= get_filesystem(fs_type);
 	fc->cred	= get_current_cred();
 
+	mutex_init(&fc->uapi_mutex);
+
 	switch (purpose) {
 	case FS_CONTEXT_FOR_KERNEL_MOUNT:
 		fc->sb_flags |= SB_KERNMOUNT;
@@ -335,6 +337,8 @@ struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc)
 	if (!fc)
 		return ERR_PTR(-ENOMEM);
 
+	mutex_init(&fc->uapi_mutex);
+
 	fc->fs_private	= NULL;
 	fc->s_fs_info	= NULL;
 	fc->source	= NULL;
diff --git a/fs/fsopen.c b/fs/fsopen.c
new file mode 100644
index 000000000000..28bb72bda163
--- /dev/null
+++ b/fs/fsopen.c
@@ -0,0 +1,209 @@
+/* Filesystem access-by-fd.
+ *
+ * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/fs_context.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/security.h>
+#include <linux/anon_inodes.h>
+#include "mount.h"
+
+/*
+ * Userspace writes configuration data and commands to the fd and we parse it
+ * here.  For the moment, we assume a single option or command per write.  Each
+ * line written is of the form
+ *
+ *	<command_type><space><stuff...>
+ *
+ *	s /dev/sda1				-- Source device
+ *	o noatime				-- Option without value
+ *	o cell=grand.central.org		-- Option with value
+ *	x create				-- Create a superblock
+ *	x reconfigure				-- Reconfigure a superblock
+ */
+static ssize_t fscontext_write(struct file *file,
+			       const char __user *_buf, size_t len, loff_t *pos)
+{
+	struct fs_context *fc = file->private_data;
+	char opt[2], *data;
+	ssize_t ret;
+
+	if (len < 3 || len > 4095)
+		return -EINVAL;
+
+	if (copy_from_user(opt, _buf, 2) != 0)
+		return -EFAULT;
+	switch (opt[0]) {
+	case 's':
+	case 'o':
+	case 'x':
+		break;
+	default:
+		return -EINVAL;
+	}
+	if (opt[1] != ' ')
+		return -EINVAL;
+
+	data = memdup_user_nul(_buf + 2, len - 2);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	/* From this point onwards we need to lock the fd against someone
+	 * trying to mount it.
+	 */
+	ret = mutex_lock_interruptible(&fc->uapi_mutex);
+	if (ret < 0)
+		goto err_free;
+
+	if (fc->phase == FS_CONTEXT_AWAITING_RECONF) {
+		if (fc->fs_type->init_fs_context) {
+			ret = fc->fs_type->init_fs_context(fc, fc->root);
+			if (ret < 0) {
+				fc->phase = FS_CONTEXT_FAILED;
+				goto err_unlock;
+			}
+		} else {
+			/* Leave legacy context ops in place */
+		}
+
+		/* Do the security check last because ->init_fs_context may
+		 * change the namespace subscriptions.
+		 */
+		ret = security_fs_context_alloc(fc, fc->root);
+		if (ret < 0) {
+			fc->phase = FS_CONTEXT_FAILED;
+			goto err_unlock;
+		}
+
+		fc->phase = FS_CONTEXT_RECONF_PARAMS;
+	}
+
+	ret = -EINVAL;
+	switch (opt[0]) {
+	case 's':
+		if (fc->phase != FS_CONTEXT_CREATE_PARAMS &&
+		    fc->phase != FS_CONTEXT_RECONF_PARAMS)
+			goto wrong_phase;
+		ret = vfs_set_fs_source(fc, data, len - 2);
+		if (ret < 0)
+			goto err_unlock;
+		break;
+
+	case 'o':
+		if (fc->phase != FS_CONTEXT_CREATE_PARAMS &&
+		    fc->phase != FS_CONTEXT_RECONF_PARAMS)
+			goto wrong_phase;
+		ret = vfs_parse_fs_option(fc, data, len - 2);
+		if (ret < 0)
+			goto err_unlock;
+		break;
+
+	case 'x':
+		if (strcmp(data, "create") == 0) {
+			if (fc->phase != FS_CONTEXT_CREATE_PARAMS)
+				goto wrong_phase;
+			fc->phase = FS_CONTEXT_CREATING;
+			ret = vfs_get_tree(fc);
+			if (ret == 0)
+				fc->phase = FS_CONTEXT_AWAITING_MOUNT;
+			else
+				fc->phase = FS_CONTEXT_FAILED;
+		} else {
+			ret = -EOPNOTSUPP;
+		}
+		if (ret < 0)
+			goto err_unlock;
+		break;
+
+	default:
+		goto err_unlock;
+	}
+
+	ret = len;
+err_unlock:
+	mutex_unlock(&fc->uapi_mutex);
+err_free:
+	kfree(data);
+	return ret;
+
+wrong_phase:
+	ret = -EBUSY;
+	goto err_unlock;
+}
+
+static int fscontext_release(struct inode *inode, struct file *file)
+{
+	struct fs_context *fc = file->private_data;
+
+	if (fc) {
+		file->private_data = NULL;
+		put_fs_context(fc);
+	}
+	return 0;
+}
+
+const struct file_operations fscontext_fs_fops = {
+	.write		= fscontext_write,
+	.release	= fscontext_release,
+	.llseek		= no_llseek,
+};
+
+/*
+ * Attach a filesystem context to a file and an fd.
+ */
+static int fscontext_create_fd(struct fs_context *fc, unsigned int o_flags)
+{
+	int fd;
+
+	fd = anon_inode_getfd("fscontext", &fscontext_fs_fops, fc,
+			      O_RDWR | o_flags);
+	if (fd < 0)
+		put_fs_context(fc);
+	return fd;
+}
+
+/*
+ * Open a filesystem by name so that it can be configured for mounting.
+ *
+ * We are allowed to specify a container in which the filesystem will be
+ * opened, thereby indicating which namespaces will be used (notably, which
+ * network namespace will be used for network filesystems).
+ */
+SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags)
+{
+	struct file_system_type *fs_type;
+	struct fs_context *fc;
+	const char *fs_name;
+
+	if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (flags & ~FSOPEN_CLOEXEC)
+		return -EINVAL;
+
+	fs_name = strndup_user(_fs_name, PAGE_SIZE);
+	if (IS_ERR(fs_name))
+		return PTR_ERR(fs_name);
+
+	fs_type = get_fs_type(fs_name);
+	kfree(fs_name);
+	if (!fs_type)
+		return -ENODEV;
+
+	fc = vfs_new_fs_context(fs_type, NULL, 0, FS_CONTEXT_FOR_USER_MOUNT);
+	put_filesystem(fs_type);
+	if (IS_ERR(fc))
+		return PTR_ERR(fc);
+
+	fc->phase = FS_CONTEXT_CREATE_PARAMS;
+	return fscontext_create_fd(fc, flags & FSOPEN_CLOEXEC ? O_CLOEXEC : 0);
+}
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index f157ff935a1e..387f25d7acc4 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -14,6 +14,7 @@
 
 #include <linux/kernel.h>
 #include <linux/errno.h>
+#include <linux/mutex.h>
 
 struct cred;
 struct dentry;
@@ -58,6 +59,7 @@ enum fs_context_phase {
  */
 struct fs_context {
 	const struct fs_context_operations *ops;
+	struct mutex		uapi_mutex;	/* Userspace access mutex */
 	struct file_system_type	*fs_type;
 	void			*fs_private;	/* The filesystem's context */
 	struct dentry		*root;		/* The root and superblock */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3c0855d9b105..ad6c7ff33c01 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -904,6 +904,7 @@ asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags);
 asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path,
 			       int to_dfd, const char __user *to_path,
 			       unsigned int ms_flags);
+asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 1c982eb44ff4..f8818e6cddd6 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -344,4 +344,9 @@ typedef int __bitwise __kernel_rwf_t;
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
 			 RWF_APPEND)
 
+/*
+ * Flags for fsopen() and co.
+ */
+#define FSOPEN_CLOEXEC		0x00000001
+
 #endif /* _UAPI_LINUX_FS_H */

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [PATCH 25/32] vfs: syscall: Add fsmount() to create a mount for a superblock [ver #9]
       [not found] <153126248868.14533.9751473662727327569.stgit@warthog.procyon.org.uk>
                   ` (2 preceding siblings ...)
  2018-07-10 22:44 ` [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation " David Howells
@ 2018-07-10 22:44 ` David Howells
  2018-07-10 22:44 ` [PATCH 26/32] vfs: syscall: Add fspick() to select a superblock for reconfiguration " David Howells
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 75+ messages in thread
From: David Howells @ 2018-07-10 22:44 UTC (permalink / raw)
  To: viro; +Cc: dhowells, linux-api, linux-fsdevel, torvalds, linux-kernel

Provide a system call by which a filesystem opened with fsopen() and
configured by a series of writes can be mounted:

	int ret = fsmount(int fsfd, unsigned int flags,
			  unsigned int ms_flags);

where fsfd is the file descriptor returned by fsopen().  flags can be 0 or
FSMOUNT_CLOEXEC.  ms_flags is a bitwise-OR of the following flags:

	MS_RDONLY
	MS_NOSUID
	MS_NODEV
	MS_NOEXEC
	MS_NOATIME
	MS_NODIRATIME
	MS_RELATIME
	MS_STRICTATIME

	MS_UNBINDABLE
	MS_PRIVATE
	MS_SLAVE
	MS_SHARED

In the event that fsmount() fails, it may be possible to get an error
message by calling read() on fsfd.  If no message is available, ENODATA
will be reported.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-api@vger.kernel.org
---

 arch/x86/entry/syscalls/syscall_32.tbl |    1 
 arch/x86/entry/syscalls/syscall_64.tbl |    1 
 fs/namespace.c                         |  140 +++++++++++++++++++++++++++++++-
 include/linux/fs_context.h             |    2 
 include/linux/syscalls.h               |    1 
 include/uapi/linux/fs.h                |    2 
 6 files changed, 143 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 1647fefd2969..537572098032 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -401,3 +401,4 @@
 387	i386	open_tree		sys_open_tree			__ia32_sys_open_tree
 388	i386	move_mount		sys_move_mount			__ia32_sys_move_mount
 389	i386	fsopen			sys_fsopen			__ia32_sys_fsopen
+390	i386	fsmount			sys_fsmount			__ia32_sys_fsmount
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 235d33dbccb2..47abbc2a2bbe 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -346,6 +346,7 @@
 335	common	open_tree		__x64_sys_open_tree
 336	common	move_mount		__x64_sys_move_mount
 337	common	fsopen			__x64_sys_fsopen
+338	common	fsmount			__x64_sys_fsmount
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/namespace.c b/fs/namespace.c
index d5a4d9351a17..a6fbfba8e448 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2503,7 +2503,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path)
 
 	attached = mnt_has_parent(old);
 	/*
-	 * We need to allow open_tree(OPEN_TREE_CLONE) followed by
+	 * We need to allow open_tree(OPEN_TREE_CLONE) or fsmount() followed by
 	 * move_mount(), but mustn't allow "/" to be moved.
 	 */
 	if (old->mnt_ns && !attached)
@@ -3347,9 +3347,141 @@ struct vfsmount *kern_mount(struct file_system_type *type)
 EXPORT_SYMBOL_GPL(kern_mount);
 
 /*
- * Move a mount from one place to another.
- * In combination with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be
- * used to copy a mount subtree.
+ * Create a kernel mount representation for a new, prepared superblock
+ * (specified by fs_fd) and attach to an open_tree-like file descriptor.
+ */
+SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, unsigned int, ms_flags)
+{
+	struct fs_context *fc;
+	struct file *file;
+	struct path newmount;
+	struct fd f;
+	unsigned int mnt_flags = 0;
+	long ret;
+
+	if (!may_mount())
+		return -EPERM;
+
+	if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
+		return -EINVAL;
+
+	if (ms_flags & ~(MS_RDONLY | MS_NOSUID | MS_NODEV | MS_NOEXEC |
+			 MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
+			 MS_STRICTATIME))
+		return -EINVAL;
+
+	if (ms_flags & MS_RDONLY)
+		mnt_flags |= MNT_READONLY;
+	if (ms_flags & MS_NOSUID)
+		mnt_flags |= MNT_NOSUID;
+	if (ms_flags & MS_NODEV)
+		mnt_flags |= MNT_NODEV;
+	if (ms_flags & MS_NOEXEC)
+		mnt_flags |= MNT_NOEXEC;
+	if (ms_flags & MS_NODIRATIME)
+		mnt_flags |= MNT_NODIRATIME;
+
+	if (ms_flags & MS_STRICTATIME) {
+		if (ms_flags & MS_NOATIME)
+			return -EINVAL;
+	} else if (ms_flags & MS_NOATIME) {
+		mnt_flags |= MNT_NOATIME;
+	} else {
+		mnt_flags |= MNT_RELATIME;
+	}
+
+	f = fdget(fs_fd);
+	if (!f.file)
+		return -EBADF;
+
+	ret = -EINVAL;
+	if (f.file->f_op != &fscontext_fs_fops)
+		goto err_fsfd;
+
+	fc = f.file->private_data;
+
+	/* There must be a valid superblock or we can't mount it */
+	ret = -EINVAL;
+	if (!fc->root)
+		goto err_fsfd;
+
+	ret = -EPERM;
+	if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
+		pr_warn("VFS: Mount too revealing\n");
+		goto err_fsfd;
+	}
+
+	ret = mutex_lock_interruptible(&fc->uapi_mutex);
+	if (ret < 0)
+		goto err_fsfd;
+
+	ret = -EBUSY;
+	if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
+		goto err_unlock;
+
+	ret = -EPERM;
+	if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock())
+		goto err_unlock;
+
+	newmount.mnt = vfs_create_mount(fc, mnt_flags);
+	if (IS_ERR(newmount.mnt)) {
+		ret = PTR_ERR(newmount.mnt);
+		goto err_unlock;
+	}
+	newmount.dentry = dget(fc->root);
+
+	/* We've done the mount bit - now move the file context into more or
+	 * less the same state as if we'd done an fspick().  We don't want to
+	 * do any memory allocation or anything like that at this point as we
+	 * don't want to have to handle any errors incurred.
+	 */
+	if (fc->ops && fc->ops->free)
+		fc->ops->free(fc);
+	fc->fs_private = NULL;
+	fc->s_fs_info = NULL;
+	fc->sb_flags = 0;
+	fc->sloppy = false;
+	fc->silent = false;
+	security_fs_context_free(fc);
+	fc->security = NULL;
+	kfree(fc->subtype);
+	fc->subtype = NULL;
+	kfree(fc->source);
+	fc->source = NULL;
+
+	fc->purpose = FS_CONTEXT_FOR_RECONFIGURE;
+	fc->phase = FS_CONTEXT_AWAITING_RECONF;
+
+	/* Attach to an apparent O_PATH fd with a note that we need to unmount
+	 * it, not just simply put it.
+	 */
+	file = dentry_open(&newmount, O_PATH, fc->cred);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto err_path;
+	}
+	file->f_mode |= FMODE_NEED_UNMOUNT;
+
+	ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
+	if (ret >= 0)
+		fd_install(ret, file);
+	else
+		fput(file);
+
+err_path:
+	path_put(&newmount);
+err_unlock:
+	mutex_unlock(&fc->uapi_mutex);
+err_fsfd:
+	fdput(f);
+	return ret;
+}
+
+/*
+ * Move a mount from one place to another.  In combination with
+ * fsopen()/fsmount() this is used to install a new mount and in combination
+ * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy
+ * a mount subtree.
  *
  * Note the flags value is a combination of MOVE_MOUNT_* flags.
  */
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 387f25d7acc4..2cde97490c6f 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -115,4 +115,6 @@ extern int vfs_get_super(struct fs_context *fc,
 			 int (*fill_super)(struct super_block *sb,
 					   struct fs_context *fc));
 
+extern const struct file_operations fscontext_fs_fops;
+
 #endif /* _LINUX_FS_CONTEXT_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index ad6c7ff33c01..917fe10e1030 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -905,6 +905,7 @@ asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path,
 			       int to_dfd, const char __user *to_path,
 			       unsigned int ms_flags);
 asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags);
+asmlinkage long sys_fsmount(int fs_fd, unsigned int flags, unsigned int ms_flags);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index f8818e6cddd6..30a2fb85c4b7 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -349,4 +349,6 @@ typedef int __bitwise __kernel_rwf_t;
  */
 #define FSOPEN_CLOEXEC		0x00000001
 
+#define FSMOUNT_CLOEXEC		0x00000001
+
 #endif /* _UAPI_LINUX_FS_H */

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [PATCH 26/32] vfs: syscall: Add fspick() to select a superblock for reconfiguration [ver #9]
       [not found] <153126248868.14533.9751473662727327569.stgit@warthog.procyon.org.uk>
                   ` (3 preceding siblings ...)
  2018-07-10 22:44 ` [PATCH 25/32] vfs: syscall: Add fsmount() to create a mount for a superblock " David Howells
@ 2018-07-10 22:44 ` David Howells
  2018-07-10 22:44 ` [PATCH 31/32] vfs: syscall: Add fsinfo() to query filesystem information " David Howells
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 75+ messages in thread
From: David Howells @ 2018-07-10 22:44 UTC (permalink / raw)
  To: viro; +Cc: dhowells, linux-api, linux-fsdevel, torvalds, linux-kernel

Provide an fspick() system call that can be used to pick an existing
mountpoint into an fs_context which can thereafter be used to reconfigure a
superblock (equivalent of the superblock side of -o remount).

This looks like:

	int fd = fspick(AT_FDCWD, "/mnt",
			FSPICK_CLOEXEC | FSPICK_NO_AUTOMOUNT);
	write(fd, "o intr");
	write(fd, "o noac");
	write(fd, "x reconfigure");

At the point of fspick being called, the file descriptor referring to the
filesystem context is in exactly the same state as the one that was created
by fsopen() after fsmount() has been successfully called.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-api@vger.kernel.org
---

 arch/x86/entry/syscalls/syscall_32.tbl |    1 +
 arch/x86/entry/syscalls/syscall_64.tbl |    1 +
 fs/fsopen.c                            |   53 ++++++++++++++++++++++++++++++++
 include/linux/syscalls.h               |    1 +
 include/uapi/linux/fs.h                |    5 +++
 5 files changed, 61 insertions(+)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 537572098032..5587bcede253 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -402,3 +402,4 @@
 388	i386	move_mount		sys_move_mount			__ia32_sys_move_mount
 389	i386	fsopen			sys_fsopen			__ia32_sys_fsopen
 390	i386	fsmount			sys_fsmount			__ia32_sys_fsmount
+391	i386	fspick			sys_fspick			__ia32_sys_fspick
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 47abbc2a2bbe..460a464024bf 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -347,6 +347,7 @@
 336	common	move_mount		__x64_sys_move_mount
 337	common	fsopen			__x64_sys_fsopen
 338	common	fsmount			__x64_sys_fsmount
+339	common	fspick			__x64_sys_fspick
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 28bb72bda163..35c2a94d0c68 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -15,6 +15,7 @@
 #include <linux/syscalls.h>
 #include <linux/security.h>
 #include <linux/anon_inodes.h>
+#include <linux/namei.h>
 #include "mount.h"
 
 /*
@@ -207,3 +208,55 @@ SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags)
 	fc->phase = FS_CONTEXT_CREATE_PARAMS;
 	return fscontext_create_fd(fc, flags & FSOPEN_CLOEXEC ? O_CLOEXEC : 0);
 }
+
+/*
+ * Pick a superblock into a context for reconfiguration.
+ */
+SYSCALL_DEFINE3(fspick, int, dfd, const char __user *, path, unsigned int, flags)
+{
+	struct fs_context *fc;
+	struct path target;
+	unsigned int lookup_flags;
+	int ret;
+
+	if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if ((flags & ~(FSPICK_CLOEXEC |
+		       FSPICK_SYMLINK_NOFOLLOW |
+		       FSPICK_NO_AUTOMOUNT |
+		       FSPICK_EMPTY_PATH)) != 0)
+		return -EINVAL;
+
+	lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
+	if (flags & FSPICK_SYMLINK_NOFOLLOW)
+		lookup_flags &= ~LOOKUP_FOLLOW;
+	if (flags & FSPICK_NO_AUTOMOUNT)
+		lookup_flags &= ~LOOKUP_AUTOMOUNT;
+	if (flags & FSPICK_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+	ret = user_path_at(dfd, path, lookup_flags, &target);
+	if (ret < 0)
+		goto err;
+
+	ret = -EOPNOTSUPP;
+	if (!target.dentry->d_sb->s_op->reconfigure)
+		goto err_path;
+
+	fc = vfs_new_fs_context(target.dentry->d_sb->s_type, target.dentry,
+				0, FS_CONTEXT_FOR_RECONFIGURE);
+	if (IS_ERR(fc)) {
+		ret = PTR_ERR(fc);
+		goto err_path;
+	}
+
+	fc->phase = FS_CONTEXT_RECONF_PARAMS;
+
+	path_put(&target);
+	return fscontext_create_fd(fc, flags & FSPICK_CLOEXEC ? O_CLOEXEC : 0);
+
+err_path:
+	path_put(&target);
+err:
+	return ret;
+}
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 917fe10e1030..ac803f5c0822 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -906,6 +906,7 @@ asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path,
 			       unsigned int ms_flags);
 asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags);
 asmlinkage long sys_fsmount(int fs_fd, unsigned int flags, unsigned int ms_flags);
+asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 30a2fb85c4b7..c27576d471c2 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -351,4 +351,9 @@ typedef int __bitwise __kernel_rwf_t;
 
 #define FSMOUNT_CLOEXEC		0x00000001
 
+#define FSPICK_CLOEXEC		0x00000001
+#define FSPICK_SYMLINK_NOFOLLOW	0x00000002
+#define FSPICK_NO_AUTOMOUNT	0x00000004
+#define FSPICK_EMPTY_PATH	0x00000008
+
 #endif /* _UAPI_LINUX_FS_H */

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [PATCH 31/32] vfs: syscall: Add fsinfo() to query filesystem information [ver #9]
       [not found] <153126248868.14533.9751473662727327569.stgit@warthog.procyon.org.uk>
                   ` (4 preceding siblings ...)
  2018-07-10 22:44 ` [PATCH 26/32] vfs: syscall: Add fspick() to select a superblock for reconfiguration " David Howells
@ 2018-07-10 22:44 ` David Howells
  2018-07-10 22:52 ` [MANPAGE PATCH] Add manpages for move_mount(2) and open_tree(2) David Howells
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 75+ messages in thread
From: David Howells @ 2018-07-10 22:44 UTC (permalink / raw)
  To: viro; +Cc: dhowells, linux-api, linux-fsdevel, torvalds, linux-kernel

Add a system call to allow filesystem information to be queried.  A request
value can be given to indicate the desired attribute.  Support is provided
for enumerating multi-value attributes.

===============
NEW SYSTEM CALL
===============

The new system call looks like:

	int ret = fsinfo(int dfd,
			 const char *filename,
			 const struct fsinfo_params *params,
			 void *buffer,
			 size_t buf_size);

The params parameter optionally points to a block of parameters:

	struct fsinfo_params {
		__u32	at_flags;
		__u32	request;
		__u32	Nth;
		__u32	Mth;
		__u32	__reserved[6];
	};

If params is NULL, it is assumed params->request should be
fsinfo_attr_statfs, params->Nth should be 0, params->Mth should be 0 and
params->at_flags should be 0.

If params is given, all of params->__reserved[] must be 0.

dfd, filename and params->at_flags indicate the file to query.  There is no
equivalent of lstat() as that can be emulated with fsinfo() by setting
AT_SYMLINK_NOFOLLOW in params->at_flags.  There is also no equivalent of
fstat() as that can be emulated by passing a NULL filename to fsinfo() with
the fd of interest in dfd.  AT_NO_AUTOMOUNT can also be used to an allow
automount point to be queried without triggering it.

params->request indicates the attribute/attributes to be queried.  This can
be one of:

	fsinfo_attr_statfs		- statfs-style info
	fsinfo_attr_fsinfo		- Information about fsinfo()
	fsinfo_attr_ids			- Filesystem IDs
	fsinfo_attr_limits		- Filesystem limits
	fsinfo_attr_supports		- What's supported in statx(), IOC flags
	fsinfo_attr_capabilities	- Filesystem capabilities
	fsinfo_attr_timestamp_info	- Inode timestamp info
	fsinfo_attr_volume_id		- Volume ID (string)
	fsinfo_attr_volume_uuid		- Volume UUID
	fsinfo_attr_volume_name		- Volume name (string)
	fsinfo_attr_cell_name		- Cell name (string)
	fsinfo_attr_domain_name		- Domain name (string)
	fsinfo_attr_realm_name		- Realm name (string)
	fsinfo_attr_server_name		- Name of the Nth server (string)
	fsinfo_attr_server_address	- Mth address of the Nth server
	fsinfo_attr_parameter		- Nth mount parameter (string)
	fsinfo_attr_source		- Nth mount source name (string)
	fsinfo_attr_name_encoding	- Filename encoding (string)
	fsinfo_attr_name_codepage	- Filename codepage (string)
	fsinfo_attr_io_size		- Optimal I/O sizes

Some attributes (such as the servers backing a network filesystem) can have
multiple values.  These can be enumerated by setting params->Nth and
params->Mth to 0, 1, ... until ENODATA is returned.

buffer and buf_size point to the reply buffer.  The buffer is filled up to
the specified size, even if this means truncating the reply.  The full size
of the reply is returned.  In future versions, this will allow extra fields
to be tacked on to the end of the reply, but anyone not expecting them will
only get the subset they're expecting.  If either buffer of buf_size are 0,
no copy will take place and the data size will be returned.

At the moment, this will only work on x86_64 and i386 as it requires the
system call to be wired up.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: linux-api@vger.kernel.org
---

 arch/x86/entry/syscalls/syscall_32.tbl |    1 
 arch/x86/entry/syscalls/syscall_64.tbl |    1 
 fs/statfs.c                            |  470 ++++++++++++++++++++++++++++
 include/linux/fs.h                     |    4 
 include/linux/fsinfo.h                 |   40 ++
 include/linux/syscalls.h               |    4 
 include/uapi/linux/fsinfo.h            |  237 ++++++++++++++
 samples/statx/Makefile                 |    5 
 samples/statx/test-fsinfo.c            |  539 ++++++++++++++++++++++++++++++++
 9 files changed, 1300 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/fsinfo.h
 create mode 100644 include/uapi/linux/fsinfo.h
 create mode 100644 samples/statx/test-fsinfo.c

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 5587bcede253..1c9b56f80cdf 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -403,3 +403,4 @@
 389	i386	fsopen			sys_fsopen			__ia32_sys_fsopen
 390	i386	fsmount			sys_fsmount			__ia32_sys_fsmount
 391	i386	fspick			sys_fspick			__ia32_sys_fspick
+392	i386	fsinfo			sys_fsinfo			__ia32_sys_fsinfo
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 460a464024bf..d2a4d6db4df6 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -348,6 +348,7 @@
 337	common	fsopen			__x64_sys_fsopen
 338	common	fsmount			__x64_sys_fsmount
 339	common	fspick			__x64_sys_fspick
+340	common	fsinfo			__x64_sys_fsinfo
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/statfs.c b/fs/statfs.c
index 5b2a24f0f263..fa6be965dce1 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -9,6 +9,7 @@
 #include <linux/security.h>
 #include <linux/uaccess.h>
 #include <linux/compat.h>
+#include <linux/fsinfo.h>
 #include "internal.h"
 
 static int flags_by_mnt(int mnt_flags)
@@ -384,3 +385,472 @@ COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u)
 	return 0;
 }
 #endif
+
+/*
+ * Get basic filesystem stats from statfs.
+ */
+static int fsinfo_generic_statfs(struct dentry *dentry,
+				 struct fsinfo_statfs *p)
+{
+	struct super_block *sb;
+	struct kstatfs buf;
+	int ret;
+
+	ret = statfs_by_dentry(dentry, &buf);
+	if (ret < 0)
+		return ret;
+
+	sb = dentry->d_sb;
+	p->f_blocks	= buf.f_blocks;
+	p->f_bfree	= buf.f_bfree;
+	p->f_bavail	= buf.f_bavail;
+	p->f_files	= buf.f_files;
+	p->f_ffree	= buf.f_ffree;
+	p->f_favail	= buf.f_ffree;
+	p->f_bsize	= buf.f_bsize;
+	p->f_frsize	= buf.f_frsize;
+	return sizeof(*p);
+}
+
+static int fsinfo_generic_ids(struct dentry *dentry,
+			      struct fsinfo_ids *p)
+{
+	struct super_block *sb;
+	struct kstatfs buf;
+	int ret;
+
+	ret = statfs_by_dentry(dentry, &buf);
+	if (ret < 0)
+		return ret;
+
+	sb = dentry->d_sb;
+	p->f_fstype	= sb->s_magic;
+	p->f_dev_major	= MAJOR(sb->s_dev);
+	p->f_dev_minor	= MINOR(sb->s_dev);
+	p->f_flags	= ST_VALID | flags_by_sb(sb->s_flags);
+
+	memcpy(&p->f_fsid, &buf.f_fsid, sizeof(p->f_fsid));
+	strcpy(p->f_fs_name, dentry->d_sb->s_type->name);
+	return sizeof(*p);
+}
+
+static int fsinfo_generic_limits(struct dentry *dentry,
+				 struct fsinfo_limits *lim)
+{
+	struct super_block *sb = dentry->d_sb;
+
+	lim->max_file_size = sb->s_maxbytes;
+	lim->max_hard_links = sb->s_max_links;
+	lim->max_uid = UINT_MAX;
+	lim->max_gid = UINT_MAX;
+	lim->max_projid = UINT_MAX;
+	lim->max_filename_len = NAME_MAX;
+	lim->max_symlink_len = PAGE_SIZE;
+	lim->max_xattr_name_len = XATTR_NAME_MAX;
+	lim->max_xattr_body_len = XATTR_SIZE_MAX;
+	lim->max_dev_major = 0xffffff;
+	lim->max_dev_minor = 0xff;
+	return sizeof(*lim);
+}
+
+static int fsinfo_generic_supports(struct dentry *dentry,
+				   struct fsinfo_supports *c)
+{
+	struct super_block *sb = dentry->d_sb;
+
+	c->stx_mask = STATX_BASIC_STATS;
+	if (sb->s_d_op && sb->s_d_op->d_automount)
+		c->stx_attributes |= STATX_ATTR_AUTOMOUNT;
+	return sizeof(*c);
+}
+
+static int fsinfo_generic_capabilities(struct dentry *dentry,
+				       struct fsinfo_capabilities *c)
+{
+	struct super_block *sb = dentry->d_sb;
+
+	if (sb->s_mtd)
+		fsinfo_set_cap(c, fsinfo_cap_is_flash_fs);
+	else if (sb->s_bdev)
+		fsinfo_set_cap(c, fsinfo_cap_is_block_fs);
+
+	if (sb->s_quota_types & QTYPE_MASK_USR)
+		fsinfo_set_cap(c, fsinfo_cap_user_quotas);
+	if (sb->s_quota_types & QTYPE_MASK_GRP)
+		fsinfo_set_cap(c, fsinfo_cap_group_quotas);
+	if (sb->s_quota_types & QTYPE_MASK_PRJ)
+		fsinfo_set_cap(c, fsinfo_cap_project_quotas);
+	if (sb->s_d_op && sb->s_d_op->d_automount)
+		fsinfo_set_cap(c, fsinfo_cap_automounts);
+	if (sb->s_id[0])
+		fsinfo_set_cap(c, fsinfo_cap_volume_id);
+
+	fsinfo_set_cap(c, fsinfo_cap_has_atime);
+	fsinfo_set_cap(c, fsinfo_cap_has_ctime);
+	fsinfo_set_cap(c, fsinfo_cap_has_mtime);
+	return sizeof(*c);
+}
+
+static int fsinfo_generic_timestamp_info(struct dentry *dentry,
+					 struct fsinfo_timestamp_info *ts)
+{
+	struct super_block *sb = dentry->d_sb;
+
+	/* If unset, assume 1s granularity */
+	u16 mantissa = 1;
+	s8 exponent = 0;
+
+	ts->minimum_timestamp = S64_MIN;
+	ts->maximum_timestamp = S64_MAX;
+	if (sb->s_time_gran < 1000000000) {
+		if (sb->s_time_gran < 1000)
+			exponent = -9;
+		else if (sb->s_time_gran < 1000000)
+			exponent = -6;
+		else
+			exponent = -3;
+	}
+#define set_gran(x)				\
+	do {					\
+		ts->x##_mantissa = mantissa;	\
+		ts->x##_exponent = exponent;	\
+	} while (0)
+	set_gran(atime_gran);
+	set_gran(btime_gran);
+	set_gran(ctime_gran);
+	set_gran(mtime_gran);
+	return sizeof(*ts);
+}
+
+static int fsinfo_generic_volume_uuid(struct dentry *dentry,
+				      struct fsinfo_volume_uuid *vu)
+{
+	struct super_block *sb = dentry->d_sb;
+
+	memcpy(vu, &sb->s_uuid, sizeof(*vu));
+	return sizeof(*vu);
+}
+
+static int fsinfo_generic_volume_id(struct dentry *dentry, char *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	size_t len = strlen(sb->s_id);
+
+	if (buf)
+		memcpy(buf, sb->s_id, len + 1);
+	return len;
+}
+
+static int fsinfo_generic_name_encoding(struct dentry *dentry, char *buf)
+{
+	static const char encoding[] = "utf8";
+
+	if (buf)
+		memcpy(buf, encoding, sizeof(encoding) - 1);
+	return sizeof(encoding) - 1;
+}
+
+static int fsinfo_generic_io_size(struct dentry *dentry,
+				  struct fsinfo_io_size *c)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct kstatfs buf;
+	int ret;
+
+	if (sb->s_op->statfs == simple_statfs) {
+		c->block_size = PAGE_SIZE;
+		c->max_single_read_size = 0;
+		c->max_single_write_size = 0;
+		c->best_read_size = PAGE_SIZE;
+		c->best_write_size = PAGE_SIZE;
+	} else {
+		ret = statfs_by_dentry(dentry, &buf);
+		if (ret < 0)
+			return ret;
+		c->block_size = buf.f_bsize;
+		c->max_single_read_size = buf.f_bsize;
+		c->max_single_write_size = buf.f_bsize;
+		c->best_read_size = PAGE_SIZE;
+		c->best_write_size = PAGE_SIZE;
+	}
+	return sizeof(*c);
+}
+
+/*
+ * Implement some queries generically from stuff in the superblock.
+ */
+int generic_fsinfo(struct dentry *dentry, struct fsinfo_kparams *params)
+{
+#define _gen(X) fsinfo_attr_##X: return fsinfo_generic_##X(dentry, params->buffer)
+
+	switch (params->request) {
+	case _gen(statfs);
+	case _gen(ids);
+	case _gen(limits);
+	case _gen(supports);
+	case _gen(capabilities);
+	case _gen(timestamp_info);
+	case _gen(volume_uuid);
+	case _gen(volume_id);
+	case _gen(name_encoding);
+	case _gen(io_size);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+EXPORT_SYMBOL(generic_fsinfo);
+
+/*
+ * Retrieve the filesystem info.  We make some stuff up if the operation is not
+ * supported.
+ */
+int vfs_fsinfo(const struct path *path, struct fsinfo_kparams *params)
+{
+	struct dentry *dentry = path->dentry;
+	int (*get_fsinfo)(struct dentry *, struct fsinfo_kparams *);
+	int ret;
+
+	if (params->request == fsinfo_attr_fsinfo) {
+		struct fsinfo_fsinfo *info = params->buffer;
+
+		info->max_attr	= fsinfo_attr__nr;
+		info->max_cap	= fsinfo_cap__nr;
+		return sizeof(*info);
+	}
+
+	get_fsinfo = dentry->d_sb->s_op->get_fsinfo;
+	if (!get_fsinfo) {
+		if (!dentry->d_sb->s_op->statfs)
+			return -EOPNOTSUPP;
+		get_fsinfo = generic_fsinfo;
+	}
+
+	ret = security_sb_statfs(dentry);
+	if (ret)
+		return ret;
+
+	ret = get_fsinfo(dentry, params);
+	if (ret < 0)
+		return ret;
+
+	if (params->request == fsinfo_attr_ids &&
+	    params->buffer) {
+		struct fsinfo_ids *p = params->buffer;
+
+		p->f_flags |= flags_by_mnt(path->mnt->mnt_flags);
+	}
+	return ret;
+}
+
+static int vfs_fsinfo_path(int dfd, const char __user *filename,
+			   struct fsinfo_kparams *params)
+{
+	struct path path;
+	unsigned lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
+	int ret = -EINVAL;
+
+	if ((params->at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
+				 AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
+
+	if (params->at_flags & AT_SYMLINK_NOFOLLOW)
+		lookup_flags &= ~LOOKUP_FOLLOW;
+	if (params->at_flags & AT_NO_AUTOMOUNT)
+		lookup_flags &= ~LOOKUP_AUTOMOUNT;
+	if (params->at_flags & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+
+retry:
+	ret = user_path_at(dfd, filename, lookup_flags, &path);
+	if (ret)
+		goto out;
+
+	ret = vfs_fsinfo(&path, params);
+	path_put(&path);
+	if (retry_estale(ret, lookup_flags)) {
+		lookup_flags |= LOOKUP_REVAL;
+		goto retry;
+	}
+out:
+	return ret;
+}
+
+static int vfs_fsinfo_fd(unsigned int fd, struct fsinfo_kparams *params)
+{
+	struct fd f = fdget_raw(fd);
+	int ret = -EBADF;
+
+	if (f.file) {
+		ret = vfs_fsinfo(&f.file->f_path, params);
+		fdput(f);
+	}
+	return ret;
+}
+
+/*
+ * Return buffer information by requestable attribute.
+ *
+ * STRUCT indicates a fixed-size structure with only one instance.
+ * STRUCT_N indicates a fixed-size structure that may have multiple instances.
+ * STRING indicates a string with only one instance.
+ * STRING_N indicates a string that may have multiple instances.
+ * STRUCT_ARRAY indicates an array of fixed-size structs with only one instance.
+ * STRUCT_ARRAY_N as above that may have multiple instances.
+ *
+ * If an entry is marked STRUCT, STRUCT_N or STRUCT_NM then if no buffer is
+ * supplied to sys_fsinfo(), sys_fsinfo() will handle returning the buffer size
+ * without calling vfs_fsinfo() and the filesystem.
+ *
+ * No struct may have more than 252 bytes (ie. 0x3f * 4)
+ */
+#define FSINFO_STRING(N)	 [fsinfo_attr_##N] = 0x0000
+#define FSINFO_STRUCT(N)	 [fsinfo_attr_##N] = sizeof(struct fsinfo_##N)
+#define FSINFO_STRING_N(N)	 [fsinfo_attr_##N] = 0x4000
+#define FSINFO_STRUCT_N(N)	 [fsinfo_attr_##N] = 0x4000 | sizeof(struct fsinfo_##N)
+#define FSINFO_STRUCT_NM(N)	 [fsinfo_attr_##N] = 0x8000 | sizeof(struct fsinfo_##N)
+static const u16 fsinfo_buffer_sizes[fsinfo_attr__nr] = {
+	FSINFO_STRUCT		(statfs),
+	FSINFO_STRUCT		(fsinfo),
+	FSINFO_STRUCT		(ids),
+	FSINFO_STRUCT		(limits),
+	FSINFO_STRUCT		(capabilities),
+	FSINFO_STRUCT		(supports),
+	FSINFO_STRUCT		(timestamp_info),
+	FSINFO_STRING		(volume_id),
+	FSINFO_STRUCT		(volume_uuid),
+	FSINFO_STRING		(volume_name),
+	FSINFO_STRING		(cell_name),
+	FSINFO_STRING		(domain_name),
+	FSINFO_STRING		(realm_name),
+	FSINFO_STRING_N		(server_name),
+	FSINFO_STRUCT_NM	(server_address),
+	FSINFO_STRING_N		(parameter),
+	FSINFO_STRING_N		(source),
+	FSINFO_STRING		(name_encoding),
+	FSINFO_STRING		(name_codepage),
+	FSINFO_STRUCT		(io_size),
+};
+
+/**
+ * sys_fsinfo - System call to get filesystem information
+ * @dfd: Base directory to pathwalk from or fd referring to filesystem.
+ * @filename: Filesystem to query or NULL.
+ * @_params: Parameters to define request (or NULL for enhanced statfs).
+ * @_buffer: Result buffer.
+ * @buf_size: Size of result buffer.
+ *
+ * Get information on a filesystem.  The filesystem attribute to be queried is
+ * indicated by @_params->request, and some of the attributes can have multiple
+ * values, indexed by @_params->Nth and @_params->Mth.  If @_params is NULL,
+ * then the 0th fsinfo_attr_statfs attribute is queried.  If an attribute does
+ * not exist, EOPNOTSUPP is returned; if the Nth,Mth value does not exist,
+ * ENODATA is returned.
+ *
+ * On success, the size of the attribute's value is returned.  If @buf_size is
+ * 0 or @_buffer is NULL, only the size is returned.  If the size of the value
+ * is larger than @buf_size, it will be truncated by the copy.  If the size of
+ * the value is smaller than @buf_size then the excess buffer space will be
+ * cleared.  The full size of the value will be returned, irrespective of how
+ * much data is actually placed in the buffer.
+ */
+SYSCALL_DEFINE5(fsinfo,
+		int, dfd, const char __user *, filename,
+		struct fsinfo_params __user *, _params,
+		void __user *, _buffer, size_t, buf_size)
+{
+	struct fsinfo_params user_params;
+	struct fsinfo_kparams params;
+	size_t size;
+	int ret;
+
+	if (_params) {
+		if (copy_from_user(&user_params, _params, sizeof(user_params)))
+			return -EFAULT;
+		if (user_params.__reserved[0] ||
+		    user_params.__reserved[1] ||
+		    user_params.__reserved[2] ||
+		    user_params.__reserved[3] ||
+		    user_params.__reserved[4] ||
+		    user_params.__reserved[5])
+			return -EINVAL;
+		if (user_params.request >= fsinfo_attr__nr)
+			return -EOPNOTSUPP;
+		params.at_flags = user_params.at_flags;
+		params.request = user_params.request;
+		params.Nth = user_params.Nth;
+		params.Mth = user_params.Mth;
+	} else {
+		params.at_flags = 0;
+		params.request = fsinfo_attr_statfs;
+		params.Nth = 0;
+		params.Mth = 0;
+	}
+
+	if (!_buffer || !buf_size) {
+		buf_size = 0;
+		_buffer = NULL;
+	}
+
+	/* Allocate an appropriately-sized buffer.  We will truncate the
+	 * contents when we write the contents back to userspace.
+	 */
+	size = fsinfo_buffer_sizes[params.request];
+	switch (size & 0xc000) {
+	case 0x0000:
+		if (params.Nth != 0)
+			return -ENODATA;
+		/* Fall through */
+	case 0x4000:
+		if (params.Mth != 0)
+			return -ENODATA;
+		/* Fall through */
+	case 0x8000:
+		break;
+	case 0xc000:
+		return -ENOBUFS;
+	}
+
+	size &= ~0xc000;
+	if (size == 0) {
+		size = 4096; /* String */
+	} else {
+		if (buf_size == 0)
+			return size; /* We know how big the buffer should be */
+
+		/* Clear any part of the buffer that we won't fill. */
+		if (buf_size > size &&
+		    clear_user(_buffer, buf_size) != 0)
+			return -EFAULT;
+	}
+
+	if (buf_size > 0) {
+		params.buf_size = size;
+		params.buffer = kzalloc(size, GFP_KERNEL);
+		if (!params.buffer)
+			return -ENOMEM;
+	} else {
+		params.buf_size = 0;
+		params.buffer = NULL;
+	}
+
+	if (filename)
+		ret = vfs_fsinfo_path(dfd, filename, &params);
+	else
+		ret = vfs_fsinfo_fd(dfd, &params);
+	if (ret < 0)
+		goto error;
+
+	if (ret == 0) {
+		ret = -ENODATA;
+		goto error;
+	}
+
+	if (buf_size > ret)
+		buf_size = ret;
+
+	if (copy_to_user(_buffer, params.buffer, buf_size))
+		ret = -EFAULT;
+error:
+	kfree(params.buffer);
+	return ret;
+}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e6d963f2fdc2..bcbe94c0dfe8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -62,6 +62,8 @@ struct iov_iter;
 struct fscrypt_info;
 struct fscrypt_operations;
 struct fs_context;
+struct fsinfo_kparams;
+enum fsinfo_attribute;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -1847,6 +1849,7 @@ struct super_operations {
 	int (*thaw_super) (struct super_block *);
 	int (*unfreeze_fs) (struct super_block *);
 	int (*statfs) (struct dentry *, struct kstatfs *);
+	int (*get_fsinfo) (struct dentry *, struct fsinfo_kparams *);
 	int (*remount_fs) (struct super_block *, int *, char *, size_t);
 	int (*reconfigure) (struct super_block *, struct fs_context *);
 	void (*umount_begin) (struct super_block *);
@@ -2223,6 +2226,7 @@ extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
 extern int vfs_statfs(const struct path *, struct kstatfs *);
 extern int user_statfs(const char __user *, struct kstatfs *);
 extern int fd_statfs(int, struct kstatfs *);
+extern int vfs_fsinfo(const struct path *, struct fsinfo_kparams *);
 extern int freeze_super(struct super_block *super);
 extern int thaw_super(struct super_block *super);
 extern bool our_mnt(struct vfsmount *mnt);
diff --git a/include/linux/fsinfo.h b/include/linux/fsinfo.h
new file mode 100644
index 000000000000..c356391b4b2a
--- /dev/null
+++ b/include/linux/fsinfo.h
@@ -0,0 +1,40 @@
+/* Filesystem information query
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _LINUX_FSINFO_H
+#define _LINUX_FSINFO_H
+
+#include <uapi/linux/fsinfo.h>
+
+struct fsinfo_kparams {
+	__u32			at_flags;	/* AT_SYMLINK_NOFOLLOW and similar */
+	enum fsinfo_attribute	request;	/* What is being asking for */
+	__u32			Nth;		/* Instance of it (some may have multiple) */
+	__u32			Mth;		/* Subinstance */
+	void			*buffer;	/* Where to place the reply */
+	size_t			buf_size;	/* Size of the buffer */
+};
+
+extern int generic_fsinfo(struct dentry *, struct fsinfo_kparams *);
+
+static inline void fsinfo_set_cap(struct fsinfo_capabilities *c,
+				  enum fsinfo_capability cap)
+{
+	c->capabilities[cap / 8] |= 1 << (cap % 8);
+}
+
+static inline void fsinfo_clear_cap(struct fsinfo_capabilities *c,
+				    enum fsinfo_capability cap)
+{
+	c->capabilities[cap / 8] &= ~(1 << (cap % 8));
+}
+
+#endif /* _LINUX_FSINFO_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index ac803f5c0822..da3575dded79 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -49,6 +49,7 @@ struct stat64;
 struct statfs;
 struct statfs64;
 struct statx;
+struct fsinfo_params;
 struct __sysctl_args;
 struct sysinfo;
 struct timespec;
@@ -907,6 +908,9 @@ asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path,
 asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags);
 asmlinkage long sys_fsmount(int fs_fd, unsigned int flags, unsigned int ms_flags);
 asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags);
+asmlinkage long sys_fsinfo(int dfd, const char __user *path,
+			   struct fsinfo_params __user *params,
+			   void __user *buffer, size_t buf_size);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/linux/fsinfo.h b/include/uapi/linux/fsinfo.h
new file mode 100644
index 000000000000..f2bc5130544d
--- /dev/null
+++ b/include/uapi/linux/fsinfo.h
@@ -0,0 +1,237 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* fsinfo() definitions.
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+#ifndef _UAPI_LINUX_FSINFO_H
+#define _UAPI_LINUX_FSINFO_H
+
+#include <linux/types.h>
+#include <linux/socket.h>
+
+/*
+ * The filesystem attributes that can be requested.  Note that some attributes
+ * may have multiple instances which can be switched in the parameter block.
+ */
+enum fsinfo_attribute {
+	fsinfo_attr_statfs		= 0,	/* statfs()-style state */
+	fsinfo_attr_fsinfo		= 1,	/* Information about fsinfo() */
+	fsinfo_attr_ids			= 2,	/* Filesystem IDs */
+	fsinfo_attr_limits		= 3,	/* Filesystem limits */
+	fsinfo_attr_supports		= 4,	/* What's supported in statx, iocflags, ... */
+	fsinfo_attr_capabilities	= 5,	/* Filesystem capabilities (bits) */
+	fsinfo_attr_timestamp_info	= 6,	/* Inode timestamp info */
+	fsinfo_attr_volume_id		= 7,	/* Volume ID (string) */
+	fsinfo_attr_volume_uuid		= 8,	/* Volume UUID (LE uuid) */
+	fsinfo_attr_volume_name		= 9,	/* Volume name (string) */
+	fsinfo_attr_cell_name		= 10,	/* Cell name (string) */
+	fsinfo_attr_domain_name		= 11,	/* Domain name (string) */
+	fsinfo_attr_realm_name		= 12,	/* Realm name (string) */
+	fsinfo_attr_server_name		= 13,	/* Name of the Nth server */
+	fsinfo_attr_server_address	= 14,	/* Mth address of the Nth server */
+	fsinfo_attr_parameter		= 15,	/* Nth mount parameter (string) */
+	fsinfo_attr_source		= 16,	/* Nth mount source name (string) */
+	fsinfo_attr_name_encoding	= 17,	/* Filename encoding (string) */
+	fsinfo_attr_name_codepage	= 18,	/* Filename codepage (string) */
+	fsinfo_attr_io_size		= 19,	/* Optimal I/O sizes */
+	fsinfo_attr__nr
+};
+
+/*
+ * Optional fsinfo() parameter structure.
+ *
+ * If this is not given, it is assumed that fsinfo_attr_statfs instance 0,0 is
+ * desired.
+ */
+struct fsinfo_params {
+	__u32	at_flags;	/* AT_SYMLINK_NOFOLLOW and similar flags */
+	__u32	request;	/* What is being asking for (enum fsinfo_attribute) */
+	__u32	Nth;		/* Instance of it (some may have multiple) */
+	__u32	Mth;		/* Subinstance of Nth instance */
+	__u32	__reserved[6];	/* Reserved params; all must be 0 */
+};
+
+/*
+ * Information struct for fsinfo(fsinfo_attr_statfs).
+ * - This gives extended filesystem information.
+ */
+struct fsinfo_statfs {
+	__u64	f_blocks;	/* Total number of blocks in fs */
+	__u64	f_bfree;	/* Total number of free blocks */
+	__u64	f_bavail;	/* Number of free blocks available to ordinary user */
+	__u64	f_files;	/* Total number of file nodes in fs */
+	__u64	f_ffree;	/* Number of free file nodes */
+	__u64	f_favail;	/* Number of free file nodes available to ordinary user */
+	__u32	f_bsize;	/* Optimal block size */
+	__u32	f_frsize;	/* Fragment size */
+};
+
+/*
+ * Information struct for fsinfo(fsinfo_attr_ids).
+ *
+ * List of basic identifiers as is normally found in statfs().
+ */
+struct fsinfo_ids {
+	char	f_fs_name[15 + 1];
+	__u64	f_flags;	/* Filesystem mount flags (MS_*) */
+	__u64	f_fsid;		/* Short 64-bit Filesystem ID (as statfs) */
+	__u64	f_sb_id;	/* Internal superblock ID for sbnotify()/mntnotify() */
+	__u32	f_fstype;	/* Filesystem type from linux/magic.h [uncond] */
+	__u32	f_dev_major;	/* As st_dev_* from struct statx [uncond] */
+	__u32	f_dev_minor;
+};
+
+/*
+ * Information struct for fsinfo(fsinfo_attr_limits).
+ *
+ * List of supported filesystem limits.
+ */
+struct fsinfo_limits {
+	__u64	max_file_size;			/* Maximum file size */
+	__u64	max_uid;			/* Maximum UID supported */
+	__u64	max_gid;			/* Maximum GID supported */
+	__u64	max_projid;			/* Maximum project ID supported */
+	__u32	max_dev_major;			/* Maximum device major representable */
+	__u32	max_dev_minor;			/* Maximum device minor representable */
+	__u32	max_hard_links;			/* Maximum number of hard links on a file */
+	__u32	max_xattr_body_len;		/* Maximum xattr content length */
+	__u32	max_xattr_name_len;		/* Maximum xattr name length */
+	__u32	max_filename_len;		/* Maximum filename length */
+	__u32	max_symlink_len;		/* Maximum symlink content length */
+	__u32	__reserved[1];
+};
+
+/*
+ * Information struct for fsinfo(fsinfo_attr_supports).
+ *
+ * What's supported in various masks, such as statx() attribute and mask bits
+ * and IOC flags.
+ */
+struct fsinfo_supports {
+	__u64	stx_attributes;		/* What statx::stx_attributes are supported */
+	__u32	stx_mask;		/* What statx::stx_mask bits are supported */
+	__u32	ioc_flags;		/* What FS_IOC_* flags are supported */
+	__u32	win_file_attrs;		/* What DOS/Windows FILE_* attributes are supported */
+	__u32	__reserved[1];
+};
+
+/*
+ * Information struct for fsinfo(fsinfo_attr_capabilities).
+ *
+ * Bitmask indicating filesystem capabilities where renderable as single bits.
+ */
+enum fsinfo_capability {
+	fsinfo_cap_is_kernel_fs		= 0,	/* fs is kernel-special filesystem */
+	fsinfo_cap_is_block_fs		= 1,	/* fs is block-based filesystem */
+	fsinfo_cap_is_flash_fs		= 2,	/* fs is flash filesystem */
+	fsinfo_cap_is_network_fs	= 3,	/* fs is network filesystem */
+	fsinfo_cap_is_automounter_fs	= 4,	/* fs is automounter special filesystem */
+	fsinfo_cap_automounts		= 5,	/* fs supports automounts */
+	fsinfo_cap_adv_locks		= 6,	/* fs supports advisory file locking */
+	fsinfo_cap_mand_locks		= 7,	/* fs supports mandatory file locking */
+	fsinfo_cap_leases		= 8,	/* fs supports file leases */
+	fsinfo_cap_uids			= 9,	/* fs supports numeric uids */
+	fsinfo_cap_gids			= 10,	/* fs supports numeric gids */
+	fsinfo_cap_projids		= 11,	/* fs supports numeric project ids */
+	fsinfo_cap_id_names		= 12,	/* fs supports user names */
+	fsinfo_cap_id_guids		= 13,	/* fs supports user guids */
+	fsinfo_cap_windows_attrs	= 14,	/* fs has windows attributes */
+	fsinfo_cap_user_quotas		= 15,	/* fs has per-user quotas */
+	fsinfo_cap_group_quotas		= 16,	/* fs has per-group quotas */
+	fsinfo_cap_project_quotas	= 17,	/* fs has per-project quotas */
+	fsinfo_cap_xattrs		= 18,	/* fs has xattrs */
+	fsinfo_cap_journal		= 19,	/* fs has a journal */
+	fsinfo_cap_data_is_journalled	= 20,	/* fs is using data journalling */
+	fsinfo_cap_o_sync		= 21,	/* fs supports O_SYNC */
+	fsinfo_cap_o_direct		= 22,	/* fs supports O_DIRECT */
+	fsinfo_cap_volume_id		= 23,	/* fs has a volume ID */
+	fsinfo_cap_volume_uuid		= 24,	/* fs has a volume UUID */
+	fsinfo_cap_volume_name		= 25,	/* fs has a volume name */
+	fsinfo_cap_volume_fsid		= 26,	/* fs has a volume FSID */
+	fsinfo_cap_cell_name		= 27,	/* fs has a cell name */
+	fsinfo_cap_domain_name		= 28,	/* fs has a domain name */
+	fsinfo_cap_realm_name		= 29,	/* fs has a realm name */
+	fsinfo_cap_iver_all_change	= 30,	/* i_version represents data + meta changes */
+	fsinfo_cap_iver_data_change	= 31,	/* i_version represents data changes only */
+	fsinfo_cap_iver_mono_incr	= 32,	/* i_version incremented monotonically */
+	fsinfo_cap_symlinks		= 33,	/* fs supports symlinks */
+	fsinfo_cap_hard_links		= 34,	/* fs supports hard links */
+	fsinfo_cap_hard_links_1dir	= 35,	/* fs supports hard links in same dir only */
+	fsinfo_cap_device_files		= 36,	/* fs supports bdev, cdev */
+	fsinfo_cap_unix_specials	= 37,	/* fs supports pipe, fifo, socket */
+	fsinfo_cap_resource_forks	= 38,	/* fs supports resource forks/streams */
+	fsinfo_cap_name_case_indep	= 39,	/* Filename case independence is mandatory */
+	fsinfo_cap_name_non_utf8	= 40,	/* fs has non-utf8 names */
+	fsinfo_cap_name_has_codepage	= 41,	/* fs has a filename codepage */
+	fsinfo_cap_sparse		= 42,	/* fs supports sparse files */
+	fsinfo_cap_not_persistent	= 43,	/* fs is not persistent */
+	fsinfo_cap_no_unix_mode		= 44,	/* fs does not support unix mode bits */
+	fsinfo_cap_has_atime		= 45,	/* fs supports access time */
+	fsinfo_cap_has_btime		= 46,	/* fs supports birth/creation time */
+	fsinfo_cap_has_ctime		= 47,	/* fs supports change time */
+	fsinfo_cap_has_mtime		= 48,	/* fs supports modification time */
+	fsinfo_cap__nr
+};
+
+struct fsinfo_capabilities {
+	__u8	capabilities[(fsinfo_cap__nr + 7) / 8];
+};
+
+/*
+ * Information struct for fsinfo(fsinfo_attr_timestamp_info).
+ */
+struct fsinfo_timestamp_info {
+	__s64	minimum_timestamp;	/* Minimum timestamp value in seconds */
+	__s64	maximum_timestamp;	/* Maximum timestamp value in seconds */
+	__u16	atime_gran_mantissa;	/* Granularity(secs) = mant * 10^exp */
+	__u16	btime_gran_mantissa;
+	__u16	ctime_gran_mantissa;
+	__u16	mtime_gran_mantissa;
+	__s8	atime_gran_exponent;
+	__s8	btime_gran_exponent;
+	__s8	ctime_gran_exponent;
+	__s8	mtime_gran_exponent;
+	__u32	__reserved[1];
+};
+
+/*
+ * Information struct for fsinfo(fsinfo_attr_volume_uuid).
+ */
+struct fsinfo_volume_uuid {
+	__u8	uuid[16];
+};
+
+/*
+ * Information struct for fsinfo(fsinfo_attr_server_addresses).
+ *
+ * Find the Mth address of the Nth server for a network mount.
+ */
+struct fsinfo_server_address {
+	struct __kernel_sockaddr_storage address;
+};
+
+/*
+ * Information struct for fsinfo(fsinfo_attr_io_size).
+ *
+ * Retrieve the optimal I/O size for a filesystem.
+ */
+struct fsinfo_io_size {
+	__u32		block_size;		/* Minimum block granularity for O_DIRECT */
+	__u32		max_single_read_size;	/* Maximum size of a single unbuffered read */
+	__u32		max_single_write_size;	/* Maximum size of a single unbuffered write */
+	__u32		best_read_size;		/* Optimal read size */
+	__u32		best_write_size;	/* Optimal write size */
+};
+
+/*
+ * Information struct for fsinfo(fsinfo_attr_fsinfo).
+ *
+ * This gives information about fsinfo() itself.
+ */
+struct fsinfo_fsinfo {
+	__u32	max_attr;	/* Number of supported attributes (fsinfo_attr__nr) */
+	__u32	max_cap;	/* Number of supported capabilities (fsinfo_cap__nr) */
+};
+
+#endif /* _UAPI_LINUX_FSINFO_H */
diff --git a/samples/statx/Makefile b/samples/statx/Makefile
index 59df7c25a9d1..9cb9a88e3a10 100644
--- a/samples/statx/Makefile
+++ b/samples/statx/Makefile
@@ -1,7 +1,10 @@
 # List of programs to build
-hostprogs-$(CONFIG_SAMPLE_STATX) := test-statx
+hostprogs-$(CONFIG_SAMPLE_STATX) := test-statx test-fsinfo
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 
 HOSTCFLAGS_test-statx.o += -I$(objtree)/usr/include
+
+HOSTCFLAGS_test-fsinfo.o += -I$(objtree)/usr/include
+HOSTLOADLIBES_test-fsinfo += -lm
diff --git a/samples/statx/test-fsinfo.c b/samples/statx/test-fsinfo.c
new file mode 100644
index 000000000000..9e9fa62a3b9f
--- /dev/null
+++ b/samples/statx/test-fsinfo.c
@@ -0,0 +1,539 @@
+/* Test the fsinfo() system call
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define _GNU_SOURCE
+#define _ATFILE_SOURCE
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <errno.h>
+#include <time.h>
+#include <math.h>
+#include <fcntl.h>
+#include <sys/syscall.h>
+#include <linux/fsinfo.h>
+#include <linux/socket.h>
+#include <sys/stat.h>
+
+static __attribute__((unused))
+ssize_t fsinfo(int dfd, const char *filename, struct fsinfo_params *params,
+	       void *buffer, size_t buf_size)
+{
+	return syscall(__NR_fsinfo, dfd, filename, params, buffer, buf_size);
+}
+
+#define FSINFO_STRING(N)	 [fsinfo_attr_##N] = 0x00
+#define FSINFO_STRUCT(N)	 [fsinfo_attr_##N] = sizeof(struct fsinfo_##N)/sizeof(__u32)
+#define FSINFO_STRING_N(N)	 [fsinfo_attr_##N] = 0x40
+#define FSINFO_STRUCT_N(N)	 [fsinfo_attr_##N] = 0x40 | sizeof(struct fsinfo_##N)/sizeof(__u32)
+#define FSINFO_STRUCT_NM(N)	 [fsinfo_attr_##N] = 0x80 | sizeof(struct fsinfo_##N)/sizeof(__u32)
+static const __u8 fsinfo_buffer_sizes[fsinfo_attr__nr] = {
+	FSINFO_STRUCT		(statfs),
+	FSINFO_STRUCT		(fsinfo),
+	FSINFO_STRUCT		(ids),
+	FSINFO_STRUCT		(limits),
+	FSINFO_STRUCT		(supports),
+	FSINFO_STRUCT		(capabilities),
+	FSINFO_STRUCT		(timestamp_info),
+	FSINFO_STRING		(volume_id),
+	FSINFO_STRUCT		(volume_uuid),
+	FSINFO_STRING		(volume_name),
+	FSINFO_STRING		(cell_name),
+	FSINFO_STRING		(domain_name),
+	FSINFO_STRING		(realm_name),
+	FSINFO_STRING_N		(server_name),
+	FSINFO_STRUCT_NM	(server_address),
+	FSINFO_STRING_N		(parameter),
+	FSINFO_STRING_N		(source),
+	FSINFO_STRING		(name_encoding),
+	FSINFO_STRING		(name_codepage),
+	FSINFO_STRUCT		(io_size),
+};
+
+#define FSINFO_NAME(N) [fsinfo_attr_##N] = #N
+static const char *fsinfo_attr_names[fsinfo_attr__nr] = {
+	FSINFO_NAME(statfs),
+	FSINFO_NAME(fsinfo),
+	FSINFO_NAME(ids),
+	FSINFO_NAME(limits),
+	FSINFO_NAME(supports),
+	FSINFO_NAME(capabilities),
+	FSINFO_NAME(timestamp_info),
+	FSINFO_NAME(volume_id),
+	FSINFO_NAME(volume_uuid),
+	FSINFO_NAME(volume_name),
+	FSINFO_NAME(cell_name),
+	FSINFO_NAME(domain_name),
+	FSINFO_NAME(realm_name),
+	FSINFO_NAME(server_name),
+	FSINFO_NAME(server_address),
+	FSINFO_NAME(parameter),
+	FSINFO_NAME(source),
+	FSINFO_NAME(name_encoding),
+	FSINFO_NAME(name_codepage),
+	FSINFO_NAME(io_size),
+};
+
+union reply {
+	char buffer[4096];
+	struct fsinfo_statfs statfs;
+	struct fsinfo_fsinfo fsinfo;
+	struct fsinfo_ids ids;
+	struct fsinfo_limits limits;
+	struct fsinfo_supports supports;
+	struct fsinfo_capabilities caps;
+	struct fsinfo_timestamp_info timestamps;
+	struct fsinfo_volume_uuid uuid;
+	struct fsinfo_server_address srv_addr;
+	struct fsinfo_io_size io_size;
+};
+
+static void dump_hex(unsigned int *data, int from, int to)
+{
+	unsigned offset, print_offset = 1, col = 0;
+
+	from /= 4;
+	to = (to + 3) / 4;
+
+	for (offset = from; offset < to; offset++) {
+		if (print_offset) {
+			printf("%04x: ", offset * 8);
+			print_offset = 0;
+		}
+		printf("%08x", data[offset]);
+		col++;
+		if ((col & 3) == 0) {
+			printf("\n");
+			print_offset = 1;
+		} else {
+			printf(" ");
+		}
+	}
+
+	if (!print_offset)
+		printf("\n");
+}
+
+static void dump_attr_statfs(union reply *r, int size)
+{
+	struct fsinfo_statfs *f = &r->statfs;
+
+	printf("\n");
+	printf("\tblocks: n=%llu fr=%llu av=%llu\n",
+	       (unsigned long long)f->f_blocks,
+	       (unsigned long long)f->f_bfree,
+	       (unsigned long long)f->f_bavail);
+
+	printf("\tfiles : n=%llu fr=%llu av=%llu\n",
+	       (unsigned long long)f->f_files,
+	       (unsigned long long)f->f_ffree,
+	       (unsigned long long)f->f_favail);
+	printf("\tbsize : %u\n", f->f_bsize);
+	printf("\tfrsize: %u\n", f->f_frsize);
+}
+
+static void dump_attr_fsinfo(union reply *r, int size)
+{
+	struct fsinfo_fsinfo *f = &r->fsinfo;
+
+	printf("max_attr=%u max_cap=%u\n", f->max_attr, f->max_cap);
+}
+
+static void dump_attr_ids(union reply *r, int size)
+{
+	struct fsinfo_ids *f = &r->ids;
+
+	printf("\n");
+	printf("\tdev   : %02x:%02x\n", f->f_dev_major, f->f_dev_minor);
+	printf("\tfs    : type=%x name=%s\n", f->f_fstype, f->f_fs_name);
+	printf("\tflags : %llx\n", (unsigned long long)f->f_flags);
+	printf("\tfsid  : %llx\n", (unsigned long long)f->f_fsid);
+}
+
+static void dump_attr_limits(union reply *r, int size)
+{
+	struct fsinfo_limits *f = &r->limits;
+
+	printf("\n");
+	printf("\tmax file size: %llx\n", f->max_file_size);
+	printf("\tmax ids      : u=%llx g=%llx p=%llx\n",
+	       f->max_uid, f->max_gid, f->max_projid);
+	printf("\tmax dev      : maj=%x min=%x\n",
+	       f->max_dev_major, f->max_dev_minor);
+	printf("\tmax links    : %x\n", f->max_hard_links);
+	printf("\tmax xattr    : n=%x b=%x\n",
+	       f->max_xattr_name_len, f->max_xattr_body_len);
+	printf("\tmax len      : file=%x sym=%x\n",
+	       f->max_filename_len, f->max_symlink_len);
+}
+
+static void dump_attr_supports(union reply *r, int size)
+{
+	struct fsinfo_supports *f = &r->supports;
+
+	printf("\n");
+	printf("\tstx_attr=%llx\n", f->stx_attributes);
+	printf("\tstx_mask=%x\n", f->stx_mask);
+	printf("\tioc_flags=%x\n", f->ioc_flags);
+	printf("\twin_fattrs=%x\n", f->win_file_attrs);
+}
+
+#define FSINFO_CAP_NAME(C) [fsinfo_cap_##C] = #C
+static const char *fsinfo_cap_names[fsinfo_cap__nr] = {
+	FSINFO_CAP_NAME(is_kernel_fs),
+	FSINFO_CAP_NAME(is_block_fs),
+	FSINFO_CAP_NAME(is_flash_fs),
+	FSINFO_CAP_NAME(is_network_fs),
+	FSINFO_CAP_NAME(is_automounter_fs),
+	FSINFO_CAP_NAME(automounts),
+	FSINFO_CAP_NAME(adv_locks),
+	FSINFO_CAP_NAME(mand_locks),
+	FSINFO_CAP_NAME(leases),
+	FSINFO_CAP_NAME(uids),
+	FSINFO_CAP_NAME(gids),
+	FSINFO_CAP_NAME(projids),
+	FSINFO_CAP_NAME(id_names),
+	FSINFO_CAP_NAME(id_guids),
+	FSINFO_CAP_NAME(windows_attrs),
+	FSINFO_CAP_NAME(user_quotas),
+	FSINFO_CAP_NAME(group_quotas),
+	FSINFO_CAP_NAME(project_quotas),
+	FSINFO_CAP_NAME(xattrs),
+	FSINFO_CAP_NAME(journal),
+	FSINFO_CAP_NAME(data_is_journalled),
+	FSINFO_CAP_NAME(o_sync),
+	FSINFO_CAP_NAME(o_direct),
+	FSINFO_CAP_NAME(volume_id),
+	FSINFO_CAP_NAME(volume_uuid),
+	FSINFO_CAP_NAME(volume_name),
+	FSINFO_CAP_NAME(volume_fsid),
+	FSINFO_CAP_NAME(cell_name),
+	FSINFO_CAP_NAME(domain_name),
+	FSINFO_CAP_NAME(realm_name),
+	FSINFO_CAP_NAME(iver_all_change),
+	FSINFO_CAP_NAME(iver_data_change),
+	FSINFO_CAP_NAME(iver_mono_incr),
+	FSINFO_CAP_NAME(symlinks),
+	FSINFO_CAP_NAME(hard_links),
+	FSINFO_CAP_NAME(hard_links_1dir),
+	FSINFO_CAP_NAME(device_files),
+	FSINFO_CAP_NAME(unix_specials),
+	FSINFO_CAP_NAME(resource_forks),
+	FSINFO_CAP_NAME(name_case_indep),
+	FSINFO_CAP_NAME(name_non_utf8),
+	FSINFO_CAP_NAME(name_has_codepage),
+	FSINFO_CAP_NAME(sparse),
+	FSINFO_CAP_NAME(not_persistent),
+	FSINFO_CAP_NAME(no_unix_mode),
+	FSINFO_CAP_NAME(has_atime),
+	FSINFO_CAP_NAME(has_btime),
+	FSINFO_CAP_NAME(has_ctime),
+	FSINFO_CAP_NAME(has_mtime),
+};
+
+static void dump_attr_capabilities(union reply *r, int size)
+{
+	struct fsinfo_capabilities *f = &r->caps;
+	int i;
+
+	for (i = 0; i < sizeof(f->capabilities); i++)
+		printf("%02x", f->capabilities[i]);
+	printf("\n");
+	for (i = 0; i < fsinfo_cap__nr; i++)
+		if (f->capabilities[i / 8] & (1 << (i % 8)))
+			printf("\t- %s\n", fsinfo_cap_names[i]);
+}
+
+static void dump_attr_timestamp_info(union reply *r, int size)
+{
+	struct fsinfo_timestamp_info *f = &r->timestamps;
+
+	printf("range=%llx-%llx\n",
+	       (unsigned long long)f->minimum_timestamp,
+	       (unsigned long long)f->maximum_timestamp);
+
+#define print_time(G) \
+	printf("\t"#G"time : gran=%gs\n",			\
+	       (f->G##time_gran_mantissa *		\
+		pow(10., f->G##time_gran_exponent)))
+	print_time(a);
+	print_time(b);
+	print_time(c);
+	print_time(m);
+}
+
+static void dump_attr_volume_uuid(union reply *r, int size)
+{
+	struct fsinfo_volume_uuid *f = &r->uuid;
+
+	printf("%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x"
+	       "-%02x%02x%02x%02x%02x%02x\n",
+	       f->uuid[ 0], f->uuid[ 1],
+	       f->uuid[ 2], f->uuid[ 3],
+	       f->uuid[ 4], f->uuid[ 5],
+	       f->uuid[ 6], f->uuid[ 7],
+	       f->uuid[ 8], f->uuid[ 9],
+	       f->uuid[10], f->uuid[11],
+	       f->uuid[12], f->uuid[13],
+	       f->uuid[14], f->uuid[15]);
+}
+
+static void dump_attr_server_address(union reply *r, int size)
+{
+	struct fsinfo_server_address *f = &r->srv_addr;
+
+	printf("family=%u\n", f->address.ss_family);
+}
+
+static void dump_attr_io_size(union reply *r, int size)
+{
+	struct fsinfo_io_size *f = &r->io_size;
+
+	printf("bs=%u\n", f->block_size);
+}
+
+/*
+ *
+ */
+typedef void (*dumper_t)(union reply *r, int size);
+
+#define FSINFO_DUMPER(N) [fsinfo_attr_##N] = dump_attr_##N
+static const dumper_t fsinfo_attr_dumper[fsinfo_attr__nr] = {
+	FSINFO_DUMPER(statfs),
+	FSINFO_DUMPER(fsinfo),
+	FSINFO_DUMPER(ids),
+	FSINFO_DUMPER(limits),
+	FSINFO_DUMPER(supports),
+	FSINFO_DUMPER(capabilities),
+	FSINFO_DUMPER(timestamp_info),
+	FSINFO_DUMPER(volume_uuid),
+	FSINFO_DUMPER(server_address),
+	FSINFO_DUMPER(io_size),
+};
+
+static void dump_fsinfo(enum fsinfo_attribute attr, __u8 about,
+			union reply *r, int size)
+{
+	dumper_t dumper = fsinfo_attr_dumper[attr];
+	unsigned int len;
+
+	if (!dumper) {
+		printf("<no dumper>\n");
+		return;
+	}
+
+	len = (about & 0x3f) * sizeof(__u32);
+	if (size < len) {
+		printf("<short data %u/%u>\n", size, len);
+		return;
+	}
+
+	dumper(r, size);
+}
+
+/*
+ * Try one subinstance of an attribute.
+ */
+static int try_one(const char *file, struct fsinfo_params *params, bool raw)
+{
+	union reply r;
+	char *p;
+	int ret;
+	__u8 about;
+
+	memset(&r.buffer, 0xbd, sizeof(r.buffer));
+
+	errno = 0;
+	ret = fsinfo(AT_FDCWD, file, params, r.buffer, sizeof(r.buffer));
+	if (params->request >= fsinfo_attr__nr) {
+		if (ret == -1 && errno == EOPNOTSUPP)
+			exit(0);
+		fprintf(stderr, "Unexpected error for too-large command %u: %m\n",
+			params->request);
+		exit(1);
+	}
+
+	//printf("fsinfo(%s,%s,%u,%u) = %d: %m\n",
+	//       file, fsinfo_attr_names[params->request],
+	//       params->Nth, params->Mth, ret);
+
+	about = fsinfo_buffer_sizes[params->request];
+	if (ret == -1) {
+		if (errno == ENODATA) {
+			switch (about & 0xc0) {
+			case 0x00:
+				if (params->Nth == 0 && params->Mth == 0) {
+					fprintf(stderr,
+						"Unexpected ENODATA1 (%u[%u][%u])\n",
+						params->request, params->Nth, params->Mth);
+					exit(1);
+				}
+				break;
+			case 0x40:
+				if (params->Nth == 0 && params->Mth == 0) {
+					fprintf(stderr,
+						"Unexpected ENODATA2 (%u[%u][%u])\n",
+						params->request, params->Nth, params->Mth);
+					exit(1);
+				}
+				break;
+			}
+			return (params->Mth == 0) ? 2 : 1;
+		}
+		if (errno == EOPNOTSUPP) {
+			if (params->Nth > 0 || params->Mth > 0) {
+				fprintf(stderr,
+					"Should return -ENODATA (%u[%u][%u])\n",
+					params->request, params->Nth, params->Mth);
+				exit(1);
+			}
+			//printf("\e[33m%s\e[m: <not supported>\n",
+			//       fsinfo_attr_names[attr]);
+			return 2;
+		}
+		perror(file);
+		exit(1);
+	}
+
+	if (raw) {
+		if (ret > 4096)
+			ret = 4096;
+		dump_hex((unsigned int *)&r.buffer, 0, ret);
+		return 0;
+	}
+
+	switch (about & 0xc0) {
+	case 0x00:
+		printf("\e[33m%s\e[m: ",
+		       fsinfo_attr_names[params->request]);
+		break;
+	case 0x40:
+		printf("\e[33m%s[%u]\e[m: ",
+		       fsinfo_attr_names[params->request],
+		       params->Nth);
+		break;
+	case 0x80:
+		printf("\e[33m%s[%u][%u]\e[m: ",
+		       fsinfo_attr_names[params->request],
+		       params->Nth, params->Mth);
+		break;
+	}
+
+	switch (about) {
+		/* Struct */
+	case 0x01 ... 0x3f:
+	case 0x41 ... 0x7f:
+	case 0x81 ... 0xbf:
+		dump_fsinfo(params->request, about, &r, ret);
+		return 0;
+
+		/* String */
+	case 0x00:
+	case 0x40:
+	case 0x80:
+		if (ret >= 4096) {
+			ret = 4096;
+			r.buffer[4092] = '.';
+			r.buffer[4093] = '.';
+			r.buffer[4094] = '.';
+			r.buffer[4095] = 0;
+		} else {
+			r.buffer[ret] = 0;
+		}
+		for (p = r.buffer; *p; p++) {
+			if (!isprint(*p)) {
+				printf("<non-printable>\n");
+				continue;
+			}
+		}
+		printf("%s\n", r.buffer);
+		return 0;
+
+	default:
+		fprintf(stderr, "Fishy about %u %02x\n", params->request, about);
+		exit(1);
+	}
+}
+
+/*
+ *
+ */
+int main(int argc, char **argv)
+{
+	struct fsinfo_params params = {
+		.at_flags = AT_SYMLINK_NOFOLLOW,
+	};
+	unsigned int attr;
+	int raw = 0, opt, Nth, Mth;
+
+	while ((opt = getopt(argc, argv, "alr"))) {
+		switch (opt) {
+		case 'a':
+			params.at_flags |= AT_NO_AUTOMOUNT;
+			continue;
+		case 'l':
+			params.at_flags &= ~AT_SYMLINK_NOFOLLOW;
+			continue;
+		case 'r':
+			raw = 1;
+			continue;
+		}
+		break;
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc != 1) {
+		printf("Format: test-fsinfo [-alr] <file>\n");
+		exit(2);
+	}
+
+	for (attr = 0; attr <= fsinfo_attr__nr; attr++) {
+		Nth = 0;
+		do {
+			Mth = 0;
+			do {
+				params.request = attr;
+				params.Nth = Nth;
+				params.Mth = Mth;
+
+				switch (try_one(argv[0], &params, raw)) {
+				case 0:
+					continue;
+				case 1:
+					goto done_M;
+				case 2:
+					goto done_N;
+				}
+			} while (++Mth < 100);
+
+		done_M:
+			if (Mth >= 100) {
+				fprintf(stderr, "Fishy: Mth == %u\n", Mth);
+				break;
+			}
+
+		} while (++Nth < 100);
+
+	done_N:
+		if (Nth >= 100) {
+			fprintf(stderr, "Fishy: Nth == %u\n", Nth);
+			break;
+		}
+	}
+
+	return 0;
+}

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [MANPAGE PATCH] Add manpages for move_mount(2) and open_tree(2)
       [not found] <153126248868.14533.9751473662727327569.stgit@warthog.procyon.org.uk>
                   ` (5 preceding siblings ...)
  2018-07-10 22:44 ` [PATCH 31/32] vfs: syscall: Add fsinfo() to query filesystem information " David Howells
@ 2018-07-10 22:52 ` David Howells
  2019-10-09  9:51   ` Michael Kerrisk (man-pages)
  2018-07-10 22:54 ` [MANPAGE PATCH] Add manpage for fsopen(2), fspick(2) and fsmount(2) David Howells
  2018-07-10 22:55 ` [MANPAGE PATCH] Add manpage for fsinfo(2) David Howells
  8 siblings, 1 reply; 75+ messages in thread
From: David Howells @ 2018-07-10 22:52 UTC (permalink / raw)
  To: Michael Kerrisk
  Cc: dhowells, viro, linux-api, linux-fsdevel, torvalds, linux-kernel,
	linux-man

Add manual pages to document the move_mount and open_tree() system calls.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 man2/move_mount.2 |  274 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 man2/open_tree.2  |  260 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 534 insertions(+)
 create mode 100644 man2/move_mount.2
 create mode 100644 man2/open_tree.2

diff --git a/man2/move_mount.2 b/man2/move_mount.2
new file mode 100644
index 000000000..3a819fb84
--- /dev/null
+++ b/man2/move_mount.2
@@ -0,0 +1,274 @@
+'\" t
+.\" Copyright (c) 2018 David Howells <dhowells@redhat.com>
+.\"
+.\" %%%LICENSE_START(VERBATIM)
+.\" Permission is granted to make and distribute verbatim copies of this
+.\" manual provided the copyright notice and this permission notice are
+.\" preserved on all copies.
+.\"
+.\" Permission is granted to copy and distribute modified versions of this
+.\" manual under the conditions for verbatim copying, provided that the
+.\" entire resulting derived work is distributed under the terms of a
+.\" permission notice identical to this one.
+.\"
+.\" Since the Linux kernel and libraries are constantly changing, this
+.\" manual page may be incorrect or out-of-date.  The author(s) assume no
+.\" responsibility for errors or omissions, or for damages resulting from
+.\" the use of the information contained herein.  The author(s) may not
+.\" have taken the same level of care in the production of this manual,
+.\" which is licensed free of charge, as they might when working
+.\" professionally.
+.\"
+.\" Formatted or processed versions of this manual, if unaccompanied by
+.\" the source, must acknowledge the copyright and authors of this work.
+.\" %%%LICENSE_END
+.\"
+.TH MOVE_MOUNT 2 2018-06-08 "Linux" "Linux Programmer's Manual"
+.SH NAME
+move_mount \- Move mount objects around the filesystem topology
+.SH SYNOPSIS
+.nf
+.B #include <sys/types.h>
+.br
+.B #include <sys/mount.h>
+.br
+.B #include <unistd.h>
+.br
+.BR "#include <fcntl.h>           " "/* Definition of AT_* constants */"
+.PP
+.BI "int move_mount(int " from_dirfd ", const char *" from_pathname ","
+.BI "               int " to_dirfd ", const char *" to_pathname ","
+.BI "               unsigned int " flags );
+.fi
+.PP
+.IR Note :
+There are no glibc wrappers for these system calls.
+.SH DESCRIPTION
+The
+.BR move_mount ()
+call moves a mount from one place to another; it can also be used to attach an
+unattached mount created by
+.BR fsmount "() or " open_tree "() with " OPEN_TREE_CLONE .
+.PP
+If
+.BR move_mount ()
+is called repeatedly with a file descriptor that refers to a mount object,
+then the object will be attached/moved the first time and then moved again and
+again and again, detaching it from the previous mountpoint each time.
+.PP
+To access the source mount object or the destination mountpoint, no
+permissions are required on the object itself, but if either pathname is
+supplied, execute (search) permission is required on all of the directories
+specified in
+.IR from_pathname " or " to_pathname .
+.PP
+The caller does, however, require the appropriate capabilities or permission
+to effect a mount.
+.PP
+.BR move_mount ()
+uses
+.IR from_pathname ", " from_dirfd " and some " flags
+to locate the mount object to be moved and
+.IR to_pathname ", " to_dirfd " and some other " flags
+to locate the destination mountpoint.  Each lookup can be done in one of a
+variety of ways:
+.TP
+[*] By absolute path.
+The pathname points to an absolute path and the dirfd is ignored.  The file is
+looked up by name, starting from the root of the filesystem as seen by the
+calling process.
+.TP
+[*] By cwd-relative path.
+The pathname points to a relative path and the dirfd is
+.IR AT_FDCWD .
+The file is looked up by name, starting from the current working directory.
+.TP
+[*] By dir-relative path.
+The pathname points to relative path and the dirfd indicates a file descriptor
+pointing to a directory.  The file is looked up by name, starting from the
+directory specified by
+.IR dirfd .
+.TP
+[*] By file descriptor.
+The pathname points to "", the dirfd points directly to the mount object to
+move or the destination mount point and the appropriate
+.B *_EMPTY_PATH
+flag is set.
+.PP
+.I flags
+can be used to influence a path-based lookup.  A value for
+.I flags
+is constructed by OR'ing together zero or more of the following constants:
+.TP
+.BR MOVE_MOUNT_F_EMPTY_PATH
+.\" commit 65cfc6722361570bfe255698d9cd4dccaf47570d
+If
+.I from_pathname
+is an empty string, operate on the file referred to by
+.IR from_dirfd
+(which may have been obtained using the
+.BR open (2)
+.B O_PATH
+flag or
+.BR open_tree ())
+If
+.I from_dirfd
+is
+.BR AT_FDCWD ,
+the call operates on the current working directory.
+In this case,
+.I from_dirfd
+can refer to any type of file, not just a directory.
+This flag is Linux-specific; define
+.B _GNU_SOURCE
+.\" Before glibc 2.16, defining _ATFILE_SOURCE sufficed
+to obtain its definition.
+.TP
+.B MOVE_MOUNT_T_EMPTY_PATH
+As above, but operating on
+.IR to_pathname " and " to_dirfd .
+.TP
+.B MOVE_MOUNT_F_NO_AUTOMOUNT
+Don't automount the terminal ("basename") component of
+.I from_pathname
+if it is a directory that is an automount point.  This allows a mount object
+that has an automount point at its root to be moved and prevents unintended
+triggering of an automount point.
+The
+.B MOVE_MOUNT_F_NO_AUTOMOUNT
+flag has no effect if the automount point has already been mounted over.  This
+flag is Linux-specific; define
+.B _GNU_SOURCE
+.\" Before glibc 2.16, defining _ATFILE_SOURCE sufficed
+to obtain its definition.
+.TP
+.B MOVE_MOUNT_T_NO_AUTOMOUNT
+As above, but operating on
+.IR to_pathname " and " to_dirfd .
+This allows an automount point to be manually mounted over.
+.TP
+.B MOVE_MOUNT_F_SYMLINKS
+If
+.I from_pathname
+is a symbolic link, then dereference it.  The default for
+.BR move_mount ()
+is to not follow symlinks.
+.TP
+.B MOVE_MOUNT_T_SYMLINKS
+As above, but operating on
+.IR to_pathname " and " to_dirfd .
+
+.SH EXAMPLES
+The
+.BR move_mount ()
+function can be used like the following:
+.PP
+.RS
+.nf
+move_mount(AT_FDCWD, "/a", AT_FDCWD, "/b", 0);
+.fi
+.RE
+.PP
+This would move the object mounted on "/a" to "/b".  It can also be used in
+conjunction with
+.BR open_tree "(2) or " open "(2) with " O_PATH :
+.PP
+.RS
+.nf
+fd = open_tree(AT_FDCWD, "/mnt", 0);
+move_mount(fd, "", AT_FDCWD, "/mnt2", MOVE_MOUNT_F_EMPTY_PATH);
+move_mount(fd, "", AT_FDCWD, "/mnt3", MOVE_MOUNT_F_EMPTY_PATH);
+move_mount(fd, "", AT_FDCWD, "/mnt4", MOVE_MOUNT_F_EMPTY_PATH);
+.fi
+.RE
+.PP
+This would attach the path point for "/mnt" to fd, then it would move the
+mount to "/mnt2", then move it to "/mnt3" and finally to "/mnt4".
+.PP
+It can also be used to attach new mounts:
+.PP
+.RS
+.nf
+sfd = fsopen("ext4", FSOPEN_CLOEXEC);
+write(sfd, "s /dev/sda1");
+write(sfd, "o user_xattr");
+mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_NODEV);
+move_mount(mfd, "", AT_FDCWD, "/home", MOVE_MOUNT_F_EMPTY_PATH);
+.fi
+.RE
+.PP
+Which would open the Ext4 filesystem mounted on "/dev/sda1", turn on user
+extended attribute support and create a mount object for it.  Finally, the new
+mount object would be attached with
+.BR move_mount ()
+to "/home".
+
+
+.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.SH RETURN VALUE
+On success, 0 is returned.  On error, \-1 is returned, and
+.I errno
+is set appropriately.
+.SH ERRORS
+.TP
+.B EACCES
+Search permission is denied for one of the directories
+in the path prefix of
+.IR pathname .
+(See also
+.BR path_resolution (7).)
+.TP
+.B EBADF
+.IR from_dirfd " or " to_dirfd
+is not a valid open file descriptor.
+.TP
+.B EFAULT
+.IR from_pathname " or " to_pathname
+is NULL or either one point to a location outside the process's accessible
+address space.
+.TP
+.B EINVAL
+Reserved flag specified in
+.IR flags .
+.TP
+.B ELOOP
+Too many symbolic links encountered while traversing the pathname.
+.TP
+.B ENAMETOOLONG
+.IR from_pathname " or " to_pathname
+is too long.
+.TP
+.B ENOENT
+A component of
+.IR from_pathname " or " to_pathname
+does not exist, or one is an empty string and the appropriate
+.B *_EMPTY_PATH
+was not specified in
+.IR flags .
+.TP
+.B ENOMEM
+Out of memory (i.e., kernel memory).
+.TP
+.B ENOTDIR
+A component of the path prefix of
+.IR from_pathname " or " to_pathname
+is not a directory or one or the other is relative and the appropriate
+.I *_dirfd
+is a file descriptor referring to a file other than a directory.
+.SH VERSIONS
+.BR move_mount ()
+was added to Linux in kernel 4.18.
+.SH CONFORMING TO
+.BR move_mount ()
+is Linux-specific.
+.SH NOTES
+Glibc does not (yet) provide a wrapper for the
+.BR move_mount ()
+system call; call it using
+.BR syscall (2).
+.SH SEE ALSO
+.BR fsmount (2),
+.BR fsopen (2),
+.BR open_tree (2)
diff --git a/man2/open_tree.2 b/man2/open_tree.2
new file mode 100644
index 000000000..7e9c86fe3
--- /dev/null
+++ b/man2/open_tree.2
@@ -0,0 +1,260 @@
+'\" t
+.\" Copyright (c) 2018 David Howells <dhowells@redhat.com>
+.\"
+.\" %%%LICENSE_START(VERBATIM)
+.\" Permission is granted to make and distribute verbatim copies of this
+.\" manual provided the copyright notice and this permission notice are
+.\" preserved on all copies.
+.\"
+.\" Permission is granted to copy and distribute modified versions of this
+.\" manual under the conditions for verbatim copying, provided that the
+.\" entire resulting derived work is distributed under the terms of a
+.\" permission notice identical to this one.
+.\"
+.\" Since the Linux kernel and libraries are constantly changing, this
+.\" manual page may be incorrect or out-of-date.  The author(s) assume no
+.\" responsibility for errors or omissions, or for damages resulting from
+.\" the use of the information contained herein.  The author(s) may not
+.\" have taken the same level of care in the production of this manual,
+.\" which is licensed free of charge, as they might when working
+.\" professionally.
+.\"
+.\" Formatted or processed versions of this manual, if unaccompanied by
+.\" the source, must acknowledge the copyright and authors of this work.
+.\" %%%LICENSE_END
+.\"
+.TH OPEN_TREE 2 2018-06-08 "Linux" "Linux Programmer's Manual"
+.SH NAME
+open_tree \- Pick or clone mount object and attach to fd
+.SH SYNOPSIS
+.nf
+.B #include <sys/types.h>
+.br
+.B #include <sys/mount.h>
+.br
+.B #include <unistd.h>
+.br
+.BR "#include <fcntl.h>           " "/* Definition of AT_* constants */"
+.PP
+.BI "int open_tree(int " dirfd ", const char *" pathname ", unsigned int " flags );
+.fi
+.PP
+.IR Note :
+There are no glibc wrappers for these system calls.
+.SH DESCRIPTION
+.BR open_tree ()
+picks the mount object specified by the pathname and attaches it to a new file
+descriptor or clones it and attaches the clone to the file descriptor.  The
+resultant file descriptor is indistinguishable from one produced by
+.BR open "(2) with " O_PATH .
+.PP
+In the case that the mount object is cloned, the clone will be "unmounted" and
+destroyed when the file descriptor is closed if it is not otherwise mounted
+somewhere by calling
+.BR move_mount (2).
+.PP
+To select a mount object, no permissions are required on the object referred
+to by the path, but execute (search) permission is required on all of the
+directories in
+.I pathname
+that lead to the object.
+.PP
+To clone an object, however, the caller must have mount capabilities and
+permissions.
+.PP
+.BR open_tree ()
+uses
+.IR pathname ", " dirfd " and " flags
+to locate the target object in one of a variety of ways:
+.TP
+[*] By absolute path.
+.I pathname
+points to an absolute path and
+.I dirfd
+is ignored.  The object is looked up by name, starting from the root of the
+filesystem as seen by the calling process.
+.TP
+[*] By cwd-relative path.
+.I pathname
+points to a relative path and
+.IR dirfd " is " AT_FDCWD .
+The object is looked up by name, starting from the current working directory.
+.TP
+[*] By dir-relative path.
+.I pathname
+points to relative path and
+.I dirfd
+indicates a file descriptor pointing to a directory.  The object is looked up
+by name, starting from the directory specified by
+.IR dirfd .
+.TP
+[*] By file descriptor.
+.I pathname
+is "",
+.I dirfd
+indicates a file descriptor and
+.B AT_EMPTY_PATH
+is set in
+.IR flags .
+The mount attached to the file descriptor is queried directly.  The file
+descriptor may point to any type of file, not just a directory.
+
+.\"______________________________________________________________
+.PP
+.I flags
+can be used to control the operation of the function and to influence a
+path-based lookup.  A value for
+.I flags
+is constructed by OR'ing together zero or more of the following constants:
+.TP
+.BR AT_EMPTY_PATH
+.\" commit 65cfc6722361570bfe255698d9cd4dccaf47570d
+If
+.I pathname
+is an empty string, operate on the file referred to by
+.IR dirfd
+(which may have been obtained from
+.BR open "(2) with"
+.BR O_PATH ", from " fsmount (2)
+or from another
+.BR open_tree ()).
+If
+.I dirfd
+is
+.BR AT_FDCWD ,
+the call operates on the current working directory.
+In this case,
+.I dirfd
+can refer to any type of file, not just a directory.
+This flag is Linux-specific; define
+.B _GNU_SOURCE
+.\" Before glibc 2.16, defining _ATFILE_SOURCE sufficed
+to obtain its definition.
+.TP
+.BR AT_NO_AUTOMOUNT
+Don't automount the terminal ("basename") component of
+.I pathname
+if it is a directory that is an automount point.  This flag allows the
+automount point itself to be picked up or a mount cloned that is rooted on the
+automount point.  The
+.B AT_NO_AUTOMOUNT
+flag has no effect if the mount point has already been mounted over.
+This flag is Linux-specific; define
+.B _GNU_SOURCE
+.\" Before glibc 2.16, defining _ATFILE_SOURCE sufficed
+to obtain its definition.
+.TP
+.B AT_SYMLINK_NOFOLLOW
+If
+.I pathname
+is a symbolic link, do not dereference it: instead pick up or clone a mount
+rooted on the link itself.
+.TP
+.B OPEN_TREE_CLOEXEC
+Set the close-on-exec flag for the new file descriptor.  This will cause the
+file descriptor to be closed automatically when a process exec's.
+.TP
+.B OPEN_TREE_CLONE
+Rather than directly attaching the selected object to the file descriptor,
+clone the object, set the root of the new mount object to that point and
+attach the clone to the file descriptor.
+.TP
+.B AT_RECURSIVE
+This is only permitted in conjunction with OPEN_TREE_CLONE.  It causes the
+entire mount subtree rooted at the selected spot to be cloned rather than just
+that one mount object.
+
+
+.SH EXAMPLE
+The
+.BR open_tree ()
+function can be used like the following:
+.PP
+.RS
+.nf
+fd1 = open_tree(AT_FDCWD, "/mnt", 0);
+fd2 = open_tree(fd1, "",
+                AT_EMPTY_PATH | OPEN_TREE_CLONE | AT_RECURSIVE);
+move_mount(fd2, "", AT_FDCWD, "/mnt2", MOVE_MOUNT_F_EMPTY_PATH);
+.fi
+.RE
+.PP
+This would attach the path point for "/mnt" to fd1, then it would copy the
+entire subtree at the point referred to by fd1 and attach that to fd2; lastly,
+it would attach the clone to "/mnt2".
+
+
+.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.SH RETURN VALUE
+On success, the new file descriptor is returned.  On error, \-1 is returned,
+and
+.I errno
+is set appropriately.
+.SH ERRORS
+.TP
+.B EACCES
+Search permission is denied for one of the directories
+in the path prefix of
+.IR pathname .
+(See also
+.BR path_resolution (7).)
+.TP
+.B EBADF
+.I dirfd
+is not a valid open file descriptor.
+.TP
+.B EFAULT
+.I pathname
+is NULL or
+.IR pathname
+point to a location outside the process's accessible address space.
+.TP
+.B EINVAL
+Reserved flag specified in
+.IR flags .
+.TP
+.B ELOOP
+Too many symbolic links encountered while traversing the pathname.
+.TP
+.B ENAMETOOLONG
+.I pathname
+is too long.
+.TP
+.B ENOENT
+A component of
+.I pathname
+does not exist, or
+.I pathname
+is an empty string and
+.B AT_EMPTY_PATH
+was not specified in
+.IR flags .
+.TP
+.B ENOMEM
+Out of memory (i.e., kernel memory).
+.TP
+.B ENOTDIR
+A component of the path prefix of
+.I pathname
+is not a directory or
+.I pathname
+is relative and
+.I dirfd
+is a file descriptor referring to a file other than a directory.
+.SH VERSIONS
+.BR open_tree ()
+was added to Linux in kernel 4.18.
+.SH CONFORMING TO
+.BR open_tree ()
+is Linux-specific.
+.SH NOTES
+Glibc does not (yet) provide a wrapper for the
+.BR open_tree ()
+system call; call it using
+.BR syscall (2).
+.SH SEE ALSO
+.BR fsmount (2),
+.BR move_mount (2),
+.BR open (2)

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [MANPAGE PATCH] Add manpage for fsopen(2), fspick(2) and fsmount(2)
       [not found] <153126248868.14533.9751473662727327569.stgit@warthog.procyon.org.uk>
                   ` (6 preceding siblings ...)
  2018-07-10 22:52 ` [MANPAGE PATCH] Add manpages for move_mount(2) and open_tree(2) David Howells
@ 2018-07-10 22:54 ` David Howells
  2019-10-09  9:52   ` Michael Kerrisk (man-pages)
  2018-07-10 22:55 ` [MANPAGE PATCH] Add manpage for fsinfo(2) David Howells
  8 siblings, 1 reply; 75+ messages in thread
From: David Howells @ 2018-07-10 22:54 UTC (permalink / raw)
  To: Michael Kerrisk
  Cc: dhowells, viro, linux-api, linux-fsdevel, torvalds, linux-kernel,
	linux-man

Add a manual page to document the fsopen(), fspick() and fsmount() system
calls.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 man2/fsmount.2 |    1 
 man2/fsopen.2  |  357 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 man2/fspick.2  |    1 
 3 files changed, 359 insertions(+)
 create mode 100644 man2/fsmount.2
 create mode 100644 man2/fsopen.2
 create mode 100644 man2/fspick.2

diff --git a/man2/fsmount.2 b/man2/fsmount.2
new file mode 100644
index 000000000..2bf59fc3e
--- /dev/null
+++ b/man2/fsmount.2
@@ -0,0 +1 @@
+.so man2/fsopen.2
diff --git a/man2/fsopen.2 b/man2/fsopen.2
new file mode 100644
index 000000000..1bc761ab4
--- /dev/null
+++ b/man2/fsopen.2
@@ -0,0 +1,357 @@
+'\" t
+.\" Copyright (c) 2018 David Howells <dhowells@redhat.com>
+.\"
+.\" %%%LICENSE_START(VERBATIM)
+.\" Permission is granted to make and distribute verbatim copies of this
+.\" manual provided the copyright notice and this permission notice are
+.\" preserved on all copies.
+.\"
+.\" Permission is granted to copy and distribute modified versions of this
+.\" manual under the conditions for verbatim copying, provided that the
+.\" entire resulting derived work is distributed under the terms of a
+.\" permission notice identical to this one.
+.\"
+.\" Since the Linux kernel and libraries are constantly changing, this
+.\" manual page may be incorrect or out-of-date.  The author(s) assume no
+.\" responsibility for errors or omissions, or for damages resulting from
+.\" the use of the information contained herein.  The author(s) may not
+.\" have taken the same level of care in the production of this manual,
+.\" which is licensed free of charge, as they might when working
+.\" professionally.
+.\"
+.\" Formatted or processed versions of this manual, if unaccompanied by
+.\" the source, must acknowledge the copyright and authors of this work.
+.\" %%%LICENSE_END
+.\"
+.TH FSOPEN 2 2018-06-07 "Linux" "Linux Programmer's Manual"
+.SH NAME
+fsopen, fsmount, fspick \- Handle filesystem (re-)configuration and mounting
+.SH SYNOPSIS
+.nf
+.B #include <sys/types.h>
+.br
+.B #include <sys/mount.h>
+.br
+.B #include <unistd.h>
+.br
+.BR "#include <fcntl.h>           " "/* Definition of AT_* constants */"
+.PP
+.BI "int fsopen(const char *" fsname ", unsigned int " flags );
+.PP
+.BI "int fsmount(int " fd ", unsigned int " flags ", unsigned int " ms_flags );
+.PP
+.BI "int fspick(int " dirfd ", const char *" pathname ", unsigned int " flags );
+.fi
+.PP
+.IR Note :
+There are no glibc wrappers for these system calls.
+.SH DESCRIPTION
+.PP
+.BR fsopen ()
+creates a new filesystem configuration context within the kernel for the
+filesystem named in the
+.I fsname
+parameter and attaches it to a file descriptor, which it then returns.  The
+file descriptor can be marked close-on-exec by setting
+.B FSOPEN_CLOEXEC
+in flags.
+.PP
+The
+file descriptor can then be used to configure the desired filesystem parameters
+and security parameters by using
+.BR write (2)
+to pass parameters to it and then writing a command to actually create the
+filesystem representation.
+.PP
+The file descriptor also serves as a channel by which more comprehensive error,
+warning and information messages may be retrieved from the kernel using
+.BR read (2).
+.PP
+Once the kernel's filesystem representation has been created, it can be queried
+by calling
+.BR fsinfo (2)
+on the file descriptor.  fsinfo() will spot that the target is actually a
+creation context and look inside that.
+.PP
+.BR fsmount ()
+can then be called to create a mount object that refers to the newly created
+filesystem representation, with the propagation and mount restrictions to be
+applied specified in
+.IR ms_flags .
+The mount object is then attached to a new file descriptor that looks like one
+created by
+.BR open "(2) with " O_PATH " or " open_tree (2).
+This can be passed to
+.BR move_mount (2)
+to attach the mount object to a mountpoint, thereby completing the process.
+.PP
+The file descriptor returned by fsmount() is marked close-on-exec if
+FSMOUNT_CLOEXEC is specified in
+.IR flags .
+.PP
+After fsmount() has completed, the context created by fsopen() is reset and
+moved to reconfiguration state, allowing the new superblock to be reconfigured.
+.PP
+.BR fspick ()
+creates a new filesystem context within the kernel, attaches the superblock
+specified by
+.IR dfd ", " pathname ", " flags
+and puts it into the reconfiguration state and attached the context to a new
+file descriptor that can then be parameterised with
+.BR write (2)
+exactly the same as for the context created by fsopen() above.
+.PP
+.I flags
+is an OR'd together mask of
+.B FSPICK_CLOEXEC
+which indicates that the returned file descriptor should be marked
+close-on-exec and
+.BR FSPICK_SYMLINK_NOFOLLOW ", " FSPICK_NO_AUTOMOUNT " and " FSPICK_EMPTY_PATH
+which control the pathwalk to the target object (see below).
+
+.\"________________________________________________________
+.SS Writable Command Interface
+Superblock (re-)configuration is achieved by writing command strings to the
+context file descriptor using
+.BR write (2).
+Each string is prefixed with a specifier indicating the class of command
+being specified.  The available commands include:
+.TP
+\fB"o <option>"\fP
+Specify a filesystem or security parameter.
+.I <option>
+is typically a key or key=val format string.  Since the length of the option is
+given to write(), the option may include any sort of character, including
+spaces and commas or even binary data.
+.TP
+\fB"s <name>"\fP
+Specify a device file, network server or other other source specification.
+This may be optional, depending on the filesystem, and it may be possible to
+provide multiple of them to a filesystem.
+.TP
+\fB"x create"\fP
+End the filesystem configuration phase and try and create a representation in
+the kernel with the parameters specified.  After this, the context is shifted
+to the mount-pending state waiting for an fsmount() call to occur.
+.TP
+\fB"x reconfigure"\fP
+End a filesystem reconfiguration phase try to apply the parameters to the
+filesystem representation.  After this, the context gets reset and put back to
+the start of the reconfiguration phase again.
+.PP
+With this interface, option strings are not limited to 4096 bytes, either
+individually or in sum, and they are also not restricted to text-only options.
+Further, errors may be given individually for each option and not aggregated or
+dumped into the kernel log.
+
+.\"________________________________________________________
+.SS Message Retrieval Interface
+The context file descriptor may be queried for message strings at any time by
+calling
+.BR read (2)
+on the file descriptor.  This will return formatted messages that are prefixed
+to indicate their class:
+.TP
+\fB"e <message>"\fP
+An error message string was logged.
+.TP
+\fB"i <message>"\fP
+An informational message string was logged.
+.TP
+\fB"w <message>"\fP
+An warning message string was logged.
+.PP
+Messages are removed from the queue as they're read.
+
+.\"________________________________________________________
+.SH EXAMPLES
+To illustrate the process, here's an example whereby this can be used to mount
+an ext4 filesystem on /dev/sdb1 onto /mnt.  Note that the example ignores the
+fact that
+.BR write (2)
+has a length parameter and that errors might occur.
+.PP
+.in +4n
+.nf
+sfd = fsopen("ext4", FSOPEN_CLOEXEC);
+write(sfd, "s /dev/sdb1");
+write(sfd, "o noatime");
+write(sfd, "o acl");
+write(sfd, "o user_attr");
+write(sfd, "o iversion");
+write(sfd, "x create");
+fsinfo(sfd, NULL, ...);
+mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_RELATIME);
+move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
+.fi
+.in
+.PP
+Here, an ext4 context is created first and attached to sfd.  This is then told
+where its source will be, given a bunch of options and created.
+.BR fsinfo (2)
+can then be used to query the filesystem.  Then fsmount() is called to create a
+mount object and
+.BR move_mount (2)
+is called to attach it to its intended mountpoint.
+.PP
+And here's an example of mounting from an NFS server:
+.PP
+.in +4n
+.nf
+sfd = fsopen("nfs", 0);
+write(sfd, "s example.com/pub/linux");
+write(sfd, "o nfsvers=3");
+write(sfd, "o rsize=65536");
+write(sfd, "o wsize=65536");
+write(sfd, "o rdma");
+write(sfd, "x create");
+mfd = fsmount(sfd, 0, MS_NODEV);
+move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
+.fi
+.in
+.PP
+Reconfiguration can be achieved by:
+.PP
+.in +4n
+.nf
+sfd = fspick(AT_FDCWD, "/mnt", FSPICK_NO_AUTOMOUNT | FSPICK_CLOEXEC);
+write(sfd, "o ro");
+write(sfd, "x reconfigure");
+.fi
+.in
+.PP
+or:
+.PP
+.in +4n
+.nf
+sfd = fsopen(...);
+...
+mfd = fsmount(sfd, ...);
+...
+write(sfd, "o ro");
+write(sfd, "x reconfigure");
+.fi
+.in
+
+
+.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.SH RETURN VALUE
+On success, all three functions return a file descriptor.  On error, \-1 is
+returned, and
+.I errno
+is set appropriately.
+.SH ERRORS
+The error values given below result from filesystem type independent
+errors.
+Each filesystem type may have its own special errors and its
+own special behavior.
+See the Linux kernel source code for details.
+.TP
+.B EACCES
+A component of a path was not searchable.
+(See also
+.BR path_resolution (7).)
+.TP
+.B EACCES
+Mounting a read-only filesystem was attempted without giving the
+.B MS_RDONLY
+flag.
+.TP
+.B EACCES
+The block device
+.I source
+is located on a filesystem mounted with the
+.B MS_NODEV
+option.
+.\" mtk: Probably: write permission is required for MS_BIND, with
+.\" the error EPERM if not present; CAP_DAC_OVERRIDE is required.
+.TP
+.B EBUSY
+.I source
+cannot be reconfigured read-only, because it still holds files open for
+writing.
+.TP
+.B EFAULT
+One of the pointer arguments points outside the user address space.
+.TP
+.B EINVAL
+.I source
+had an invalid superblock.
+.TP
+.B EINVAL
+.I ms_flags
+includes more than one of
+.BR MS_SHARED ,
+.BR MS_PRIVATE ,
+.BR MS_SLAVE ,
+or
+.BR MS_UNBINDABLE .
+.TP
+.BR EINVAL
+An attempt was made to bind mount an unbindable mount.
+.TP
+.B ELOOP
+Too many links encountered during pathname resolution.
+.TP
+.B EMFILE
+The system has too many open files to create more.
+.TP
+.B ENFILE
+The process has too many open files to create more.
+.TP
+.B ENAMETOOLONG
+A pathname was longer than
+.BR MAXPATHLEN .
+.TP
+.B ENODEV
+Filesystem
+.I fsname
+not configured in the kernel.
+.TP
+.B ENOENT
+A pathname was empty or had a nonexistent component.
+.TP
+.B ENOMEM
+The kernel could not allocate sufficient memory to complete the call.
+.TP
+.B ENOTBLK
+.I source
+is not a block device (and a device was required).
+.TP
+.B ENOTDIR
+.IR pathname ,
+or a prefix of
+.IR source ,
+is not a directory.
+.TP
+.B ENXIO
+The major number of the block device
+.I source
+is out of range.
+.TP
+.B EPERM
+The caller does not have the required privileges.
+.SH CONFORMING TO
+These functions are Linux-specific and should not be used in programs intended
+to be portable.
+.SH VERSIONS
+.BR fsopen "(), " fsmount "() and " fspick ()
+were added to Linux in kernel 4.18.
+.SH NOTES
+Glibc does not (yet) provide a wrapper for the
+.BR fsopen "() , " fsmount "() or " fspick "()"
+system calls; call them using
+.BR syscall (2).
+.SH SEE ALSO
+.BR mountpoint (1),
+.BR move_mount (2),
+.BR open_tree (2),
+.BR umount (2),
+.BR mount_namespaces (7),
+.BR path_resolution (7),
+.BR findmnt (8),
+.BR lsblk (8),
+.BR mount (8),
+.BR umount (8)
diff --git a/man2/fspick.2 b/man2/fspick.2
new file mode 100644
index 000000000..2bf59fc3e
--- /dev/null
+++ b/man2/fspick.2
@@ -0,0 +1 @@
+.so man2/fsopen.2

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [MANPAGE PATCH] Add manpage for fsinfo(2)
       [not found] <153126248868.14533.9751473662727327569.stgit@warthog.procyon.org.uk>
                   ` (7 preceding siblings ...)
  2018-07-10 22:54 ` [MANPAGE PATCH] Add manpage for fsopen(2), fspick(2) and fsmount(2) David Howells
@ 2018-07-10 22:55 ` David Howells
  2019-10-09  9:52   ` Michael Kerrisk (man-pages)
  2019-10-09 12:02   ` David Howells
  8 siblings, 2 replies; 75+ messages in thread
From: David Howells @ 2018-07-10 22:55 UTC (permalink / raw)
  To: Michael Kerrisk
  Cc: dhowells, viro, linux-api, linux-fsdevel, torvalds, linux-kernel,
	linux-man

Add a manual page to document the fsinfo() system call.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 man2/fsinfo.2       | 1017 +++++++++++++++++++++++++++++++++++++++++++++++++++
 man2/ioctl_iflags.2 |    6 
 man2/stat.2         |    7 
 man2/statx.2        |   13 +
 man2/utime.2        |    7 
 man2/utimensat.2    |    7 
 6 files changed, 1057 insertions(+)
 create mode 100644 man2/fsinfo.2

diff --git a/man2/fsinfo.2 b/man2/fsinfo.2
new file mode 100644
index 000000000..5710232df
--- /dev/null
+++ b/man2/fsinfo.2
@@ -0,0 +1,1017 @@
+'\" t
+.\" Copyright (c) 2018 David Howells <dhowells@redhat.com>
+.\"
+.\" %%%LICENSE_START(VERBATIM)
+.\" Permission is granted to make and distribute verbatim copies of this
+.\" manual provided the copyright notice and this permission notice are
+.\" preserved on all copies.
+.\"
+.\" Permission is granted to copy and distribute modified versions of this
+.\" manual under the conditions for verbatim copying, provided that the
+.\" entire resulting derived work is distributed under the terms of a
+.\" permission notice identical to this one.
+.\"
+.\" Since the Linux kernel and libraries are constantly changing, this
+.\" manual page may be incorrect or out-of-date.  The author(s) assume no
+.\" responsibility for errors or omissions, or for damages resulting from
+.\" the use of the information contained herein.  The author(s) may not
+.\" have taken the same level of care in the production of this manual,
+.\" which is licensed free of charge, as they might when working
+.\" professionally.
+.\"
+.\" Formatted or processed versions of this manual, if unaccompanied by
+.\" the source, must acknowledge the copyright and authors of this work.
+.\" %%%LICENSE_END
+.\"
+.TH FSINFO 2 2018-06-06 "Linux" "Linux Programmer's Manual"
+.SH NAME
+fsinfo \- Get filesystem information
+.SH SYNOPSIS
+.nf
+.B #include <sys/types.h>
+.br
+.B #include <sys/fsinfo.h>
+.br
+.B #include <unistd.h>
+.br
+.BR "#include <fcntl.h>           " "/* Definition of AT_* constants */"
+.PP
+.BI "int fsinfo(int " dirfd ", const char *" pathname ","
+.BI "           struct fsinfo_params *" params ","
+.BI "           void *" buffer ", size_t " buf_size );
+.fi
+.PP
+.IR Note :
+There is no glibc wrapper for
+.BR fsinfo ();
+see NOTES.
+.SH DESCRIPTION
+.PP
+fsinfo() retrieves the desired filesystem attribute, as selected by the
+parameters pointed to by
+.IR params ,
+and stores its value in the buffer pointed to by
+.IR buffer .
+.PP
+The parameter structure is optional, defaulting to all the parameters being 0
+if the pointer is NULL.  The structure looks like the following:
+.PP
+.in +4n
+.nf
+struct fsinfo_params {
+    __u32 at_flags;     /* AT_SYMLINK_NOFOLLOW and similar flags */
+    __u32 request;      /* Requested attribute */
+    __u32 Nth;          /* Instance of attribute */
+    __u32 Mth;          /* Subinstance of Nth instance */
+    __u32 __reserved[6]; /* Reserved params; all must be 0 */
+};
+.fi
+.in
+.PP
+The filesystem to be queried is looked up using a combination of
+.IR dfd ", " pathname " and " params->at_flags.
+This is discussed in more detail below.
+.PP
+The desired attribute is indicated by
+.IR params->request .
+If
+.I params
+is NULL, this will default to
+.BR fsinfo_attr_statfs ,
+which retrieves some of the information returned by
+.BR statfs ().
+The available attributes are described below in the "THE ATTRIBUTES" section.
+.PP
+Some attributes can have multiple values and some can even have multiple
+instances with multiple values.  For example, a network filesystem might use
+multiple servers.  The names of each of these servers can be retrieved by
+using
+.I params->Nth
+to iterate through all the instances until error
+.B ENODATA
+occurs, indicating the end of the list.  Further, each server might have
+multiple addresses available; these can be enumerated using
+.I params->Nth
+to iterate the servers and
+.I params->Mth
+to iterate the addresses of the Nth server.
+.PP
+The amount of data written into the buffer depends on the attribute selected.
+Some attributes return variable-length strings and some return fixed-size
+structures.  If either
+.IR buffer " is  NULL  or " buf_size " is 0"
+then the size of the attribute value will be returned and nothing will be
+written into the buffer.
+.PP
+The
+.I params->__reserved
+parameters must all be 0.
+.\"_______________________________________________________
+.SS
+Allowance for Future Attribute Expansion
+.PP
+To allow for the future expansion and addition of fields to any fixed-size
+structure attribute,
+.BR fsinfo ()
+makes the following guarantees:
+.RS 4m
+.IP (1) 4m
+It will always clear any excess space in the buffer.
+.IP (2) 4m
+It will always return the actual size of the data.
+.IP (3) 4m
+It will truncate the data to fit it into the buffer rather than giving an
+error.
+.IP (4) 4m
+Any new version of a structure will incorporate all the fields from the old
+version at same offsets.
+.RE
+.PP
+So, for example, if the caller is running on an older version of the kernel
+with an older, smaller version of the structure than was asked for, the kernel
+will write the smaller version into the buffer and will clear the remainder of
+the buffer to make sure any additional fields are set to 0.  The function will
+return the actual size of the data.
+.PP
+On the other hand, if the caller is running on a newer version of the kernel
+with a newer version of the structure that is larger than the buffer, the write
+to the buffer will be truncated to fit as necessary and the actual size of the
+data will be returned.
+.PP
+Note that this doesn't apply to variable-length string attributes.
+
+.\"_______________________________________________________
+.SS
+Invoking \fBfsinfo\fR():
+.PP
+To access a file's status, no permissions are required on the file itself, but
+in the case of
+.BR fsinfo ()
+with a path, execute (search) permission is required on all of the directories
+in
+.I pathname
+that lead to the file.
+.PP
+.BR fsinfo ()
+uses
+.IR pathname ", " dirfd " and " params->at_flags
+to locate the target file in one of a variety of ways:
+.TP
+[*] By absolute path.
+.I pathname
+points to an absolute path and
+.I dirfd
+is ignored.  The file is looked up by name, starting from the root of the
+filesystem as seen by the calling process.
+.TP
+[*] By cwd-relative path.
+.I pathname
+points to a relative path and
+.IR dirfd " is " AT_FDCWD .
+The file is looked up by name, starting from the current working directory.
+.TP
+[*] By dir-relative path.
+.I pathname
+points to relative path and
+.I dirfd
+indicates a file descriptor pointing to a directory.  The file is looked up by
+name, starting from the directory specified by
+.IR dirfd .
+.TP
+[*] By file descriptor.
+.IR pathname " is " NULL " and " dirfd
+indicates a file descriptor.  The file attached to the file descriptor is
+queried directly.  The file descriptor may point to any type of file, not just
+a directory.
+.PP
+.I flags
+can be used to influence a path-based lookup.  A value for
+.I flags
+is constructed by OR'ing together zero or more of the following constants:
+.TP
+.BR AT_EMPTY_PATH
+.\" commit 65cfc6722361570bfe255698d9cd4dccaf47570d
+If
+.I pathname
+is an empty string, operate on the file referred to by
+.IR dirfd
+(which may have been obtained using the
+.BR open (2)
+.B O_PATH
+flag).
+If
+.I dirfd
+is
+.BR AT_FDCWD ,
+the call operates on the current working directory.
+In this case,
+.I dirfd
+can refer to any type of file, not just a directory.
+This flag is Linux-specific; define
+.B _GNU_SOURCE
+.\" Before glibc 2.16, defining _ATFILE_SOURCE sufficed
+to obtain its definition.
+.TP
+.BR AT_NO_AUTOMOUNT
+Don't automount the terminal ("basename") component of
+.I pathname
+if it is a directory that is an automount point.  This allows the caller to
+gather attributes of the filesystem holding an automount point (rather than
+the filesystem it would mount).  This flag can be used in tools that scan
+directories to prevent mass-automounting of a directory of automount points.
+The
+.B AT_NO_AUTOMOUNT
+flag has no effect if the mount point has already been mounted over.
+This flag is Linux-specific; define
+.B _GNU_SOURCE
+.\" Before glibc 2.16, defining _ATFILE_SOURCE sufficed
+to obtain its definition.
+.TP
+.B AT_SYMLINK_NOFOLLOW
+If
+.I pathname
+is a symbolic link, do not dereference it:
+instead return information about the link itself, like
+.BR lstat ().
+.SH THE ATTRIBUTES
+.PP
+There is a range of attributes that can be selected from.  These are:
+
+.\" __________________ fsinfo_attr_statfs __________________
+.TP
+.B fsinfo_attr_statfs
+This retrieves the "dynamic"
+.B statfs
+information, such as block and file counts, that are expected to change whilst
+a filesystem is being used.  This fills in the following structure:
+.PP
+.RS
+.in +4n
+.nf
+struct fsinfo_statfs {
+    __u64 f_blocks;	/* Total number of blocks in fs */
+    __u64 f_bfree;	/* Total number of free blocks */
+    __u64 f_bavail;	/* Number of free blocks available to ordinary user */
+    __u64 f_files;	/* Total number of file nodes in fs */
+    __u64 f_ffree;	/* Number of free file nodes */
+    __u64 f_favail;	/* Number of free file nodes available to ordinary user */
+    __u32 f_bsize;	/* Optimal block size */
+    __u32 f_frsize;	/* Fragment size */
+};
+.fi
+.in
+.RE
+.IP
+The fields correspond to those of the same name returned by
+.BR statfs ().
+
+.\" __________________ fsinfo_attr_fsinfo __________________
+.TP
+.B fsinfo_attr_fsinfo
+This retrieves information about the
+.BR fsinfo ()
+system call itself.  This fills in the following structure:
+.PP
+.RS
+.in +4n
+.nf
+struct fsinfo_fsinfo {
+    __u32 max_attr;
+    __u32 max_cap;
+};
+.fi
+.in
+.RE
+.IP
+The
+.I max_attr
+value indicates the number of attributes supported by the
+.BR fsinfo ()
+system call, and
+.I max_cap
+indicates the number of capability bits supported by the
+.B fsinfo_attr_capabilities
+attribute.  The first corresponds to
+.I fsinfo_attr__nr
+and the second to
+.I fsinfo_cap__nr
+in the header file.
+
+.\" __________________ fsinfo_attr_ids __________________
+.TP
+.B fsinfo_attr_ids
+This retrieves a number of fixed IDs and other static information otherwise
+available through
+.BR statfs ().
+The following structure is filled in:
+.PP
+.RS
+.in +4n
+.nf
+struct fsinfo_ids {
+    char  f_fs_name[15 + 1]; /* Filesystem name */
+    __u64 f_flags;	/* Filesystem mount flags (MS_*) */
+    __u64 f_fsid;	/* Short 64-bit Filesystem ID */
+    __u64 f_sb_id;	/* Internal superblock ID */
+    __u32 f_fstype;	/* Filesystem type from linux/magic.h */
+    __u32 f_dev_major;	/* As st_dev_* from struct statx */
+    __u32 f_dev_minor;
+};
+.fi
+.in
+.RE
+.IP
+Most of these are filled in as for
+.BR statfs (),
+with the addition of the filesystem's symbolic name in
+.I f_fs_name
+and an identifier for use in notifications in
+.IR f_sb_id .
+
+.\" __________________ fsinfo_attr_limits __________________
+.TP
+.B fsinfo_attr_limits
+This retrieves information about the limits of what a filesystem can support.
+The following structure is filled in:
+.PP
+.RS
+.in +4n
+.nf
+struct fsinfo_limits {
+    __u64 max_file_size;
+    __u64 max_uid;
+    __u64 max_gid;
+    __u64 max_projid;
+    __u32 max_dev_major;
+    __u32 max_dev_minor;
+    __u32 max_hard_links;
+    __u32 max_xattr_body_len;
+    __u16 max_xattr_name_len;
+    __u16 max_filename_len;
+    __u16 max_symlink_len;
+    __u16 __reserved[1];
+};
+.fi
+.in
+.RE
+.IP
+These indicate the maximum supported sizes for a variety of filesystem objects,
+including the file size, the extended attribute name length and body length,
+the filename length and the symlink body length.
+.IP
+It also indicates the maximum representable values for a User ID, a Group ID,
+a Project ID, a device major number and a device minor number.
+.IP
+And finally, it indicates the maximum number of hard links that can be made to
+a file.
+.IP
+Note that some of these values may be zero if the underlying object or concept
+is not supported by the filesystem or the medium.
+
+.\" __________________ fsinfo_attr_supports __________________
+.TP
+.B fsinfo_attr_supports
+This retrieves information about what bits a filesystem supports in various
+masks.  The following structure is filled in:
+.PP
+.RS
+.in +4n
+.nf
+struct fsinfo_supports {
+    __u64 stx_attributes;
+    __u32 stx_mask;
+    __u32 ioc_flags;
+    __u32 win_file_attrs;
+    __u32 __reserved[1];
+};
+.fi
+.in
+.RE
+.IP
+The
+.IR stx_attributes " and " stx_mask
+fields indicate what bits in the struct statx fields of the matching names
+are supported by the filesystem.
+.IP
+The
+.I ioc_flags
+field indicates what FS_*_FL flag bits as used through the FS_IOC_GET/SETFLAGS
+ioctls are supported by the filesystem.
+.IP
+The
+.I win_file_attrs
+indicates what DOS/Windows file attributes a filesystem supports, if any.
+
+.\" __________________ fsinfo_attr_capabilities __________________
+.TP
+.B fsinfo_attr_capabilities
+This retrieves information about what features a filesystem supports as a
+series of single bit indicators.  The following structure is filled in:
+.PP
+.RS
+.in +4n
+.nf
+struct fsinfo_capabilities {
+    __u8 capabilities[(fsinfo_cap__nr + 7) / 8];
+};
+.fi
+.in
+.RE
+.IP
+where the bit of interest can be found by:
+.PP
+.RS
+.in +4n
+.nf
+	p->capabilities[bit / 8] & (1 << (bit % 8)))
+.fi
+.in
+.RE
+.IP
+The bits are listed by
+.I enum fsinfo_capability
+and
+.B fsinfo_cap__nr
+is one more than the last capability bit listed in the header file.
+.IP
+Note that the number of capability bits actually supported by the kernel can be
+found using the
+.B fsinfo_attr_fsinfo
+attribute.
+.IP
+The capability bits and their meanings are listed below in the "THE
+CAPABILITIES" section.
+
+.\" __________________ fsinfo_attr_timestamp_info __________________
+.TP
+.B fsinfo_attr_timestamp_info
+This retrieves information about what timestamp resolution and scope is
+supported by a filesystem for each of the file timestamps.  The following
+structure is filled in:
+.PP
+.RS
+.in +4n
+.nf
+struct fsinfo_timestamp_info {
+	__s64 minimum_timestamp;
+	__s64 maximum_timestamp;
+	__u16 atime_gran_mantissa;
+	__u16 btime_gran_mantissa;
+	__u16 ctime_gran_mantissa;
+	__u16 mtime_gran_mantissa;
+	__s8  atime_gran_exponent;
+	__s8  btime_gran_exponent;
+	__s8  ctime_gran_exponent;
+	__s8  mtime_gran_exponent;
+	__u32 __reserved[1];
+};
+.fi
+.in
+.RE
+.IP
+where
+.IR minimum_timestamp " and " maximum_timestamp
+are the limits on the timestamps that the filesystem supports and
+.IR *time_gran_mantissa " and " *time_gran_exponent
+indicate the granularity of each timestamp in terms of seconds, using the
+formula:
+.PP
+.RS
+.in +4n
+.nf
+mantissa * pow(10, exponent) Seconds
+.fi
+.in
+.RE
+.IP
+where exponent may be negative and the result may be a fraction of a second.
+.IP
+Four timestamps are detailed: \fBA\fPccess time, \fBB\fPirth/creation time,
+\fBC\fPhange time and \fBM\fPodification time.  Capability bits are defined
+that specify whether each of these exist in the filesystem or not.
+.IP
+Note that the timestamp description may be approximated or inaccurate if the
+file is actually remote or is the union of multiple objects.
+
+.\" __________________ fsinfo_attr_volume_id __________________
+.TP
+.B fsinfo_attr_volume_id
+This retrieves the system's superblock volume identifier as a variable-length
+string.  This does not necessarily represent a value stored in the medium but
+might be constructed on the fly.
+.IP
+For instance, for a block device this is the block device identifier
+(eg. "sdb2"); for AFS this would be the numeric volume identifier.
+
+.\" __________________ fsinfo_attr_volume_uuid __________________
+.TP
+.B fsinfo_attr_volume_uuid
+This retrieves the volume UUID, if there is one, as a little-endian binary
+UUID.  This fills in the following structure:
+.PP
+.RS
+.in +4n
+.nf
+struct fsinfo_volume_uuid {
+    __u8 uuid[16];
+};
+.fi
+.in
+.RE
+.IP
+
+.\" __________________ fsinfo_attr_volume_name __________________
+.TP
+.B fsinfo_attr_volume_name
+This retrieves the filesystem's volume name as a variable-length string.  This
+is expected to represent a name stored in the medium.
+.IP
+For a block device, this might be a label stored in the superblock.  For a
+network filesystem, this might be a logical volume name of some sort.
+
+.\" __________________ fsinfo_attr_cell/domain __________________
+.PP
+.B fsinfo_attr_cell_name
+.br
+.B fsinfo_attr_domain_name
+.br
+.IP
+These two attributes are variable-length string attributes that may be used to
+obtain information about network filesystems.  An AFS volume, for instance,
+belongs to a named cell.  CIFS shares may belong to a domain.
+
+.\" __________________ fsinfo_attr_realm_name __________________
+.TP
+.B fsinfo_attr_realm_name
+This attribute is variable-length string that indicates the Kerberos realm that
+a filesystem's authentication tokens should come from.
+
+.\" __________________ fsinfo_attr_server_name __________________
+.TP
+.B fsinfo_attr_server_name
+This attribute is a multiple-value attribute that lists the names of the
+servers that are backing a network filesystem.  Each value is a variable-length
+string.  The values are enumerated by calling
+.BR fsinfo ()
+multiple times, incrementing
+.I params->Nth
+each time until an ENODATA error occurs, thereby indicating the end of the
+list.
+
+.\" __________________ fsinfo_attr_server_address __________________
+.TP
+.B fsinfo_attr_server_address
+This attribute is a multiple-instance, multiple-value attribute that lists the
+addresses of the servers that are backing a network filesystem.  Each value is
+a structure of the following type:
+.PP
+.RS
+.in +4n
+.nf
+struct fsinfo_server_address {
+    struct __kernel_sockaddr_storage address;
+};
+.fi
+.in
+.RE
+.IP
+Where the address may be AF_INET, AF_INET6, AF_RXRPC or any other type as
+appropriate to the filesystem.
+.IP
+The values are enumerated by calling
+.IR fsinfo ()
+multiple times, incrementing
+.I params->Nth
+to step through the servers and
+.I params->Mth
+to step through the addresses of the Nth server each time until ENODATA errors
+occur, thereby indicating either the end of a server's address list or the end
+of the server list.
+.IP
+Barring the server list changing whilst being accessed, it is expected that the
+.I params->Nth
+will correspond to
+.I params->Nth
+for
+.BR fsinfo_attr_server_name .
+
+.\" __________________ fsinfo_attr_parameter __________________
+.TP
+.B fsinfo_attr_parameter
+This attribute is a multiple-value attribute that lists the values of the mount
+parameters for a filesystem as variable-length strings.
+.IP
+The parameters are enumerated by calling
+.BR fsinfo ()
+multiple times, incrementing
+.I params->Nth
+to step through them until error ENODATA is given.
+.IP
+Parameter strings are presented in a form akin to the way they're passed to the
+context created by the
+.BR fsopen ()
+system call.  For example, straight text parameters will be rendered as
+something like:
+.PP
+.RS
+.in +4n
+.nf
+"o data=journal"
+"o noquota"
+.fi
+.in
+.RE
+.IP
+Where the initial "word" indicates the option form.
+
+.\" __________________ fsinfo_attr_source __________________
+.TP
+.B fsinfo_attr_source
+This attribute is a multiple-value attribute that lists the mount sources for a
+filesystem as variable-length strings.  Normally only one source will be
+available, but the possibility of having more than one is allowed for.
+.IP
+The sources are enumerated by calling
+.BR fsinfo ()
+multiple times, incrementing
+.I params->Nth
+to step through them until error ENODATA is given.
+.IP
+Source strings are presented in a form akin to the way they're passed to the
+context created by the
+.BR fsopen ()
+system call.  For example, they will be rendered as something like:
+.PP
+.RS
+.in +4n
+.nf
+"s /dev/sda1"
+"s example.com/pub/linux/"
+.fi
+.in
+.RE
+.IP
+Where the initial "word" indicates the option form.
+
+.\" __________________ fsinfo_attr_name_encoding __________________
+.TP
+.B fsinfo_attr_name_encoding
+This attribute is variable-length string that indicates the filename encoding
+used by the filesystem.  The default is "utf8".  Note that this may indicate a
+non-8-bit encoding if that's what the underlying filesystem actually supports.
+
+.\" __________________ fsinfo_attr_name_codepage __________________
+.TP
+.B fsinfo_attr_name_codepage
+This attribute is variable-length string that indicates the codepage used to
+translate filenames from the filesystem to the system if this is applicable to
+the filesystem.
+
+.\" __________________ fsinfo_attr_io_size __________________
+.TP
+.B fsinfo_attr_io_size
+This retrieves information about the I/O sizes supported by the filesystem.
+The following structure is filled in:
+.PP
+.RS
+.in +4n
+.nf
+struct fsinfo_io_size {
+    __u32 block_size;
+    __u32 max_single_read_size;
+    __u32 max_single_write_size;
+    __u32 best_read_size;
+    __u32 best_write_size;
+};
+.fi
+.in
+.RE
+.IP
+Where
+.I block_size
+indicates the fundamental I/O block size of the filesystem as something
+O_DIRECT read/write sizes must be a multiple of;
+.IR max_single_write_size " and " max_single_write_size
+indicate the maximum sizes for individual unbuffered data transfer operations;
+and
+.IR best_read_size " and " best_write_size
+indicate the recommended I/O sizes.
+.IP
+Note that any of these may be zero if inapplicable or indeterminable.
+
+
+
+.SH THE CAPABILITIES
+.PP
+There are number of capability bits in a bit array that can be retrieved using
+.BR fsinfo_attr_capabilities .
+These give information about features of the filesystem driver and the specific
+filesystem.
+
+.\" __________________ fsinfo_cap_is_*_fs __________________
+.PP
+.B fsinfo_cap_is_kernel_fs
+.br
+.B fsinfo_cap_is_block_fs
+.br
+.B fsinfo_cap_is_flash_fs
+.br
+.B fsinfo_cap_is_network_fs
+.br
+.B fsinfo_cap_is_automounter_fs
+.IP
+These indicate the primary type of the filesystem.
+.B kernel
+filesystems are special communication interfaces that substitute files for
+system calls; examples include procfs and sysfs.
+.B block
+filesystems require a block device on which to operate; examples include ext4
+and XFS.
+.B flash
+filesystems require an MTD device on which to operate; examples include JFFS2.
+.B network
+filesystems require access to the network and contact one or more servers;
+examples include NFS and AFS.
+.B automounter
+filesystems are kernel special filesystems that host automount points and
+triggers to dynamically create automount points.  Examples include autofs and
+AFS's dynamic root.
+
+.\" __________________ fsinfo_cap_automounts __________________
+.TP
+.B fsinfo_cap_automounts
+The filesystem may have automount points that can be triggered by pathwalk.
+
+.\" __________________ fsinfo_cap_adv_locks __________________
+.TP
+.B fsinfo_cap_adv_locks
+The filesystem supports advisory file locks.  For a network filesystem, this
+indicates that the advisory file locks are cross-client (and also between
+server and its local filesystem on something like NFS).
+
+.\" __________________ fsinfo_cap_mand_locks __________________
+.TP
+.B fsinfo_cap_mand_locks
+The filesystem supports mandatory file locks.  For a network filesystem, this
+indicates that the mandatory file locks are cross-client (and also between
+server and its local filesystem on something like NFS).
+
+.\" __________________ fsinfo_cap_leases __________________
+.TP
+.B fsinfo_cap_leases
+The filesystem supports leases.  For a network filesystem, this means that the
+server will tell the client to clean up its state on a file before passing the
+lease to another client.
+
+.\" __________________ fsinfo_cap_*ids __________________
+.PP
+.B fsinfo_cap_uids
+.br
+.B fsinfo_cap_gids
+.br
+.B fsinfo_cap_projids
+.IP
+These indicate that the filesystem supports numeric user IDs, group IDs and
+project IDs respectively.
+
+.\" __________________ fsinfo_cap_id_* __________________
+.PP
+.B fsinfo_cap_id_names
+.br
+.B fsinfo_cap_id_guids
+.IP
+These indicate that the filesystem employs textual names and/or GUIDs as
+identifiers.
+
+.\" __________________ fsinfo_cap_windows_attrs __________________
+.TP
+.B fsinfo_cap_windows_attrs
+Indicates that the filesystem supports some Windows FILE_* attributes.
+
+.\" __________________ fsinfo_cap_*_quotas __________________
+.PP
+.B fsinfo_cap_user_quotas
+.br
+.B fsinfo_cap_group_quotas
+.br
+.B fsinfo_cap_project_quotas
+.IP
+These indicate that the filesystem supports quotas for users, groups and
+projects respectively.
+
+.\" __________________ fsinfo_cap_xattrs/filetypes __________________
+.PP
+.B fsinfo_cap_xattrs
+.br
+.B fsinfo_cap_symlinks
+.br
+.B fsinfo_cap_hard_links
+.br
+.B fsinfo_cap_hard_links_1dir
+.br
+.B fsinfo_cap_device_files
+.br
+.B fsinfo_cap_unix_specials
+.IP
+These indicate that the filesystem supports respectively extended attributes;
+symbolic links; hard links spanning direcories; hard links, but only within a
+directory; block and character device files; and UNIX special files, such as
+FIFO and socket.
+
+.\" __________________ fsinfo_cap_*journal* __________________
+.PP
+.B fsinfo_cap_journal
+.br
+.B fsinfo_cap_data_is_journalled
+.IP
+The first of these indicates that the filesystem has a journal and the second
+that the file data changes are being journalled.
+
+.\" __________________ fsinfo_cap_o_* __________________
+.PP
+.B fsinfo_cap_o_sync
+.br
+.B fsinfo_cap_o_direct
+.IP
+These indicate that O_SYNC and O_DIRECT are supported respectively.
+
+.\" __________________ fsinfo_cap_o_* __________________
+.PP
+.B fsinfo_cap_volume_id
+.br
+.B fsinfo_cap_volume_uuid
+.br
+.B fsinfo_cap_volume_name
+.br
+.B fsinfo_cap_volume_fsid
+.br
+.B fsinfo_cap_cell_name
+.br
+.B fsinfo_cap_domain_name
+.br
+.B fsinfo_cap_realm_name
+.IP
+These indicate if various attributes are supported by the filesystem, where
+.B fsinfo_cap_X
+here corresponds to
+.BR fsinfo_attr_X .
+
+.\" __________________ fsinfo_cap_iver_* __________________
+.PP
+.B fsinfo_cap_iver_all_change
+.br
+.B fsinfo_cap_iver_data_change
+.br
+.B fsinfo_cap_iver_mono_incr
+.IP
+These indicate if
+.I i_version
+on an inode in the filesystem is supported and
+how it behaves.
+.B all_change
+indicates that i_version is incremented on metadata changes as well as data
+changes.
+.B data_change
+indicates that i_version is only incremented on data changes, including
+truncation.
+.B mono_incr
+indicates that i_version is incremented by exactly 1 for each change made.
+
+.\" __________________ fsinfo_cap_resource_forks __________________
+.TP
+.B fsinfo_cap_resource_forks
+This indicates that the filesystem supports some sort of resource fork or
+alternate data stream on a file.  This isn't the same as an extended attribute.
+
+.\" __________________ fsinfo_cap_name_* __________________
+.PP
+.B fsinfo_cap_name_case_indep
+.br
+.B fsinfo_cap_name_non_utf8
+.br
+.B fsinfo_cap_name_has_codepage
+.IP
+These indicate certain facts about the filenames in a filesystem: whether
+they're case-independent; if they're not UTF-8; and if there's a codepage
+employed to map the names.
+
+.\" __________________ fsinfo_cap_sparse __________________
+.TP
+.B fsinfo_cap_sparse
+This indicates that the filesystem supports sparse files.
+
+.\" __________________ fsinfo_cap_not_persistent __________________
+.TP
+.B fsinfo_cap_not_persistent
+This indicates that the filesystem is not persistent, and that any data stored
+here will not be saved in the event that the filesystem is unmounted, the
+machine is rebooted or the machine loses power.
+
+.\" __________________ fsinfo_cap_no_unix_mode __________________
+.TP
+.B fsinfo_cap_no_unix_mode
+This indicates that the filesystem doesn't support the UNIX mode permissions
+bits.
+
+.\" __________________ fsinfo_cap_has_*time __________________
+.PP
+.B fsinfo_cap_has_atime
+.br
+.B fsinfo_cap_has_btime
+.br
+.B fsinfo_cap_has_ctime
+.br
+.B fsinfo_cap_has_mtime
+.IP
+These indicate as to what timestamps a filesystem supports, including: Access
+time, Birth/creation time, Change time (metadata and data) and Modification
+time (data only).
+
+
+.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.SH RETURN VALUE
+On success, the size of the value that the kernel has available is returned,
+irrespective of whether the buffer is large enough to hold that.  The data
+written to the buffer will be truncated if it is not.  On error, \-1 is
+returned, and
+.I errno
+is set appropriately.
+.SH ERRORS
+.TP
+.B EACCES
+Search permission is denied for one of the directories
+in the path prefix of
+.IR pathname .
+(See also
+.BR path_resolution (7).)
+.TP
+.B EBADF
+.I dirfd
+is not a valid open file descriptor.
+.TP
+.B EFAULT
+.I pathname
+is NULL or
+.IR pathname ", " params " or " buffer
+point to a location outside the process's accessible address space.
+.TP
+.B EINVAL
+Reserved flag specified in
+.IR params->at_flags " or one of " params->__reserved[]
+is not 0.
+.TP
+.B EOPNOTSUPP
+Unsupported attribute requested in
+.IR params->request .
+This may be beyond the limit of the supported attribute set or may just not be
+one that's supported by the filesystem.
+.TP
+.B ENODATA
+Unavailable attribute value requested by
+.IR params->Nth " and/or " params->Mth .
+.TP
+.B ELOOP
+Too many symbolic links encountered while traversing the pathname.
+.TP
+.B ENAMETOOLONG
+.I pathname
+is too long.
+.TP
+.B ENOENT
+A component of
+.I pathname
+does not exist, or
+.I pathname
+is an empty string and
+.B AT_EMPTY_PATH
+was not specified in
+.IR params->at_flags .
+.TP
+.B ENOMEM
+Out of memory (i.e., kernel memory).
+.TP
+.B ENOTDIR
+A component of the path prefix of
+.I pathname
+is not a directory or
+.I pathname
+is relative and
+.I dirfd
+is a file descriptor referring to a file other than a directory.
+.SH VERSIONS
+.BR fsinfo ()
+was added to Linux in kernel 4.18.
+.SH CONFORMING TO
+.BR fsinfo ()
+is Linux-specific.
+.SH NOTES
+Glibc does not (yet) provide a wrapper for the
+.BR fsinfo ()
+system call; call it using
+.BR syscall (2).
+.SH SEE ALSO
+.BR ioctl_iflags (2),
+.BR statx (2),
+.BR statfs (2)
diff --git a/man2/ioctl_iflags.2 b/man2/ioctl_iflags.2
index 9c77b08b9..49ba4444e 100644
--- a/man2/ioctl_iflags.2
+++ b/man2/ioctl_iflags.2
@@ -200,9 +200,15 @@ the effective user ID of the caller must match the owner of the file,
 or the caller must have the
 .BR CAP_FOWNER
 capability.
+.PP
+The set of flags supported by a filesystem can be determined by calling
+.IR fsinfo (2)
+with attribute
+.IR fsinfo_attr_supports .
 .SH SEE ALSO
 .BR chattr (1),
 .BR lsattr (1),
+.BR fsinfo (2),
 .BR mount (2),
 .BR btrfs (5),
 .BR ext4 (5),
diff --git a/man2/stat.2 b/man2/stat.2
index dad9a01ac..ee4001f85 100644
--- a/man2/stat.2
+++ b/man2/stat.2
@@ -532,6 +532,12 @@ If none of the aforementioned macros are defined,
 then the nanosecond values are exposed with names of the form
 .IR st_atimensec .
 .\"
+.PP
+Which timestamps are supported by a filesystem and their the ranges and
+granularities can be determined by calling
+.IR fsinfo (2)
+with attribute
+.IR fsinfo_attr_timestamp_info .
 .SS C library/kernel differences
 Over time, increases in the size of the
 .I stat
@@ -707,6 +713,7 @@ main(int argc, char *argv[])
 .BR access (2),
 .BR chmod (2),
 .BR chown (2),
+.BR fsinfo (2),
 .BR readlink (2),
 .BR utime (2),
 .BR capabilities (7),
diff --git a/man2/statx.2 b/man2/statx.2
index edac9f6f4..9a57c1b90 100644
--- a/man2/statx.2
+++ b/man2/statx.2
@@ -534,12 +534,25 @@ Glibc does not (yet) provide a wrapper for the
 .BR statx ()
 system call; call it using
 .BR syscall (2).
+.PP
+The sets of mask/stx_mask and stx_attributes bits supported by a filesystem
+can be determined by calling
+.IR fsinfo (2)
+with attribute
+.IR fsinfo_attr_supports .
+.PP
+Which timestamps are supported by a filesystem and their the ranges and
+granularities can also be determined by calling
+.IR fsinfo (2)
+with attribute
+.IR fsinfo_attr_timestamp_info .
 .SH SEE ALSO
 .BR ls (1),
 .BR stat (1),
 .BR access (2),
 .BR chmod (2),
 .BR chown (2),
+.BR fsinfo (2),
 .BR readlink (2),
 .BR stat (2),
 .BR utime (2),
diff --git a/man2/utime.2 b/man2/utime.2
index 03a43a416..c6acdbac2 100644
--- a/man2/utime.2
+++ b/man2/utime.2
@@ -181,9 +181,16 @@ on an append-only file.
 .\" is just a wrapper for
 .\" .BR utime ()
 .\" and hence does not allow a subsecond resolution.
+.PP
+Which timestamps are supported by a filesystem and their the ranges and
+granularities can be determined by calling
+.IR fsinfo (2)
+with attribute
+.IR fsinfo_attr_timestamp_info .
 .SH SEE ALSO
 .BR chattr (1),
 .BR touch (1),
+.BR fsinfo (2),
 .BR futimesat (2),
 .BR stat (2),
 .BR utimensat (2),
diff --git a/man2/utimensat.2 b/man2/utimensat.2
index d61b43e96..be8925548 100644
--- a/man2/utimensat.2
+++ b/man2/utimensat.2
@@ -633,9 +633,16 @@ instead checks whether the
 .\" conversely, a process with a read-only file descriptor won't
 .\" be able to update the timestamps of a file,
 .\" even if it has write permission on the file.
+.PP
+Which timestamps are supported by a filesystem and their the ranges and
+granularities can be determined by calling
+.IR fsinfo (2)
+with attribute
+.IR fsinfo_attr_timestamp_info .
 .SH SEE ALSO
 .BR chattr (1),
 .BR touch (1),
+.BR fsinfo (2),
 .BR futimesat (2),
 .BR openat (2),
 .BR stat (2),

^ permalink raw reply related	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-10 22:44 ` [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation " David Howells
@ 2018-07-10 23:59   ` Andy Lutomirski
  2018-07-11  1:05     ` Linus Torvalds
                       ` (2 more replies)
  2018-07-11  7:22   ` David Howells
                     ` (3 subsequent siblings)
  4 siblings, 3 replies; 75+ messages in thread
From: Andy Lutomirski @ 2018-07-10 23:59 UTC (permalink / raw)
  To: David Howells
  Cc: viro, linux-api, linux-fsdevel, torvalds, linux-kernel, jannh

[cc Jann - you love this stuff]

> On Jul 10, 2018, at 3:44 PM, David Howells <dhowells@redhat.com> wrote:
> 
> Provide an fsopen() system call that starts the process of preparing to
> create a superblock that will then be mountable, using an fd as a context
> handle.  fsopen() is given the name of the filesystem that will be used:
> 
>    int mfd = fsopen(const char *fsname, unsigned int flags);

This is great in principle, but I think you’re seriously playing with fire with the API. 

> 
> where flags can be 0 or FSOPEN_CLOEXEC.
> 
> For example:
> 
>    sfd = fsopen("ext4", FSOPEN_CLOEXEC);
>    write(sfd, "s /dev/sdb1"); // note I'm ignoring write's length arg

Imagine some malicious program passes sfd as stdout to a setuid program. That program gets persuaded to write “s /etc/shadow”.  What happens?  You’re okay as long as *every single fs* gets it right, but that’s asking a lot.

>    write(sfd, "o noatime");
>    write(sfd, "o acl");
>    write(sfd, "o user_attr");
>    write(sfd, "o iversion");
>    write(sfd, "o ");
>    write(sfd, "r /my/container"); // root inside the fs
>    write(sfd, "x create"); // create the superblock

From cursory inspection of a bunch of the code, I think the expectation is that the actual device access happens in the “x” action. This is not okay. You can’t do this kind of thing in a write() handler, unless you somehow make every single access using f_cred, which is a real pain.

I think the right solution is one of:

(a) Pass a netlink-formatted blob to fsopen() and do the whole thing in one syscall. I don’t mean using netlink sockets — just the nlattr format.  Or you could use a different format. The part that matters is using just one syscall to do the whole thing.

(b) Keep the current structure but use a new syscall instead of write().

(c) Keep using write() but literally just buffer the data. Then have a new syscall to commit it.  In other words, replace “x” with a syscall and call all the fs_context_operations helpers in that context instead of from write().

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-10 23:59   ` Andy Lutomirski
@ 2018-07-11  1:05     ` Linus Torvalds
  2018-07-11  1:15       ` Al Viro
  2018-07-11  1:14     ` Jann Horn
  2018-07-11  8:42     ` David Howells
  2 siblings, 1 reply; 75+ messages in thread
From: Linus Torvalds @ 2018-07-11  1:05 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: David Howells, Al Viro, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

Yeah, Andy is right that we should *not* make "write()" have side effects.

Use it to queue things by all means, but not "do" things. Not unless
there's a very sane security model.

On Tue, Jul 10, 2018 at 4:59 PM Andy Lutomirski <luto@amacapital.net> wrote:
>
> I think the right solution is one of:
>
> (a) Pass a netlink-formatted blob to fsopen() and do the whole thing in one syscall. I don’t mean using netlink sockets — just the nlattr format.  Or you could use a different format. The part that matters is using just one syscall to do the whole thing.

Please no. Not another nasty marshalling thing.

> (b) Keep the current structure but use a new syscall instead of write().
>
> (c) Keep using write() but literally just buffer the data. Then have a new syscall to commit it.  In other words, replace “x” with a syscall and call all the fs_context_operations helpers in that context instead of from write().

But yeah, b-or-c sounds fine.

               Linus

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-10 23:59   ` Andy Lutomirski
  2018-07-11  1:05     ` Linus Torvalds
@ 2018-07-11  1:14     ` Jann Horn
  2018-07-11  1:16       ` Al Viro
  2018-07-11  8:42     ` David Howells
  2 siblings, 1 reply; 75+ messages in thread
From: Jann Horn @ 2018-07-11  1:14 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: David Howells, Al Viro, Linux API, linux-fsdevel, Linus Torvalds,
	kernel list

On Tue, Jul 10, 2018 at 4:59 PM Andy Lutomirski <luto@amacapital.net> wrote:
>
> [cc Jann - you love this stuff]
>
> > On Jul 10, 2018, at 3:44 PM, David Howells <dhowells@redhat.com> wrote:
> >
> > Provide an fsopen() system call that starts the process of preparing to
> > create a superblock that will then be mountable, using an fd as a context
> > handle.  fsopen() is given the name of the filesystem that will be used:
> >
> >    int mfd = fsopen(const char *fsname, unsigned int flags);
>
> This is great in principle, but I think you’re seriously playing with fire with the API.
>
> >
> > where flags can be 0 or FSOPEN_CLOEXEC.
> >
> > For example:
> >
> >    sfd = fsopen("ext4", FSOPEN_CLOEXEC);
> >    write(sfd, "s /dev/sdb1"); // note I'm ignoring write's length arg
>
> Imagine some malicious program passes sfd as stdout to a setuid program. That program gets persuaded to write “s /etc/shadow”.  What happens?  You’re okay as long as *every single fs* gets it right, but that’s asking a lot.
>
> >    write(sfd, "o noatime");
> >    write(sfd, "o acl");
> >    write(sfd, "o user_attr");
> >    write(sfd, "o iversion");
> >    write(sfd, "o ");
> >    write(sfd, "r /my/container"); // root inside the fs
> >    write(sfd, "x create"); // create the superblock
>
> From cursory inspection of a bunch of the code, I think the expectation is that the actual device access happens in the “x” action. This is not okay. You can’t do this kind of thing in a write() handler, unless you somehow make every single access using f_cred, which is a real pain.
>
> I think the right solution is one of:
>
> (a) Pass a netlink-formatted blob to fsopen() and do the whole thing in one syscall. I don’t mean using netlink sockets — just the nlattr format.  Or you could use a different format. The part that matters is using just one syscall to do the whole thing.
>
> (b) Keep the current structure but use a new syscall instead of write().
>
> (c) Keep using write() but literally just buffer the data. Then have a new syscall to commit it.  In other words, replace “x” with a syscall and call all the fs_context_operations helpers in that context instead of from write().

I also love ioctls, so I think you could also use an ioctl to do the
commit? You can do anything (well, almost anything) that you can do in
syscall context in ioctl context, too; and when you already have a
file descriptor of a specific type that you want to perform an
operation on, an ioctl works just fine.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-11  1:05     ` Linus Torvalds
@ 2018-07-11  1:15       ` Al Viro
  2018-07-11  1:33         ` Andy Lutomirski
                           ` (2 more replies)
  0 siblings, 3 replies; 75+ messages in thread
From: Al Viro @ 2018-07-11  1:15 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andy Lutomirski, David Howells, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

On Tue, Jul 10, 2018 at 06:05:49PM -0700, Linus Torvalds wrote:
> Yeah, Andy is right that we should *not* make "write()" have side effects.
> 
> Use it to queue things by all means, but not "do" things. Not unless
> there's a very sane security model.
> 
> On Tue, Jul 10, 2018 at 4:59 PM Andy Lutomirski <luto@amacapital.net> wrote:
> >
> > I think the right solution is one of:
> >
> > (a) Pass a netlink-formatted blob to fsopen() and do the whole thing in one syscall. I don’t mean using netlink sockets — just the nlattr format.  Or you could use a different format. The part that matters is using just one syscall to do the whole thing.
> 
> Please no. Not another nasty marshalling thing.
> 
> > (b) Keep the current structure but use a new syscall instead of write().
> >
> > (c) Keep using write() but literally just buffer the data. Then have a new syscall to commit it.  In other words, replace “x” with a syscall and call all the fs_context_operations helpers in that context instead of from write().
> 
> But yeah, b-or-c sounds fine.

Umm...  How about "use credentials of opener for everything"?

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-11  1:14     ` Jann Horn
@ 2018-07-11  1:16       ` Al Viro
  0 siblings, 0 replies; 75+ messages in thread
From: Al Viro @ 2018-07-11  1:16 UTC (permalink / raw)
  To: Jann Horn
  Cc: Andy Lutomirski, David Howells, Linux API, linux-fsdevel,
	Linus Torvalds, kernel list

On Tue, Jul 10, 2018 at 06:14:10PM -0700, Jann Horn wrote:

> I also love ioctls, so I think you could also use an ioctl to do the
> commit? You can do anything (well, almost anything) that you can do in
> syscall context in ioctl context, too; and when you already have a
> file descriptor of a specific type that you want to perform an
> operation on, an ioctl works just fine.

Poe's Law in action...

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-11  1:15       ` Al Viro
@ 2018-07-11  1:33         ` Andy Lutomirski
  2018-07-11  1:48         ` Linus Torvalds
  2018-07-11  8:43         ` David Howells
  2 siblings, 0 replies; 75+ messages in thread
From: Andy Lutomirski @ 2018-07-11  1:33 UTC (permalink / raw)
  To: Al Viro
  Cc: Linus Torvalds, David Howells, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

On Tue, Jul 10, 2018 at 6:15 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Tue, Jul 10, 2018 at 06:05:49PM -0700, Linus Torvalds wrote:
>> Yeah, Andy is right that we should *not* make "write()" have side effects.
>>
>> Use it to queue things by all means, but not "do" things. Not unless
>> there's a very sane security model.
>>
>> On Tue, Jul 10, 2018 at 4:59 PM Andy Lutomirski <luto@amacapital.net> wrote:
>> >
>> > I think the right solution is one of:
>> >
>> > (a) Pass a netlink-formatted blob to fsopen() and do the whole thing in one syscall. I don’t mean using netlink sockets — just the nlattr format.  Or you could use a different format. The part that matters is using just one syscall to do the whole thing.
>>
>> Please no. Not another nasty marshalling thing.
>>
>> > (b) Keep the current structure but use a new syscall instead of write().
>> >
>> > (c) Keep using write() but literally just buffer the data. Then have a new syscall to commit it.  In other words, replace “x” with a syscall and call all the fs_context_operations helpers in that context instead of from write().
>>
>> But yeah, b-or-c sounds fine.
>
> Umm...  How about "use credentials of opener for everything"?

If you want to audit every single filesystem for any code that uses
credentials for anything and add all the right kernel APIs and make
sure the filesystem uses them and somehow keep screwups from getting
added down the line, then okay I guess.  As far as I know, we don't
even *have* an API for "open this device node using this struct cred
*".

I kind of want to add a hack to set some poison bit in current->cred
in sys_write() and clear it on the way out.  Sigh.

--Andy

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-11  1:15       ` Al Viro
  2018-07-11  1:33         ` Andy Lutomirski
@ 2018-07-11  1:48         ` Linus Torvalds
  2018-07-11  8:43         ` David Howells
  2 siblings, 0 replies; 75+ messages in thread
From: Linus Torvalds @ 2018-07-11  1:48 UTC (permalink / raw)
  To: Al Viro
  Cc: Andy Lutomirski, David Howells, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

On Tue, Jul 10, 2018 at 6:15 PM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> Umm...  How about "use credentials of opener for everything"?

yeah, we have that for writes in general.

Nobody ever actually follows that rule. They may *think* they do, and
then they call to some helper that does "capability(CAP_SYS_WHATEVAH)"
without even realizing it.

But I'm certainly ok with writes, if it's just filling a buffer.
Preferably a standard buffer we already have, like a seqfile or pipe
(hey, splice!) or whatever.

And then you have that final op to actually "commit" the state. Which
shouldn't be a write (and not the close).

           Linus

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-10 22:44 ` [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation " David Howells
  2018-07-10 23:59   ` Andy Lutomirski
@ 2018-07-11  7:22   ` David Howells
  2018-07-11 16:38     ` Eric Biggers
                       ` (2 more replies)
  2018-07-11 15:51   ` Jonathan Corbet
                     ` (2 subsequent siblings)
  4 siblings, 3 replies; 75+ messages in thread
From: David Howells @ 2018-07-11  7:22 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: dhowells, viro, linux-api, linux-fsdevel, torvalds, linux-kernel, jannh

Andy Lutomirski <luto@amacapital.net> wrote:

> >    sfd = fsopen("ext4", FSOPEN_CLOEXEC);
> >    write(sfd, "s /dev/sdb1"); // note I'm ignoring write's length arg
> 
> Imagine some malicious program passes sfd as stdout to a setuid
> program. That program gets persuaded to write "s /etc/shadow".  What
> happens?  You’re okay as long as *every single fs* gets it right, but that’s
> asking a lot.

Do note that you must already have CAP_SYS_ADMIN to be able to call fsopen().

David

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-10 23:59   ` Andy Lutomirski
  2018-07-11  1:05     ` Linus Torvalds
  2018-07-11  1:14     ` Jann Horn
@ 2018-07-11  8:42     ` David Howells
  2018-07-11 16:03       ` Linus Torvalds
  2 siblings, 1 reply; 75+ messages in thread
From: David Howells @ 2018-07-11  8:42 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: dhowells, Andy Lutomirski, Al Viro, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

Linus Torvalds <torvalds@linux-foundation.org> wrote:

> Yeah, Andy is right that we should *not* make "write()" have side effects.

Note that write() has side effects all over the place: procfs, sysfs, debugfs,
tracefs, ...  Though for the most part they're single-shot jobs and not
cumulative (I'm not sure this is always true for debugfs - there's a lot of
weird stuff in there).

> > (b) Keep the current structure but use a new syscall instead of write().
> >
> > (c) Keep using write() but literally just buffer the data. Then have a new
> > syscall to commit it.  In other words, replace “x” with a syscall and call
> > all the fs_context_operations helpers in that context instead of from
> > write().
> 
> But yeah, b-or-c sounds fine.

I would prefer to avoid the "let's buffer everything" but rather parse the
data as we go along.  What I currently do is store the parsed data in the
context and only actually *apply* it when someone sends the 'x' command.

There are two reasons for this:

 (1) mount()'s error handling is slight: it can only return an error code, but
     creating and mounting something has so many different and interesting
     ways of going wrong and I want to be able to give better error reporting.

     This gets more interesting if it happens inside a container where you
     can't see dmesg.

 (2) Parsing the data means you only need to store the result of the parse and
     can reject anything that's unknown or contradictory.

     Buffering till the end means you have to buffer *everything* - and,
     unless you limit your buffer, you risk running out of RAM.

Now, I can replace the 'x' command with an ioctl() so that just writing random
rubbish to the fd won't cause anything to actually happen.

	fd = fsopen("ext4");
	write(fd, "s /dev/sda1");
	write(fd, "o user_xattr");
	ioctl(fd, FSOPEN_IOC_CREATE_SB, 0);

or I could make a special syscall for it:

	fscommit(fd, FSCOMMIT_CREATE);

or:

	fscommit(fd, FSCOMMIT_RECONFIGURE);

and require that you have CAP_SYS_ADMIN to enact it.

David

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-11  1:15       ` Al Viro
  2018-07-11  1:33         ` Andy Lutomirski
  2018-07-11  1:48         ` Linus Torvalds
@ 2018-07-11  8:43         ` David Howells
  2 siblings, 0 replies; 75+ messages in thread
From: David Howells @ 2018-07-11  8:43 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: dhowells, Al Viro, Linus Torvalds, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

Andy Lutomirski <luto@kernel.org> wrote:

> > Umm...  How about "use credentials of opener for everything"?
> 
> If you want to audit every single filesystem for any code that uses
> credentials for anything and add all the right kernel APIs and make
> sure the filesystem uses them and somehow keep screwups from getting
> added down the line, then okay I guess.  As far as I know, we don't
> even *have* an API for "open this device node using this struct cred
> *".

You can use override_creds() too.

David

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-10 22:44 ` [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation " David Howells
  2018-07-10 23:59   ` Andy Lutomirski
  2018-07-11  7:22   ` David Howells
@ 2018-07-11 15:51   ` Jonathan Corbet
  2018-07-11 16:18   ` David Howells
  2018-07-12 17:15   ` Greg KH
  4 siblings, 0 replies; 75+ messages in thread
From: Jonathan Corbet @ 2018-07-11 15:51 UTC (permalink / raw)
  To: David Howells; +Cc: viro, linux-api, linux-fsdevel, torvalds, linux-kernel

On Tue, 10 Jul 2018 23:44:09 +0100
David Howells <dhowells@redhat.com> wrote:

> 	sfd = fsopen("ext4", FSOPEN_CLOEXEC);
> 	write(sfd, "s /dev/sdb1"); // note I'm ignoring write's length arg
> 	write(sfd, "o noatime");
> 	write(sfd, "o acl");
> 	write(sfd, "o user_attr");
> 	write(sfd, "o iversion");
> 	write(sfd, "o ");
> 	write(sfd, "r /my/container"); // root inside the fs
> 	write(sfd, "x create"); // create the superblock

A minor detail but ... the "r" operation mentioned above is not actually
implemented in this system call.

jon

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-11  8:42     ` David Howells
@ 2018-07-11 16:03       ` Linus Torvalds
  0 siblings, 0 replies; 75+ messages in thread
From: Linus Torvalds @ 2018-07-11 16:03 UTC (permalink / raw)
  To: David Howells
  Cc: Andy Lutomirski, Al Viro, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

On Wed, Jul 11, 2018 at 1:42 AM David Howells <dhowells@redhat.com> wrote:
>
>      Buffering till the end means you have to buffer *everything* - and,
>      unless you limit your buffer, you risk running out of RAM

Do we really care?

Can't we limit the buffer size to something small?

Right now, the mount options can't be bigger than a page anyway. Why
would we want to extend on that?

Btw, the magic word here is "why". I really really want to see a
fairly exhaustive explanation of why this all is such a big deal, and
exactly what limitations (including perhaps the mount option buffer
size) are such a pain right now and need changing.

> Now, I can replace the 'x' command with an ioctl() so that just writing random
> rubbish to the fd won't cause anything to actually happen.
>
>         fd = fsopen("ext4");
>         write(fd, "s /dev/sda1");
>         write(fd, "o user_xattr");
>         ioctl(fd, FSOPEN_IOC_CREATE_SB, 0);
>
> or I could make a special syscall for it:
>
>         fscommit(fd, FSCOMMIT_CREATE);
>
> or:
>
>         fscommit(fd, FSCOMMIT_RECONFIGURE);
>
> and require that you have CAP_SYS_ADMIN to enact it.

I think any of them sound fairly ok, with that whole "we need reasons" caveat.

               Linus

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-10 22:44 ` [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation " David Howells
                     ` (2 preceding siblings ...)
  2018-07-11 15:51   ` Jonathan Corbet
@ 2018-07-11 16:18   ` David Howells
  2018-07-12 17:15   ` Greg KH
  4 siblings, 0 replies; 75+ messages in thread
From: David Howells @ 2018-07-11 16:18 UTC (permalink / raw)
  To: Jonathan Corbet
  Cc: dhowells, viro, linux-api, linux-fsdevel, torvalds, linux-kernel

Jonathan Corbet <corbet@lwn.net> wrote:

> A minor detail but ... the "r" operation mentioned above is not actually
> implemented in this system call.

Yeah, that's something I'd like to add.  NFS4 already does this inside its
->mount() method, so my thought is that we might be able to move this from
there to the core.

David

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-11  7:22   ` David Howells
@ 2018-07-11 16:38     ` Eric Biggers
  2018-07-11 17:06     ` Andy Lutomirski
  2018-07-12 14:54     ` David Howells
  2 siblings, 0 replies; 75+ messages in thread
From: Eric Biggers @ 2018-07-11 16:38 UTC (permalink / raw)
  To: David Howells
  Cc: Andy Lutomirski, viro, linux-api, linux-fsdevel, torvalds,
	linux-kernel, jannh

On Wed, Jul 11, 2018 at 08:22:41AM +0100, David Howells wrote:
> Andy Lutomirski <luto@amacapital.net> wrote:
> 
> > >    sfd = fsopen("ext4", FSOPEN_CLOEXEC);
> > >    write(sfd, "s /dev/sdb1"); // note I'm ignoring write's length arg
> > 
> > Imagine some malicious program passes sfd as stdout to a setuid
> > program. That program gets persuaded to write "s /etc/shadow".  What
> > happens?  You’re okay as long as *every single fs* gets it right, but that’s
> > asking a lot.
> 
> Do note that you must already have CAP_SYS_ADMIN to be able to call fsopen().
> 
> David

Not really, by default an unprivileged user can still do:

	unshare(CLONE_NEWUSER|CLONE_NEWNS);
	syscall(__NR_fsopen, "ext4", 0);

- Eric

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-11  7:22   ` David Howells
  2018-07-11 16:38     ` Eric Biggers
@ 2018-07-11 17:06     ` Andy Lutomirski
  2018-07-12 14:54     ` David Howells
  2 siblings, 0 replies; 75+ messages in thread
From: Andy Lutomirski @ 2018-07-11 17:06 UTC (permalink / raw)
  To: David Howells
  Cc: Al Viro, Linux API, Linux FS Devel, Linus Torvalds, LKML, Jann Horn

> On Jul 11, 2018, at 12:22 AM, David Howells <dhowells@redhat.com> wrote:
>
> Andy Lutomirski <luto@amacapital.net> wrote:
>
>>>   sfd = fsopen("ext4", FSOPEN_CLOEXEC);
>>>   write(sfd, "s /dev/sdb1"); // note I'm ignoring write's length arg
>>
>> Imagine some malicious program passes sfd as stdout to a setuid
>> program. That program gets persuaded to write "s /etc/shadow".  What
>> happens?  You’re okay as long as *every single fs* gets it right, but that’s
>> asking a lot.
>
> Do note that you must already have CAP_SYS_ADMIN to be able to call fsopen().

If you’re not allowing it already, someone will want user namespace
root to be able to use this very, very soon.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-11  7:22   ` David Howells
  2018-07-11 16:38     ` Eric Biggers
  2018-07-11 17:06     ` Andy Lutomirski
@ 2018-07-12 14:54     ` David Howells
  2018-07-12 15:50       ` Linus Torvalds
                         ` (3 more replies)
  2 siblings, 4 replies; 75+ messages in thread
From: David Howells @ 2018-07-12 14:54 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: dhowells, Al Viro, Linux API, Linux FS Devel, Linus Torvalds,
	LKML, Jann Horn

Andy Lutomirski <luto@kernel.org> wrote:

> > On Jul 11, 2018, at 12:22 AM, David Howells <dhowells@redhat.com> wrote:
> >
> > Andy Lutomirski <luto@amacapital.net> wrote:
> >
> >>>   sfd = fsopen("ext4", FSOPEN_CLOEXEC);
> >>>   write(sfd, "s /dev/sdb1"); // note I'm ignoring write's length arg
> >>
> >> Imagine some malicious program passes sfd as stdout to a setuid
> >> program. That program gets persuaded to write "s /etc/shadow".  What
> >> happens?  You’re okay as long as *every single fs* gets it right, but
> >> that’s asking a lot.
> >
> > Do note that you must already have CAP_SYS_ADMIN to be able to call
> > fsopen().
> 
> If you're not allowing it already, someone will want user namespace
> root to be able to use this very, very soon.

Yeah, I'm sure.  And I've been thinking on how to deal with it.

I think we *have* to open the source files/devices with the creds of whoever
called fsopen() or fspick() - that way you can't upgrade your privs by passing
your context fd to a suid program.  To enforce this, I think it's simplest for
fscontext_write() to call override_creds() right after taking the uapi_mutex
and then call revert_creds() right before dropping the mutex.

Another thing we might want to look at is to allow a supervisory process to
examine the context before permitting the create/reconfigure action to
proceed.  It might also be possible to do this through the LSM.

David

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 14:54     ` David Howells
@ 2018-07-12 15:50       ` Linus Torvalds
  2018-07-12 16:00         ` Al Viro
  2018-07-12 16:23       ` Andy Lutomirski
                         ` (2 subsequent siblings)
  3 siblings, 1 reply; 75+ messages in thread
From: Linus Torvalds @ 2018-07-12 15:50 UTC (permalink / raw)
  To: David Howells
  Cc: Andrew Lutomirski, Al Viro, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

On Thu, Jul 12, 2018 at 7:54 AM David Howells <dhowells@redhat.com> wrote:
>
> I think we *have* to open the source files/devices with the creds of whoever
> called fsopen() or fspick() - that way you can't upgrade your privs by passing
> your context fd to a suid program.  To enforce this, I think it's simplest for
> fscontext_write() to call override_creds() right after taking the uapi_mutex
> and then call revert_creds() right before dropping the mutex.

No.

Don't play games with override_creds. It's wrong.

You have to use file->f_creds - no games, no garbage.

But "write()" simply is *NOT* a good "command" interface. If you want
to send a command, use an ioctl or a system call.

Because it's not just about credentials. It's not just about fooling a
suid app into writing an error message to a descriptor you wrote. It's
also about things like "splice()", which can write to your target
using a kernel buffer, and thus trick you into doing a command while
we have the context set to kernel addresses.

Are we trying to get away from that issue? Yes. But it's just another
example of why "write()" IS NOT TO BE USED FOR COMMANDS.

Only use write() for data.

That's final. We're not adding yet another clueless fuck-up of an
interface just because people cannot understand this very simple rule:
"write()" is for data, not for commands.

No more excuses.

                 Linus

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 15:50       ` Linus Torvalds
@ 2018-07-12 16:00         ` Al Viro
  2018-07-12 16:07           ` Linus Torvalds
  0 siblings, 1 reply; 75+ messages in thread
From: Al Viro @ 2018-07-12 16:00 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Howells, Andrew Lutomirski, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

On Thu, Jul 12, 2018 at 08:50:46AM -0700, Linus Torvalds wrote:

> But "write()" simply is *NOT* a good "command" interface. If you want
> to send a command, use an ioctl or a system call.
> 
> Because it's not just about credentials. It's not just about fooling a
> suid app into writing an error message to a descriptor you wrote. It's
> also about things like "splice()", which can write to your target
> using a kernel buffer, and thus trick you into doing a command while
> we have the context set to kernel addresses.

Wait a sec - that's only a problem if your command contains pointer-chasing
et.al.  Which is why e.g. /dev/sg is fucked in head.  But for something that
is plain text, what's the problem with splice/write/sendmsg/whatever?

I'm not talking about this particular interface, but "write is bad for
commands" as general policy looks missing the point.  If anything, it's
pointer-chasing crap that should be banned everywhere.  Just look at SG_IO -
it's a ioctl, and it's absolute garbage...

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 16:00         ` Al Viro
@ 2018-07-12 16:07           ` Linus Torvalds
  2018-07-12 16:31             ` Al Viro
  0 siblings, 1 reply; 75+ messages in thread
From: Linus Torvalds @ 2018-07-12 16:07 UTC (permalink / raw)
  To: Al Viro
  Cc: David Howells, Andrew Lutomirski, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

On Thu, Jul 12, 2018 at 9:00 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> Wait a sec - that's only a problem if your command contains pointer-chasing
> et.al.

No.

It's a problem if anybody ever does something like "let's have a
helper splice thread that uses splice to move data automatically from
one buffer to another".

And yes, it's something people have wanted.

Seriously. I'm putting my foot down. NO COMMANDS IN WRITE DATA!

We have made that mistake in the past. Having done stupid things in
the past is not an excuse for doing so again. Quite the reverse.
Making the same mistake and not learning from your mistakes is the
sign of stupidity.

So I repeat: write is for data. If you want an action, you do it with
ioctl, or you do it with a system call.

              Linus

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 14:54     ` David Howells
  2018-07-12 15:50       ` Linus Torvalds
@ 2018-07-12 16:23       ` Andy Lutomirski
  2018-07-12 16:31         ` Linus Torvalds
                           ` (2 more replies)
  2018-07-12 20:23       ` David Howells
  2018-07-12 21:00       ` David Howells
  3 siblings, 3 replies; 75+ messages in thread
From: Andy Lutomirski @ 2018-07-12 16:23 UTC (permalink / raw)
  To: David Howells
  Cc: Andy Lutomirski, Al Viro, Linux API, Linux FS Devel,
	Linus Torvalds, LKML, Jann Horn, tycho



> On Jul 12, 2018, at 7:54 AM, David Howells <dhowells@redhat.com> wrote:
> 
> Andy Lutomirski <luto@kernel.org> wrote:
> 
>>> On Jul 11, 2018, at 12:22 AM, David Howells <dhowells@redhat.com> wrote:
>>> 
>>> Andy Lutomirski <luto@amacapital.net> wrote:
>>> 
>>>>>  sfd = fsopen("ext4", FSOPEN_CLOEXEC);
>>>>>  write(sfd, "s /dev/sdb1"); // note I'm ignoring write's length arg
>>>> 
>>>> Imagine some malicious program passes sfd as stdout to a setuid
>>>> program. That program gets persuaded to write "s /etc/shadow".  What
>>>> happens?  You’re okay as long as *every single fs* gets it right, but
>>>> that’s asking a lot.
>>> 
>>> Do note that you must already have CAP_SYS_ADMIN to be able to call
>>> fsopen().
>> 
>> If you're not allowing it already, someone will want user namespace
>> root to be able to use this very, very soon.
> 
> Yeah, I'm sure.  And I've been thinking on how to deal with it.
> 
> I think we *have* to open the source files/devices with the creds of whoever
> called fsopen() or fspick() - that way you can't upgrade your privs by passing
> your context fd to a suid program.  To enforce this, I think it's simplest for
> fscontext_write() to call override_creds() right after taking the uapi_mutex
> and then call revert_creds() right before dropping the mutex.
> 

If you make a syscall that attaches a block device to an fscontext, you don’t need any of this.  Heck, someone might actually *want* to grab a block device from a different namespace.

All this override_creds() stuff is maybe okay if we were fixing an old broken thing. But this is brand new.  And having write() call override_creds() and do nontrivial things is a fascinating attack surface.

Just imagine what blows up if I abuse fscontext to open a block device on a path that traverses an AFS mount or /proc/.../fd or similar.  Or if I splice() from a network filesystem into fscontext.

(Al- can’t we just stop allowing splice() at all on things that don’t use iov_iter?)

> Another thing we might want to look at is to allow a supervisory process to
> examine the context before permitting the create/reconfigure action to
> proceed.  It might also be possible to do this through the LSM.

Cc Tycho. He’s working on this exact idea using seccomp. And he’d probably much, much prefer if configuration of an fscontext didn’t use a performance-critical syscall like write().

As a straw man, I suggest:

fsconfigure(contextfd, ADD_BLOCKDEV, dfd, path, flags);

fsconfigure(contextfd, ADD_OPTION, 0, “foo=bar”, flags);

Etc.  

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 16:07           ` Linus Torvalds
@ 2018-07-12 16:31             ` Al Viro
  2018-07-12 16:39               ` Linus Torvalds
  0 siblings, 1 reply; 75+ messages in thread
From: Al Viro @ 2018-07-12 16:31 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Howells, Andrew Lutomirski, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

On Thu, Jul 12, 2018 at 09:07:36AM -0700, Linus Torvalds wrote:
> On Thu, Jul 12, 2018 at 9:00 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > Wait a sec - that's only a problem if your command contains pointer-chasing
> > et.al.
> 
> No.
> 
> It's a problem if anybody ever does something like "let's have a
> helper splice thread that uses splice to move data automatically from
> one buffer to another".
>
> And yes, it's something people have wanted.
> 
> Seriously. I'm putting my foot down. NO COMMANDS IN WRITE DATA!
> 
> We have made that mistake in the past. Having done stupid things in
> the past is not an excuse for doing so again. Quite the reverse.
> Making the same mistake and not learning from your mistakes is the
> sign of stupidity.
> 
> So I repeat: write is for data. If you want an action, you do it with
> ioctl, or you do it with a system call.

*shrug*

I think you are wrong[1], but it's your decision.  And seriously, ioctl?
_That_ has a great track record...

[1] one man's data is another man's commands, for starters.  All networking
protocols would fit your description.  So would ANSI escape sequences ("move
cursor to line 12 column 45" does sound like a command), so would writing
postscript to printer, etc.

IME it's more about data structures that are not marshalled cleanly - that
tends to go badly wrong.  Again, see SG_IO for recent example...

Anyway, your tree, your policy.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 16:23       ` Andy Lutomirski
@ 2018-07-12 16:31         ` Linus Torvalds
  2018-07-12 16:41         ` Al Viro
  2018-07-12 16:58         ` Al Viro
  2 siblings, 0 replies; 75+ messages in thread
From: Linus Torvalds @ 2018-07-12 16:31 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: David Howells, Andrew Lutomirski, Al Viro, Linux API,
	linux-fsdevel, Linux Kernel Mailing List, Jann Horn,
	Tycho Andersen

On Thu, Jul 12, 2018 at 9:23 AM Andy Lutomirski <luto@amacapital.net> wrote:
>
> (Al- can’t we just stop allowing splice() at all on things that don’t use iov_iter?)

We could add a FMODE_SPLICE_READ/WRITE bit, and let people opt in to
splice. We probably should have.

But again, that really doesn't change the fundamentals.  Using write()
for commands is stupid.

It also means that you have to _parse_ all the damn input at that
level, which is a mistake too. It easily leads to insane decisions
like "you have to use 'write()' calls without buffering", because
re-buffering the stream is a f*cking pain.

Just say no. Seriously. Stop this idiotic discussion.

I'm just happy this came up early, because that way I know to look out
for it and not merge it.

                 Linus

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 16:31             ` Al Viro
@ 2018-07-12 16:39               ` Linus Torvalds
  2018-07-12 17:14                 ` Linus Torvalds
  2018-07-12 17:52                 ` Al Viro
  0 siblings, 2 replies; 75+ messages in thread
From: Linus Torvalds @ 2018-07-12 16:39 UTC (permalink / raw)
  To: Al Viro
  Cc: David Howells, Andrew Lutomirski, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

On Thu, Jul 12, 2018 at 9:31 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> And seriously, ioctl? _That_ has a great track record...

I agree that a system call is likely saner. Especially since we'd have
one to _start_ this (ie "fsopen()") it would make sense to have the
one to finalize it.

> [1] one man's data is another man's commands, for starters.  All networking
> protocols would fit your description.  So would ANSI escape sequences ("move
> cursor to line 12 column 45" does sound like a command), so would writing
> postscript to printer, etc.

.. and all of that is just data to the kernel.

Yes, vt100 escape sequences etc _are_ commands, and boy have we had
bugs in that area. But there the excuse is "that's how the world is".

The thing is, "reality" is the ultimate argument. You can't argue with
cold hard facts.

But when designing a new interface that doesn't have that kind of
constraints, do it right.

> IME it's more about data structures that are not marshalled cleanly - that
> tends to go badly wrong.  Again, see SG_IO for recent example...

SG_IO actually gets it right. It doesn't do async, but that's part of
the design (and a big part of why it's a lot simpler - the read-write
thing is actually broken too and just forces user space to basically
know SCSI).

              Linus

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 16:23       ` Andy Lutomirski
  2018-07-12 16:31         ` Linus Torvalds
@ 2018-07-12 16:41         ` Al Viro
  2018-07-12 16:58         ` Al Viro
  2 siblings, 0 replies; 75+ messages in thread
From: Al Viro @ 2018-07-12 16:41 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: David Howells, Andy Lutomirski, Linux API, Linux FS Devel,
	Linus Torvalds, LKML, Jann Horn, tycho

On Thu, Jul 12, 2018 at 09:23:22AM -0700, Andy Lutomirski wrote:

> If you make a syscall that attaches a block device to an fscontext, you don’t need any of this.  Heck, someone might actually *want* to grab a block device from a different namespace.

Fuck, NO.  The whole notion of "block device of filesystem" is fucking
garbage.  It's up to filesystem driver whether it uses any block
devices.  For backing store or otherwise.  Single or multiple.  Moreover,
it's up to filesystem driver whether it cares if backing store is
a block device, or mtd device, or...

Repeat after me: syscall that attaches a block device to an fscontext
makes as much sense as a syscall that attaches a charset name to the
same.  With a special syscall for attaching a timestamp granularity,
and another for selecting GID semantics on subdirectory creation.

Commit vs. write separation is one thing; fuckloads of special syscalls
for passing vaguely defined classes of mount options (which device
name *is*) is quite different.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 16:23       ` Andy Lutomirski
  2018-07-12 16:31         ` Linus Torvalds
  2018-07-12 16:41         ` Al Viro
@ 2018-07-12 16:58         ` Al Viro
  2018-07-12 17:54           ` Andy Lutomirski
  2 siblings, 1 reply; 75+ messages in thread
From: Al Viro @ 2018-07-12 16:58 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: David Howells, Andy Lutomirski, Linux API, Linux FS Devel,
	Linus Torvalds, LKML, Jann Horn, tycho

On Thu, Jul 12, 2018 at 09:23:22AM -0700, Andy Lutomirski wrote:
 
> As a straw man, I suggest:
> 
> fsconfigure(contextfd, ADD_BLOCKDEV, dfd, path, flags);
> 
> fsconfigure(contextfd, ADD_OPTION, 0, “foo=bar”, flags);

Bollocks.  First of all, block device *IS* a fucking option.
Always had been.  It's not even that it's passed as a separate
argument for historical reasons - just look at NFS.  That argument
is a detached part of options, parsed (yes, *parsed*) by filesystem
in question in whatever way it prefers.

Look at the things like e.g. cramfs.  That argument is interpreted
as pathname of block device.  Or that of mtd device.  Or the magic
string "mtd" followed by mtd number.

What's more, filesystems can and do live on more than one device.
Like e.g. btrfs.  Or like something journalled with the journal
on separate device.  So you do *NOT* get away from the need to
open stuff while doing mount - not unless you introduce arseloads
of ADD_... shite in your scheme.  And create a huge centralized
pile of code dealing with it.  ADD_NFS_IPV4_SERVER_AND_PATH, etc.?

You can't avoid parsing stuff.  It's one thing to argue at which
*point* you prefer doing that, but it has to be done kernel-side.
Format of filesystem options is fundamentally up to filesystem,
whichever syscall you use.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 16:39               ` Linus Torvalds
@ 2018-07-12 17:14                 ` Linus Torvalds
  2018-07-12 17:44                   ` Al Viro
  2018-07-12 17:52                 ` Al Viro
  1 sibling, 1 reply; 75+ messages in thread
From: Linus Torvalds @ 2018-07-12 17:14 UTC (permalink / raw)
  To: Al Viro
  Cc: David Howells, Andrew Lutomirski, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

On Thu, Jul 12, 2018 at 9:39 AM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> I agree that a system call is likely saner. Especially since we'd have
> one to _start_ this (ie "fsopen()") it would make sense to have the
> one to finalize it.

Side note: if we can make do with just a buffer, then we wouldn't need
"fsopen()". You could literally just open a pipe, and write to it.
It's got 16 pages worth of buffers by default, and you can increase it
(within reason) as root.

Of course, depending on IO patterns, not all the buffer pages are
necessarily fully used, so it's not like you get a buffer of size
PAGE_SIZE*16, but we do merge buffers so you should be fairly close.

Then you really could do without a fsopen(). Just fill a pipe with
data, and do "fsmount()" on the pipe contents.

Added upside? You can use "iov_iter_pipe()" to iterate over all that data.

I'm only half joking.

            Linus

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-10 22:44 ` [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation " David Howells
                     ` (3 preceding siblings ...)
  2018-07-11 16:18   ` David Howells
@ 2018-07-12 17:15   ` Greg KH
  2018-07-12 17:20     ` Al Viro
  4 siblings, 1 reply; 75+ messages in thread
From: Greg KH @ 2018-07-12 17:15 UTC (permalink / raw)
  To: David Howells; +Cc: viro, linux-api, linux-fsdevel, torvalds, linux-kernel

On Tue, Jul 10, 2018 at 11:44:09PM +0100, David Howells wrote:
> Provide an fsopen() system call that starts the process of preparing to
> create a superblock that will then be mountable, using an fd as a context
> handle.  fsopen() is given the name of the filesystem that will be used:
> 
> 	int mfd = fsopen(const char *fsname, unsigned int flags);
> 
> where flags can be 0 or FSOPEN_CLOEXEC.
> 
> For example:
> 
> 	sfd = fsopen("ext4", FSOPEN_CLOEXEC);
> 	write(sfd, "s /dev/sdb1"); // note I'm ignoring write's length arg
> 	write(sfd, "o noatime");
> 	write(sfd, "o acl");
> 	write(sfd, "o user_attr");
> 	write(sfd, "o iversion");
> 	write(sfd, "o ");
> 	write(sfd, "r /my/container"); // root inside the fs
> 	write(sfd, "x create"); // create the superblock

Ugh, creating configfs again in a syscall form?  I know people love
file descriptors, but can't you do this with a configfs entry instead if
you really want to do this type of thing from userspace in this type of
"style"?

Why reinvent the wheel again?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 17:15   ` Greg KH
@ 2018-07-12 17:20     ` Al Viro
  2018-07-12 18:03       ` Greg KH
  0 siblings, 1 reply; 75+ messages in thread
From: Al Viro @ 2018-07-12 17:20 UTC (permalink / raw)
  To: Greg KH; +Cc: David Howells, linux-api, linux-fsdevel, torvalds, linux-kernel

On Thu, Jul 12, 2018 at 07:15:05PM +0200, Greg KH wrote:
> On Tue, Jul 10, 2018 at 11:44:09PM +0100, David Howells wrote:
> > Provide an fsopen() system call that starts the process of preparing to
> > create a superblock that will then be mountable, using an fd as a context
> > handle.  fsopen() is given the name of the filesystem that will be used:
> > 
> > 	int mfd = fsopen(const char *fsname, unsigned int flags);
> > 
> > where flags can be 0 or FSOPEN_CLOEXEC.
> > 
> > For example:
> > 
> > 	sfd = fsopen("ext4", FSOPEN_CLOEXEC);
> > 	write(sfd, "s /dev/sdb1"); // note I'm ignoring write's length arg
> > 	write(sfd, "o noatime");
> > 	write(sfd, "o acl");
> > 	write(sfd, "o user_attr");
> > 	write(sfd, "o iversion");
> > 	write(sfd, "o ");
> > 	write(sfd, "r /my/container"); // root inside the fs
> > 	write(sfd, "x create"); // create the superblock
> 
> Ugh, creating configfs again in a syscall form?  I know people love
> file descriptors, but can't you do this with a configfs entry instead if
> you really want to do this type of thing from userspace in this type of
> "style"?
> 
> Why reinvent the wheel again?

The damn thing REALLY, REALLY depends upon the fs type.  How would
you map it on configfs?

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 17:14                 ` Linus Torvalds
@ 2018-07-12 17:44                   ` Al Viro
  2018-07-12 17:54                     ` Linus Torvalds
  0 siblings, 1 reply; 75+ messages in thread
From: Al Viro @ 2018-07-12 17:44 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Howells, Andrew Lutomirski, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

On Thu, Jul 12, 2018 at 10:14:05AM -0700, Linus Torvalds wrote:
> On Thu, Jul 12, 2018 at 9:39 AM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> >
> > I agree that a system call is likely saner. Especially since we'd have
> > one to _start_ this (ie "fsopen()") it would make sense to have the
> > one to finalize it.
> 
> Side note: if we can make do with just a buffer, then we wouldn't need
> "fsopen()". You could literally just open a pipe, and write to it.
> It's got 16 pages worth of buffers by default, and you can increase it
> (within reason) as root.
> 
> Of course, depending on IO patterns, not all the buffer pages are
> necessarily fully used, so it's not like you get a buffer of size
> PAGE_SIZE*16, but we do merge buffers so you should be fairly close.
> 
> Then you really could do without a fsopen(). Just fill a pipe with
> data, and do "fsmount()" on the pipe contents.
> 
> Added upside? You can use "iov_iter_pipe()" to iterate over all that data.
> 
> I'm only half joking.

One semi-historical note here.

Originally, mount(2) (and it had been there since v1) had only one filesystem
type to deal with.  So it was really just "mount <block device pathname> on
<mountpoint pathname>, read-only or read-write".  3 arguments, two strings and
one flag (flag, BTW, was a later addition).

It didn't last.  I can dig out the archaeological notes and cut'n'paste the
whole horror story here, but that'll be way too long and scary.

By 4.2BSD times there had been essentially an enum encoding the filesystem
type and type-tagged union of structs with type-dependent options.  Plus
some options taking more bits in what used to be "is it r/w?" flag.

Leaving aside the whole "mount new/bind/remount/etc." overloading we have
in mount(2) today, we have a bunch of named filesystems, each with its
own set of options.  Device name has ceased to be something special for
many decades; the type name is what's universally present and that's what
decides how the rest (including "device name") is to be interpreted.

Fundamentally, we start with selecting (by name) a filesystem driver we'll
be talking to.  The rest (device name + string options + flags like noexec
that are not handled on VFS level) is given to that driver, which either
tells us to take a hike or gives us a dentry tree that can be attached.

Separating type name from everything else makes a lot of sense, simply
because it's what determines the parsing and interpretation of the rest.
Speaking of half-joking, I suggested AF_FSTYPE at some point.  Then
fsopen(2) would be connect(2)...

I think that having that (connection used to talk to fs driver, with or
without an already set up fs instance we are talking about) as first-class
object makes sense.  That's completely unrelated to the question of buffering,
of course.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 16:39               ` Linus Torvalds
  2018-07-12 17:14                 ` Linus Torvalds
@ 2018-07-12 17:52                 ` Al Viro
  1 sibling, 0 replies; 75+ messages in thread
From: Al Viro @ 2018-07-12 17:52 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Howells, Andrew Lutomirski, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

On Thu, Jul 12, 2018 at 09:39:31AM -0700, Linus Torvalds wrote:

> > [1] one man's data is another man's commands, for starters.  All networking
> > protocols would fit your description.  So would ANSI escape sequences ("move
> > cursor to line 12 column 45" does sound like a command), so would writing
> > postscript to printer, etc.
> 
> .. and all of that is just data to the kernel.
> 
> Yes, vt100 escape sequences etc _are_ commands, and boy have we had
> bugs in that area. But there the excuse is "that's how the world is".

... along with "something similar to ncurses-based programs usable over
ssh is a good thing to have, without having said ssh somehow intercept
and marshal ioctls" ;-)  I can just imagine something e.g. RDMA people
would've designed instead... OTOH, I'm eating right now, so better
not go there...

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 17:44                   ` Al Viro
@ 2018-07-12 17:54                     ` Linus Torvalds
  0 siblings, 0 replies; 75+ messages in thread
From: Linus Torvalds @ 2018-07-12 17:54 UTC (permalink / raw)
  To: Al Viro
  Cc: David Howells, Andrew Lutomirski, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

On Thu, Jul 12, 2018 at 10:44 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> Separating type name from everything else makes a lot of sense

I do not dispute that at all.

But you can specify the type name in the "commit" phase, it doesn't
have to be at "fsopen" time.

In fact, doing so would _force_ a certain cleanliness to the
interfaces - it would force the rest to be filesystem-agnostic, rather
than possibly have semantic hacks for some part.

            Linus

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 16:58         ` Al Viro
@ 2018-07-12 17:54           ` Andy Lutomirski
  0 siblings, 0 replies; 75+ messages in thread
From: Andy Lutomirski @ 2018-07-12 17:54 UTC (permalink / raw)
  To: Al Viro
  Cc: David Howells, Andy Lutomirski, Linux API, Linux FS Devel,
	Linus Torvalds, LKML, Jann Horn, tycho



> On Jul 12, 2018, at 9:58 AM, Al Viro <viro@ZenIV.linux.org.uk> wrote:
> 
>> On Thu, Jul 12, 2018 at 09:23:22AM -0700, Andy Lutomirski wrote:
>> 
>> As a straw man, I suggest:
>> 
>> fsconfigure(contextfd, ADD_BLOCKDEV, dfd, path, flags);
>> 
>> fsconfigure(contextfd, ADD_OPTION, 0, “foo=bar”, flags);
> 
> Bollocks.  First of all, block device *IS* a fucking option.
> Always had been.  It's not even that it's passed as a separate
> argument for historical reasons - just look at NFS.  That argument
> is a detached part of options, parsed (yes, *parsed*) by filesystem
> in question in whatever way it prefers.

Fine, then generalize it. fsconfigure(context, ADD_FD, “some fs-specific string explaining what’s going on”, fd);  The point being that there are tons of cases where the filesystem wants to identify some backing store by some device node, and it seems like we should support something along the lines of a modern *at interface.

If I’m writing a daemon that deals with filesystems, I don’t want an API that looks like do_god_knows_what(context, “filesystem specific string that may contain a path to a device node or a network address”). That API will be a pain to use, since that opaque string may come from some random config file and I have no clue what it does. If I want to pass a device node or other object to a filesystem, I want to pass an fd (so I can use openat, SCM_CREDS, etc), and I want it to be crystal clear that I’m passing some object in. And if I tell a filesystem to access the network, I want it to be entirely clear which network namespace is in use.

I realize that doing this right is tricky when there are lots of legacy filesystems that parse opaque strings. That’s fine. We can convert things slowly.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 17:20     ` Al Viro
@ 2018-07-12 18:03       ` Greg KH
  2018-07-12 18:30         ` Andy Lutomirski
  0 siblings, 1 reply; 75+ messages in thread
From: Greg KH @ 2018-07-12 18:03 UTC (permalink / raw)
  To: Al Viro; +Cc: David Howells, linux-api, linux-fsdevel, torvalds, linux-kernel

On Thu, Jul 12, 2018 at 06:20:24PM +0100, Al Viro wrote:
> On Thu, Jul 12, 2018 at 07:15:05PM +0200, Greg KH wrote:
> > On Tue, Jul 10, 2018 at 11:44:09PM +0100, David Howells wrote:
> > > Provide an fsopen() system call that starts the process of preparing to
> > > create a superblock that will then be mountable, using an fd as a context
> > > handle.  fsopen() is given the name of the filesystem that will be used:
> > > 
> > > 	int mfd = fsopen(const char *fsname, unsigned int flags);
> > > 
> > > where flags can be 0 or FSOPEN_CLOEXEC.
> > > 
> > > For example:
> > > 
> > > 	sfd = fsopen("ext4", FSOPEN_CLOEXEC);
> > > 	write(sfd, "s /dev/sdb1"); // note I'm ignoring write's length arg
> > > 	write(sfd, "o noatime");
> > > 	write(sfd, "o acl");
> > > 	write(sfd, "o user_attr");
> > > 	write(sfd, "o iversion");
> > > 	write(sfd, "o ");
> > > 	write(sfd, "r /my/container"); // root inside the fs
> > > 	write(sfd, "x create"); // create the superblock
> > 
> > Ugh, creating configfs again in a syscall form?  I know people love
> > file descriptors, but can't you do this with a configfs entry instead if
> > you really want to do this type of thing from userspace in this type of
> > "style"?
> > 
> > Why reinvent the wheel again?
> 
> The damn thing REALLY, REALLY depends upon the fs type.  How would
> you map it on configfs?

/sys/kernel/config/fs/ext4/ would work, right?  Each fs "type" would be
listed there.

Anyway, the whole "write a bunch of options and then do a 'create'" is
exactly the way configfs works.  Why not use that?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 18:03       ` Greg KH
@ 2018-07-12 18:30         ` Andy Lutomirski
  2018-07-12 18:34           ` Al Viro
  2018-07-12 19:08           ` Greg KH
  0 siblings, 2 replies; 75+ messages in thread
From: Andy Lutomirski @ 2018-07-12 18:30 UTC (permalink / raw)
  To: Greg KH
  Cc: Al Viro, David Howells, linux-api, linux-fsdevel, torvalds, linux-kernel


> On Jul 12, 2018, at 11:03 AM, Greg KH <gregkh@linuxfoundation.org> wrote:
> 
>> On Thu, Jul 12, 2018 at 06:20:24PM +0100, Al Viro wrote:
>>> On Thu, Jul 12, 2018 at 07:15:05PM +0200, Greg KH wrote:
>>>> On Tue, Jul 10, 2018 at 11:44:09PM +0100, David Howells wrote:
>>>> Provide an fsopen() system call that starts the process of preparing to
>>>> create a superblock that will then be mountable, using an fd as a context
>>>> handle.  fsopen() is given the name of the filesystem that will be used:
>>>> 
>>>>    int mfd = fsopen(const char *fsname, unsigned int flags);
>>>> 
>>>> where flags can be 0 or FSOPEN_CLOEXEC.
>>>> 
>>>> For example:
>>>> 
>>>>    sfd = fsopen("ext4", FSOPEN_CLOEXEC);
>>>>    write(sfd, "s /dev/sdb1"); // note I'm ignoring write's length arg
>>>>    write(sfd, "o noatime");
>>>>    write(sfd, "o acl");
>>>>    write(sfd, "o user_attr");
>>>>    write(sfd, "o iversion");
>>>>    write(sfd, "o ");
>>>>    write(sfd, "r /my/container"); // root inside the fs
>>>>    write(sfd, "x create"); // create the superblock
>>> 
>>> Ugh, creating configfs again in a syscall form?  I know people love
>>> file descriptors, but can't you do this with a configfs entry instead if
>>> you really want to do this type of thing from userspace in this type of
>>> "style"?
>>> 
>>> Why reinvent the wheel again?
>> 
>> The damn thing REALLY, REALLY depends upon the fs type.  How would
>> you map it on configfs?
> 
> /sys/kernel/config/fs/ext4/ would work, right?  Each fs "type" would be
> listed there.
> 
> Anyway, the whole "write a bunch of options and then do a 'create'" is
> exactly the way configfs works.  Why not use that?
> 
> 

How do you mount configfs in the first place?  And how do you use this in a mount namespace without a private configfs instance or where you don’t want configfs mounted?

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 18:30         ` Andy Lutomirski
@ 2018-07-12 18:34           ` Al Viro
  2018-07-12 18:35             ` Al Viro
  2018-07-12 19:08           ` Greg KH
  1 sibling, 1 reply; 75+ messages in thread
From: Al Viro @ 2018-07-12 18:34 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Greg KH, David Howells, linux-api, linux-fsdevel, torvalds, linux-kernel

On Thu, Jul 12, 2018 at 11:30:32AM -0700, Andy Lutomirski wrote:

Andi, Greg - alt.tasteless is over -> that way.

And for fsck sake, fix your MUA.  Lines are obscenely long...

> How do you mount configfs in the first place?  And how do you use this in a mount namespace without a private configfs instance or where you don’t want configfs mounted?

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 18:34           ` Al Viro
@ 2018-07-12 18:35             ` Al Viro
  0 siblings, 0 replies; 75+ messages in thread
From: Al Viro @ 2018-07-12 18:35 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Greg KH, David Howells, linux-api, linux-fsdevel, torvalds, linux-kernel

On Thu, Jul 12, 2018 at 07:34:26PM +0100, Al Viro wrote:
> On Thu, Jul 12, 2018 at 11:30:32AM -0700, Andy Lutomirski wrote:
> 
> Andi,

Apologies for misspelling - finger macros strike ;-/

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 18:30         ` Andy Lutomirski
  2018-07-12 18:34           ` Al Viro
@ 2018-07-12 19:08           ` Greg KH
  1 sibling, 0 replies; 75+ messages in thread
From: Greg KH @ 2018-07-12 19:08 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Al Viro, David Howells, linux-api, linux-fsdevel, torvalds, linux-kernel

On Thu, Jul 12, 2018 at 11:30:32AM -0700, Andy Lutomirski wrote:
> 
> > On Jul 12, 2018, at 11:03 AM, Greg KH <gregkh@linuxfoundation.org> wrote:
> > 
> >> On Thu, Jul 12, 2018 at 06:20:24PM +0100, Al Viro wrote:
> >>> On Thu, Jul 12, 2018 at 07:15:05PM +0200, Greg KH wrote:
> >>>> On Tue, Jul 10, 2018 at 11:44:09PM +0100, David Howells wrote:
> >>>> Provide an fsopen() system call that starts the process of preparing to
> >>>> create a superblock that will then be mountable, using an fd as a context
> >>>> handle.  fsopen() is given the name of the filesystem that will be used:
> >>>> 
> >>>>    int mfd = fsopen(const char *fsname, unsigned int flags);
> >>>> 
> >>>> where flags can be 0 or FSOPEN_CLOEXEC.
> >>>> 
> >>>> For example:
> >>>> 
> >>>>    sfd = fsopen("ext4", FSOPEN_CLOEXEC);
> >>>>    write(sfd, "s /dev/sdb1"); // note I'm ignoring write's length arg
> >>>>    write(sfd, "o noatime");
> >>>>    write(sfd, "o acl");
> >>>>    write(sfd, "o user_attr");
> >>>>    write(sfd, "o iversion");
> >>>>    write(sfd, "o ");
> >>>>    write(sfd, "r /my/container"); // root inside the fs
> >>>>    write(sfd, "x create"); // create the superblock
> >>> 
> >>> Ugh, creating configfs again in a syscall form?  I know people love
> >>> file descriptors, but can't you do this with a configfs entry instead if
> >>> you really want to do this type of thing from userspace in this type of
> >>> "style"?
> >>> 
> >>> Why reinvent the wheel again?
> >> 
> >> The damn thing REALLY, REALLY depends upon the fs type.  How would
> >> you map it on configfs?
> > 
> > /sys/kernel/config/fs/ext4/ would work, right?  Each fs "type" would be
> > listed there.
> > 
> > Anyway, the whole "write a bunch of options and then do a 'create'" is
> > exactly the way configfs works.  Why not use that?
> > 
> > 
> 
> How do you mount configfs in the first place?  And how do you use this
> in a mount namespace without a private configfs instance or where you
> don’t want configfs mounted?--

Ok, fair enough, I missed the part where this is going to replace
mount(2).  Although you could just use mount(2) to mount configfs on a
mount point in the initramfs image and then go from there at boot time :)

/me runs away...

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 14:54     ` David Howells
  2018-07-12 15:50       ` Linus Torvalds
  2018-07-12 16:23       ` Andy Lutomirski
@ 2018-07-12 20:23       ` David Howells
  2018-07-12 20:25         ` Andy Lutomirski
                           ` (2 more replies)
  2018-07-12 21:00       ` David Howells
  3 siblings, 3 replies; 75+ messages in thread
From: David Howells @ 2018-07-12 20:23 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: dhowells, Andrew Lutomirski, Al Viro, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

Linus Torvalds <torvalds@linux-foundation.org> wrote:

> Don't play games with override_creds. It's wrong.
> 
> You have to use file->f_creds - no games, no garbage.

You missed the point.

It's all very well to say "use file->f_creds".  The problem is this has to be
handed down all the way through the filesystem and down into the block layer
as appropriate to anywhere there's an LSM call, a CAP_* check or a pathwalk -
but there's not currently any way to do that.

mount_bdev() and blkdev_get_by_path() are examples of this.  At the moment
there is no cred parameter there.  We'd also have to pass the creds down into
path_init() to store in struct nameidata and make sure that every permissions
call that might be invoked during pathwalk in every filesystem uses that, not
current_cred().

I made an attempt to do this a while ago and the patch got rather large before
I gave up.  In many ways, it's what we *should* do, but so many things need an
extra parameter...  If you really want, I can try that again.  It's possible I
can automate it with some perl scripting to parse the error messages from the
compiler.

My suggestion was to use override_creds() to impose the appropriate creds at
the top, be that file->f_creds or fs_context->creds (they would be the same in
any case).

If we want to go down the pass-the-creds-down route, then we can temporarily
do override_creds() until we've made the changes and then remove it later.

> But "write()" simply is *NOT* a good "command" interface. If you want
> to send a command, use an ioctl or a system call.

Okay.

> Because it's not just about credentials. It's not just about fooling a
> suid app into writing an error message to a descriptor you wrote. It's
> also about things like "splice()", which can write to your target
> using a kernel buffer, and thus trick you into doing a command while
> we have the context set to kernel addresses.
> 
> Are we trying to get away from that issue? Yes. But it's just another
> example of why "write()" IS NOT TO BE USED FOR COMMANDS.

Btw, do we protect sysfs, debugfs, tracefs, procfs, etc. writes against
splice?  Some of the things in debugfs are really icky, allowing you to muck
directly with hardware.

David

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 20:23       ` David Howells
@ 2018-07-12 20:25         ` Andy Lutomirski
  2018-07-12 20:34         ` Linus Torvalds
  2018-07-12 21:26         ` David Howells
  2 siblings, 0 replies; 75+ messages in thread
From: Andy Lutomirski @ 2018-07-12 20:25 UTC (permalink / raw)
  To: David Howells
  Cc: Linus Torvalds, Andrew Lutomirski, Al Viro, Linux API,
	linux-fsdevel, Linux Kernel Mailing List, Jann Horn



> On Jul 12, 2018, at 1:23 PM, David Howells <dhowells@redhat.com> wrote:
> 
> Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
>> Don't play games with override_creds. It's wrong.
>> 
>> You have to use file->f_creds - no games, no garbage.
> 
> You missed the point.
> 

> 
> My suggestion was to use override_creds() to impose the appropriate creds at
> the top, be that file->f_creds or fs_context->creds (they would be the same in
> any case).

I think it should be a new syscall and use current’s creds. No override needed.


> Btw, do we protect sysfs, debugfs, tracefs, procfs, etc. writes against
> splice?  Some of the things in debugfs are really icky, allowing you to muck
> directly with hardware.
> 

We try. It has been a perennial source of severe bugs.

This is part of why I’d like to see splice() be an opt in. Also, it’s a major step toward getting rid of set_fs().

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 20:23       ` David Howells
  2018-07-12 20:25         ` Andy Lutomirski
@ 2018-07-12 20:34         ` Linus Torvalds
  2018-07-12 20:36           ` Linus Torvalds
  2018-07-12 21:26         ` David Howells
  2 siblings, 1 reply; 75+ messages in thread
From: Linus Torvalds @ 2018-07-12 20:34 UTC (permalink / raw)
  To: David Howells
  Cc: Andrew Lutomirski, Al Viro, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

On Thu, Jul 12, 2018 at 1:23 PM David Howells <dhowells@redhat.com> wrote:
>
> It's all very well to say "use file->f_creds".  The problem is this has to be
> handed down all the way through the filesystem and down into the block layer
> as appropriate to anywhere there's an LSM call, a CAP_* check or a pathwalk -
> but there's not currently any way to do that.

.. and the reason is simple: you damn well shouldn't do that.

The unix semantics are that credentials are checked at open time.

If your interface involves checking credentials at write() time, your
interface is garbage shit.

Really.

This is the whole "write() is only for data". If you ever have
credentials mattering at write time, you're doing something wrong.

Really really.

Don't do it.

             Linus

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 20:34         ` Linus Torvalds
@ 2018-07-12 20:36           ` Linus Torvalds
  0 siblings, 0 replies; 75+ messages in thread
From: Linus Torvalds @ 2018-07-12 20:36 UTC (permalink / raw)
  To: David Howells
  Cc: Andrew Lutomirski, Al Viro, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

On Thu, Jul 12, 2018 at 1:34 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> This is the whole "write() is only for data". If you ever have
> credentials mattering at write time, you're doing something wrong.
>
> Really really.
>
> Don't do it.

.. and I'd like to repeat: we *have* done things wrong. But that's
simply not an excuse. We've done it wrong in SCSI, we've done it wrong
in various /proc files, we've done it wrong in many places.

But let's not do it wrong AGAIN.

                Linus

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 14:54     ` David Howells
                         ` (2 preceding siblings ...)
  2018-07-12 20:23       ` David Howells
@ 2018-07-12 21:00       ` David Howells
  2018-07-12 21:29         ` Linus Torvalds
  2018-07-13 13:27         ` David Howells
  3 siblings, 2 replies; 75+ messages in thread
From: David Howells @ 2018-07-12 21:00 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: dhowells, Andy Lutomirski, Al Viro, Linux API, Linux FS Devel,
	Linus Torvalds, LKML, Jann Horn, tycho

Andy Lutomirski <luto@amacapital.net> wrote:

> fsconfigure(contextfd, ADD_BLOCKDEV, dfd, path, flags);
> 
> fsconfigure(contextfd, ADD_OPTION, 0, “foo=bar”, flags);

That seems okayish.  I'm not sure we need the flags, but I do want to allow
for binary data in an option.  So perhaps something like:

	int fsconfig(int fd, unsigned int type,
		     const char *key, const void *val, size_t val_len);

for example:

	fd = fsopen("ext4", FSOPEN_CLOEXEC);
	fsconfig(fd, fsconfig_blockdev, "dev.data", "/dev/sda1", ...);
	fsconfig(fd, fsconfig_blockdev, "dev.journal", "/dev/sda2", ...);
	fsconfig(fd, fsconfig_option, "user_xattr", NULL, ...);
	fsconfig(fd, fsconfig_option, "errors", "continue", ...);
	fsconfig(fd, fsconfig_option, "data", "journal", ...);
	fsconfig(fd, fsconfig_security, "selinux.context", "unconfined_u:...");
	fsconfig(fd, fsconfig_create, NULL, NULL, 0);
	mfd = fsmount(fd, FSMOUNT_CLOEXEC, MS_NOEXEC);

or:

	fd = fsopen("nfs", FSOPEN_CLOEXEC);
	fsconfig(fd, fsconfig_namespace, "user", "<usernsfd>", ...);
	fsconfig(fd, fsconfig_namespace, "net", "<netnsfd>", ...);
	fsconfig(fd, fsconfig_option, "server", "foo.com", ...);
	fsconfig(fd, fsconfig_option, "root", "/bar", ...);
	fsconfig(fd, fsconfig_option, "soft", NULL, ...);
	fsconfig(fd, fsconfig_option, "retry", "3", ...);
	fsconfig(fd, fsconfig_option, "wsize", "4096", ...);
	fsconfig(fd, fsconfig_uidmap, "dhowells", "1234", ...);
	fsconfig(fd, fsconfig_security, "selinux.context", "unconfined_u:...");
	fsconfig(fd, fsconfig_create, NULL, NULL, 0);
	mfd = fsmount(fd, FSMOUNT_CLOEXEC, MS_NOEXEC);

This does mean that userspace has to work harder, though, but it would
simplify the LSM interface internally.

Al Viro <viro@ftp.linux.org.uk>

> First of all, block device *IS* a fucking option.

Whilst that is true, I still need to be able to separate it out for
unconverted filesystems.

David

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 20:23       ` David Howells
  2018-07-12 20:25         ` Andy Lutomirski
  2018-07-12 20:34         ` Linus Torvalds
@ 2018-07-12 21:26         ` David Howells
  2018-07-12 21:40           ` Linus Torvalds
                             ` (2 more replies)
  2 siblings, 3 replies; 75+ messages in thread
From: David Howells @ 2018-07-12 21:26 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: dhowells, Andrew Lutomirski, Al Viro, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

Linus Torvalds <torvalds@linux-foundation.org> wrote:

> The unix semantics are that credentials are checked at open time.

Sigh.

The problem is that there's more than one actual "open" involved.

	fd = fsopen("ext4");				<--- #1
	whatever_interface(fd, "s /dev/sda1");
	whatever_interface(fd, "o journal_path=/dev/sda2");
	do_the_create_thing(fd);			<--- #2 and #3

The initial check to see whether you can mount or not is done at #1.

But later there are two nested file opens.  Internally, deep down inside the
block layer, /dev/sda1 and /dev/sda2 are opened and further permissions checks
are done, whether you like it or not.  But these have no access to the creds
attached to fd as things currently stand.

So we have three choices:

 (1) Pass the creds from ->get_tree() all the way down into pathwalk and make
     sure *every* check that pathwalk does uses it.

 (2) When do_the_create_thing() is invoked, it wraps the call to ->get_tree()
     with override_creds(file->f_cred).

 (3) Forget using an fd to refer to the context.  fsopen() takes absolutely
     everything, perhaps as a kv array and spits out an O_PATH fd.  You don't
     get improved error reporting, you don't get a chance for interaction -
     say with the server, to construct an ID mapping table - and you don't get
     the chance to query the superblock before creating a mount.

     So, something like:

	struct fsopen_param {
		unsigned int type,
		const char *key;
		const void *val;
		unsigned int val_len;
	};

	mfd = fsopen(const char *fs_type,
		     unsigned int flags, /* CLOEXEC */
		     const struct fsopen_param *params,
		     unsigned int param_count,
		     unsigned int ms_flags /* eg. MNT_NOEXEC */);

     For example:

	struct fsopen_param params[] = {
		{ fsopen_source, "dev.fs", "/dev/sda1" }
		{ fsopen_source, "dev.journal", "/dev/sda2" }
		{ fsopen_option, "user_xattr" }
		{ fsopen_option, "data", "journal" }
		{ fsopen_option, "jqfmt", "vfsv1" }
		{ fsopen_security, "selinux.context", "unconfined_u..." }
	};

	mfd = fsopen("ext4", FSOPEN_CLOEXEC, params, ARRAY_SIZE(params),
		     MNT_NOEXEC);

     There would need to be an fsreconfig() also in a similar vein.

David

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 21:00       ` David Howells
@ 2018-07-12 21:29         ` Linus Torvalds
  2018-07-13 13:27         ` David Howells
  1 sibling, 0 replies; 75+ messages in thread
From: Linus Torvalds @ 2018-07-12 21:29 UTC (permalink / raw)
  To: David Howells
  Cc: Andy Lutomirski, Andrew Lutomirski, Al Viro, Linux API,
	linux-fsdevel, Linux Kernel Mailing List, Jann Horn,
	Tycho Andersen

On Thu, Jul 12, 2018 at 2:00 PM David Howells <dhowells@redhat.com> wrote:
>
>
> for example:
>
>         fd = fsopen("ext4", FSOPEN_CLOEXEC);
>         fsconfig(fd, fsconfig_blockdev, "dev.data", "/dev/sda1", ...);
>         fsconfig(fd, fsconfig_blockdev, "dev.journal", "/dev/sda2", ...);

Ok, that looks good to me. It also avoids the parsing issue with using
an interface like "write()", where the expectation is that you can
append things etc.

              Linus

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 21:26         ` David Howells
@ 2018-07-12 21:40           ` Linus Torvalds
  2018-07-12 22:32           ` Theodore Y. Ts'o
  2018-07-12 22:54           ` David Howells
  2 siblings, 0 replies; 75+ messages in thread
From: Linus Torvalds @ 2018-07-12 21:40 UTC (permalink / raw)
  To: David Howells
  Cc: Andrew Lutomirski, Al Viro, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

On Thu, Jul 12, 2018 at 2:26 PM David Howells <dhowells@redhat.com> wrote:
>
> The problem is that there's more than one actual "open" involved.

No. The problem is "write()".

This is not about open, about fsopen, or about anything at all.

This is about the fact that "write()" by definition can happen in a
different - and unexpected - context. Whether that be due to suid or
due to splice, or due to any other random issue is entirely
immaterial.

(The same is true of "read()" too, but very few people try to make
"read()" have side effects, so it's less of an issue. It does happen,
though).

But once you have another interface than "read/write()", the issues go
away. Those other interfaces are synchronous, and now you can decide
"ok, I'll just use current creds".

>  (1) Pass the creds from ->get_tree() all the way down into pathwalk and make
>      sure *every* check that pathwalk does uses it.

No. See above.

If your write() does anything but buffering data, it's not getting merged.

>  (2) When do_the_create_thing() is invoked, it wraps the call to ->get_tree()
>      with override_creds(file->f_cred).

No.

We do not wrap creds in any case. It's just asking for *another* kind
of security issue, where you fool some higher-security thing into
giving you access because it wrapped the higher-security case instead.

>  (3) Forget using an fd to refer to the context.  fsopen() takes absolutely
>      everything, perhaps as a kv array and spits out an O_PATH fd.

That works.

Or you know - do what I told you to do ALL THE TIME, which was to not
use write(), or to only buffer things with write().

But yes, any option that simply avoids read and write is fine.

You can even have a file descriptor. We already have file descriptors
that cannot be read from or written to. It's quite common for special
devices, the whole "open /dev/floppy with O_NONBLOCK only to be able
to do control operations with it" goes back to pretty much day #1.

More recently, we have the whole "FMODE_PATH" kind of file descriptor,
which works as a directory entry, but not for read and write.

So file descriptors can have very useful properties.

But no. We do not use "write()" to implement actions. If you think you
need to check permissions and think you need a "cred", then you're not
using write(). It really is that simple.

Not using write just avouds *all* the problems. If you can fool a suid
application to do arbitrary system calls for you, then it's not the
system call that is the security problem.

                Linus

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 21:26         ` David Howells
  2018-07-12 21:40           ` Linus Torvalds
@ 2018-07-12 22:32           ` Theodore Y. Ts'o
  2018-07-12 22:54           ` David Howells
  2 siblings, 0 replies; 75+ messages in thread
From: Theodore Y. Ts'o @ 2018-07-12 22:32 UTC (permalink / raw)
  To: David Howells
  Cc: Linus Torvalds, Andrew Lutomirski, Al Viro, Linux API,
	linux-fsdevel, Linux Kernel Mailing List, Jann Horn

On Thu, Jul 12, 2018 at 10:26:37PM +0100, David Howells wrote:
> The problem is that there's more than one actual "open" involved.
> 
> 	fd = fsopen("ext4");				<--- #1
> 	whatever_interface(fd, "s /dev/sda1");
> 	whatever_interface(fd, "o journal_path=/dev/sda2");
> 	do_the_create_thing(fd);			<--- #2 and #3
> 
> The initial check to see whether you can mount or not is done at #1.
> 
> But later there are two nested file opens.  Internally, deep down inside the
> block layer, /dev/sda1 and /dev/sda2 are opened and further permissions checks
> are done, whether you like it or not.  But these have no access to the creds
> attached to fd as things currently stand.

So maybe the answer is that you open /dev/sda1 and /dev/sda2 and then
pass the file descriptors to the fsopen object?  We can require that
the fd's be opened with O_RDWR and O_EXCL, which has the benefit where
if you have multiple block devices, you know *which* block device had
a problem with being grabbed for an exclusive open.

Just a thought.

						- Ted

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 21:26         ` David Howells
  2018-07-12 21:40           ` Linus Torvalds
  2018-07-12 22:32           ` Theodore Y. Ts'o
@ 2018-07-12 22:54           ` David Howells
  2018-07-12 23:21             ` Andy Lutomirski
                               ` (4 more replies)
  2 siblings, 5 replies; 75+ messages in thread
From: David Howells @ 2018-07-12 22:54 UTC (permalink / raw)
  To: Theodore Y. Ts'o
  Cc: dhowells, Linus Torvalds, Andrew Lutomirski, Al Viro, Linux API,
	linux-fsdevel, Linux Kernel Mailing List, Jann Horn

Theodore Y. Ts'o <tytso@mit.edu> wrote:

> So maybe the answer is that you open /dev/sda1 and /dev/sda2 and then
> pass the file descriptors to the fsopen object?  We can require that
> the fd's be opened with O_RDWR and O_EXCL, which has the benefit where
> if you have multiple block devices, you know *which* block device had
> a problem with being grabbed for an exclusive open.

Would that mean then that doing:

	mount /dev/sda3 /a
	mount /dev/sda3 /b

would then fail on the second command because /dev/sda3 is already open
exclusively?

David

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 22:54           ` David Howells
@ 2018-07-12 23:21             ` Andy Lutomirski
  2018-07-12 23:23             ` Jann Horn
                               ` (3 subsequent siblings)
  4 siblings, 0 replies; 75+ messages in thread
From: Andy Lutomirski @ 2018-07-12 23:21 UTC (permalink / raw)
  To: David Howells
  Cc: Theodore Y. Ts'o, Linus Torvalds, Andrew Lutomirski, Al Viro,
	Linux API, linux-fsdevel, Linux Kernel Mailing List, Jann Horn



> On Jul 12, 2018, at 3:54 PM, David Howells <dhowells@redhat.com> wrote:
> 
> Theodore Y. Ts'o <tytso@mit.edu> wrote:
> 
>> So maybe the answer is that you open /dev/sda1 and /dev/sda2 and then
>> pass the file descriptors to the fsopen object?  We can require that
>> the fd's be opened with O_RDWR and O_EXCL, which has the benefit where
>> if you have multiple block devices, you know *which* block device had
>> a problem with being grabbed for an exclusive open.
> 
> Would that mean then that doing:
> 
>    mount /dev/sda3 /a
>    mount /dev/sda3 /b
> 
> would then fail on the second command because /dev/sda3 is already open
> exclusively?
> 

I tend to think that this *should* fail using the new API.  The semantics of the second mount request are bizarre at best.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 22:54           ` David Howells
  2018-07-12 23:21             ` Andy Lutomirski
@ 2018-07-12 23:23             ` Jann Horn
  2018-07-12 23:33               ` Jann Horn
  2018-07-12 23:35             ` David Howells
                               ` (2 subsequent siblings)
  4 siblings, 1 reply; 75+ messages in thread
From: Jann Horn @ 2018-07-12 23:23 UTC (permalink / raw)
  To: David Howells
  Cc: Theodore Y. Ts'o, Linus Torvalds, Andy Lutomirski, Al Viro,
	Linux API, linux-fsdevel, kernel list

On Thu, Jul 12, 2018 at 3:54 PM David Howells <dhowells@redhat.com> wrote:
>
> Theodore Y. Ts'o <tytso@mit.edu> wrote:
>
> > So maybe the answer is that you open /dev/sda1 and /dev/sda2 and then
> > pass the file descriptors to the fsopen object?  We can require that
> > the fd's be opened with O_RDWR and O_EXCL, which has the benefit where
> > if you have multiple block devices, you know *which* block device had
> > a problem with being grabbed for an exclusive open.
>
> Would that mean then that doing:
>
>         mount /dev/sda3 /a
>         mount /dev/sda3 /b
>
> would then fail on the second command because /dev/sda3 is already open
> exclusively?

Not exactly. mount_bdev() uses FMODE_EXCL, which locks out parallel
usage *with a different filesystem type*. This is the effect:

# strace -e trace=mount mount -t vfat /dev/loop0 mount
mount("/dev/loop0", "/home/jannh/tmp/x/mount", "vfat", MS_MGC_VAL, NULL) = 0
+++ exited with 0 +++
# strace -e trace=mount mount -t ext4 /dev/loop0 mount
mount("/dev/loop0", "/home/jannh/tmp/x/mount", "ext4", MS_MGC_VAL,
NULL) = -1 EBUSY (Device or resource busy)
mount: /home/jannh/tmp/x/mount: /dev/loop0 already mounted on
/home/jannh/tmp/x/mount.
+++ exited with 32 +++

I don't really understand why it's not more strict though...

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 23:23             ` Jann Horn
@ 2018-07-12 23:33               ` Jann Horn
  0 siblings, 0 replies; 75+ messages in thread
From: Jann Horn @ 2018-07-12 23:33 UTC (permalink / raw)
  To: David Howells
  Cc: Theodore Y. Ts'o, Linus Torvalds, Andy Lutomirski, Al Viro,
	Linux API, linux-fsdevel, kernel list

On Thu, Jul 12, 2018 at 4:23 PM Jann Horn <jannh@google.com> wrote:
>
> On Thu, Jul 12, 2018 at 3:54 PM David Howells <dhowells@redhat.com> wrote:
> >
> > Theodore Y. Ts'o <tytso@mit.edu> wrote:
> >
> > > So maybe the answer is that you open /dev/sda1 and /dev/sda2 and then
> > > pass the file descriptors to the fsopen object?  We can require that
> > > the fd's be opened with O_RDWR and O_EXCL, which has the benefit where
> > > if you have multiple block devices, you know *which* block device had
> > > a problem with being grabbed for an exclusive open.
> >
> > Would that mean then that doing:
> >
> >         mount /dev/sda3 /a
> >         mount /dev/sda3 /b
> >
> > would then fail on the second command because /dev/sda3 is already open
> > exclusively?
>
> Not exactly. mount_bdev() uses FMODE_EXCL, which locks out parallel
> usage *with a different filesystem type*. This is the effect:
>
> # strace -e trace=mount mount -t vfat /dev/loop0 mount
> mount("/dev/loop0", "/home/jannh/tmp/x/mount", "vfat", MS_MGC_VAL, NULL) = 0
> +++ exited with 0 +++
> # strace -e trace=mount mount -t ext4 /dev/loop0 mount
> mount("/dev/loop0", "/home/jannh/tmp/x/mount", "ext4", MS_MGC_VAL,
> NULL) = -1 EBUSY (Device or resource busy)
> mount: /home/jannh/tmp/x/mount: /dev/loop0 already mounted on
> /home/jannh/tmp/x/mount.
> +++ exited with 32 +++
>
> I don't really understand why it's not more strict though...

Er, sorry, of course that's the current behavior, not the behavior of
the suggested API.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 22:54           ` David Howells
  2018-07-12 23:21             ` Andy Lutomirski
  2018-07-12 23:23             ` Jann Horn
@ 2018-07-12 23:35             ` David Howells
  2018-07-12 23:50               ` Andy Lutomirski
       [not found]             ` <23894.1531438559@warthog.procyon.o rg.uk>
  2018-07-13  2:35             ` Theodore Y. Ts'o
  4 siblings, 1 reply; 75+ messages in thread
From: David Howells @ 2018-07-12 23:35 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: dhowells, Theodore Y. Ts'o, Linus Torvalds,
	Andrew Lutomirski, Al Viro, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

Andy Lutomirski <luto@amacapital.net> wrote:

> I tend to think that this *should* fail using the new API.  The semantics of
> the second mount request are bizarre at best.

You still have to support existing behaviour lest you break userspace.

David

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 23:35             ` David Howells
@ 2018-07-12 23:50               ` Andy Lutomirski
  0 siblings, 0 replies; 75+ messages in thread
From: Andy Lutomirski @ 2018-07-12 23:50 UTC (permalink / raw)
  To: David Howells
  Cc: Theodore Y. Ts'o, Linus Torvalds, Andrew Lutomirski, Al Viro,
	Linux API, linux-fsdevel, Linux Kernel Mailing List, Jann Horn



> On Jul 12, 2018, at 4:35 PM, David Howells <dhowells@redhat.com> wrote:
> 
> Andy Lutomirski <luto@amacapital.net> wrote:
> 
>> I tend to think that this *should* fail using the new API.  The semantics of
>> the second mount request are bizarre at best.
> 
> You still have to support existing behaviour lest you break userspace.
> 

I assume the existing behavior is that a bind mount is created?  If so, the new mount(8) tool could do it in user code.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
       [not found]             ` <23894.1531438559@warthog.procyon.o rg.uk>
@ 2018-07-13  0:03               ` David Howells
  2018-07-13  0:24                 ` Andy Lutomirski
  2018-07-13  7:30                 ` David Howells
  0 siblings, 2 replies; 75+ messages in thread
From: David Howells @ 2018-07-13  0:03 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: dhowells, Theodore Y. Ts'o, Linus Torvalds,
	Andrew Lutomirski, Al Viro, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

Andy Lutomirski <luto@amacapital.net> wrote:

> >> I tend to think that this *should* fail using the new API.  The semantics
> >> of the second mount request are bizarre at best.
> > 
> > You still have to support existing behaviour lest you break userspace.
> > 
> 
> I assume the existing behavior is that a bind mount is created?  If so, the
> new mount(8) tool could do it in user code.

You have a race there.

Also you can't currently directly create a bind mount from userspace as you
can only bind from another path point - which you may not be able to access
(either by permission failure or because it's not in your mount namespace).

David

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-13  0:03               ` David Howells
@ 2018-07-13  0:24                 ` Andy Lutomirski
  2018-07-13  7:30                 ` David Howells
  1 sibling, 0 replies; 75+ messages in thread
From: Andy Lutomirski @ 2018-07-13  0:24 UTC (permalink / raw)
  To: David Howells
  Cc: Theodore Y. Ts'o, Linus Torvalds, Andrew Lutomirski, Al Viro,
	Linux API, linux-fsdevel, Linux Kernel Mailing List, Jann Horn



> On Jul 12, 2018, at 5:03 PM, David Howells <dhowells@redhat.com> wrote:
> 
> Andy Lutomirski <luto@amacapital.net> wrote:
> 
>>>> I tend to think that this *should* fail using the new API.  The semantics
>>>> of the second mount request are bizarre at best.
>>> 
>>> You still have to support existing behaviour lest you break userspace.
>>> 
>> 
>> I assume the existing behavior is that a bind mount is created?  If so, the
>> new mount(8) tool could do it in user code.
> 
> You have a race there.
> 
> Also you can't currently directly create a bind mount from userspace as you
> can only bind from another path point - which you may not be able to access
> (either by permission failure or because it's not in your mount namespace).
> 

Are you trying to preserve the magic bind semantics with the new API?  If you are, I think it should be by explicit opt in only. Otherwise you risk having your shiny new way to specify fs options get ignored when the magic bind mount happens. 

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 22:54           ` David Howells
                               ` (3 preceding siblings ...)
       [not found]             ` <23894.1531438559@warthog.procyon.o rg.uk>
@ 2018-07-13  2:35             ` Theodore Y. Ts'o
  4 siblings, 0 replies; 75+ messages in thread
From: Theodore Y. Ts'o @ 2018-07-13  2:35 UTC (permalink / raw)
  To: David Howells
  Cc: Linus Torvalds, Andrew Lutomirski, Al Viro, Linux API,
	linux-fsdevel, Linux Kernel Mailing List, Jann Horn

On Thu, Jul 12, 2018 at 11:54:41PM +0100, David Howells wrote:
> 
> Would that mean then that doing:
> 
> 	mount /dev/sda3 /a
> 	mount /dev/sda3 /b
> 
> would then fail on the second command because /dev/sda3 is already open
> exclusively?

Good point.  One workaround would be to require an open with O_PATH instead.

     	     	 	    	     	- Ted

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-13  0:03               ` David Howells
  2018-07-13  0:24                 ` Andy Lutomirski
@ 2018-07-13  7:30                 ` David Howells
  2018-07-19  1:30                   ` Eric W. Biederman
  1 sibling, 1 reply; 75+ messages in thread
From: David Howells @ 2018-07-13  7:30 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: dhowells, Theodore Y. Ts'o, Linus Torvalds,
	Andrew Lutomirski, Al Viro, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

Andy Lutomirski <luto@amacapital.net> wrote:

> > Also you can't currently directly create a bind mount from userspace as you
> > can only bind from another path point - which you may not be able to access
> > (either by permission failure or because it's not in your mount namespace).
> > 
> 
> Are you trying to preserve the magic bind semantics with the new API?

No, I'm pointing out that you can't emulate this by doing a bind mount from
userspace if you can't access the thing you're binding from.

Now, we could create a syscall that just picks up an extant superblock using a
device and attaches it to a mount for you, but that would have to be at least
partially parameterised - which would be very fs-dependent - so that it can
know whether or not you're allowed to create another mount to that sb.

What you're talking about is emulating sget() in userspace - when we have to
do it in the kernel anyway if we still offer mount(2).

David

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-12 21:00       ` David Howells
  2018-07-12 21:29         ` Linus Torvalds
@ 2018-07-13 13:27         ` David Howells
  2018-07-13 15:01           ` Andy Lutomirski
                             ` (2 more replies)
  1 sibling, 3 replies; 75+ messages in thread
From: David Howells @ 2018-07-13 13:27 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: dhowells, Andy Lutomirski, Andrew Lutomirski, Al Viro, Linux API,
	linux-fsdevel, Linux Kernel Mailing List, Jann Horn,
	Tycho Andersen

Whilst I'm at it, do we want the option of doing the equivalent of mountat()?
I.e. offering the option to open all the device files used by a superblock
with dfd and AT_* flags in combination with the filename?

David

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-13 13:27         ` David Howells
@ 2018-07-13 15:01           ` Andy Lutomirski
  2018-07-13 15:40           ` David Howells
  2018-07-17  9:40           ` David Howells
  2 siblings, 0 replies; 75+ messages in thread
From: Andy Lutomirski @ 2018-07-13 15:01 UTC (permalink / raw)
  To: David Howells
  Cc: Linus Torvalds, Andrew Lutomirski, Al Viro, Linux API,
	linux-fsdevel, Linux Kernel Mailing List, Jann Horn,
	Tycho Andersen



> On Jul 13, 2018, at 6:27 AM, David Howells <dhowells@redhat.com> wrote:
> 
> Whilst I'm at it, do we want the option of doing the equivalent of mountat()?
> I.e. offering the option to open all the device files used by a superblock
> with dfd and AT_* flags in combination with the filename?
> 

Isn’t that more or less what I was suggesting?  I suggested dfd and path and I also suggested just an fd and letting the caller open the file itself.

> David

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-13 13:27         ` David Howells
  2018-07-13 15:01           ` Andy Lutomirski
@ 2018-07-13 15:40           ` David Howells
  2018-07-13 17:14             ` Andy Lutomirski
  2018-07-17  9:40           ` David Howells
  2 siblings, 1 reply; 75+ messages in thread
From: David Howells @ 2018-07-13 15:40 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: dhowells, Linus Torvalds, Andrew Lutomirski, Al Viro, Linux API,
	linux-fsdevel, Linux Kernel Mailing List, Jann Horn,
	Tycho Andersen

Andy Lutomirski <luto@amacapital.net> wrote:

> > Whilst I'm at it, do we want the option of doing the equivalent of
> > mountat()?  I.e. offering the option to open all the device files used by
> > a superblock with dfd and AT_* flags in combination with the filename?
> > 
> 
> Isn't that more or less what I was suggesting?

Yes, you suggested that.  I'm asking if we actually need that.

> ... I also suggested just an fd and letting the caller open the file itself.

I'm not entirely sure, but that might prevent the filesystem from being able
to use it, since userspace might then prevent the filesystem getting exclusive
holdership.

David

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-13 15:40           ` David Howells
@ 2018-07-13 17:14             ` Andy Lutomirski
  0 siblings, 0 replies; 75+ messages in thread
From: Andy Lutomirski @ 2018-07-13 17:14 UTC (permalink / raw)
  To: David Howells
  Cc: Linus Torvalds, Andrew Lutomirski, Al Viro, Linux API,
	linux-fsdevel, Linux Kernel Mailing List, Jann Horn,
	Tycho Andersen

On Fri, Jul 13, 2018 at 8:40 AM, David Howells <dhowells@redhat.com> wrote:
> Andy Lutomirski <luto@amacapital.net> wrote:
>
>> > Whilst I'm at it, do we want the option of doing the equivalent of
>> > mountat()?  I.e. offering the option to open all the device files used by
>> > a superblock with dfd and AT_* flags in combination with the filename?
>> >
>>
>> Isn't that more or less what I was suggesting?
>
> Yes, you suggested that.  I'm asking if we actually need that.
>

Suppose some program in a container chroots itself and then tries to
create an fscontext backed by "/path/to/blockdev".  The syscall gets
intercepted by a container manager.  That manager now has a somewhat
awkward time of mounting the same fs, although it could use
"/proc/PID/root/path/to/blockdev", I suppose.  Even that approach has
some potentially awkward permission issues.  I would defer to the
people who actually write software like this, but I can imagine fds
being considerably easier to work with.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-13 13:27         ` David Howells
  2018-07-13 15:01           ` Andy Lutomirski
  2018-07-13 15:40           ` David Howells
@ 2018-07-17  9:40           ` David Howells
  2 siblings, 0 replies; 75+ messages in thread
From: David Howells @ 2018-07-17  9:40 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: dhowells, Linus Torvalds, Andrew Lutomirski, Al Viro, Linux API,
	linux-fsdevel, Linux Kernel Mailing List, Jann Horn,
	Tycho Andersen

Andy Lutomirski <luto@amacapital.net> wrote:

> > Whilst I'm at it, do we want the option of doing the equivalent of
> > mountat()?  I.e. offering the option to open all the device files used by
> > a superblock with dfd and AT_* flags in combination with the filename?
> > 
> 
> Isn’t that more or less what I was suggesting?  I suggested dfd and path and I also suggested just an fd and letting the caller open the file itself.

Do we need AT_* flags?  There are three that we could use:

	AT_SYMLINK_NOFOLLOW
	AT_NO_AUTOMOUNT
	AT_EMPTY_PATH

AT_EMPTY_PATH I can see, but I don't see it as likely that we'd want to use
the other two for selecting a source?  Note that we can always do:

	fsfd = fsopen("ext4");
	sfd = open("/dev/", O_PATH);
	fsconfig(fsfd, fsconfig_set_path, "journal_path", "sda1", sfd);

or:

	fsfd = fsopen("ext4");
	sfd = open("/dev/sda1", O_PATH);
	fsconfig(fsfd, fsconfig_set_path_empty, "journal_path", "", sfd);

or:

	fsfd = fsopen("ext4");
	jfd = open("/dev/sda1", O_RDWR);
	fsconfig(fsfd, fsconfig_set_fd, "journal_path", NULL, jfd);

assuming the open on the latter doesn't exclude the use by the filesystem.

This way I don't need a second syscall or a 6-arg syscall to handle path
specification.

David

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation [ver #9]
  2018-07-13  7:30                 ` David Howells
@ 2018-07-19  1:30                   ` Eric W. Biederman
  0 siblings, 0 replies; 75+ messages in thread
From: Eric W. Biederman @ 2018-07-19  1:30 UTC (permalink / raw)
  To: David Howells
  Cc: Andy Lutomirski, Theodore Y. Ts'o, Linus Torvalds,
	Andrew Lutomirski, Al Viro, Linux API, linux-fsdevel,
	Linux Kernel Mailing List, Jann Horn

David Howells <dhowells@redhat.com> writes:

> Andy Lutomirski <luto@amacapital.net> wrote:
>
>> > Also you can't currently directly create a bind mount from userspace as you
>> > can only bind from another path point - which you may not be able to access
>> > (either by permission failure or because it's not in your mount namespace).
>> > 
>> 
>> Are you trying to preserve the magic bind semantics with the new API?
>
> No, I'm pointing out that you can't emulate this by doing a bind mount from
> userspace if you can't access the thing you're binding from.
>
> Now, we could create a syscall that just picks up an extant superblock using a
> device and attaches it to a mount for you, but that would have to be at least
> partially parameterised - which would be very fs-dependent - so that it can
> know whether or not you're allowed to create another mount to that sb.
>
> What you're talking about is emulating sget() in userspace - when we have to
> do it in the kernel anyway if we still offer mount(2).

I am just going to chime in and say that it is absolutely a problem in
the current mount interface that when I mount a filesystem with fresh
parameters I don't know if it is generates an sget and a new super_block
or if it just increments the refcount on an existing super_block.

It is the kind of problem that is actually security sensitive and has
resulted in a security issue in the current linux kernel with respect to
proc.

So yes we absolutely need to have a clean way of dealing with:

mount /dev/sda3 /tmp
mount /dev/sda3 /mnt

So that the second one is forbidden fails.  And userspace has to do the
equivalent of sget to get a file descriptor it can bind into the mount
namespace.

The deep problem is that the second mount does not parse the mount
options and userspace does not know that.  So userspace thinks it is
getting one kind of mount and in practice it gets another (sometimes
with different security properties).  Those different security
properties are an out and out bug.  Although any kind of different and
unexpected properties can be a problem.

Eric

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [MANPAGE PATCH] Add manpages for move_mount(2) and open_tree(2)
  2018-07-10 22:52 ` [MANPAGE PATCH] Add manpages for move_mount(2) and open_tree(2) David Howells
@ 2019-10-09  9:51   ` Michael Kerrisk (man-pages)
  0 siblings, 0 replies; 75+ messages in thread
From: Michael Kerrisk (man-pages) @ 2019-10-09  9:51 UTC (permalink / raw)
  To: David Howells
  Cc: mtk.manpages, viro, linux-api, linux-fsdevel, torvalds,
	linux-kernel, linux-man, Eric W. Biederman

Hello David,

Your wrote a series of manual pages patches (of which the mail below is one)
for the new mount API about a year before the code patches were actually
released in the kernel.

I'd like to check that these man-pages patches are up to date before
merging them. I think they may not be, since there is one patch for
fsinfo(2) which does not exist in the kernel, and no manual page for
fsconfig(2). I imagine that details may also have changed
in the system calls that were ultimately merged.

Could you write a manual page for fsconfig(2) please?

With respect to the patch below, would you be willing to:
* split it into two pieces, one for each page.
* review the content to see if it accurately reflects what was
  merged into the kernel and then resubmit please?

Thanks,

Michael

On 7/11/18 12:52 AM, David Howells wrote:
> Add manual pages to document the move_mount and open_tree() system calls.
> 
> Signed-off-by: David Howells <dhowells@redhat.com>
> ---
> 
>  man2/move_mount.2 |  274 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  man2/open_tree.2  |  260 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 534 insertions(+)
>  create mode 100644 man2/move_mount.2
>  create mode 100644 man2/open_tree.2
> 
> diff --git a/man2/move_mount.2 b/man2/move_mount.2
> new file mode 100644
> index 000000000..3a819fb84
> --- /dev/null
> +++ b/man2/move_mount.2
> @@ -0,0 +1,274 @@
> +'\" t
> +.\" Copyright (c) 2018 David Howells <dhowells@redhat.com>
> +.\"
> +.\" %%%LICENSE_START(VERBATIM)
> +.\" Permission is granted to make and distribute verbatim copies of this
> +.\" manual provided the copyright notice and this permission notice are
> +.\" preserved on all copies.
> +.\"
> +.\" Permission is granted to copy and distribute modified versions of this
> +.\" manual under the conditions for verbatim copying, provided that the
> +.\" entire resulting derived work is distributed under the terms of a
> +.\" permission notice identical to this one.
> +.\"
> +.\" Since the Linux kernel and libraries are constantly changing, this
> +.\" manual page may be incorrect or out-of-date.  The author(s) assume no
> +.\" responsibility for errors or omissions, or for damages resulting from
> +.\" the use of the information contained herein.  The author(s) may not
> +.\" have taken the same level of care in the production of this manual,
> +.\" which is licensed free of charge, as they might when working
> +.\" professionally.
> +.\"
> +.\" Formatted or processed versions of this manual, if unaccompanied by
> +.\" the source, must acknowledge the copyright and authors of this work.
> +.\" %%%LICENSE_END
> +.\"
> +.TH MOVE_MOUNT 2 2018-06-08 "Linux" "Linux Programmer's Manual"
> +.SH NAME
> +move_mount \- Move mount objects around the filesystem topology
> +.SH SYNOPSIS
> +.nf
> +.B #include <sys/types.h>
> +.br
> +.B #include <sys/mount.h>
> +.br
> +.B #include <unistd.h>
> +.br
> +.BR "#include <fcntl.h>           " "/* Definition of AT_* constants */"
> +.PP
> +.BI "int move_mount(int " from_dirfd ", const char *" from_pathname ","
> +.BI "               int " to_dirfd ", const char *" to_pathname ","
> +.BI "               unsigned int " flags );
> +.fi
> +.PP
> +.IR Note :
> +There are no glibc wrappers for these system calls.
> +.SH DESCRIPTION
> +The
> +.BR move_mount ()
> +call moves a mount from one place to another; it can also be used to attach an
> +unattached mount created by
> +.BR fsmount "() or " open_tree "() with " OPEN_TREE_CLONE .
> +.PP
> +If
> +.BR move_mount ()
> +is called repeatedly with a file descriptor that refers to a mount object,
> +then the object will be attached/moved the first time and then moved again and
> +again and again, detaching it from the previous mountpoint each time.
> +.PP
> +To access the source mount object or the destination mountpoint, no
> +permissions are required on the object itself, but if either pathname is
> +supplied, execute (search) permission is required on all of the directories
> +specified in
> +.IR from_pathname " or " to_pathname .
> +.PP
> +The caller does, however, require the appropriate capabilities or permission
> +to effect a mount.
> +.PP
> +.BR move_mount ()
> +uses
> +.IR from_pathname ", " from_dirfd " and some " flags
> +to locate the mount object to be moved and
> +.IR to_pathname ", " to_dirfd " and some other " flags
> +to locate the destination mountpoint.  Each lookup can be done in one of a
> +variety of ways:
> +.TP
> +[*] By absolute path.
> +The pathname points to an absolute path and the dirfd is ignored.  The file is
> +looked up by name, starting from the root of the filesystem as seen by the
> +calling process.
> +.TP
> +[*] By cwd-relative path.
> +The pathname points to a relative path and the dirfd is
> +.IR AT_FDCWD .
> +The file is looked up by name, starting from the current working directory.
> +.TP
> +[*] By dir-relative path.
> +The pathname points to relative path and the dirfd indicates a file descriptor
> +pointing to a directory.  The file is looked up by name, starting from the
> +directory specified by
> +.IR dirfd .
> +.TP
> +[*] By file descriptor.
> +The pathname points to "", the dirfd points directly to the mount object to
> +move or the destination mount point and the appropriate
> +.B *_EMPTY_PATH
> +flag is set.
> +.PP
> +.I flags
> +can be used to influence a path-based lookup.  A value for
> +.I flags
> +is constructed by OR'ing together zero or more of the following constants:
> +.TP
> +.BR MOVE_MOUNT_F_EMPTY_PATH
> +.\" commit 65cfc6722361570bfe255698d9cd4dccaf47570d
> +If
> +.I from_pathname
> +is an empty string, operate on the file referred to by
> +.IR from_dirfd
> +(which may have been obtained using the
> +.BR open (2)
> +.B O_PATH
> +flag or
> +.BR open_tree ())
> +If
> +.I from_dirfd
> +is
> +.BR AT_FDCWD ,
> +the call operates on the current working directory.
> +In this case,
> +.I from_dirfd
> +can refer to any type of file, not just a directory.
> +This flag is Linux-specific; define
> +.B _GNU_SOURCE
> +.\" Before glibc 2.16, defining _ATFILE_SOURCE sufficed
> +to obtain its definition.
> +.TP
> +.B MOVE_MOUNT_T_EMPTY_PATH
> +As above, but operating on
> +.IR to_pathname " and " to_dirfd .
> +.TP
> +.B MOVE_MOUNT_F_NO_AUTOMOUNT
> +Don't automount the terminal ("basename") component of
> +.I from_pathname
> +if it is a directory that is an automount point.  This allows a mount object
> +that has an automount point at its root to be moved and prevents unintended
> +triggering of an automount point.
> +The
> +.B MOVE_MOUNT_F_NO_AUTOMOUNT
> +flag has no effect if the automount point has already been mounted over.  This
> +flag is Linux-specific; define
> +.B _GNU_SOURCE
> +.\" Before glibc 2.16, defining _ATFILE_SOURCE sufficed
> +to obtain its definition.
> +.TP
> +.B MOVE_MOUNT_T_NO_AUTOMOUNT
> +As above, but operating on
> +.IR to_pathname " and " to_dirfd .
> +This allows an automount point to be manually mounted over.
> +.TP
> +.B MOVE_MOUNT_F_SYMLINKS
> +If
> +.I from_pathname
> +is a symbolic link, then dereference it.  The default for
> +.BR move_mount ()
> +is to not follow symlinks.
> +.TP
> +.B MOVE_MOUNT_T_SYMLINKS
> +As above, but operating on
> +.IR to_pathname " and " to_dirfd .
> +
> +.SH EXAMPLES
> +The
> +.BR move_mount ()
> +function can be used like the following:
> +.PP
> +.RS
> +.nf
> +move_mount(AT_FDCWD, "/a", AT_FDCWD, "/b", 0);
> +.fi
> +.RE
> +.PP
> +This would move the object mounted on "/a" to "/b".  It can also be used in
> +conjunction with
> +.BR open_tree "(2) or " open "(2) with " O_PATH :
> +.PP
> +.RS
> +.nf
> +fd = open_tree(AT_FDCWD, "/mnt", 0);
> +move_mount(fd, "", AT_FDCWD, "/mnt2", MOVE_MOUNT_F_EMPTY_PATH);
> +move_mount(fd, "", AT_FDCWD, "/mnt3", MOVE_MOUNT_F_EMPTY_PATH);
> +move_mount(fd, "", AT_FDCWD, "/mnt4", MOVE_MOUNT_F_EMPTY_PATH);
> +.fi
> +.RE
> +.PP
> +This would attach the path point for "/mnt" to fd, then it would move the
> +mount to "/mnt2", then move it to "/mnt3" and finally to "/mnt4".
> +.PP
> +It can also be used to attach new mounts:
> +.PP
> +.RS
> +.nf
> +sfd = fsopen("ext4", FSOPEN_CLOEXEC);
> +write(sfd, "s /dev/sda1");
> +write(sfd, "o user_xattr");
> +mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_NODEV);
> +move_mount(mfd, "", AT_FDCWD, "/home", MOVE_MOUNT_F_EMPTY_PATH);
> +.fi
> +.RE
> +.PP
> +Which would open the Ext4 filesystem mounted on "/dev/sda1", turn on user
> +extended attribute support and create a mount object for it.  Finally, the new
> +mount object would be attached with
> +.BR move_mount ()
> +to "/home".
> +
> +
> +.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
> +.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
> +.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
> +.SH RETURN VALUE
> +On success, 0 is returned.  On error, \-1 is returned, and
> +.I errno
> +is set appropriately.
> +.SH ERRORS
> +.TP
> +.B EACCES
> +Search permission is denied for one of the directories
> +in the path prefix of
> +.IR pathname .
> +(See also
> +.BR path_resolution (7).)
> +.TP
> +.B EBADF
> +.IR from_dirfd " or " to_dirfd
> +is not a valid open file descriptor.
> +.TP
> +.B EFAULT
> +.IR from_pathname " or " to_pathname
> +is NULL or either one point to a location outside the process's accessible
> +address space.
> +.TP
> +.B EINVAL
> +Reserved flag specified in
> +.IR flags .
> +.TP
> +.B ELOOP
> +Too many symbolic links encountered while traversing the pathname.
> +.TP
> +.B ENAMETOOLONG
> +.IR from_pathname " or " to_pathname
> +is too long.
> +.TP
> +.B ENOENT
> +A component of
> +.IR from_pathname " or " to_pathname
> +does not exist, or one is an empty string and the appropriate
> +.B *_EMPTY_PATH
> +was not specified in
> +.IR flags .
> +.TP
> +.B ENOMEM
> +Out of memory (i.e., kernel memory).
> +.TP
> +.B ENOTDIR
> +A component of the path prefix of
> +.IR from_pathname " or " to_pathname
> +is not a directory or one or the other is relative and the appropriate
> +.I *_dirfd
> +is a file descriptor referring to a file other than a directory.
> +.SH VERSIONS
> +.BR move_mount ()
> +was added to Linux in kernel 4.18.
> +.SH CONFORMING TO
> +.BR move_mount ()
> +is Linux-specific.
> +.SH NOTES
> +Glibc does not (yet) provide a wrapper for the
> +.BR move_mount ()
> +system call; call it using
> +.BR syscall (2).
> +.SH SEE ALSO
> +.BR fsmount (2),
> +.BR fsopen (2),
> +.BR open_tree (2)
> diff --git a/man2/open_tree.2 b/man2/open_tree.2
> new file mode 100644
> index 000000000..7e9c86fe3
> --- /dev/null
> +++ b/man2/open_tree.2
> @@ -0,0 +1,260 @@
> +'\" t
> +.\" Copyright (c) 2018 David Howells <dhowells@redhat.com>
> +.\"
> +.\" %%%LICENSE_START(VERBATIM)
> +.\" Permission is granted to make and distribute verbatim copies of this
> +.\" manual provided the copyright notice and this permission notice are
> +.\" preserved on all copies.
> +.\"
> +.\" Permission is granted to copy and distribute modified versions of this
> +.\" manual under the conditions for verbatim copying, provided that the
> +.\" entire resulting derived work is distributed under the terms of a
> +.\" permission notice identical to this one.
> +.\"
> +.\" Since the Linux kernel and libraries are constantly changing, this
> +.\" manual page may be incorrect or out-of-date.  The author(s) assume no
> +.\" responsibility for errors or omissions, or for damages resulting from
> +.\" the use of the information contained herein.  The author(s) may not
> +.\" have taken the same level of care in the production of this manual,
> +.\" which is licensed free of charge, as they might when working
> +.\" professionally.
> +.\"
> +.\" Formatted or processed versions of this manual, if unaccompanied by
> +.\" the source, must acknowledge the copyright and authors of this work.
> +.\" %%%LICENSE_END
> +.\"
> +.TH OPEN_TREE 2 2018-06-08 "Linux" "Linux Programmer's Manual"
> +.SH NAME
> +open_tree \- Pick or clone mount object and attach to fd
> +.SH SYNOPSIS
> +.nf
> +.B #include <sys/types.h>
> +.br
> +.B #include <sys/mount.h>
> +.br
> +.B #include <unistd.h>
> +.br
> +.BR "#include <fcntl.h>           " "/* Definition of AT_* constants */"
> +.PP
> +.BI "int open_tree(int " dirfd ", const char *" pathname ", unsigned int " flags );
> +.fi
> +.PP
> +.IR Note :
> +There are no glibc wrappers for these system calls.
> +.SH DESCRIPTION
> +.BR open_tree ()
> +picks the mount object specified by the pathname and attaches it to a new file
> +descriptor or clones it and attaches the clone to the file descriptor.  The
> +resultant file descriptor is indistinguishable from one produced by
> +.BR open "(2) with " O_PATH .
> +.PP
> +In the case that the mount object is cloned, the clone will be "unmounted" and
> +destroyed when the file descriptor is closed if it is not otherwise mounted
> +somewhere by calling
> +.BR move_mount (2).
> +.PP
> +To select a mount object, no permissions are required on the object referred
> +to by the path, but execute (search) permission is required on all of the
> +directories in
> +.I pathname
> +that lead to the object.
> +.PP
> +To clone an object, however, the caller must have mount capabilities and
> +permissions.
> +.PP
> +.BR open_tree ()
> +uses
> +.IR pathname ", " dirfd " and " flags
> +to locate the target object in one of a variety of ways:
> +.TP
> +[*] By absolute path.
> +.I pathname
> +points to an absolute path and
> +.I dirfd
> +is ignored.  The object is looked up by name, starting from the root of the
> +filesystem as seen by the calling process.
> +.TP
> +[*] By cwd-relative path.
> +.I pathname
> +points to a relative path and
> +.IR dirfd " is " AT_FDCWD .
> +The object is looked up by name, starting from the current working directory.
> +.TP
> +[*] By dir-relative path.
> +.I pathname
> +points to relative path and
> +.I dirfd
> +indicates a file descriptor pointing to a directory.  The object is looked up
> +by name, starting from the directory specified by
> +.IR dirfd .
> +.TP
> +[*] By file descriptor.
> +.I pathname
> +is "",
> +.I dirfd
> +indicates a file descriptor and
> +.B AT_EMPTY_PATH
> +is set in
> +.IR flags .
> +The mount attached to the file descriptor is queried directly.  The file
> +descriptor may point to any type of file, not just a directory.
> +
> +.\"______________________________________________________________
> +.PP
> +.I flags
> +can be used to control the operation of the function and to influence a
> +path-based lookup.  A value for
> +.I flags
> +is constructed by OR'ing together zero or more of the following constants:
> +.TP
> +.BR AT_EMPTY_PATH
> +.\" commit 65cfc6722361570bfe255698d9cd4dccaf47570d
> +If
> +.I pathname
> +is an empty string, operate on the file referred to by
> +.IR dirfd
> +(which may have been obtained from
> +.BR open "(2) with"
> +.BR O_PATH ", from " fsmount (2)
> +or from another
> +.BR open_tree ()).
> +If
> +.I dirfd
> +is
> +.BR AT_FDCWD ,
> +the call operates on the current working directory.
> +In this case,
> +.I dirfd
> +can refer to any type of file, not just a directory.
> +This flag is Linux-specific; define
> +.B _GNU_SOURCE
> +.\" Before glibc 2.16, defining _ATFILE_SOURCE sufficed
> +to obtain its definition.
> +.TP
> +.BR AT_NO_AUTOMOUNT
> +Don't automount the terminal ("basename") component of
> +.I pathname
> +if it is a directory that is an automount point.  This flag allows the
> +automount point itself to be picked up or a mount cloned that is rooted on the
> +automount point.  The
> +.B AT_NO_AUTOMOUNT
> +flag has no effect if the mount point has already been mounted over.
> +This flag is Linux-specific; define
> +.B _GNU_SOURCE
> +.\" Before glibc 2.16, defining _ATFILE_SOURCE sufficed
> +to obtain its definition.
> +.TP
> +.B AT_SYMLINK_NOFOLLOW
> +If
> +.I pathname
> +is a symbolic link, do not dereference it: instead pick up or clone a mount
> +rooted on the link itself.
> +.TP
> +.B OPEN_TREE_CLOEXEC
> +Set the close-on-exec flag for the new file descriptor.  This will cause the
> +file descriptor to be closed automatically when a process exec's.
> +.TP
> +.B OPEN_TREE_CLONE
> +Rather than directly attaching the selected object to the file descriptor,
> +clone the object, set the root of the new mount object to that point and
> +attach the clone to the file descriptor.
> +.TP
> +.B AT_RECURSIVE
> +This is only permitted in conjunction with OPEN_TREE_CLONE.  It causes the
> +entire mount subtree rooted at the selected spot to be cloned rather than just
> +that one mount object.
> +
> +
> +.SH EXAMPLE
> +The
> +.BR open_tree ()
> +function can be used like the following:
> +.PP
> +.RS
> +.nf
> +fd1 = open_tree(AT_FDCWD, "/mnt", 0);
> +fd2 = open_tree(fd1, "",
> +                AT_EMPTY_PATH | OPEN_TREE_CLONE | AT_RECURSIVE);
> +move_mount(fd2, "", AT_FDCWD, "/mnt2", MOVE_MOUNT_F_EMPTY_PATH);
> +.fi
> +.RE
> +.PP
> +This would attach the path point for "/mnt" to fd1, then it would copy the
> +entire subtree at the point referred to by fd1 and attach that to fd2; lastly,
> +it would attach the clone to "/mnt2".
> +
> +
> +.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
> +.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
> +.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
> +.SH RETURN VALUE
> +On success, the new file descriptor is returned.  On error, \-1 is returned,
> +and
> +.I errno
> +is set appropriately.
> +.SH ERRORS
> +.TP
> +.B EACCES
> +Search permission is denied for one of the directories
> +in the path prefix of
> +.IR pathname .
> +(See also
> +.BR path_resolution (7).)
> +.TP
> +.B EBADF
> +.I dirfd
> +is not a valid open file descriptor.
> +.TP
> +.B EFAULT
> +.I pathname
> +is NULL or
> +.IR pathname
> +point to a location outside the process's accessible address space.
> +.TP
> +.B EINVAL
> +Reserved flag specified in
> +.IR flags .
> +.TP
> +.B ELOOP
> +Too many symbolic links encountered while traversing the pathname.
> +.TP
> +.B ENAMETOOLONG
> +.I pathname
> +is too long.
> +.TP
> +.B ENOENT
> +A component of
> +.I pathname
> +does not exist, or
> +.I pathname
> +is an empty string and
> +.B AT_EMPTY_PATH
> +was not specified in
> +.IR flags .
> +.TP
> +.B ENOMEM
> +Out of memory (i.e., kernel memory).
> +.TP
> +.B ENOTDIR
> +A component of the path prefix of
> +.I pathname
> +is not a directory or
> +.I pathname
> +is relative and
> +.I dirfd
> +is a file descriptor referring to a file other than a directory.
> +.SH VERSIONS
> +.BR open_tree ()
> +was added to Linux in kernel 4.18.
> +.SH CONFORMING TO
> +.BR open_tree ()
> +is Linux-specific.
> +.SH NOTES
> +Glibc does not (yet) provide a wrapper for the
> +.BR open_tree ()
> +system call; call it using
> +.BR syscall (2).
> +.SH SEE ALSO
> +.BR fsmount (2),
> +.BR move_mount (2),
> +.BR open (2)
> 


-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [MANPAGE PATCH] Add manpage for fsopen(2), fspick(2) and fsmount(2)
  2018-07-10 22:54 ` [MANPAGE PATCH] Add manpage for fsopen(2), fspick(2) and fsmount(2) David Howells
@ 2019-10-09  9:52   ` Michael Kerrisk (man-pages)
  0 siblings, 0 replies; 75+ messages in thread
From: Michael Kerrisk (man-pages) @ 2019-10-09  9:52 UTC (permalink / raw)
  To: David Howells
  Cc: mtk.manpages, viro, linux-api, linux-fsdevel, torvalds,
	linux-kernel, linux-man, Eric W. Biederman

Hello David,

See my previous mail.

With respect to the patch below, would you be willing to review
the content of this man-pages patch to see if it accurately reflects 
what was merged into the kernel, and then resubmit please?

Thanks,

Michael

On 7/11/18 12:54 AM, David Howells wrote:
> Add a manual page to document the fsopen(), fspick() and fsmount() system
> calls.
> 
> Signed-off-by: David Howells <dhowells@redhat.com>
> ---
> 
>  man2/fsmount.2 |    1 
>  man2/fsopen.2  |  357 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  man2/fspick.2  |    1 
>  3 files changed, 359 insertions(+)
>  create mode 100644 man2/fsmount.2
>  create mode 100644 man2/fsopen.2
>  create mode 100644 man2/fspick.2
> 
> diff --git a/man2/fsmount.2 b/man2/fsmount.2
> new file mode 100644
> index 000000000..2bf59fc3e
> --- /dev/null
> +++ b/man2/fsmount.2
> @@ -0,0 +1 @@
> +.so man2/fsopen.2
> diff --git a/man2/fsopen.2 b/man2/fsopen.2
> new file mode 100644
> index 000000000..1bc761ab4
> --- /dev/null
> +++ b/man2/fsopen.2
> @@ -0,0 +1,357 @@
> +'\" t
> +.\" Copyright (c) 2018 David Howells <dhowells@redhat.com>
> +.\"
> +.\" %%%LICENSE_START(VERBATIM)
> +.\" Permission is granted to make and distribute verbatim copies of this
> +.\" manual provided the copyright notice and this permission notice are
> +.\" preserved on all copies.
> +.\"
> +.\" Permission is granted to copy and distribute modified versions of this
> +.\" manual under the conditions for verbatim copying, provided that the
> +.\" entire resulting derived work is distributed under the terms of a
> +.\" permission notice identical to this one.
> +.\"
> +.\" Since the Linux kernel and libraries are constantly changing, this
> +.\" manual page may be incorrect or out-of-date.  The author(s) assume no
> +.\" responsibility for errors or omissions, or for damages resulting from
> +.\" the use of the information contained herein.  The author(s) may not
> +.\" have taken the same level of care in the production of this manual,
> +.\" which is licensed free of charge, as they might when working
> +.\" professionally.
> +.\"
> +.\" Formatted or processed versions of this manual, if unaccompanied by
> +.\" the source, must acknowledge the copyright and authors of this work.
> +.\" %%%LICENSE_END
> +.\"
> +.TH FSOPEN 2 2018-06-07 "Linux" "Linux Programmer's Manual"
> +.SH NAME
> +fsopen, fsmount, fspick \- Handle filesystem (re-)configuration and mounting
> +.SH SYNOPSIS
> +.nf
> +.B #include <sys/types.h>
> +.br
> +.B #include <sys/mount.h>
> +.br
> +.B #include <unistd.h>
> +.br
> +.BR "#include <fcntl.h>           " "/* Definition of AT_* constants */"
> +.PP
> +.BI "int fsopen(const char *" fsname ", unsigned int " flags );
> +.PP
> +.BI "int fsmount(int " fd ", unsigned int " flags ", unsigned int " ms_flags );
> +.PP
> +.BI "int fspick(int " dirfd ", const char *" pathname ", unsigned int " flags );
> +.fi
> +.PP
> +.IR Note :
> +There are no glibc wrappers for these system calls.
> +.SH DESCRIPTION
> +.PP
> +.BR fsopen ()
> +creates a new filesystem configuration context within the kernel for the
> +filesystem named in the
> +.I fsname
> +parameter and attaches it to a file descriptor, which it then returns.  The
> +file descriptor can be marked close-on-exec by setting
> +.B FSOPEN_CLOEXEC
> +in flags.
> +.PP
> +The
> +file descriptor can then be used to configure the desired filesystem parameters
> +and security parameters by using
> +.BR write (2)
> +to pass parameters to it and then writing a command to actually create the
> +filesystem representation.
> +.PP
> +The file descriptor also serves as a channel by which more comprehensive error,
> +warning and information messages may be retrieved from the kernel using
> +.BR read (2).
> +.PP
> +Once the kernel's filesystem representation has been created, it can be queried
> +by calling
> +.BR fsinfo (2)
> +on the file descriptor.  fsinfo() will spot that the target is actually a
> +creation context and look inside that.
> +.PP
> +.BR fsmount ()
> +can then be called to create a mount object that refers to the newly created
> +filesystem representation, with the propagation and mount restrictions to be
> +applied specified in
> +.IR ms_flags .
> +The mount object is then attached to a new file descriptor that looks like one
> +created by
> +.BR open "(2) with " O_PATH " or " open_tree (2).
> +This can be passed to
> +.BR move_mount (2)
> +to attach the mount object to a mountpoint, thereby completing the process.
> +.PP
> +The file descriptor returned by fsmount() is marked close-on-exec if
> +FSMOUNT_CLOEXEC is specified in
> +.IR flags .
> +.PP
> +After fsmount() has completed, the context created by fsopen() is reset and
> +moved to reconfiguration state, allowing the new superblock to be reconfigured.
> +.PP
> +.BR fspick ()
> +creates a new filesystem context within the kernel, attaches the superblock
> +specified by
> +.IR dfd ", " pathname ", " flags
> +and puts it into the reconfiguration state and attached the context to a new
> +file descriptor that can then be parameterised with
> +.BR write (2)
> +exactly the same as for the context created by fsopen() above.
> +.PP
> +.I flags
> +is an OR'd together mask of
> +.B FSPICK_CLOEXEC
> +which indicates that the returned file descriptor should be marked
> +close-on-exec and
> +.BR FSPICK_SYMLINK_NOFOLLOW ", " FSPICK_NO_AUTOMOUNT " and " FSPICK_EMPTY_PATH
> +which control the pathwalk to the target object (see below).
> +
> +.\"________________________________________________________
> +.SS Writable Command Interface
> +Superblock (re-)configuration is achieved by writing command strings to the
> +context file descriptor using
> +.BR write (2).
> +Each string is prefixed with a specifier indicating the class of command
> +being specified.  The available commands include:
> +.TP
> +\fB"o <option>"\fP
> +Specify a filesystem or security parameter.
> +.I <option>
> +is typically a key or key=val format string.  Since the length of the option is
> +given to write(), the option may include any sort of character, including
> +spaces and commas or even binary data.
> +.TP
> +\fB"s <name>"\fP
> +Specify a device file, network server or other other source specification.
> +This may be optional, depending on the filesystem, and it may be possible to
> +provide multiple of them to a filesystem.
> +.TP
> +\fB"x create"\fP
> +End the filesystem configuration phase and try and create a representation in
> +the kernel with the parameters specified.  After this, the context is shifted
> +to the mount-pending state waiting for an fsmount() call to occur.
> +.TP
> +\fB"x reconfigure"\fP
> +End a filesystem reconfiguration phase try to apply the parameters to the
> +filesystem representation.  After this, the context gets reset and put back to
> +the start of the reconfiguration phase again.
> +.PP
> +With this interface, option strings are not limited to 4096 bytes, either
> +individually or in sum, and they are also not restricted to text-only options.
> +Further, errors may be given individually for each option and not aggregated or
> +dumped into the kernel log.
> +
> +.\"________________________________________________________
> +.SS Message Retrieval Interface
> +The context file descriptor may be queried for message strings at any time by
> +calling
> +.BR read (2)
> +on the file descriptor.  This will return formatted messages that are prefixed
> +to indicate their class:
> +.TP
> +\fB"e <message>"\fP
> +An error message string was logged.
> +.TP
> +\fB"i <message>"\fP
> +An informational message string was logged.
> +.TP
> +\fB"w <message>"\fP
> +An warning message string was logged.
> +.PP
> +Messages are removed from the queue as they're read.
> +
> +.\"________________________________________________________
> +.SH EXAMPLES
> +To illustrate the process, here's an example whereby this can be used to mount
> +an ext4 filesystem on /dev/sdb1 onto /mnt.  Note that the example ignores the
> +fact that
> +.BR write (2)
> +has a length parameter and that errors might occur.
> +.PP
> +.in +4n
> +.nf
> +sfd = fsopen("ext4", FSOPEN_CLOEXEC);
> +write(sfd, "s /dev/sdb1");
> +write(sfd, "o noatime");
> +write(sfd, "o acl");
> +write(sfd, "o user_attr");
> +write(sfd, "o iversion");
> +write(sfd, "x create");
> +fsinfo(sfd, NULL, ...);
> +mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_RELATIME);
> +move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
> +.fi
> +.in
> +.PP
> +Here, an ext4 context is created first and attached to sfd.  This is then told
> +where its source will be, given a bunch of options and created.
> +.BR fsinfo (2)
> +can then be used to query the filesystem.  Then fsmount() is called to create a
> +mount object and
> +.BR move_mount (2)
> +is called to attach it to its intended mountpoint.
> +.PP
> +And here's an example of mounting from an NFS server:
> +.PP
> +.in +4n
> +.nf
> +sfd = fsopen("nfs", 0);
> +write(sfd, "s example.com/pub/linux");
> +write(sfd, "o nfsvers=3");
> +write(sfd, "o rsize=65536");
> +write(sfd, "o wsize=65536");
> +write(sfd, "o rdma");
> +write(sfd, "x create");
> +mfd = fsmount(sfd, 0, MS_NODEV);
> +move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
> +.fi
> +.in
> +.PP
> +Reconfiguration can be achieved by:
> +.PP
> +.in +4n
> +.nf
> +sfd = fspick(AT_FDCWD, "/mnt", FSPICK_NO_AUTOMOUNT | FSPICK_CLOEXEC);
> +write(sfd, "o ro");
> +write(sfd, "x reconfigure");
> +.fi
> +.in
> +.PP
> +or:
> +.PP
> +.in +4n
> +.nf
> +sfd = fsopen(...);
> +...
> +mfd = fsmount(sfd, ...);
> +...
> +write(sfd, "o ro");
> +write(sfd, "x reconfigure");
> +.fi
> +.in
> +
> +
> +.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
> +.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
> +.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
> +.SH RETURN VALUE
> +On success, all three functions return a file descriptor.  On error, \-1 is
> +returned, and
> +.I errno
> +is set appropriately.
> +.SH ERRORS
> +The error values given below result from filesystem type independent
> +errors.
> +Each filesystem type may have its own special errors and its
> +own special behavior.
> +See the Linux kernel source code for details.
> +.TP
> +.B EACCES
> +A component of a path was not searchable.
> +(See also
> +.BR path_resolution (7).)
> +.TP
> +.B EACCES
> +Mounting a read-only filesystem was attempted without giving the
> +.B MS_RDONLY
> +flag.
> +.TP
> +.B EACCES
> +The block device
> +.I source
> +is located on a filesystem mounted with the
> +.B MS_NODEV
> +option.
> +.\" mtk: Probably: write permission is required for MS_BIND, with
> +.\" the error EPERM if not present; CAP_DAC_OVERRIDE is required.
> +.TP
> +.B EBUSY
> +.I source
> +cannot be reconfigured read-only, because it still holds files open for
> +writing.
> +.TP
> +.B EFAULT
> +One of the pointer arguments points outside the user address space.
> +.TP
> +.B EINVAL
> +.I source
> +had an invalid superblock.
> +.TP
> +.B EINVAL
> +.I ms_flags
> +includes more than one of
> +.BR MS_SHARED ,
> +.BR MS_PRIVATE ,
> +.BR MS_SLAVE ,
> +or
> +.BR MS_UNBINDABLE .
> +.TP
> +.BR EINVAL
> +An attempt was made to bind mount an unbindable mount.
> +.TP
> +.B ELOOP
> +Too many links encountered during pathname resolution.
> +.TP
> +.B EMFILE
> +The system has too many open files to create more.
> +.TP
> +.B ENFILE
> +The process has too many open files to create more.
> +.TP
> +.B ENAMETOOLONG
> +A pathname was longer than
> +.BR MAXPATHLEN .
> +.TP
> +.B ENODEV
> +Filesystem
> +.I fsname
> +not configured in the kernel.
> +.TP
> +.B ENOENT
> +A pathname was empty or had a nonexistent component.
> +.TP
> +.B ENOMEM
> +The kernel could not allocate sufficient memory to complete the call.
> +.TP
> +.B ENOTBLK
> +.I source
> +is not a block device (and a device was required).
> +.TP
> +.B ENOTDIR
> +.IR pathname ,
> +or a prefix of
> +.IR source ,
> +is not a directory.
> +.TP
> +.B ENXIO
> +The major number of the block device
> +.I source
> +is out of range.
> +.TP
> +.B EPERM
> +The caller does not have the required privileges.
> +.SH CONFORMING TO
> +These functions are Linux-specific and should not be used in programs intended
> +to be portable.
> +.SH VERSIONS
> +.BR fsopen "(), " fsmount "() and " fspick ()
> +were added to Linux in kernel 4.18.
> +.SH NOTES
> +Glibc does not (yet) provide a wrapper for the
> +.BR fsopen "() , " fsmount "() or " fspick "()"
> +system calls; call them using
> +.BR syscall (2).
> +.SH SEE ALSO
> +.BR mountpoint (1),
> +.BR move_mount (2),
> +.BR open_tree (2),
> +.BR umount (2),
> +.BR mount_namespaces (7),
> +.BR path_resolution (7),
> +.BR findmnt (8),
> +.BR lsblk (8),
> +.BR mount (8),
> +.BR umount (8)
> diff --git a/man2/fspick.2 b/man2/fspick.2
> new file mode 100644
> index 000000000..2bf59fc3e
> --- /dev/null
> +++ b/man2/fspick.2
> @@ -0,0 +1 @@
> +.so man2/fsopen.2
> 


-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [MANPAGE PATCH] Add manpage for fsinfo(2)
  2018-07-10 22:55 ` [MANPAGE PATCH] Add manpage for fsinfo(2) David Howells
@ 2019-10-09  9:52   ` Michael Kerrisk (man-pages)
  2019-10-09 12:02   ` David Howells
  1 sibling, 0 replies; 75+ messages in thread
From: Michael Kerrisk (man-pages) @ 2019-10-09  9:52 UTC (permalink / raw)
  To: David Howells
  Cc: mtk.manpages, viro, linux-api, linux-fsdevel, torvalds,
	linux-kernel, linux-man, Eric W. Biederman

Hello David,

See my previous mails.

There is no fsinfo(2) in the system call in the kernel currently.
Will that call still be added, or was it replaced by fsconfig(2),
which--as far as I can tell--dnot have a man-pages patch?

Thanks,

Michael

On 7/11/18 12:55 AM, David Howells wrote:
> Add a manual page to document the fsinfo() system call.
> 
> Signed-off-by: David Howells <dhowells@redhat.com>
> ---
> 
>  man2/fsinfo.2       | 1017 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  man2/ioctl_iflags.2 |    6 
>  man2/stat.2         |    7 
>  man2/statx.2        |   13 +
>  man2/utime.2        |    7 
>  man2/utimensat.2    |    7 
>  6 files changed, 1057 insertions(+)
>  create mode 100644 man2/fsinfo.2
> 
> diff --git a/man2/fsinfo.2 b/man2/fsinfo.2
> new file mode 100644
> index 000000000..5710232df
> --- /dev/null
> +++ b/man2/fsinfo.2
> @@ -0,0 +1,1017 @@
> +'\" t
> +.\" Copyright (c) 2018 David Howells <dhowells@redhat.com>
> +.\"
> +.\" %%%LICENSE_START(VERBATIM)
> +.\" Permission is granted to make and distribute verbatim copies of this
> +.\" manual provided the copyright notice and this permission notice are
> +.\" preserved on all copies.
> +.\"
> +.\" Permission is granted to copy and distribute modified versions of this
> +.\" manual under the conditions for verbatim copying, provided that the
> +.\" entire resulting derived work is distributed under the terms of a
> +.\" permission notice identical to this one.
> +.\"
> +.\" Since the Linux kernel and libraries are constantly changing, this
> +.\" manual page may be incorrect or out-of-date.  The author(s) assume no
> +.\" responsibility for errors or omissions, or for damages resulting from
> +.\" the use of the information contained herein.  The author(s) may not
> +.\" have taken the same level of care in the production of this manual,
> +.\" which is licensed free of charge, as they might when working
> +.\" professionally.
> +.\"
> +.\" Formatted or processed versions of this manual, if unaccompanied by
> +.\" the source, must acknowledge the copyright and authors of this work.
> +.\" %%%LICENSE_END
> +.\"
> +.TH FSINFO 2 2018-06-06 "Linux" "Linux Programmer's Manual"
> +.SH NAME
> +fsinfo \- Get filesystem information
> +.SH SYNOPSIS
> +.nf
> +.B #include <sys/types.h>
> +.br
> +.B #include <sys/fsinfo.h>
> +.br
> +.B #include <unistd.h>
> +.br
> +.BR "#include <fcntl.h>           " "/* Definition of AT_* constants */"
> +.PP
> +.BI "int fsinfo(int " dirfd ", const char *" pathname ","
> +.BI "           struct fsinfo_params *" params ","
> +.BI "           void *" buffer ", size_t " buf_size );
> +.fi
> +.PP
> +.IR Note :
> +There is no glibc wrapper for
> +.BR fsinfo ();
> +see NOTES.
> +.SH DESCRIPTION
> +.PP
> +fsinfo() retrieves the desired filesystem attribute, as selected by the
> +parameters pointed to by
> +.IR params ,
> +and stores its value in the buffer pointed to by
> +.IR buffer .
> +.PP
> +The parameter structure is optional, defaulting to all the parameters being 0
> +if the pointer is NULL.  The structure looks like the following:
> +.PP
> +.in +4n
> +.nf
> +struct fsinfo_params {
> +    __u32 at_flags;     /* AT_SYMLINK_NOFOLLOW and similar flags */
> +    __u32 request;      /* Requested attribute */
> +    __u32 Nth;          /* Instance of attribute */
> +    __u32 Mth;          /* Subinstance of Nth instance */
> +    __u32 __reserved[6]; /* Reserved params; all must be 0 */
> +};
> +.fi
> +.in
> +.PP
> +The filesystem to be queried is looked up using a combination of
> +.IR dfd ", " pathname " and " params->at_flags.
> +This is discussed in more detail below.
> +.PP
> +The desired attribute is indicated by
> +.IR params->request .
> +If
> +.I params
> +is NULL, this will default to
> +.BR fsinfo_attr_statfs ,
> +which retrieves some of the information returned by
> +.BR statfs ().
> +The available attributes are described below in the "THE ATTRIBUTES" section.
> +.PP
> +Some attributes can have multiple values and some can even have multiple
> +instances with multiple values.  For example, a network filesystem might use
> +multiple servers.  The names of each of these servers can be retrieved by
> +using
> +.I params->Nth
> +to iterate through all the instances until error
> +.B ENODATA
> +occurs, indicating the end of the list.  Further, each server might have
> +multiple addresses available; these can be enumerated using
> +.I params->Nth
> +to iterate the servers and
> +.I params->Mth
> +to iterate the addresses of the Nth server.
> +.PP
> +The amount of data written into the buffer depends on the attribute selected.
> +Some attributes return variable-length strings and some return fixed-size
> +structures.  If either
> +.IR buffer " is  NULL  or " buf_size " is 0"
> +then the size of the attribute value will be returned and nothing will be
> +written into the buffer.
> +.PP
> +The
> +.I params->__reserved
> +parameters must all be 0.
> +.\"_______________________________________________________
> +.SS
> +Allowance for Future Attribute Expansion
> +.PP
> +To allow for the future expansion and addition of fields to any fixed-size
> +structure attribute,
> +.BR fsinfo ()
> +makes the following guarantees:
> +.RS 4m
> +.IP (1) 4m
> +It will always clear any excess space in the buffer.
> +.IP (2) 4m
> +It will always return the actual size of the data.
> +.IP (3) 4m
> +It will truncate the data to fit it into the buffer rather than giving an
> +error.
> +.IP (4) 4m
> +Any new version of a structure will incorporate all the fields from the old
> +version at same offsets.
> +.RE
> +.PP
> +So, for example, if the caller is running on an older version of the kernel
> +with an older, smaller version of the structure than was asked for, the kernel
> +will write the smaller version into the buffer and will clear the remainder of
> +the buffer to make sure any additional fields are set to 0.  The function will
> +return the actual size of the data.
> +.PP
> +On the other hand, if the caller is running on a newer version of the kernel
> +with a newer version of the structure that is larger than the buffer, the write
> +to the buffer will be truncated to fit as necessary and the actual size of the
> +data will be returned.
> +.PP
> +Note that this doesn't apply to variable-length string attributes.
> +
> +.\"_______________________________________________________
> +.SS
> +Invoking \fBfsinfo\fR():
> +.PP
> +To access a file's status, no permissions are required on the file itself, but
> +in the case of
> +.BR fsinfo ()
> +with a path, execute (search) permission is required on all of the directories
> +in
> +.I pathname
> +that lead to the file.
> +.PP
> +.BR fsinfo ()
> +uses
> +.IR pathname ", " dirfd " and " params->at_flags
> +to locate the target file in one of a variety of ways:
> +.TP
> +[*] By absolute path.
> +.I pathname
> +points to an absolute path and
> +.I dirfd
> +is ignored.  The file is looked up by name, starting from the root of the
> +filesystem as seen by the calling process.
> +.TP
> +[*] By cwd-relative path.
> +.I pathname
> +points to a relative path and
> +.IR dirfd " is " AT_FDCWD .
> +The file is looked up by name, starting from the current working directory.
> +.TP
> +[*] By dir-relative path.
> +.I pathname
> +points to relative path and
> +.I dirfd
> +indicates a file descriptor pointing to a directory.  The file is looked up by
> +name, starting from the directory specified by
> +.IR dirfd .
> +.TP
> +[*] By file descriptor.
> +.IR pathname " is " NULL " and " dirfd
> +indicates a file descriptor.  The file attached to the file descriptor is
> +queried directly.  The file descriptor may point to any type of file, not just
> +a directory.
> +.PP
> +.I flags
> +can be used to influence a path-based lookup.  A value for
> +.I flags
> +is constructed by OR'ing together zero or more of the following constants:
> +.TP
> +.BR AT_EMPTY_PATH
> +.\" commit 65cfc6722361570bfe255698d9cd4dccaf47570d
> +If
> +.I pathname
> +is an empty string, operate on the file referred to by
> +.IR dirfd
> +(which may have been obtained using the
> +.BR open (2)
> +.B O_PATH
> +flag).
> +If
> +.I dirfd
> +is
> +.BR AT_FDCWD ,
> +the call operates on the current working directory.
> +In this case,
> +.I dirfd
> +can refer to any type of file, not just a directory.
> +This flag is Linux-specific; define
> +.B _GNU_SOURCE
> +.\" Before glibc 2.16, defining _ATFILE_SOURCE sufficed
> +to obtain its definition.
> +.TP
> +.BR AT_NO_AUTOMOUNT
> +Don't automount the terminal ("basename") component of
> +.I pathname
> +if it is a directory that is an automount point.  This allows the caller to
> +gather attributes of the filesystem holding an automount point (rather than
> +the filesystem it would mount).  This flag can be used in tools that scan
> +directories to prevent mass-automounting of a directory of automount points.
> +The
> +.B AT_NO_AUTOMOUNT
> +flag has no effect if the mount point has already been mounted over.
> +This flag is Linux-specific; define
> +.B _GNU_SOURCE
> +.\" Before glibc 2.16, defining _ATFILE_SOURCE sufficed
> +to obtain its definition.
> +.TP
> +.B AT_SYMLINK_NOFOLLOW
> +If
> +.I pathname
> +is a symbolic link, do not dereference it:
> +instead return information about the link itself, like
> +.BR lstat ().
> +.SH THE ATTRIBUTES
> +.PP
> +There is a range of attributes that can be selected from.  These are:
> +
> +.\" __________________ fsinfo_attr_statfs __________________
> +.TP
> +.B fsinfo_attr_statfs
> +This retrieves the "dynamic"
> +.B statfs
> +information, such as block and file counts, that are expected to change whilst
> +a filesystem is being used.  This fills in the following structure:
> +.PP
> +.RS
> +.in +4n
> +.nf
> +struct fsinfo_statfs {
> +    __u64 f_blocks;	/* Total number of blocks in fs */
> +    __u64 f_bfree;	/* Total number of free blocks */
> +    __u64 f_bavail;	/* Number of free blocks available to ordinary user */
> +    __u64 f_files;	/* Total number of file nodes in fs */
> +    __u64 f_ffree;	/* Number of free file nodes */
> +    __u64 f_favail;	/* Number of free file nodes available to ordinary user */
> +    __u32 f_bsize;	/* Optimal block size */
> +    __u32 f_frsize;	/* Fragment size */
> +};
> +.fi
> +.in
> +.RE
> +.IP
> +The fields correspond to those of the same name returned by
> +.BR statfs ().
> +
> +.\" __________________ fsinfo_attr_fsinfo __________________
> +.TP
> +.B fsinfo_attr_fsinfo
> +This retrieves information about the
> +.BR fsinfo ()
> +system call itself.  This fills in the following structure:
> +.PP
> +.RS
> +.in +4n
> +.nf
> +struct fsinfo_fsinfo {
> +    __u32 max_attr;
> +    __u32 max_cap;
> +};
> +.fi
> +.in
> +.RE
> +.IP
> +The
> +.I max_attr
> +value indicates the number of attributes supported by the
> +.BR fsinfo ()
> +system call, and
> +.I max_cap
> +indicates the number of capability bits supported by the
> +.B fsinfo_attr_capabilities
> +attribute.  The first corresponds to
> +.I fsinfo_attr__nr
> +and the second to
> +.I fsinfo_cap__nr
> +in the header file.
> +
> +.\" __________________ fsinfo_attr_ids __________________
> +.TP
> +.B fsinfo_attr_ids
> +This retrieves a number of fixed IDs and other static information otherwise
> +available through
> +.BR statfs ().
> +The following structure is filled in:
> +.PP
> +.RS
> +.in +4n
> +.nf
> +struct fsinfo_ids {
> +    char  f_fs_name[15 + 1]; /* Filesystem name */
> +    __u64 f_flags;	/* Filesystem mount flags (MS_*) */
> +    __u64 f_fsid;	/* Short 64-bit Filesystem ID */
> +    __u64 f_sb_id;	/* Internal superblock ID */
> +    __u32 f_fstype;	/* Filesystem type from linux/magic.h */
> +    __u32 f_dev_major;	/* As st_dev_* from struct statx */
> +    __u32 f_dev_minor;
> +};
> +.fi
> +.in
> +.RE
> +.IP
> +Most of these are filled in as for
> +.BR statfs (),
> +with the addition of the filesystem's symbolic name in
> +.I f_fs_name
> +and an identifier for use in notifications in
> +.IR f_sb_id .
> +
> +.\" __________________ fsinfo_attr_limits __________________
> +.TP
> +.B fsinfo_attr_limits
> +This retrieves information about the limits of what a filesystem can support.
> +The following structure is filled in:
> +.PP
> +.RS
> +.in +4n
> +.nf
> +struct fsinfo_limits {
> +    __u64 max_file_size;
> +    __u64 max_uid;
> +    __u64 max_gid;
> +    __u64 max_projid;
> +    __u32 max_dev_major;
> +    __u32 max_dev_minor;
> +    __u32 max_hard_links;
> +    __u32 max_xattr_body_len;
> +    __u16 max_xattr_name_len;
> +    __u16 max_filename_len;
> +    __u16 max_symlink_len;
> +    __u16 __reserved[1];
> +};
> +.fi
> +.in
> +.RE
> +.IP
> +These indicate the maximum supported sizes for a variety of filesystem objects,
> +including the file size, the extended attribute name length and body length,
> +the filename length and the symlink body length.
> +.IP
> +It also indicates the maximum representable values for a User ID, a Group ID,
> +a Project ID, a device major number and a device minor number.
> +.IP
> +And finally, it indicates the maximum number of hard links that can be made to
> +a file.
> +.IP
> +Note that some of these values may be zero if the underlying object or concept
> +is not supported by the filesystem or the medium.
> +
> +.\" __________________ fsinfo_attr_supports __________________
> +.TP
> +.B fsinfo_attr_supports
> +This retrieves information about what bits a filesystem supports in various
> +masks.  The following structure is filled in:
> +.PP
> +.RS
> +.in +4n
> +.nf
> +struct fsinfo_supports {
> +    __u64 stx_attributes;
> +    __u32 stx_mask;
> +    __u32 ioc_flags;
> +    __u32 win_file_attrs;
> +    __u32 __reserved[1];
> +};
> +.fi
> +.in
> +.RE
> +.IP
> +The
> +.IR stx_attributes " and " stx_mask
> +fields indicate what bits in the struct statx fields of the matching names
> +are supported by the filesystem.
> +.IP
> +The
> +.I ioc_flags
> +field indicates what FS_*_FL flag bits as used through the FS_IOC_GET/SETFLAGS
> +ioctls are supported by the filesystem.
> +.IP
> +The
> +.I win_file_attrs
> +indicates what DOS/Windows file attributes a filesystem supports, if any.
> +
> +.\" __________________ fsinfo_attr_capabilities __________________
> +.TP
> +.B fsinfo_attr_capabilities
> +This retrieves information about what features a filesystem supports as a
> +series of single bit indicators.  The following structure is filled in:
> +.PP
> +.RS
> +.in +4n
> +.nf
> +struct fsinfo_capabilities {
> +    __u8 capabilities[(fsinfo_cap__nr + 7) / 8];
> +};
> +.fi
> +.in
> +.RE
> +.IP
> +where the bit of interest can be found by:
> +.PP
> +.RS
> +.in +4n
> +.nf
> +	p->capabilities[bit / 8] & (1 << (bit % 8)))
> +.fi
> +.in
> +.RE
> +.IP
> +The bits are listed by
> +.I enum fsinfo_capability
> +and
> +.B fsinfo_cap__nr
> +is one more than the last capability bit listed in the header file.
> +.IP
> +Note that the number of capability bits actually supported by the kernel can be
> +found using the
> +.B fsinfo_attr_fsinfo
> +attribute.
> +.IP
> +The capability bits and their meanings are listed below in the "THE
> +CAPABILITIES" section.
> +
> +.\" __________________ fsinfo_attr_timestamp_info __________________
> +.TP
> +.B fsinfo_attr_timestamp_info
> +This retrieves information about what timestamp resolution and scope is
> +supported by a filesystem for each of the file timestamps.  The following
> +structure is filled in:
> +.PP
> +.RS
> +.in +4n
> +.nf
> +struct fsinfo_timestamp_info {
> +	__s64 minimum_timestamp;
> +	__s64 maximum_timestamp;
> +	__u16 atime_gran_mantissa;
> +	__u16 btime_gran_mantissa;
> +	__u16 ctime_gran_mantissa;
> +	__u16 mtime_gran_mantissa;
> +	__s8  atime_gran_exponent;
> +	__s8  btime_gran_exponent;
> +	__s8  ctime_gran_exponent;
> +	__s8  mtime_gran_exponent;
> +	__u32 __reserved[1];
> +};
> +.fi
> +.in
> +.RE
> +.IP
> +where
> +.IR minimum_timestamp " and " maximum_timestamp
> +are the limits on the timestamps that the filesystem supports and
> +.IR *time_gran_mantissa " and " *time_gran_exponent
> +indicate the granularity of each timestamp in terms of seconds, using the
> +formula:
> +.PP
> +.RS
> +.in +4n
> +.nf
> +mantissa * pow(10, exponent) Seconds
> +.fi
> +.in
> +.RE
> +.IP
> +where exponent may be negative and the result may be a fraction of a second.
> +.IP
> +Four timestamps are detailed: \fBA\fPccess time, \fBB\fPirth/creation time,
> +\fBC\fPhange time and \fBM\fPodification time.  Capability bits are defined
> +that specify whether each of these exist in the filesystem or not.
> +.IP
> +Note that the timestamp description may be approximated or inaccurate if the
> +file is actually remote or is the union of multiple objects.
> +
> +.\" __________________ fsinfo_attr_volume_id __________________
> +.TP
> +.B fsinfo_attr_volume_id
> +This retrieves the system's superblock volume identifier as a variable-length
> +string.  This does not necessarily represent a value stored in the medium but
> +might be constructed on the fly.
> +.IP
> +For instance, for a block device this is the block device identifier
> +(eg. "sdb2"); for AFS this would be the numeric volume identifier.
> +
> +.\" __________________ fsinfo_attr_volume_uuid __________________
> +.TP
> +.B fsinfo_attr_volume_uuid
> +This retrieves the volume UUID, if there is one, as a little-endian binary
> +UUID.  This fills in the following structure:
> +.PP
> +.RS
> +.in +4n
> +.nf
> +struct fsinfo_volume_uuid {
> +    __u8 uuid[16];
> +};
> +.fi
> +.in
> +.RE
> +.IP
> +
> +.\" __________________ fsinfo_attr_volume_name __________________
> +.TP
> +.B fsinfo_attr_volume_name
> +This retrieves the filesystem's volume name as a variable-length string.  This
> +is expected to represent a name stored in the medium.
> +.IP
> +For a block device, this might be a label stored in the superblock.  For a
> +network filesystem, this might be a logical volume name of some sort.
> +
> +.\" __________________ fsinfo_attr_cell/domain __________________
> +.PP
> +.B fsinfo_attr_cell_name
> +.br
> +.B fsinfo_attr_domain_name
> +.br
> +.IP
> +These two attributes are variable-length string attributes that may be used to
> +obtain information about network filesystems.  An AFS volume, for instance,
> +belongs to a named cell.  CIFS shares may belong to a domain.
> +
> +.\" __________________ fsinfo_attr_realm_name __________________
> +.TP
> +.B fsinfo_attr_realm_name
> +This attribute is variable-length string that indicates the Kerberos realm that
> +a filesystem's authentication tokens should come from.
> +
> +.\" __________________ fsinfo_attr_server_name __________________
> +.TP
> +.B fsinfo_attr_server_name
> +This attribute is a multiple-value attribute that lists the names of the
> +servers that are backing a network filesystem.  Each value is a variable-length
> +string.  The values are enumerated by calling
> +.BR fsinfo ()
> +multiple times, incrementing
> +.I params->Nth
> +each time until an ENODATA error occurs, thereby indicating the end of the
> +list.
> +
> +.\" __________________ fsinfo_attr_server_address __________________
> +.TP
> +.B fsinfo_attr_server_address
> +This attribute is a multiple-instance, multiple-value attribute that lists the
> +addresses of the servers that are backing a network filesystem.  Each value is
> +a structure of the following type:
> +.PP
> +.RS
> +.in +4n
> +.nf
> +struct fsinfo_server_address {
> +    struct __kernel_sockaddr_storage address;
> +};
> +.fi
> +.in
> +.RE
> +.IP
> +Where the address may be AF_INET, AF_INET6, AF_RXRPC or any other type as
> +appropriate to the filesystem.
> +.IP
> +The values are enumerated by calling
> +.IR fsinfo ()
> +multiple times, incrementing
> +.I params->Nth
> +to step through the servers and
> +.I params->Mth
> +to step through the addresses of the Nth server each time until ENODATA errors
> +occur, thereby indicating either the end of a server's address list or the end
> +of the server list.
> +.IP
> +Barring the server list changing whilst being accessed, it is expected that the
> +.I params->Nth
> +will correspond to
> +.I params->Nth
> +for
> +.BR fsinfo_attr_server_name .
> +
> +.\" __________________ fsinfo_attr_parameter __________________
> +.TP
> +.B fsinfo_attr_parameter
> +This attribute is a multiple-value attribute that lists the values of the mount
> +parameters for a filesystem as variable-length strings.
> +.IP
> +The parameters are enumerated by calling
> +.BR fsinfo ()
> +multiple times, incrementing
> +.I params->Nth
> +to step through them until error ENODATA is given.
> +.IP
> +Parameter strings are presented in a form akin to the way they're passed to the
> +context created by the
> +.BR fsopen ()
> +system call.  For example, straight text parameters will be rendered as
> +something like:
> +.PP
> +.RS
> +.in +4n
> +.nf
> +"o data=journal"
> +"o noquota"
> +.fi
> +.in
> +.RE
> +.IP
> +Where the initial "word" indicates the option form.
> +
> +.\" __________________ fsinfo_attr_source __________________
> +.TP
> +.B fsinfo_attr_source
> +This attribute is a multiple-value attribute that lists the mount sources for a
> +filesystem as variable-length strings.  Normally only one source will be
> +available, but the possibility of having more than one is allowed for.
> +.IP
> +The sources are enumerated by calling
> +.BR fsinfo ()
> +multiple times, incrementing
> +.I params->Nth
> +to step through them until error ENODATA is given.
> +.IP
> +Source strings are presented in a form akin to the way they're passed to the
> +context created by the
> +.BR fsopen ()
> +system call.  For example, they will be rendered as something like:
> +.PP
> +.RS
> +.in +4n
> +.nf
> +"s /dev/sda1"
> +"s example.com/pub/linux/"
> +.fi
> +.in
> +.RE
> +.IP
> +Where the initial "word" indicates the option form.
> +
> +.\" __________________ fsinfo_attr_name_encoding __________________
> +.TP
> +.B fsinfo_attr_name_encoding
> +This attribute is variable-length string that indicates the filename encoding
> +used by the filesystem.  The default is "utf8".  Note that this may indicate a
> +non-8-bit encoding if that's what the underlying filesystem actually supports.
> +
> +.\" __________________ fsinfo_attr_name_codepage __________________
> +.TP
> +.B fsinfo_attr_name_codepage
> +This attribute is variable-length string that indicates the codepage used to
> +translate filenames from the filesystem to the system if this is applicable to
> +the filesystem.
> +
> +.\" __________________ fsinfo_attr_io_size __________________
> +.TP
> +.B fsinfo_attr_io_size
> +This retrieves information about the I/O sizes supported by the filesystem.
> +The following structure is filled in:
> +.PP
> +.RS
> +.in +4n
> +.nf
> +struct fsinfo_io_size {
> +    __u32 block_size;
> +    __u32 max_single_read_size;
> +    __u32 max_single_write_size;
> +    __u32 best_read_size;
> +    __u32 best_write_size;
> +};
> +.fi
> +.in
> +.RE
> +.IP
> +Where
> +.I block_size
> +indicates the fundamental I/O block size of the filesystem as something
> +O_DIRECT read/write sizes must be a multiple of;
> +.IR max_single_write_size " and " max_single_write_size
> +indicate the maximum sizes for individual unbuffered data transfer operations;
> +and
> +.IR best_read_size " and " best_write_size
> +indicate the recommended I/O sizes.
> +.IP
> +Note that any of these may be zero if inapplicable or indeterminable.
> +
> +
> +
> +.SH THE CAPABILITIES
> +.PP
> +There are number of capability bits in a bit array that can be retrieved using
> +.BR fsinfo_attr_capabilities .
> +These give information about features of the filesystem driver and the specific
> +filesystem.
> +
> +.\" __________________ fsinfo_cap_is_*_fs __________________
> +.PP
> +.B fsinfo_cap_is_kernel_fs
> +.br
> +.B fsinfo_cap_is_block_fs
> +.br
> +.B fsinfo_cap_is_flash_fs
> +.br
> +.B fsinfo_cap_is_network_fs
> +.br
> +.B fsinfo_cap_is_automounter_fs
> +.IP
> +These indicate the primary type of the filesystem.
> +.B kernel
> +filesystems are special communication interfaces that substitute files for
> +system calls; examples include procfs and sysfs.
> +.B block
> +filesystems require a block device on which to operate; examples include ext4
> +and XFS.
> +.B flash
> +filesystems require an MTD device on which to operate; examples include JFFS2.
> +.B network
> +filesystems require access to the network and contact one or more servers;
> +examples include NFS and AFS.
> +.B automounter
> +filesystems are kernel special filesystems that host automount points and
> +triggers to dynamically create automount points.  Examples include autofs and
> +AFS's dynamic root.
> +
> +.\" __________________ fsinfo_cap_automounts __________________
> +.TP
> +.B fsinfo_cap_automounts
> +The filesystem may have automount points that can be triggered by pathwalk.
> +
> +.\" __________________ fsinfo_cap_adv_locks __________________
> +.TP
> +.B fsinfo_cap_adv_locks
> +The filesystem supports advisory file locks.  For a network filesystem, this
> +indicates that the advisory file locks are cross-client (and also between
> +server and its local filesystem on something like NFS).
> +
> +.\" __________________ fsinfo_cap_mand_locks __________________
> +.TP
> +.B fsinfo_cap_mand_locks
> +The filesystem supports mandatory file locks.  For a network filesystem, this
> +indicates that the mandatory file locks are cross-client (and also between
> +server and its local filesystem on something like NFS).
> +
> +.\" __________________ fsinfo_cap_leases __________________
> +.TP
> +.B fsinfo_cap_leases
> +The filesystem supports leases.  For a network filesystem, this means that the
> +server will tell the client to clean up its state on a file before passing the
> +lease to another client.
> +
> +.\" __________________ fsinfo_cap_*ids __________________
> +.PP
> +.B fsinfo_cap_uids
> +.br
> +.B fsinfo_cap_gids
> +.br
> +.B fsinfo_cap_projids
> +.IP
> +These indicate that the filesystem supports numeric user IDs, group IDs and
> +project IDs respectively.
> +
> +.\" __________________ fsinfo_cap_id_* __________________
> +.PP
> +.B fsinfo_cap_id_names
> +.br
> +.B fsinfo_cap_id_guids
> +.IP
> +These indicate that the filesystem employs textual names and/or GUIDs as
> +identifiers.
> +
> +.\" __________________ fsinfo_cap_windows_attrs __________________
> +.TP
> +.B fsinfo_cap_windows_attrs
> +Indicates that the filesystem supports some Windows FILE_* attributes.
> +
> +.\" __________________ fsinfo_cap_*_quotas __________________
> +.PP
> +.B fsinfo_cap_user_quotas
> +.br
> +.B fsinfo_cap_group_quotas
> +.br
> +.B fsinfo_cap_project_quotas
> +.IP
> +These indicate that the filesystem supports quotas for users, groups and
> +projects respectively.
> +
> +.\" __________________ fsinfo_cap_xattrs/filetypes __________________
> +.PP
> +.B fsinfo_cap_xattrs
> +.br
> +.B fsinfo_cap_symlinks
> +.br
> +.B fsinfo_cap_hard_links
> +.br
> +.B fsinfo_cap_hard_links_1dir
> +.br
> +.B fsinfo_cap_device_files
> +.br
> +.B fsinfo_cap_unix_specials
> +.IP
> +These indicate that the filesystem supports respectively extended attributes;
> +symbolic links; hard links spanning direcories; hard links, but only within a
> +directory; block and character device files; and UNIX special files, such as
> +FIFO and socket.
> +
> +.\" __________________ fsinfo_cap_*journal* __________________
> +.PP
> +.B fsinfo_cap_journal
> +.br
> +.B fsinfo_cap_data_is_journalled
> +.IP
> +The first of these indicates that the filesystem has a journal and the second
> +that the file data changes are being journalled.
> +
> +.\" __________________ fsinfo_cap_o_* __________________
> +.PP
> +.B fsinfo_cap_o_sync
> +.br
> +.B fsinfo_cap_o_direct
> +.IP
> +These indicate that O_SYNC and O_DIRECT are supported respectively.
> +
> +.\" __________________ fsinfo_cap_o_* __________________
> +.PP
> +.B fsinfo_cap_volume_id
> +.br
> +.B fsinfo_cap_volume_uuid
> +.br
> +.B fsinfo_cap_volume_name
> +.br
> +.B fsinfo_cap_volume_fsid
> +.br
> +.B fsinfo_cap_cell_name
> +.br
> +.B fsinfo_cap_domain_name
> +.br
> +.B fsinfo_cap_realm_name
> +.IP
> +These indicate if various attributes are supported by the filesystem, where
> +.B fsinfo_cap_X
> +here corresponds to
> +.BR fsinfo_attr_X .
> +
> +.\" __________________ fsinfo_cap_iver_* __________________
> +.PP
> +.B fsinfo_cap_iver_all_change
> +.br
> +.B fsinfo_cap_iver_data_change
> +.br
> +.B fsinfo_cap_iver_mono_incr
> +.IP
> +These indicate if
> +.I i_version
> +on an inode in the filesystem is supported and
> +how it behaves.
> +.B all_change
> +indicates that i_version is incremented on metadata changes as well as data
> +changes.
> +.B data_change
> +indicates that i_version is only incremented on data changes, including
> +truncation.
> +.B mono_incr
> +indicates that i_version is incremented by exactly 1 for each change made.
> +
> +.\" __________________ fsinfo_cap_resource_forks __________________
> +.TP
> +.B fsinfo_cap_resource_forks
> +This indicates that the filesystem supports some sort of resource fork or
> +alternate data stream on a file.  This isn't the same as an extended attribute.
> +
> +.\" __________________ fsinfo_cap_name_* __________________
> +.PP
> +.B fsinfo_cap_name_case_indep
> +.br
> +.B fsinfo_cap_name_non_utf8
> +.br
> +.B fsinfo_cap_name_has_codepage
> +.IP
> +These indicate certain facts about the filenames in a filesystem: whether
> +they're case-independent; if they're not UTF-8; and if there's a codepage
> +employed to map the names.
> +
> +.\" __________________ fsinfo_cap_sparse __________________
> +.TP
> +.B fsinfo_cap_sparse
> +This indicates that the filesystem supports sparse files.
> +
> +.\" __________________ fsinfo_cap_not_persistent __________________
> +.TP
> +.B fsinfo_cap_not_persistent
> +This indicates that the filesystem is not persistent, and that any data stored
> +here will not be saved in the event that the filesystem is unmounted, the
> +machine is rebooted or the machine loses power.
> +
> +.\" __________________ fsinfo_cap_no_unix_mode __________________
> +.TP
> +.B fsinfo_cap_no_unix_mode
> +This indicates that the filesystem doesn't support the UNIX mode permissions
> +bits.
> +
> +.\" __________________ fsinfo_cap_has_*time __________________
> +.PP
> +.B fsinfo_cap_has_atime
> +.br
> +.B fsinfo_cap_has_btime
> +.br
> +.B fsinfo_cap_has_ctime
> +.br
> +.B fsinfo_cap_has_mtime
> +.IP
> +These indicate as to what timestamps a filesystem supports, including: Access
> +time, Birth/creation time, Change time (metadata and data) and Modification
> +time (data only).
> +
> +
> +.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
> +.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
> +.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
> +.SH RETURN VALUE
> +On success, the size of the value that the kernel has available is returned,
> +irrespective of whether the buffer is large enough to hold that.  The data
> +written to the buffer will be truncated if it is not.  On error, \-1 is
> +returned, and
> +.I errno
> +is set appropriately.
> +.SH ERRORS
> +.TP
> +.B EACCES
> +Search permission is denied for one of the directories
> +in the path prefix of
> +.IR pathname .
> +(See also
> +.BR path_resolution (7).)
> +.TP
> +.B EBADF
> +.I dirfd
> +is not a valid open file descriptor.
> +.TP
> +.B EFAULT
> +.I pathname
> +is NULL or
> +.IR pathname ", " params " or " buffer
> +point to a location outside the process's accessible address space.
> +.TP
> +.B EINVAL
> +Reserved flag specified in
> +.IR params->at_flags " or one of " params->__reserved[]
> +is not 0.
> +.TP
> +.B EOPNOTSUPP
> +Unsupported attribute requested in
> +.IR params->request .
> +This may be beyond the limit of the supported attribute set or may just not be
> +one that's supported by the filesystem.
> +.TP
> +.B ENODATA
> +Unavailable attribute value requested by
> +.IR params->Nth " and/or " params->Mth .
> +.TP
> +.B ELOOP
> +Too many symbolic links encountered while traversing the pathname.
> +.TP
> +.B ENAMETOOLONG
> +.I pathname
> +is too long.
> +.TP
> +.B ENOENT
> +A component of
> +.I pathname
> +does not exist, or
> +.I pathname
> +is an empty string and
> +.B AT_EMPTY_PATH
> +was not specified in
> +.IR params->at_flags .
> +.TP
> +.B ENOMEM
> +Out of memory (i.e., kernel memory).
> +.TP
> +.B ENOTDIR
> +A component of the path prefix of
> +.I pathname
> +is not a directory or
> +.I pathname
> +is relative and
> +.I dirfd
> +is a file descriptor referring to a file other than a directory.
> +.SH VERSIONS
> +.BR fsinfo ()
> +was added to Linux in kernel 4.18.
> +.SH CONFORMING TO
> +.BR fsinfo ()
> +is Linux-specific.
> +.SH NOTES
> +Glibc does not (yet) provide a wrapper for the
> +.BR fsinfo ()
> +system call; call it using
> +.BR syscall (2).
> +.SH SEE ALSO
> +.BR ioctl_iflags (2),
> +.BR statx (2),
> +.BR statfs (2)
> diff --git a/man2/ioctl_iflags.2 b/man2/ioctl_iflags.2
> index 9c77b08b9..49ba4444e 100644
> --- a/man2/ioctl_iflags.2
> +++ b/man2/ioctl_iflags.2
> @@ -200,9 +200,15 @@ the effective user ID of the caller must match the owner of the file,
>  or the caller must have the
>  .BR CAP_FOWNER
>  capability.
> +.PP
> +The set of flags supported by a filesystem can be determined by calling
> +.IR fsinfo (2)
> +with attribute
> +.IR fsinfo_attr_supports .
>  .SH SEE ALSO
>  .BR chattr (1),
>  .BR lsattr (1),
> +.BR fsinfo (2),
>  .BR mount (2),
>  .BR btrfs (5),
>  .BR ext4 (5),
> diff --git a/man2/stat.2 b/man2/stat.2
> index dad9a01ac..ee4001f85 100644
> --- a/man2/stat.2
> +++ b/man2/stat.2
> @@ -532,6 +532,12 @@ If none of the aforementioned macros are defined,
>  then the nanosecond values are exposed with names of the form
>  .IR st_atimensec .
>  .\"
> +.PP
> +Which timestamps are supported by a filesystem and their the ranges and
> +granularities can be determined by calling
> +.IR fsinfo (2)
> +with attribute
> +.IR fsinfo_attr_timestamp_info .
>  .SS C library/kernel differences
>  Over time, increases in the size of the
>  .I stat
> @@ -707,6 +713,7 @@ main(int argc, char *argv[])
>  .BR access (2),
>  .BR chmod (2),
>  .BR chown (2),
> +.BR fsinfo (2),
>  .BR readlink (2),
>  .BR utime (2),
>  .BR capabilities (7),
> diff --git a/man2/statx.2 b/man2/statx.2
> index edac9f6f4..9a57c1b90 100644
> --- a/man2/statx.2
> +++ b/man2/statx.2
> @@ -534,12 +534,25 @@ Glibc does not (yet) provide a wrapper for the
>  .BR statx ()
>  system call; call it using
>  .BR syscall (2).
> +.PP
> +The sets of mask/stx_mask and stx_attributes bits supported by a filesystem
> +can be determined by calling
> +.IR fsinfo (2)
> +with attribute
> +.IR fsinfo_attr_supports .
> +.PP
> +Which timestamps are supported by a filesystem and their the ranges and
> +granularities can also be determined by calling
> +.IR fsinfo (2)
> +with attribute
> +.IR fsinfo_attr_timestamp_info .
>  .SH SEE ALSO
>  .BR ls (1),
>  .BR stat (1),
>  .BR access (2),
>  .BR chmod (2),
>  .BR chown (2),
> +.BR fsinfo (2),
>  .BR readlink (2),
>  .BR stat (2),
>  .BR utime (2),
> diff --git a/man2/utime.2 b/man2/utime.2
> index 03a43a416..c6acdbac2 100644
> --- a/man2/utime.2
> +++ b/man2/utime.2
> @@ -181,9 +181,16 @@ on an append-only file.
>  .\" is just a wrapper for
>  .\" .BR utime ()
>  .\" and hence does not allow a subsecond resolution.
> +.PP
> +Which timestamps are supported by a filesystem and their the ranges and
> +granularities can be determined by calling
> +.IR fsinfo (2)
> +with attribute
> +.IR fsinfo_attr_timestamp_info .
>  .SH SEE ALSO
>  .BR chattr (1),
>  .BR touch (1),
> +.BR fsinfo (2),
>  .BR futimesat (2),
>  .BR stat (2),
>  .BR utimensat (2),
> diff --git a/man2/utimensat.2 b/man2/utimensat.2
> index d61b43e96..be8925548 100644
> --- a/man2/utimensat.2
> +++ b/man2/utimensat.2
> @@ -633,9 +633,16 @@ instead checks whether the
>  .\" conversely, a process with a read-only file descriptor won't
>  .\" be able to update the timestamps of a file,
>  .\" even if it has write permission on the file.
> +.PP
> +Which timestamps are supported by a filesystem and their the ranges and
> +granularities can be determined by calling
> +.IR fsinfo (2)
> +with attribute
> +.IR fsinfo_attr_timestamp_info .
>  .SH SEE ALSO
>  .BR chattr (1),
>  .BR touch (1),
> +.BR fsinfo (2),
>  .BR futimesat (2),
>  .BR openat (2),
>  .BR stat (2),
> 


-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [MANPAGE PATCH] Add manpage for fsinfo(2)
  2018-07-10 22:55 ` [MANPAGE PATCH] Add manpage for fsinfo(2) David Howells
  2019-10-09  9:52   ` Michael Kerrisk (man-pages)
@ 2019-10-09 12:02   ` David Howells
  1 sibling, 0 replies; 75+ messages in thread
From: David Howells @ 2019-10-09 12:02 UTC (permalink / raw)
  To: Michael Kerrisk (man-pages)
  Cc: dhowells, viro, linux-api, linux-fsdevel, torvalds, linux-kernel,
	linux-man, Eric W. Biederman

Michael Kerrisk (man-pages) <mtk.manpages@gmail.com> wrote:

> There is no fsinfo(2) in the system call in the kernel currently.
> Will that call still be added,

Hopefully, but I'm not sure it'll be ready by the next merge window.

> or was it replaced by fsconfig(2),

They're different things and not interchangeable.

David

^ permalink raw reply	[flat|nested] 75+ messages in thread

end of thread, other threads:[~2019-10-09 12:02 UTC | newest]

Thread overview: 75+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <153126248868.14533.9751473662727327569.stgit@warthog.procyon.org.uk>
2018-07-10 22:41 ` [PATCH 01/32] vfs: syscall: Add open_tree(2) to reference or clone a mount [ver #9] David Howells
2018-07-10 22:41 ` [PATCH 02/32] vfs: syscall: Add move_mount(2) to move mounts around " David Howells
2018-07-10 22:44 ` [PATCH 24/32] vfs: syscall: Add fsopen() to prepare for superblock creation " David Howells
2018-07-10 23:59   ` Andy Lutomirski
2018-07-11  1:05     ` Linus Torvalds
2018-07-11  1:15       ` Al Viro
2018-07-11  1:33         ` Andy Lutomirski
2018-07-11  1:48         ` Linus Torvalds
2018-07-11  8:43         ` David Howells
2018-07-11  1:14     ` Jann Horn
2018-07-11  1:16       ` Al Viro
2018-07-11  8:42     ` David Howells
2018-07-11 16:03       ` Linus Torvalds
2018-07-11  7:22   ` David Howells
2018-07-11 16:38     ` Eric Biggers
2018-07-11 17:06     ` Andy Lutomirski
2018-07-12 14:54     ` David Howells
2018-07-12 15:50       ` Linus Torvalds
2018-07-12 16:00         ` Al Viro
2018-07-12 16:07           ` Linus Torvalds
2018-07-12 16:31             ` Al Viro
2018-07-12 16:39               ` Linus Torvalds
2018-07-12 17:14                 ` Linus Torvalds
2018-07-12 17:44                   ` Al Viro
2018-07-12 17:54                     ` Linus Torvalds
2018-07-12 17:52                 ` Al Viro
2018-07-12 16:23       ` Andy Lutomirski
2018-07-12 16:31         ` Linus Torvalds
2018-07-12 16:41         ` Al Viro
2018-07-12 16:58         ` Al Viro
2018-07-12 17:54           ` Andy Lutomirski
2018-07-12 20:23       ` David Howells
2018-07-12 20:25         ` Andy Lutomirski
2018-07-12 20:34         ` Linus Torvalds
2018-07-12 20:36           ` Linus Torvalds
2018-07-12 21:26         ` David Howells
2018-07-12 21:40           ` Linus Torvalds
2018-07-12 22:32           ` Theodore Y. Ts'o
2018-07-12 22:54           ` David Howells
2018-07-12 23:21             ` Andy Lutomirski
2018-07-12 23:23             ` Jann Horn
2018-07-12 23:33               ` Jann Horn
2018-07-12 23:35             ` David Howells
2018-07-12 23:50               ` Andy Lutomirski
     [not found]             ` <23894.1531438559@warthog.procyon.o rg.uk>
2018-07-13  0:03               ` David Howells
2018-07-13  0:24                 ` Andy Lutomirski
2018-07-13  7:30                 ` David Howells
2018-07-19  1:30                   ` Eric W. Biederman
2018-07-13  2:35             ` Theodore Y. Ts'o
2018-07-12 21:00       ` David Howells
2018-07-12 21:29         ` Linus Torvalds
2018-07-13 13:27         ` David Howells
2018-07-13 15:01           ` Andy Lutomirski
2018-07-13 15:40           ` David Howells
2018-07-13 17:14             ` Andy Lutomirski
2018-07-17  9:40           ` David Howells
2018-07-11 15:51   ` Jonathan Corbet
2018-07-11 16:18   ` David Howells
2018-07-12 17:15   ` Greg KH
2018-07-12 17:20     ` Al Viro
2018-07-12 18:03       ` Greg KH
2018-07-12 18:30         ` Andy Lutomirski
2018-07-12 18:34           ` Al Viro
2018-07-12 18:35             ` Al Viro
2018-07-12 19:08           ` Greg KH
2018-07-10 22:44 ` [PATCH 25/32] vfs: syscall: Add fsmount() to create a mount for a superblock " David Howells
2018-07-10 22:44 ` [PATCH 26/32] vfs: syscall: Add fspick() to select a superblock for reconfiguration " David Howells
2018-07-10 22:44 ` [PATCH 31/32] vfs: syscall: Add fsinfo() to query filesystem information " David Howells
2018-07-10 22:52 ` [MANPAGE PATCH] Add manpages for move_mount(2) and open_tree(2) David Howells
2019-10-09  9:51   ` Michael Kerrisk (man-pages)
2018-07-10 22:54 ` [MANPAGE PATCH] Add manpage for fsopen(2), fspick(2) and fsmount(2) David Howells
2019-10-09  9:52   ` Michael Kerrisk (man-pages)
2018-07-10 22:55 ` [MANPAGE PATCH] Add manpage for fsinfo(2) David Howells
2019-10-09  9:52   ` Michael Kerrisk (man-pages)
2019-10-09 12:02   ` David Howells

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).