linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH 0/3] quering mount attributes
@ 2023-09-13 15:22 Miklos Szeredi
  2023-09-13 15:22 ` [RFC PATCH 1/3] add unique mount ID Miklos Szeredi
                   ` (3 more replies)
  0 siblings, 4 replies; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-13 15:22 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Linus Torvalds, Al Viro,
	Christian Brauner, Amir Goldstein

Implement the mount querying syscalls agreed on at LSF/MM 2023.  This is an
RFC with just x86_64 syscalls.

Excepting notification this should allow full replacement for
parsing /proc/self/mountinfo.

It is not a replacement for /proc/$OTHER_PID/mountinfo, since mount
namespace and root are taken from the current task.  I guess namespace and
root could be switched before invoking these syscalls but that sounds a bit
complicated.  Not sure if this is a problem.

Test utility attached at the end.
---

Miklos Szeredi (3):
  add unique mount ID
  add statmnt(2) syscall
  add listmnt(2) syscall

 arch/x86/entry/syscalls/syscall_64.tbl |   2 +
 fs/internal.h                          |   5 +
 fs/mount.h                             |   3 +-
 fs/namespace.c                         | 365 +++++++++++++++++++++++++
 fs/proc_namespace.c                    |  19 +-
 fs/stat.c                              |   9 +-
 fs/statfs.c                            |   1 +
 include/linux/syscalls.h               |   5 +
 include/uapi/asm-generic/unistd.h      |   8 +-
 include/uapi/linux/mount.h             |  36 +++
 include/uapi/linux/stat.h              |   1 +
 11 files changed, 443 insertions(+), 11 deletions(-)

-- 
2.41.0

=== statmnt.c ===
#define _GNU_SOURCE
#include <unistd.h>
#include <stdio.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <err.h>

/*
 * Location of one string attribute inside the buffer filled by statmnt(2).
 * The string itself is stored in the same buffer and is nul terminated.
 */
struct stmt_str {
	__u32 off;	/* Offset of the string from the start of the buffer */
	__u32 len;	/* Length of the string, terminating nul not counted */
};

/*
 * Result header written by statmnt(2) at the start of the user buffer.
 *
 * Local copy of the definition this series adds to
 * include/uapi/linux/mount.h; the layout must stay identical to it.
 * Numeric attributes live directly in the structure, string attributes are
 * stored elsewhere in the same buffer and are located through the
 * struct stmt_str offset/length pairs at the end.  Only the groups whose
 * STMT_* bit is set in "mask" are meaningful.
 */
struct statmnt {
	__u64 mask;		/* What results were written [uncond] */
	__u32 sb_dev_major;	/* Device ID */
	__u32 sb_dev_minor;
	__u64 sb_magic;		/* ..._SUPER_MAGIC */
	__u32 sb_flags;		/* MS_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
	__u32 __spare1;
	__u64 mnt_id;		/* Unique ID of mount */
	__u64 mnt_parent_id;	/* Unique ID of parent (for root == mnt_id) */
	__u32 mnt_id_old;	/* Reused IDs used in proc/.../mountinfo */
	__u32 mnt_parent_id_old;
	__u64 mnt_attr;		/* MOUNT_ATTR_... */
	__u64 mnt_propagation;	/* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
	__u64 mnt_peer_group;	/* ID of shared peer group */
	__u64 mnt_master;	/* Mount receives propagation from this ID */
	__u64 propagate_from;	/* Propagation from in current namespace */
	__u64 __spare[20];
	struct stmt_str mnt_root;	/* Root of mount relative to root of fs */
	struct stmt_str mountpoint;	/* Mountpoint relative to root of process */
	struct stmt_str fs_type;	/* Filesystem type[.subtype] */
	struct stmt_str sb_opts;	/* Super block string options (nul delimited) */
};

/*
 * Bits for the statmnt(2) request mask and for statmnt.mask in the result.
 * Values mirror the ones added to include/uapi/linux/mount.h by this series.
 */
#define STMT_SB_BASIC		0x00000001U     /* Want/got sb_... */
#define STMT_MNT_BASIC		0x00000002U	/* Want/got mnt_... */
#define STMT_PROPAGATE_FROM	0x00000004U	/* Want/got propagate_from */
#define STMT_MNT_ROOT		0x00000008U	/* Want/got mnt_root  */
#define STMT_MOUNTPOINT		0x00000010U	/* Want/got mountpoint */
#define STMT_FS_TYPE		0x00000020U	/* Want/got fs_type */
#define STMT_SB_OPTS		0x00000040U	/* Want/got sb_opts */

/* Syscall numbers as assigned in the x86_64 syscall table by this series */
#define __NR_statmnt   454
#define __NR_listmnt   455

/* statx(2) mask bit requesting the 64bit unique mount ID (patch 1/3) */
#define STATX_MNT_ID_UNIQUE	0x00004000U	/* Want/got extended stx_mnt_id */

int main(int argc, char *argv[])
{
	char buf[65536];
	struct statmnt *st = (void *) buf;
	char *end;
	const char *arg = argv[1];
	long res;
	int list = 0;
	unsigned long mnt_id;
	unsigned int mask = STMT_SB_BASIC | STMT_MNT_BASIC | STMT_PROPAGATE_FROM | STMT_MNT_ROOT | STMT_MOUNTPOINT | STMT_FS_TYPE | STMT_SB_OPTS;

	if (arg && strcmp(arg, "-l") == 0) {
		list = 1;
		arg = argv[2];
	}
	if (argc != list + 2)
		errx(1, "usage: %s [-l] (mnt_id|path)", argv[0]);

	mnt_id = strtol(arg, &end, 0);
	if (!mnt_id || *end != '\0') {
		struct statx sx;

		res = statx(AT_FDCWD, arg, 0, STATX_MNT_ID_UNIQUE, &sx);
		if (res == -1)
			err(1, "%s", arg);

		if (!(sx.stx_mask & (STATX_MNT_ID | STATX_MNT_ID_UNIQUE)))
			errx(1, "Sorry, no mount ID");

		mnt_id = sx.stx_mnt_id;
	}


	if (list) {
		size_t size = 8192;
		uint64_t list[size];
		long i, num;

		res = syscall(__NR_listmnt, mnt_id, list, size, 0);
		if (res == -1)
			err(1, "listmnt(%lu)", mnt_id);

		num = res;
		for (i = 0; i < num; i++) {
			printf("0x%lx / ", list[i]);

			res = syscall(__NR_statmnt, list[i], STMT_MNT_BASIC | STMT_MOUNTPOINT, &buf, sizeof(buf), 0);
			if (res == -1) {
				printf("???\t[%s]\n", strerror(errno));
			} else {
				printf("%u\t%s\n", st->mnt_id_old,
				       (st->mask & STMT_MOUNTPOINT) ? buf + st->mountpoint.off : "???");
			}
		}

		return 0;
	}

	res = syscall(__NR_statmnt, mnt_id, mask, &buf, sizeof(buf), 0);
	if (res == -1)
		err(1, "statmnt(%lu)", mnt_id);

	printf("mask: 0x%llx\n", st->mask);
	if (st->mask & STMT_SB_BASIC) {
		printf("sb_dev_major: %u\n", st->sb_dev_major);
		printf("sb_dev_minor: %u\n", st->sb_dev_minor);
		printf("sb_magic: 0x%llx\n", st->sb_magic);
		printf("sb_flags: 0x%08x\n", st->sb_flags);
	}
	if (st->mask & STMT_MNT_BASIC) {
		printf("mnt_id: 0x%llx\n", st->mnt_id);
		printf("mnt_parent_id: 0x%llx\n", st->mnt_parent_id);
		printf("mnt_id_old: %u\n", st->mnt_id_old);
		printf("mnt_parent_id_old: %u\n", st->mnt_parent_id_old);
		printf("mnt_attr: 0x%08llx\n", st->mnt_attr);
		printf("mnt_propagation: %s%s%s%s\n",
		       st->mnt_propagation & MS_SHARED ? "shared," : "",
		       st->mnt_propagation & MS_SLAVE ? "slave," : "",
		       st->mnt_propagation & MS_UNBINDABLE ? "unbindable," : "",
		       st->mnt_propagation & MS_PRIVATE ? "private" : "");
		printf("mnt_peer_group: %llu\n", st->mnt_peer_group);
		printf("mnt_master: %llu\n", st->mnt_master);
	}
	if (st->mask & STMT_PROPAGATE_FROM) {
		printf("propagate_from: %llu\n", st->propagate_from);
	}
	if (st->mask & STMT_MNT_ROOT) {
		printf("mnt_root: %i/%u <%s>\n", st->mnt_root.off,
		       st->mnt_root.len, buf + st->mnt_root.off);
	}
	if (st->mask & STMT_MOUNTPOINT) {
		printf("mountpoint: %i/%u <%s>\n", st->mountpoint.off,
		       st->mountpoint.len, buf + st->mountpoint.off);
	}
	if (st->mask & STMT_FS_TYPE) {
		printf("fs_type: %i/%u <%s>\n", st->fs_type.off,
		       st->fs_type.len, buf + st->fs_type.off);
	}

	if (st->mask & STMT_SB_OPTS) {
		char *p = buf + st->sb_opts.off;
		char *end = p + st->sb_opts.len;

		printf("sb_opts: %i/%u ", st->sb_opts.off, st->sb_opts.len);
		for (; p < end; p += strlen(p) + 1)
			printf("<%s>, ", p);
		printf("\n");
	}

	return 0;
}


^ permalink raw reply	[flat|nested] 76+ messages in thread

* [RFC PATCH 1/3] add unique mount ID
  2023-09-13 15:22 [RFC PATCH 0/3] quering mount attributes Miklos Szeredi
@ 2023-09-13 15:22 ` Miklos Szeredi
  2023-09-14  9:03   ` Christian Brauner
  2023-09-13 15:22 ` [RFC PATCH 2/3] add statmnt(2) syscall Miklos Szeredi
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-13 15:22 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Linus Torvalds, Al Viro,
	Christian Brauner, Amir Goldstein

If a mount is released then its mnt_id can immediately be reused.  This is
bad news for user interfaces that want to uniquely identify a mount.

Implementing a unique mount ID is trivial (use a 64bit counter).
Unfortunately userspace assumes 32bit size and would overflow after the
counter reaches 2^32.

Introduce a new 64bit ID alongside the old one.  Allow new interfaces to
work on both the old and new IDs by starting the counter from 2^32.

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/mount.h                | 3 ++-
 fs/namespace.c            | 4 ++++
 fs/stat.c                 | 9 +++++++--
 include/uapi/linux/stat.h | 1 +
 4 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index 130c07c2f8d2..a14f762b3f29 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -72,7 +72,8 @@ struct mount {
 	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
 	__u32 mnt_fsnotify_mask;
 #endif
-	int mnt_id;			/* mount identifier */
+	int mnt_id;			/* mount identifier, reused */
+	u64 mnt_id_unique;		/* mount ID unique until reboot */
 	int mnt_group_id;		/* peer group identifier */
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	struct hlist_head mnt_pins;
diff --git a/fs/namespace.c b/fs/namespace.c
index e157efc54023..de47c5f66e17 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -68,6 +68,9 @@ static u64 event;
 static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
 
+/* Don't allow confusion with mount ID allocated wit IDA */
+static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);
+
 static struct hlist_head *mount_hashtable __read_mostly;
 static struct hlist_head *mountpoint_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
@@ -131,6 +134,7 @@ static int mnt_alloc_id(struct mount *mnt)
 	if (res < 0)
 		return res;
 	mnt->mnt_id = res;
+	mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr);
 	return 0;
 }
 
diff --git a/fs/stat.c b/fs/stat.c
index 6822ac77aec2..46d901b6b2de 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -280,8 +280,13 @@ static int vfs_statx(int dfd, struct filename *filename, int flags,
 
 	error = vfs_getattr(&path, stat, request_mask, flags);
 
-	stat->mnt_id = real_mount(path.mnt)->mnt_id;
-	stat->result_mask |= STATX_MNT_ID;
+	if (request_mask & STATX_MNT_ID_UNIQUE) {
+		stat->mnt_id = real_mount(path.mnt)->mnt_id_unique;
+		stat->result_mask |= STATX_MNT_ID_UNIQUE;
+	} else {
+		stat->mnt_id = real_mount(path.mnt)->mnt_id;
+		stat->result_mask |= STATX_MNT_ID;
+	}
 
 	if (path.mnt->mnt_root == path.dentry)
 		stat->attributes |= STATX_ATTR_MOUNT_ROOT;
diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
index 7cab2c65d3d7..2f2ee82d5517 100644
--- a/include/uapi/linux/stat.h
+++ b/include/uapi/linux/stat.h
@@ -154,6 +154,7 @@ struct statx {
 #define STATX_BTIME		0x00000800U	/* Want/got stx_btime */
 #define STATX_MNT_ID		0x00001000U	/* Got stx_mnt_id */
 #define STATX_DIOALIGN		0x00002000U	/* Want/got direct I/O alignment info */
+#define STATX_MNT_ID_UNIQUE	0x00004000U	/* Want/got extended stx_mount_id */
 
 #define STATX__RESERVED		0x80000000U	/* Reserved for future struct statx expansion */
 
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 76+ messages in thread

* [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-13 15:22 [RFC PATCH 0/3] quering mount attributes Miklos Szeredi
  2023-09-13 15:22 ` [RFC PATCH 1/3] add unique mount ID Miklos Szeredi
@ 2023-09-13 15:22 ` Miklos Szeredi
  2023-09-14  6:11   ` Amir Goldstein
                     ` (4 more replies)
  2023-09-13 15:22 ` [RFC PATCH 3/3] add listmnt(2) syscall Miklos Szeredi
  2023-09-14  6:47 ` [RFC PATCH 0/3] quering mount attributes Amir Goldstein
  3 siblings, 5 replies; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-13 15:22 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Linus Torvalds, Al Viro,
	Christian Brauner, Amir Goldstein

Add a way to query attributes of a single mount instead of having to parse
the complete /proc/$PID/mountinfo, which might be huge.

Lookup the mount by the old (32bit) or new (64bit) mount ID.  If a mount
needs to be queried based on path, then statx(2) can be used to first query
the mount ID belonging to the path.

Design is based on a suggestion by Linus:

  "So I'd suggest something that is very much like "statfsat()", which gets
   a buffer and a length, and returns an extended "struct statfs" *AND*
   just a string description at the end."

The interface closely mimics that of statx.

Handle ASCII attributes by appending after the end of the structure (as per
above suggestion).  Allow querying multiple string attributes with
individual offset/length for each.  Strings are nul terminated (termination
isn't counted in length).

Mount options are also delimited with nul characters.  Unlike proc, special
characters are not quoted.

Link: https://lore.kernel.org/all/CAHk-=wh5YifP7hzKSbwJj94+DZ2czjrZsczy6GBimiogZws=rg@mail.gmail.com/
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 fs/internal.h                          |   5 +
 fs/namespace.c                         | 312 ++++++++++++++++++++++++-
 fs/proc_namespace.c                    |  19 +-
 fs/statfs.c                            |   1 +
 include/linux/syscalls.h               |   3 +
 include/uapi/asm-generic/unistd.h      |   5 +-
 include/uapi/linux/mount.h             |  36 +++
 8 files changed, 373 insertions(+), 9 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 1d6eee30eceb..6d807c30cd16 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -375,6 +375,7 @@
 451	common	cachestat		sys_cachestat
 452	common	fchmodat2		sys_fchmodat2
 453	64	map_shadow_stack	sys_map_shadow_stack
+454	common	statmnt			sys_statmnt
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/fs/internal.h b/fs/internal.h
index d64ae03998cc..8f75271428aa 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -83,6 +83,11 @@ int path_mount(const char *dev_name, struct path *path,
 		const char *type_page, unsigned long flags, void *data_page);
 int path_umount(struct path *path, int flags);
 
+/*
+ * proc_namespace.c
+ */
+int show_path(struct seq_file *m, struct dentry *root);
+
 /*
  * fs_struct.c
  */
diff --git a/fs/namespace.c b/fs/namespace.c
index de47c5f66e17..088a52043bba 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -69,7 +69,8 @@ static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
 
 /* Don't allow confusion with mount ID allocated wit IDA */
-static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);
+#define OLD_MNT_ID_MAX UINT_MAX
+static atomic64_t mnt_id_ctr = ATOMIC64_INIT(OLD_MNT_ID_MAX);
 
 static struct hlist_head *mount_hashtable __read_mostly;
 static struct hlist_head *mountpoint_hashtable __read_mostly;
@@ -4678,6 +4679,315 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
 	return err;
 }
 
+static bool mnt_id_match(struct mount *mnt, u64 id)
+{
+	if (id <= OLD_MNT_ID_MAX)
+		return id == mnt->mnt_id;
+	else
+		return id == mnt->mnt_id_unique;
+}
+
+struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
+{
+	struct mount *mnt;
+	struct vfsmount *res = NULL;
+
+	lock_ns_list(ns);
+	list_for_each_entry(mnt, &ns->list, mnt_list) {
+		if (!mnt_is_cursor(mnt) && mnt_id_match(mnt, id)) {
+			res = &mnt->mnt;
+			break;
+		}
+	}
+	unlock_ns_list(ns);
+	return res;
+}
+
+struct stmt_state {
+	void __user *const buf;
+	size_t const bufsize;
+	struct vfsmount *const mnt;
+	u64 const mask;
+	struct seq_file seq;
+	struct path root;
+	struct statmnt sm;
+	size_t pos;
+	int err;
+};
+
+typedef int (*stmt_func_t)(struct stmt_state *);
+
+static int stmt_string_seq(struct stmt_state *s, stmt_func_t func)
+{
+	struct seq_file *seq = &s->seq;
+	int ret;
+
+	seq->count = 0;
+	seq->size = min_t(size_t, seq->size, s->bufsize - s->pos);
+	seq->buf = kvmalloc(seq->size, GFP_KERNEL_ACCOUNT);
+	if (!seq->buf)
+		return -ENOMEM;
+
+	ret = func(s);
+	if (ret)
+		return ret;
+
+	if (seq_has_overflowed(seq)) {
+		if (seq->size == s->bufsize - s->pos)
+			return -EOVERFLOW;
+		seq->size *= 2;
+		if (seq->size > MAX_RW_COUNT)
+			return -ENOMEM;
+		kvfree(seq->buf);
+		return 0;
+	}
+
+	/* Done */
+	return 1;
+}
+
+static void stmt_string(struct stmt_state *s, u64 mask, stmt_func_t func,
+		       stmt_str_t *str)
+{
+	int ret = s->pos >= s->bufsize ? -EOVERFLOW : 0;
+	struct statmnt *sm = &s->sm;
+	struct seq_file *seq = &s->seq;
+
+	if (s->err || !(s->mask & mask))
+		return;
+
+	seq->size = PAGE_SIZE;
+	while (!ret)
+		ret = stmt_string_seq(s, func);
+
+	if (ret < 0) {
+		s->err = ret;
+	} else {
+		seq->buf[seq->count++] = '\0';
+		if (copy_to_user(s->buf + s->pos, seq->buf, seq->count)) {
+			s->err = -EFAULT;
+		} else {
+			str->off = s->pos;
+			str->len = seq->count - 1;
+			s->pos += seq->count;
+		}
+	}
+	kvfree(seq->buf);
+	sm->mask |= mask;
+}
+
+static void stmt_numeric(struct stmt_state *s, u64 mask, stmt_func_t func)
+{
+	if (s->err || !(s->mask & mask))
+		return;
+
+	s->err = func(s);
+	s->sm.mask |= mask;
+}
+
+static u64 mnt_to_attr_flags(struct vfsmount *mnt)
+{
+	unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags);
+	u64 attr_flags = 0;
+
+	if (mnt_flags & MNT_READONLY)
+		attr_flags |= MOUNT_ATTR_RDONLY;
+	if (mnt_flags & MNT_NOSUID)
+		attr_flags |= MOUNT_ATTR_NOSUID;
+	if (mnt_flags & MNT_NODEV)
+		attr_flags |= MOUNT_ATTR_NODEV;
+	if (mnt_flags & MNT_NOEXEC)
+		attr_flags |= MOUNT_ATTR_NOEXEC;
+	if (mnt_flags & MNT_NODIRATIME)
+		attr_flags |= MOUNT_ATTR_NODIRATIME;
+	if (mnt_flags & MNT_NOSYMFOLLOW)
+		attr_flags |= MOUNT_ATTR_NOSYMFOLLOW;
+
+	if (mnt_flags & MNT_NOATIME)
+		attr_flags |= MOUNT_ATTR_NOATIME;
+	else if (mnt_flags & MNT_RELATIME)
+		attr_flags |= MOUNT_ATTR_RELATIME;
+	else
+		attr_flags |= MOUNT_ATTR_STRICTATIME;
+
+	if (is_idmapped_mnt(mnt))
+		attr_flags |= MOUNT_ATTR_IDMAP;
+
+	return attr_flags;
+}
+
+static u64 mnt_to_propagation_flags(struct mount *m)
+{
+	u64 propagation = 0;
+
+	if (IS_MNT_SHARED(m))
+		propagation |= MS_SHARED;
+	if (IS_MNT_SLAVE(m))
+		propagation |= MS_SLAVE;
+	if (IS_MNT_UNBINDABLE(m))
+		propagation |= MS_UNBINDABLE;
+	if (!propagation)
+		propagation |= MS_PRIVATE;
+
+	return propagation;
+}
+
+static int stmt_sb_basic(struct stmt_state *s)
+{
+	struct super_block *sb = s->mnt->mnt_sb;
+
+	s->sm.sb_dev_major = MAJOR(sb->s_dev);
+	s->sm.sb_dev_minor = MINOR(sb->s_dev);
+	s->sm.sb_magic = sb->s_magic;
+	s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME);
+
+	return 0;
+}
+
+static int stmt_mnt_basic(struct stmt_state *s)
+{
+	struct mount *m = real_mount(s->mnt);
+
+	s->sm.mnt_id = m->mnt_id_unique;
+	s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
+	s->sm.mnt_id_old = m->mnt_id;
+	s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
+	s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
+	s->sm.mnt_propagation = mnt_to_propagation_flags(m);
+	s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
+	s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
+
+	return 0;
+}
+
+static int stmt_propagate_from(struct stmt_state *s)
+{
+	struct mount *m = real_mount(s->mnt);
+
+	if (!IS_MNT_SLAVE(m))
+		return 0;
+
+	s->sm.propagate_from = get_dominating_id(m, &current->fs->root);
+
+	return 0;
+}
+
+static int stmt_mnt_root(struct stmt_state *s)
+{
+	struct seq_file *seq = &s->seq;
+	int err = show_path(seq, s->mnt->mnt_root);
+
+	if (!err && !seq_has_overflowed(seq)) {
+		seq->buf[seq->count] = '\0';
+		seq->count = string_unescape_inplace(seq->buf, UNESCAPE_OCTAL);
+	}
+	return err;
+}
+
+static int stmt_mountpoint(struct stmt_state *s)
+{
+	struct vfsmount *mnt = s->mnt;
+	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
+	int err = seq_path_root(&s->seq, &mnt_path, &s->root, "");
+
+	return err == SEQ_SKIP ? 0 : err;
+}
+
+static int stmt_fs_type(struct stmt_state *s)
+{
+	struct seq_file *seq = &s->seq;
+	struct super_block *sb = s->mnt->mnt_sb;
+
+	seq_puts(seq, sb->s_type->name);
+	if (sb->s_subtype) {
+		seq_putc(seq, '.');
+		seq_puts(seq, sb->s_subtype);
+	}
+	return 0;
+}
+
+static int stmt_sb_opts(struct stmt_state *s)
+{
+	struct seq_file *seq = &s->seq;
+	struct super_block *sb = s->mnt->mnt_sb;
+	char *p, *end, *next, *u = seq->buf;
+	int err;
+
+	if (!sb->s_op->show_options)
+		return 0;
+
+	err = sb->s_op->show_options(seq, s->mnt->mnt_root);
+	if (err || seq_has_overflowed(seq) || !seq->count)
+		return err;
+
+	end = seq->buf + seq->count;
+	*end = '\0';
+	for (p = seq->buf + 1; p < end; p = next + 1) {
+		next = strchrnul(p, ',');
+		*next = '\0';
+		u += string_unescape(p, u, 0, UNESCAPE_OCTAL) + 1;
+	}
+	seq->count = u - 1 - seq->buf;
+	return 0;
+}
+
+static int do_statmnt(struct stmt_state *s)
+{
+	struct statmnt *sm = &s->sm;
+	struct mount *m = real_mount(s->mnt);
+
+	if (!capable(CAP_SYS_ADMIN) &&
+	    !is_path_reachable(m, m->mnt.mnt_root, &s->root))
+		return -EPERM;
+
+	stmt_numeric(s, STMT_SB_BASIC, stmt_sb_basic);
+	stmt_numeric(s, STMT_MNT_BASIC, stmt_mnt_basic);
+	stmt_numeric(s, STMT_PROPAGATE_FROM, stmt_propagate_from);
+	stmt_string(s, STMT_MNT_ROOT, stmt_mnt_root, &sm->mnt_root);
+	stmt_string(s, STMT_MOUNTPOINT, stmt_mountpoint, &sm->mountpoint);
+	stmt_string(s, STMT_FS_TYPE, stmt_fs_type, &sm->fs_type);
+	stmt_string(s, STMT_SB_OPTS, stmt_sb_opts, &sm->sb_opts);
+
+	if (s->err)
+		return s->err;
+
+	if (copy_to_user(s->buf, sm, min_t(size_t, s->bufsize, sizeof(*sm))))
+		return -EFAULT;
+
+	return 0;
+}
+
+SYSCALL_DEFINE5(statmnt, u64, mnt_id,
+		u64, mask, struct statmnt __user *, buf,
+		size_t, bufsize, unsigned int, flags)
+{
+	struct vfsmount *mnt;
+	int err;
+
+	if (flags)
+		return -EINVAL;
+
+	down_read(&namespace_sem);
+	mnt = lookup_mnt_in_ns(mnt_id, current->nsproxy->mnt_ns);
+	err = -ENOENT;
+	if (mnt) {
+		struct stmt_state s = {
+			.mask = mask,
+			.buf = buf,
+			.bufsize = bufsize,
+			.mnt = mnt,
+			.pos = sizeof(*buf),
+		};
+
+		get_fs_root(current->fs, &s.root);
+		err = do_statmnt(&s);
+		path_put(&s.root);
+	}
+	up_read(&namespace_sem);
+
+	return err;
+}
+
 static void __init init_mount_tree(void)
 {
 	struct vfsmount *mnt;
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 250eb5bf7b52..20681d1f6798 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -132,6 +132,15 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
 	return err;
 }
 
+int show_path(struct seq_file *m, struct dentry *root)
+{
+	if (root->d_sb->s_op->show_path)
+		return root->d_sb->s_op->show_path(m, root);
+
+	seq_dentry(m, root, " \t\n\\");
+	return 0;
+}
+
 static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
 {
 	struct proc_mounts *p = m->private;
@@ -142,13 +151,9 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
 
 	seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id,
 		   MAJOR(sb->s_dev), MINOR(sb->s_dev));
-	if (sb->s_op->show_path) {
-		err = sb->s_op->show_path(m, mnt->mnt_root);
-		if (err)
-			goto out;
-	} else {
-		seq_dentry(m, mnt->mnt_root, " \t\n\\");
-	}
+	err = show_path(m, mnt->mnt_root);
+	if (err)
+		goto out;
 	seq_putc(m, ' ');
 
 	/* mountpoints outside of chroot jail will give SEQ_SKIP on this */
diff --git a/fs/statfs.c b/fs/statfs.c
index 96d1c3edf289..cc774c2e2c9a 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -9,6 +9,7 @@
 #include <linux/security.h>
 #include <linux/uaccess.h>
 #include <linux/compat.h>
+#include <uapi/linux/mount.h>
 #include "internal.h"
 
 static int flags_by_mnt(int mnt_flags)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 22bc6bc147f8..1099bd307fa7 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -408,6 +408,9 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz,
 asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf);
 asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
 				struct statfs64 __user *buf);
+asmlinkage long sys_statmnt(u64 mnt_id, u64 mask,
+			    struct statmnt __user *buf, size_t bufsize,
+			    unsigned int flags);
 asmlinkage long sys_truncate(const char __user *path, long length);
 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
 #if BITS_PER_LONG == 32
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index abe087c53b4b..640997231ff6 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -823,8 +823,11 @@ __SYSCALL(__NR_cachestat, sys_cachestat)
 #define __NR_fchmodat2 452
 __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
 
+#define __NR_statmnt   454
+__SYSCALL(__NR_statmnt, sys_statmnt)
+
 #undef __NR_syscalls
-#define __NR_syscalls 453
+#define __NR_syscalls 455
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index bb242fdcfe6b..4ec7308a9259 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -138,4 +138,40 @@ struct mount_attr {
 /* List of all mount_attr versions. */
 #define MOUNT_ATTR_SIZE_VER0	32 /* sizeof first published struct */
 
+struct stmt_str {
+	__u32 off;
+	__u32 len;
+};
+
+struct statmnt {
+	__u64 mask;		/* What results were written [uncond] */
+	__u32 sb_dev_major;	/* Device ID */
+	__u32 sb_dev_minor;
+	__u64 sb_magic;		/* ..._SUPER_MAGIC */
+	__u32 sb_flags;		/* MS_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
+	__u32 __spare1;
+	__u64 mnt_id;		/* Unique ID of mount */
+	__u64 mnt_parent_id;	/* Unique ID of parent (for root == mnt_id) */
+	__u32 mnt_id_old;	/* Reused IDs used in proc/.../mountinfo */
+	__u32 mnt_parent_id_old;
+	__u64 mnt_attr;		/* MOUNT_ATTR_... */
+	__u64 mnt_propagation;	/* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
+	__u64 mnt_peer_group;	/* ID of shared peer group */
+	__u64 mnt_master;	/* Mount receives propagation from this ID */
+	__u64 propagate_from;	/* Propagation from in current namespace */
+	__u64 __spare[20];
+	struct stmt_str mnt_root;	/* Root of mount relative to root of fs */
+	struct stmt_str mountpoint;	/* Mountpoint relative to root of process */
+	struct stmt_str fs_type;	/* Filesystem type[.subtype] */
+	struct stmt_str sb_opts;	/* Super block string options (nul delimted) */
+};
+
+#define STMT_SB_BASIC		0x00000001U     /* Want/got sb_... */
+#define STMT_MNT_BASIC		0x00000002U	/* Want/got mnt_... */
+#define STMT_PROPAGATE_FROM	0x00000004U	/* Want/got propagate_from */
+#define STMT_MNT_ROOT		0x00000008U	/* Want/got mnt_root  */
+#define STMT_MOUNTPOINT		0x00000010U	/* Want/got mountpoint */
+#define STMT_FS_TYPE		0x00000020U	/* Want/got fs_type */
+#define STMT_SB_OPTS		0x00000040U	/* Want/got sb_opts */
+
 #endif /* _UAPI_LINUX_MOUNT_H */
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 76+ messages in thread

* [RFC PATCH 3/3] add listmnt(2) syscall
  2023-09-13 15:22 [RFC PATCH 0/3] quering mount attributes Miklos Szeredi
  2023-09-13 15:22 ` [RFC PATCH 1/3] add unique mount ID Miklos Szeredi
  2023-09-13 15:22 ` [RFC PATCH 2/3] add statmnt(2) syscall Miklos Szeredi
@ 2023-09-13 15:22 ` Miklos Szeredi
  2023-09-14  6:00   ` Amir Goldstein
  2023-09-17  0:54   ` Matthew House
  2023-09-14  6:47 ` [RFC PATCH 0/3] quering mount attributes Amir Goldstein
  3 siblings, 2 replies; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-13 15:22 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Linus Torvalds, Al Viro,
	Christian Brauner, Amir Goldstein

Add a way to query the children of a particular mount.  This is a more
flexible way to iterate the mount tree than having to parse the complete
/proc/self/mountinfo.

Lookup the mount by the old (32bit) or new (64bit) mount ID.  If a mount
needs to be queried based on path, then statx(2) can be used to first query
the mount ID belonging to the path.

Return an array of new (64bit) mount IDs.  Without privileges only mounts
are listed which are reachable from the task's root.

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 fs/namespace.c                         | 51 ++++++++++++++++++++++++++
 include/linux/syscalls.h               |  2 +
 include/uapi/asm-generic/unistd.h      |  5 ++-
 4 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 6d807c30cd16..0d9a47b0ce9b 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -376,6 +376,7 @@
 452	common	fchmodat2		sys_fchmodat2
 453	64	map_shadow_stack	sys_map_shadow_stack
 454	common	statmnt			sys_statmnt
+455	common	listmnt			sys_listmnt
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/fs/namespace.c b/fs/namespace.c
index 088a52043bba..5362b1ffb26f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4988,6 +4988,57 @@ SYSCALL_DEFINE5(statmnt, u64, mnt_id,
 	return err;
 }
 
+static long do_listmnt(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
+		      const struct path *root)
+{
+	struct mount *r, *m = real_mount(mnt);
+	struct path rootmnt = { .mnt = root->mnt, .dentry = root->mnt->mnt_root };
+	long ctr = 0;
+
+	if (!capable(CAP_SYS_ADMIN) &&
+	    !is_path_reachable(m, mnt->mnt_root, &rootmnt))
+		return -EPERM;
+
+	list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
+		if (!capable(CAP_SYS_ADMIN) &&
+		    !is_path_reachable(r, r->mnt.mnt_root, root))
+			continue;
+
+		if (ctr >= bufsize)
+			return -EOVERFLOW;
+		if (put_user(r->mnt_id_unique, buf + ctr))
+			return -EFAULT;
+		ctr++;
+		if (ctr < 0)
+			return -ERANGE;
+	}
+	return ctr;
+}
+
+SYSCALL_DEFINE4(listmnt, u64, mnt_id, u64 __user *, buf, size_t, bufsize,
+		unsigned int, flags)
+{
+	struct vfsmount *mnt;
+	struct path root;
+	long err;
+
+	if (flags)
+		return -EINVAL;
+
+	down_read(&namespace_sem);
+	mnt = lookup_mnt_in_ns(mnt_id, current->nsproxy->mnt_ns);
+	err = -ENOENT;
+	if (mnt) {
+		get_fs_root(current->fs, &root);
+		err = do_listmnt(mnt, buf, bufsize, &root);
+		path_put(&root);
+	}
+	up_read(&namespace_sem);
+
+	return err;
+}
+
+
 static void __init init_mount_tree(void)
 {
 	struct vfsmount *mnt;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 1099bd307fa7..5d776cdb6f18 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -411,6 +411,8 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
 asmlinkage long sys_statmnt(u64 mnt_id, u64 mask,
 			    struct statmnt __user *buf, size_t bufsize,
 			    unsigned int flags);
+asmlinkage long sys_listmnt(u64 mnt_id, u64 __user *buf, size_t bufsize,
+			    unsigned int flags);
 asmlinkage long sys_truncate(const char __user *path, long length);
 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
 #if BITS_PER_LONG == 32
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 640997231ff6..a2b41370f603 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -826,8 +826,11 @@ __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
 #define __NR_statmnt   454
 __SYSCALL(__NR_statmnt, sys_statmnt)
 
+#define __NR_listmnt   455
+__SYSCALL(__NR_listmnt, sys_listmnt)
+
 #undef __NR_syscalls
-#define __NR_syscalls 455
+#define __NR_syscalls 456
 
 /*
  * 32 bit systems traditionally used different
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 3/3] add listmnt(2) syscall
  2023-09-13 15:22 ` [RFC PATCH 3/3] add listmnt(2) syscall Miklos Szeredi
@ 2023-09-14  6:00   ` Amir Goldstein
  2023-09-14  8:50     ` Miklos Szeredi
  2023-09-15  1:00     ` Ian Kent
  2023-09-17  0:54   ` Matthew House
  1 sibling, 2 replies; 76+ messages in thread
From: Amir Goldstein @ 2023-09-14  6:00 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, linux-api, linux-man,
	linux-security-module, Karel Zak, Ian Kent, David Howells,
	Linus Torvalds, Al Viro, Christian Brauner

On Wed, Sep 13, 2023 at 6:22 PM Miklos Szeredi <mszeredi@redhat.com> wrote:
>
> Add way to query the children of a particular mount.  This is a more
> flexible way to iterate the mount tree than having to parse the complete
> /proc/self/mountinfo.
>
> Lookup the mount by the old (32bit) or new (64bit) mount ID.  If a mount
> needs to be queried based on path, then statx(2) can be used to first query
> the mount ID belonging to the path.
>
> Return an array of new (64bit) mount ID's.  Without privileges only mounts
> are listed which are reachable from the task's root.
>
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> ---
>  arch/x86/entry/syscalls/syscall_64.tbl |  1 +
>  fs/namespace.c                         | 51 ++++++++++++++++++++++++++
>  include/linux/syscalls.h               |  2 +
>  include/uapi/asm-generic/unistd.h      |  5 ++-
>  4 files changed, 58 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 6d807c30cd16..0d9a47b0ce9b 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -376,6 +376,7 @@
>  452    common  fchmodat2               sys_fchmodat2
>  453    64      map_shadow_stack        sys_map_shadow_stack
>  454    common  statmnt                 sys_statmnt
> +455    common  listmnt                 sys_listmnt
>
>  #
>  # Due to a historical design error, certain syscalls are numbered differently
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 088a52043bba..5362b1ffb26f 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -4988,6 +4988,57 @@ SYSCALL_DEFINE5(statmnt, u64, mnt_id,
>         return err;
>  }
>
> +static long do_listmnt(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
> +                     const struct path *root)
> +{
> +       struct mount *r, *m = real_mount(mnt);
> +       struct path rootmnt = { .mnt = root->mnt, .dentry = root->mnt->mnt_root };
> +       long ctr = 0;
> +
> +       if (!capable(CAP_SYS_ADMIN) &&
> +           !is_path_reachable(m, mnt->mnt_root, &rootmnt))
> +               return -EPERM;
> +
> +       list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
> +               if (!capable(CAP_SYS_ADMIN) &&
> +                   !is_path_reachable(r, r->mnt.mnt_root, root))
> +                       continue;
> +
> +               if (ctr >= bufsize)
> +                       return -EOVERFLOW;
> +               if (put_user(r->mnt_id_unique, buf + ctr))
> +                       return -EFAULT;
> +               ctr++;
> +               if (ctr < 0)
> +                       return -ERANGE;

I think it'd be good for userspace to be able to query the required
bufsize with a NULL buf, listxattr(2) style, rather than having to
guess and re-guess on EOVERFLOW.

Thanks,
Amir.






> +       }
> +       return ctr;
> +}
> +
> +SYSCALL_DEFINE4(listmnt, u64, mnt_id, u64 __user *, buf, size_t, bufsize,
> +               unsigned int, flags)
> +{
> +       struct vfsmount *mnt;
> +       struct path root;
> +       long err;
> +
> +       if (flags)
> +               return -EINVAL;
> +
> +       down_read(&namespace_sem);
> +       mnt = lookup_mnt_in_ns(mnt_id, current->nsproxy->mnt_ns);
> +       err = -ENOENT;
> +       if (mnt) {
> +               get_fs_root(current->fs, &root);
> +               err = do_listmnt(mnt, buf, bufsize, &root);
> +               path_put(&root);
> +       }
> +       up_read(&namespace_sem);
> +
> +       return err;
> +}
> +
> +
>  static void __init init_mount_tree(void)
>  {
>         struct vfsmount *mnt;
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 1099bd307fa7..5d776cdb6f18 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -411,6 +411,8 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
>  asmlinkage long sys_statmnt(u64 mnt_id, u64 mask,
>                             struct statmnt __user *buf, size_t bufsize,
>                             unsigned int flags);
> +asmlinkage long sys_listmnt(u64 mnt_id, u64 __user *buf, size_t bufsize,
> +                           unsigned int flags);
>  asmlinkage long sys_truncate(const char __user *path, long length);
>  asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
>  #if BITS_PER_LONG == 32
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 640997231ff6..a2b41370f603 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -826,8 +826,11 @@ __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
>  #define __NR_statmnt   454
>  __SYSCALL(__NR_statmnt, sys_statmnt)
>
> +#define __NR_listmnt   455
> +__SYSCALL(__NR_listmnt, sys_listmnt)
> +
>  #undef __NR_syscalls
> -#define __NR_syscalls 455
> +#define __NR_syscalls 456
>
>  /*
>   * 32 bit systems traditionally used different
> --
> 2.41.0
>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-13 15:22 ` [RFC PATCH 2/3] add statmnt(2) syscall Miklos Szeredi
@ 2023-09-14  6:11   ` Amir Goldstein
  2023-09-15  1:05     ` Ian Kent
  2023-09-14  9:27   ` Christian Brauner
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 76+ messages in thread
From: Amir Goldstein @ 2023-09-14  6:11 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, linux-api, linux-man,
	linux-security-module, Karel Zak, Ian Kent, David Howells,
	Linus Torvalds, Al Viro, Christian Brauner

On Wed, Sep 13, 2023 at 6:22 PM Miklos Szeredi <mszeredi@redhat.com> wrote:
>
> Add a way to query attributes of a single mount instead of having to parse
> the complete /proc/$PID/mountinfo, which might be huge.
>
> Lookup the mount by the old (32bit) or new (64bit) mount ID.  If a mount
> needs to be queried based on path, then statx(2) can be used to first query
> the mount ID belonging to the path.
>
> Design is based on a suggestion by Linus:
>
>   "So I'd suggest something that is very much like "statfsat()", which gets
>    a buffer and a length, and returns an extended "struct statfs" *AND*
>    just a string description at the end."
>
> The interface closely mimics that of statx.
>
> Handle ASCII attributes by appending after the end of the structure (as per
> above suggestion).  Allow querying multiple string attributes with
> individual offset/length for each.  String are nul terminated (termination
> isn't counted in length).
>
> Mount options are also delimited with nul characters.  Unlike proc, special
> characters are not quoted.
>
> Link: https://lore.kernel.org/all/CAHk-=wh5YifP7hzKSbwJj94+DZ2czjrZsczy6GBimiogZws=rg@mail.gmail.com/
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> ---
>  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>  fs/internal.h                          |   5 +
>  fs/namespace.c                         | 312 ++++++++++++++++++++++++-
>  fs/proc_namespace.c                    |  19 +-
>  fs/statfs.c                            |   1 +
>  include/linux/syscalls.h               |   3 +
>  include/uapi/asm-generic/unistd.h      |   5 +-
>  include/uapi/linux/mount.h             |  36 +++
>  8 files changed, 373 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 1d6eee30eceb..6d807c30cd16 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -375,6 +375,7 @@
>  451    common  cachestat               sys_cachestat
>  452    common  fchmodat2               sys_fchmodat2
>  453    64      map_shadow_stack        sys_map_shadow_stack
> +454    common  statmnt                 sys_statmnt
>
>  #
>  # Due to a historical design error, certain syscalls are numbered differently
> diff --git a/fs/internal.h b/fs/internal.h
> index d64ae03998cc..8f75271428aa 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -83,6 +83,11 @@ int path_mount(const char *dev_name, struct path *path,
>                 const char *type_page, unsigned long flags, void *data_page);
>  int path_umount(struct path *path, int flags);
>
> +/*
> + * proc_namespace.c
> + */
> +int show_path(struct seq_file *m, struct dentry *root);
> +
>  /*
>   * fs_struct.c
>   */
> diff --git a/fs/namespace.c b/fs/namespace.c
> index de47c5f66e17..088a52043bba 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -69,7 +69,8 @@ static DEFINE_IDA(mnt_id_ida);
>  static DEFINE_IDA(mnt_group_ida);
>
>  /* Don't allow confusion with mount ID allocated wit IDA */
> -static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);
> +#define OLD_MNT_ID_MAX UINT_MAX
> +static atomic64_t mnt_id_ctr = ATOMIC64_INIT(OLD_MNT_ID_MAX);
>
>  static struct hlist_head *mount_hashtable __read_mostly;
>  static struct hlist_head *mountpoint_hashtable __read_mostly;
> @@ -4678,6 +4679,315 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
>         return err;
>  }
>
> +static bool mnt_id_match(struct mount *mnt, u64 id)
> +{
> +       if (id <= OLD_MNT_ID_MAX)
> +               return id == mnt->mnt_id;
> +       else
> +               return id == mnt->mnt_id_unique;
> +}
> +
> +struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
> +{
> +       struct mount *mnt;
> +       struct vfsmount *res = NULL;
> +
> +       lock_ns_list(ns);
> +       list_for_each_entry(mnt, &ns->list, mnt_list) {
> +               if (!mnt_is_cursor(mnt) && mnt_id_match(mnt, id)) {
> +                       res = &mnt->mnt;
> +                       break;
> +               }
> +       }
> +       unlock_ns_list(ns);
> +       return res;
> +}
> +
> +struct stmt_state {
> +       void __user *const buf;
> +       size_t const bufsize;
> +       struct vfsmount *const mnt;
> +       u64 const mask;
> +       struct seq_file seq;
> +       struct path root;
> +       struct statmnt sm;
> +       size_t pos;
> +       int err;
> +};
> +
> +typedef int (*stmt_func_t)(struct stmt_state *);
> +
> +static int stmt_string_seq(struct stmt_state *s, stmt_func_t func)
> +{
> +       struct seq_file *seq = &s->seq;
> +       int ret;
> +
> +       seq->count = 0;
> +       seq->size = min_t(size_t, seq->size, s->bufsize - s->pos);
> +       seq->buf = kvmalloc(seq->size, GFP_KERNEL_ACCOUNT);
> +       if (!seq->buf)
> +               return -ENOMEM;
> +
> +       ret = func(s);
> +       if (ret)
> +               return ret;
> +
> +       if (seq_has_overflowed(seq)) {
> +               if (seq->size == s->bufsize - s->pos)
> +                       return -EOVERFLOW;
> +               seq->size *= 2;
> +               if (seq->size > MAX_RW_COUNT)
> +                       return -ENOMEM;
> +               kvfree(seq->buf);
> +               return 0;
> +       }
> +
> +       /* Done */
> +       return 1;
> +}
> +
> +static void stmt_string(struct stmt_state *s, u64 mask, stmt_func_t func,
> +                      stmt_str_t *str)
> +{
> +       int ret = s->pos >= s->bufsize ? -EOVERFLOW : 0;
> +       struct statmnt *sm = &s->sm;
> +       struct seq_file *seq = &s->seq;
> +
> +       if (s->err || !(s->mask & mask))
> +               return;
> +
> +       seq->size = PAGE_SIZE;
> +       while (!ret)
> +               ret = stmt_string_seq(s, func);
> +
> +       if (ret < 0) {
> +               s->err = ret;
> +       } else {
> +               seq->buf[seq->count++] = '\0';
> +               if (copy_to_user(s->buf + s->pos, seq->buf, seq->count)) {
> +                       s->err = -EFAULT;
> +               } else {
> +                       str->off = s->pos;
> +                       str->len = seq->count - 1;
> +                       s->pos += seq->count;
> +               }
> +       }
> +       kvfree(seq->buf);
> +       sm->mask |= mask;
> +}
> +
> +static void stmt_numeric(struct stmt_state *s, u64 mask, stmt_func_t func)
> +{
> +       if (s->err || !(s->mask & mask))
> +               return;
> +
> +       s->err = func(s);
> +       s->sm.mask |= mask;
> +}
> +
> +static u64 mnt_to_attr_flags(struct vfsmount *mnt)
> +{
> +       unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags);
> +       u64 attr_flags = 0;
> +
> +       if (mnt_flags & MNT_READONLY)
> +               attr_flags |= MOUNT_ATTR_RDONLY;
> +       if (mnt_flags & MNT_NOSUID)
> +               attr_flags |= MOUNT_ATTR_NOSUID;
> +       if (mnt_flags & MNT_NODEV)
> +               attr_flags |= MOUNT_ATTR_NODEV;
> +       if (mnt_flags & MNT_NOEXEC)
> +               attr_flags |= MOUNT_ATTR_NOEXEC;
> +       if (mnt_flags & MNT_NODIRATIME)
> +               attr_flags |= MOUNT_ATTR_NODIRATIME;
> +       if (mnt_flags & MNT_NOSYMFOLLOW)
> +               attr_flags |= MOUNT_ATTR_NOSYMFOLLOW;
> +
> +       if (mnt_flags & MNT_NOATIME)
> +               attr_flags |= MOUNT_ATTR_NOATIME;
> +       else if (mnt_flags & MNT_RELATIME)
> +               attr_flags |= MOUNT_ATTR_RELATIME;
> +       else
> +               attr_flags |= MOUNT_ATTR_STRICTATIME;
> +
> +       if (is_idmapped_mnt(mnt))
> +               attr_flags |= MOUNT_ATTR_IDMAP;
> +
> +       return attr_flags;
> +}
> +
> +static u64 mnt_to_propagation_flags(struct mount *m)
> +{
> +       u64 propagation = 0;
> +
> +       if (IS_MNT_SHARED(m))
> +               propagation |= MS_SHARED;
> +       if (IS_MNT_SLAVE(m))
> +               propagation |= MS_SLAVE;
> +       if (IS_MNT_UNBINDABLE(m))
> +               propagation |= MS_UNBINDABLE;
> +       if (!propagation)
> +               propagation |= MS_PRIVATE;
> +
> +       return propagation;
> +}
> +
> +static int stmt_sb_basic(struct stmt_state *s)
> +{
> +       struct super_block *sb = s->mnt->mnt_sb;
> +
> +       s->sm.sb_dev_major = MAJOR(sb->s_dev);
> +       s->sm.sb_dev_minor = MINOR(sb->s_dev);
> +       s->sm.sb_magic = sb->s_magic;
> +       s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME);
> +
> +       return 0;
> +}
> +
> +static int stmt_mnt_basic(struct stmt_state *s)
> +{
> +       struct mount *m = real_mount(s->mnt);
> +
> +       s->sm.mnt_id = m->mnt_id_unique;
> +       s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
> +       s->sm.mnt_id_old = m->mnt_id;
> +       s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
> +       s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
> +       s->sm.mnt_propagation = mnt_to_propagation_flags(m);
> +       s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
> +       s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
> +
> +       return 0;
> +}
> +
> +static int stmt_propagate_from(struct stmt_state *s)
> +{
> +       struct mount *m = real_mount(s->mnt);
> +
> +       if (!IS_MNT_SLAVE(m))
> +               return 0;
> +
> +       s->sm.propagate_from = get_dominating_id(m, &current->fs->root);
> +
> +       return 0;
> +}
> +
> +static int stmt_mnt_root(struct stmt_state *s)
> +{
> +       struct seq_file *seq = &s->seq;
> +       int err = show_path(seq, s->mnt->mnt_root);
> +
> +       if (!err && !seq_has_overflowed(seq)) {
> +               seq->buf[seq->count] = '\0';
> +               seq->count = string_unescape_inplace(seq->buf, UNESCAPE_OCTAL);
> +       }
> +       return err;
> +}
> +
> +static int stmt_mountpoint(struct stmt_state *s)
> +{
> +       struct vfsmount *mnt = s->mnt;
> +       struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
> +       int err = seq_path_root(&s->seq, &mnt_path, &s->root, "");
> +
> +       return err == SEQ_SKIP ? 0 : err;
> +}
> +
> +static int stmt_fs_type(struct stmt_state *s)
> +{
> +       struct seq_file *seq = &s->seq;
> +       struct super_block *sb = s->mnt->mnt_sb;
> +
> +       seq_puts(seq, sb->s_type->name);
> +       if (sb->s_subtype) {
> +               seq_putc(seq, '.');
> +               seq_puts(seq, sb->s_subtype);
> +       }
> +       return 0;
> +}
> +
> +static int stmt_sb_opts(struct stmt_state *s)
> +{
> +       struct seq_file *seq = &s->seq;
> +       struct super_block *sb = s->mnt->mnt_sb;
> +       char *p, *end, *next, *u = seq->buf;
> +       int err;
> +
> +       if (!sb->s_op->show_options)
> +               return 0;
> +
> +       err = sb->s_op->show_options(seq, s->mnt->mnt_root);
> +       if (err || seq_has_overflowed(seq) || !seq->count)
> +               return err;
> +
> +       end = seq->buf + seq->count;
> +       *end = '\0';
> +       for (p = seq->buf + 1; p < end; p = next + 1) {
> +               next = strchrnul(p, ',');
> +               *next = '\0';
> +               u += string_unescape(p, u, 0, UNESCAPE_OCTAL) + 1;
> +       }
> +       seq->count = u - 1 - seq->buf;
> +       return 0;
> +}
> +
> +static int do_statmnt(struct stmt_state *s)
> +{
> +       struct statmnt *sm = &s->sm;
> +       struct mount *m = real_mount(s->mnt);
> +
> +       if (!capable(CAP_SYS_ADMIN) &&
> +           !is_path_reachable(m, m->mnt.mnt_root, &s->root))
> +               return -EPERM;
> +
> +       stmt_numeric(s, STMT_SB_BASIC, stmt_sb_basic);
> +       stmt_numeric(s, STMT_MNT_BASIC, stmt_mnt_basic);
> +       stmt_numeric(s, STMT_PROPAGATE_FROM, stmt_propagate_from);
> +       stmt_string(s, STMT_MNT_ROOT, stmt_mnt_root, &sm->mnt_root);
> +       stmt_string(s, STMT_MOUNTPOINT, stmt_mountpoint, &sm->mountpoint);
> +       stmt_string(s, STMT_FS_TYPE, stmt_fs_type, &sm->fs_type);
> +       stmt_string(s, STMT_SB_OPTS, stmt_sb_opts, &sm->sb_opts);
> +
> +       if (s->err)
> +               return s->err;
> +
> +       if (copy_to_user(s->buf, sm, min_t(size_t, s->bufsize, sizeof(*sm))))
> +               return -EFAULT;
> +
> +       return 0;

Similar concern as with listmnt: I think that users would
want to have a way to get the fixed size statmnt part that fits
in the buffer even if the variable length string values do not fit,
and to be able to query the required buffer size to get the strings.

The API could either be to explicitly request
STMT_MNT_ROOT_LEN | STMT_MOUNTPOINT_LEN ...
without allowing mixing of no-value and value requests,
or to opt out of all string values using a single flag,
which is probably simpler for both the API and the implementation.

Thanks,
Amir.

> +}
> +
> +SYSCALL_DEFINE5(statmnt, u64, mnt_id,
> +               u64, mask, struct statmnt __user *, buf,
> +               size_t, bufsize, unsigned int, flags)
> +{
> +       struct vfsmount *mnt;
> +       int err;
> +
> +       if (flags)
> +               return -EINVAL;
> +
> +       down_read(&namespace_sem);
> +       mnt = lookup_mnt_in_ns(mnt_id, current->nsproxy->mnt_ns);
> +       err = -ENOENT;
> +       if (mnt) {
> +               struct stmt_state s = {
> +                       .mask = mask,
> +                       .buf = buf,
> +                       .bufsize = bufsize,
> +                       .mnt = mnt,
> +                       .pos = sizeof(*buf),
> +               };
> +
> +               get_fs_root(current->fs, &s.root);
> +               err = do_statmnt(&s);
> +               path_put(&s.root);
> +       }
> +       up_read(&namespace_sem);
> +
> +       return err;
> +}
> +
>  static void __init init_mount_tree(void)
>  {
>         struct vfsmount *mnt;
> diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
> index 250eb5bf7b52..20681d1f6798 100644
> --- a/fs/proc_namespace.c
> +++ b/fs/proc_namespace.c
> @@ -132,6 +132,15 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
>         return err;
>  }
>
> +int show_path(struct seq_file *m, struct dentry *root)
> +{
> +       if (root->d_sb->s_op->show_path)
> +               return root->d_sb->s_op->show_path(m, root);
> +
> +       seq_dentry(m, root, " \t\n\\");
> +       return 0;
> +}
> +
>  static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
>  {
>         struct proc_mounts *p = m->private;
> @@ -142,13 +151,9 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
>
>         seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id,
>                    MAJOR(sb->s_dev), MINOR(sb->s_dev));
> -       if (sb->s_op->show_path) {
> -               err = sb->s_op->show_path(m, mnt->mnt_root);
> -               if (err)
> -                       goto out;
> -       } else {
> -               seq_dentry(m, mnt->mnt_root, " \t\n\\");
> -       }
> +       err = show_path(m, mnt->mnt_root);
> +       if (err)
> +               goto out;
>         seq_putc(m, ' ');
>
>         /* mountpoints outside of chroot jail will give SEQ_SKIP on this */
> diff --git a/fs/statfs.c b/fs/statfs.c
> index 96d1c3edf289..cc774c2e2c9a 100644
> --- a/fs/statfs.c
> +++ b/fs/statfs.c
> @@ -9,6 +9,7 @@
>  #include <linux/security.h>
>  #include <linux/uaccess.h>
>  #include <linux/compat.h>
> +#include <uapi/linux/mount.h>
>  #include "internal.h"
>
>  static int flags_by_mnt(int mnt_flags)
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 22bc6bc147f8..1099bd307fa7 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -408,6 +408,9 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz,
>  asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf);
>  asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
>                                 struct statfs64 __user *buf);
> +asmlinkage long sys_statmnt(u64 mnt_id, u64 mask,
> +                           struct statmnt __user *buf, size_t bufsize,
> +                           unsigned int flags);
>  asmlinkage long sys_truncate(const char __user *path, long length);
>  asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
>  #if BITS_PER_LONG == 32
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index abe087c53b4b..640997231ff6 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -823,8 +823,11 @@ __SYSCALL(__NR_cachestat, sys_cachestat)
>  #define __NR_fchmodat2 452
>  __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
>
> +#define __NR_statmnt   454
> +__SYSCALL(__NR_statmnt, sys_statmnt)
> +
>  #undef __NR_syscalls
> -#define __NR_syscalls 453
> +#define __NR_syscalls 455
>
>  /*
>   * 32 bit systems traditionally used different
> diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
> index bb242fdcfe6b..4ec7308a9259 100644
> --- a/include/uapi/linux/mount.h
> +++ b/include/uapi/linux/mount.h
> @@ -138,4 +138,40 @@ struct mount_attr {
>  /* List of all mount_attr versions. */
>  #define MOUNT_ATTR_SIZE_VER0   32 /* sizeof first published struct */
>
> +struct stmt_str {
> +       __u32 off;
> +       __u32 len;
> +};
> +
> +struct statmnt {
> +       __u64 mask;             /* What results were written [uncond] */
> +       __u32 sb_dev_major;     /* Device ID */
> +       __u32 sb_dev_minor;
> +       __u64 sb_magic;         /* ..._SUPER_MAGIC */
> +       __u32 sb_flags;         /* MS_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
> +       __u32 __spare1;
> +       __u64 mnt_id;           /* Unique ID of mount */
> +       __u64 mnt_parent_id;    /* Unique ID of parent (for root == mnt_id) */
> +       __u32 mnt_id_old;       /* Reused IDs used in proc/.../mountinfo */
> +       __u32 mnt_parent_id_old;
> +       __u64 mnt_attr;         /* MOUNT_ATTR_... */
> +       __u64 mnt_propagation;  /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
> +       __u64 mnt_peer_group;   /* ID of shared peer group */
> +       __u64 mnt_master;       /* Mount receives propagation from this ID */
> +       __u64 propagate_from;   /* Propagation from in current namespace */
> +       __u64 __spare[20];
> +       struct stmt_str mnt_root;       /* Root of mount relative to root of fs */
> +       struct stmt_str mountpoint;     /* Mountpoint relative to root of process */
> +       struct stmt_str fs_type;        /* Filesystem type[.subtype] */
> +       struct stmt_str sb_opts;        /* Super block string options (nul delimted) */
> +};
> +
> +#define STMT_SB_BASIC          0x00000001U     /* Want/got sb_... */
> +#define STMT_MNT_BASIC         0x00000002U     /* Want/got mnt_... */
> +#define STMT_PROPAGATE_FROM    0x00000004U     /* Want/got propagate_from */
> +#define STMT_MNT_ROOT          0x00000008U     /* Want/got mnt_root  */
> +#define STMT_MOUNTPOINT                0x00000010U     /* Want/got mountpoint */
> +#define STMT_FS_TYPE           0x00000020U     /* Want/got fs_type */
> +#define STMT_SB_OPTS           0x00000040U     /* Want/got sb_opts */
> +
>  #endif /* _UAPI_LINUX_MOUNT_H */
> --
> 2.41.0
>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 0/3] quering mount attributes
  2023-09-13 15:22 [RFC PATCH 0/3] quering mount attributes Miklos Szeredi
                   ` (2 preceding siblings ...)
  2023-09-13 15:22 ` [RFC PATCH 3/3] add listmnt(2) syscall Miklos Szeredi
@ 2023-09-14  6:47 ` Amir Goldstein
  2023-09-15  1:20   ` Ian Kent
  3 siblings, 1 reply; 76+ messages in thread
From: Amir Goldstein @ 2023-09-14  6:47 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, linux-api, linux-man,
	linux-security-module, Karel Zak, Ian Kent, David Howells,
	Linus Torvalds, Al Viro, Christian Brauner

On Wed, Sep 13, 2023 at 6:22 PM Miklos Szeredi <mszeredi@redhat.com> wrote:
>
> Implement the mount querying syscalls agreed on at LSF/MM 2023.  This is an
> RFC with just x86_64 syscalls.
>
> Excepting notification this should allow full replacement for
> parsing /proc/self/mountinfo.

Since you mentioned notifications, I will add that the plan discussed
at LSF/MM was, once we have an API to query mount stats and children,
to implement fanotify events for:
mount [mntuid] was un/mounted at [parent mntuid],[dirfid+name]

As with other fanotify events, the self mntuid and dirfid+name
information can be omitted, and without it, multiple un/mount events
from the same parent mntuid will be merged, allowing userspace
to periodically listmnt() only those mntuids whose child mounts have
changed, with little risk of event queue overflow.

The possible monitoring scopes would be the entire mount namespace
of the monitoring program or watching a single mount for changes in
its child mounts. The latter is similar to an inotify directory children
watch, where the watches need to be set recursively, with all the weight on
userspace to avoid races.

That still leaves the problem of monitoring the creation of new mount
namespaces, but that is out of scope for this discussion, which is
about a replacement for /proc/self/mountinfo monitoring.

Thanks,
Amir.

>
> It is not a replacement for /proc/$OTHER_PID/mountinfo, since mount
> namespace and root are taken from the current task.  I guess namespace and
> root could be switched before invoking these syscalls but that sounds a bit
> complicated.  Not sure if this is a problem.
>
> Test utility attached at the end.
> ---
>
> Miklos Szeredi (3):
>   add unique mount ID
>   add statmnt(2) syscall
>   add listmnt(2) syscall
>
>  arch/x86/entry/syscalls/syscall_64.tbl |   2 +
>  fs/internal.h                          |   5 +
>  fs/mount.h                             |   3 +-
>  fs/namespace.c                         | 365 +++++++++++++++++++++++++
>  fs/proc_namespace.c                    |  19 +-
>  fs/stat.c                              |   9 +-
>  fs/statfs.c                            |   1 +
>  include/linux/syscalls.h               |   5 +
>  include/uapi/asm-generic/unistd.h      |   8 +-
>  include/uapi/linux/mount.h             |  36 +++
>  include/uapi/linux/stat.h              |   1 +
>  11 files changed, 443 insertions(+), 11 deletions(-)
>
> --
> 2.41.0
>
> === statmnt.c ===
> #define _GNU_SOURCE
> #include <unistd.h>
> #include <stdio.h>
> #include <fcntl.h>
> #include <stdint.h>
> #include <stdlib.h>
> #include <string.h>
> #include <errno.h>
> #include <sys/mount.h>
> #include <sys/stat.h>
> #include <err.h>
>
> struct stmt_str {
>         __u32 off;
>         __u32 len;
> };
>
> struct statmnt {
>         __u64 mask;             /* What results were written [uncond] */
>         __u32 sb_dev_major;     /* Device ID */
>         __u32 sb_dev_minor;
>         __u64 sb_magic;         /* ..._SUPER_MAGIC */
>         __u32 sb_flags;         /* MS_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
>         __u32 __spare1;
>         __u64 mnt_id;           /* Unique ID of mount */
>         __u64 mnt_parent_id;    /* Unique ID of parent (for root == mnt_id) */
>         __u32 mnt_id_old;       /* Reused IDs used in proc/.../mountinfo */
>         __u32 mnt_parent_id_old;
>         __u64 mnt_attr;         /* MOUNT_ATTR_... */
>         __u64 mnt_propagation;  /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
>         __u64 mnt_peer_group;   /* ID of shared peer group */
>         __u64 mnt_master;       /* Mount receives propagation from this ID */
>         __u64 propagate_from;   /* Propagation from in current namespace */
>         __u64 __spare[20];
>         struct stmt_str mnt_root;       /* Root of mount relative to root of fs */
>         struct stmt_str mountpoint;     /* Mountpoint relative to root of process */
>         struct stmt_str fs_type;        /* Filesystem type[.subtype] */
>         struct stmt_str sb_opts;        /* Super block string options (nul delimted) */
> };
>
> #define STMT_SB_BASIC           0x00000001U     /* Want/got sb_... */
> #define STMT_MNT_BASIC          0x00000002U     /* Want/got mnt_... */
> #define STMT_PROPAGATE_FROM     0x00000004U     /* Want/got propagate_from */
> #define STMT_MNT_ROOT           0x00000008U     /* Want/got mnt_root  */
> #define STMT_MOUNTPOINT         0x00000010U     /* Want/got mountpoint */
> #define STMT_FS_TYPE            0x00000020U     /* Want/got fs_type */
> #define STMT_SB_OPTS            0x00000040U     /* Want/got sb_opts */
>
> #define __NR_statmnt   454
> #define __NR_listmnt   455
>
> #define STATX_MNT_ID_UNIQUE     0x00004000U     /* Want/got extended stx_mount_id */
>
> int main(int argc, char *argv[])
> {
>         char buf[65536];
>         struct statmnt *st = (void *) buf;
>         char *end;
>         const char *arg = argv[1];
>         long res;
>         int list = 0;
>         unsigned long mnt_id;
>         unsigned int mask = STMT_SB_BASIC | STMT_MNT_BASIC | STMT_PROPAGATE_FROM | STMT_MNT_ROOT | STMT_MOUNTPOINT | STMT_FS_TYPE | STMT_SB_OPTS;
>
>         if (arg && strcmp(arg, "-l") == 0) {
>                 list = 1;
>                 arg = argv[2];
>         }
>         if (argc != list + 2)
>                 errx(1, "usage: %s [-l] (mnt_id|path)", argv[0]);
>
>         mnt_id = strtol(arg, &end, 0);
>         if (!mnt_id || *end != '\0') {
>                 struct statx sx;
>
>                 res = statx(AT_FDCWD, arg, 0, STATX_MNT_ID_UNIQUE, &sx);
>                 if (res == -1)
>                         err(1, "%s", arg);
>
>                 if (!(sx.stx_mask & (STATX_MNT_ID | STATX_MNT_ID_UNIQUE)))
>                         errx(1, "Sorry, no mount ID");
>
>                 mnt_id = sx.stx_mnt_id;
>         }
>
>
>         if (list) {
>                 size_t size = 8192;
>                 uint64_t list[size];
>                 long i, num;
>
>                 res = syscall(__NR_listmnt, mnt_id, list, size, 0);
>                 if (res == -1)
>                         err(1, "listmnt(%lu)", mnt_id);
>
>                 num = res;
>                 for (i = 0; i < num; i++) {
>                         printf("0x%lx / ", list[i]);
>
>                         res = syscall(__NR_statmnt, list[i], STMT_MNT_BASIC | STMT_MOUNTPOINT, &buf, sizeof(buf), 0);
>                         if (res == -1) {
>                                 printf("???\t[%s]\n", strerror(errno));
>                         } else {
>                                 printf("%u\t%s\n", st->mnt_id_old,
>                                        (st->mask & STMT_MOUNTPOINT) ? buf + st->mountpoint.off : "???");
>                         }
>                 }
>
>                 return 0;
>         }
>
>         res = syscall(__NR_statmnt, mnt_id, mask, &buf, sizeof(buf), 0);
>         if (res == -1)
>                 err(1, "statmnt(%lu)", mnt_id);
>
>         printf("mask: 0x%llx\n", st->mask);
>         if (st->mask & STMT_SB_BASIC) {
>                 printf("sb_dev_major: %u\n", st->sb_dev_major);
>                 printf("sb_dev_minor: %u\n", st->sb_dev_minor);
>                 printf("sb_magic: 0x%llx\n", st->sb_magic);
>                 printf("sb_flags: 0x%08x\n", st->sb_flags);
>         }
>         if (st->mask & STMT_MNT_BASIC) {
>                 printf("mnt_id: 0x%llx\n", st->mnt_id);
>                 printf("mnt_parent_id: 0x%llx\n", st->mnt_parent_id);
>                 printf("mnt_id_old: %u\n", st->mnt_id_old);
>                 printf("mnt_parent_id_old: %u\n", st->mnt_parent_id_old);
>                 printf("mnt_attr: 0x%08llx\n", st->mnt_attr);
>                 printf("mnt_propagation: %s%s%s%s\n",
>                        st->mnt_propagation & MS_SHARED ? "shared," : "",
>                        st->mnt_propagation & MS_SLAVE ? "slave," : "",
>                        st->mnt_propagation & MS_UNBINDABLE ? "unbindable," : "",
>                        st->mnt_propagation & MS_PRIVATE ? "private" : "");
>                 printf("mnt_peer_group: %llu\n", st->mnt_peer_group);
>                 printf("mnt_master: %llu\n", st->mnt_master);
>         }
>         if (st->mask & STMT_PROPAGATE_FROM) {
>                 printf("propagate_from: %llu\n", st->propagate_from);
>         }
>         if (st->mask & STMT_MNT_ROOT) {
>                 printf("mnt_root: %i/%u <%s>\n", st->mnt_root.off,
>                        st->mnt_root.len, buf + st->mnt_root.off);
>         }
>         if (st->mask & STMT_MOUNTPOINT) {
>                 printf("mountpoint: %i/%u <%s>\n", st->mountpoint.off,
>                        st->mountpoint.len, buf + st->mountpoint.off);
>         }
>         if (st->mask & STMT_FS_TYPE) {
>                 printf("fs_type: %i/%u <%s>\n", st->fs_type.off,
>                        st->fs_type.len, buf + st->fs_type.off);
>         }
>
>         if (st->mask & STMT_SB_OPTS) {
>                 char *p = buf + st->sb_opts.off;
>                 char *end = p + st->sb_opts.len;
>
>                 printf("sb_opts: %i/%u ", st->sb_opts.off, st->sb_opts.len);
>                 for (; p < end; p += strlen(p) + 1)
>                         printf("<%s>, ", p);
>                 printf("\n");
>         }
>
>         return 0;
> }
>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 3/3] add listmnt(2) syscall
  2023-09-14  6:00   ` Amir Goldstein
@ 2023-09-14  8:50     ` Miklos Szeredi
  2023-09-14 10:01       ` Christian Brauner
  2023-09-15  1:00     ` Ian Kent
  1 sibling, 1 reply; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-14  8:50 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, linux-api,
	linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Linus Torvalds, Al Viro, Christian Brauner

On Thu, 14 Sept 2023 at 08:00, Amir Goldstein <amir73il@gmail.com> wrote:

> > +               if (ctr >= bufsize)
> > +                       return -EOVERFLOW;
> > +               if (put_user(r->mnt_id_unique, buf + ctr))
> > +                       return -EFAULT;
> > +               ctr++;
> > +               if (ctr < 0)
> > +                       return -ERANGE;
>
> I think it'd be good for userspace to be able to query required
> bufsize with NULL buf, listattr style, rather than having to
> guess and re-guess on EOVERFLOW.

The getxattr/listxattr style encourages the following code:

  size = get(NULL, 0);
  buf = alloc(size);
  err = get(buf, size);
  if (err)
      /* failure */

Which is wrong, since the needed buffer size could change between the two calls.

Doing it iteratively is the only correct way, and then adding
complexity to both userspace and the kernel for *optimizing* the
iteration is not really worth it, IMO.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 1/3] add unique mount ID
  2023-09-13 15:22 ` [RFC PATCH 1/3] add unique mount ID Miklos Szeredi
@ 2023-09-14  9:03   ` Christian Brauner
  2023-09-14  9:30     ` Miklos Szeredi
  0 siblings, 1 reply; 76+ messages in thread
From: Christian Brauner @ 2023-09-14  9:03 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, linux-api, linux-man,
	linux-security-module, Karel Zak, Ian Kent, David Howells,
	Linus Torvalds, Al Viro, Christian Brauner, Amir Goldstein

On Wed, Sep 13, 2023 at 05:22:34PM +0200, Miklos Szeredi wrote:
> If a mount is released then it's mnt_id can immediately be reused.  This is
> bad news for user interfaces that want to uniquely identify a mount.
> 
> Implementing a unique mount ID is trivial (use a 64bit counter).
> Unfortunately userspace assumes 32bit size and would overflow after the
> counter reaches 2^32.
> 
> Introduce a new 64bit ID alongside the old one.  Allow new interfaces to
> work on both the old and new IDs by starting the counter from 2^32.
> 
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> ---
>  fs/mount.h                | 3 ++-
>  fs/namespace.c            | 4 ++++
>  fs/stat.c                 | 9 +++++++--
>  include/uapi/linux/stat.h | 1 +
>  4 files changed, 14 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/mount.h b/fs/mount.h
> index 130c07c2f8d2..a14f762b3f29 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -72,7 +72,8 @@ struct mount {
>  	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
>  	__u32 mnt_fsnotify_mask;
>  #endif
> -	int mnt_id;			/* mount identifier */
> +	int mnt_id;			/* mount identifier, reused */
> +	u64 mnt_id_unique;		/* mount ID unique until reboot */
>  	int mnt_group_id;		/* peer group identifier */
>  	int mnt_expiry_mark;		/* true if marked for expiry */
>  	struct hlist_head mnt_pins;
> diff --git a/fs/namespace.c b/fs/namespace.c
> index e157efc54023..de47c5f66e17 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -68,6 +68,9 @@ static u64 event;
>  static DEFINE_IDA(mnt_id_ida);
>  static DEFINE_IDA(mnt_group_ida);
>  
> +/* Don't allow confusion with mount ID allocated wit IDA */
> +static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);

Hm, is your concern that userspace confuses these two values? If so, I
think we shouldn't worry about this.

If a userspace program retrieves a mntid and then confuses itself about
what mnt id they're talking about something's very wrong anyway. So I'd
rather not see us waste 32 bits just for that. Other than that this
seems to implement what we agreed on at LSFMM so my hope is that this is
fairly uncontroversial.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-13 15:22 ` [RFC PATCH 2/3] add statmnt(2) syscall Miklos Szeredi
  2023-09-14  6:11   ` Amir Goldstein
@ 2023-09-14  9:27   ` Christian Brauner
  2023-09-14 10:13     ` Miklos Szeredi
  2023-09-14 20:39   ` Paul Moore
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 76+ messages in thread
From: Christian Brauner @ 2023-09-14  9:27 UTC (permalink / raw)
  To: Miklos Szeredi, Linus Torvalds
  Cc: linux-fsdevel, linux-kernel, linux-api, linux-man,
	linux-security-module, Karel Zak, Ian Kent, David Howells,
	Al Viro, Christian Brauner, Amir Goldstein

On Wed, Sep 13, 2023 at 05:22:35PM +0200, Miklos Szeredi wrote:
> Add a way to query attributes of a single mount instead of having to parse
> the complete /proc/$PID/mountinfo, which might be huge.
> 
> Lookup the mount by the old (32bit) or new (64bit) mount ID.  If a mount
> needs to be queried based on path, then statx(2) can be used to first query
> the mount ID belonging to the path.
> 
> Design is based on a suggestion by Linus:
> 
>   "So I'd suggest something that is very much like "statfsat()", which gets
>    a buffer and a length, and returns an extended "struct statfs" *AND*
>    just a string description at the end."

So what we agreed to at LSFMM was that we split filesystem option
retrieval into a separate system call and just have a very focused
statx() for mounts with just binary and non-variable sized information.
We even gave David a hard time about this. :) I would really love if we
could stick to that.

Linus, I realize this was your suggestion a long time ago but I would
really like us to avoid structs with variable sized fields at the end of
a struct. That's just so painful for userspace and universally disliked.
If you care I can even find the LSFMM video where we have users of that
api requesting that we please don't do this. So it'd be great if you
wouldn't insist on it.

This will also allow us to turn statmnt() into an extensible argument
system call versioned by size just like we do any new system calls with
struct arguments (e.g., mount_setattr(), clone3(), openat2() and so on).
Which is how we should do things like that.

Other than that I really think this is on track for what we ultimately
want.

> +struct stmt_str {
> +	__u32 off;
> +	__u32 len;
> +};
> +
> +struct statmnt {
> +	__u64 mask;		/* What results were written [uncond] */
> +	__u32 sb_dev_major;	/* Device ID */
> +	__u32 sb_dev_minor;
> +	__u64 sb_magic;		/* ..._SUPER_MAGIC */
> +	__u32 sb_flags;		/* MS_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
> +	__u32 __spare1;
> +	__u64 mnt_id;		/* Unique ID of mount */
> +	__u64 mnt_parent_id;	/* Unique ID of parent (for root == mnt_id) */
> +	__u32 mnt_id_old;	/* Reused IDs used in proc/.../mountinfo */
> +	__u32 mnt_parent_id_old;
> +	__u64 mnt_attr;		/* MOUNT_ATTR_... */
> +	__u64 mnt_propagation;	/* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
> +	__u64 mnt_peer_group;	/* ID of shared peer group */
> +	__u64 mnt_master;	/* Mount receives propagation from this ID */
> +	__u64 propagate_from;	/* Propagation from in current namespace */
> +	__u64 __spare[20];
> +	struct stmt_str mnt_root;	/* Root of mount relative to root of fs */
> +	struct stmt_str mountpoint;	/* Mountpoint relative to root of process */
> +	struct stmt_str fs_type;	/* Filesystem type[.subtype] */

I think if we want to do this here we should add:

__u64 fs_type
__u64 fs_subtype

fs_type can just be our filesystem magic number and we introduce magic
numbers for sub types as well. So we don't need to use strings here.
Userspace can trivially map the magic numbers to filesystem names. We
don't need to do this for them.

> +	struct stmt_str sb_opts;	/* Super block string options (nul delimted) */
> +};

> 
> The interface closely mimics that of statx.
> 
> Handle ASCII attributes by appending after the end of the structure (as per
> above suggestion).  Allow querying multiple string attributes with
> individual offset/length for each.  String are nul terminated (termination
> isn't counted in length).
> 
> Mount options are also delimited with nul characters.  Unlike proc, special
> characters are not quoted.
> 
> Link: https://lore.kernel.org/all/CAHk-=wh5YifP7hzKSbwJj94+DZ2czjrZsczy6GBimiogZws=rg@mail.gmail.com/
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> ---
>  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>  fs/internal.h                          |   5 +
>  fs/namespace.c                         | 312 ++++++++++++++++++++++++-
>  fs/proc_namespace.c                    |  19 +-
>  fs/statfs.c                            |   1 +
>  include/linux/syscalls.h               |   3 +
>  include/uapi/asm-generic/unistd.h      |   5 +-
>  include/uapi/linux/mount.h             |  36 +++
>  8 files changed, 373 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 1d6eee30eceb..6d807c30cd16 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -375,6 +375,7 @@
>  451	common	cachestat		sys_cachestat
>  452	common	fchmodat2		sys_fchmodat2
>  453	64	map_shadow_stack	sys_map_shadow_stack
> +454	common	statmnt			sys_statmnt
>  
>  #
>  # Due to a historical design error, certain syscalls are numbered differently
> diff --git a/fs/internal.h b/fs/internal.h
> index d64ae03998cc..8f75271428aa 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -83,6 +83,11 @@ int path_mount(const char *dev_name, struct path *path,
>  		const char *type_page, unsigned long flags, void *data_page);
>  int path_umount(struct path *path, int flags);
>  
> +/*
> + * proc_namespace.c
> + */
> +int show_path(struct seq_file *m, struct dentry *root);
> +
>  /*
>   * fs_struct.c
>   */
> diff --git a/fs/namespace.c b/fs/namespace.c
> index de47c5f66e17..088a52043bba 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -69,7 +69,8 @@ static DEFINE_IDA(mnt_id_ida);
>  static DEFINE_IDA(mnt_group_ida);
>  
>  /* Don't allow confusion with mount ID allocated wit IDA */
> -static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);
> +#define OLD_MNT_ID_MAX UINT_MAX
> +static atomic64_t mnt_id_ctr = ATOMIC64_INIT(OLD_MNT_ID_MAX);
>  
>  static struct hlist_head *mount_hashtable __read_mostly;
>  static struct hlist_head *mountpoint_hashtable __read_mostly;
> @@ -4678,6 +4679,315 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
>  	return err;
>  }
>  
> +static bool mnt_id_match(struct mount *mnt, u64 id)
> +{
> +	if (id <= OLD_MNT_ID_MAX)
> +		return id == mnt->mnt_id;
> +	else
> +		return id == mnt->mnt_id_unique;
> +}
> +
> +struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
> +{
> +	struct mount *mnt;
> +	struct vfsmount *res = NULL;
> +
> +	lock_ns_list(ns);
> +	list_for_each_entry(mnt, &ns->list, mnt_list) {
> +		if (!mnt_is_cursor(mnt) && mnt_id_match(mnt, id)) {
> +			res = &mnt->mnt;
> +			break;
> +		}
> +	}
> +	unlock_ns_list(ns);
> +	return res;
> +}
> +
> +struct stmt_state {
> +	void __user *const buf;
> +	size_t const bufsize;
> +	struct vfsmount *const mnt;
> +	u64 const mask;
> +	struct seq_file seq;
> +	struct path root;
> +	struct statmnt sm;
> +	size_t pos;
> +	int err;
> +};
> +
> +typedef int (*stmt_func_t)(struct stmt_state *);
> +
> +static int stmt_string_seq(struct stmt_state *s, stmt_func_t func)
> +{
> +	struct seq_file *seq = &s->seq;
> +	int ret;
> +
> +	seq->count = 0;
> +	seq->size = min_t(size_t, seq->size, s->bufsize - s->pos);
> +	seq->buf = kvmalloc(seq->size, GFP_KERNEL_ACCOUNT);
> +	if (!seq->buf)
> +		return -ENOMEM;
> +
> +	ret = func(s);
> +	if (ret)
> +		return ret;
> +
> +	if (seq_has_overflowed(seq)) {
> +		if (seq->size == s->bufsize - s->pos)
> +			return -EOVERFLOW;
> +		seq->size *= 2;
> +		if (seq->size > MAX_RW_COUNT)
> +			return -ENOMEM;
> +		kvfree(seq->buf);
> +		return 0;
> +	}
> +
> +	/* Done */
> +	return 1;
> +}
> +
> +static void stmt_string(struct stmt_state *s, u64 mask, stmt_func_t func,
> +		       stmt_str_t *str)
> +{
> +	int ret = s->pos >= s->bufsize ? -EOVERFLOW : 0;
> +	struct statmnt *sm = &s->sm;
> +	struct seq_file *seq = &s->seq;
> +
> +	if (s->err || !(s->mask & mask))
> +		return;
> +
> +	seq->size = PAGE_SIZE;
> +	while (!ret)
> +		ret = stmt_string_seq(s, func);
> +
> +	if (ret < 0) {
> +		s->err = ret;
> +	} else {
> +		seq->buf[seq->count++] = '\0';
> +		if (copy_to_user(s->buf + s->pos, seq->buf, seq->count)) {
> +			s->err = -EFAULT;
> +		} else {
> +			str->off = s->pos;
> +			str->len = seq->count - 1;
> +			s->pos += seq->count;
> +		}
> +	}
> +	kvfree(seq->buf);
> +	sm->mask |= mask;
> +}
> +
> +static void stmt_numeric(struct stmt_state *s, u64 mask, stmt_func_t func)
> +{
> +	if (s->err || !(s->mask & mask))
> +		return;
> +
> +	s->err = func(s);
> +	s->sm.mask |= mask;
> +}
> +
> +static u64 mnt_to_attr_flags(struct vfsmount *mnt)
> +{
> +	unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags);
> +	u64 attr_flags = 0;
> +
> +	if (mnt_flags & MNT_READONLY)
> +		attr_flags |= MOUNT_ATTR_RDONLY;
> +	if (mnt_flags & MNT_NOSUID)
> +		attr_flags |= MOUNT_ATTR_NOSUID;
> +	if (mnt_flags & MNT_NODEV)
> +		attr_flags |= MOUNT_ATTR_NODEV;
> +	if (mnt_flags & MNT_NOEXEC)
> +		attr_flags |= MOUNT_ATTR_NOEXEC;
> +	if (mnt_flags & MNT_NODIRATIME)
> +		attr_flags |= MOUNT_ATTR_NODIRATIME;
> +	if (mnt_flags & MNT_NOSYMFOLLOW)
> +		attr_flags |= MOUNT_ATTR_NOSYMFOLLOW;
> +
> +	if (mnt_flags & MNT_NOATIME)
> +		attr_flags |= MOUNT_ATTR_NOATIME;
> +	else if (mnt_flags & MNT_RELATIME)
> +		attr_flags |= MOUNT_ATTR_RELATIME;
> +	else
> +		attr_flags |= MOUNT_ATTR_STRICTATIME;
> +
> +	if (is_idmapped_mnt(mnt))
> +		attr_flags |= MOUNT_ATTR_IDMAP;
> +
> +	return attr_flags;
> +}
> +
> +static u64 mnt_to_propagation_flags(struct mount *m)
> +{
> +	u64 propagation = 0;
> +
> +	if (IS_MNT_SHARED(m))
> +		propagation |= MS_SHARED;
> +	if (IS_MNT_SLAVE(m))
> +		propagation |= MS_SLAVE;
> +	if (IS_MNT_UNBINDABLE(m))
> +		propagation |= MS_UNBINDABLE;
> +	if (!propagation)
> +		propagation |= MS_PRIVATE;
> +
> +	return propagation;
> +}
> +
> +static int stmt_sb_basic(struct stmt_state *s)
> +{
> +	struct super_block *sb = s->mnt->mnt_sb;
> +
> +	s->sm.sb_dev_major = MAJOR(sb->s_dev);
> +	s->sm.sb_dev_minor = MINOR(sb->s_dev);
> +	s->sm.sb_magic = sb->s_magic;
> +	s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME);
> +
> +	return 0;
> +}
> +
> +static int stmt_mnt_basic(struct stmt_state *s)
> +{
> +	struct mount *m = real_mount(s->mnt);
> +
> +	s->sm.mnt_id = m->mnt_id_unique;
> +	s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
> +	s->sm.mnt_id_old = m->mnt_id;
> +	s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
> +	s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
> +	s->sm.mnt_propagation = mnt_to_propagation_flags(m);
> +	s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
> +	s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
> +
> +	return 0;
> +}
> +
> +static int stmt_propagate_from(struct stmt_state *s)
> +{
> +	struct mount *m = real_mount(s->mnt);
> +
> +	if (!IS_MNT_SLAVE(m))
> +		return 0;
> +
> +	s->sm.propagate_from = get_dominating_id(m, &current->fs->root);
> +
> +	return 0;
> +}
> +
> +static int stmt_mnt_root(struct stmt_state *s)
> +{
> +	struct seq_file *seq = &s->seq;
> +	int err = show_path(seq, s->mnt->mnt_root);
> +
> +	if (!err && !seq_has_overflowed(seq)) {
> +		seq->buf[seq->count] = '\0';
> +		seq->count = string_unescape_inplace(seq->buf, UNESCAPE_OCTAL);
> +	}
> +	return err;
> +}
> +
> +static int stmt_mountpoint(struct stmt_state *s)
> +{
> +	struct vfsmount *mnt = s->mnt;
> +	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
> +	int err = seq_path_root(&s->seq, &mnt_path, &s->root, "");
> +
> +	return err == SEQ_SKIP ? 0 : err;
> +}
> +
> +static int stmt_fs_type(struct stmt_state *s)
> +{
> +	struct seq_file *seq = &s->seq;
> +	struct super_block *sb = s->mnt->mnt_sb;
> +
> +	seq_puts(seq, sb->s_type->name);
> +	if (sb->s_subtype) {
> +		seq_putc(seq, '.');
> +		seq_puts(seq, sb->s_subtype);
> +	}
> +	return 0;
> +}
> +
> +static int stmt_sb_opts(struct stmt_state *s)
> +{
> +	struct seq_file *seq = &s->seq;
> +	struct super_block *sb = s->mnt->mnt_sb;
> +	char *p, *end, *next, *u = seq->buf;
> +	int err;
> +
> +	if (!sb->s_op->show_options)
> +		return 0;
> +
> +	err = sb->s_op->show_options(seq, s->mnt->mnt_root);
> +	if (err || seq_has_overflowed(seq) || !seq->count)
> +		return err;
> +
> +	end = seq->buf + seq->count;
> +	*end = '\0';
> +	for (p = seq->buf + 1; p < end; p = next + 1) {
> +		next = strchrnul(p, ',');
> +		*next = '\0';
> +		u += string_unescape(p, u, 0, UNESCAPE_OCTAL) + 1;
> +	}
> +	seq->count = u - 1 - seq->buf;
> +	return 0;
> +}
> +
> +static int do_statmnt(struct stmt_state *s)
> +{
> +	struct statmnt *sm = &s->sm;
> +	struct mount *m = real_mount(s->mnt);
> +
> +	if (!capable(CAP_SYS_ADMIN) &&
> +	    !is_path_reachable(m, m->mnt.mnt_root, &s->root))
> +		return -EPERM;
> +
> +	stmt_numeric(s, STMT_SB_BASIC, stmt_sb_basic);
> +	stmt_numeric(s, STMT_MNT_BASIC, stmt_mnt_basic);
> +	stmt_numeric(s, STMT_PROPAGATE_FROM, stmt_propagate_from);
> +	stmt_string(s, STMT_MNT_ROOT, stmt_mnt_root, &sm->mnt_root);
> +	stmt_string(s, STMT_MOUNTPOINT, stmt_mountpoint, &sm->mountpoint);
> +	stmt_string(s, STMT_FS_TYPE, stmt_fs_type, &sm->fs_type);
> +	stmt_string(s, STMT_SB_OPTS, stmt_sb_opts, &sm->sb_opts);
> +
> +	if (s->err)
> +		return s->err;
> +
> +	if (copy_to_user(s->buf, sm, min_t(size_t, s->bufsize, sizeof(*sm))))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +SYSCALL_DEFINE5(statmnt, u64, mnt_id,
> +		u64, mask, struct statmnt __user *, buf,
> +		size_t, bufsize, unsigned int, flags)
> +{
> +	struct vfsmount *mnt;
> +	int err;
> +
> +	if (flags)
> +		return -EINVAL;
> +
> +	down_read(&namespace_sem);
> +	mnt = lookup_mnt_in_ns(mnt_id, current->nsproxy->mnt_ns);
> +	err = -ENOENT;
> +	if (mnt) {
> +		struct stmt_state s = {
> +			.mask = mask,
> +			.buf = buf,
> +			.bufsize = bufsize,
> +			.mnt = mnt,
> +			.pos = sizeof(*buf),
> +		};
> +
> +		get_fs_root(current->fs, &s.root);
> +		err = do_statmnt(&s);
> +		path_put(&s.root);
> +	}
> +	up_read(&namespace_sem);
> +
> +	return err;
> +}
> +
>  static void __init init_mount_tree(void)
>  {
>  	struct vfsmount *mnt;
> diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
> index 250eb5bf7b52..20681d1f6798 100644
> --- a/fs/proc_namespace.c
> +++ b/fs/proc_namespace.c
> @@ -132,6 +132,15 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
>  	return err;
>  }
>  
> +int show_path(struct seq_file *m, struct dentry *root)
> +{
> +	if (root->d_sb->s_op->show_path)
> +		return root->d_sb->s_op->show_path(m, root);
> +
> +	seq_dentry(m, root, " \t\n\\");
> +	return 0;
> +}
> +
>  static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
>  {
>  	struct proc_mounts *p = m->private;
> @@ -142,13 +151,9 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
>  
>  	seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id,
>  		   MAJOR(sb->s_dev), MINOR(sb->s_dev));
> -	if (sb->s_op->show_path) {
> -		err = sb->s_op->show_path(m, mnt->mnt_root);
> -		if (err)
> -			goto out;
> -	} else {
> -		seq_dentry(m, mnt->mnt_root, " \t\n\\");
> -	}
> +	err = show_path(m, mnt->mnt_root);
> +	if (err)
> +		goto out;
>  	seq_putc(m, ' ');
>  
>  	/* mountpoints outside of chroot jail will give SEQ_SKIP on this */
> diff --git a/fs/statfs.c b/fs/statfs.c
> index 96d1c3edf289..cc774c2e2c9a 100644
> --- a/fs/statfs.c
> +++ b/fs/statfs.c
> @@ -9,6 +9,7 @@
>  #include <linux/security.h>
>  #include <linux/uaccess.h>
>  #include <linux/compat.h>
> +#include <uapi/linux/mount.h>
>  #include "internal.h"
>  
>  static int flags_by_mnt(int mnt_flags)
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 22bc6bc147f8..1099bd307fa7 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -408,6 +408,9 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz,
>  asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf);
>  asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
>  				struct statfs64 __user *buf);
> +asmlinkage long sys_statmnt(u64 mnt_id, u64 mask,
> +			    struct statmnt __user *buf, size_t bufsize,
> +			    unsigned int flags);
>  asmlinkage long sys_truncate(const char __user *path, long length);
>  asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
>  #if BITS_PER_LONG == 32
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index abe087c53b4b..640997231ff6 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -823,8 +823,11 @@ __SYSCALL(__NR_cachestat, sys_cachestat)
>  #define __NR_fchmodat2 452
>  __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
>  
> +#define __NR_statmnt   454
> +__SYSCALL(__NR_statmnt, sys_statmnt)
> +
>  #undef __NR_syscalls
> -#define __NR_syscalls 453
> +#define __NR_syscalls 455
>  
>  /*
>   * 32 bit systems traditionally used different
> diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
> index bb242fdcfe6b..4ec7308a9259 100644
> --- a/include/uapi/linux/mount.h
> +++ b/include/uapi/linux/mount.h
> @@ -138,4 +138,40 @@ struct mount_attr {
>  /* List of all mount_attr versions. */
>  #define MOUNT_ATTR_SIZE_VER0	32 /* sizeof first published struct */
>  
> +struct stmt_str {
> +	__u32 off;
> +	__u32 len;
> +};
> +
> +struct statmnt {
> +	__u64 mask;		/* What results were written [uncond] */
> +	__u32 sb_dev_major;	/* Device ID */
> +	__u32 sb_dev_minor;
> +	__u64 sb_magic;		/* ..._SUPER_MAGIC */
> +	__u32 sb_flags;		/* MS_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
> +	__u32 __spare1;
> +	__u64 mnt_id;		/* Unique ID of mount */
> +	__u64 mnt_parent_id;	/* Unique ID of parent (for root == mnt_id) */
> +	__u32 mnt_id_old;	/* Reused IDs used in proc/.../mountinfo */
> +	__u32 mnt_parent_id_old;
> +	__u64 mnt_attr;		/* MOUNT_ATTR_... */
> +	__u64 mnt_propagation;	/* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
> +	__u64 mnt_peer_group;	/* ID of shared peer group */
> +	__u64 mnt_master;	/* Mount receives propagation from this ID */
> +	__u64 propagate_from;	/* Propagation from in current namespace */
> +	__u64 __spare[20];
> +	struct stmt_str mnt_root;	/* Root of mount relative to root of fs */
> +	struct stmt_str mountpoint;	/* Mountpoint relative to root of process */
> +	struct stmt_str fs_type;	/* Filesystem type[.subtype] */
> +	struct stmt_str sb_opts;	/* Super block string options (nul delimted) */
> +};
> +
> +#define STMT_SB_BASIC		0x00000001U     /* Want/got sb_... */
> +#define STMT_MNT_BASIC		0x00000002U	/* Want/got mnt_... */
> +#define STMT_PROPAGATE_FROM	0x00000004U	/* Want/got propagate_from */
> +#define STMT_MNT_ROOT		0x00000008U	/* Want/got mnt_root  */
> +#define STMT_MOUNTPOINT		0x00000010U	/* Want/got mountpoint */
> +#define STMT_FS_TYPE		0x00000020U	/* Want/got fs_type */
> +#define STMT_SB_OPTS		0x00000040U	/* Want/got sb_opts */
> +
>  #endif /* _UAPI_LINUX_MOUNT_H */
> -- 
> 2.41.0
> 

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 1/3] add unique mount ID
  2023-09-14  9:03   ` Christian Brauner
@ 2023-09-14  9:30     ` Miklos Szeredi
  2023-09-14  9:36       ` Christian Brauner
  0 siblings, 1 reply; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-14  9:30 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, linux-api,
	linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Linus Torvalds, Al Viro, Christian Brauner,
	Amir Goldstein

On Thu, 14 Sept 2023 at 11:04, Christian Brauner <brauner@kernel.org> wrote:
>
> On Wed, Sep 13, 2023 at 05:22:34PM +0200, Miklos Szeredi wrote:
> > If a mount is released then it's mnt_id can immediately be reused.  This is
> > bad news for user interfaces that want to uniquely identify a mount.
> >
> > Implementing a unique mount ID is trivial (use a 64bit counter).
> > Unfortunately userspace assumes 32bit size and would overflow after the
> > counter reaches 2^32.
> >
> > Introduce a new 64bit ID alongside the old one.  Allow new interfaces to
> > work on both the old and new IDs by starting the counter from 2^32.
> >
> > Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> > ---
> >  fs/mount.h                | 3 ++-
> >  fs/namespace.c            | 4 ++++
> >  fs/stat.c                 | 9 +++++++--
> >  include/uapi/linux/stat.h | 1 +
> >  4 files changed, 14 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/mount.h b/fs/mount.h
> > index 130c07c2f8d2..a14f762b3f29 100644
> > --- a/fs/mount.h
> > +++ b/fs/mount.h
> > @@ -72,7 +72,8 @@ struct mount {
> >       struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
> >       __u32 mnt_fsnotify_mask;
> >  #endif
> > -     int mnt_id;                     /* mount identifier */
> > +     int mnt_id;                     /* mount identifier, reused */
> > +     u64 mnt_id_unique;              /* mount ID unique until reboot */
> >       int mnt_group_id;               /* peer group identifier */
> >       int mnt_expiry_mark;            /* true if marked for expiry */
> >       struct hlist_head mnt_pins;
> > diff --git a/fs/namespace.c b/fs/namespace.c
> > index e157efc54023..de47c5f66e17 100644
> > --- a/fs/namespace.c
> > +++ b/fs/namespace.c
> > @@ -68,6 +68,9 @@ static u64 event;
> >  static DEFINE_IDA(mnt_id_ida);
> >  static DEFINE_IDA(mnt_group_ida);
> >
> > +/* Don't allow confusion with mount ID allocated wit IDA */
> > +static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);
>
> Hm, is your concern that userspace confuses these two values? If so, I
> think we shouldn't worry about this.

Yes, one concern is that humans confuse the old and the new ID.

I also think it makes sense to allow the new interfaces to look up the
mount based on either the old or the new ID.   But I could be wrong
there, since that might encourage bad code.  Maybe the new interface
should only take the new ID, which means no mixed use of
/proc/$$/mountinfo and statmnt/listmnt.

>
> If a userspace program retrieves a mntid and then confuses itself about
> what mnt id they're talking about something's very wrong anyway. So I'd
> rather not see us waste 32 bits just for that.

This is wasting a quarter of a billionth of the ID space.  We are
surely not concerned about that.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 1/3] add unique mount ID
  2023-09-14  9:30     ` Miklos Szeredi
@ 2023-09-14  9:36       ` Christian Brauner
  2023-09-14  9:43         ` Miklos Szeredi
  0 siblings, 1 reply; 76+ messages in thread
From: Christian Brauner @ 2023-09-14  9:36 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, linux-api,
	linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Linus Torvalds, Al Viro, Christian Brauner,
	Amir Goldstein

> Yes, one concern is that humans confuse the old and the new ID.
> 
> I also think it makes sense to allow the new interfaces to look up the
> mount based on either the old or the new ID.   But I could be wrong

Hm, mount id recycling may happen so quickly that for service restarts
with a lot of mounts this becomes mostly useless...

> there, since that might encourage bad code.  Maybe the new interface
> should only use take the new ID, which means no mixed use of
> /proc/$$/mountinfo and statmnt/listmnt.

... so I think that is indeed the better way of doing things. There's no
need to encourage userspace to mix both identifiers.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 1/3] add unique mount ID
  2023-09-14  9:36       ` Christian Brauner
@ 2023-09-14  9:43         ` Miklos Szeredi
  2023-09-14 10:06           ` Christian Brauner
  2023-09-15  1:31           ` Ian Kent
  0 siblings, 2 replies; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-14  9:43 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, linux-api,
	linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Linus Torvalds, Al Viro, Christian Brauner,
	Amir Goldstein

On Thu, 14 Sept 2023 at 11:36, Christian Brauner <brauner@kernel.org> wrote:
>
> > Yes, one concern is that humans confuse the old and the new ID.
> >
> > I also think it makes sense to allow the new interfaces to look up the
> > mount based on either the old or the new ID.   But I could be wrong
>
> Hm, mount id recycling may happen so quickly that for service restarts
> with a lot of mounts this becomes mostly useless...

Agreed.  The old ID is mostly useful for human interaction.

>
> > there, since that might encourage bad code.  Maybe the new interface
> > should only use take the new ID, which means no mixed use of
> > /proc/$$/mountinfo and statmnt/listmnt.
>
> ... so I think that is indeed the better way of doing things. There's no
> need to encourage userspace to mix both identifiers.

Okay.

But I'd still leave the 2^32 offset for human confusion avoidance.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 3/3] add listmnt(2) syscall
  2023-09-14  8:50     ` Miklos Szeredi
@ 2023-09-14 10:01       ` Christian Brauner
  0 siblings, 0 replies; 76+ messages in thread
From: Christian Brauner @ 2023-09-14 10:01 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Amir Goldstein, Miklos Szeredi, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Linus Torvalds, Al Viro, Christian Brauner

On Thu, Sep 14, 2023 at 10:50:04AM +0200, Miklos Szeredi wrote:
> On Thu, 14 Sept 2023 at 08:00, Amir Goldstein <amir73il@gmail.com> wrote:
> 
> > > +               if (ctr >= bufsize)
> > > +                       return -EOVERFLOW;
> > > +               if (put_user(r->mnt_id_unique, buf + ctr))
> > > +                       return -EFAULT;
> > > +               ctr++;
> > > +               if (ctr < 0)
> > > +                       return -ERANGE;
> >
> > I think it'd be good for userspace to be able to query required
> > bufsize with NULL buf, listattr style, rather than having to
> > guess and re-guess on EOVERFLOW.
> 
> The getxattr/listxattr style encourages the following code:
> 
>   size = get(NULL, 0);
>   buf = alloc(size);
>   err = get(buf, size);
>   if (err)
>       /* failure */
> 
> Which is wrong, since the needed buffer size could change between the two calls.

Not a fan of this either tbh.

> 
> Doing it iteratively is the only correct way, and then adding
> complexity to both userspace and the kernel for *optimizing* the
> iteration is not really worth it, IMO.

So realistically, userspace knows an upper bound on the number of
mounts in a mount namespace (expressed in /proc/sys/fs/mount-max, usually
100000 - which is often too much ofc).

This is probably insane but I'll power through it: ideally we'd have an
iterator interface that keeps state between calls so we can continue
iterating similar to how readdir/getdents does.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 1/3] add unique mount ID
  2023-09-14  9:43         ` Miklos Szeredi
@ 2023-09-14 10:06           ` Christian Brauner
  2023-09-15  1:31           ` Ian Kent
  1 sibling, 0 replies; 76+ messages in thread
From: Christian Brauner @ 2023-09-14 10:06 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, linux-api,
	linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Linus Torvalds, Al Viro, Christian Brauner,
	Amir Goldstein

> But I'd still leave the 2^32 offset for human confusion avoidance.

Sure, it's really not worth arguing about.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-14  9:27   ` Christian Brauner
@ 2023-09-14 10:13     ` Miklos Szeredi
  2023-09-14 15:26       ` Christian Brauner
  0 siblings, 1 reply; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-14 10:13 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Miklos Szeredi, Linus Torvalds, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Al Viro, Christian Brauner, Amir Goldstein

On Thu, 14 Sept 2023 at 11:28, Christian Brauner <brauner@kernel.org> wrote:
>
> On Wed, Sep 13, 2023 at 05:22:35PM +0200, Miklos Szeredi wrote:
> > Add a way to query attributes of a single mount instead of having to parse
> > the complete /proc/$PID/mountinfo, which might be huge.
> >
> > Lookup the mount by the old (32bit) or new (64bit) mount ID.  If a mount
> > needs to be queried based on path, then statx(2) can be used to first query
> > the mount ID belonging to the path.
> >
> > Design is based on a suggestion by Linus:
> >
> >   "So I'd suggest something that is very much like "statfsat()", which gets
> >    a buffer and a length, and returns an extended "struct statfs" *AND*
> >    just a string description at the end."
>
> So what we agreed to at LSFMM was that we split filesystem option
> retrieval into a separate system call and just have a very focused
> statx() for mounts with just binary and non-variable sized information.
> We even gave David a hard time about this. :) I would really love if we
> could stick to that.
>
> Linus, I realize this was your suggestion a long time ago but I would
> really like us to avoid structs with variable sized fields at the end of
> a struct. That's just so painful for userspace and universally disliked.
> If you care I can even find the LSFMM video where we have users of that
> api requesting that we please don't do this. So it'd be great if you
> wouldn't insist on it.

I completely missed that.

What I'm thinking is making it even simpler for userspace:

struct statmnt {
  ...
  char *mnt_root;
  char *mountpoint;
  char *fs_type;
  u32 num_opts;
  char *opts;
};

I'd still just keep options nul delimited.

Is there a good reason not to return pointers (pointing to within the
supplied buffer obviously) to userspace?

>
> This will also allow us to turn statmnt() into an extensible argument
> system call versioned by size just like we do any new system calls with
> struct arguments (e.g., mount_setattr(), clone3(), openat2() and so on).
> Which is how we should do things like that.

The mask mechanism also allows versioning of the struct.

>
> Other than that I really think this is on track for what we ultimately
> want.
>
> > +struct stmt_str {
> > +     __u32 off;
> > +     __u32 len;
> > +};
> > +
> > +struct statmnt {
> > +     __u64 mask;             /* What results were written [uncond] */
> > +     __u32 sb_dev_major;     /* Device ID */
> > +     __u32 sb_dev_minor;
> > +     __u64 sb_magic;         /* ..._SUPER_MAGIC */
> > +     __u32 sb_flags;         /* MS_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
> > +     __u32 __spare1;
> > +     __u64 mnt_id;           /* Unique ID of mount */
> > +     __u64 mnt_parent_id;    /* Unique ID of parent (for root == mnt_id) */
> > +     __u32 mnt_id_old;       /* Reused IDs used in proc/.../mountinfo */
> > +     __u32 mnt_parent_id_old;
> > +     __u64 mnt_attr;         /* MOUNT_ATTR_... */
> > +     __u64 mnt_propagation;  /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
> > +     __u64 mnt_peer_group;   /* ID of shared peer group */
> > +     __u64 mnt_master;       /* Mount receives propagation from this ID */
> > +     __u64 propagate_from;   /* Propagation from in current namespace */
> > +     __u64 __spare[20];
> > +     struct stmt_str mnt_root;       /* Root of mount relative to root of fs */
> > +     struct stmt_str mountpoint;     /* Mountpoint relative to root of process */
> > +     struct stmt_str fs_type;        /* Filesystem type[.subtype] */
>
> I think if we want to do this here we should add:
>
> __u64 fs_type
> __u64 fs_subtype
>
> fs_type can just be our filesystem magic number and we introduce magic

It's already there: sb_magic.

However it's not a 1:1 mapping (ext* only has one magic).

> numbers for sub types as well. So we don't need to use strings here.

Ugh.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-14 10:13     ` Miklos Szeredi
@ 2023-09-14 15:26       ` Christian Brauner
  2023-09-15  8:56         ` Miklos Szeredi
  2023-09-18 14:29         ` Jeff Layton
  0 siblings, 2 replies; 76+ messages in thread
From: Christian Brauner @ 2023-09-14 15:26 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Miklos Szeredi, Linus Torvalds, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Al Viro, Christian Brauner, Amir Goldstein

On Thu, Sep 14, 2023 at 12:13:54PM +0200, Miklos Szeredi wrote:
> On Thu, 14 Sept 2023 at 11:28, Christian Brauner <brauner@kernel.org> wrote:
> >
> > On Wed, Sep 13, 2023 at 05:22:35PM +0200, Miklos Szeredi wrote:
> > > Add a way to query attributes of a single mount instead of having to parse
> > > the complete /proc/$PID/mountinfo, which might be huge.
> > >
> > > Lookup the mount by the old (32bit) or new (64bit) mount ID.  If a mount
> > > needs to be queried based on path, then statx(2) can be used to first query
> > > the mount ID belonging to the path.
> > >
> > > Design is based on a suggestion by Linus:
> > >
> > >   "So I'd suggest something that is very much like "statfsat()", which gets
> > >    a buffer and a length, and returns an extended "struct statfs" *AND*
> > >    just a string description at the end."
> >
> > So what we agreed to at LSFMM was that we split filesystem option
> > retrieval into a separate system call and just have a very focused
> > statx() for mounts with just binary and non-variable sized information.
> > We even gave David a hard time about this. :) I would really love if we
> > could stick to that.
> >
> > Linus, I realize this was your suggestion a long time ago but I would
> > really like us to avoid structs with variable sized fields at the end of
> > a struct. That's just so painful for userspace and universally disliked.
> > If you care I can even find the LSFMM video where we have users of that
> > api requesting that we please don't do this. So it'd be great if you
> > wouldn't insist on it.
> 
> I completely missed that.

No worries, I think the discussion touching on this starts at:
https://youtu.be/j3fp2MtRr2I?si=f-YBg6uWq80dV3VC&t=1603
(with David talking quietly without a microphone for some parts
unfortunately...)

> What I'm thinking is making it even simpler for userspace:
> 
> struct statmnt {
>   ...
>   char *mnt_root;
>   char *mountpoint;
>   char *fs_type;
>   u32 num_opts;
>   char *opts;
> };
> 
> I'd still just keep options nul delimited.
> 
> Is there a good reason not to return pointers (pointing to within the
> supplied buffer obviously) to userspace?

It's really unpleasant to program with. Yes, I think you pointed out
before that it often doesn't matter much as long as the system call is
really only relevant to some special purpose userspace.

But statmount() will be used pretty extensively pretty quickly for the
purpose of finding out mount options on a mount. (Querying a whole
sequence of mounts via repeated listmount() + statmount() calls, on the
other hand, will be rarer.)

And there's just so many tools that need this: libmount, systemd, all
kinds of container runtimes, path lookup libraries such as libpathrs,
languages like go and rust that expose and wrap these calls and so on.

Most of these tools don't need to know about filesystem mount options
and if they do they can just query that through an extra system call. No
harm in doing that.

The agreement we came to to split out listing submounts into a separate
system call was exactly to avoid having to have a variable sized pointer
at the end of the struct statmnt (That's also part of the video above
btw.) and to make it as simple as possible.

Plus, the format for how to return arbitrary filesystem mount options
warrants a separate discussion imho as that's not really vfs level
information.

> > This will also allow us to turn statmnt() into an extensible argument
> > system call versioned by size just like we do any new system calls with
> > struct arguments (e.g., mount_setattr(), clone3(), openat2() and so on).
> > Which is how we should do things like that.
> 
> The mask mechanism also allows versioning of the struct.

Yes, but this is done with reserved space which just pushes away the
problem and bloats the struct for the sake of an unknown future. If we
were to use an extensible argument struct we would just version by size.
The only requirement is that you extend in 64-bit units (see struct
clone_args, which has already been extended this way).

> 
> >
> > Other than that I really think this is on track for what we ultimately
> > want.
> >
> > > +struct stmt_str {
> > > +     __u32 off;
> > > +     __u32 len;
> > > +};
> > > +
> > > +struct statmnt {
> > > +     __u64 mask;             /* What results were written [uncond] */
> > > +     __u32 sb_dev_major;     /* Device ID */
> > > +     __u32 sb_dev_minor;
> > > +     __u64 sb_magic;         /* ..._SUPER_MAGIC */
> > > +     __u32 sb_flags;         /* MS_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
> > > +     __u32 __spare1;
> > > +     __u64 mnt_id;           /* Unique ID of mount */
> > > +     __u64 mnt_parent_id;    /* Unique ID of parent (for root == mnt_id) */
> > > +     __u32 mnt_id_old;       /* Reused IDs used in proc/.../mountinfo */
> > > +     __u32 mnt_parent_id_old;
> > > +     __u64 mnt_attr;         /* MOUNT_ATTR_... */
> > > +     __u64 mnt_propagation;  /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
> > > +     __u64 mnt_peer_group;   /* ID of shared peer group */
> > > +     __u64 mnt_master;       /* Mount receives propagation from this ID */
> > > +     __u64 propagate_from;   /* Propagation from in current namespace */
> > > +     __u64 __spare[20];
> > > +     struct stmt_str mnt_root;       /* Root of mount relative to root of fs */
> > > +     struct stmt_str mountpoint;     /* Mountpoint relative to root of process */
> > > +     struct stmt_str fs_type;        /* Filesystem type[.subtype] */
> >
> > I think if we want to do this here we should add:
> >
> > __u64 fs_type
> > __u64 fs_subtype
> >
> > fs_type can just be our filesystem magic number and we introduce magic
> 
> It's already there: sb_magic.
> 
> However it's not a 1:1 mapping (ext* only has one magic).

That's a very odd choice but probably fixable by giving it a subtype.

> 
> > numbers for sub types as well. So we don't need to use strings here.
> 
> Ugh.

Hm, idk. It's not that bad imho. We'll have to make some ugly tradeoffs.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-13 15:22 ` [RFC PATCH 2/3] add statmnt(2) syscall Miklos Szeredi
  2023-09-14  6:11   ` Amir Goldstein
  2023-09-14  9:27   ` Christian Brauner
@ 2023-09-14 20:39   ` Paul Moore
  2023-09-15  9:10     ` Miklos Szeredi
  2023-09-17 18:18   ` Sargun Dhillon
  2023-09-25 12:57   ` Arnd Bergmann
  4 siblings, 1 reply; 76+ messages in thread
From: Paul Moore @ 2023-09-14 20:39 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, linux-api, linux-man,
	linux-security-module, Karel Zak, Ian Kent, David Howells,
	Linus Torvalds, Al Viro, Christian Brauner, Amir Goldstein

On Wed, Sep 13, 2023 at 11:23 AM Miklos Szeredi <mszeredi@redhat.com> wrote:
>
> Add a way to query attributes of a single mount instead of having to parse
> the complete /proc/$PID/mountinfo, which might be huge.
>
> Lookup the mount by the old (32bit) or new (64bit) mount ID.  If a mount
> needs to be queried based on path, then statx(2) can be used to first query
> the mount ID belonging to the path.
>
> Design is based on a suggestion by Linus:
>
>   "So I'd suggest something that is very much like "statfsat()", which gets
>    a buffer and a length, and returns an extended "struct statfs" *AND*
>    just a string description at the end."
>
> The interface closely mimics that of statx.
>
> Handle ASCII attributes by appending after the end of the structure (as per
> above suggestion).  Allow querying multiple string attributes with
> individual offset/length for each.  String are nul terminated (termination
> isn't counted in length).
>
> Mount options are also delimited with nul characters.  Unlike proc, special
> characters are not quoted.
>
> Link: https://lore.kernel.org/all/CAHk-=wh5YifP7hzKSbwJj94+DZ2czjrZsczy6GBimiogZws=rg@mail.gmail.com/
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> ---
>  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>  fs/internal.h                          |   5 +
>  fs/namespace.c                         | 312 ++++++++++++++++++++++++-
>  fs/proc_namespace.c                    |  19 +-
>  fs/statfs.c                            |   1 +
>  include/linux/syscalls.h               |   3 +
>  include/uapi/asm-generic/unistd.h      |   5 +-
>  include/uapi/linux/mount.h             |  36 +++
>  8 files changed, 373 insertions(+), 9 deletions(-)

...

> diff --git a/fs/namespace.c b/fs/namespace.c
> index de47c5f66e17..088a52043bba 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c

...

> +static int do_statmnt(struct stmt_state *s)
> +{
> +       struct statmnt *sm = &s->sm;
> +       struct mount *m = real_mount(s->mnt);
> +
> +       if (!capable(CAP_SYS_ADMIN) &&
> +           !is_path_reachable(m, m->mnt.mnt_root, &s->root))
> +               return -EPERM;

I realize statmnt() is different from fstatfs(), but from an access
control perspective they look a lot alike to me which is why I think
we should probably have a security_sb_statfs() call here.  Same thing
for the listmnt() syscall in patch 3/3.

> +       stmt_numeric(s, STMT_SB_BASIC, stmt_sb_basic);
> +       stmt_numeric(s, STMT_MNT_BASIC, stmt_mnt_basic);
> +       stmt_numeric(s, STMT_PROPAGATE_FROM, stmt_propagate_from);
> +       stmt_string(s, STMT_MNT_ROOT, stmt_mnt_root, &sm->mnt_root);
> +       stmt_string(s, STMT_MOUNTPOINT, stmt_mountpoint, &sm->mountpoint);
> +       stmt_string(s, STMT_FS_TYPE, stmt_fs_type, &sm->fs_type);
> +       stmt_string(s, STMT_SB_OPTS, stmt_sb_opts, &sm->sb_opts);
> +
> +       if (s->err)
> +               return s->err;
> +
> +       if (copy_to_user(s->buf, sm, min_t(size_t, s->bufsize, sizeof(*sm))))
> +               return -EFAULT;
> +
> +       return 0;
> +}

-- 
paul-moore.com

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 3/3] add listmnt(2) syscall
  2023-09-14  6:00   ` Amir Goldstein
  2023-09-14  8:50     ` Miklos Szeredi
@ 2023-09-15  1:00     ` Ian Kent
  1 sibling, 0 replies; 76+ messages in thread
From: Ian Kent @ 2023-09-15  1:00 UTC (permalink / raw)
  To: Amir Goldstein, Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, linux-api, linux-man,
	linux-security-module, Karel Zak, David Howells, Linus Torvalds,
	Al Viro, Christian Brauner

On 14/9/23 14:00, Amir Goldstein wrote:
> On Wed, Sep 13, 2023 at 6:22 PM Miklos Szeredi <mszeredi@redhat.com> wrote:
>> Add way to query the children of a particular mount.  This is a more
>> flexible way to iterate the mount tree than having to parse the complete
>> /proc/self/mountinfo.
>>
>> Lookup the mount by the old (32bit) or new (64bit) mount ID.  If a mount
>> needs to be queried based on path, then statx(2) can be used to first query
>> the mount ID belonging to the path.
>>
>> Return an array of new (64bit) mount ID's.  Without privileges only mounts
>> are listed which are reachable from the task's root.
>>
>> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
>> ---
>>   arch/x86/entry/syscalls/syscall_64.tbl |  1 +
>>   fs/namespace.c                         | 51 ++++++++++++++++++++++++++
>>   include/linux/syscalls.h               |  2 +
>>   include/uapi/asm-generic/unistd.h      |  5 ++-
>>   4 files changed, 58 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
>> index 6d807c30cd16..0d9a47b0ce9b 100644
>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>> @@ -376,6 +376,7 @@
>>   452    common  fchmodat2               sys_fchmodat2
>>   453    64      map_shadow_stack        sys_map_shadow_stack
>>   454    common  statmnt                 sys_statmnt
>> +455    common  listmnt                 sys_listmnt
>>
>>   #
>>   # Due to a historical design error, certain syscalls are numbered differently
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index 088a52043bba..5362b1ffb26f 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -4988,6 +4988,57 @@ SYSCALL_DEFINE5(statmnt, u64, mnt_id,
>>          return err;
>>   }
>>
>> +static long do_listmnt(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
>> +                     const struct path *root)
>> +{
>> +       struct mount *r, *m = real_mount(mnt);
>> +       struct path rootmnt = { .mnt = root->mnt, .dentry = root->mnt->mnt_root };
>> +       long ctr = 0;
>> +
>> +       if (!capable(CAP_SYS_ADMIN) &&
>> +           !is_path_reachable(m, mnt->mnt_root, &rootmnt))
>> +               return -EPERM;
>> +
>> +       list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
>> +               if (!capable(CAP_SYS_ADMIN) &&
>> +                   !is_path_reachable(r, r->mnt.mnt_root, root))
>> +                       continue;
>> +
>> +               if (ctr >= bufsize)
>> +                       return -EOVERFLOW;
>> +               if (put_user(r->mnt_id_unique, buf + ctr))
>> +                       return -EFAULT;
>> +               ctr++;
>> +               if (ctr < 0)
>> +                       return -ERANGE;
> I think it'd be good for userspace to be able to query required
> bufsize with NULL buf, listattr style, rather than having to
> guess and re-guess on EOVERFLOW.

Agreed, I also think that would be useful.


Ian


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-14  6:11   ` Amir Goldstein
@ 2023-09-15  1:05     ` Ian Kent
  0 siblings, 0 replies; 76+ messages in thread
From: Ian Kent @ 2023-09-15  1:05 UTC (permalink / raw)
  To: Amir Goldstein, Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, linux-api, linux-man,
	linux-security-module, Karel Zak, David Howells, Linus Torvalds,
	Al Viro, Christian Brauner

On 14/9/23 14:11, Amir Goldstein wrote:
> On Wed, Sep 13, 2023 at 6:22 PM Miklos Szeredi <mszeredi@redhat.com> wrote:
>> Add a way to query attributes of a single mount instead of having to parse
>> the complete /proc/$PID/mountinfo, which might be huge.
>>
>> Lookup the mount by the old (32bit) or new (64bit) mount ID.  If a mount
>> needs to be queried based on path, then statx(2) can be used to first query
>> the mount ID belonging to the path.
>>
>> Design is based on a suggestion by Linus:
>>
>>    "So I'd suggest something that is very much like "statfsat()", which gets
>>     a buffer and a length, and returns an extended "struct statfs" *AND*
>>     just a string description at the end."
>>
>> The interface closely mimics that of statx.
>>
>> Handle ASCII attributes by appending after the end of the structure (as per
>> above suggestion).  Allow querying multiple string attributes with
>> individual offset/length for each.  String are nul terminated (termination
>> isn't counted in length).
>>
>> Mount options are also delimited with nul characters.  Unlike proc, special
>> characters are not quoted.
>>
>> Link: https://lore.kernel.org/all/CAHk-=wh5YifP7hzKSbwJj94+DZ2czjrZsczy6GBimiogZws=rg@mail.gmail.com/
>> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
>> ---
>>   arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>>   fs/internal.h                          |   5 +
>>   fs/namespace.c                         | 312 ++++++++++++++++++++++++-
>>   fs/proc_namespace.c                    |  19 +-
>>   fs/statfs.c                            |   1 +
>>   include/linux/syscalls.h               |   3 +
>>   include/uapi/asm-generic/unistd.h      |   5 +-
>>   include/uapi/linux/mount.h             |  36 +++
>>   8 files changed, 373 insertions(+), 9 deletions(-)
>>
>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
>> index 1d6eee30eceb..6d807c30cd16 100644
>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>> @@ -375,6 +375,7 @@
>>   451    common  cachestat               sys_cachestat
>>   452    common  fchmodat2               sys_fchmodat2
>>   453    64      map_shadow_stack        sys_map_shadow_stack
>> +454    common  statmnt                 sys_statmnt
>>
>>   #
>>   # Due to a historical design error, certain syscalls are numbered differently
>> diff --git a/fs/internal.h b/fs/internal.h
>> index d64ae03998cc..8f75271428aa 100644
>> --- a/fs/internal.h
>> +++ b/fs/internal.h
>> @@ -83,6 +83,11 @@ int path_mount(const char *dev_name, struct path *path,
>>                  const char *type_page, unsigned long flags, void *data_page);
>>   int path_umount(struct path *path, int flags);
>>
>> +/*
>> + * proc_namespace.c
>> + */
>> +int show_path(struct seq_file *m, struct dentry *root);
>> +
>>   /*
>>    * fs_struct.c
>>    */
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index de47c5f66e17..088a52043bba 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -69,7 +69,8 @@ static DEFINE_IDA(mnt_id_ida);
>>   static DEFINE_IDA(mnt_group_ida);
>>
>>   /* Don't allow confusion with mount ID allocated wit IDA */
>> -static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);
>> +#define OLD_MNT_ID_MAX UINT_MAX
>> +static atomic64_t mnt_id_ctr = ATOMIC64_INIT(OLD_MNT_ID_MAX);
>>
>>   static struct hlist_head *mount_hashtable __read_mostly;
>>   static struct hlist_head *mountpoint_hashtable __read_mostly;
>> @@ -4678,6 +4679,315 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
>>          return err;
>>   }
>>
>> +static bool mnt_id_match(struct mount *mnt, u64 id)
>> +{
>> +       if (id <= OLD_MNT_ID_MAX)
>> +               return id == mnt->mnt_id;
>> +       else
>> +               return id == mnt->mnt_id_unique;
>> +}
>> +
>> +struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
>> +{
>> +       struct mount *mnt;
>> +       struct vfsmount *res = NULL;
>> +
>> +       lock_ns_list(ns);
>> +       list_for_each_entry(mnt, &ns->list, mnt_list) {
>> +               if (!mnt_is_cursor(mnt) && mnt_id_match(mnt, id)) {
>> +                       res = &mnt->mnt;
>> +                       break;
>> +               }
>> +       }
>> +       unlock_ns_list(ns);
>> +       return res;
>> +}
>> +
>> +struct stmt_state {
>> +       void __user *const buf;
>> +       size_t const bufsize;
>> +       struct vfsmount *const mnt;
>> +       u64 const mask;
>> +       struct seq_file seq;
>> +       struct path root;
>> +       struct statmnt sm;
>> +       size_t pos;
>> +       int err;
>> +};
>> +
>> +typedef int (*stmt_func_t)(struct stmt_state *);
>> +
>> +static int stmt_string_seq(struct stmt_state *s, stmt_func_t func)
>> +{
>> +       struct seq_file *seq = &s->seq;
>> +       int ret;
>> +
>> +       seq->count = 0;
>> +       seq->size = min_t(size_t, seq->size, s->bufsize - s->pos);
>> +       seq->buf = kvmalloc(seq->size, GFP_KERNEL_ACCOUNT);
>> +       if (!seq->buf)
>> +               return -ENOMEM;
>> +
>> +       ret = func(s);
>> +       if (ret)
>> +               return ret;
>> +
>> +       if (seq_has_overflowed(seq)) {
>> +               if (seq->size == s->bufsize - s->pos)
>> +                       return -EOVERFLOW;
>> +               seq->size *= 2;
>> +               if (seq->size > MAX_RW_COUNT)
>> +                       return -ENOMEM;
>> +               kvfree(seq->buf);
>> +               return 0;
>> +       }
>> +
>> +       /* Done */
>> +       return 1;
>> +}
>> +
>> +static void stmt_string(struct stmt_state *s, u64 mask, stmt_func_t func,
>> +                      stmt_str_t *str)
>> +{
>> +       int ret = s->pos >= s->bufsize ? -EOVERFLOW : 0;
>> +       struct statmnt *sm = &s->sm;
>> +       struct seq_file *seq = &s->seq;
>> +
>> +       if (s->err || !(s->mask & mask))
>> +               return;
>> +
>> +       seq->size = PAGE_SIZE;
>> +       while (!ret)
>> +               ret = stmt_string_seq(s, func);
>> +
>> +       if (ret < 0) {
>> +               s->err = ret;
>> +       } else {
>> +               seq->buf[seq->count++] = '\0';
>> +               if (copy_to_user(s->buf + s->pos, seq->buf, seq->count)) {
>> +                       s->err = -EFAULT;
>> +               } else {
>> +                       str->off = s->pos;
>> +                       str->len = seq->count - 1;
>> +                       s->pos += seq->count;
>> +               }
>> +       }
>> +       kvfree(seq->buf);
>> +       sm->mask |= mask;
>> +}
>> +
>> +static void stmt_numeric(struct stmt_state *s, u64 mask, stmt_func_t func)
>> +{
>> +       if (s->err || !(s->mask & mask))
>> +               return;
>> +
>> +       s->err = func(s);
>> +       s->sm.mask |= mask;
>> +}
>> +
>> +static u64 mnt_to_attr_flags(struct vfsmount *mnt)
>> +{
>> +       unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags);
>> +       u64 attr_flags = 0;
>> +
>> +       if (mnt_flags & MNT_READONLY)
>> +               attr_flags |= MOUNT_ATTR_RDONLY;
>> +       if (mnt_flags & MNT_NOSUID)
>> +               attr_flags |= MOUNT_ATTR_NOSUID;
>> +       if (mnt_flags & MNT_NODEV)
>> +               attr_flags |= MOUNT_ATTR_NODEV;
>> +       if (mnt_flags & MNT_NOEXEC)
>> +               attr_flags |= MOUNT_ATTR_NOEXEC;
>> +       if (mnt_flags & MNT_NODIRATIME)
>> +               attr_flags |= MOUNT_ATTR_NODIRATIME;
>> +       if (mnt_flags & MNT_NOSYMFOLLOW)
>> +               attr_flags |= MOUNT_ATTR_NOSYMFOLLOW;
>> +
>> +       if (mnt_flags & MNT_NOATIME)
>> +               attr_flags |= MOUNT_ATTR_NOATIME;
>> +       else if (mnt_flags & MNT_RELATIME)
>> +               attr_flags |= MOUNT_ATTR_RELATIME;
>> +       else
>> +               attr_flags |= MOUNT_ATTR_STRICTATIME;
>> +
>> +       if (is_idmapped_mnt(mnt))
>> +               attr_flags |= MOUNT_ATTR_IDMAP;
>> +
>> +       return attr_flags;
>> +}
>> +
>> +static u64 mnt_to_propagation_flags(struct mount *m)
>> +{
>> +       u64 propagation = 0;
>> +
>> +       if (IS_MNT_SHARED(m))
>> +               propagation |= MS_SHARED;
>> +       if (IS_MNT_SLAVE(m))
>> +               propagation |= MS_SLAVE;
>> +       if (IS_MNT_UNBINDABLE(m))
>> +               propagation |= MS_UNBINDABLE;
>> +       if (!propagation)
>> +               propagation |= MS_PRIVATE;
>> +
>> +       return propagation;
>> +}
>> +
>> +static int stmt_sb_basic(struct stmt_state *s)
>> +{
>> +       struct super_block *sb = s->mnt->mnt_sb;
>> +
>> +       s->sm.sb_dev_major = MAJOR(sb->s_dev);
>> +       s->sm.sb_dev_minor = MINOR(sb->s_dev);
>> +       s->sm.sb_magic = sb->s_magic;
>> +       s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME);
>> +
>> +       return 0;
>> +}
>> +
>> +static int stmt_mnt_basic(struct stmt_state *s)
>> +{
>> +       struct mount *m = real_mount(s->mnt);
>> +
>> +       s->sm.mnt_id = m->mnt_id_unique;
>> +       s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
>> +       s->sm.mnt_id_old = m->mnt_id;
>> +       s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
>> +       s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
>> +       s->sm.mnt_propagation = mnt_to_propagation_flags(m);
>> +       s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
>> +       s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
>> +
>> +       return 0;
>> +}
>> +
>> +static int stmt_propagate_from(struct stmt_state *s)
>> +{
>> +       struct mount *m = real_mount(s->mnt);
>> +
>> +       if (!IS_MNT_SLAVE(m))
>> +               return 0;
>> +
>> +       s->sm.propagate_from = get_dominating_id(m, &current->fs->root);
>> +
>> +       return 0;
>> +}
>> +
>> +static int stmt_mnt_root(struct stmt_state *s)
>> +{
>> +       struct seq_file *seq = &s->seq;
>> +       int err = show_path(seq, s->mnt->mnt_root);
>> +
>> +       if (!err && !seq_has_overflowed(seq)) {
>> +               seq->buf[seq->count] = '\0';
>> +               seq->count = string_unescape_inplace(seq->buf, UNESCAPE_OCTAL);
>> +       }
>> +       return err;
>> +}
>> +
>> +static int stmt_mountpoint(struct stmt_state *s)
>> +{
>> +       struct vfsmount *mnt = s->mnt;
>> +       struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
>> +       int err = seq_path_root(&s->seq, &mnt_path, &s->root, "");
>> +
>> +       return err == SEQ_SKIP ? 0 : err;
>> +}
>> +
>> +static int stmt_fs_type(struct stmt_state *s)
>> +{
>> +       struct seq_file *seq = &s->seq;
>> +       struct super_block *sb = s->mnt->mnt_sb;
>> +
>> +       seq_puts(seq, sb->s_type->name);
>> +       if (sb->s_subtype) {
>> +               seq_putc(seq, '.');
>> +               seq_puts(seq, sb->s_subtype);
>> +       }
>> +       return 0;
>> +}
>> +
>> +static int stmt_sb_opts(struct stmt_state *s)
>> +{
>> +       struct seq_file *seq = &s->seq;
>> +       struct super_block *sb = s->mnt->mnt_sb;
>> +       char *p, *end, *next, *u = seq->buf;
>> +       int err;
>> +
>> +       if (!sb->s_op->show_options)
>> +               return 0;
>> +
>> +       err = sb->s_op->show_options(seq, s->mnt->mnt_root);
>> +       if (err || seq_has_overflowed(seq) || !seq->count)
>> +               return err;
>> +
>> +       end = seq->buf + seq->count;
>> +       *end = '\0';
>> +       for (p = seq->buf + 1; p < end; p = next + 1) {
>> +               next = strchrnul(p, ',');
>> +               *next = '\0';
>> +               u += string_unescape(p, u, 0, UNESCAPE_OCTAL) + 1;
>> +       }
>> +       seq->count = u - 1 - seq->buf;
>> +       return 0;
>> +}
>> +
>> +static int do_statmnt(struct stmt_state *s)
>> +{
>> +       struct statmnt *sm = &s->sm;
>> +       struct mount *m = real_mount(s->mnt);
>> +
>> +       if (!capable(CAP_SYS_ADMIN) &&
>> +           !is_path_reachable(m, m->mnt.mnt_root, &s->root))
>> +               return -EPERM;
>> +
>> +       stmt_numeric(s, STMT_SB_BASIC, stmt_sb_basic);
>> +       stmt_numeric(s, STMT_MNT_BASIC, stmt_mnt_basic);
>> +       stmt_numeric(s, STMT_PROPAGATE_FROM, stmt_propagate_from);
>> +       stmt_string(s, STMT_MNT_ROOT, stmt_mnt_root, &sm->mnt_root);
>> +       stmt_string(s, STMT_MOUNTPOINT, stmt_mountpoint, &sm->mountpoint);
>> +       stmt_string(s, STMT_FS_TYPE, stmt_fs_type, &sm->fs_type);
>> +       stmt_string(s, STMT_SB_OPTS, stmt_sb_opts, &sm->sb_opts);
>> +
>> +       if (s->err)
>> +               return s->err;
>> +
>> +       if (copy_to_user(s->buf, sm, min_t(size_t, s->bufsize, sizeof(*sm))))
>> +               return -EFAULT;
>> +
>> +       return 0;
> Similar concern as with listmnt, I think that users would
> want to have a way to get the fixed size statmnt part that fits
> in the buffer, even if the variable length string values do not fit
> and be able to query the required buffer size to get the strings.
>
> The API could be either to explicitly request
> STMT_MNT_ROOT_LEN | STMT_MOUNTPOINT_LEN ...
> without allowing mixing of no-value and value requests,
> or to opt out of any string values using a single flag,
> which is probably simpler for both API and implementation.


There is also the possibility that the size needed to satisfy the
request will change between request and call, not sure how to deal
with that, but the size estimate is needed ...


Ian


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 0/3] quering mount attributes
  2023-09-14  6:47 ` [RFC PATCH 0/3] quering mount attributes Amir Goldstein
@ 2023-09-15  1:20   ` Ian Kent
  2023-09-15  3:06     ` Amir Goldstein
  0 siblings, 1 reply; 76+ messages in thread
From: Ian Kent @ 2023-09-15  1:20 UTC (permalink / raw)
  To: Amir Goldstein, Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, linux-api, linux-man,
	linux-security-module, Karel Zak, David Howells, Linus Torvalds,
	Al Viro, Christian Brauner

On 14/9/23 14:47, Amir Goldstein wrote:
> On Wed, Sep 13, 2023 at 6:22 PM Miklos Szeredi <mszeredi@redhat.com> wrote:
>> Implement the mount querying syscalls agreed on at LSF/MM 2023.  This is an
>> RFC with just x86_64 syscalls.
>>
>> Excepting notification this should allow full replacement for
>> parsing /proc/self/mountinfo.
> Since you mentioned notifications, I will add that the plan discussed
> in LFSMM was, once we have an API to query mount stats and children,
> implement fanotify events for:
> mount [mntuid] was un/mounted at [parent mntuid],[dirfid+name]
>
> As with other fanotify events, the self mntuid and dirfid+name
> information can be omitted and without it, multiple un/mount events
> from the same parent mntuid will be merged, allowing userspace
> to listmnt() periodically only mntuid whose child mounts have changed,
> with little risk of event queue overflow.
>
> The possible monitoring scopes would be the entire mount namespace
> of the monitoring program or watching a single mount for change in
> its children mounts. The latter is similar to inotify directory children watch,
> where the watches needs to be set recursively, with all the weight on
> userspace to avoid races.

It's been my belief that the existing notification mechanisms don't

quite fully satisfy the needs of users of these calls (aka. the need

I found when implementing David's original calls into systemd).


Specifically the ability to process a batch of notifications at once.

Admittedly the notifications mechanism that David originally implemented

didn't fully implement what I found I needed but it did provide for a

settable queue length and getting a batch of notifications at a time.


Am I mistaken in my belief?


Don't misunderstand me, it would be great for the existing notification

mechanisms to support these system calls, I just have a specific use case

in mind that I think is important, at least to me.


Ian


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 1/3] add unique mount ID
  2023-09-14  9:43         ` Miklos Szeredi
  2023-09-14 10:06           ` Christian Brauner
@ 2023-09-15  1:31           ` Ian Kent
  1 sibling, 0 replies; 76+ messages in thread
From: Ian Kent @ 2023-09-15  1:31 UTC (permalink / raw)
  To: Miklos Szeredi, Christian Brauner
  Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, linux-api,
	linux-man, linux-security-module, Karel Zak, David Howells,
	Linus Torvalds, Al Viro, Christian Brauner, Amir Goldstein

On 14/9/23 17:43, Miklos Szeredi wrote:
> On Thu, 14 Sept 2023 at 11:36, Christian Brauner <brauner@kernel.org> wrote:
>>> Yes, one concern is that humans confuse the old and the new ID.
>>>
>>> I also think it makes sense to allow the new interfaces to look up the
>>> mount based on either the old or the new ID.   But I could be wrong
>> Hm, mount id recycling may happen so quickly that for service restarts
>> with a lot of mounts this becomes mostly useless...
> Agreed.  The old ID is mostly useful for human interaction.
>
>>> there, since that might encourage bad code.  Maybe the new interface
>>> should only take the new ID, which means no mixed use of
>>> /proc/$$/mountinfo and statmnt/listmnt.
>> ... so I think that is indeed the better way of doing things. There's no
>> need to encourage userspace to mix both identifiers.
> Okay.

I think having both is leaving more opportunity for confusion and the new

mount id has a different name so I think start the move to using that now

and only allow the new one for lookups.


>
> But I'd still leave the 2^32 offset for human confusion avoidance.

Yep, agreed.


Ian


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 0/3] quering mount attributes
  2023-09-15  1:20   ` Ian Kent
@ 2023-09-15  3:06     ` Amir Goldstein
  2023-09-16  2:04       ` Ian Kent
  2023-09-16  2:19       ` Ian Kent
  0 siblings, 2 replies; 76+ messages in thread
From: Amir Goldstein @ 2023-09-15  3:06 UTC (permalink / raw)
  To: Ian Kent
  Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, linux-api,
	linux-man, linux-security-module, Karel Zak, David Howells,
	Linus Torvalds, Al Viro, Christian Brauner

On Fri, Sep 15, 2023 at 4:20 AM Ian Kent <raven@themaw.net> wrote:
>
> On 14/9/23 14:47, Amir Goldstein wrote:
> > On Wed, Sep 13, 2023 at 6:22 PM Miklos Szeredi <mszeredi@redhat.com> wrote:
> >> Implement the mount querying syscalls agreed on at LSF/MM 2023.  This is an
> >> RFC with just x86_64 syscalls.
> >>
> >> Excepting notification this should allow full replacement for
> >> parsing /proc/self/mountinfo.
> > Since you mentioned notifications, I will add that the plan discussed
> > in LFSMM was, once we have an API to query mount stats and children,
> > implement fanotify events for:
> > mount [mntuid] was un/mounted at [parent mntuid],[dirfid+name]
> >
> > As with other fanotify events, the self mntuid and dirfid+name
> > information can be omitted and without it, multiple un/mount events
> > from the same parent mntuid will be merged, allowing userspace
> > to listmnt() periodically only mntuid whose child mounts have changed,
> > with little risk of event queue overflow.
> >
> > The possible monitoring scopes would be the entire mount namespace
> > of the monitoring program or watching a single mount for change in
> > its children mounts. The latter is similar to inotify directory children watch,
> > where the watches needs to be set recursively, with all the weight on
> > userspace to avoid races.
>
> It's been my belief that the existing notification mechanisms don't
> quite fully satisfy the needs of users of these calls (aka. the need
> I found when implementing David's original calls into systemd).
>
> Specifically the ability to process a batch of notifications at once.
>
> Admittedly the notifications mechanism that David originally implemented
> didn't fully implement what I found I needed but it did provide for a
> settable queue length and getting a batch of notifications at a time.
>
> Am I mistaken in my belief?
>

I am not sure I understand the question.

fanotify has an event queue (16K events by default), but it can
also use unlimited size.
With a limited size queue, event queue overflow generates an
overflow event.

event listeners can read a batch of events, depending on
the size of the buffer that they provide.

when multiple events with same information are queued,
for example "something was un/mounted over parent mntuid 100"
fanotify will merge all those events in the queue and the
event listeners will get only one such event in the batch.

> Don't misunderstand me, it would be great for the existing notification
> mechanisms to support these system calls, I just have a specific use case
> in mind that I think is important, at least to me.
>

Please explain the use case and your belief about existing fanotify
limitations. I did not understand it.

Thanks,
Amir.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-14 15:26       ` Christian Brauner
@ 2023-09-15  8:56         ` Miklos Szeredi
  2023-09-18 13:51           ` Christian Brauner
  2023-09-18 14:29         ` Jeff Layton
  1 sibling, 1 reply; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-15  8:56 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Miklos Szeredi, Linus Torvalds, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Al Viro, Christian Brauner, Amir Goldstein

On Thu, 14 Sept 2023 at 17:27, Christian Brauner <brauner@kernel.org> wrote:
>
> On Thu, Sep 14, 2023 at 12:13:54PM +0200, Miklos Szeredi wrote:
> No worries, I think the discussion touching on this starts at:
> https://youtu.be/j3fp2MtRr2I?si=f-YBg6uWq80dV3VC&t=1603
> (with David talking quietly without a microphone for some parts
> unfortunately...)

(Thanks for digging that out.)

That discussion touched on two aspects of using a single call vs.
multiple calls:

 - atomicity
 - marshalling

Atomicity of getting a snapshot of the current mount tree with all of
its attributes was never guaranteed, although reading
/proc/self/mountinfo into a sufficiently large buffer would work that
way.   However, I don't see why mount trees would require stronger
guarantees than dentry trees (for which we have basically none).

Marshalling/demarshalling of arbitrary structures is indeed ugly.  I
think what Linus suggested, and what this interface was based on is
much less than that.  Also see my suggestion below: it doesn't need
demarshalling at all due to the fact that the kernel can fill in the
pointers.   And yes, this could be used for arbitrary structures
without compromising type safety, but at the cost of adding more
complexity to the kernel (at least ascii strings are just one type).

Even more type clean interface:

struct statmnt *statmnt(u64 mnt_id, u64 mask, void *buf, size_t
bufsize, unsigned int flags);

Kernel would return a fully initialized struct with the numeric as
well as string fields filled.  That part is trivial for userspace to
deal with.

For sizing the buffer and versioning the struct see discussion below.

> > What I'm thinking is making it even simpler for userspace:
> >
> > struct statmnt {
> >   ...
> >   char *mnt_root;
> >   char *mountpoint;
> >   char *fs_type;
> >   u32 num_opts;
> >   char *opts;
> > };
> >
> > I'd still just keep options nul delimited.
> >
> > Is there a good reason not to return pointers (pointing to within the
> > supplied buffer obviously) to userspace?
>
> It's really unpleasant to program with. Yes, I think you pointed out
> before that it often doesn't matter much as long as the system call is
> really only relevant to some special purpose userspace.
>
> But statmount() will be used pretty extensively pretty quickly for the
> purpose of finding out mount options on a mount (Querying a whole
> sequences of mounts via repeated listmount() + statmount() calls on the
> other hand will be rarer.).
>
> And there's just so many tools that need this: libmount, systemd, all
> kinds of container runtimes, path lookup libraries such as libpathrs,
> languages like go and rust that expose and wrap these calls and so on.
>
> Most of these tools don't need to know about filesystem mount options
> and if they do they can just query that through an extra system call. No
> harm in doing that.

Just pass sizeof(struct statmnt) as the buffer size, and it will work that way.

> The agreement we came to to split out listing submounts into a separate
> system call was exactly to avoid having to have a variable sized pointer
> at the end of the struct statmnt (That's also part of the video above
> btw.) and to make it as simple as possible.
>
> Plus, the format for how to return arbitrary filesystem mount options
> warrants a separate discussion imho as that's not really vfs level
> information.

Okay.   Let's take fs options out of this.

That leaves:

 - fs type and optionally subtype
 - root of mount within fs
 - mountpoint path

The type and subtype are naturally limited to sane sizes, those are
not an issue.

For paths the evolution of the relevant system/library calls was:

  char *getwd(char buf[PATH_MAX]);
  char *getcwd(char *buf, size_t size);
  char *get_current_dir_name(void);

It started out using a fixed size buffer, then a variable sized
buffer, then an automatically allocated buffer by the library, hiding
the need to resize on overflow.

The latest style is suitable for the statmnt() call as well, if we
worry about pleasantness of the API.

>
> > > This will also allow us to turn statmnt() into an extensible argument
> > > system call versioned by size just like we do any new system calls with
> > > struct arguments (e.g., mount_setattr(), clone3(), openat2() and so on).
> > > Which is how we should do things like that.
> >
> > The mask mechanism also allow versioning of the struct.
>
> Yes, but this is done with reserved space which just pushes away the
> problem and bloats the struct for the sake of an unknown future. If we
> were to use an extensible argument struct we would just version by size.
> The only requirement is that you extend by 64 bit (see struct
> clone_args) which had been extended.

No need for reserved space in fact.  Versioning would still work, as
long as userspace is strictly checking the return mask.  I.e. newly
added fields will come after the old buffer, as assumed by the kernel.
But the kernel will never set the mask bits for these fields, so
userspace should not ever look at them.  Note: the interface does have
a bufsize parameter, so no possibility of memory corruption in any
event.

I added the reserved space so that userspace would be protected from
rubbish at the end of the struct if the kernel was older.  A library
wrapper could work around that issue (move the variable part beyond
the end of the new struct), but it would require code update in the
wrapper, not just updating the struct.

But in fact it's much simpler to just add ample reserved space and be
done with it forever, no need to worry about versioning at all.

> > > numbers for sub types as well. So we don't need to use strings here.
> >
> > Ugh.
>
> Hm, idk. It's not that bad imho. We'll have to make some ugly tradeoffs.

Subtype is a fuse thing (e.g. sshfs would show up as fuse.sshfs in
/proc/self/mountinfo).  Forcing each fuse filesystem to invent a magic
number... please no.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-14 20:39   ` Paul Moore
@ 2023-09-15  9:10     ` Miklos Szeredi
  0 siblings, 0 replies; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-15  9:10 UTC (permalink / raw)
  To: Paul Moore
  Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, linux-api,
	linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Linus Torvalds, Al Viro, Christian Brauner,
	Amir Goldstein

On Thu, 14 Sept 2023 at 22:40, Paul Moore <paul@paul-moore.com> wrote:
>
> On Wed, Sep 13, 2023 at 11:23 AM Miklos Szeredi <mszeredi@redhat.com> wrote:

> ...
>
> > +static int do_statmnt(struct stmt_state *s)
> > +{
> > +       struct statmnt *sm = &s->sm;
> > +       struct mount *m = real_mount(s->mnt);
> > +
> > +       if (!capable(CAP_SYS_ADMIN) &&
> > +           !is_path_reachable(m, m->mnt.mnt_root, &s->root))
> > +               return -EPERM;
>
> I realize statmnt() is different from fstatfs(), but from an access
> control perspective they look a lot alike to me which is why I think
> we should probably have a security_sb_statfs() call here.  Same thing
> for the listmnt() syscall in patch 3/3.

Okay, makes sense.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 0/3] quering mount attributes
  2023-09-15  3:06     ` Amir Goldstein
@ 2023-09-16  2:04       ` Ian Kent
  2023-09-16  2:19       ` Ian Kent
  1 sibling, 0 replies; 76+ messages in thread
From: Ian Kent @ 2023-09-16  2:04 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, linux-api,
	linux-man, linux-security-module, Karel Zak, David Howells,
	Linus Torvalds, Al Viro, Christian Brauner

On 15/9/23 11:06, Amir Goldstein wrote:
> On Fri, Sep 15, 2023 at 4:20 AM Ian Kent <raven@themaw.net> wrote:
>> On 14/9/23 14:47, Amir Goldstein wrote:
>>> On Wed, Sep 13, 2023 at 6:22 PM Miklos Szeredi <mszeredi@redhat.com> wrote:
>>>> Implement the mount querying syscalls agreed on at LSF/MM 2023.  This is an
>>>> RFC with just x86_64 syscalls.
>>>>
>>>> Excepting notification this should allow full replacement for
>>>> parsing /proc/self/mountinfo.
>>> Since you mentioned notifications, I will add that the plan discussed
>>> in LFSMM was, once we have an API to query mount stats and children,
>>> implement fanotify events for:
>>> mount [mntuid] was un/mounted at [parent mntuid],[dirfid+name]
>>>
>>> As with other fanotify events, the self mntuid and dirfid+name
>>> information can be omitted and without it, multiple un/mount events
>>> from the same parent mntuid will be merged, allowing userspace
>>> to listmnt() periodically only mntuid whose child mounts have changed,
>>> with little risk of event queue overflow.
>>>
>>> The possible monitoring scopes would be the entire mount namespace
>>> of the monitoring program or watching a single mount for change in
>>> its children mounts. The latter is similar to inotify directory children watch,
>>> where the watches needs to be set recursively, with all the weight on
>>> userspace to avoid races.
>> It's been my belief that the existing notification mechanisms don't
>> quite fully satisfy the needs of users of these calls (aka. the need
>> I found when implementing David's original calls into systemd).
>>
>> Specifically the ability to process a batch of notifications at once.
>>
>> Admittedly the notifications mechanism that David originally implemented
>> didn't fully implement what I found I needed but it did provide for a
>> settable queue length and getting a batch of notifications at a time.
>>
>> Am I mistaken in my belief?
>>
> I am not sure I understand the question.
>
> fanotify has an event queue (16K events by default), but it can
> also use unlimited size.
> With a limited size queue, event queue overflow generates an
> overflow event.
>
> event listeners can read a batch of events, depending on
> the size of the buffer that they provide.
>
> when multiple events with same information are queued,
> for example "something was un/mounted over parent mntuid 100"
> fanotify will merged those all those events in the queue and the
> event listeners will get only one such event in the batch.
>
>> Don't misunderstand me, it would be great for the existing notification
>> mechanisms to support these system calls, I just have a specific use case
>> in mind that I think is important, at least to me.
>>
> Please explain the use case and your belief about existing fanotify
> limitations. I did not understand it.

Yes, it's not obvious, I'll try and explain it more clearly.


I did some work to enable systemd to use the original fsinfo() call

and the notifications system David had written.


My use case was perhaps unrealistic but I have seen real world reports

with similar symptoms and autofs usage can behave like this usage at

times as well so it's not entirely manufactured. The use case is basically

when there are a large number of mounts occurring for a sustained amount

of time.


Anyway, systemd processes get notified when there is mount activity and

it then reads the mount table to update its state. I observed there are

usually 3 separate systemd processes monitoring mount table changes and,

under the above load, they use around 80-85% of a CPU each.


Thing is systemd is actually pretty good at processing notifications so

when there is sustained mount activity and the fsinfo() call was used the

load changes from processing the table to processing notifications. The

load goes down to a bit over 40% for each process.


But if you can batch those notifications, like introduce a high water

mark (yes I know this is not at all simple and I'm by no means suggesting

this is all that needs to be done), to get a bunch of these notifications

at once the throughput increases quite a bit. In my initial testing adding

a delay of 10 or 20 milliseconds before fetching the queue of notifications

and processing them saw a reduction of CPU usage to around 8% per process.


What I'm saying is I've found that system calls to get the information

directly isn't all that's needed to improve the scalability.


Ian


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 0/3] quering mount attributes
  2023-09-15  3:06     ` Amir Goldstein
  2023-09-16  2:04       ` Ian Kent
@ 2023-09-16  2:19       ` Ian Kent
  1 sibling, 0 replies; 76+ messages in thread
From: Ian Kent @ 2023-09-16  2:19 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, linux-api,
	linux-man, linux-security-module, Karel Zak, David Howells,
	Linus Torvalds, Al Viro, Christian Brauner

On 15/9/23 11:06, Amir Goldstein wrote:
> On Fri, Sep 15, 2023 at 4:20 AM Ian Kent <raven@themaw.net> wrote:
>> On 14/9/23 14:47, Amir Goldstein wrote:
>>> On Wed, Sep 13, 2023 at 6:22 PM Miklos Szeredi <mszeredi@redhat.com> wrote:
>>>> Implement the mount querying syscalls agreed on at LSF/MM 2023.  This is an
>>>> RFC with just x86_64 syscalls.
>>>>
>>>> Excepting notification this should allow full replacement for
>>>> parsing /proc/self/mountinfo.
>>> Since you mentioned notifications, I will add that the plan discussed
>>> in LFSMM was, once we have an API to query mount stats and children,
>>> implement fanotify events for:
>>> mount [mntuid] was un/mounted at [parent mntuid],[dirfid+name]
>>>
>>> As with other fanotify events, the self mntuid and dirfid+name
>>> information can be omitted and without it, multiple un/mount events
>>> from the same parent mntuid will be merged, allowing userspace
>>> to listmnt() periodically only mntuid whose child mounts have changed,
>>> with little risk of event queue overflow.
>>>
>>> The possible monitoring scopes would be the entire mount namespace
>>> of the monitoring program or watching a single mount for change in
>>> its children mounts. The latter is similar to inotify directory children watch,
>>> where the watches needs to be set recursively, with all the weight on
>>> userspace to avoid races.
>> It's been my belief that the existing notification mechanisms don't
>> quite fully satisfy the needs of users of these calls (aka. the need
>> I found when implementing David's original calls into systemd).
>>
>> Specifically the ability to process a batch of notifications at once.
>>
>> Admittedly the notifications mechanism that David originally implemented
>> didn't fully implement what I found I needed but it did provide for a
>> settable queue length and getting a batch of notifications at a time.
>>
>> Am I mistaken in my belief?
>>
> I am not sure I understand the question.
>
> fanotify has an event queue (16K events by default), but it can
> also use unlimited size.
> With a limited size queue, event queue overflow generates an
> overflow event.
>
> event listeners can read a batch of events, depending on
> the size of the buffer that they provide.

So it sounds like I can get a bunch of events at once with fanotify.

I'll have to look at the code again ...


Ian

>
> when multiple events with same information are queued,
> for example "something was un/mounted over parent mntuid 100"
> fanotify will merged those all those events in the queue and the
> event listeners will get only one such event in the batch.
>
>> Don't misunderstand me, it would be great for the existing notification
>> mechanisms to support these system calls, I just have a specific use case
>> in mind that I think is important, at least to me.
>>
> Please explain the use case and your belief about existing fanotify
> limitations. I did not understand it.
>
> Thanks,
> Amir.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 3/3] add listmnt(2) syscall
  2023-09-13 15:22 ` [RFC PATCH 3/3] add listmnt(2) syscall Miklos Szeredi
  2023-09-14  6:00   ` Amir Goldstein
@ 2023-09-17  0:54   ` Matthew House
  2023-09-17 14:32     ` Miklos Szeredi
  1 sibling, 1 reply; 76+ messages in thread
From: Matthew House @ 2023-09-17  0:54 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, linux-api, linux-man,
	linux-security-module, Karel Zak, Ian Kent, David Howells,
	Linus Torvalds, Al Viro, Christian Brauner, Amir Goldstein

On Thu, Sep 14, 2023 at 12:02 PM Miklos Szeredi <mszeredi@redhat.com> wrote:
> Add way to query the children of a particular mount.  This is a more
> flexible way to iterate the mount tree than having to parse the complete
> /proc/self/mountinfo.
>
> Lookup the mount by the old (32bit) or new (64bit) mount ID.  If a mount
> needs to be queried based on path, then statx(2) can be used to first query
> the mount ID belonging to the path.
>
> Return an array of new (64bit) mount ID's.  Without privileges only mounts
> are listed which are reachable from the task's root.
>
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> ---
>  arch/x86/entry/syscalls/syscall_64.tbl |  1 +
>  fs/namespace.c                         | 51 ++++++++++++++++++++++++++
>  include/linux/syscalls.h               |  2 +
>  include/uapi/asm-generic/unistd.h      |  5 ++-
>  4 files changed, 58 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 6d807c30cd16..0d9a47b0ce9b 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -376,6 +376,7 @@
>  452    common  fchmodat2               sys_fchmodat2
>  453    64      map_shadow_stack        sys_map_shadow_stack
>  454    common  statmnt                 sys_statmnt
> +455    common  listmnt                 sys_listmnt
>
>  #
>  # Due to a historical design error, certain syscalls are numbered differently
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 088a52043bba..5362b1ffb26f 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -4988,6 +4988,57 @@ SYSCALL_DEFINE5(statmnt, u64, mnt_id,
>         return err;
>  }
>
> +static long do_listmnt(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
> +                     const struct path *root)
> +{
> +       struct mount *r, *m = real_mount(mnt);
> +       struct path rootmnt = { .mnt = root->mnt, .dentry = root->mnt->mnt_root };
> +       long ctr = 0;
> +
> +       if (!capable(CAP_SYS_ADMIN) &&
> +           !is_path_reachable(m, mnt->mnt_root, &rootmnt))
> +               return -EPERM;
> +
> +       list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
> +               if (!capable(CAP_SYS_ADMIN) &&
> +                   !is_path_reachable(r, r->mnt.mnt_root, root))
> +                       continue;

I'm not an expert on the kernel API, but to my eyes, it looks a bit weird
to silently include or exclude unreachable mounts from the list based on
the result of a capability check. I'd normally expect a more explicit
design, where (e.g.) the caller would set a flag to request unreachable
mounts, then get an -EPERM back if it didn't have the capability, as
opposed to this design, where the meaning of the output ("all mounts" vs.
"all reachable mounts") changes implicitly depending on the caller. Is
there any precedent for a design like this, where inaccessible results
are silently omitted from a returned list?

Thank you,
Matthew House

> +
> +               if (ctr >= bufsize)
> +                       return -EOVERFLOW;
> +               if (put_user(r->mnt_id_unique, buf + ctr))
> +                       return -EFAULT;
> +               ctr++;
> +               if (ctr < 0)
> +                       return -ERANGE;
> +       }
> +       return ctr;
> +}
> +
> +SYSCALL_DEFINE4(listmnt, u64, mnt_id, u64 __user *, buf, size_t, bufsize,
> +               unsigned int, flags)
> +{
> +       struct vfsmount *mnt;
> +       struct path root;
> +       long err;
> +
> +       if (flags)
> +               return -EINVAL;
> +
> +       down_read(&namespace_sem);
> +       mnt = lookup_mnt_in_ns(mnt_id, current->nsproxy->mnt_ns);
> +       err = -ENOENT;
> +       if (mnt) {
> +               get_fs_root(current->fs, &root);
> +               err = do_listmnt(mnt, buf, bufsize, &root);
> +               path_put(&root);
> +       }
> +       up_read(&namespace_sem);
> +
> +       return err;
> +}
> +
> +
>  static void __init init_mount_tree(void)
>  {
>         struct vfsmount *mnt;
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 1099bd307fa7..5d776cdb6f18 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -411,6 +411,8 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
>  asmlinkage long sys_statmnt(u64 mnt_id, u64 mask,
>                             struct statmnt __user *buf, size_t bufsize,
>                             unsigned int flags);
> +asmlinkage long sys_listmnt(u64 mnt_id, u64 __user *buf, size_t bufsize,
> +                           unsigned int flags);
>  asmlinkage long sys_truncate(const char __user *path, long length);
>  asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
>  #if BITS_PER_LONG == 32
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 640997231ff6..a2b41370f603 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -826,8 +826,11 @@ __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
>  #define __NR_statmnt   454
>  __SYSCALL(__NR_statmnt, sys_statmnt)
>
> +#define __NR_listmnt   455
> +__SYSCALL(__NR_listmnt, sys_listmnt)
> +
>  #undef __NR_syscalls
> -#define __NR_syscalls 455
> +#define __NR_syscalls 456
>
>  /*
>   * 32 bit systems traditionally used different
> --
> 2.41.0

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 3/3] add listmnt(2) syscall
  2023-09-17  0:54   ` Matthew House
@ 2023-09-17 14:32     ` Miklos Szeredi
  2023-09-18 13:15       ` Christian Brauner
  0 siblings, 1 reply; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-17 14:32 UTC (permalink / raw)
  To: Matthew House
  Cc: linux-fsdevel, linux-kernel, linux-api, linux-man,
	linux-security-module, Karel Zak, Ian Kent, David Howells,
	Linus Torvalds, Al Viro, Christian Brauner, Amir Goldstein

On Sun, Sep 17, 2023 at 2:54 AM Matthew House <mattlloydhouse@gmail.com> wrote:

> > +       list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
> > +               if (!capable(CAP_SYS_ADMIN) &&
> > +                   !is_path_reachable(r, r->mnt.mnt_root, root))
> > +                       continue;
>
> I'm not an expert on the kernel API, but to my eyes, it looks a bit weird
> to silently include or exclude unreachable mounts from the list based on
> the result of a capability check. I'd normally expect a more explicit
> design, where (e.g.) the caller would set a flag to request unreachable
> mounts, then get an -EPERM back if it didn't have the capability, as
> opposed to this design, where the meaning of the output ("all mounts" vs.
> "all reachable mounts") changes implicitly depending on the caller. Is
> there any precedent for a design like this, where inaccessible results
> are silently omitted from a returned list?

Good point.  That issue was nagging at the back of my mind.  Having an
explicit flag nicely solves the issue.

Thanks,
Miklos


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-13 15:22 ` [RFC PATCH 2/3] add statmnt(2) syscall Miklos Szeredi
                     ` (2 preceding siblings ...)
  2023-09-14 20:39   ` Paul Moore
@ 2023-09-17 18:18   ` Sargun Dhillon
  2023-09-17 23:36     ` Ian Kent
  2023-09-25 12:57   ` Arnd Bergmann
  4 siblings, 1 reply; 76+ messages in thread
From: Sargun Dhillon @ 2023-09-17 18:18 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, linux-api, linux-man,
	linux-security-module, Karel Zak, Ian Kent, David Howells,
	Linus Torvalds, Al Viro, Christian Brauner, Amir Goldstein

On Wed, Sep 13, 2023 at 9:25 AM Miklos Szeredi <mszeredi@redhat.com> wrote:
>
> Add a way to query attributes of a single mount instead of having to parse
> the complete /proc/$PID/mountinfo, which might be huge.
>
> Lookup the mount by the old (32bit) or new (64bit) mount ID.  If a mount
> needs to be queried based on path, then statx(2) can be used to first query
> the mount ID belonging to the path.
>
> Design is based on a suggestion by Linus:
>
>   "So I'd suggest something that is very much like "statfsat()", which gets
>    a buffer and a length, and returns an extended "struct statfs" *AND*
>    just a string description at the end."
>
> The interface closely mimics that of statx.
>
> Handle ASCII attributes by appending after the end of the structure (as per
> above suggestion).  Allow querying multiple string attributes with
> individual offset/length for each.  String are nul terminated (termination
> isn't counted in length).
>
> Mount options are also delimited with nul characters.  Unlike proc, special
> characters are not quoted.
>

Thank you for writing this patch. I wish that this had existed the many times
I've written parsers for mounts files in my life.

What do you think about exposing the locked flags, a la what happens
on propagation of mount across user namespaces?

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-17 18:18   ` Sargun Dhillon
@ 2023-09-17 23:36     ` Ian Kent
  2023-09-18 13:05       ` Christian Brauner
  0 siblings, 1 reply; 76+ messages in thread
From: Ian Kent @ 2023-09-17 23:36 UTC (permalink / raw)
  To: Sargun Dhillon, Miklos Szeredi
  Cc: linux-fsdevel, linux-kernel, linux-api, linux-man,
	linux-security-module, Karel Zak, David Howells, Linus Torvalds,
	Al Viro, Christian Brauner, Amir Goldstein


On 18/9/23 02:18, Sargun Dhillon wrote:
> On Wed, Sep 13, 2023 at 9:25 AM Miklos Szeredi <mszeredi@redhat.com> wrote:
>> Add a way to query attributes of a single mount instead of having to parse
>> the complete /proc/$PID/mountinfo, which might be huge.
>>
>> Lookup the mount by the old (32bit) or new (64bit) mount ID.  If a mount
>> needs to be queried based on path, then statx(2) can be used to first query
>> the mount ID belonging to the path.
>>
>> Design is based on a suggestion by Linus:
>>
>>    "So I'd suggest something that is very much like "statfsat()", which gets
>>     a buffer and a length, and returns an extended "struct statfs" *AND*
>>     just a string description at the end."
>>
>> The interface closely mimics that of statx.
>>
>> Handle ASCII attributes by appending after the end of the structure (as per
>> above suggestion).  Allow querying multiple string attributes with
>> individual offset/length for each.  String are nul terminated (termination
>> isn't counted in length).
>>
>> Mount options are also delimited with nul characters.  Unlike proc, special
>> characters are not quoted.
>>
> Thank you for writing this patch. I wish that this had existed the many times
> I've written parsers for mounts files in my life.
>
> What do you think about exposing the locked flags, a la what happens
> on propagation of mount across user namespaces?

Which flags do you mean?


If you mean shared, slave and I think there's a group id as well, etc.,
then yes, they were available in the original fsinfo() implementation as
they were requested.


So, yes, it would be good to include those too.


Ian



^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-17 23:36     ` Ian Kent
@ 2023-09-18 13:05       ` Christian Brauner
  0 siblings, 0 replies; 76+ messages in thread
From: Christian Brauner @ 2023-09-18 13:05 UTC (permalink / raw)
  To: Ian Kent
  Cc: Sargun Dhillon, Miklos Szeredi, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak,
	David Howells, Linus Torvalds, Al Viro, Christian Brauner,
	Amir Goldstein

On Mon, Sep 18, 2023 at 07:36:39AM +0800, Ian Kent wrote:
> 
> On 18/9/23 02:18, Sargun Dhillon wrote:
> > On Wed, Sep 13, 2023 at 9:25 AM Miklos Szeredi <mszeredi@redhat.com> wrote:
> > > Add a way to query attributes of a single mount instead of having to parse
> > > the complete /proc/$PID/mountinfo, which might be huge.
> > > 
> > > Lookup the mount by the old (32bit) or new (64bit) mount ID.  If a mount
> > > needs to be queried based on path, then statx(2) can be used to first query
> > > the mount ID belonging to the path.
> > > 
> > > Design is based on a suggestion by Linus:
> > > 
> > >    "So I'd suggest something that is very much like "statfsat()", which gets
> > >     a buffer and a length, and returns an extended "struct statfs" *AND*
> > >     just a string description at the end."
> > > 
> > > The interface closely mimics that of statx.
> > > 
> > > Handle ASCII attributes by appending after the end of the structure (as per
> > > above suggestion).  Allow querying multiple string attributes with
> > > individual offset/length for each.  String are nul terminated (termination
> > > isn't counted in length).
> > > 
> > > Mount options are also delimited with nul characters.  Unlike proc, special
> > > characters are not quoted.
> > > 
> > Thank you for writing this patch. I wish that this had existed the many times
> > I've written parsers for mounts files in my life.
> > 
> > What do you think about exposing the locked flags, a la what happens
> > on propagation of mount across user namespaces?
> 
> Which flags do you mean?

When you propagate mounts across mount+user namespaces a subset of
(security sensitive) mount attributes become locked. This information is
currently only available via internal flags but not in any way
explicitly exposed to userspace.

There's a proposal to extend mount_setattr(2) to explicitly allow
locking flags but that would mean a new set of mount attr flags.

So until the format of that is determined and settled this should be
kept out of statmount().

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 3/3] add listmnt(2) syscall
  2023-09-17 14:32     ` Miklos Szeredi
@ 2023-09-18 13:15       ` Christian Brauner
  2023-09-19 16:47         ` Paul Moore
  0 siblings, 1 reply; 76+ messages in thread
From: Christian Brauner @ 2023-09-18 13:15 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Matthew House, linux-fsdevel, linux-kernel, linux-api, linux-man,
	linux-security-module, Karel Zak, Ian Kent, David Howells,
	Linus Torvalds, Al Viro, Christian Brauner, Amir Goldstein

On Sun, Sep 17, 2023 at 04:32:04PM +0200, Miklos Szeredi wrote:
> On Sun, Sep 17, 2023 at 2:54 AM Matthew House <mattlloydhouse@gmail.com> wrote:
> 
> > > +       list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
> > > +               if (!capable(CAP_SYS_ADMIN) &&


> Good point.  That issue was nagging at the back of my mind.  Having an
> explicit flag nicely solves the issue.

Ideally we avoid multiple capable(CAP_SYS_ADMIN) calls by only doing it
once and saving the return value. capable() calls aren't that cheap.
Plus, we should decide whether this should trigger an audit event or
not: capable(CAP_SYS_ADMIN) triggers an audit event,
ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN) wouldn't. 

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-15  8:56         ` Miklos Szeredi
@ 2023-09-18 13:51           ` Christian Brauner
  2023-09-18 14:14             ` Miklos Szeredi
  2023-09-18 20:58             ` Andreas Dilger
  0 siblings, 2 replies; 76+ messages in thread
From: Christian Brauner @ 2023-09-18 13:51 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Miklos Szeredi, Linus Torvalds, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Al Viro, Christian Brauner, Amir Goldstein

> Atomicity of getting a snapshot of the current mount tree with all of
> its attributes was never guaranteed, although reading
> /proc/self/mountinfo into a sufficiently large buffer would work that
> way.   However, I don't see why mount trees would require stronger
> guarantees than dentry trees (for which we have basically none).

So atomicity was never put forward as a requirement. In that
session/recording I explicitly state that we won't guarantee atomicity.
And systemd agreed with this. So I think we're all on the same page.

> Even more type clean interface:
> 
> struct statmnt *statmnt(u64 mnt_id, u64 mask, void *buf, size_t
> bufsize, unsigned int flags);
> 
> Kernel would return a fully initialized struct with the numeric as
> well as string fields filled.  That part is trivial for userspace to
> deal with.

I really would prefer a properly typed struct and that's what everyone
was happy with in the session as well. So I would not like to change the
main parameters.

> > Plus, the format for how to return arbitrary filesystem mount options
> > warrants a separate discussion imho as that's not really vfs level
> > information.
> 
> Okay.   Let's take fs options out of this.

Thanks.

> 
> That leaves:
> 
>  - fs type and optionally subtype

So since subtype is FUSE specific it might be better to move this to
filesystem specific options imho.

>  - root of mount within fs
>  - mountpoint path
> 
> The type and subtype are naturally limited to sane sizes, those are
> not an issue.

What's the limit for fstype actually? I don't think there is one.
There's one by chance but not by design afaict?

Maybe crazy idea:
That magic number thing that we do in include/uapi/linux/magic.h
is there a good reason for this or why don't we just add a proper,
simple enum:

enum {
	FS_TYPE_ADFS	= 1,
	FS_TYPE_AFFS	= 2,
	FS_TYPE_AFS	= 3,
	FS_TYPE_AUTOFS	= 4,
	FS_TYPE_EXT2	= 5,
	FS_TYPE_EXT3	= 6,
	FS_TYPE_EXT4	= 7,
	/* ... */
	FS_TYPE_MAX
};

that we start returning from statmount(). We can still return both the
old and the new fstype? It always felt a bit odd for fs developers to
just select a magic number.

> 
> For paths the evolution of the relevant system/library calls was:
> 
>   char *getwd(char buf[PATH_MAX]);
>   char *getcwd(char *buf, size_t size);
>   char *get_current_dir_name(void);
> 
> It started out using a fixed size buffer, then a variable sized
> buffer, then an automatically allocated buffer by the library, hiding
> the need to resize on overflow.
> 
> The latest style is suitable for the statmnt() call as well, if we
> worry about pleasantness of the API.

So, can we then do the following struct:

struct statmnt {
        __u64 mask;             /* What results were written [uncond] */
        __u32 sb_dev_major;     /* Device ID */
        __u32 sb_dev_minor;
        __u64 sb_magic;         /* ..._SUPER_MAGIC */
        __u32 sb_flags;         /* MS_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
        __u32 __spare1;
        __u64 mnt_id;           /* Unique ID of mount */
        __u64 mnt_parent_id;    /* Unique ID of parent (for root == mnt_id) */
        __u32 mnt_id_old;       /* Reused IDs used in proc/.../mountinfo */
        __u32 mnt_parent_id_old;
        __u64 mnt_attr;         /* MOUNT_ATTR_... */
        __u64 mnt_propagation;  /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
        __u64 mnt_peer_group;   /* ID of shared peer group */
        __u64 mnt_master;       /* Mount receives propagation from this ID */
        __u64 propagate_from;   /* Propagation from in current namespace */
	__aligned_u64 mountpoint;
	__u32 mountpoint_len;
	__aligned_u64 mountroot;
	__u32 mountroot_len;
        __u64 __spare[20];
};

Userspace knows already how to deal with that because of bpf and other
structs (e.g., both systemd and LXC have ptr_to_u64() helpers and so
on). Libmount and glibc can hide this away internally as well.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-18 13:51           ` Christian Brauner
@ 2023-09-18 14:14             ` Miklos Szeredi
  2023-09-18 14:24               ` Christian Brauner
  2023-09-26 13:48               ` Florian Weimer
  2023-09-18 20:58             ` Andreas Dilger
  1 sibling, 2 replies; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-18 14:14 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Miklos Szeredi, Linus Torvalds, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Al Viro, Christian Brauner, Amir Goldstein

On Mon, Sep 18, 2023 at 3:51 PM Christian Brauner <brauner@kernel.org> wrote:

> I really would prefer a properly typed struct and that's what everyone
> was happy with in the session as well. So I would not like to change the
> main parameters.

I completely agree.  Just would like to understand this point:

  struct statmnt *statmnt(u64 mntid, u64 mask, unsigned int flags);

What's not properly typed about this interface?

I guess the answer is that it's not a syscall interface, which will
have an added [void *buf, size_t bufsize], while the buffer sizing is
done by a simple libc wrapper.

Do you think that's a problem?  If so, why?

Thanks,
Miklos


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-18 14:14             ` Miklos Szeredi
@ 2023-09-18 14:24               ` Christian Brauner
  2023-09-18 14:32                 ` Miklos Szeredi
  2023-09-26 13:48               ` Florian Weimer
  1 sibling, 1 reply; 76+ messages in thread
From: Christian Brauner @ 2023-09-18 14:24 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Miklos Szeredi, Linus Torvalds, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Al Viro, Christian Brauner, Amir Goldstein

On Mon, Sep 18, 2023 at 04:14:02PM +0200, Miklos Szeredi wrote:
> On Mon, Sep 18, 2023 at 3:51 PM Christian Brauner <brauner@kernel.org> wrote:
> 
> > I really would prefer a properly typed struct and that's what everyone
> > was happy with in the session as well. So I would not like to change the
> > main parameters.
> 
> I completely  agree.  Just would like to understand this point:
> 
>   struct statmnt *statmnt(u64 mntid, u64 mask, unsigned int flags);
> 
> What's not properly typed about this interface?
> 
> I guess the answer is that it's not a syscall interface, which will
> have an added [void *buf, size_t bufsize], while the buffer sizing is
> done by a simple libc wrapper.
> 
> Do you think that's a problem?  If so, why?

Sorry, I think we just talked past each other.
I didn't realize you were talking about a glibc wrapper.
I'm not so much concerned with that; they can expose this in whatever
way they like. But we will have a lot of low-level userspace that will
directly use statmount() or not even have glibc like go and other
languages.

The system call should please have a proper struct like you had in your
first proposal. This is what I'm concerned about:

int statmount(u64 mnt_id,
              struct statmnt __user *st,
              size_t size,
              unsigned int flags)

instead of taking a void pointer.



^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-14 15:26       ` Christian Brauner
  2023-09-15  8:56         ` Miklos Szeredi
@ 2023-09-18 14:29         ` Jeff Layton
  2023-09-18 14:35           ` Christian Brauner
  2023-09-20  9:43           ` David Laight
  1 sibling, 2 replies; 76+ messages in thread
From: Jeff Layton @ 2023-09-18 14:29 UTC (permalink / raw)
  To: Christian Brauner, Miklos Szeredi
  Cc: Miklos Szeredi, Linus Torvalds, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Al Viro, Christian Brauner, Amir Goldstein

On Thu, 2023-09-14 at 17:26 +0200, Christian Brauner wrote:
> On Thu, Sep 14, 2023 at 12:13:54PM +0200, Miklos Szeredi wrote:
> > On Thu, 14 Sept 2023 at 11:28, Christian Brauner <brauner@kernel.org> wrote:
> > > 
> > > On Wed, Sep 13, 2023 at 05:22:35PM +0200, Miklos Szeredi wrote:
> > > > Add a way to query attributes of a single mount instead of having to parse
> > > > the complete /proc/$PID/mountinfo, which might be huge.
> > > > 
> > > > Lookup the mount by the old (32bit) or new (64bit) mount ID.  If a mount
> > > > needs to be queried based on path, then statx(2) can be used to first query
> > > > the mount ID belonging to the path.
> > > > 
> > > > Design is based on a suggestion by Linus:
> > > > 
> > > >   "So I'd suggest something that is very much like "statfsat()", which gets
> > > >    a buffer and a length, and returns an extended "struct statfs" *AND*
> > > >    just a string description at the end."
> > > 
> > > So what we agreed to at LSFMM was that we split filesystem option
> > > retrieval into a separate system call and just have a very focused
> > > statx() for mounts with just binary and non-variable sized information.
> > > We even gave David a hard time about this. :) I would really love if we
> > > could stick to that.
> > > 
> > > Linus, I realize this was your suggestion a long time ago but I would
> > > really like us to avoid structs with variable sized fields at the end of
> > > a struct. That's just so painful for userspace and universally disliked.
> > > If you care I can even find the LSFMM video where we have users of that
> > > api requesting that we please don't do this. So it'd be great if you
> > > wouldn't insist on it.
> > 
> > I completely missed that.
> 
> No worries, I think the discussion touching on this starts at:
> https://youtu.be/j3fp2MtRr2I?si=f-YBg6uWq80dV3VC&t=1603
> (with David talking quietly without a microphone for some parts
> unfortunately...)
> 
> > What I'm thinking is making it even simpler for userspace:
> > 
> > struct statmnt {
> >   ...
> >   char *mnt_root;
> >   char *mountpoint;
> >   char *fs_type;
> >   u32 num_opts;
> >   char *opts;
> > };
> > 
> > I'd still just keep options nul delimited.
> > 
> > Is there a good reason not to return pointers (pointing to within the
> > supplied buffer obviously) to userspace?
> 
> It's really unpleasant to program with. Yes, I think you pointed out
> before that it often doesn't matter much as long as the system call is
> really only relevant to some special purpose userspace.
> 
> But statmount() will be used pretty extensively pretty quickly for the
> purpose of finding out mount options on a mount (Querying a whole
> sequences of mounts via repeated listmount() + statmount() calls on the
> other hand will be rarer.).
> 
> And there's just so many tools that need this: libmount, systemd, all
> kinds of container runtimes, path lookup libraries such as libpathrs,
> languages like go and rust that expose and wrap these calls and so on.
> 
> Most of these tools don't need to know about filesystem mount options
> and if they do they can just query that through an extra system call. No
> harm in doing that.
> 
> The agreement we came to to split out listing submounts into a separate
> system call was exactly to avoid having to have a variable sized pointer
> at the end of the struct statmnt (That's also part of the video above
> btw.) and to make it as simple as possible.
> 
> Plus, the format for how to return arbitrary filesystem mount options
> warrants a separate discussion imho as that's not really vfs level
> information.
> 
> > > This will also allow us to turn statmnt() into an extensible argument
> > > system call versioned by size just like we do any new system calls with
> > > struct arguments (e.g., mount_setattr(), clone3(), openat2() and so on).
> > > Which is how we should do things like that.
> > 
> > The mask mechanism also allow versioning of the struct.
> 
> Yes, but this is done with reserved space which just pushes away the
> problem and bloats the struct for the sake of an unknown future. If we
> were to use an extensible argument struct we would just version by size.
> The only requirement is that you extend by 64 bit (see struct
> clone_args) which had been extended.
> 
> 

Fixed size structs are much nicer to deal with, and most of the fields
we're talking about don't change often enough to make trying to strive
for perfect atomicity worthwhile.

What sort of interface are you thinking for fetching variable-length
string info? It sounds a lot like getxattr that uses a mnt_id in place
of a pathname. getmntattr() ?


> > > 
> > > Other than that I really think this is on track for what we ultimately
> > > want.
> > > 
> > > > +struct stmt_str {
> > > > +     __u32 off;
> > > > +     __u32 len;
> > > > +};
> > > > +
> > > > +struct statmnt {
> > > > +     __u64 mask;             /* What results were written [uncond] */
> > > > +     __u32 sb_dev_major;     /* Device ID */
> > > > +     __u32 sb_dev_minor;
> > > > +     __u64 sb_magic;         /* ..._SUPER_MAGIC */
> > > > +     __u32 sb_flags;         /* MS_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
> > > > +     __u32 __spare1;
> > > > +     __u64 mnt_id;           /* Unique ID of mount */
> > > > +     __u64 mnt_parent_id;    /* Unique ID of parent (for root == mnt_id) */
> > > > +     __u32 mnt_id_old;       /* Reused IDs used in proc/.../mountinfo */
> > > > +     __u32 mnt_parent_id_old;
> > > > +     __u64 mnt_attr;         /* MOUNT_ATTR_... */
> > > > +     __u64 mnt_propagation;  /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
> > > > +     __u64 mnt_peer_group;   /* ID of shared peer group */
> > > > +     __u64 mnt_master;       /* Mount receives propagation from this ID */
> > > > +     __u64 propagate_from;   /* Propagation from in current namespace */
> > > > +     __u64 __spare[20];
> > > > +     struct stmt_str mnt_root;       /* Root of mount relative to root of fs */
> > > > +     struct stmt_str mountpoint;     /* Mountpoint relative to root of process */
> > > > +     struct stmt_str fs_type;        /* Filesystem type[.subtype] */
> > > 

A bit tangential to this discussion, but one thing we could consider is
adding something like a mnt_change_cookie field that increments on any
significant changes on the mount: i.e. remounts with new options,
changes to parentage or propagation, etc.

That might make it more palatable to do something with separate syscalls
for the string-based fields. You could do:

statmnt(...);
getmntattr(mnt, "mnt.fstype", ...);
statmnt(...);

...and then if the mnt_change_cookie hasn't changed, you know that the
string option was stable during that window.


> > > I think if we want to do this here we should add:
> > > 
> > > __u64 fs_type
> > > __u64 fs_subtype
> > > 
> > > fs_type can just be our filesystem magic number and we introduce magic
> > 
> > It's already there: sb_magic.
> > 
> > However it's not a 1:1 mapping (ext* only has one magic).
> 
> That's a very odd choice but probably fixable by giving it a subtype.
> 
> > 
> > > numbers for sub types as well. So we don't need to use strings here.
> > 
> > Ugh.
> 
> Hm, idk. It's not that bad imho. We'll have to make some ugly tradeoffs.

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-18 14:24               ` Christian Brauner
@ 2023-09-18 14:32                 ` Miklos Szeredi
  2023-09-18 14:40                   ` Christian Brauner
  0 siblings, 1 reply; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-18 14:32 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Miklos Szeredi, Linus Torvalds, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Al Viro, Christian Brauner, Amir Goldstein

On Mon, 18 Sept 2023 at 16:25, Christian Brauner <brauner@kernel.org> wrote:

> The system call should please have a proper struct like you had in your
> first proposal. This is what I'm concerned about:
>
> int statmount(u64 mnt_id,
>               struct statmnt __user *st,
>               size_t size,
>               unsigned int flags)
>
> instead of taking an void pointer.

So you are not concerned about having ascii strings returned by the
syscall?   I thought that was the main complaint.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-18 14:29         ` Jeff Layton
@ 2023-09-18 14:35           ` Christian Brauner
  2023-09-20  9:43           ` David Laight
  1 sibling, 0 replies; 76+ messages in thread
From: Christian Brauner @ 2023-09-18 14:35 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Miklos Szeredi, Miklos Szeredi, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

> Fixed size structs are much nicer to deal with, and most of the fields
> we're talking about don't change ofetn enough to make trying to strive
> for perfect atomicity worthwhile.

I think we can live with mnt_root and mnt_mountpoint in struct statmnt
if we add a length field for both of them and make them __u64 pointers.
That's what we did in clone3() for the pid array and bpf is doing that
as well for log buffers and pathnames.

So if Miklos is fine with that then I'm happy to compromise. And I think
that's all the variable length data we want in struct statmount anyway.

> ...and then if the mnt_change_cookie hasn't changed, you know that the
> string option was stable during that window.

Meh, I would really like to sidestep this and keep it as simple as we
can. I like the proposal overall I just don't want it to get diluted too
much by exploding into another overly broad solution.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-18 14:32                 ` Miklos Szeredi
@ 2023-09-18 14:40                   ` Christian Brauner
  2023-09-18 14:51                     ` Miklos Szeredi
  0 siblings, 1 reply; 76+ messages in thread
From: Christian Brauner @ 2023-09-18 14:40 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Miklos Szeredi, Linus Torvalds, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Al Viro, Christian Brauner, Amir Goldstein

On Mon, Sep 18, 2023 at 04:32:30PM +0200, Miklos Szeredi wrote:
> On Mon, 18 Sept 2023 at 16:25, Christian Brauner <brauner@kernel.org> wrote:
> 
> > The system call should please have a proper struct like you had in your
> > first proposal. This is what I'm concerned about:
> >
> > int statmount(u64 mnt_id,
> >               struct statmnt __user *st,
> >               size_t size,
> >               unsigned int flags)
> >
> > instead of taking an void pointer.
> 
> So you are not concerned about having ascii strings returned by the
> syscall?   I thought that was the main complaint.

I'm not following. The original proposals were only returning strings
even for basic binary data such as mount flags, propagation options, and
so on and we're using the xattr interface for any type of information.

What we're talking about here is a nicely typed struct which returns two
paths @mnt_root and @mnt_point which can both be represented as u64
pointers with length parameters like we do in other binary structs such
as bpf and clone3 and a few others. That is a compromise I can live
with. I'm really trying to find as much common ground here as we can.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-18 14:40                   ` Christian Brauner
@ 2023-09-18 14:51                     ` Miklos Szeredi
  2023-09-18 15:22                       ` Christian Brauner
  0 siblings, 1 reply; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-18 14:51 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Miklos Szeredi, Linus Torvalds, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Al Viro, Christian Brauner, Amir Goldstein

On Mon, 18 Sept 2023 at 16:40, Christian Brauner <brauner@kernel.org> wrote:

> What we're talking about here is a nicely typed struct which returns two
> paths @mnt_root and @mnt_point which can both be represented as u64
> pointers with length parameters like we do in other binary structs such
> as bpf and clone3 and a few others. That is a compromise I can live
> with. I'm really trying to find as much common ground here as we can.

So to be clear about your proposal: .mnt_root and .mountpoint are
initialized by the caller to buffers that the kernel can copy paths
into?

If there's an overflow (one of the buffers was too small) the syscall
returns -EOVERFLOW?

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-18 14:51                     ` Miklos Szeredi
@ 2023-09-18 15:22                       ` Christian Brauner
  2023-09-18 15:39                         ` Miklos Szeredi
  0 siblings, 1 reply; 76+ messages in thread
From: Christian Brauner @ 2023-09-18 15:22 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Miklos Szeredi, Linus Torvalds, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Al Viro, Christian Brauner, Amir Goldstein

> So to be clear about your proposal: .mnt_root and .mountpoint are
> initialized by the caller to buffers that the kernel can copy paths
> into?

Yeah, u64 pointer to a buffer and a size (see e.g., @set_tid and
@set_tid_size for struct clone_args, @log_buf and @log_size and other
args in there).

> 
> If there's an overflow (one of the buffers was too small) the syscall
> returns -EOVERFLOW?

Yeah, I mean we have to make some things their problem.

To me that is an acceptable compromise.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-18 15:22                       ` Christian Brauner
@ 2023-09-18 15:39                         ` Miklos Szeredi
  2023-09-19  0:37                           ` Matthew House
  0 siblings, 1 reply; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-18 15:39 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Miklos Szeredi, Linus Torvalds, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Al Viro, Christian Brauner, Amir Goldstein

On Mon, 18 Sept 2023 at 17:22, Christian Brauner <brauner@kernel.org> wrote:
>
> > So to be clear about your proposal: .mnt_root and .mountpoint are
> > initialized by the caller to buffers that the kernel can copy paths
> > into?
>
> Yeah, u64 pointer to a buffer and a size (see e.g., @set_tid and
> @set_tid_size for struct clone_args, @log_buf and @log_size and other
> args in there).
>
> >
> > If there's an overflow (one of the buffers was too small) the syscall
> > returns -EOVERFLOW?
>
> Yeah, I mean we have to make some things their problem.
>
> To me that is an acceptable compromise.

Okay, so there are now (at least) two buffers, and on overflow the
caller cannot know which one overflowed.  It can resize both, but
that doesn't make the caller any simpler to implement.

Also the interface is kind of weird in that some struct members are
out, some are in (the pointers and the lengths).

I'd prefer the single buffer interface, which has none of the above issues.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-18 13:51           ` Christian Brauner
  2023-09-18 14:14             ` Miklos Szeredi
@ 2023-09-18 20:58             ` Andreas Dilger
  2023-09-19 12:50               ` Christian Brauner
  1 sibling, 1 reply; 76+ messages in thread
From: Andreas Dilger @ 2023-09-18 20:58 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Miklos Szeredi, Miklos Szeredi, Linus Torvalds, linux-fsdevel,
	Linux Kernel Mailing List, linux-api, linux-man,
	linux-security-module, Karel Zak, Ian Kent, David Howells,
	Al Viro, Christian Brauner, Amir Goldstein

[-- Attachment #1: Type: text/plain, Size: 1846 bytes --]

On Sep 18, 2023, at 7:51 AM, Christian Brauner <brauner@kernel.org> wrote:
> 
> 
>> The type and subtype are naturally limited to sane sizes, those are
>> not an issue.
> 
> What's the limit for fstype actually? I don't think there is one.
> There's one by chance but not by design afaict?
> 
> Maybe crazy idea:
> That magic number thing that we do in include/uapi/linux/magic.h
> is there a good reason for this or why don't we just add a proper,
> simple enum:
> 
> enum {
> 	FS_TYPE_ADFS        1
> 	FS_TYPE_AFFS        2
> 	FS_TYPE_AFS         3
> 	FS_TYPE_AUTOFS      4
> 	FS_TYPE_EXT2	    5
> 	FS_TYPE_EXT3	    6
> 	FS_TYPE_EXT4	    7
> 	.
> 	.
> 	.
> 	FS_TYPE_MAX
> }
> 
> that we start returning from statmount(). We can still return both the
> old and the new fstype? It always felt a bit odd that fs developers to
> just select a magic number.

Yes, there is a very good reason that there isn't an enum for filesystem
type, which is because this API would be broken if it encounters any
filesystem that is not listed there.  Often a single filesystem driver in
the kernel will have multiple different magic numbers to handle versions,
endianness, etc.

Having a 32-bit magic number allows decentralized development with low
chance of collision, and using new filesystems without having to patch
every kernel for this new API to work with that filesystem.  Also,
filesystems come and go (though more slowly) over time, and keeping the
full list of every filesystem ever developed in the kernel enum would be
a headache.

The field in the statmnt() call would need to be at a fixed-size 32-bit
value in any case, so having it return the existing magic will "just work"
because userspace tools already know and understand these magic values,
while introducing an in-kernel enum would be broken for multiple reasons.

Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 873 bytes --]

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-18 15:39                         ` Miklos Szeredi
@ 2023-09-19  0:37                           ` Matthew House
  2023-09-19  8:02                             ` Miklos Szeredi
  0 siblings, 1 reply; 76+ messages in thread
From: Matthew House @ 2023-09-19  0:37 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Christian Brauner, Miklos Szeredi, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

On Mon, Sep 18, 2023 at 11:39 AM Miklos Szeredi <miklos@szeredi.hu> wrote:
> Okay, so there are now (at least) two buffers, and on overflow the
> caller cannot know which one got overflown.  It can resize both, but
> that doesn't make the caller any simpler to implement.
>
> Also the interface is kind of weird in that some struct members are
> out, some are in (the pointers and the lengths).
>
> I'd prefer the single buffer interface, which has none of the above issues.
>
> Thanks,
> Miklos

One natural solution is to set either of the two lengths to the expected
size if the provided buffers are too small. That way, the caller learns both
which of the buffers is too small, and how large they need to be. Replacing
a provided size with an expected size in this way already has precedent in
existing syscalls:

recvmsg(2):
    The msg argument points to an in/out struct msghdr, and msg->msg_name
    points to an optional buffer which receives the source address. If
    msg->msg_namelen is less than the actual size of the source address,
    the function truncates the address to that length before storing it in
    msg->msg_name; otherwise, it stores the full address. In either case,
    it sets msg->msg_namelen to the full size of the source address before
    returning.

(An address buffer size is similarly provided directly as an in/out pointer
in accept(2), accept4(2), getpeername(2), getsockname(2), and recvfrom(2).)

name_to_handle_at(2):
    The handle argument points to an in/out struct file_handle, followed by
    a variable-length char array. If handle->handle_bytes is too small to
    store the opaque handle, the function returns -EOVERFLOW; otherwise,
    it succeeds. In either case, it sets handle->handle_bytes to the size
    of the opaque handle before returning.

perf_event_open(2):
    The attr argument points to an in/out struct perf_event_attr. If
    attr->size is not a valid size for the struct, the function sets it to
    the latest size and returns -E2BIG.

sched_setattr(2):
    The attr argument points to an in/out struct sched_attr. If attr->size
    is not a valid size for the struct, the function sets it to the latest
    size and returns -E2BIG.

The specific pattern of returning the actual size of the strings both on
success and on failure, as with recvmsg(2) and name_to_handle_at(2), is
beneficial for callers that want to copy the strings elsewhere without
having to scan for the null byte. (Also, it would work well if we ever
wanted to return variable-size binary data, such as arrays of structs.)

Indeed, if we returned the actual size of the string, we could even take a
more radical approach of never setting a null byte after the data, leaving
the caller to append its own null byte if it really wants one. But perhaps
that would be taking it a bit too far; I just don't want this API to end up
in an awful situation like strncpy(3) or struct sockaddr_un, where the
buffer is always null-terminated except in one particular edge case. Also,
if we include a null byte in the returned size, it could invite off-by-one
errors in callers that just expect it to be the length of the string.

Meanwhile, if this solution of in/out size fields were adopted, then
there'd still be the question of what to do when a provided size is too
small: should the returned string be truncated (indicating the issue only
by the returned size being greater than the provided size), or should the
entire call fail with an -EOVERFLOW? IMO, the former is strictly more
flexible, since the caller can set a limit on how big a buffer it's willing
to dedicate to any particular string, and it can still retrieve the
remaining data if that buffer isn't quite big enough. But the latter might
be considered a bit more foolproof against callers who don't properly test
for truncation.

Thank you,
Matthew House

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-19  0:37                           ` Matthew House
@ 2023-09-19  8:02                             ` Miklos Szeredi
  2023-09-19  9:07                               ` Christian Brauner
  2023-09-19 21:28                               ` Matthew House
  0 siblings, 2 replies; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-19  8:02 UTC (permalink / raw)
  To: Matthew House
  Cc: Christian Brauner, Miklos Szeredi, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

On Tue, 19 Sept 2023 at 02:38, Matthew House <mattlloydhouse@gmail.com> wrote:

> One natural solution is to set either of the two lengths to the expected
> size if the provided buffer are too small. That way, the caller learns both
> which of the buffers is too small, and how large they need to be. Replacing
> a provided size with an expected size in this way already has precedent in
> existing syscalls:

This is where the thread started.  Knowing the size of the buffer is
no good, since the needed buffer could change between calls.

We are trying to create a simple interface, no?  My proposal would
need a helper like this:

/*
 * Userspace helper for the single-buffer interface.
 *
 * Calls the statmnt syscall with a malloc()ed buffer, doubling the buffer
 * size and retrying for as long as the call fails with EOVERFLOW.  The
 * size is doubled before each allocation, so the first attempt uses
 * 1 << 16 bytes.
 *
 * Returns the buffer on success; the caller owns it and releases it with
 * a single free().  Returns NULL if allocation fails or the syscall fails
 * with anything other than EOVERFLOW; in the latter case errno still
 * holds the syscall's error.
 */
struct statmnt *statmount(uint64_t mnt_id, uint64_t mask, unsigned int flags)
{
        size_t bufsize = 1 << 15;
        void *buf;
        int ret, saved_errno;

        for (;;) {
                buf = malloc(bufsize <<= 1);
                if (!buf)
                        return NULL;
                ret = syscall(__NR_statmnt, mnt_id, mask, buf, bufsize, flags);
                if (!ret)
                        return buf;
                /*
                 * free() is not guaranteed to leave errno untouched on
                 * every libc, so capture the syscall's error first and
                 * restore it for both the retry check and the caller.
                 */
                saved_errno = errno;
                free(buf);
                errno = saved_errno;
                if (saved_errno != EOVERFLOW)
                        return NULL;
        }
}

Christian's would be (ignoring .fs_type for now):

/*
 * Userspace helper for the two-buffer interface.
 *
 * Allocates separate buffers for st->mnt_root and st->mountpoint, calls
 * the statmnt syscall, and retries with both buffers doubled for as long
 * as the call fails with EOVERFLOW.  The sizes are doubled before each
 * allocation, so the first attempt uses 1 << 16 bytes per buffer.
 *
 * Returns 0 on success; the caller then owns st->mnt_root and
 * st->mountpoint and must free() both.  On any failure the buffers have
 * already been freed here and a non-zero value is returned; for a syscall
 * failure errno still holds the syscall's error.
 */
int statmount(uint64_t mnt_id, uint64_t mask, struct statmnt *st,
              unsigned int flags)
{
        int ret, saved_errno;

        st->mnt_root_size = 1 << 15;
        st->mountpoint_size = 1 << 15;
        for (;;) {
                st->mnt_root = malloc(st->mnt_root_size <<= 1);
                /* Grow the size field, not the pointer itself. */
                st->mountpoint = malloc(st->mountpoint_size <<= 1);
                if (!st->mnt_root || !st->mountpoint) {
                        free(st->mnt_root);
                        free(st->mountpoint);
                        return -1;
                }
                ret = syscall(__NR_statmnt, mnt_id, mask, st,
                              sizeof(*st), flags);
                if (!ret)
                        return ret;
                /*
                 * Release the buffers on every failure, not only on
                 * EOVERFLOW: the allocation-failure path above already
                 * frees them, so a caller cannot safely free after an
                 * error.  Save errno around free() so the syscall's
                 * error survives.
                 */
                saved_errno = errno;
                free(st->mnt_root);
                free(st->mountpoint);
                errno = saved_errno;
                if (saved_errno != EOVERFLOW)
                        return ret;
        }
}

It's not hugely more complex, but more complex nonetheless.

Also having the helper allocate buffers inside the struct could easily
result in leaks since it's not obvious what the caller needs to free,
while in the first example it is.

Note that I'm not against having the prototype on the kernel interface
take a typed pointer.  If strings are not needed, both interfaces
would work in exactly the same way.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-19  8:02                             ` Miklos Szeredi
@ 2023-09-19  9:07                               ` Christian Brauner
  2023-09-19 10:51                                 ` Miklos Szeredi
  2023-09-19 21:28                               ` Matthew House
  1 sibling, 1 reply; 76+ messages in thread
From: Christian Brauner @ 2023-09-19  9:07 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Matthew House, Miklos Szeredi, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

On Tue, Sep 19, 2023 at 10:02:17AM +0200, Miklos Szeredi wrote:
> On Tue, 19 Sept 2023 at 02:38, Matthew House <mattlloydhouse@gmail.com> wrote:
> 
> > One natural solution is to set either of the two lengths to the expected
> > size if the provided buffer are too small. That way, the caller learns both
> > which of the buffers is too small, and how large they need to be. Replacing
> > a provided size with an expected size in this way already has precedent in
> > existing syscalls:
> 
> This is where the thread started.  Knowing the size of the buffer is
> no good, since the needed buffer could change between calls.

The same problem would exist for the single buffer. Realistically, users
will most often simply use a fixed size PATH_MAX buffer that will cover
most cases and fallback to allocating a larger buffer in case things go
awry.

I don't think we need to make this atomic either. Providing a hint for
the required buffer size in case this fails is good enough and should be
a rather rare occurrence and is exactly how other variable-sized buffers
are handled.

> Also having the helper allocate buffers inside the struct could easily
> result in leaks since it's not obvious what the caller needs to free,

I don't think we need to be overly concerned with how userspace
implements the wrapper here. Leaks can occur in both scenarios and
low-level userspace can use automatic cleanup macros (we even support it
in the kernel since v6.5) to harden against this.

Really, the main things I care about are 64 bit alignment of the whole
struct, typed __u64 pointers with __u32 size for mnt_root and mnt_point
and that we please spell out "mount" and not use "mnt": so statmount
because the new mount api uses "mount" (move_mount(), mount_setattr(),
fsmount(), MOUNT_ATTR_*) almost everywhere.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-19  9:07                               ` Christian Brauner
@ 2023-09-19 10:51                                 ` Miklos Szeredi
  2023-09-19 12:41                                   ` Christian Brauner
  0 siblings, 1 reply; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-19 10:51 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Matthew House, Miklos Szeredi, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

On Tue, 19 Sept 2023 at 11:07, Christian Brauner <brauner@kernel.org> wrote:
>
> On Tue, Sep 19, 2023 at 10:02:17AM +0200, Miklos Szeredi wrote:

> > This is where the thread started.  Knowing the size of the buffer is
> > no good, since the needed buffer could change between calls.
>
> The same problem would exist for the single buffer. Realistically, users
> will most often simply use a fixed size PATH_MAX buffer that will cover
> most cases and fallback to allocating a larger buffer in case things go
> awry.

Exactly.  A large buffer will work in 99.99% of the cases. Good
quality implementations will deal with the 0.01% as well, but
optimizing that case is nonsense.

> Really, the main things I care about are 64 bit alignment of the whole
> struct, typed __u64 pointers

Okay.

>  with __u32 size for mnt_root and mnt_point

Unnecessary if the strings are nul terminated.

> and that we please spell out "mount" and not use "mnt": so statmount
> because the new mount api uses "mount" (move_mount(), mount_setattr(),
> fsmount(), MOUNT_ATTR_*) almost everywhere.

Okay.

Incremental below.

Also pushed to:

  git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs.git#statmount

Thanks,
Miklos


diff --git a/arch/x86/entry/syscalls/syscall_64.tbl
b/arch/x86/entry/syscalls/syscall_64.tbl
index 0d9a47b0ce9b..a1b3ce7d22cc 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -375,8 +375,8 @@
 451    common  cachestat               sys_cachestat
 452    common  fchmodat2               sys_fchmodat2
 453    64      map_shadow_stack        sys_map_shadow_stack
-454    common  statmnt                 sys_statmnt
-455    common  listmnt                 sys_listmnt
+454    common  statmount               sys_statmount
+455    common  listmount               sys_listmount

 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/fs/namespace.c b/fs/namespace.c
index 5362b1ffb26f..803003052bfb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -68,9 +68,8 @@ static u64 event;
 static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);

-/* Don't allow confusion with mount ID allocated wit IDA */
-#define OLD_MNT_ID_MAX UINT_MAX
-static atomic64_t mnt_id_ctr = ATOMIC64_INIT(OLD_MNT_ID_MAX);
+/* Don't allow confusion with old 32bit mount ID */
+static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);

 static struct hlist_head *mount_hashtable __read_mostly;
 static struct hlist_head *mountpoint_hashtable __read_mostly;
@@ -4679,14 +4678,6 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const
char __user *, path,
        return err;
 }

-static bool mnt_id_match(struct mount *mnt, u64 id)
-{
-       if (id <= OLD_MNT_ID_MAX)
-               return id == mnt->mnt_id;
-       else
-               return id == mnt->mnt_id_unique;
-}
-
 struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
 {
        struct mount *mnt;
@@ -4694,7 +4685,7 @@ struct vfsmount *lookup_mnt_in_ns(u64 id, struct
mnt_namespace *ns)

        lock_ns_list(ns);
        list_for_each_entry(mnt, &ns->list, mnt_list) {
-               if (!mnt_is_cursor(mnt) && mnt_id_match(mnt, id)) {
+               if (!mnt_is_cursor(mnt) && id == mnt->mnt_id_unique) {
                        res = &mnt->mnt;
                        break;
                }
@@ -4747,7 +4738,7 @@ static int stmt_string_seq(struct stmt_state *s,
stmt_func_t func)
 }

 static void stmt_string(struct stmt_state *s, u64 mask, stmt_func_t func,
-                      stmt_str_t *str)
+                      u64 *str)
 {
        int ret = s->pos >= s->bufsize ? -EOVERFLOW : 0;
        struct statmnt *sm = &s->sm;
@@ -4767,8 +4758,7 @@ static void stmt_string(struct stmt_state *s,
u64 mask, stmt_func_t func,
                if (copy_to_user(s->buf + s->pos, seq->buf, seq->count)) {
                        s->err = -EFAULT;
                } else {
-                       str->off = s->pos;
-                       str->len = seq->count - 1;
+                       *str = (unsigned long) (s->buf + s->pos);
                        s->pos += seq->count;
                }
        }
@@ -4899,39 +4889,10 @@ static int stmt_fs_type(struct stmt_state *s)
        struct super_block *sb = s->mnt->mnt_sb;

        seq_puts(seq, sb->s_type->name);
-       if (sb->s_subtype) {
-               seq_putc(seq, '.');
-               seq_puts(seq, sb->s_subtype);
-       }
-       return 0;
-}
-
-static int stmt_sb_opts(struct stmt_state *s)
-{
-       struct seq_file *seq = &s->seq;
-       struct super_block *sb = s->mnt->mnt_sb;
-       char *p, *end, *next, *u = seq->buf;
-       int err;
-
-       if (!sb->s_op->show_options)
-               return 0;
-
-       err = sb->s_op->show_options(seq, s->mnt->mnt_root);
-       if (err || seq_has_overflowed(seq) || !seq->count)
-               return err;
-
-       end = seq->buf + seq->count;
-       *end = '\0';
-       for (p = seq->buf + 1; p < end; p = next + 1) {
-               next = strchrnul(p, ',');
-               *next = '\0';
-               u += string_unescape(p, u, 0, UNESCAPE_OCTAL) + 1;
-       }
-       seq->count = u - 1 - seq->buf;
        return 0;
 }

-static int do_statmnt(struct stmt_state *s)
+static int do_statmount(struct stmt_state *s)
 {
        struct statmnt *sm = &s->sm;
        struct mount *m = real_mount(s->mnt);
@@ -4946,7 +4907,6 @@ static int do_statmnt(struct stmt_state *s)
        stmt_string(s, STMT_MNT_ROOT, stmt_mnt_root, &sm->mnt_root);
        stmt_string(s, STMT_MOUNTPOINT, stmt_mountpoint, &sm->mountpoint);
        stmt_string(s, STMT_FS_TYPE, stmt_fs_type, &sm->fs_type);
-       stmt_string(s, STMT_SB_OPTS, stmt_sb_opts, &sm->sb_opts);

        if (s->err)
                return s->err;
@@ -4957,7 +4917,7 @@ static int do_statmnt(struct stmt_state *s)
        return 0;
 }

-SYSCALL_DEFINE5(statmnt, u64, mnt_id,
+SYSCALL_DEFINE5(statmount, u64, mnt_id,
                u64, mask, struct statmnt __user *, buf,
                size_t, bufsize, unsigned int, flags)
 {
@@ -4980,7 +4940,7 @@ SYSCALL_DEFINE5(statmnt, u64, mnt_id,
                };

                get_fs_root(current->fs, &s.root);
-               err = do_statmnt(&s);
+               err = do_statmount(&s);
                path_put(&s.root);
        }
        up_read(&namespace_sem);
@@ -4988,19 +4948,25 @@ SYSCALL_DEFINE5(statmnt, u64, mnt_id,
        return err;
 }

-static long do_listmnt(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
-                     const struct path *root)
+static long do_listmount(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
+                        const struct path *root, unsigned int flags)
 {
        struct mount *r, *m = real_mount(mnt);
        struct path rootmnt = { .mnt = root->mnt, .dentry =
root->mnt->mnt_root };
        long ctr = 0;
+       bool reachable_only = true;

-       if (!capable(CAP_SYS_ADMIN) &&
-           !is_path_reachable(m, mnt->mnt_root, &rootmnt))
-               return -EPERM;
+       if (flags & LISTMOUNT_UNREACHABLE) {
+               if (!capable(CAP_SYS_ADMIN))
+                       return -EPERM;
+               reachable_only = false;
+       }
+
+       if (reachable_only && !is_path_reachable(m, mnt->mnt_root, &rootmnt))
+               return capable(CAP_SYS_ADMIN) ? 0 : -EPERM;

        list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
-               if (!capable(CAP_SYS_ADMIN) &&
+               if (reachable_only &&
                    !is_path_reachable(r, r->mnt.mnt_root, root))
                        continue;

@@ -5015,14 +4981,14 @@ static long do_listmnt(struct vfsmount *mnt,
u64 __user *buf, size_t bufsize,
        return ctr;
 }

-SYSCALL_DEFINE4(listmnt, u64, mnt_id, u64 __user *, buf, size_t, bufsize,
+SYSCALL_DEFINE4(listmount, u64, mnt_id, u64 __user *, buf, size_t, bufsize,
                unsigned int, flags)
 {
        struct vfsmount *mnt;
        struct path root;
        long err;

-       if (flags)
+       if (flags & ~LISTMOUNT_UNREACHABLE)
                return -EINVAL;

        down_read(&namespace_sem);
@@ -5030,7 +4996,7 @@ SYSCALL_DEFINE4(listmnt, u64, mnt_id, u64 __user
*, buf, size_t, bufsize,
        err = -ENOENT;
        if (mnt) {
                get_fs_root(current->fs, &root);
-               err = do_listmnt(mnt, buf, bufsize, &root);
+               err = do_listmount(mnt, buf, bufsize, &root, flags);
                path_put(&root);
        }
        up_read(&namespace_sem);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 5d776cdb6f18..a35fb7b2c842 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -74,6 +74,7 @@ struct landlock_ruleset_attr;
 enum landlock_rule_type;
 struct cachestat_range;
 struct cachestat;
+struct statmnt;

 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -408,11 +409,11 @@ asmlinkage long sys_statfs64(const char __user
*path, size_t sz,
 asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf);
 asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
                                struct statfs64 __user *buf);
-asmlinkage long sys_statmnt(u64 mnt_id, u64 mask,
-                           struct statmnt __user *buf, size_t bufsize,
-                           unsigned int flags);
-asmlinkage long sys_listmnt(u64 mnt_id, u64 __user *buf, size_t bufsize,
-                           unsigned int flags);
+asmlinkage long sys_statmount(u64 mnt_id, u64 mask,
+                             struct statmnt __user *buf, size_t bufsize,
+                             unsigned int flags);
+asmlinkage long sys_listmount(u64 mnt_id, u64 __user *buf, size_t bufsize,
+                             unsigned int flags);
 asmlinkage long sys_truncate(const char __user *path, long length);
 asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
 #if BITS_PER_LONG == 32
diff --git a/include/uapi/asm-generic/unistd.h
b/include/uapi/asm-generic/unistd.h
index a2b41370f603..8df6a747e21a 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -823,11 +823,11 @@ __SYSCALL(__NR_cachestat, sys_cachestat)
 #define __NR_fchmodat2 452
 __SYSCALL(__NR_fchmodat2, sys_fchmodat2)

-#define __NR_statmnt   454
-__SYSCALL(__NR_statmnt, sys_statmnt)
+#define __NR_statmount   454
+__SYSCALL(__NR_statmount, sys_statmount)

-#define __NR_listmnt   455
-__SYSCALL(__NR_listmnt, sys_listmnt)
+#define __NR_listmount   455
+__SYSCALL(__NR_listmount, sys_listmount)

 #undef __NR_syscalls
 #define __NR_syscalls 456
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 4ec7308a9259..d98b41024507 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -138,11 +138,6 @@ struct mount_attr {
 /* List of all mount_attr versions. */
 #define MOUNT_ATTR_SIZE_VER0   32 /* sizeof first published struct */

-struct stmt_str {
-       __u32 off;
-       __u32 len;
-};
-
 struct statmnt {
        __u64 mask;             /* What results were written [uncond] */
        __u32 sb_dev_major;     /* Device ID */
@@ -159,11 +154,10 @@ struct statmnt {
        __u64 mnt_peer_group;   /* ID of shared peer group */
        __u64 mnt_master;       /* Mount receives propagation from this ID */
        __u64 propagate_from;   /* Propagation from in current namespace */
-       __u64 __spare[20];
-       struct stmt_str mnt_root;       /* Root of mount relative to
root of fs */
-       struct stmt_str mountpoint;     /* Mountpoint relative to root
of process */
-       struct stmt_str fs_type;        /* Filesystem type[.subtype] */
-       struct stmt_str sb_opts;        /* Super block string options
(nul delimted) */
+       __u64 mnt_root;         /* [str] Root of mount relative to root of fs */
+       __u64 mountpoint;       /* [str] Mountpoint relative to root
of process */
+       __u64 fs_type;          /* [str] Filesystem type */
+       __u64 __spare[49];
 };

 #define STMT_SB_BASIC          0x00000001U     /* Want/got sb_... */
@@ -172,6 +166,8 @@ struct statmnt {
 #define STMT_MNT_ROOT          0x00000008U     /* Want/got mnt_root  */
 #define STMT_MOUNTPOINT                0x00000010U     /* Want/got
mountpoint */
 #define STMT_FS_TYPE           0x00000020U     /* Want/got fs_type */
-#define STMT_SB_OPTS           0x00000040U     /* Want/got sb_opts */
+
+/* listmount(2) flags */
+#define LISTMOUNT_UNREACHABLE  0x01    /* List unreachable mounts too */

 #endif /* _UAPI_LINUX_MOUNT_H */

^ permalink raw reply related	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-19 10:51                                 ` Miklos Szeredi
@ 2023-09-19 12:41                                   ` Christian Brauner
  2023-09-19 12:59                                     ` Miklos Szeredi
  0 siblings, 1 reply; 76+ messages in thread
From: Christian Brauner @ 2023-09-19 12:41 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Matthew House, Miklos Szeredi, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

> >  with __u32 size for mnt_root and mnt_point
> 
> Unnecessary if the strings are nul terminated.

All ok by me so far but how does the kernel know the size of the buffer
to copy into? Wouldn't it be better to allow userspace to specify that?
I'm probably just missing something but I better ask.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-18 20:58             ` Andreas Dilger
@ 2023-09-19 12:50               ` Christian Brauner
  2023-09-20  0:33                 ` Dave Chinner
  0 siblings, 1 reply; 76+ messages in thread
From: Christian Brauner @ 2023-09-19 12:50 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Miklos Szeredi, Miklos Szeredi, Linus Torvalds, linux-fsdevel,
	Linux Kernel Mailing List, linux-api, linux-man,
	linux-security-module, Karel Zak, Ian Kent, David Howells,
	Al Viro, Christian Brauner, Amir Goldstein

On Mon, Sep 18, 2023 at 02:58:00PM -0600, Andreas Dilger wrote:
> On Sep 18, 2023, at 7:51 AM, Christian Brauner <brauner@kernel.org> wrote:
> > 
> > 
> >> The type and subtype are naturally limited to sane sizes, those are
> >> not an issue.
> > 
> > What's the limit for fstype actually? I don't think there is one.
> > There's one by chance but not by design afaict?
> > 
> > Maybe crazy idea:
> > That magic number thing that we do in include/uapi/linux/magic.h
> > is there a good reason for this or why don't we just add a proper,
> > simple enum:
> > 
> > enum {
> > 	FS_TYPE_ADFS        1
> > 	FS_TYPE_AFFS        2
> > 	FS_TYPE_AFS         3
> > 	FS_TYPE_AUTOFS      4
> > 	FS_TYPE_EXT2	    5
> > 	FS_TYPE_EXT3	    6
> > 	FS_TYPE_EXT4	    7
> > 	.
> > 	.
> > 	.
> > 	FS_TYPE_MAX
> > }
> > 
> > that we start returning from statmount(). We can still return both the
> > old and the new fstype? It always felt a bit odd that fs developers to
> > just select a magic number.
> 
> Yes, there is a very good reason that there isn't an enum for filesystem

I think this isn't all that relevant to the patchset so I'm not going to
spend a lot of time on this discussion but I'm curious.

> type, which is because this API would be broken if it encounters any
> filesystem that is not listed there.  Often a single filesystem driver in
> the kernel will have multiple different magic numbers to handle versions,
> endianness, etc.

Why isn't this a problem for magically chosen numbers?

> 
> Having a 32-bit magic number allows decentralized development with low
> chance of collision, and using new filesystems without having to patch
> every kernel for this new API to work with that filesystem.  Also,

We don't care about out of tree filesystems.

> filesystems come and go (though more slowly) over time, and keeping the

Even if we did ever remove a filesystem we'd obviously leave the enum in
place. Same thing we do for deprecated flags, same thing we'd do for
magic numbers.

> full list of every filesystem ever developed in the kernel enum would be
> a headache.

I really don't follow this argument.

> 
> The field in the statmnt() call would need to be at a fixed-size 32-bit
> value in any case, so having it return the existing magic will "just work"
> because userspace tools already know and understand these magic values,
> while introducing an in-kernel enum would be broken for multiple reasons.

We already do expose the magic number in statmount() but it can't
differentiate between ext2, ext3, and ext4 for example which is why I
asked.

Afaict, none of the points you mention are show stoppers and none of
them are unique to an enum.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-19 12:41                                   ` Christian Brauner
@ 2023-09-19 12:59                                     ` Miklos Szeredi
  2023-09-19 13:18                                       ` Christian Brauner
  0 siblings, 1 reply; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-19 12:59 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Matthew House, Miklos Szeredi, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

On Tue, 19 Sept 2023 at 14:41, Christian Brauner <brauner@kernel.org> wrote:
>
> > >  with __u32 size for mnt_root and mnt_point
> >
> > Unnecessary if the strings are nul terminated.
>
> All ok by me so far but how does the kernel know the size of the buffer
> to copy into? Wouldn't it be better to allow userspace to specify that?
> I'm probably just missing something but I better ask.

Because size of the buffer is given as the syscall argument.

  long statmount(u64 mnt_id, u64 mask, struct statmnt __user *buf,
size_t bufsize, unsigned int flags);

If you are still hung up about this not being properly typed, how about this:

struct statmnt {
        __u64 mask;             /* What results were written [uncond] */
        __u32 sb_dev_major;     /* Device ID */
[...]
        __u64 fs_type;          /* [str] Filesystem type */
        __u64 __spare[49];
        char __string_buf[];
};

Such variable length structures are used all over the place, this
isn't some big invention.  The only new thing is that we set pointers
to within the tail part of the buffer, to make the interface work for
the multiple strings case.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-19 12:59                                     ` Miklos Szeredi
@ 2023-09-19 13:18                                       ` Christian Brauner
  0 siblings, 0 replies; 76+ messages in thread
From: Christian Brauner @ 2023-09-19 13:18 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Matthew House, Miklos Szeredi, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

On Tue, Sep 19, 2023 at 02:59:53PM +0200, Miklos Szeredi wrote:
> On Tue, 19 Sept 2023 at 14:41, Christian Brauner <brauner@kernel.org> wrote:
> >
> > > >  with __u32 size for mnt_root and mnt_point
> > >
> > > Unnecessary if the strings are nul terminated.
> >
> > All ok by me so far but how does the kernel know the size of the buffer
> > to copy into? Wouldn't it be better to allow userspace to specify that?
> > I'm probably just missing something but I better ask.
> 
> Because size of the buffer is given as the syscall argument.
> 
>   long statmount(u64 mnt_id, u64 mask, struct statmnt __user *buf,
> size_t bufsize, unsigned int flags);
> 
> If you are still hung up about this not being properly typed, how about this:

I really just wasn't clear how exactly you envisioned this. Your
proposal as is sounds good to me! I'm on board. I prefer the two offsets
as that lets us avoid searching for null bytes. So please leave it as is!
Thanks!

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 3/3] add listmnt(2) syscall
  2023-09-18 13:15       ` Christian Brauner
@ 2023-09-19 16:47         ` Paul Moore
  2023-09-28 10:07           ` Miklos Szeredi
  0 siblings, 1 reply; 76+ messages in thread
From: Paul Moore @ 2023-09-19 16:47 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Miklos Szeredi, Matthew House, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Linus Torvalds, Al Viro, Christian Brauner,
	Amir Goldstein

On Mon, Sep 18, 2023 at 12:52 PM Christian Brauner <brauner@kernel.org> wrote:
> On Sun, Sep 17, 2023 at 04:32:04PM +0200, Miklos Szeredi wrote:
> > On Sun, Sep 17, 2023 at 2:54 AM Matthew House <mattlloydhouse@gmail.com> wrote:
> >
> > > > +       list_for_each_entry(r, &m->mnt_mounts, mnt_child) {
> > > > +               if (!capable(CAP_SYS_ADMIN) &&
>
>
> > Good point.  That issue was nagging at the back of my mind.  Having an
> > explicit flag nicely solves the issue.
>
> Ideally we avoid multiple capable(CAP_SYS_ADMIN) calls by only doing it
> once and saving the return value. capable() calls aren't that cheap.

Agreed.  The capability check doesn't do any subject/object
comparisons so calling it for each mount is overkill.  However, I
would think we would want the LSM hook called from inside the loop as
that could involve a subject (@current) and object (individual mount
point) comparison.

> Plus, we should decide whether this should trigger an audit event or
> not: capable(CAP_SYS_ADMIN) triggers an audit event,
> ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN) wouldn't.

Why would we not want to audit the capable() call?

-- 
paul-moore.com

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-19  8:02                             ` Miklos Szeredi
  2023-09-19  9:07                               ` Christian Brauner
@ 2023-09-19 21:28                               ` Matthew House
  2023-09-20  9:42                                 ` Miklos Szeredi
  1 sibling, 1 reply; 76+ messages in thread
From: Matthew House @ 2023-09-19 21:28 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Christian Brauner, Miklos Szeredi, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

On Tue, Sep 19, 2023 at 4:02 AM Miklos Szeredi <miklos@szeredi.hu> wrote:
> On Tue, 19 Sept 2023 at 02:38, Matthew House <mattlloydhouse@gmail.com> wrote:
>
> > One natural solution is to set either of the two lengths to the expected
> > size if the provided buffers are too small. That way, the caller learns both
> > which of the buffers is too small, and how large they need to be. Replacing
> > a provided size with an expected size in this way already has precedent in
> > existing syscalls:
>
> This is where the thread started.  Knowing the size of the buffer is
> no good, since the needed buffer could change between calls.

As Brauner mentioned, this does not change with the single-buffer
interface. And since changes are not likely to occur extremely frequently,
I feel like it would be better for the caller to only need one retry in the
common case rather than N retries for however many doublings it takes to
fit the whole buffer.

> We are trying to create a simple interface, no?  My proposal would
> need a helper like this:
>
> struct statmnt *statmount(uint64_t mnt_id, uint64_t mask, unsigned int flags)
> {
>         size_t bufsize = 1 << 15;
>         void *buf;
>         int ret;
>
>         for (;;) {
>                 buf = malloc(bufsize <<= 1);
>                 if (!buf)
>                         return NULL;
>                 ret = syscall(__NR_statmnt, mnt_id, mask, buf, bufsize, flags);
>                 if (!ret)
>                         return buf;
>                 free(buf);
>                 if (errno != EOVERFLOW)
>                         return NULL;
>         }
> }
>
> Christian's would be (ignoring .fs_type for now):
>
> int statmount(uint64_t mnt_id, uint64_t mask, struct statmnt *st,
> unsigned int flags)
> {
>         int ret;
>
>         st->mnt_root_size = 1 << 15;
>         st->mountpoint_size = 1 << 15;
>         for (;;) {
>                 st->mnt_root = malloc(st->mnt_root_size <<= 1);
>                 st->mountpoint = malloc(st->mountpoint <<= 1);
>                 if (!st->mnt_root || !st->mountpoint) {
>                         free(st->mnt_root);
>                         free(st->mountpoint);
>                         return -1;
>                 }
>                 ret = syscall(__NR_statmnt, mnt_id, mask, st,
> sizeof(*st), flags);
>                 if (!ret || errno != EOVERFLOW)
>                         return ret;
>                 free(st->mnt_root);
>                 free(st->mountpoint);
>         }
> }
>
> It's not hugely more complex, but more complex nonetheless.
>
> Also having the helper allocate buffers inside the struct could easily
> result in leaks since it's not obvious what the caller needs to free,
> while in the first example it is.

There's nothing stopping the userspace helper from exposing a contiguous
buffer that can be easily freed, even if the kernel API uses a separate-
buffer interface internally. It just takes a bit of addition in the helper
to calculate the correct pointers. To wit:

struct statmnt *statmount(uint64_t mnt_id, uint64_t mask, unsigned int flags)
{
        uint32_t mnt_root_size = PATH_MAX;
        uint32_t mountpoint_size = PATH_MAX;
        struct statmnt *st;
        int ret;

        for (;;) {
                st = malloc(sizeof(*st) + mnt_root_size + mountpoint_size);
                if (!st)
                        return NULL;
                st->mnt_root = (char *)st + sizeof(*st);
                st->mountpoint = (char *)st + sizeof(*st) + mnt_root_size;
                st->mnt_root_size = mnt_root_size;
                st->mountpoint_size = mountpoint_size;
                ret = syscall(__NR_statmnt, mnt_id, mask, st, sizeof(*st),
                              flags);
                if (ret) {
                        free(st);
                        return NULL;
                }
                if (st->mnt_root_size <= mnt_root_size &&
                    st->mountpoint_size <= mountpoint_size)
                        return st;
                mnt_root_size = st->mnt_root_size;
                mountpoint_size = st->mountpoint_size;
                free(st);
        }
}

(This is also far more helpful for users of the returned struct statmnt *,
since they can just dereference the two pointers instead of having to
decode the offsets by hand.)

More generally speaking, the biggest reason I dislike the current single-
buffer interface is that the output is "all or nothing": either the caller
has enough space in the buffer to store every single string, or it's unable
to get any fields at all, just an -EOVERFLOW. There's no room for the
caller to say that it just wants the integer fields and doesn't care about
the strings. Thus, to reliably call statmnt() on an arbitrary mount, the
ability to dynamically allocate memory is effectively mandatory. The only
real solution to this would be additional statx-like flags to select the
returned strings.

Meanwhile, with a separate-buffer interface, where the caller provides a
pointer and capacity for each string, granular output would be trivial: the
caller could just specify NULL/0 for any string it doesn't want, and still
successfully retrieve all the integer fields. This would also work well if
the caller, e.g., wants to set a hard cap of PATH_MAX bytes for each string
(since it's using static buffers), but nonetheless wants to retrieve the
integer fields if a string is too long.

Besides that, if the caller is written in standard C but doesn't want to
use malloc(3) to allocate the buffer, then its helper function must be
written very carefully (with a wrapper struct around the header and data)
to satisfy the aliasing rules, which forbid programs from using a struct
statmnt * pointer to read from a declared char[N] array. In practice,
callers tend to very rarely exercise this proper care with existing single-
buffer interfaces, such as recvmsg(2)'s msg_control buffer, and I would not
be very happy if statmnt() further contributed to this widespread issue.

Thank you,
Matthew House

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-19 12:50               ` Christian Brauner
@ 2023-09-20  0:33                 ` Dave Chinner
  0 siblings, 0 replies; 76+ messages in thread
From: Dave Chinner @ 2023-09-20  0:33 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Andreas Dilger, Miklos Szeredi, Miklos Szeredi, Linus Torvalds,
	linux-fsdevel, Linux Kernel Mailing List, linux-api, linux-man,
	linux-security-module, Karel Zak, Ian Kent, David Howells,
	Al Viro, Christian Brauner, Amir Goldstein

On Tue, Sep 19, 2023 at 02:50:28PM +0200, Christian Brauner wrote:
> On Mon, Sep 18, 2023 at 02:58:00PM -0600, Andreas Dilger wrote:
> > On Sep 18, 2023, at 7:51 AM, Christian Brauner <brauner@kernel.org> wrote:
> > > 
> > > 
> > >> The type and subtype are naturally limited to sane sizes, those are
> > >> not an issue.
> > > 
> > > What's the limit for fstype actually? I don't think there is one.
> > > There's one by chance but not by design afaict?
> > > 
> > > Maybe crazy idea:
> > > That magic number thing that we do in include/uapi/linux/magic.h
> > > is there a good reason for this or why don't we just add a proper,
> > > simple enum:
> > > 
> > > enum {
> > > 	FS_TYPE_ADFS        1
> > > 	FS_TYPE_AFFS        2
> > > 	FS_TYPE_AFS         3
> > > 	FS_TYPE_AUTOFS      4
> > > 	FS_TYPE_EXT2	    5
> > > 	FS_TYPE_EXT3	    6
> > > 	FS_TYPE_EXT4	    7
> > > 	.
> > > 	.
> > > 	.
> > > 	FS_TYPE_MAX
> > > }
> > > 
> > > that we start returning from statmount(). We can still return both the
> > > old and the new fstype? It always felt a bit odd that fs developers to
> > > just select a magic number.
> > 
> > Yes, there is a very good reason that there isn't an enum for filesystem
> 
> I think this isn't all that relevant to the patchset so I'm not going to
> spend a lot of time on this discussion but I'm curious.
> 
> > type, which is because this API would be broken if it encounters any
> > filesystem that is not listed there.  Often a single filesystem driver in
> > the kernel will have multiple different magic numbers to handle versions,
> > endianness, etc.
> 
> Why isn't this a problem for magically chosen numbers?

What problem are you asking about? The 32 bit space that contains
a few hundred magic numbers remains a vast field of empty space that
makes collisions easy to avoid....

> > Having a 32-bit magic number allows decentralized development with low
> > chance of collision, and using new filesystems without having to patch
> > every kernel for this new API to work with that filesystem.  Also,
> 
> We don't care about out of tree filesystems.

In this case, we most certainly do care. Downstream distros support
all sorts of out of tree filesystems loaded via kernel modules, so a
syscall that is used to uniquely identify a filesystem type to
userspace *must* have a mechanism for the filesystem to provide that
unique identifier to userspace.

Fundamentally, the kernel does not and should not dictate what
filesystem types it supports; the user decides what filesystem they
need to use, and it is the kernel's job to provide infrastructure
that works with that user's choice.

Remember: it's not just applications that stat the mounted
filesystem that know about the filesystem magic numbers.  Apps like
grub, libblkid, etc all look at filesystem magic numbers directly on
the block device to identify the type of filesystem that is on the
device.

If we introduce a new identifier specific to mounted kernel
filesystems, these sorts of apps now need to use two different
identifiers in different contexts instead of the same magic number
everywhere. That's not a win for anyone.

Magic numbers are also portable - it does not matter what OS you see
that FS on, it has the same unique, stable type identifier. You can
look at the block device and identify the filesystem by its magic
number, you can stat the mounted filesystem and get the same magic
number. It just works the same *everywhere*.

Magic numbers have served the purpose of being unique filesystem
identifiers for over 40 years. They work just fine for this purpose
and nothing has changed in the past couple of decades that has
broken them or needs fixing.

> > filesystems come and go (though more slowly) over time, and keeping the
> 
> Even if we did ever remove a filesystem we'd obviously leave the enum in
> place. Same thing we do for deprecated flags, same thing we'd do for
> magic numbers.

So why try to replace magic numbers if we must replicate all the
same unique, stable behaviour that magic numbers already provide the
kernel and userspace with?

>
> > full list of every filesystem ever developed in the kernel enum would be
> > a headache.
> 
> I really don't follow this argument.

The kernel currently doesn't need to know about all the potential
fuse filesystem types that can be mounted. It doesn't need to know
about all the 3rd party filesystems that could be mounted. These all
just work and userspace can identify them just fine via their unique
magic numbers that are passed through the kernel interfaces from the
filesystem.

The enum proposal breaks these existing working use cases unless
the enum explicitly includes every possible filesystem type that the
kernel might expose to userspace. The kernel *should not care* what
filesystems it exposes to userspace and that's the whole point of using
a filesystem supplied magic number as the unique identifier for the
filesystem...

> > The field in the statmnt() call would need to be at a fixed-size 32-bit
> > value in any case, so having it return the existing magic will "just work"
> > because userspace tools already know and understand these magic values,
> > while introducing an in-kernel enum would be broken for multiple reasons.
> 
> We already do expose the magic number in statmount() but it can't
> differentiate between ext2, ext3, and ext4 for example which is why I
> asked.

That's just an extN quirk, and it's trivial to fix for the new
interface. Define new magic numbers for ext3 and ext4 and only use
them in the new interface, leave the old interfaces using the ext2
magic number for all of them.

-Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-19 21:28                               ` Matthew House
@ 2023-09-20  9:42                                 ` Miklos Szeredi
  2023-09-20 13:26                                   ` Matthew House
  0 siblings, 1 reply; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-20  9:42 UTC (permalink / raw)
  To: Matthew House
  Cc: Christian Brauner, Miklos Szeredi, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

On Tue, 19 Sept 2023 at 23:28, Matthew House <mattlloydhouse@gmail.com> wrote:

> More generally speaking, the biggest reason I dislike the current single-
> buffer interface is that the output is "all or nothing": either the caller
> has enough space in the buffer to store every single string, or it's unable
> to get any fields at all, just an -EOVERFLOW. There's no room for the
> caller to say that it just wants the integer fields and doesn't care about
> the strings. Thus, to reliably call statmnt() on an arbitrary mount, the
> ability to dynamically allocate memory is effectively mandatory. The only
> real solution to this would be additional statx-like flags to select the
> returned strings.

It's already there:

#define STMT_MNT_ROOT 0x00000008U /* Want/got mnt_root  */
#define STMT_MNT_POINT 0x00000010U /* Want/got mnt_point */
#define STMT_FS_TYPE 0x00000020U /* Want/got fs_type */

For example, it's perfectly fine to do the following, and it's
guaranteed not to return EOVERFLOW:

        struct statmnt st;
        unsigned int mask = STMT_SB_BASIC | STMT_MNT_BASIC;

        ret = statmount(mnt_id, mask, &st, sizeof(st), flags);

> Besides that, if the caller is written in standard C but doesn't want to
> use malloc(3) to allocate the buffer, then its helper function must be
> written very carefully (with a wrapper struct around the header and data)
> to satisfy the aliasing rules, which forbid programs from using a struct
> statmnt * pointer to read from a declared char[N] array.

I think you interpret aliasing rules incorrectly.  The issue with
aliasing is if you access the same piece of memory through different
types.  Which is not the case here.  In fact with the latest
incarnation of the interface[1] there's no need to access the
underlying buffer at all:

        printf("mnt_root: <%s>\n", st->str + st->mnt_root);

So the following is perfectly safe to do (as long as you don't care
about buffer overflow):

        char buf[10000];
        struct statmnt *st = (void *) buf;

        ret = statmount(mnt_id, mask, st, sizeof(buf), flags);

If you do care about handling buffer overflows, then dynamic
allocation is the only sane way.

And before you dive into how this is going to be horrible because the
buffer size needs to be doubled an unknown number of times, think a
bit:  have you *ever* seen a line in /proc/self/mountinfo longer than
say 1000 characters?   So if the buffer starts out at 64k, how often
will this doubling happen?   Right: practically never.  Adding
complexity to handle this case is nonsense, as I've said many times.
And there is definitely nonzero complexity involved (just see the
special casing in getxattr and listxattr implementations all over the
place).

Thanks,
Miklos

[1] git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs.git#statmount-v2

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-18 14:29         ` Jeff Layton
  2023-09-18 14:35           ` Christian Brauner
@ 2023-09-20  9:43           ` David Laight
  1 sibling, 0 replies; 76+ messages in thread
From: David Laight @ 2023-09-20  9:43 UTC (permalink / raw)
  To: 'Jeff Layton', Christian Brauner, Miklos Szeredi
  Cc: Miklos Szeredi, Linus Torvalds, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Al Viro, Christian Brauner, Amir Goldstein

From: Jeff Layton
> Sent: 18 September 2023 15:30
....
> A bit tangential to this discussion, but one thing we could consider is
> adding something like a mnt_change_cookie field that increments on any
> significant changes on the mount: i.e. remounts with new options,
> changes to parentage or propagation, etc.
> 
> That might make it more palatable to do something with separate syscalls
> for the string-based fields. You could do:
> 
> statmnt(...);
> getmntattr(mnt, "mnt.fstype", ...);
> statmnt(...);
> 
> ...and then if the mnt_change_cookie hasn't changed, you know that the
> string option was stable during that window.

That would also help with the problem of the mount options
being changed while processing a page fault on the user buffer.

Or, with a call to just get the cookie, you could find that nothing
has changed so there is no point looking again.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-20  9:42                                 ` Miklos Szeredi
@ 2023-09-20 13:26                                   ` Matthew House
  2023-09-21  7:34                                     ` Miklos Szeredi
  0 siblings, 1 reply; 76+ messages in thread
From: Matthew House @ 2023-09-20 13:26 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Christian Brauner, Miklos Szeredi, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

On Wed, Sep 20, 2023 at 5:42 AM Miklos Szeredi <miklos@szeredi.hu> wrote:
> On Tue, 19 Sept 2023 at 23:28, Matthew House <mattlloydhouse@gmail.com> wrote:
>
> > More generally speaking, the biggest reason I dislike the current single-
> > buffer interface is that the output is "all or nothing": either the caller
> > has enough space in the buffer to store every single string, or it's unable
> > to get any fields at all, just an -EOVERFLOW. There's no room for the
> > caller to say that it just wants the integer fields and doesn't care about
> > the strings. Thus, to reliably call statmnt() on an arbitrary mount, the
> > ability to dynamically allocate memory is effectively mandatory. The only
> > real solution to this would be additional statx-like flags to select the
> > returned strings.
>
> It's already there:
>
> #define STMT_MNT_ROOT 0x00000008U /* Want/got mnt_root  */
> #define STMT_MNT_POINT 0x00000010U /* Want/got mnt_point */
> #define STMT_FS_TYPE 0x00000020U /* Want/got fs_type */
>
> For example, it's perfectly fine to do the following, and it's
> guaranteed not to return EOVERFLOW:
> 
>         struct statmnt st;
>         unsigned int mask = STMT_SB_BASIC | STMT_MNT_BASIC;
> 
>         ret = statmount(mnt_id, mask, &st, sizeof(st), flags);

Whoops, my apologies; perhaps I should try to learn to read for once. (I
just saw the undecorated sequence of stmt_numeric() and stmt_string() calls
and didn't notice the early exits within the functions.) I withdraw that
particular objection.

> > Besides that, if the caller is written in standard C but doesn't want to
> > use malloc(3) to allocate the buffer, then its helper function must be
> > written very carefully (with a wrapper struct around the header and data)
> > to satisfy the aliasing rules, which forbid programs from using a struct
> > statmnt * pointer to read from a declared char[N] array.
>
> I think you interpret aliasing rules incorrectly.  The issue with
> > aliasing is if you access the same piece of memory through different
> types.  Which is not the case here.  In fact with the latest
> incarnation of the interface[1] there's no need to access the
> underlying buffer at all:
>
>         printf("mnt_root: <%s>\n", st->str + st->mnt_root);
>
> So the following is perfectly safe to do (as long as you don't care
> about buffer overflow):
>
>         char buf[10000];
>         struct statmnt *st = (void *) buf;
>
>         ret = statmount(mnt_id, mask, st, sizeof(buf), flags);

The declared type of a variable *is* one of the different types, as far as
the aliasing rules are concerned. In C17, section 6.5 ("Expressions"):

> The *effective type* of an object for an access to its stored value is
> the declared type of the object, if any. [More rules about objects with
> no declared type, i.e., those created with malloc(3) or realloc(3)...]
>
> An object shall have its stored value accessed only by an lvalue
> expression that has one of the following types:
>
> -- a type compatible with the effective type of the object,
>
> -- a qualified version of a type compatible with the effective type of
>    the object,
>
> -- a type that is the signed or unsigned type corresponding to the
>    effective type of the object,
>
> -- a type that is the signed or unsigned type corresponding to a
>    qualified version of the effective type of the object,
>
> -- an aggregate or union type that includes one of the aforementioned
>    types among its members (including, recursively, a member of a
>    subaggregate or contained union), or
>
> -- a character type.

In this case, buf is declared in the program as a char[10000] array, so the
declared type of each element is char, and the effective type of each
element is also char. If we want to access, say, st->mnt_id, the lvalue
expression has type __u64, and it tries to access 8 of the char objects.
However, the integer type that __u64 expands to doesn't meet any of those
criteria, so the aliasing rules are violated and the behavior is undefined.

(The statmount() helper could in theory avoid UB by saying the struct
statmnt object is stored in the buffer as if by memcpy(3), but it would
still be UB for the caller to access the fields of that pointer directly
instead of memcpy'ing them back out of the buffer. And practically no one
does that in the real world.)

It's a common misconception that the aliasing rules as written are about
accessing the same object through two different pointer types. That
corollary is indeed what compilers mainly care about, but the C/C++
standards further say that objects in memory "remember" the types they were
created with, and they demand that programs respect objects' original types
when trying to access them (except when accessing their raw representations
via a pointer of character type).

> If you do care about handling buffer overflows, then dynamic
> allocation is the only sane way.
>
> And before you dive into how this is going to be horrible because the
> buffer size needs to be doubled an unknown number of times, think a
> bit:  have you *ever* seen a line in /proc/self/mountinfo longer than
> say 1000 characters?   So if the buffer starts out at 64k, how often
> will this doubling happen?   Right: practically never.  Adding
> complexity to handle this case is nonsense, as I've said many times.
> And there is definitely nonzero complexity involved (just see the
> special casing in getxattr and listxattr implementations all over the
> place).
>
> Thanks,
> Miklos

I've always felt that capacity doubling is a bit wasteful, but it's
definitely something I can live with, especially if providing size feedback
is as complex as you suggest. Still, I'm not a big fan of single-buffer
interfaces in general, with how poorly they tend to interact with C's
aliasing rules. (Also, those kinds of interfaces invite alignment
errors: for instance, your snippet above is missing the necessary union to
prevent the buffer from being misaligned, which would cause UB when you
cast it to a struct statmnt *.)

Thank you,
Matthew House

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-20 13:26                                   ` Matthew House
@ 2023-09-21  7:34                                     ` Miklos Szeredi
  0 siblings, 0 replies; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-21  7:34 UTC (permalink / raw)
  To: Matthew House
  Cc: Christian Brauner, Miklos Szeredi, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

On Wed, 20 Sept 2023 at 15:26, Matthew House <mattlloydhouse@gmail.com> wrote:

> The declared type of a variable *is* one of the different types, as far as
> the aliasing rules are concerned. In C17, section 6.5 ("Expressions"):
>
> > The *effective type* of an object for an access to its stored value is
> > the declared type of the object, if any. [More rules about objects with
> > no declared type, i.e., those created with malloc(3) or realloc(3)...]
> >
> > An object shall have its stored value accessed only by an lvalue
> > expression that has one of the following types:
> >
> > -- a type compatible with the effective type of the object,
> >
> > -- a qualified version of a type compatible with the effective type of
> >    the object,
> >
> > -- a type that is the signed or unsigned type corresponding to the
> >    effective type of the object,
> >
> > -- a type that is the signed or unsigned type corresponding to a
> >    qualified version of the effective type of the object,
> >
> > -- an aggregate or union type that includes one of the aforementioned
> >    types among its members (including, recursively, a member of a
> >    subaggregate or contained union), or
> >
> > -- a character type.
>
> In this case, buf is declared in the program as a char[10000] array, so the
> declared type of each element is char, and the effective type of each
> element is also char. If we want to access, say, st->mnt_id, the lvalue
> expression has type __u64, and it tries to access 8 of the char objects.
> However, the integer type that __u64 expands to doesn't meet any of those
> criteria, so the aliasing rules are violated and the behavior is undefined.

Some of the above is new information for me.

However for all practical purposes the code doesn't violate aliasing
rules.  Even the most aggressive "-Wstrict-aliasing=1" doesn't trigger
a warning.  I guess this is because gcc takes the definition to be
symmetric, i.e. anything may safely be aliased to a char pointer and a
char pointer may safely be aliased to anything.  I'm not saying that
that is what the language definition says, just that gcc interprets
the language definition that way.  Also plain "-Wstrict-aliasing"
doesn't trigger even if the type of the array is not char, because gcc
tries hard not to warn about cases where there's no dereference of the
aliased pointer.  This is consistent with what I said and what the gcc
manpage says:  only accesses count, declarations don't.

>
> I've always felt that capacity doubling is a bit wasteful, but it's
> definitely something I can live with, especially if providing size feedback
> is as complex as you suggest. Still, I'm not a big fan of single-buffer
> interfaces in general, with how poorly they tend to interact with C's
> aliasing rules. (Also, those kinds of interfaces also invite alignment
> errors: for instance, your snippet above is missing the necessary union to
> prevent the buffer from being misaligned, which would cause UB when you
> cast it to a struct statmnt *.)

Okay, alignment is a different story.   I'll note this in the man page.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-13 15:22 ` [RFC PATCH 2/3] add statmnt(2) syscall Miklos Szeredi
                     ` (3 preceding siblings ...)
  2023-09-17 18:18   ` Sargun Dhillon
@ 2023-09-25 12:57   ` Arnd Bergmann
  2023-09-25 13:04     ` Christian Brauner
  4 siblings, 1 reply; 76+ messages in thread
From: Arnd Bergmann @ 2023-09-25 12:57 UTC (permalink / raw)
  To: Miklos Szeredi, linux-fsdevel
  Cc: linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Linus Torvalds,
	Alexander Viro, Christian Brauner, Amir Goldstein

On Wed, Sep 13, 2023, at 17:22, Miklos Szeredi wrote:

>  asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
>  				struct statfs64 __user *buf);
> +asmlinkage long sys_statmnt(u64 mnt_id, u64 mask,
> +			    struct statmnt __user *buf, size_t bufsize,
> +			    unsigned int flags);

This definition is problematic on 32-bit architectures for two
reasons:

- 64-bit register arguments are passed in pairs of registers
  on two architectures, so anything passing those needs to
  have a separate entry point for compat syscalls on 64-bit
  architectures. I would suggest also using the same one on
  32-bit ones, so you don't rely on the compiler splitting
  up the long arguments into pairs.

- There is a limit of six argument registers for system call
  entry points, but with two pairs and three single registers
  you end up with seven of them.

The listmnt syscall in patch 3 also has the first problem,
but not the second.

      Arnd

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-25 12:57   ` Arnd Bergmann
@ 2023-09-25 13:04     ` Christian Brauner
  2023-09-25 13:13       ` Miklos Szeredi
  0 siblings, 1 reply; 76+ messages in thread
From: Christian Brauner @ 2023-09-25 13:04 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, linux-api,
	linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Linus Torvalds, Alexander Viro, Christian Brauner,
	Amir Goldstein

On Mon, Sep 25, 2023 at 02:57:31PM +0200, Arnd Bergmann wrote:
> On Wed, Sep 13, 2023, at 17:22, Miklos Szeredi wrote:
> 
> >  asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
> >  				struct statfs64 __user *buf);
> > +asmlinkage long sys_statmnt(u64 mnt_id, u64 mask,
> > +			    struct statmnt __user *buf, size_t bufsize,
> > +			    unsigned int flags);
> 
> This definition is problematic on 32-bit architectures for two
> reasons:
> 
> - 64-bit register arguments are passed in pairs of registers
>   on two architectures, so anything passing those needs to
>   have a separate entry point for compat syscalls on 64-bit
>   architectures. I would suggest also using the same one on
>   32-bit ones, so you don't rely on the compiler splitting
>   up the long arguments into pairs.
> 
> - There is a limit of six argument registers for system call
>   entry points, but with two pairs and three single registers
>   you end up with seven of them.
> 
> The listmnt syscall in patch 3 also has the first problem,
> but not the second.

Both fields could also just be moved into the struct itself just like we
did for clone3() and others.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-25 13:04     ` Christian Brauner
@ 2023-09-25 13:13       ` Miklos Szeredi
  2023-09-25 13:19         ` Christian Brauner
  0 siblings, 1 reply; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-25 13:13 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Arnd Bergmann, Miklos Szeredi, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Linus Torvalds, Alexander Viro, Christian Brauner,
	Amir Goldstein

On Mon, 25 Sept 2023 at 15:04, Christian Brauner <brauner@kernel.org> wrote:
>
> On Mon, Sep 25, 2023 at 02:57:31PM +0200, Arnd Bergmann wrote:
> > On Wed, Sep 13, 2023, at 17:22, Miklos Szeredi wrote:
> >
> > >  asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
> > >                             struct statfs64 __user *buf);
> > > +asmlinkage long sys_statmnt(u64 mnt_id, u64 mask,
> > > +                       struct statmnt __user *buf, size_t bufsize,
> > > +                       unsigned int flags);
> >
> > This definition is problematic on 32-bit architectures for two
> > reasons:
> >
> > - 64-bit register arguments are passed in pairs of registers
> >   on two architectures, so anything passing those needs to
> >   have a separate entry point for compat syscalls on 64-bit
> >   architectures. I would suggest also using the same one on
> >   32-bit ones, so you don't rely on the compiler splitting
> >   up the long arguments into pairs.
> >
> > - There is a limit of six argument registers for system call
> >   entry points, but with two pairs and three single registers
> >   you end up with seven of them.
> >
> > The listmnt syscall in patch 3 also has the first problem,
> > but not the second.
>
> Both fields could also just be moved into the struct itself just like we
> did for clone3() and others.

Let's not mix in and out args, please.

How about passing u64 *?

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-25 13:13       ` Miklos Szeredi
@ 2023-09-25 13:19         ` Christian Brauner
  2023-09-25 13:20           ` Miklos Szeredi
  0 siblings, 1 reply; 76+ messages in thread
From: Christian Brauner @ 2023-09-25 13:19 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Arnd Bergmann, Miklos Szeredi, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Linus Torvalds, Alexander Viro, Christian Brauner,
	Amir Goldstein

> How about passing u64 *?

struct statmnt_req {
        __u64 mnt_id;
	__u64 mask;
};

?

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-25 13:19         ` Christian Brauner
@ 2023-09-25 13:20           ` Miklos Szeredi
  2023-09-25 15:46             ` Arnd Bergmann
  2023-09-27  8:46             ` Miklos Szeredi
  0 siblings, 2 replies; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-25 13:20 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Arnd Bergmann, Miklos Szeredi, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Linus Torvalds, Alexander Viro, Christian Brauner,
	Amir Goldstein

On Mon, 25 Sept 2023 at 15:19, Christian Brauner <brauner@kernel.org> wrote:
>
> > How about passing u64 *?
>
> struct statmnt_req {
>         __u64 mnt_id;
>         __u64 mask;
> };
>
> ?

I'm fine with that as well.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-25 13:20           ` Miklos Szeredi
@ 2023-09-25 15:46             ` Arnd Bergmann
  2023-09-26 10:05               ` Christian Brauner
  2023-09-27  8:46             ` Miklos Szeredi
  1 sibling, 1 reply; 76+ messages in thread
From: Arnd Bergmann @ 2023-09-25 15:46 UTC (permalink / raw)
  To: Miklos Szeredi, Christian Brauner
  Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, linux-api,
	linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Linus Torvalds, Alexander Viro, Christian Brauner,
	Amir Goldstein

On Mon, Sep 25, 2023, at 15:20, Miklos Szeredi wrote:
> On Mon, 25 Sept 2023 at 15:19, Christian Brauner <brauner@kernel.org> wrote:
>>
>> > How about passing u64 *?
>>
>> struct statmnt_req {
>>         __u64 mnt_id;
>>         __u64 mask;
>> };
>>
>> ?
>
> I'm fine with that as well.

Yes, this looks fine for the compat syscall purpose.

Not sure if losing visibility of the mnt_id and mask in ptrace
or seccomp/bpf is a problem though.

    Arnd

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-25 15:46             ` Arnd Bergmann
@ 2023-09-26 10:05               ` Christian Brauner
  0 siblings, 0 replies; 76+ messages in thread
From: Christian Brauner @ 2023-09-26 10:05 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Miklos Szeredi, Miklos Szeredi, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Linus Torvalds, Alexander Viro, Christian Brauner,
	Amir Goldstein

On Mon, Sep 25, 2023 at 05:46:59PM +0200, Arnd Bergmann wrote:
> On Mon, Sep 25, 2023, at 15:20, Miklos Szeredi wrote:
> > On Mon, 25 Sept 2023 at 15:19, Christian Brauner <brauner@kernel.org> wrote:
> >>
> >> > How about passing u64 *?
> >>
> >> struct statmnt_req {
> >>         __u64 mnt_id;
> >>         __u64 mask;
> >> };
> >>
> >> ?
> >
> > I'm fine with that as well.
> 
> Yes, this looks fine for the compat syscall purpose.
> 
> Not sure if losing visibility of the mnt_id and mask in ptrace
> or seccomp/bpf is a problem though.

It's an information retrieval syscall so there shouldn't be any need to
block it and I think that this ship has sailed in general. Container
workloads should migrate from seccomp to landlock if they need to filter
system calls like this.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-18 14:14             ` Miklos Szeredi
  2023-09-18 14:24               ` Christian Brauner
@ 2023-09-26 13:48               ` Florian Weimer
  2023-09-26 14:06                 ` Miklos Szeredi
  2023-09-26 14:13                 ` Christian Brauner
  1 sibling, 2 replies; 76+ messages in thread
From: Florian Weimer @ 2023-09-26 13:48 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Christian Brauner, Miklos Szeredi, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

* Miklos Szeredi:

> On Mon, Sep 18, 2023 at 3:51 PM Christian Brauner <brauner@kernel.org> wrote:
>
>> I really would prefer a properly typed struct and that's what everyone
>> was happy with in the session as well. So I would not like to change the
>> main parameters.
>
> I completely  agree.  Just would like to understand this point:
>
>   struct statmnt *statmnt(u64 mntid, u64 mask, unsigned int flags);
>
> What's not properly typed about this interface?
>
> I guess the answer is that it's not a syscall interface, which will
> have an added [void *buf, size_t bufsize], while the buffer sizing is
> done by a simple libc wrapper.
>
> Do you think that's a problem?  If so, why?

Try-and-resize interfaces can be quite bad for data obtained from the
network.  If the first call provides the minimum buffer size (like
getgroups, but unlike readlink or the glibc *_r interfaces for NSS),
this could at least allow us to avoid allocating too much.  In
userspace, we cannot reduce the size of the heap allocation without
knowing where the pointers are and what they mean.

I also don't quite understand the dislike of variable-sized records.
Don't getdents, inotify, Netlink all use them?  And I think at least for
Netlink, more stuff is added all the time?

Thanks,
Florian


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-26 13:48               ` Florian Weimer
@ 2023-09-26 14:06                 ` Miklos Szeredi
  2023-09-26 14:19                   ` Florian Weimer
  2023-09-26 14:13                 ` Christian Brauner
  1 sibling, 1 reply; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-26 14:06 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Miklos Szeredi, Christian Brauner, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

On Tue, 26 Sept 2023 at 15:49, Florian Weimer <fweimer@redhat.com> wrote:
>
> * Miklos Szeredi:
>
> > On Mon, Sep 18, 2023 at 3:51 PM Christian Brauner <brauner@kernel.org> wrote:
> >
> >> I really would prefer a properly typed struct and that's what everyone
> >> was happy with in the session as well. So I would not like to change the
> >> main parameters.
> >
> > I completely  agree.  Just would like to understand this point:
> >
> >   struct statmnt *statmnt(u64 mntid, u64 mask, unsigned int flags);
> >
> > What's not properly typed about this interface?
> >
> > I guess the answer is that it's not a syscall interface, which will
> > have an added [void *buf, size_t bufsize], while the buffer sizing is
> > done by a simple libc wrapper.
> >
> > Do you think that's a problem?  If so, why?
>
> Try-and-resize interfaces can be quite bad for data obtained from the
> network.

In this particular case it's all local information.

>  If the first call provides the minimum buffer size (like
> getgroups, but unlike readlink or the glibc *_r interfaces for NSS),
> this could at least allow us to avoid allocating too much.  In
> userspace, we cannot reduce the size of the heap allocation without
> knowing where the pointers are and what they mean.

Does it matter if the heap allocation is say 32k instead of 589 bytes?
The returned strings are not limited in size, but are quite unlikely
to be over PATH_MAX.

E.g. getdents apparently uses 32k buffers, which is really a tiny
amount of heap these days, but more than enough for the purpose.  Not
sure if this is hard coded into libc or if it's the result of some
heuristic based on available memory, but I don't see why similar
treatment couldn't be applied to the statmount(2) syscall.

> I also don't quite understand the dislike of variable-sized records.
> Don't getdents, inotify, Netlink all use them?  And I think at least for
> Netlink, more stuff is added all the time?

What do you mean by variable sized records?

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-26 13:48               ` Florian Weimer
  2023-09-26 14:06                 ` Miklos Szeredi
@ 2023-09-26 14:13                 ` Christian Brauner
  1 sibling, 0 replies; 76+ messages in thread
From: Christian Brauner @ 2023-09-26 14:13 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Miklos Szeredi, Miklos Szeredi, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

> I also don't quite understand the dislike of variable-sized records.
> Don't getdents, inotify, Netlink all use them?  And I think at least for
> Netlink, more stuff is added all the time?

Netlink is absolutely atrocious to work with because everything is
variable sized and figuring out the correct allocation size is a
complete nightmare even with the "helpful" macros that are provided.

The bigger problem however is the complete untypedness even of the most
basic things. For example, retrieving the mtu of a network interface
through netlink is a complete nightmare. getdents, inotify, fanotify,
open_by_handle_at()'s struct file_handle are all fine. But let's
absolutely not take netlink as a model for anything related to mounts.

And no one is against variable sized records per se. I think we're
coming to a good compromise here.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-26 14:06                 ` Miklos Szeredi
@ 2023-09-26 14:19                   ` Florian Weimer
  2023-09-26 14:33                     ` Miklos Szeredi
  2023-09-26 14:36                     ` Christian Brauner
  0 siblings, 2 replies; 76+ messages in thread
From: Florian Weimer @ 2023-09-26 14:19 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Miklos Szeredi, Christian Brauner, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

* Miklos Szeredi:

>> Try-and-resize interfaces can be quite bad for data obtained from the
>> network.
>
> In this particular case it's all local information.

That's good.

>>  If the first call provides the minimum buffer size (like
>> getgroups, but unlike readlink or the glibc *_r interfaces for NSS),
>> this could at least allow us to avoid allocating too much.  In
>> userspace, we cannot reduce the size of the heap allocation without
>> knowing where the pointers are and what they mean.
>
> Does it matter if the heap allocation is say 32k instead of 589bytes?
>  The returned strings are not limited in size, but are quite unlikely
> to be over PATH_MAX.

It matters if the application needs to keep a copy.

> E.g. getdents apparently uses 32k buffers, which is really a tiny
> amount of heap these days, but more than enough for the purpose.  Not
> sure if this is hard coded into libc or if it's the result of some
> heuristic based on available memory, but I don't see why similar
> treatment couldn't be applied to the statmount(2) syscall.

getdents gets away with this buffer size because applications can copy
out all the data from struct dirent if they need long-term storage.
They have to do that because the usual readdir interface overwrites the
buffer, potentially at the next readdir call.  This means the buffer
size does not introduce an amount of memory fragmentation that is
dependent on the directory size.

With an opaque, pointer-carrying struct, copying out the data is not
possible in a generic fashion.  Only the parts that the application
knows about can be copied out.  So I think it's desirable to have a
fairly exact allocation.

>> I also don't quite understand the dislike of variable-sized records.
>> Don't getdents, inotify, Netlink all use them?  And I think at least for
>> Netlink, more stuff is added all the time?
>
> What do you mean by variable sized records?

Iterating through d_reclen-sized subobjects (for getdents).

Thanks,
Florian


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-26 14:19                   ` Florian Weimer
@ 2023-09-26 14:33                     ` Miklos Szeredi
  2023-09-26 14:39                       ` Florian Weimer
  2023-09-26 14:36                     ` Christian Brauner
  1 sibling, 1 reply; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-26 14:33 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Miklos Szeredi, Christian Brauner, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

On Tue, 26 Sept 2023 at 16:19, Florian Weimer <fweimer@redhat.com> wrote:

> getdents gets away with this buffer size because applications can copy
> out all the data from struct dirent if they need long-term storage.
> They have to do that because the usual readdir interface overwrites the
> buffer, potentially at the next readdir call.  This means the buffer
> size does not introduce an amount of memory fragmentation that is
> dependent on the directory size.
>
> With an opaque, pointer-carrying struct, copying out the data is not
> possible in a generic fashion.  Only the parts that the application
> knows about can be copied out.  So I think it's desirable to have a
> fairly exact allocation.

Okay, so let's add a 'size' field to the struct, which is set to the
size used (as opposed to the size of the buffer).   That should solve
copying without wasting a single byte of memory.

Otherwise the format is fully copyable, since the strings are denoted
with an offset, which doesn't change after the buffer is copied.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-26 14:19                   ` Florian Weimer
  2023-09-26 14:33                     ` Miklos Szeredi
@ 2023-09-26 14:36                     ` Christian Brauner
  1 sibling, 0 replies; 76+ messages in thread
From: Christian Brauner @ 2023-09-26 14:36 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Miklos Szeredi, Miklos Szeredi, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

> With an opaque, pointer-carrying struct, copying out the data is not
> possible in a generic fashion.  Only the parts that the application
> knows about can be copied out.  So I think it's desirable to have a
> fairly exact allocation.

This could easily be added if we added size parameters like I originally
suggested for the variable sized mnt_root and mnt_point records into
struct statmount.

If the user specified that they want to retrieve the mnt_root and
mnt_mountpoint in @mask and the size for the relevant field is zero then
we fill in the required size for the relevant field. If they aren't zero
we just try to copy in the data in the relevant pointer field.

I prefer this interface as it allows for both strategies:

* users that don't care about exact allocation size can just pass a
  guesstimated buffer usually PATH_MAX/2 or sm
* users that care about exact allocation size can query the kernel

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-26 14:33                     ` Miklos Szeredi
@ 2023-09-26 14:39                       ` Florian Weimer
  0 siblings, 0 replies; 76+ messages in thread
From: Florian Weimer @ 2023-09-26 14:39 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Miklos Szeredi, Christian Brauner, Linus Torvalds, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Al Viro, Christian Brauner,
	Amir Goldstein

* Miklos Szeredi:

> On Tue, 26 Sept 2023 at 16:19, Florian Weimer <fweimer@redhat.com> wrote:
>
>> getdents gets away with this buffer size because applications can copy
>> out all the data from struct dirent if they need long-term storage.
>> They have to do that because the usual readdir interface overwrites the
>> buffer, potentially at the next readdir call.  This means the buffer
>> size does not introduce an amount of memory fragmentation that is
>> dependent on the directory size.
>>
>> With an opaque, pointer-carrying struct, copying out the data is not
>> possible in a generic fashion.  Only the parts that the application
>> knows about can be copied out.  So I think it's desirable to have a
>> fairly exact allocation.
>
> Okay, so let's add a 'size' field to the struct, which is set to the
> size used (as opposed to the size of the buffer).   That should solve
> copying without wasting a single byte of memory.

That would be helpful.

> Otherwise the format is fully copyable, since the strings are denoted
> with an offset, which doesn't change after the buffer is copied.

I missed the development in that direction.  Yes, offsets would work
nicely in this context.  They help with compat syscalls, too.

If the buffer is relocatable like that, we can even try first with a
reasonably sized on-stack buffer and create an exactly-sized heap
allocation from that.

Thanks,
Florian


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 2/3] add statmnt(2) syscall
  2023-09-25 13:20           ` Miklos Szeredi
  2023-09-25 15:46             ` Arnd Bergmann
@ 2023-09-27  8:46             ` Miklos Szeredi
  1 sibling, 0 replies; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-27  8:46 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Arnd Bergmann, Miklos Szeredi, linux-fsdevel, linux-kernel,
	linux-api, linux-man, linux-security-module, Karel Zak, Ian Kent,
	David Howells, Linus Torvalds, Alexander Viro, Christian Brauner,
	Amir Goldstein

On Mon, 25 Sept 2023 at 15:20, Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> On Mon, 25 Sept 2023 at 15:19, Christian Brauner <brauner@kernel.org> wrote:
> >
> > > How about passing u64 *?
> >
> > struct statmnt_req {
> >         __u64 mnt_id;
> >         __u64 mask;
> > };
> >
> > ?
>
> I'm fine with that as well.

So after a bit more thinking: this is okay to make life easier for
32bit archs, but only on the kernel ABI.

On the library API the args should *not* be multiplexed, as it's just
a pointless complication.  This is just an internal implementation
detail for the sake of legacy architectures, instead of being good API
design.

And because it's an internal thingy, my feeling is that this struct
could be reused for passing mnt_id to listmount(2) as well, despite
the fact that the mask would be unused.   But I'm ready to be
convinced otherwise...

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 3/3] add listmnt(2) syscall
  2023-09-19 16:47         ` Paul Moore
@ 2023-09-28 10:07           ` Miklos Szeredi
  2023-10-04 19:22             ` Paul Moore
  0 siblings, 1 reply; 76+ messages in thread
From: Miklos Szeredi @ 2023-09-28 10:07 UTC (permalink / raw)
  To: Paul Moore
  Cc: Christian Brauner, Miklos Szeredi, Matthew House, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Linus Torvalds, Al Viro,
	Christian Brauner, Amir Goldstein

On Tue, 19 Sept 2023 at 18:48, Paul Moore <paul@paul-moore.com> wrote:

> > Ideally we avoid multiple capable(CAP_SYS_ADMIN) calls by only doing it
> > once and saving the return value. capable() calls aren't that cheap.
>
> Agreed.  The capability check doesn't do any subject/object
> comparisons so calling it for each mount is overkill.  However, I
> would think we would want the LSM hook called from inside the loop as
> that could involve a subject (@current) and object (individual mount
> point) comparison.

The security_sb_statfs() one?

Should a single failure result in a complete failure?

Why is it not enough to check permission on the parent?

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [RFC PATCH 3/3] add listmnt(2) syscall
  2023-09-28 10:07           ` Miklos Szeredi
@ 2023-10-04 19:22             ` Paul Moore
  0 siblings, 0 replies; 76+ messages in thread
From: Paul Moore @ 2023-10-04 19:22 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Christian Brauner, Miklos Szeredi, Matthew House, linux-fsdevel,
	linux-kernel, linux-api, linux-man, linux-security-module,
	Karel Zak, Ian Kent, David Howells, Linus Torvalds, Al Viro,
	Christian Brauner, Amir Goldstein

On Thu, Sep 28, 2023 at 6:07 AM Miklos Szeredi <miklos@szeredi.hu> wrote:
> On Tue, 19 Sept 2023 at 18:48, Paul Moore <paul@paul-moore.com> wrote:
>
> > > Ideally we avoid multiple capable(CAP_SYS_ADMIN) calls by only doing it
> > > once and saving the return value. capable() calls aren't that cheap.
> >
> > Agreed.  The capability check doesn't do any subject/object
> > comparisons so calling it for each mount is overkill.  However, I
> > would think we would want the LSM hook called from inside the loop as
> > that could involve a subject (@current) and object (individual mount
> > point) comparison.

My apologies, I was traveling and while I was quickly checking my
email each day this message was lost.  I'm very sorry for the delay in
responding.

> The security_sb_statfs() one?

Yes.

> Should a single failure result in a complete failure?

My opinion is that it should only result in the failure of
listing/stat'ing that particular mount; if other mounts are allowed to
be queried then the operation should be allowed to continue.

> Why is it not enough to check permission on the parent?

Each mount has the potential to have a unique security identity in the
context of the LSM, and since the LSM access controls are generally
intended to support a subject-verb-object access control policy we
need to examine the subject and object together (the subject here is
@current, the object is the individual mount, and the verb is the
stat/list operation).

Does that make sense?

I'm looking at the v3 patchset right now, I've got some small nits,
but I'll add those to that thread.

-- 
paul-moore.com

^ permalink raw reply	[flat|nested] 76+ messages in thread

end of thread, other threads:[~2023-10-04 19:23 UTC | newest]

Thread overview: 76+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-09-13 15:22 [RFC PATCH 0/3] quering mount attributes Miklos Szeredi
2023-09-13 15:22 ` [RFC PATCH 1/3] add unique mount ID Miklos Szeredi
2023-09-14  9:03   ` Christian Brauner
2023-09-14  9:30     ` Miklos Szeredi
2023-09-14  9:36       ` Christian Brauner
2023-09-14  9:43         ` Miklos Szeredi
2023-09-14 10:06           ` Christian Brauner
2023-09-15  1:31           ` Ian Kent
2023-09-13 15:22 ` [RFC PATCH 2/3] add statmnt(2) syscall Miklos Szeredi
2023-09-14  6:11   ` Amir Goldstein
2023-09-15  1:05     ` Ian Kent
2023-09-14  9:27   ` Christian Brauner
2023-09-14 10:13     ` Miklos Szeredi
2023-09-14 15:26       ` Christian Brauner
2023-09-15  8:56         ` Miklos Szeredi
2023-09-18 13:51           ` Christian Brauner
2023-09-18 14:14             ` Miklos Szeredi
2023-09-18 14:24               ` Christian Brauner
2023-09-18 14:32                 ` Miklos Szeredi
2023-09-18 14:40                   ` Christian Brauner
2023-09-18 14:51                     ` Miklos Szeredi
2023-09-18 15:22                       ` Christian Brauner
2023-09-18 15:39                         ` Miklos Szeredi
2023-09-19  0:37                           ` Matthew House
2023-09-19  8:02                             ` Miklos Szeredi
2023-09-19  9:07                               ` Christian Brauner
2023-09-19 10:51                                 ` Miklos Szeredi
2023-09-19 12:41                                   ` Christian Brauner
2023-09-19 12:59                                     ` Miklos Szeredi
2023-09-19 13:18                                       ` Christian Brauner
2023-09-19 21:28                               ` Matthew House
2023-09-20  9:42                                 ` Miklos Szeredi
2023-09-20 13:26                                   ` Matthew House
2023-09-21  7:34                                     ` Miklos Szeredi
2023-09-26 13:48               ` Florian Weimer
2023-09-26 14:06                 ` Miklos Szeredi
2023-09-26 14:19                   ` Florian Weimer
2023-09-26 14:33                     ` Miklos Szeredi
2023-09-26 14:39                       ` Florian Weimer
2023-09-26 14:36                     ` Christian Brauner
2023-09-26 14:13                 ` Christian Brauner
2023-09-18 20:58             ` Andreas Dilger
2023-09-19 12:50               ` Christian Brauner
2023-09-20  0:33                 ` Dave Chinner
2023-09-18 14:29         ` Jeff Layton
2023-09-18 14:35           ` Christian Brauner
2023-09-20  9:43           ` David Laight
2023-09-14 20:39   ` Paul Moore
2023-09-15  9:10     ` Miklos Szeredi
2023-09-17 18:18   ` Sargun Dhillon
2023-09-17 23:36     ` Ian Kent
2023-09-18 13:05       ` Christian Brauner
2023-09-25 12:57   ` Arnd Bergmann
2023-09-25 13:04     ` Christian Brauner
2023-09-25 13:13       ` Miklos Szeredi
2023-09-25 13:19         ` Christian Brauner
2023-09-25 13:20           ` Miklos Szeredi
2023-09-25 15:46             ` Arnd Bergmann
2023-09-26 10:05               ` Christian Brauner
2023-09-27  8:46             ` Miklos Szeredi
2023-09-13 15:22 ` [RFC PATCH 3/3] add listmnt(2) syscall Miklos Szeredi
2023-09-14  6:00   ` Amir Goldstein
2023-09-14  8:50     ` Miklos Szeredi
2023-09-14 10:01       ` Christian Brauner
2023-09-15  1:00     ` Ian Kent
2023-09-17  0:54   ` Matthew House
2023-09-17 14:32     ` Miklos Szeredi
2023-09-18 13:15       ` Christian Brauner
2023-09-19 16:47         ` Paul Moore
2023-09-28 10:07           ` Miklos Szeredi
2023-10-04 19:22             ` Paul Moore
2023-09-14  6:47 ` [RFC PATCH 0/3] quering mount attributes Amir Goldstein
2023-09-15  1:20   ` Ian Kent
2023-09-15  3:06     ` Amir Goldstein
2023-09-16  2:04       ` Ian Kent
2023-09-16  2:19       ` Ian Kent

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).