All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] mnt: allow to add a mount into an existing group
@ 2017-04-28  5:18 ` Andrei Vagin
  0 siblings, 0 replies; 13+ messages in thread
From: Andrei Vagin @ 2017-04-28  5:18 UTC (permalink / raw)
  To: Alexander Viro, Eric W . Biederman
  Cc: linux-fsdevel, linux-kernel, linux-api, criu, Andrei Vagin

Now a shared group can be only inherited from a source mount.
This patch adds an ability to add a mount into an existing shared
group.

mount(source, target, NULL, MS_SET_GROUP, NULL)

mount() with the MS_SET_GROUP flag adds the "target" mount into a group
of the "source" mount. The calling process has to have the CAP_SYS_ADMIN
capability in namespaces of these mounts. The source and the target
mounts have to have the same super block.

This new functionality together with "mnt: Tuck mounts under others
instead of creating shadow/side mounts." allows CRIU to dump and restore
any set of mount namespaces.

Currently we have a lot of issues about dumping and restoring mount
namespaces. The bigest problem is that we can't construct mount trees
directly due to several reasons:
* groups can't be set, they can be only inherited
* file systems has to be mounted from the specified user namespaces
* the mount() syscall doesn't just create one mount -- the mount is
  also propagated to all members of a parent group
* umount() doesn't detach mounts from all members of a group
  (mounts with children are not umounted)
* mounts are propagated underneath of existing mounts
* mount() doesn't allow to make bind-mounts between two namespaces
* processes can have opened file descriptors to overmounted files

All these operations are non-trivial, making the task of restoring
a mount namespace practically unsolvable for reasonable time. The
proposed change allows to restore a mount namespace in a direct
manner, without any super complex logic.

Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrei Vagin <avagin@openvz.org>
---
 fs/namespace.c          | 66 ++++++++++++++++++++++++++++++++++++++++++++++---
 include/uapi/linux/fs.h |  6 +++++
 2 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index cc1375ef..3bf0cd2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2355,6 +2355,57 @@ static inline int tree_contains_unbindable(struct mount *mnt)
 	return 0;
 }
 
+static int do_set_group(struct path *path, const char *sibling_name)
+{
+	struct mount *sibling, *mnt;
+	struct path sibling_path;
+	int err;
+
+	if (!sibling_name || !*sibling_name)
+		return -EINVAL;
+
+	err = kern_path(sibling_name, LOOKUP_FOLLOW, &sibling_path);
+	if (err)
+		return err;
+
+	sibling = real_mount(sibling_path.mnt);
+	mnt = real_mount(path->mnt);
+
+	namespace_lock();
+
+	err = -EPERM;
+	if (!sibling->mnt_ns ||
+	    !ns_capable(sibling->mnt_ns->user_ns, CAP_SYS_ADMIN))
+		goto out_unlock;
+
+	err = -EINVAL;
+	if (sibling->mnt.mnt_sb != mnt->mnt.mnt_sb)
+		goto out_unlock;
+
+	if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt))
+		goto out_unlock;
+
+	if (IS_MNT_SLAVE(sibling)) {
+		struct mount *m = sibling->mnt_master;
+
+		list_add(&mnt->mnt_slave, &m->mnt_slave_list);
+		mnt->mnt_master = m;
+	}
+
+	if (IS_MNT_SHARED(sibling)) {
+		mnt->mnt_group_id = sibling->mnt_group_id;
+		list_add(&mnt->mnt_share, &sibling->mnt_share);
+		set_mnt_shared(mnt);
+	}
+
+	err = 0;
+out_unlock:
+	namespace_unlock();
+
+	path_put(&sibling_path);
+	return err;
+}
+
 static int do_move_mount(struct path *path, const char *old_name)
 {
 	struct path old_path, parent_path;
@@ -2769,6 +2820,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
 	struct path path;
 	int retval = 0;
 	int mnt_flags = 0;
+	unsigned long cmd;
 
 	/* Discard magic */
 	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
@@ -2820,19 +2872,25 @@ long do_mount(const char *dev_name, const char __user *dir_name,
 		mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
 	}
 
+	cmd = flags & (MS_REMOUNT | MS_BIND |
+		       MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE |
+		       MS_MOVE | MS_SET_GROUP);
+
 	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
 		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
 		   MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT);
 
-	if (flags & MS_REMOUNT)
+	if (cmd & MS_REMOUNT)
 		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
 				    data_page);
-	else if (flags & MS_BIND)
+	else if (cmd & MS_BIND)
 		retval = do_loopback(&path, dev_name, flags & MS_REC);
-	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
+	else if (cmd & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
 		retval = do_change_type(&path, flags);
-	else if (flags & MS_MOVE)
+	else if (cmd & MS_MOVE)
 		retval = do_move_mount(&path, dev_name);
+	else if (cmd & MS_SET_GROUP)
+		retval = do_set_group(&path, dev_name);
 	else
 		retval = do_new_mount(&path, type_page, flags, mnt_flags,
 				      dev_name, data_page);
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 048a85e..33423aa 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -131,6 +131,12 @@ struct inodes_stat_t {
 #define MS_STRICTATIME	(1<<24) /* Always perform atime updates */
 #define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
 
+/*
+ * Here are commands and flags. Commands are handled in do_mount()
+ * and can intersect with kernel internal flags.
+ */
+#define MS_SET_GROUP	(1<<26) /* Add a mount into a shared group */
+
 /* These sb flags are internal to the kernel */
 #define MS_SUBMOUNT     (1<<26)
 #define MS_NOREMOTELOCK	(1<<27)
-- 
2.9.3

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH] mnt: allow to add a mount into an existing group
@ 2017-04-28  5:18 ` Andrei Vagin
  0 siblings, 0 replies; 13+ messages in thread
From: Andrei Vagin @ 2017-04-28  5:18 UTC (permalink / raw)
  To: Alexander Viro, Eric W . Biederman
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, criu-GEFAQzZX7r8dnm+yROfE0A,
	Andrei Vagin

Now a shared group can be only inherited from a source mount.
This patch adds an ability to add a mount into an existing shared
group.

mount(source, target, NULL, MS_SET_GROUP, NULL)

mount() with the MS_SET_GROUP flag adds the "target" mount into a group
of the "source" mount. The calling process has to have the CAP_SYS_ADMIN
capability in namespaces of these mounts. The source and the target
mounts have to have the same super block.

This new functionality together with "mnt: Tuck mounts under others
instead of creating shadow/side mounts." allows CRIU to dump and restore
any set of mount namespaces.

Currently we have a lot of issues about dumping and restoring mount
namespaces. The bigest problem is that we can't construct mount trees
directly due to several reasons:
* groups can't be set, they can be only inherited
* file systems has to be mounted from the specified user namespaces
* the mount() syscall doesn't just create one mount -- the mount is
  also propagated to all members of a parent group
* umount() doesn't detach mounts from all members of a group
  (mounts with children are not umounted)
* mounts are propagated underneath of existing mounts
* mount() doesn't allow to make bind-mounts between two namespaces
* processes can have opened file descriptors to overmounted files

All these operations are non-trivial, making the task of restoring
a mount namespace practically unsolvable for reasonable time. The
proposed change allows to restore a mount namespace in a direct
manner, without any super complex logic.

Cc: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
Cc: Alexander Viro <viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
Signed-off-by: Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
---
 fs/namespace.c          | 66 ++++++++++++++++++++++++++++++++++++++++++++++---
 include/uapi/linux/fs.h |  6 +++++
 2 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index cc1375ef..3bf0cd2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2355,6 +2355,57 @@ static inline int tree_contains_unbindable(struct mount *mnt)
 	return 0;
 }
 
+static int do_set_group(struct path *path, const char *sibling_name)
+{
+	struct mount *sibling, *mnt;
+	struct path sibling_path;
+	int err;
+
+	if (!sibling_name || !*sibling_name)
+		return -EINVAL;
+
+	err = kern_path(sibling_name, LOOKUP_FOLLOW, &sibling_path);
+	if (err)
+		return err;
+
+	sibling = real_mount(sibling_path.mnt);
+	mnt = real_mount(path->mnt);
+
+	namespace_lock();
+
+	err = -EPERM;
+	if (!sibling->mnt_ns ||
+	    !ns_capable(sibling->mnt_ns->user_ns, CAP_SYS_ADMIN))
+		goto out_unlock;
+
+	err = -EINVAL;
+	if (sibling->mnt.mnt_sb != mnt->mnt.mnt_sb)
+		goto out_unlock;
+
+	if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt))
+		goto out_unlock;
+
+	if (IS_MNT_SLAVE(sibling)) {
+		struct mount *m = sibling->mnt_master;
+
+		list_add(&mnt->mnt_slave, &m->mnt_slave_list);
+		mnt->mnt_master = m;
+	}
+
+	if (IS_MNT_SHARED(sibling)) {
+		mnt->mnt_group_id = sibling->mnt_group_id;
+		list_add(&mnt->mnt_share, &sibling->mnt_share);
+		set_mnt_shared(mnt);
+	}
+
+	err = 0;
+out_unlock:
+	namespace_unlock();
+
+	path_put(&sibling_path);
+	return err;
+}
+
 static int do_move_mount(struct path *path, const char *old_name)
 {
 	struct path old_path, parent_path;
@@ -2769,6 +2820,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
 	struct path path;
 	int retval = 0;
 	int mnt_flags = 0;
+	unsigned long cmd;
 
 	/* Discard magic */
 	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
@@ -2820,19 +2872,25 @@ long do_mount(const char *dev_name, const char __user *dir_name,
 		mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
 	}
 
+	cmd = flags & (MS_REMOUNT | MS_BIND |
+		       MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE |
+		       MS_MOVE | MS_SET_GROUP);
+
 	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
 		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
 		   MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT);
 
-	if (flags & MS_REMOUNT)
+	if (cmd & MS_REMOUNT)
 		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
 				    data_page);
-	else if (flags & MS_BIND)
+	else if (cmd & MS_BIND)
 		retval = do_loopback(&path, dev_name, flags & MS_REC);
-	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
+	else if (cmd & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
 		retval = do_change_type(&path, flags);
-	else if (flags & MS_MOVE)
+	else if (cmd & MS_MOVE)
 		retval = do_move_mount(&path, dev_name);
+	else if (cmd & MS_SET_GROUP)
+		retval = do_set_group(&path, dev_name);
 	else
 		retval = do_new_mount(&path, type_page, flags, mnt_flags,
 				      dev_name, data_page);
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 048a85e..33423aa 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -131,6 +131,12 @@ struct inodes_stat_t {
 #define MS_STRICTATIME	(1<<24) /* Always perform atime updates */
 #define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
 
+/*
+ * Here are commands and flags. Commands are handled in do_mount()
+ * and can intersect with kernel internal flags.
+ */
+#define MS_SET_GROUP	(1<<26) /* Add a mount into a shared group */
+
 /* These sb flags are internal to the kernel */
 #define MS_SUBMOUNT     (1<<26)
 #define MS_NOREMOTELOCK	(1<<27)
-- 
2.9.3

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH] mnt: allow to add a mount into an existing group
@ 2017-05-09 17:36   ` Andrey Vagin
  0 siblings, 0 replies; 13+ messages in thread
From: Andrey Vagin @ 2017-05-09 17:36 UTC (permalink / raw)
  To: Eric W . Biederman
  Cc: linux-fsdevel, LKML, Linux API, criu, Andrei Vagin, Alexander Viro

On Tue, Jan 24, 2017 at 02:03:23PM +1300, Eric W. Biederman wrote:
> Andrei Vagin <avagin@openvz.org> writes:
>
> > Now a shared group can be only inherited from a source mount.
> > This patch adds an ability to add a mount into an existing shared
> > group.
>
> This sounds like a lot of the discussion on bind mounts accross
> namespaces.  I am going to stay out of this for a bit until
> we resolve my latest patch.

Hi Eric,

Your patches about shadow/side mounts were committed, can we resume
the discussion about this patch?

As for security, a mount can be added to a shared group, only if a
caller has CAP_SYS_ADMIN in namespaces of both mounts, so an
unprivileged user can't create a shared mount with a parent mount
namespace. If a user has CAP_SYS_ADMIN, I don't see a reason to
restrict him to create shared mounts between namespaces, even if they
are in different user-namespaces.

Now I look at volume drivers in container services (like docker and
kubernetes) and I think this functionality can be useful for them too.
Now it is impossible to run a plugin driver in unprivileged containers
(with sub-userns), because a container has to have a shared mount with
a mount namespace where the service is running. The idea of these
plugins is that a service requests a volume mount from a plugin and
then starts a container with this volume, so the service need to have
a way to get a mount from a service.

>
> Eric


On Thu, Apr 27, 2017 at 10:18 PM, Andrei Vagin <avagin@openvz.org> wrote:
> Now a shared group can be only inherited from a source mount.
> This patch adds an ability to add a mount into an existing shared
> group.
>
> mount(source, target, NULL, MS_SET_GROUP, NULL)
>
> mount() with the MS_SET_GROUP flag adds the "target" mount into a group
> of the "source" mount. The calling process has to have the CAP_SYS_ADMIN
> capability in namespaces of these mounts. The source and the target
> mounts have to have the same super block.
>
> This new functionality together with "mnt: Tuck mounts under others
> instead of creating shadow/side mounts." allows CRIU to dump and restore
> any set of mount namespaces.
>
> Currently we have a lot of issues about dumping and restoring mount
> namespaces. The bigest problem is that we can't construct mount trees
> directly due to several reasons:
> * groups can't be set, they can be only inherited
> * file systems has to be mounted from the specified user namespaces
> * the mount() syscall doesn't just create one mount -- the mount is
>   also propagated to all members of a parent group
> * umount() doesn't detach mounts from all members of a group
>   (mounts with children are not umounted)
> * mounts are propagated underneath of existing mounts
> * mount() doesn't allow to make bind-mounts between two namespaces
> * processes can have opened file descriptors to overmounted files
>
> All these operations are non-trivial, making the task of restoring
> a mount namespace practically unsolvable for reasonable time. The
> proposed change allows to restore a mount namespace in a direct
> manner, without any super complex logic.
>
> Cc: Eric W. Biederman <ebiederm@xmission.com>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> Signed-off-by: Andrei Vagin <avagin@openvz.org>
> ---
>  fs/namespace.c          | 66 ++++++++++++++++++++++++++++++++++++++++++++++---
>  include/uapi/linux/fs.h |  6 +++++
>  2 files changed, 68 insertions(+), 4 deletions(-)
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index cc1375ef..3bf0cd2 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -2355,6 +2355,57 @@ static inline int tree_contains_unbindable(struct mount *mnt)
>         return 0;
>  }
>
> +static int do_set_group(struct path *path, const char *sibling_name)
> +{
> +       struct mount *sibling, *mnt;
> +       struct path sibling_path;
> +       int err;
> +
> +       if (!sibling_name || !*sibling_name)
> +               return -EINVAL;
> +
> +       err = kern_path(sibling_name, LOOKUP_FOLLOW, &sibling_path);
> +       if (err)
> +               return err;
> +
> +       sibling = real_mount(sibling_path.mnt);
> +       mnt = real_mount(path->mnt);
> +
> +       namespace_lock();
> +
> +       err = -EPERM;
> +       if (!sibling->mnt_ns ||
> +           !ns_capable(sibling->mnt_ns->user_ns, CAP_SYS_ADMIN))
> +               goto out_unlock;
> +
> +       err = -EINVAL;
> +       if (sibling->mnt.mnt_sb != mnt->mnt.mnt_sb)
> +               goto out_unlock;
> +
> +       if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt))
> +               goto out_unlock;
> +
> +       if (IS_MNT_SLAVE(sibling)) {
> +               struct mount *m = sibling->mnt_master;
> +
> +               list_add(&mnt->mnt_slave, &m->mnt_slave_list);
> +               mnt->mnt_master = m;
> +       }
> +
> +       if (IS_MNT_SHARED(sibling)) {
> +               mnt->mnt_group_id = sibling->mnt_group_id;
> +               list_add(&mnt->mnt_share, &sibling->mnt_share);
> +               set_mnt_shared(mnt);
> +       }
> +
> +       err = 0;
> +out_unlock:
> +       namespace_unlock();
> +
> +       path_put(&sibling_path);
> +       return err;
> +}
> +
>  static int do_move_mount(struct path *path, const char *old_name)
>  {
>         struct path old_path, parent_path;
> @@ -2769,6 +2820,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
>         struct path path;
>         int retval = 0;
>         int mnt_flags = 0;
> +       unsigned long cmd;
>
>         /* Discard magic */
>         if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
> @@ -2820,19 +2872,25 @@ long do_mount(const char *dev_name, const char __user *dir_name,
>                 mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
>         }
>
> +       cmd = flags & (MS_REMOUNT | MS_BIND |
> +                      MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE |
> +                      MS_MOVE | MS_SET_GROUP);
> +
>         flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
>                    MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
>                    MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT);
>
> -       if (flags & MS_REMOUNT)
> +       if (cmd & MS_REMOUNT)
>                 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
>                                     data_page);
> -       else if (flags & MS_BIND)
> +       else if (cmd & MS_BIND)
>                 retval = do_loopback(&path, dev_name, flags & MS_REC);
> -       else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
> +       else if (cmd & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
>                 retval = do_change_type(&path, flags);
> -       else if (flags & MS_MOVE)
> +       else if (cmd & MS_MOVE)
>                 retval = do_move_mount(&path, dev_name);
> +       else if (cmd & MS_SET_GROUP)
> +               retval = do_set_group(&path, dev_name);
>         else
>                 retval = do_new_mount(&path, type_page, flags, mnt_flags,
>                                       dev_name, data_page);
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index 048a85e..33423aa 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -131,6 +131,12 @@ struct inodes_stat_t {
>  #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
>  #define MS_LAZYTIME    (1<<25) /* Update the on-disk [acm]times lazily */
>
> +/*
> + * Here are commands and flags. Commands are handled in do_mount()
> + * and can intersect with kernel internal flags.
> + */
> +#define MS_SET_GROUP   (1<<26) /* Add a mount into a shared group */
> +
>  /* These sb flags are internal to the kernel */
>  #define MS_SUBMOUNT     (1<<26)
>  #define MS_NOREMOTELOCK        (1<<27)
> --
> 2.9.3
>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] mnt: allow to add a mount into an existing group
@ 2017-05-09 17:36   ` Andrey Vagin
  0 siblings, 0 replies; 13+ messages in thread
From: Andrey Vagin @ 2017-05-09 17:36 UTC (permalink / raw)
  To: Eric W . Biederman
  Cc: linux-fsdevel, LKML, Linux API, criu-GEFAQzZX7r8dnm+yROfE0A,
	Andrei Vagin, Alexander Viro

On Tue, Jan 24, 2017 at 02:03:23PM +1300, Eric W. Biederman wrote:
> Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org> writes:
>
> > Now a shared group can be only inherited from a source mount.
> > This patch adds an ability to add a mount into an existing shared
> > group.
>
> This sounds like a lot of the discussion on bind mounts accross
> namespaces.  I am going to stay out of this for a bit until
> we resolve my latest patch.

Hi Eric,

Your patches about shadow/side mounts were committed, can we resume
the discussion about this patch?

As for security, a mount can be added to a shared group, only if a
caller has CAP_SYS_ADMIN in namespaces of both mounts, so an
unprivileged user can't create a shared mount with a parent mount
namespace. If a user has CAP_SYS_ADMIN, I don't see a reason to
restrict him to create shared mounts between namespaces, even if they
are in different user-namespaces.

Now I look at volume drivers in container services (like docker and
kubernetes) and I think this functionality can be useful for them too.
Now it is impossible to run a plugin driver in unprivileged containers
(with sub-userns), because a container has to have a shared mount with
a mount namespace where the service is running. The idea of these
plugins is that a service requests a volume mount from a plugin and
then starts a container with this volume, so the service need to have
a way to get a mount from a service.

>
> Eric


On Thu, Apr 27, 2017 at 10:18 PM, Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org> wrote:
> Now a shared group can be only inherited from a source mount.
> This patch adds an ability to add a mount into an existing shared
> group.
>
> mount(source, target, NULL, MS_SET_GROUP, NULL)
>
> mount() with the MS_SET_GROUP flag adds the "target" mount into a group
> of the "source" mount. The calling process has to have the CAP_SYS_ADMIN
> capability in namespaces of these mounts. The source and the target
> mounts have to have the same super block.
>
> This new functionality together with "mnt: Tuck mounts under others
> instead of creating shadow/side mounts." allows CRIU to dump and restore
> any set of mount namespaces.
>
> Currently we have a lot of issues about dumping and restoring mount
> namespaces. The bigest problem is that we can't construct mount trees
> directly due to several reasons:
> * groups can't be set, they can be only inherited
> * file systems has to be mounted from the specified user namespaces
> * the mount() syscall doesn't just create one mount -- the mount is
>   also propagated to all members of a parent group
> * umount() doesn't detach mounts from all members of a group
>   (mounts with children are not umounted)
> * mounts are propagated underneath of existing mounts
> * mount() doesn't allow to make bind-mounts between two namespaces
> * processes can have opened file descriptors to overmounted files
>
> All these operations are non-trivial, making the task of restoring
> a mount namespace practically unsolvable for reasonable time. The
> proposed change allows to restore a mount namespace in a direct
> manner, without any super complex logic.
>
> Cc: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
> Cc: Alexander Viro <viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
> Signed-off-by: Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
> ---
>  fs/namespace.c          | 66 ++++++++++++++++++++++++++++++++++++++++++++++---
>  include/uapi/linux/fs.h |  6 +++++
>  2 files changed, 68 insertions(+), 4 deletions(-)
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index cc1375ef..3bf0cd2 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -2355,6 +2355,57 @@ static inline int tree_contains_unbindable(struct mount *mnt)
>         return 0;
>  }
>
> +static int do_set_group(struct path *path, const char *sibling_name)
> +{
> +       struct mount *sibling, *mnt;
> +       struct path sibling_path;
> +       int err;
> +
> +       if (!sibling_name || !*sibling_name)
> +               return -EINVAL;
> +
> +       err = kern_path(sibling_name, LOOKUP_FOLLOW, &sibling_path);
> +       if (err)
> +               return err;
> +
> +       sibling = real_mount(sibling_path.mnt);
> +       mnt = real_mount(path->mnt);
> +
> +       namespace_lock();
> +
> +       err = -EPERM;
> +       if (!sibling->mnt_ns ||
> +           !ns_capable(sibling->mnt_ns->user_ns, CAP_SYS_ADMIN))
> +               goto out_unlock;
> +
> +       err = -EINVAL;
> +       if (sibling->mnt.mnt_sb != mnt->mnt.mnt_sb)
> +               goto out_unlock;
> +
> +       if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt))
> +               goto out_unlock;
> +
> +       if (IS_MNT_SLAVE(sibling)) {
> +               struct mount *m = sibling->mnt_master;
> +
> +               list_add(&mnt->mnt_slave, &m->mnt_slave_list);
> +               mnt->mnt_master = m;
> +       }
> +
> +       if (IS_MNT_SHARED(sibling)) {
> +               mnt->mnt_group_id = sibling->mnt_group_id;
> +               list_add(&mnt->mnt_share, &sibling->mnt_share);
> +               set_mnt_shared(mnt);
> +       }
> +
> +       err = 0;
> +out_unlock:
> +       namespace_unlock();
> +
> +       path_put(&sibling_path);
> +       return err;
> +}
> +
>  static int do_move_mount(struct path *path, const char *old_name)
>  {
>         struct path old_path, parent_path;
> @@ -2769,6 +2820,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
>         struct path path;
>         int retval = 0;
>         int mnt_flags = 0;
> +       unsigned long cmd;
>
>         /* Discard magic */
>         if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
> @@ -2820,19 +2872,25 @@ long do_mount(const char *dev_name, const char __user *dir_name,
>                 mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
>         }
>
> +       cmd = flags & (MS_REMOUNT | MS_BIND |
> +                      MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE |
> +                      MS_MOVE | MS_SET_GROUP);
> +
>         flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
>                    MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
>                    MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT);
>
> -       if (flags & MS_REMOUNT)
> +       if (cmd & MS_REMOUNT)
>                 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
>                                     data_page);
> -       else if (flags & MS_BIND)
> +       else if (cmd & MS_BIND)
>                 retval = do_loopback(&path, dev_name, flags & MS_REC);
> -       else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
> +       else if (cmd & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
>                 retval = do_change_type(&path, flags);
> -       else if (flags & MS_MOVE)
> +       else if (cmd & MS_MOVE)
>                 retval = do_move_mount(&path, dev_name);
> +       else if (cmd & MS_SET_GROUP)
> +               retval = do_set_group(&path, dev_name);
>         else
>                 retval = do_new_mount(&path, type_page, flags, mnt_flags,
>                                       dev_name, data_page);
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index 048a85e..33423aa 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -131,6 +131,12 @@ struct inodes_stat_t {
>  #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
>  #define MS_LAZYTIME    (1<<25) /* Update the on-disk [acm]times lazily */
>
> +/*
> + * Here are commands and flags. Commands are handled in do_mount()
> + * and can intersect with kernel internal flags.
> + */
> +#define MS_SET_GROUP   (1<<26) /* Add a mount into a shared group */
> +
>  /* These sb flags are internal to the kernel */
>  #define MS_SUBMOUNT     (1<<26)
>  #define MS_NOREMOTELOCK        (1<<27)
> --
> 2.9.3
>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] mnt: allow to add a mount into an existing group
@ 2017-05-10  0:42     ` Eric W. Biederman
  0 siblings, 0 replies; 13+ messages in thread
From: Eric W. Biederman @ 2017-05-10  0:42 UTC (permalink / raw)
  To: Andrey Vagin; +Cc: linux-fsdevel, LKML, Linux API, criu, Alexander Viro

Andrey Vagin <avagin@openvz.org> writes:

> On Tue, Jan 24, 2017 at 02:03:23PM +1300, Eric W. Biederman wrote:
>> Andrei Vagin <avagin@openvz.org> writes:
>>
>> > Now a shared group can be only inherited from a source mount.
>> > This patch adds an ability to add a mount into an existing shared
>> > group.
>>
>> This sounds like a lot of the discussion on bind mounts accross
>> namespaces.  I am going to stay out of this for a bit until
>> we resolve my latest patch.
>
> Hi Eric,
>
> Your patches about shadow/side mounts were committed, can we resume
> the discussion about this patch?

We can.

Although personally I would rather get back to figuring out how
we can reduce the horrible time complexity of the worst case
for umount -l.

> As for security, a mount can be added to a shared group, only if a
> caller has CAP_SYS_ADMIN in namespaces of both mounts, so an
> unprivileged user can't create a shared mount with a parent mount
> namespace. If a user has CAP_SYS_ADMIN, I don't see a reason to
> restrict him to create shared mounts between namespaces, even if they
> are in different user-namespaces.

Can they create loops in mount propagation trees that we can not create
today?  It feels like that would be very simple to do with an interface
like this.

A loop in a mount propagation tree would be an absolute disaster.

I remember Al Viro had some very firm ideas about bind mounts from
foreign namespaces.  That I have never take the time to understand.
I suspect whatever objections he had will apply in this case.  Or else
this code might be made unnecessary by allowing bind mounts between
mount namespaces.


> Now I look at volume drivers in container services (like docker and
> kubernetes) and I think this functionality can be useful for them too.
> Now it is impossible to run a plugin driver in unprivileged containers
> (with sub-userns), because a container has to have a shared mount with
> a mount namespace where the service is running. The idea of these
> plugins is that a service requests a volume mount from a plugin and
> then starts a container with this volume, so the service need to have
> a way to get a mount from a service.

Interesting.  I personally think the checkpoint/restart use case
is more compelling.

>> Eric
>
>
> On Thu, Apr 27, 2017 at 10:18 PM, Andrei Vagin <avagin@openvz.org> wrote:
>> Now a shared group can be only inherited from a source mount.
>> This patch adds an ability to add a mount into an existing shared
>> group.
>>
>> mount(source, target, NULL, MS_SET_GROUP, NULL)
>>
>> mount() with the MS_SET_GROUP flag adds the "target" mount into a group
>> of the "source" mount. The calling process has to have the CAP_SYS_ADMIN
>> capability in namespaces of these mounts. The source and the target
>> mounts have to have the same super block.
>>
>> This new functionality together with "mnt: Tuck mounts under others
>> instead of creating shadow/side mounts." allows CRIU to dump and restore
>> any set of mount namespaces.
>>
>> Currently we have a lot of issues about dumping and restoring mount
>> namespaces. The bigest problem is that we can't construct mount trees
>> directly due to several reasons:
>> * groups can't be set, they can be only inherited
>> * file systems has to be mounted from the specified user namespaces
>> * the mount() syscall doesn't just create one mount -- the mount is
>>   also propagated to all members of a parent group
>> * umount() doesn't detach mounts from all members of a group
>>   (mounts with children are not umounted)
>> * mounts are propagated underneath of existing mounts
>> * mount() doesn't allow to make bind-mounts between two namespaces
>> * processes can have opened file descriptors to overmounted files
>>
>> All these operations are non-trivial, making the task of restoring
>> a mount namespace practically unsolvable for reasonable time. The
>> proposed change allows to restore a mount namespace in a direct
>> manner, without any super complex logic.
>>
>> Cc: Eric W. Biederman <ebiederm@xmission.com>
>> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
>> Signed-off-by: Andrei Vagin <avagin@openvz.org>
>> ---
>>  fs/namespace.c          | 66 ++++++++++++++++++++++++++++++++++++++++++++++---
>>  include/uapi/linux/fs.h |  6 +++++
>>  2 files changed, 68 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index cc1375ef..3bf0cd2 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -2355,6 +2355,57 @@ static inline int tree_contains_unbindable(struct mount *mnt)
>>         return 0;
>>  }
>>
>> +static int do_set_group(struct path *path, const char *sibling_name)
>> +{
>> +       struct mount *sibling, *mnt;
>> +       struct path sibling_path;
>> +       int err;
>> +
>> +       if (!sibling_name || !*sibling_name)
>> +               return -EINVAL;
>> +
>> +       err = kern_path(sibling_name, LOOKUP_FOLLOW, &sibling_path);
>> +       if (err)
>> +               return err;
>> +
>> +       sibling = real_mount(sibling_path.mnt);
>> +       mnt = real_mount(path->mnt);
>> +
>> +       namespace_lock();
>> +
>> +       err = -EPERM;
>> +       if (!sibling->mnt_ns ||
>> +           !ns_capable(sibling->mnt_ns->user_ns, CAP_SYS_ADMIN))
>> +               goto out_unlock;
>> +
>> +       err = -EINVAL;
>> +       if (sibling->mnt.mnt_sb != mnt->mnt.mnt_sb)
>> +               goto out_unlock;
>> +
>> +       if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt))
>> +               goto out_unlock;
>> +
>> +       if (IS_MNT_SLAVE(sibling)) {
>> +               struct mount *m = sibling->mnt_master;
>> +
>> +               list_add(&mnt->mnt_slave, &m->mnt_slave_list);
>> +               mnt->mnt_master = m;
>> +       }
>> +
>> +       if (IS_MNT_SHARED(sibling)) {
>> +               mnt->mnt_group_id = sibling->mnt_group_id;
>> +               list_add(&mnt->mnt_share, &sibling->mnt_share);
>> +               set_mnt_shared(mnt);
>> +       }
>> +
>> +       err = 0;
>> +out_unlock:
>> +       namespace_unlock();
>> +
>> +       path_put(&sibling_path);
>> +       return err;
>> +}
>> +
>>  static int do_move_mount(struct path *path, const char *old_name)
>>  {
>>         struct path old_path, parent_path;
>> @@ -2769,6 +2820,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
>>         struct path path;
>>         int retval = 0;
>>         int mnt_flags = 0;
>> +       unsigned long cmd;
>>
>>         /* Discard magic */
>>         if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
>> @@ -2820,19 +2872,25 @@ long do_mount(const char *dev_name, const char __user *dir_name,
>>                 mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
>>         }
>>
>> +       cmd = flags & (MS_REMOUNT | MS_BIND |
>> +                      MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE |
>> +                      MS_MOVE | MS_SET_GROUP);
>> +
>>         flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
>>                    MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
>>                    MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT);
>>
>> -       if (flags & MS_REMOUNT)
>> +       if (cmd & MS_REMOUNT)
>>                 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
>>                                     data_page);
>> -       else if (flags & MS_BIND)
>> +       else if (cmd & MS_BIND)
>>                 retval = do_loopback(&path, dev_name, flags & MS_REC);
>> -       else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
>> +       else if (cmd & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
>>                 retval = do_change_type(&path, flags);
>> -       else if (flags & MS_MOVE)
>> +       else if (cmd & MS_MOVE)
>>                 retval = do_move_mount(&path, dev_name);
>> +       else if (cmd & MS_SET_GROUP)
>> +               retval = do_set_group(&path, dev_name);
>>         else
>>                 retval = do_new_mount(&path, type_page, flags, mnt_flags,
>>                                       dev_name, data_page);
>> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
>> index 048a85e..33423aa 100644
>> --- a/include/uapi/linux/fs.h
>> +++ b/include/uapi/linux/fs.h
>> @@ -131,6 +131,12 @@ struct inodes_stat_t {
>>  #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
>>  #define MS_LAZYTIME    (1<<25) /* Update the on-disk [acm]times lazily */
>>
>> +/*
>> + * Here are commands and flags. Commands are handled in do_mount()
>> + * and can intersect with kernel internal flags.
>> + */
>> +#define MS_SET_GROUP   (1<<26) /* Add a mount into a shared group */
>> +
>>  /* These sb flags are internal to the kernel */
>>  #define MS_SUBMOUNT     (1<<26)
>>  #define MS_NOREMOTELOCK        (1<<27)
>> --
>> 2.9.3
>>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] mnt: allow to add a mount into an existing group
@ 2017-05-10  0:42     ` Eric W. Biederman
  0 siblings, 0 replies; 13+ messages in thread
From: Eric W. Biederman @ 2017-05-10  0:42 UTC (permalink / raw)
  To: Andrey Vagin
  Cc: linux-fsdevel, LKML, Linux API, criu@openvz.org, Alexander Viro

Andrey Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org> writes:

> On Tue, Jan 24, 2017 at 02:03:23PM +1300, Eric W. Biederman wrote:
>> Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org> writes:
>>
>> > Now a shared group can be only inherited from a source mount.
>> > This patch adds an ability to add a mount into an existing shared
>> > group.
>>
>> This sounds like a lot of the discussion on bind mounts accross
>> namespaces.  I am going to stay out of this for a bit until
>> we resolve my latest patch.
>
> Hi Eric,
>
> Your patches about shadow/side mounts were committed, can we resume
> the discussion about this patch?

We can.

Although personally I would rather get back to figuring out how
we can reduce the horrible time complexity of the worst case
for umount -l.

> As for security, a mount can be added to a shared group, only if a
> caller has CAP_SYS_ADMIN in namespaces of both mounts, so an
> unprivileged user can't create a shared mount with a parent mount
> namespace. If a user has CAP_SYS_ADMIN, I don't see a reason to
> restrict him to create shared mounts between namespaces, even if they
> are in different user-namespaces.

Can they create loops in mount propagation trees that we can not create
today?  It feels like that would be very simple to do with an interface
like this.

A loop in a mount propagation tree would be an absolute disaster.

I remember Al Viro had some very firm ideas about bind mounts from
foreign namespaces.  That I have never take the time to understand.
I suspect whatever objections he had will apply in this case.  Or else
this code might be made unnecessary by allowing bind mounts between
mount namespaces.


> Now I look at volume drivers in container services (like docker and
> kubernetes) and I think this functionality can be useful for them too.
> Now it is impossible to run a plugin driver in unprivileged containers
> (with sub-userns), because a container has to have a shared mount with
> a mount namespace where the service is running. The idea of these
> plugins is that a service requests a volume mount from a plugin and
> then starts a container with this volume, so the service need to have
> a way to get a mount from a service.

Interesting.  I personally think the checkpoint/restart use case
is more compelling.

>> Eric
>
>
> On Thu, Apr 27, 2017 at 10:18 PM, Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org> wrote:
>> Now a shared group can be only inherited from a source mount.
>> This patch adds an ability to add a mount into an existing shared
>> group.
>>
>> mount(source, target, NULL, MS_SET_GROUP, NULL)
>>
>> mount() with the MS_SET_GROUP flag adds the "target" mount into a group
>> of the "source" mount. The calling process has to have the CAP_SYS_ADMIN
>> capability in namespaces of these mounts. The source and the target
>> mounts have to have the same super block.
>>
>> This new functionality together with "mnt: Tuck mounts under others
>> instead of creating shadow/side mounts." allows CRIU to dump and restore
>> any set of mount namespaces.
>>
>> Currently we have a lot of issues about dumping and restoring mount
>> namespaces. The bigest problem is that we can't construct mount trees
>> directly due to several reasons:
>> * groups can't be set, they can be only inherited
>> * file systems has to be mounted from the specified user namespaces
>> * the mount() syscall doesn't just create one mount -- the mount is
>>   also propagated to all members of a parent group
>> * umount() doesn't detach mounts from all members of a group
>>   (mounts with children are not umounted)
>> * mounts are propagated underneath of existing mounts
>> * mount() doesn't allow to make bind-mounts between two namespaces
>> * processes can have opened file descriptors to overmounted files
>>
>> All these operations are non-trivial, making the task of restoring
>> a mount namespace practically unsolvable for reasonable time. The
>> proposed change allows to restore a mount namespace in a direct
>> manner, without any super complex logic.
>>
>> Cc: Eric W. Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
>> Cc: Alexander Viro <viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
>> Signed-off-by: Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
>> ---
>>  fs/namespace.c          | 66 ++++++++++++++++++++++++++++++++++++++++++++++---
>>  include/uapi/linux/fs.h |  6 +++++
>>  2 files changed, 68 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index cc1375ef..3bf0cd2 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -2355,6 +2355,57 @@ static inline int tree_contains_unbindable(struct mount *mnt)
>>         return 0;
>>  }
>>
>> +static int do_set_group(struct path *path, const char *sibling_name)
>> +{
>> +       struct mount *sibling, *mnt;
>> +       struct path sibling_path;
>> +       int err;
>> +
>> +       if (!sibling_name || !*sibling_name)
>> +               return -EINVAL;
>> +
>> +       err = kern_path(sibling_name, LOOKUP_FOLLOW, &sibling_path);
>> +       if (err)
>> +               return err;
>> +
>> +       sibling = real_mount(sibling_path.mnt);
>> +       mnt = real_mount(path->mnt);
>> +
>> +       namespace_lock();
>> +
>> +       err = -EPERM;
>> +       if (!sibling->mnt_ns ||
>> +           !ns_capable(sibling->mnt_ns->user_ns, CAP_SYS_ADMIN))
>> +               goto out_unlock;
>> +
>> +       err = -EINVAL;
>> +       if (sibling->mnt.mnt_sb != mnt->mnt.mnt_sb)
>> +               goto out_unlock;
>> +
>> +       if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt))
>> +               goto out_unlock;
>> +
>> +       if (IS_MNT_SLAVE(sibling)) {
>> +               struct mount *m = sibling->mnt_master;
>> +
>> +               list_add(&mnt->mnt_slave, &m->mnt_slave_list);
>> +               mnt->mnt_master = m;
>> +       }
>> +
>> +       if (IS_MNT_SHARED(sibling)) {
>> +               mnt->mnt_group_id = sibling->mnt_group_id;
>> +               list_add(&mnt->mnt_share, &sibling->mnt_share);
>> +               set_mnt_shared(mnt);
>> +       }
>> +
>> +       err = 0;
>> +out_unlock:
>> +       namespace_unlock();
>> +
>> +       path_put(&sibling_path);
>> +       return err;
>> +}
>> +
>>  static int do_move_mount(struct path *path, const char *old_name)
>>  {
>>         struct path old_path, parent_path;
>> @@ -2769,6 +2820,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
>>         struct path path;
>>         int retval = 0;
>>         int mnt_flags = 0;
>> +       unsigned long cmd;
>>
>>         /* Discard magic */
>>         if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
>> @@ -2820,19 +2872,25 @@ long do_mount(const char *dev_name, const char __user *dir_name,
>>                 mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
>>         }
>>
>> +       cmd = flags & (MS_REMOUNT | MS_BIND |
>> +                      MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE |
>> +                      MS_MOVE | MS_SET_GROUP);
>> +
>>         flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
>>                    MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
>>                    MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT);
>>
>> -       if (flags & MS_REMOUNT)
>> +       if (cmd & MS_REMOUNT)
>>                 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
>>                                     data_page);
>> -       else if (flags & MS_BIND)
>> +       else if (cmd & MS_BIND)
>>                 retval = do_loopback(&path, dev_name, flags & MS_REC);
>> -       else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
>> +       else if (cmd & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
>>                 retval = do_change_type(&path, flags);
>> -       else if (flags & MS_MOVE)
>> +       else if (cmd & MS_MOVE)
>>                 retval = do_move_mount(&path, dev_name);
>> +       else if (cmd & MS_SET_GROUP)
>> +               retval = do_set_group(&path, dev_name);
>>         else
>>                 retval = do_new_mount(&path, type_page, flags, mnt_flags,
>>                                       dev_name, data_page);
>> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
>> index 048a85e..33423aa 100644
>> --- a/include/uapi/linux/fs.h
>> +++ b/include/uapi/linux/fs.h
>> @@ -131,6 +131,12 @@ struct inodes_stat_t {
>>  #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
>>  #define MS_LAZYTIME    (1<<25) /* Update the on-disk [acm]times lazily */
>>
>> +/*
>> + * Here are commands and flags. Commands are handled in do_mount()
>> + * and can intersect with kernel internal flags.
>> + */
>> +#define MS_SET_GROUP   (1<<26) /* Add a mount into a shared group */
>> +
>>  /* These sb flags are internal to the kernel */
>>  #define MS_SUBMOUNT     (1<<26)
>>  #define MS_NOREMOTELOCK        (1<<27)
>> --
>> 2.9.3
>>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] mnt: allow to add a mount into an existing group
  2017-05-10  0:42     ` Eric W. Biederman
  (?)
@ 2017-05-10 23:58     ` Andrei Vagin
  2021-03-23 12:59       ` [CRIU] " Pavel Tikhomirov
  -1 siblings, 1 reply; 13+ messages in thread
From: Andrei Vagin @ 2017-05-10 23:58 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andrey Vagin, linux-fsdevel, LKML, Linux API, criu, Alexander Viro

On Tue, May 09, 2017 at 07:42:00PM -0500, Eric W. Biederman wrote:
> Andrey Vagin <avagin@openvz.org> writes:
> 
> > On Tue, Jan 24, 2017 at 02:03:23PM +1300, Eric W. Biederman wrote:
> >> Andrei Vagin <avagin@openvz.org> writes:
> >>
> >> > Now a shared group can be only inherited from a source mount.
> >> > This patch adds an ability to add a mount into an existing shared
> >> > group.
> >>
> >> This sounds like a lot of the discussion on bind mounts accross
> >> namespaces.  I am going to stay out of this for a bit until
> >> we resolve my latest patch.
> >
> > Hi Eric,
> >
> > Your patches about shadow/side mounts were committed, can we resume
> > the discussion about this patch?
> 
> We can.

Thank you

> 
> Although personally I would rather get back to figuring out how
> we can reduce the horrible time complexity of the worst case
> for umount -l.

This task is interesting for me too and I definitely will return to it soon.

> 
> > As for security, a mount can be added to a shared group, only if a
> > caller has CAP_SYS_ADMIN in namespaces of both mounts, so an
> > unprivileged user can't create a shared mount with a parent mount
> > namespace. If a user has CAP_SYS_ADMIN, I don't see a reason to
> > restrict him to create shared mounts between namespaces, even if they
> > are in different user-namespaces.
> 
> Can they create loops in mount propagation trees that we can not create
> today?  It feels like that would be very simple to do with an interface
> like this.

I am not sure what "loops in mount propagation trees" means. If it means
that two mounts will have inverted pairs of id-s for shared and master
groups, the answer is no. This interface doesn't allow to do this.

       int mount(const char *source, const char *target,
                 const char *filesystemtype, unsigned long mountflags,
                 const void *data);

This interface allows to add a target mount into shared AND slave
groups of the source mount. It is possible only if a target mount
is a private one.

Here is an example how it works:

[root@fc24 mounts]# cat /proc/self/mountinfo | grep test_mount
230 20 0:44 / /root/git/experiments/mounts/test/C rw,relatime - tmpfs test_mount rw
234 20 0:44 / /root/git/experiments/mounts/test/B rw,relatime shared:143 master:136 - tmpfs test_mount rw
[root@fc24 mounts]# strace -e mount ./set_group test/B test/C
mount("test/B", "test/C", NULL, 0x4000000 /* MS_??? */, NULL) = 0
+++ exited with 0 +++
[root@fc24 mounts]# cat /proc/self/mountinfo | grep test_mount
230 20 0:44 / /root/git/experiments/mounts/test/C rw,relatime shared:143 master:136 - tmpfs test_mount rw
234 20 0:44 / /root/git/experiments/mounts/test/B rw,relatime shared:143 master:136 - tmpfs test_mount rw

> 
> A loop in a mount propagation tree would be an absolute disaster.
> 
> I remember Al Viro had some very firm ideas about bind mounts from
> foreign namespaces.  That I have never take the time to understand.
> I suspect whatever objections he had will apply in this case.  Or else

It is interesting to get more details about this. I failed to find
something releative to this problem in lkml.

> this code might be made unnecessary by allowing bind mounts between
> mount namespaces.

No. We are going to use it even for a case when all mount namespaces
are lived on one user namespace. The main idea is to split restoring
of mount trees and mount groups.

Look at this example:

[root@fc24 mounts01]# cat test.sh 
set -e
mkdir -p A
mkdir -p B
mount -t tmpfs test_A A
mount --make-shared A
mkdir A/C
mount -t tmpfs test_C A/C
mkdir -p E
mount --bind A E
mount --bind A B
mount -t tmpfs test_E E/C
mount --make-private B/C
mkdir B/C/D
mount -t tmpfs test_D B/C/D
umount -l E/C
umount -l B/C/D
mount --make-rprivate E
umount -l E
cat /proc/self/mountinfo | grep test_

[root@fc24 mounts01]# unshare -m sh test.sh 
156 124 0:43 / /root/git/experiments/mounts01/A rw,relatime shared:70 - tmpfs test_A rw
157 156 0:44 / /root/git/experiments/mounts01/A/C rw,relatime shared:71 - tmpfs test_C rw
159 124 0:43 / /root/git/experiments/mounts01/B rw,relatime shared:70 - tmpfs test_A rw
162 159 0:45 / /root/git/experiments/mounts01/B/C rw,relatime - tmpfs test_E rw

Here we can see two mounts from one shared group, which have a
completely different set of children. Now this configuration can't be
achivied without extra helper mounts, which were umounted to get this
configuration.

With this new interface we will be able to restore a mount tree and
then restore groups for them and the algorithm is trivial:

mount -t tmpfs test_A A
mount --bind A B
mount -t tmpfs test_C A/C
mount -t tmpfs test_E B/C
mount --make-share A
mount --set-group A B
mount --make-share A/C

If we don't have the introduced interface, we need to invent this magic
sequence of commands (like in the test script) to reproduce the result
picture. The problem is that, when we are working with shared mounts,
all operations become more complex. We need to take into account that
mounts can be propagated to somewere, we need to remeber that the umount
command is propagated only for mounts without children. The bind-mount
command can be recursive or not. A mount could became shared when it
already had children. There are a number of other conditions for various
situations.

The superposition of this untrivial commands is making the task of restoring
a mount namespace practically unsolvable for reasonable time.

> 
> 
> > Now I look at volume drivers in container services (like docker and
> > kubernetes) and I think this functionality can be useful for them too.
> > Now it is impossible to run a plugin driver in unprivileged containers
> > (with sub-userns), because a container has to have a shared mount with
> > a mount namespace where the service is running. The idea of these
> > plugins is that a service requests a volume mount from a plugin and
> > then starts a container with this volume, so the service need to have
> > a way to get a mount from a service.
> 
> Interesting.  I personally think the checkpoint/restart use case
> is more compelling.

I just wanted to say that there may be other users for this interface.

Thanks,
Andrei
> 
> >> Eric
> >
> >
> > On Thu, Apr 27, 2017 at 10:18 PM, Andrei Vagin <avagin@openvz.org> wrote:
> >> Now a shared group can be only inherited from a source mount.
> >> This patch adds an ability to add a mount into an existing shared
> >> group.
> >>
> >> mount(source, target, NULL, MS_SET_GROUP, NULL)
> >>
> >> mount() with the MS_SET_GROUP flag adds the "target" mount into a group
> >> of the "source" mount. The calling process has to have the CAP_SYS_ADMIN
> >> capability in namespaces of these mounts. The source and the target
> >> mounts have to have the same super block.
> >>
> >> This new functionality together with "mnt: Tuck mounts under others
> >> instead of creating shadow/side mounts." allows CRIU to dump and restore
> >> any set of mount namespaces.
> >>
> >> Currently we have a lot of issues about dumping and restoring mount
> >> namespaces. The bigest problem is that we can't construct mount trees
> >> directly due to several reasons:
> >> * groups can't be set, they can be only inherited
> >> * file systems has to be mounted from the specified user namespaces
> >> * the mount() syscall doesn't just create one mount -- the mount is
> >>   also propagated to all members of a parent group
> >> * umount() doesn't detach mounts from all members of a group
> >>   (mounts with children are not umounted)
> >> * mounts are propagated underneath of existing mounts
> >> * mount() doesn't allow to make bind-mounts between two namespaces
> >> * processes can have opened file descriptors to overmounted files
> >>
> >> All these operations are non-trivial, making the task of restoring
> >> a mount namespace practically unsolvable for reasonable time. The
> >> proposed change allows to restore a mount namespace in a direct
> >> manner, without any super complex logic.
> >>
> >> Cc: Eric W. Biederman <ebiederm@xmission.com>
> >> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> >> Signed-off-by: Andrei Vagin <avagin@openvz.org>
> >> ---
> >>  fs/namespace.c          | 66 ++++++++++++++++++++++++++++++++++++++++++++++---
> >>  include/uapi/linux/fs.h |  6 +++++
> >>  2 files changed, 68 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/fs/namespace.c b/fs/namespace.c
> >> index cc1375ef..3bf0cd2 100644
> >> --- a/fs/namespace.c
> >> +++ b/fs/namespace.c
> >> @@ -2355,6 +2355,57 @@ static inline int tree_contains_unbindable(struct mount *mnt)
> >>         return 0;
> >>  }
> >>
> >> +static int do_set_group(struct path *path, const char *sibling_name)
> >> +{
> >> +       struct mount *sibling, *mnt;
> >> +       struct path sibling_path;
> >> +       int err;
> >> +
> >> +       if (!sibling_name || !*sibling_name)
> >> +               return -EINVAL;
> >> +
> >> +       err = kern_path(sibling_name, LOOKUP_FOLLOW, &sibling_path);
> >> +       if (err)
> >> +               return err;
> >> +
> >> +       sibling = real_mount(sibling_path.mnt);
> >> +       mnt = real_mount(path->mnt);
> >> +
> >> +       namespace_lock();
> >> +
> >> +       err = -EPERM;
> >> +       if (!sibling->mnt_ns ||
> >> +           !ns_capable(sibling->mnt_ns->user_ns, CAP_SYS_ADMIN))
> >> +               goto out_unlock;
> >> +
> >> +       err = -EINVAL;
> >> +       if (sibling->mnt.mnt_sb != mnt->mnt.mnt_sb)
> >> +               goto out_unlock;
> >> +
> >> +       if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt))
> >> +               goto out_unlock;
> >> +
> >> +       if (IS_MNT_SLAVE(sibling)) {
> >> +               struct mount *m = sibling->mnt_master;
> >> +
> >> +               list_add(&mnt->mnt_slave, &m->mnt_slave_list);
> >> +               mnt->mnt_master = m;
> >> +       }
> >> +
> >> +       if (IS_MNT_SHARED(sibling)) {
> >> +               mnt->mnt_group_id = sibling->mnt_group_id;
> >> +               list_add(&mnt->mnt_share, &sibling->mnt_share);
> >> +               set_mnt_shared(mnt);
> >> +       }
> >> +
> >> +       err = 0;
> >> +out_unlock:
> >> +       namespace_unlock();
> >> +
> >> +       path_put(&sibling_path);
> >> +       return err;
> >> +}
> >> +
> >>  static int do_move_mount(struct path *path, const char *old_name)
> >>  {
> >>         struct path old_path, parent_path;
> >> @@ -2769,6 +2820,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
> >>         struct path path;
> >>         int retval = 0;
> >>         int mnt_flags = 0;
> >> +       unsigned long cmd;
> >>
> >>         /* Discard magic */
> >>         if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
> >> @@ -2820,19 +2872,25 @@ long do_mount(const char *dev_name, const char __user *dir_name,
> >>                 mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
> >>         }
> >>
> >> +       cmd = flags & (MS_REMOUNT | MS_BIND |
> >> +                      MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE |
> >> +                      MS_MOVE | MS_SET_GROUP);
> >> +
> >>         flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
> >>                    MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
> >>                    MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT);
> >>
> >> -       if (flags & MS_REMOUNT)
> >> +       if (cmd & MS_REMOUNT)
> >>                 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
> >>                                     data_page);
> >> -       else if (flags & MS_BIND)
> >> +       else if (cmd & MS_BIND)
> >>                 retval = do_loopback(&path, dev_name, flags & MS_REC);
> >> -       else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
> >> +       else if (cmd & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
> >>                 retval = do_change_type(&path, flags);
> >> -       else if (flags & MS_MOVE)
> >> +       else if (cmd & MS_MOVE)
> >>                 retval = do_move_mount(&path, dev_name);
> >> +       else if (cmd & MS_SET_GROUP)
> >> +               retval = do_set_group(&path, dev_name);
> >>         else
> >>                 retval = do_new_mount(&path, type_page, flags, mnt_flags,
> >>                                       dev_name, data_page);
> >> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> >> index 048a85e..33423aa 100644
> >> --- a/include/uapi/linux/fs.h
> >> +++ b/include/uapi/linux/fs.h
> >> @@ -131,6 +131,12 @@ struct inodes_stat_t {
> >>  #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
> >>  #define MS_LAZYTIME    (1<<25) /* Update the on-disk [acm]times lazily */
> >>
> >> +/*
> >> + * Here are commands and flags. Commands are handled in do_mount()
> >> + * and can intersect with kernel internal flags.
> >> + */
> >> +#define MS_SET_GROUP   (1<<26) /* Add a mount into a shared group */
> >> +
> >>  /* These sb flags are internal to the kernel */
> >>  #define MS_SUBMOUNT     (1<<26)
> >>  #define MS_NOREMOTELOCK        (1<<27)
> >> --
> >> 2.9.3
> >>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [CRIU] [PATCH] mnt: allow to add a mount into an existing group
  2017-05-10 23:58     ` Andrei Vagin
@ 2021-03-23 12:59       ` Pavel Tikhomirov
  2021-03-24 14:52         ` Pavel Tikhomirov
  0 siblings, 1 reply; 13+ messages in thread
From: Pavel Tikhomirov @ 2021-03-23 12:59 UTC (permalink / raw)
  To: Alexander Viro, Eric W . Biederman
  Cc: Andrei Vagin, Pavel Tikhomirov, Pavel Tikhomirov, linux-fsdevel,
	lkml, linux-api, crml

[-- Attachment #1: Type: text/plain, Size: 6301 bytes --]

Hi! Can we restart the discussion on this topic?

In CRIU we need to be able to dump/restore all mount trees of system 
container (CT). CT can have anything inside - users which create their 
custom mounts configuration, systemd with custom mount namespaces for 
it's services, nested application containers inside the CT with their 
own mount namespaces, and all mounts in CT mount trees can be grouped by 
sharing groupes (e.g. same shared_id + master_id pair), and those groups 
can depend one from another forming a tree structure of sharing groups.

1) Imagine that we have this sharing group tree (in format (shared_id, 
master_id), 0 means no sharing, we don't care about actual mounts for 
now only master-slave dependencies between sharing groups):

(1,0)
   |- (2,1)
   |- (3,1)
        |- (4,3)
             |- (0,4)

The main problem of restoring mounts is the fact that sharing groups 
currently can be only inherited, e.g. if you have one mount (first) with 
shared_id = x, master_id = y, the only way to get another mount with 
(x,y) is to create a bindmount from the first mount. Also to create 
mount (y,z) from mount (x,y) one should also first inherit (x,y) via 
bindmount and than change to (y,z).

This means that mentioned above tree puts restriction on the mounts 
creation order, one need to have at least one mount for each of sharing 
groups (1,0), (3,1) and (4,3) before creating the first mount of the 
sharing group (0,4).

But what if we want to mount (restore) actual mounts in this mount tree 
"reverse" order:

mntid	parent	mountpoint	(shared_id, master_id)
101	0	/tmp		(0,4)
102	101	/tmp		(4,3)
103	102	/tmp		(3,1)
104	103	/tmp		(1,0)

Mount 104's sharing group should be created before mount 101, 102 and 
103 sharing groups, but mount 104 should be created after those mounts. 
One can actually prepare this setup (on mainstream kernel) by 
pre-creating sharing groups elsewhere and then binding to /tmp in proper 
order with careful unmounting of propagations (see test.sh attached):

[root@snorch propagation-tests]# bash ../test.sh
------------
960 1120 0:56 / /tmp/propagation-tests/tmp rw,relatime master:452 - 
tmpfs propagation-tests-src rw,inode64
958 960 0:56 / /tmp/propagation-tests/tmp/sub rw,relatime shared:452 
master:451 - tmpfs propagation-tests-src rw,inode64
961 958 0:56 / /tmp/propagation-tests/tmp/sub/sub rw,relatime shared:451 
master:433 - tmpfs propagation-tests-src rw,inode64
963 961 0:56 / /tmp/propagation-tests/tmp/sub/sub/sub rw,relatime 
shared:433 - tmpfs propagation-tests-src rw,inode64
------------

But this "pre-creating" from test.sh is not universal at all and only 
works for this simple case. CRIU does not know anything about the 
history of mount creation for system container, it also does not know 
anything about any temporary mounts which were used and then removed. So 
understanding the proper order is almost impossible like Andrew says.

I've also prepared a presentation on Linux Plumbers last year about how 
much problems propagation brings to mounts restore in CRIU, you can take 
a look here https://www.linuxplumbersconf.org/event/7/contributions/640/

2) Propagation creates tons of mounts
3) Mount reparenting
4) "Mount trap"
5) "Non-uniform" propagation
6) “Cross-namespace” sharing groups

Allowing to create mounts private first and create sharing groups later 
and copy sharing groups later instead of inheriting them resolves all 
the problems with propagation at once.

One can take a look on the implementation of sharing group restore in 
CRIU if we have this (mnt: allow to add a mount into an existing group) 
patch applied: 
https://github.com/Snorch/criu/blob/bebbded98128ec787950fa8365a6c74ce6a3b2cb/criu/mount-v2.c#L898

Obviously this does not solve all the problems with mounts I know about 
but it's a big step forward in properly supporting them in CRIU. We 
already have this tested in Virtuozzo for almost a year and it works nice.

Notes:

- There is another idea, but I should say early that I don't like it, 
because with it restoring mounts with criu would be still super complex. 
We can add extra flag to mount/move_mount syscall to disable propagation 
temporary so that CRIU can restore the mount tree without problems 2-5, 
also we can now create cross-namespace bindmounts with 
(copy_tree+move_mount) to solve 6. But this solution does not help much 
with problem 1 - ordering and the need of temporary mounts. As you can 
see in test.sh you would still need to think hard to solve different 
similar configurations of reverse order between mounts and sharing groups.

- We can actually prohibit cross-namespace MS_SET_GROUP if you like. (If 
both namespaces are non abstract.) We can use open_tree to create a copy 
of the mount with the same sharing group and only then copy sharing from 
the copy while being in proper mountns.

- We still need it:

 > this code might be made unnecessary by allowing bind mounts between
 > mount namespaces.

No, because of problem 1. Guessing right order would be still to complex.

- This approach does not allow creation of any "bad" trees.

 > Can they create loops in mount propagation trees that we can not 
create today?

There would be no loops in "sharing groups tree" for sure, as this new 
MS_SET_GROUP only adds one _private_ mount to one group (without moving 
between groups), the tree itself is unchanged after mount(MS_SET_GROUP).

- Probably mount syscall is not the right place for MS_SET_GROUP. I see 
new syscall mount_setattr, first I thought reworking MS_SET_GROUP to be 
a part of it, but interface of mount_setattr for copying is not 
convenient. Probably we can add MS_SET_GROUP flag to move_mount which 
has exactly what we want path to source and destination relative to fd:

static inline int move_mount(int from_dfd, const char *from_pathname,
                              int to_dfd, const char *to_pathname,
                              unsigned int flags)

As in mount-v2 now I had to use proc hacks to access mounts at dfd:

https://github.com/Snorch/criu/blob/bebbded98128ec787950fa8365a6c74ce6a3b2cb/criu/mount-v2.c#L923

- I hope that we still have a chance for MS_SET_GROUP, this way we can 
port mount-v2 to mainstream CRIU.

-- 
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.

[-- Attachment #2: test.sh --]
[-- Type: application/x-shellscript, Size: 726 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [CRIU] [PATCH] mnt: allow to add a mount into an existing group
  2021-03-23 12:59       ` [CRIU] " Pavel Tikhomirov
@ 2021-03-24 14:52         ` Pavel Tikhomirov
  0 siblings, 0 replies; 13+ messages in thread
From: Pavel Tikhomirov @ 2021-03-24 14:52 UTC (permalink / raw)
  To: Alexander Viro, Eric W . Biederman
  Cc: Pavel Tikhomirov, linux-fsdevel, lkml, linux-api, crml, Andrei Vagin

Adding Andrew to CC with the right email.

On 3/23/21 3:59 PM, Pavel Tikhomirov wrote:
> Hi! Can we restart the discussion on this topic?
> 
> In CRIU we need to be able to dump/restore all mount trees of system 
> container (CT). CT can have anything inside - users which create their 
> custom mounts configuration, systemd with custom mount namespaces for 
> it's services, nested application containers inside the CT with their 
> own mount namespaces, and all mounts in CT mount trees can be grouped by 
> sharing groupes (e.g. same shared_id + master_id pair), and those groups 
> can depend one from another forming a tree structure of sharing groups.
> 
> 1) Imagine that we have this sharing group tree (in format (shared_id, 
> master_id), 0 means no sharing, we don't care about actual mounts for 
> now only master-slave dependencies between sharing groups):
> 
> (1,0)
>    |- (2,1)
>    |- (3,1)
>         |- (4,3)
>              |- (0,4)
> 
> The main problem of restoring mounts is the fact that sharing groups 
> currently can be only inherited, e.g. if you have one mount (first) with 
> shared_id = x, master_id = y, the only way to get another mount with 
> (x,y) is to create a bindmount from the first mount. Also to create 
> mount (y,z) from mount (x,y) one should also first inherit (x,y) via 
> bindmount and than change to (y,z).
> 
> This means that mentioned above tree puts restriction on the mounts 
> creation order, one need to have at least one mount for each of sharing 
> groups (1,0), (3,1) and (4,3) before creating the first mount of the 
> sharing group (0,4).
> 
> But what if we want to mount (restore) actual mounts in this mount tree 
> "reverse" order:
> 
> mntid    parent    mountpoint    (shared_id, master_id)
> 101    0    /tmp        (0,4)
> 102    101    /tmp        (4,3)
> 103    102    /tmp        (3,1)
> 104    103    /tmp        (1,0)
> 
> Mount 104's sharing group should be created before mount 101, 102 and 
> 103 sharing groups, but mount 104 should be created after those mounts. 
> One can actually prepare this setup (on mainstream kernel) by 
> pre-creating sharing groups elsewhere and then binding to /tmp in proper 
> order with careful unmounting of propagations (see test.sh attached):
> 
> [root@snorch propagation-tests]# bash ../test.sh
> ------------
> 960 1120 0:56 / /tmp/propagation-tests/tmp rw,relatime master:452 - 
> tmpfs propagation-tests-src rw,inode64
> 958 960 0:56 / /tmp/propagation-tests/tmp/sub rw,relatime shared:452 
> master:451 - tmpfs propagation-tests-src rw,inode64
> 961 958 0:56 / /tmp/propagation-tests/tmp/sub/sub rw,relatime shared:451 
> master:433 - tmpfs propagation-tests-src rw,inode64
> 963 961 0:56 / /tmp/propagation-tests/tmp/sub/sub/sub rw,relatime 
> shared:433 - tmpfs propagation-tests-src rw,inode64
> ------------
> 
> But this "pre-creating" from test.sh is not universal at all and only 
> works for this simple case. CRIU does not know anything about the 
> history of mount creation for system container, it also does not know 
> anything about any temporary mounts which were used and then removed. So 
> understanding the proper order is almost impossible like Andrew says.
> 
> I've also prepared a presentation on Linux Plumbers last year about how 
> much problems propagation brings to mounts restore in CRIU, you can take 
> a look here https://www.linuxplumbersconf.org/event/7/contributions/640/
> 
> 2) Propagation creates tons of mounts
> 3) Mount reparenting
> 4) "Mount trap"
> 5) "Non-uniform" propagation
> 6) “Cross-namespace” sharing groups
> 
> Allowing to create mounts private first and create sharing groups later 
> and copy sharing groups later instead of inheriting them resolves all 
> the problems with propagation at once.
> 
> One can take a look on the implementation of sharing group restore in 
> CRIU if we have this (mnt: allow to add a mount into an existing group) 
> patch applied: 
> https://github.com/Snorch/criu/blob/bebbded98128ec787950fa8365a6c74ce6a3b2cb/criu/mount-v2.c#L898 
> 
> 
> Obviously this does not solve all the problems with mounts I know about 
> but it's a big step forward in properly supporting them in CRIU. We 
> already have this tested in Virtuozzo for almost a year and it works nice.
> 
> Notes:
> 
> - There is another idea, but I should say early that I don't like it, 
> because with it restoring mounts with criu would be still super complex. 
> We can add extra flag to mount/move_mount syscall to disable propagation 
> temporary so that CRIU can restore the mount tree without problems 2-5, 
> also we can now create cross-namespace bindmounts with 
> (copy_tree+move_mount) to solve 6. But this solution does not help much 
> with problem 1 - ordering and the need of temporary mounts. As you can 
> see in test.sh you would still need to think hard to solve different 
> similar configurations of reverse order between mounts and sharing groups.
> 
> - We can actually prohibit cross-namespace MS_SET_GROUP if you like. (If 
> both namespaces are non abstract.) We can use open_tree to create a copy 
> of the mount with the same sharing group and only then copy sharing from 
> the copy while being in proper mountns.
> 
> - We still need it:
> 
>  > this code might be made unnecessary by allowing bind mounts between
>  > mount namespaces.
> 
> No, because of problem 1. Guessing right order would be still to complex.
> 
> - This approach does not allow creation of any "bad" trees.
> 
>  > Can they create loops in mount propagation trees that we can not 
> create today?
> 
> There would be no loops in "sharing groups tree" for sure, as this new 
> MS_SET_GROUP only adds one _private_ mount to one group (without moving 
> between groups), the tree itself is unchanged after mount(MS_SET_GROUP).
> 
> - Probably mount syscall is not the right place for MS_SET_GROUP. I see 
> new syscall mount_setattr, first I thought reworking MS_SET_GROUP to be 
> a part of it, but interface of mount_setattr for copying is not 
> convenient. Probably we can add MS_SET_GROUP flag to move_mount which 
> has exactly what we want path to source and destination relative to fd:
> 
> static inline int move_mount(int from_dfd, const char *from_pathname,
>                               int to_dfd, const char *to_pathname,
>                               unsigned int flags)
> 
> As in mount-v2 now I had to use proc hacks to access mounts at dfd:
> 
> https://github.com/Snorch/criu/blob/bebbded98128ec787950fa8365a6c74ce6a3b2cb/criu/mount-v2.c#L923 
> 
> 
> - I hope that we still have a chance for MS_SET_GROUP, this way we can 
> port mount-v2 to mainstream CRIU.
> 

-- 
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] mnt: allow to add a mount into an existing group
  2017-01-24  1:03 ` Eric W. Biederman
@ 2017-03-01  3:20     ` Andrei Vagin
  0 siblings, 0 replies; 13+ messages in thread
From: Andrei Vagin @ 2017-03-01  3:20 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andrei Vagin, Alexander Viro, linux-fsdevel, linux-kernel,
	linux-api, Ram Pai

On Tue, Jan 24, 2017 at 02:03:23PM +1300, Eric W. Biederman wrote:
> Andrei Vagin <avagin@openvz.org> writes:
> 
> > Now a shared group can be only inherited from a source mount.
> > This patch adds an ability to add a mount into an existing shared
> > group.
> 
> This sounds like a lot of the discussion on bind mounts accross
> namespaces.  I am going to stay out of this for a bit until
> we resolve my latest patch.

Hello Eric,

I have seen your patches in the Linus' tree. Could we return to this
patch?

This patch should not add any security issues, because it allows to
create shared mounts between namespaces only if the current user has
CAP_SYS_ADMIN in both these mount namespaces.

For us (CRIU) this patch allows to separate restore of mount trees and
restore of shared groups.

Thanks,
Andrei

> 
> Eric
> 
> 
> > mount(source, target, NULL, MS_SET_GROUP, NULL)
> >
> > mount() with the MS_SET_GROUP flag adds the "target" mount into a group
> > of the "source" mount. The calling process has to have the CAP_SYS_ADMIN
> > capability in namespaces of these mounts. The source and the target
> > mounts have to have the same super block.
> >
> > This new functionality together with "mnt: Tuck mounts under others
> > instead of creating shadow/side mounts." allows CRIU to dump and restore
> > any set of mount namespaces.
> >
> > Currently we have a lot of issues about dumping and restoring mount
> > namespaces. The bigest problem is that we can't construct mount trees
> > directly due to several reasons:
> > * groups can't be set, they can be only inherited
> > * file systems has to be mounted from the specified user namespaces
> > * the mount() syscall doesn't just create one mount -- the mount is
> >   also propagated to all members of a parent group
> > * umount() doesn't detach mounts from all members of a group
> >   (mounts with children are not umounted)
> > * mounts are propagated underneath of existing mounts
> > * mount() doesn't allow to make bind-mounts between two namespaces
> > * processes can have opened file descriptors to overmounted files
> >
> > All these operations are non-trivial, making the task of restoring
> > a mount namespace practically unsolvable for reasonable time. The
> > proposed change allows to restore a mount namespace in a direct
> > manner, without any super complex logic.
> >
> > Cc: Eric W. Biederman <ebiederm@xmission.com>
> > Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> > Signed-off-by: Andrei Vagin <avagin@openvz.org>
> > ---
> >  fs/namespace.c          | 53 +++++++++++++++++++++++++++++++++++++++++++++++++
> >  include/uapi/linux/fs.h |  1 +
> >  2 files changed, 54 insertions(+)
> >
> > diff --git a/fs/namespace.c b/fs/namespace.c
> > index b5b1259..df52fd4 100644
> > --- a/fs/namespace.c
> > +++ b/fs/namespace.c
> > @@ -2301,6 +2301,57 @@ static inline int tree_contains_unbindable(struct mount *mnt)
> >  	return 0;
> >  }
> >  
> > +static int do_set_group(struct path *path, const char *sibling_name)
> > +{
> > +	struct mount *sibling, *mnt;
> > +	struct path sibling_path;
> > +	int err;
> > +
> > +	if (!sibling_name || !*sibling_name)
> > +		return -EINVAL;
> > +
> > +	err = kern_path(sibling_name, LOOKUP_FOLLOW, &sibling_path);
> > +	if (err)
> > +		return err;
> > +
> > +	sibling = real_mount(sibling_path.mnt);
> > +	mnt = real_mount(path->mnt);
> > +
> > +	namespace_lock();
> > +
> > +	err = -EPERM;
> > +	if (!sibling->mnt_ns ||
> > +	    !ns_capable(sibling->mnt_ns->user_ns, CAP_SYS_ADMIN))
> > +		goto out_unlock;
> > +
> > +	err = -EINVAL;
> > +	if (sibling->mnt.mnt_sb != mnt->mnt.mnt_sb)
> > +		goto out_unlock;
> > +
> > +	if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt))
> > +		goto out_unlock;
> > +
> > +	if (IS_MNT_SLAVE(sibling)) {
> > +		struct mount *m = sibling->mnt_master;
> > +
> > +		list_add(&mnt->mnt_slave, &m->mnt_slave_list);
> > +		mnt->mnt_master = m;
> > +	}
> > +
> > +	if (IS_MNT_SHARED(sibling)) {
> > +		mnt->mnt_group_id = sibling->mnt_group_id;
> > +		list_add(&mnt->mnt_share, &sibling->mnt_share);
> > +		set_mnt_shared(mnt);
> > +	}
> > +
> > +	err = 0;
> > +out_unlock:
> > +	namespace_unlock();
> > +
> > +	path_put(&sibling_path);
> > +	return err;
> > +}
> > +
> >  static int do_move_mount(struct path *path, const char *old_name)
> >  {
> >  	struct path old_path, parent_path;
> > @@ -2779,6 +2830,8 @@ long do_mount(const char *dev_name, const char __user *dir_name,
> >  		retval = do_change_type(&path, flags);
> >  	else if (flags & MS_MOVE)
> >  		retval = do_move_mount(&path, dev_name);
> > +	else if (flags & MS_SET_GROUP)
> > +		retval = do_set_group(&path, dev_name);
> >  	else
> >  		retval = do_new_mount(&path, type_page, flags, mnt_flags,
> >  				      dev_name, data_page);
> > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > index 36da93f..6e6e37d 100644
> > --- a/include/uapi/linux/fs.h
> > +++ b/include/uapi/linux/fs.h
> > @@ -130,6 +130,7 @@ struct inodes_stat_t {
> >  #define MS_I_VERSION	(1<<23) /* Update inode I_version field */
> >  #define MS_STRICTATIME	(1<<24) /* Always perform atime updates */
> >  #define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
> > +#define MS_SET_GROUP	(1<<26) /* Add a mount into a shared group */
> >  
> >  /* These sb flags are internal to the kernel */
> >  #define MS_NOREMOTELOCK	(1<<27)

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] mnt: allow to add a mount into an existing group
@ 2017-03-01  3:20     ` Andrei Vagin
  0 siblings, 0 replies; 13+ messages in thread
From: Andrei Vagin @ 2017-03-01  3:20 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andrei Vagin, Alexander Viro, linux-fsdevel, linux-kernel,
	linux-api, Ram Pai

On Tue, Jan 24, 2017 at 02:03:23PM +1300, Eric W. Biederman wrote:
> Andrei Vagin <avagin@openvz.org> writes:
> 
> > Now a shared group can be only inherited from a source mount.
> > This patch adds an ability to add a mount into an existing shared
> > group.
> 
> This sounds like a lot of the discussion on bind mounts accross
> namespaces.  I am going to stay out of this for a bit until
> we resolve my latest patch.

Hello Eric,

I have seen your patches in the Linus' tree. Could we return to this
patch?

This patch should not add any security issues, because it allows to
create shared mounts between namespaces only if the current user has
CAP_SYS_ADMIN in both these mount namespaces.

For us (CRIU) this patch allows to separate restore of mount trees and
restore of shared groups.

Thanks,
Andrei

> 
> Eric
> 
> 
> > mount(source, target, NULL, MS_SET_GROUP, NULL)
> >
> > mount() with the MS_SET_GROUP flag adds the "target" mount into a group
> > of the "source" mount. The calling process has to have the CAP_SYS_ADMIN
> > capability in namespaces of these mounts. The source and the target
> > mounts have to have the same super block.
> >
> > This new functionality together with "mnt: Tuck mounts under others
> > instead of creating shadow/side mounts." allows CRIU to dump and restore
> > any set of mount namespaces.
> >
> > Currently we have a lot of issues about dumping and restoring mount
> > namespaces. The bigest problem is that we can't construct mount trees
> > directly due to several reasons:
> > * groups can't be set, they can be only inherited
> > * file systems has to be mounted from the specified user namespaces
> > * the mount() syscall doesn't just create one mount -- the mount is
> >   also propagated to all members of a parent group
> > * umount() doesn't detach mounts from all members of a group
> >   (mounts with children are not umounted)
> > * mounts are propagated underneath of existing mounts
> > * mount() doesn't allow to make bind-mounts between two namespaces
> > * processes can have opened file descriptors to overmounted files
> >
> > All these operations are non-trivial, making the task of restoring
> > a mount namespace practically unsolvable for reasonable time. The
> > proposed change allows to restore a mount namespace in a direct
> > manner, without any super complex logic.
> >
> > Cc: Eric W. Biederman <ebiederm@xmission.com>
> > Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> > Signed-off-by: Andrei Vagin <avagin@openvz.org>
> > ---
> >  fs/namespace.c          | 53 +++++++++++++++++++++++++++++++++++++++++++++++++
> >  include/uapi/linux/fs.h |  1 +
> >  2 files changed, 54 insertions(+)
> >
> > diff --git a/fs/namespace.c b/fs/namespace.c
> > index b5b1259..df52fd4 100644
> > --- a/fs/namespace.c
> > +++ b/fs/namespace.c
> > @@ -2301,6 +2301,57 @@ static inline int tree_contains_unbindable(struct mount *mnt)
> >  	return 0;
> >  }
> >  
> > +static int do_set_group(struct path *path, const char *sibling_name)
> > +{
> > +	struct mount *sibling, *mnt;
> > +	struct path sibling_path;
> > +	int err;
> > +
> > +	if (!sibling_name || !*sibling_name)
> > +		return -EINVAL;
> > +
> > +	err = kern_path(sibling_name, LOOKUP_FOLLOW, &sibling_path);
> > +	if (err)
> > +		return err;
> > +
> > +	sibling = real_mount(sibling_path.mnt);
> > +	mnt = real_mount(path->mnt);
> > +
> > +	namespace_lock();
> > +
> > +	err = -EPERM;
> > +	if (!sibling->mnt_ns ||
> > +	    !ns_capable(sibling->mnt_ns->user_ns, CAP_SYS_ADMIN))
> > +		goto out_unlock;
> > +
> > +	err = -EINVAL;
> > +	if (sibling->mnt.mnt_sb != mnt->mnt.mnt_sb)
> > +		goto out_unlock;
> > +
> > +	if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt))
> > +		goto out_unlock;
> > +
> > +	if (IS_MNT_SLAVE(sibling)) {
> > +		struct mount *m = sibling->mnt_master;
> > +
> > +		list_add(&mnt->mnt_slave, &m->mnt_slave_list);
> > +		mnt->mnt_master = m;
> > +	}
> > +
> > +	if (IS_MNT_SHARED(sibling)) {
> > +		mnt->mnt_group_id = sibling->mnt_group_id;
> > +		list_add(&mnt->mnt_share, &sibling->mnt_share);
> > +		set_mnt_shared(mnt);
> > +	}
> > +
> > +	err = 0;
> > +out_unlock:
> > +	namespace_unlock();
> > +
> > +	path_put(&sibling_path);
> > +	return err;
> > +}
> > +
> >  static int do_move_mount(struct path *path, const char *old_name)
> >  {
> >  	struct path old_path, parent_path;
> > @@ -2779,6 +2830,8 @@ long do_mount(const char *dev_name, const char __user *dir_name,
> >  		retval = do_change_type(&path, flags);
> >  	else if (flags & MS_MOVE)
> >  		retval = do_move_mount(&path, dev_name);
> > +	else if (flags & MS_SET_GROUP)
> > +		retval = do_set_group(&path, dev_name);
> >  	else
> >  		retval = do_new_mount(&path, type_page, flags, mnt_flags,
> >  				      dev_name, data_page);
> > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > index 36da93f..6e6e37d 100644
> > --- a/include/uapi/linux/fs.h
> > +++ b/include/uapi/linux/fs.h
> > @@ -130,6 +130,7 @@ struct inodes_stat_t {
> >  #define MS_I_VERSION	(1<<23) /* Update inode I_version field */
> >  #define MS_STRICTATIME	(1<<24) /* Always perform atime updates */
> >  #define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
> > +#define MS_SET_GROUP	(1<<26) /* Add a mount into a shared group */
> >  
> >  /* These sb flags are internal to the kernel */
> >  #define MS_NOREMOTELOCK	(1<<27)

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] mnt: allow to add a mount into an existing group
  2017-01-23 23:37 Andrei Vagin
@ 2017-01-24  1:03 ` Eric W. Biederman
  2017-03-01  3:20     ` Andrei Vagin
  0 siblings, 1 reply; 13+ messages in thread
From: Eric W. Biederman @ 2017-01-24  1:03 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: Alexander Viro, linux-fsdevel, linux-kernel, linux-api, Ram Pai

Andrei Vagin <avagin@openvz.org> writes:

> Now a shared group can be only inherited from a source mount.
> This patch adds an ability to add a mount into an existing shared
> group.

This sounds like a lot of the discussion on bind mounts accross
namespaces.  I am going to stay out of this for a bit until
we resolve my latest patch.

Eric


> mount(source, target, NULL, MS_SET_GROUP, NULL)
>
> mount() with the MS_SET_GROUP flag adds the "target" mount into a group
> of the "source" mount. The calling process has to have the CAP_SYS_ADMIN
> capability in namespaces of these mounts. The source and the target
> mounts have to have the same super block.
>
> This new functionality together with "mnt: Tuck mounts under others
> instead of creating shadow/side mounts." allows CRIU to dump and restore
> any set of mount namespaces.
>
> Currently we have a lot of issues about dumping and restoring mount
> namespaces. The bigest problem is that we can't construct mount trees
> directly due to several reasons:
> * groups can't be set, they can be only inherited
> * file systems has to be mounted from the specified user namespaces
> * the mount() syscall doesn't just create one mount -- the mount is
>   also propagated to all members of a parent group
> * umount() doesn't detach mounts from all members of a group
>   (mounts with children are not umounted)
> * mounts are propagated underneath of existing mounts
> * mount() doesn't allow to make bind-mounts between two namespaces
> * processes can have opened file descriptors to overmounted files
>
> All these operations are non-trivial, making the task of restoring
> a mount namespace practically unsolvable for reasonable time. The
> proposed change allows to restore a mount namespace in a direct
> manner, without any super complex logic.
>
> Cc: Eric W. Biederman <ebiederm@xmission.com>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> Signed-off-by: Andrei Vagin <avagin@openvz.org>
> ---
>  fs/namespace.c          | 53 +++++++++++++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/fs.h |  1 +
>  2 files changed, 54 insertions(+)
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index b5b1259..df52fd4 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -2301,6 +2301,57 @@ static inline int tree_contains_unbindable(struct mount *mnt)
>  	return 0;
>  }
>  
> +static int do_set_group(struct path *path, const char *sibling_name)
> +{
> +	struct mount *sibling, *mnt;
> +	struct path sibling_path;
> +	int err;
> +
> +	if (!sibling_name || !*sibling_name)
> +		return -EINVAL;
> +
> +	err = kern_path(sibling_name, LOOKUP_FOLLOW, &sibling_path);
> +	if (err)
> +		return err;
> +
> +	sibling = real_mount(sibling_path.mnt);
> +	mnt = real_mount(path->mnt);
> +
> +	namespace_lock();
> +
> +	err = -EPERM;
> +	if (!sibling->mnt_ns ||
> +	    !ns_capable(sibling->mnt_ns->user_ns, CAP_SYS_ADMIN))
> +		goto out_unlock;
> +
> +	err = -EINVAL;
> +	if (sibling->mnt.mnt_sb != mnt->mnt.mnt_sb)
> +		goto out_unlock;
> +
> +	if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt))
> +		goto out_unlock;
> +
> +	if (IS_MNT_SLAVE(sibling)) {
> +		struct mount *m = sibling->mnt_master;
> +
> +		list_add(&mnt->mnt_slave, &m->mnt_slave_list);
> +		mnt->mnt_master = m;
> +	}
> +
> +	if (IS_MNT_SHARED(sibling)) {
> +		mnt->mnt_group_id = sibling->mnt_group_id;
> +		list_add(&mnt->mnt_share, &sibling->mnt_share);
> +		set_mnt_shared(mnt);
> +	}
> +
> +	err = 0;
> +out_unlock:
> +	namespace_unlock();
> +
> +	path_put(&sibling_path);
> +	return err;
> +}
> +
>  static int do_move_mount(struct path *path, const char *old_name)
>  {
>  	struct path old_path, parent_path;
> @@ -2779,6 +2830,8 @@ long do_mount(const char *dev_name, const char __user *dir_name,
>  		retval = do_change_type(&path, flags);
>  	else if (flags & MS_MOVE)
>  		retval = do_move_mount(&path, dev_name);
> +	else if (flags & MS_SET_GROUP)
> +		retval = do_set_group(&path, dev_name);
>  	else
>  		retval = do_new_mount(&path, type_page, flags, mnt_flags,
>  				      dev_name, data_page);
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index 36da93f..6e6e37d 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -130,6 +130,7 @@ struct inodes_stat_t {
>  #define MS_I_VERSION	(1<<23) /* Update inode I_version field */
>  #define MS_STRICTATIME	(1<<24) /* Always perform atime updates */
>  #define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
> +#define MS_SET_GROUP	(1<<26) /* Add a mount into a shared group */
>  
>  /* These sb flags are internal to the kernel */
>  #define MS_NOREMOTELOCK	(1<<27)

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] mnt: allow to add a mount into an existing group
@ 2017-01-23 23:37 Andrei Vagin
  2017-01-24  1:03 ` Eric W. Biederman
  0 siblings, 1 reply; 13+ messages in thread
From: Andrei Vagin @ 2017-01-23 23:37 UTC (permalink / raw)
  To: Alexander Viro, Eric W . Biederman
  Cc: linux-fsdevel, linux-kernel, linux-api, Andrei Vagin

Now a shared group can be only inherited from a source mount.
This patch adds an ability to add a mount into an existing shared
group.

mount(source, target, NULL, MS_SET_GROUP, NULL)

mount() with the MS_SET_GROUP flag adds the "target" mount into a group
of the "source" mount. The calling process has to have the CAP_SYS_ADMIN
capability in namespaces of these mounts. The source and the target
mounts have to have the same super block.

This new functionality together with "mnt: Tuck mounts under others
instead of creating shadow/side mounts." allows CRIU to dump and restore
any set of mount namespaces.

Currently we have a lot of issues about dumping and restoring mount
namespaces. The bigest problem is that we can't construct mount trees
directly due to several reasons:
* groups can't be set, they can be only inherited
* file systems has to be mounted from the specified user namespaces
* the mount() syscall doesn't just create one mount -- the mount is
  also propagated to all members of a parent group
* umount() doesn't detach mounts from all members of a group
  (mounts with children are not umounted)
* mounts are propagated underneath of existing mounts
* mount() doesn't allow to make bind-mounts between two namespaces
* processes can have opened file descriptors to overmounted files

All these operations are non-trivial, making the task of restoring
a mount namespace practically unsolvable for reasonable time. The
proposed change allows to restore a mount namespace in a direct
manner, without any super complex logic.

Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrei Vagin <avagin@openvz.org>
---
 fs/namespace.c          | 53 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/fs.h |  1 +
 2 files changed, 54 insertions(+)

diff --git a/fs/namespace.c b/fs/namespace.c
index b5b1259..df52fd4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2301,6 +2301,57 @@ static inline int tree_contains_unbindable(struct mount *mnt)
 	return 0;
 }
 
+static int do_set_group(struct path *path, const char *sibling_name)
+{
+	struct mount *sibling, *mnt;
+	struct path sibling_path;
+	int err;
+
+	if (!sibling_name || !*sibling_name)
+		return -EINVAL;
+
+	err = kern_path(sibling_name, LOOKUP_FOLLOW, &sibling_path);
+	if (err)
+		return err;
+
+	sibling = real_mount(sibling_path.mnt);
+	mnt = real_mount(path->mnt);
+
+	namespace_lock();
+
+	err = -EPERM;
+	if (!sibling->mnt_ns ||
+	    !ns_capable(sibling->mnt_ns->user_ns, CAP_SYS_ADMIN))
+		goto out_unlock;
+
+	err = -EINVAL;
+	if (sibling->mnt.mnt_sb != mnt->mnt.mnt_sb)
+		goto out_unlock;
+
+	if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt))
+		goto out_unlock;
+
+	if (IS_MNT_SLAVE(sibling)) {
+		struct mount *m = sibling->mnt_master;
+
+		list_add(&mnt->mnt_slave, &m->mnt_slave_list);
+		mnt->mnt_master = m;
+	}
+
+	if (IS_MNT_SHARED(sibling)) {
+		mnt->mnt_group_id = sibling->mnt_group_id;
+		list_add(&mnt->mnt_share, &sibling->mnt_share);
+		set_mnt_shared(mnt);
+	}
+
+	err = 0;
+out_unlock:
+	namespace_unlock();
+
+	path_put(&sibling_path);
+	return err;
+}
+
 static int do_move_mount(struct path *path, const char *old_name)
 {
 	struct path old_path, parent_path;
@@ -2779,6 +2830,8 @@ long do_mount(const char *dev_name, const char __user *dir_name,
 		retval = do_change_type(&path, flags);
 	else if (flags & MS_MOVE)
 		retval = do_move_mount(&path, dev_name);
+	else if (flags & MS_SET_GROUP)
+		retval = do_set_group(&path, dev_name);
 	else
 		retval = do_new_mount(&path, type_page, flags, mnt_flags,
 				      dev_name, data_page);
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 36da93f..6e6e37d 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -130,6 +130,7 @@ struct inodes_stat_t {
 #define MS_I_VERSION	(1<<23) /* Update inode I_version field */
 #define MS_STRICTATIME	(1<<24) /* Always perform atime updates */
 #define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
+#define MS_SET_GROUP	(1<<26) /* Add a mount into a shared group */
 
 /* These sb flags are internal to the kernel */
 #define MS_NOREMOTELOCK	(1<<27)
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2021-03-24 14:53 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-28  5:18 [PATCH] mnt: allow to add a mount into an existing group Andrei Vagin
2017-04-28  5:18 ` Andrei Vagin
2017-05-09 17:36 ` Andrey Vagin
2017-05-09 17:36   ` Andrey Vagin
2017-05-10  0:42   ` Eric W. Biederman
2017-05-10  0:42     ` Eric W. Biederman
2017-05-10 23:58     ` Andrei Vagin
2021-03-23 12:59       ` [CRIU] " Pavel Tikhomirov
2021-03-24 14:52         ` Pavel Tikhomirov
  -- strict thread matches above, loose matches on Subject: below --
2017-01-23 23:37 Andrei Vagin
2017-01-24  1:03 ` Eric W. Biederman
2017-03-01  3:20   ` Andrei Vagin
2017-03-01  3:20     ` Andrei Vagin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.