linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] mountinfo: implement show_path for kernfs and cgroup
@ 2016-05-05 15:20 Serge E. Hallyn
  2016-05-06 13:39 ` Michael Kerrisk (man-pages)
  0 siblings, 1 reply; 8+ messages in thread
From: Serge E. Hallyn @ 2016-05-05 15:20 UTC (permalink / raw)
  To: Tejun Heo, linux-kernel
  Cc: linux-api, containers, hannes, ebiederm, gregkh, cgroups, akpm, serge

Short explanation:

When showing a cgroupfs entry in mountinfo, show the path of the mount
root dentry relative to the reader's cgroup namespace root.

Long version:

When a uid 0 task which is in freezer cgroup /a/b, unshares a new cgroup
namespace, and then mounts a new instance of the freezer cgroup, the new
mount will be rooted at /a/b.  The root dentry field of the mountinfo
entry will show '/a/b'.

 cat > /tmp/do1 << EOF
 mount -t cgroup -o freezer freezer /mnt
 grep freezer /proc/self/mountinfo
 EOF

 unshare -Gm  bash /tmp/do1
 > 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
 > 355 133 0:34 /a/b /mnt rw,relatime - cgroup freezer rw,freezer

The task's freezer cgroup entry in /proc/self/cgroup will simply show
'/':

 grep freezer /proc/self/cgroup
 9:freezer:/

If instead the same task simply bind mounts the /a/b cgroup directory,
the resulting mountinfo entry will again show /a/b for the dentry root.
However in this case the task will find its own cgroup at /mnt/a/b,
not at /mnt:

 mount --bind /sys/fs/cgroup/freezer/a/b /mnt
 130 25 0:34 /a/b /mnt rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,freezer

In other words, there is no way for the task to know, based on what is
in mountinfo, which cgroup directory is its own.

With this patch, the dentry root field in mountinfo is shown relative
to the reader's cgroup namespace.  I.e.:

 unshare -Gm  bash /tmp/do1
 > 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
 > 355 133 0:34 / /mnt rw,relatime - cgroup freezer rw,freezer

This way the task can correlate the paths in /proc/pid/cgroup to
/proc/self/mountinfo, and determine which cgroup directory (in any
mount which the reader created) corresponds to the task.

Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
---
 fs/kernfs/mount.c      | 14 +++++++++++
 include/linux/kernfs.h |  2 ++
 kernel/cgroup.c        | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+)

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f73541f..3b78724 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/namei.h>
+#include <linux/seq_file.h>
 
 #include "kernfs-internal.h"
 
@@ -40,6 +41,18 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
 	return 0;
 }
 
+static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry)
+{
+	struct kernfs_node *node = dentry->d_fsdata;
+	struct kernfs_root *root = kernfs_root(node);
+	struct kernfs_syscall_ops *scops = root->syscall_ops;
+
+	if (scops && scops->show_path)
+		return scops->show_path(sf, node, root);
+
+	return seq_dentry(sf, dentry, " \t\n\\");
+}
+
 const struct super_operations kernfs_sops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
@@ -47,6 +60,7 @@ const struct super_operations kernfs_sops = {
 
 	.remount_fs	= kernfs_sop_remount_fs,
 	.show_options	= kernfs_sop_show_options,
+	.show_path	= kernfs_sop_show_path,
 };
 
 /**
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index c06c442..30f089e 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -152,6 +152,8 @@ struct kernfs_syscall_ops {
 	int (*rmdir)(struct kernfs_node *kn);
 	int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent,
 		      const char *new_name);
+	int (*show_path)(struct seq_file *sf, struct kernfs_node *kn,
+			 struct kernfs_root *root);
 };
 
 struct kernfs_root {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 909a7d3..afea39e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1215,6 +1215,41 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 	cgroup_free_root(root);
 }
 
+/*
+ * look up cgroup associated with current task's cgroup namespace on the
+ * specified hierarchy
+ */
+static struct cgroup *
+current_cgns_cgroup_from_root(struct cgroup_root *root)
+{
+	struct cgroup *res = NULL;
+	struct css_set *cset;
+
+	lockdep_assert_held(&css_set_lock);
+
+	rcu_read_lock();
+
+	cset = current->nsproxy->cgroup_ns->root_cset;
+	if (cset == &init_css_set) {
+		res = &root->cgrp;
+	} else {
+		struct cgrp_cset_link *link;
+
+		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+			struct cgroup *c = link->cgrp;
+
+			if (c->root == root) {
+				res = c;
+				break;
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	BUG_ON(!res);
+	return res;
+}
+
 /* look up cgroup associated with given css_set on the specified hierarchy */
 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 					    struct cgroup_root *root)
@@ -1593,6 +1628,33 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 	return 0;
 }
 
+static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+			    struct kernfs_root *kf_root)
+{
+	int len = 0, ret = 0;
+	char *buf = NULL;
+	struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
+	struct cgroup *ns_cgroup;
+
+	buf = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	spin_lock_bh(&css_set_lock);
+	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
+	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
+	spin_unlock_bh(&css_set_lock);
+
+	if (len >= PATH_MAX)
+		len = -ERANGE;
+	else if (len > 0) {
+		seq_escape(sf, buf, " \t\n\\");
+		len = 0;
+	}
+	kfree(buf);
+	return len;
+}
+
 static int cgroup_show_options(struct seq_file *seq,
 			       struct kernfs_root *kf_root)
 {
@@ -5433,6 +5495,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
 	.mkdir			= cgroup_mkdir,
 	.rmdir			= cgroup_rmdir,
 	.rename			= cgroup_rename,
+	.show_path		= cgroup_show_path,
 };
 
 static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] mountinfo: implement show_path for kernfs and cgroup
  2016-05-05 15:20 [PATCH] mountinfo: implement show_path for kernfs and cgroup Serge E. Hallyn
@ 2016-05-06 13:39 ` Michael Kerrisk (man-pages)
  2016-05-06 17:33   ` Serge E. Hallyn
  0 siblings, 1 reply; 8+ messages in thread
From: Michael Kerrisk (man-pages) @ 2016-05-06 13:39 UTC (permalink / raw)
  To: Serge E. Hallyn, Tejun Heo, linux-kernel
  Cc: mtk.manpages, linux-api, containers, hannes, ebiederm, gregkh,
	cgroups, akpm, serge

Hi Serge,

I'll add my own notes below, as much as anything in order to convince 
myself that I understand what's going on.

On 05/05/2016 05:20 PM, Serge E. Hallyn wrote:
> Short explanation:
> 
> When showing a cgroupfs entry in mountinfo, show the path of the mount
> root dentry relative to the reader's cgroup namespace root.

As part of the commit message, I think it would be useful to add a 
sentence here explain why this is needed / which applications need it.

> Long version:
> 
> When a uid 0 task which is in freezer cgroup /a/b, unshares a new cgroup
> namespace, and then mounts a new instance of the freezer cgroup, the new
> mount will be rooted at /a/b.  The root dentry field of the mountinfo
> entry will show '/a/b'.

So, the point is that if we create a new cgroup namespace,
then we want both /proc/self/cgroup and /proc/self/mountinfo
to show cgroup paths that are correctly virtualized with
respect to the cgroup mount point. Previous to this patch,
/proc/self/cgroup shows the right info, but
/proc/self/mountinfo does not. (Walk through in a moment.) 

Is the above a correct summary?

>  cat > /tmp/do1 << EOF
>  mount -t cgroup -o freezer freezer /mnt
>  grep freezer /proc/self/mountinfo
>  EOF
> 
>  unshare -Gm  bash /tmp/do1

Should that not be:

    unshare -Gm  bash /tmp/do1

(i.e., s/G/C/)?

>  > 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
>  > 355 133 0:34 /a/b /mnt rw,relatime - cgroup freezer rw,freezer
> 
> The task's freezer cgroup entry in /proc/self/cgroup will simply show
> '/':
> 
>  grep freezer /proc/self/cgroup
>  9:freezer:/
> 
> If instead the same task simply bind mounts the /a/b cgroup directory,
> the resulting mountinfo entry will again show /a/b for the dentry root.
> However in this case the task will find its own cgroup at /mnt/a/b,
> not at /mnt:
> 
>  mount --bind /sys/fs/cgroup/freezer/a/b /mnt
>  130 25 0:34 /a/b /mnt rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,freezer
> 
> In other words, there is no way for the task to know, based on what is
> in mountinfo, which cgroup directory is its own.

So, my walk through and commentary to the above...

First, a little script to save some typing and verbiage:

# cat cgroup_info.sh
#!/bin/sh
echo -e "\t/proc/self/cgroup:\t$(cat /proc/self/cgroup | grep freezer)"
cat /proc/self/mountinfo | grep freezer |
        awk '{print "\tmountinfo:\t\t" $4 "\t" $5}'
#

Create cgroup, place this shell into the cgroup, and look at the state of
the /proc files:

# mkdir -p /sys/fs/cgroup/freezer/a/b
# echo $$ > /sys/fs/cgroup/freezer/a/b/cgroup.procs
# echo $$
2653
# cat /sys/fs/cgroup/freezer/a/b/cgroup.procs 
2653                         # Our shell
14254                        # cat(1)
# ./cgroup_info.sh 
	/proc/self/cgroup:	10:freezer:/a/b
	mountinfo:		/	/sys/fs/cgroup/freezer

Create a shell in new cgroup and mount namespaces. The act of creating
a new cgroup namespace causes the process's current cgroups directories 
to become its cgroup root directories. (Here, I'm using my own version
of the "unshare" utility, which takes the same options as the util-linux
version):

# ~mtk/tlpi/code/ns/unshare -Cm bash

Look at the state of the /proc files:

# ./cgroup_info.sh 
	/proc/self/cgroup:	10:freezer:/
	mountinfo:		/	/sys/fs/cgroup/freezer

The third entry in /proc/self/info (the pathname of the cgroup inside the
hierarchy) is correctly virtualized w.r.t. the cgroup namespace, which is
rooted at /a/b in the outer namespace.

However, the info in /proc/self/mountinfo is not for this cgroup
namespace, since we are seeing a duplicate of the mount from the 
old mount namespace, and the info there does not correspond to the
new cgroup namespace. However, trying to create a new mount still 
doesn't show us the right information in mountinfo:

# mount --make-rslave /               # Prevent our mount operations 
                                      # propagating to other mountns
# mkdir -p /mnt/freezer               # Create a new mount point
# umount /sys/fs/cgroup/freezer       # Discard old mount
# mount -t cgroup -o freezer freezer /mnt/freezer/
# ./cgroup_info.sh
        /proc/self/cgroup:      7:freezer:/
        mountinfo:              /a/b    /mnt/freezer

The act of creating a new cgroup namespace caused the process's
current freezer directory, "/a/b", to become its cgroup freezer root
directory. In other words, the pathname directory of the directory
within the newly mounted cgroup filesystem should be "/",
but mountinfo wrongly shows us "/a/b". The consequence of this is
that the process in the cgroup namespace cannot correctly construct
the pathname of its cgroup root directory from the information in
/proc/PID/mountinfo. (The current patch fixes exactly this problem.)

> With this patch, the dentry root field in mountinfo is shown relative
> to the reader's cgroup namespace.  I.e.:
> 
>  unshare -Gm  bash /tmp/do1
>  > 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
>  > 355 133 0:34 / /mnt rw,relatime - cgroup freezer rw,freezer
> 
> This way the task can correlate the paths in /proc/pid/cgroup to
> /proc/self/mountinfo, and determine which cgroup directory (in any
> mount which the reader created) corresponds to the task.

So, I applied your patch against a current (i.e., 4.6-rc6) kernel.
Same steps as before, and here's what I see:

# mkdir -p /sys/fs/cgroup/freezer/a/b
# echo $$ > /sys/fs/cgroup/freezer/a/b/cgroup.procs
# ./cgroup_info.sh 
	/proc/self/cgroup:	10:freezer:/a/b
	mountinfo:		/	/sys/fs/cgroup/freezer
# ~mtk/tlpi/code/ns/unshare -Cm bash
# ./cgroup_info.sh 
	/proc/self/cgroup:	10:freezer:/
	mountinfo:		/../..	/sys/fs/cgroup/freezer
# mount --make-rslave /
# mkdir -p /mnt/freezer
# umount /sys/fs/cgroup/freezer
# mount -t cgroup -o freezer freezer /mnt/freezer/
# ./cgroup_info.sh
	/proc/self/cgroup:	10:freezer:/
	mountinfo:		/	/mnt/freezer

Now the root directory path shown by mountinfo is correct,
and when we look inside the mount point, we see that things
look "right" (i.e., a cgroup root directory with no
subdirectories, and the PID of the shell run by unshare is
in the cgroup.procs file of this cgroup):

# ls /mnt/freezer/
cgroup.clone_children  freezer.parent_freezing  freezer.state      tasks
cgroup.procs           freezer.self_freezing    notify_on_release
# echo $$
3164
# cat /mnt/freezer/cgroup.procs
2653                   # First shell that placed in this cgroup
3164                   # Shell started by 'unshare'
14197                  # cat(1)

All makes sense to me.

Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>

(I did no review of the patch itself though.)

Cheers,

Michael


> Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
> ---
>  fs/kernfs/mount.c      | 14 +++++++++++
>  include/linux/kernfs.h |  2 ++
>  kernel/cgroup.c        | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 79 insertions(+)
> 
> diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
> index f73541f..3b78724 100644
> --- a/fs/kernfs/mount.c
> +++ b/fs/kernfs/mount.c
> @@ -15,6 +15,7 @@
>  #include <linux/slab.h>
>  #include <linux/pagemap.h>
>  #include <linux/namei.h>
> +#include <linux/seq_file.h>
>  
>  #include "kernfs-internal.h"
>  
> @@ -40,6 +41,18 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
>  	return 0;
>  }
>  
> +static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry)
> +{
> +	struct kernfs_node *node = dentry->d_fsdata;
> +	struct kernfs_root *root = kernfs_root(node);
> +	struct kernfs_syscall_ops *scops = root->syscall_ops;
> +
> +	if (scops && scops->show_path)
> +		return scops->show_path(sf, node, root);
> +
> +	return seq_dentry(sf, dentry, " \t\n\\");
> +}
> +
>  const struct super_operations kernfs_sops = {
>  	.statfs		= simple_statfs,
>  	.drop_inode	= generic_delete_inode,
> @@ -47,6 +60,7 @@ const struct super_operations kernfs_sops = {
>  
>  	.remount_fs	= kernfs_sop_remount_fs,
>  	.show_options	= kernfs_sop_show_options,
> +	.show_path	= kernfs_sop_show_path,
>  };
>  
>  /**
> diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> index c06c442..30f089e 100644
> --- a/include/linux/kernfs.h
> +++ b/include/linux/kernfs.h
> @@ -152,6 +152,8 @@ struct kernfs_syscall_ops {
>  	int (*rmdir)(struct kernfs_node *kn);
>  	int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent,
>  		      const char *new_name);
> +	int (*show_path)(struct seq_file *sf, struct kernfs_node *kn,
> +			 struct kernfs_root *root);
>  };
>  
>  struct kernfs_root {
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 909a7d3..afea39e 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -1215,6 +1215,41 @@ static void cgroup_destroy_root(struct cgroup_root *root)
>  	cgroup_free_root(root);
>  }
>  
> +/*
> + * look up cgroup associated with current task's cgroup namespace on the
> + * specified hierarchy
> + */
> +static struct cgroup *
> +current_cgns_cgroup_from_root(struct cgroup_root *root)
> +{
> +	struct cgroup *res = NULL;
> +	struct css_set *cset;
> +
> +	lockdep_assert_held(&css_set_lock);
> +
> +	rcu_read_lock();
> +
> +	cset = current->nsproxy->cgroup_ns->root_cset;
> +	if (cset == &init_css_set) {
> +		res = &root->cgrp;
> +	} else {
> +		struct cgrp_cset_link *link;
> +
> +		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
> +			struct cgroup *c = link->cgrp;
> +
> +			if (c->root == root) {
> +				res = c;
> +				break;
> +			}
> +		}
> +	}
> +	rcu_read_unlock();
> +
> +	BUG_ON(!res);
> +	return res;
> +}
> +
>  /* look up cgroup associated with given css_set on the specified hierarchy */
>  static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
>  					    struct cgroup_root *root)
> @@ -1593,6 +1628,33 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
>  	return 0;
>  }
>  
> +static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
> +			    struct kernfs_root *kf_root)
> +{
> +	int len = 0, ret = 0;
> +	char *buf = NULL;
> +	struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
> +	struct cgroup *ns_cgroup;
> +
> +	buf = kmalloc(PATH_MAX, GFP_KERNEL);
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	spin_lock_bh(&css_set_lock);
> +	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
> +	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
> +	spin_unlock_bh(&css_set_lock);
> +
> +	if (len >= PATH_MAX)
> +		len = -ERANGE;
> +	else if (len > 0) {
> +		seq_escape(sf, buf, " \t\n\\");
> +		len = 0;
> +	}
> +	kfree(buf);
> +	return len;
> +}
> +
>  static int cgroup_show_options(struct seq_file *seq,
>  			       struct kernfs_root *kf_root)
>  {
> @@ -5433,6 +5495,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
>  	.mkdir			= cgroup_mkdir,
>  	.rmdir			= cgroup_rmdir,
>  	.rename			= cgroup_rename,
> +	.show_path		= cgroup_show_path,
>  };
>  
>  static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
> 


-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] mountinfo: implement show_path for kernfs and cgroup
  2016-05-06 13:39 ` Michael Kerrisk (man-pages)
@ 2016-05-06 17:33   ` Serge E. Hallyn
  2016-05-06 17:42     ` Tejun Heo
  2016-05-06 17:42     ` Michael Kerrisk (man-pages)
  0 siblings, 2 replies; 8+ messages in thread
From: Serge E. Hallyn @ 2016-05-06 17:33 UTC (permalink / raw)
  To: Michael Kerrisk (man-pages)
  Cc: Serge E. Hallyn, Tejun Heo, linux-kernel, linux-api, containers,
	hannes, ebiederm, gregkh, cgroups, akpm, serge

Quoting Michael Kerrisk (man-pages) (mtk.manpages@gmail.com):
> Hi Serge,
> 
> I'll add my own notes below, as much as anything in order to convince 
> myself that I understand what's going on.
> 
> On 05/05/2016 05:20 PM, Serge E. Hallyn wrote:
> > Short explanation:
> > 
> > When showing a cgroupfs entry in mountinfo, show the path of the mount
> > root dentry relative to the reader's cgroup namespace root.
> 
> As part of the commit message, I think it would be useful to add a 
> sentence here explain why this is needed / which applications need it.
> 
> > Long version:
> > 
> > When a uid 0 task which is in freezer cgroup /a/b, unshares a new cgroup
> > namespace, and then mounts a new instance of the freezer cgroup, the new
> > mount will be rooted at /a/b.  The root dentry field of the mountinfo
> > entry will show '/a/b'.
> 
> So, the point is that if we create a new cgroup namespace,
> then we want both /proc/self/cgroup and /proc/self/mountinfo
> to show cgroup paths that are correctly virtualized with
> respect to the cgroup mount point. Previous to this patch,
> /proc/self/cgroup shows the right info, but
> /proc/self/mountinfo does not. (Walk through in a moment.) 
> 
> Is the above a correct summary?

Correct.

> >  cat > /tmp/do1 << EOF
> >  mount -t cgroup -o freezer freezer /mnt
> >  grep freezer /proc/self/mountinfo
> >  EOF
> > 
> >  unshare -Gm  bash /tmp/do1
> 
> Should that not be:
> 
>     unshare -Gm  bash /tmp/do1
> 
> (i.e., s/G/C/)?

Indeed

> >  > 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
> >  > 355 133 0:34 /a/b /mnt rw,relatime - cgroup freezer rw,freezer
> > 
> > The task's freezer cgroup entry in /proc/self/cgroup will simply show
> > '/':
> > 
> >  grep freezer /proc/self/cgroup
> >  9:freezer:/
> > 
> > If instead the same task simply bind mounts the /a/b cgroup directory,
> > the resulting mountinfo entry will again show /a/b for the dentry root.
> > However in this case the task will find its own cgroup at /mnt/a/b,
> > not at /mnt:
> > 
> >  mount --bind /sys/fs/cgroup/freezer/a/b /mnt
> >  130 25 0:34 /a/b /mnt rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,freezer
> > 
> > In other words, there is no way for the task to know, based on what is
> > in mountinfo, which cgroup directory is its own.
> 
> So, my walk through and commentary to the above...
> 
> First, a little script to save some typing and verbiage:
> 
> # cat cgroup_info.sh
> #!/bin/sh
> echo -e "\t/proc/self/cgroup:\t$(cat /proc/self/cgroup | grep freezer)"
> cat /proc/self/mountinfo | grep freezer |
>         awk '{print "\tmountinfo:\t\t" $4 "\t" $5}'
> #
> 
> Create cgroup, place this shell into the cgroup, and look at the state of
> the /proc files:
> 
> # mkdir -p /sys/fs/cgroup/freezer/a/b
> # echo $$ > /sys/fs/cgroup/freezer/a/b/cgroup.procs
> # echo $$
> 2653
> # cat /sys/fs/cgroup/freezer/a/b/cgroup.procs 
> 2653                         # Our shell
> 14254                        # cat(1)
> # ./cgroup_info.sh 
> 	/proc/self/cgroup:	10:freezer:/a/b
> 	mountinfo:		/	/sys/fs/cgroup/freezer
> 
> Create a shell in new cgroup and mount namespaces. The act of creating
> a new cgroup namespace causes the process's current cgroups directories 
> to become its cgroup root directories. (Here, I'm using my own version
> of the "unshare" utility, which takes the same options as the util-linux
> version):
> 
> # ~mtk/tlpi/code/ns/unshare -Cm bash
> 
> Look at the state of the /proc files:
> 
> # ./cgroup_info.sh 
> 	/proc/self/cgroup:	10:freezer:/
> 	mountinfo:		/	/sys/fs/cgroup/freezer
> 
> The third entry in /proc/self/info (the pathname of the cgroup inside the

/proc/self/cgroup

> hierarchy) is correctly virtualized w.r.t. the cgroup namespace, which is
> rooted at /a/b in the outer namespace.
> 
> However, the info in /proc/self/mountinfo is not for this cgroup
> namespace, since we are seeing a duplicate of the mount from the 
> old mount namespace, and the info there does not correspond to the
> new cgroup namespace. However, trying to create a new mount still 
> doesn't show us the right information in mountinfo:
> 
> # mount --make-rslave /               # Prevent our mount operations 
>                                       # propagating to other mountns
> # mkdir -p /mnt/freezer               # Create a new mount point
> # umount /sys/fs/cgroup/freezer       # Discard old mount
> # mount -t cgroup -o freezer freezer /mnt/freezer/
> # ./cgroup_info.sh
>         /proc/self/cgroup:      7:freezer:/
>         mountinfo:              /a/b    /mnt/freezer
> 
> The act of creating a new cgroup namespace caused the process's
> current freezer directory, "/a/b", to become its cgroup freezer root
> directory. In other words, the pathname directory of the directory
> within the newly mounted cgroup filesystem should be "/",
> but mountinfo wrongly shows us "/a/b". The consequence of this is
> that the process in the cgroup namespace cannot correctly construct
> the pathname of its cgroup root directory from the information in
> /proc/PID/mountinfo. (The current patch fixes exactly this problem.)
> 
> > With this patch, the dentry root field in mountinfo is shown relative
> > to the reader's cgroup namespace.  I.e.:
> > 
> >  unshare -Gm  bash /tmp/do1
> >  > 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
> >  > 355 133 0:34 / /mnt rw,relatime - cgroup freezer rw,freezer
> > 
> > This way the task can correlate the paths in /proc/pid/cgroup to
> > /proc/self/mountinfo, and determine which cgroup directory (in any
> > mount which the reader created) corresponds to the task.
> 
> So, I applied your patch against a current (i.e., 4.6-rc6) kernel.
> Same steps as before, and here's what I see:
> 
> # mkdir -p /sys/fs/cgroup/freezer/a/b
> # echo $$ > /sys/fs/cgroup/freezer/a/b/cgroup.procs
> # ./cgroup_info.sh 
> 	/proc/self/cgroup:	10:freezer:/a/b
> 	mountinfo:		/	/sys/fs/cgroup/freezer
> # ~mtk/tlpi/code/ns/unshare -Cm bash
> # ./cgroup_info.sh 
> 	/proc/self/cgroup:	10:freezer:/
> 	mountinfo:		/../..	/sys/fs/cgroup/freezer
> # mount --make-rslave /
> # mkdir -p /mnt/freezer
> # umount /sys/fs/cgroup/freezer
> # mount -t cgroup -o freezer freezer /mnt/freezer/
> # ./cgroup_info.sh
> 	/proc/self/cgroup:	10:freezer:/
> 	mountinfo:		/	/mnt/freezer
> 
> Now the root directory path shown by mountinfo is correct,
> and when we look inside the mount point, we see that things
> look "right" (i.e., a cgroup root directory with no
> subdirectories, and the PID of the shell run by unshare is
> in the cgroup.procs file of this cgroup):
> 
> # ls /mnt/freezer/
> cgroup.clone_children  freezer.parent_freezing  freezer.state      tasks
> cgroup.procs           freezer.self_freezing    notify_on_release
> # echo $$
> 3164
> # cat /mnt/freezer/cgroup.procs
> 2653                   # First shell that placed in this cgroup
> 3164                   # Shell started by 'unshare'
> 14197                  # cat(1)
> 
> All makes sense to me.

Right.  So in particular, docker wants to do something like:

bindpath=`grep freezer /proc/self/mountinfo | tail -n 1 | awk '{ print $4 }'`
mountpoint=`grep freezer /proc/self/mountinfo | tail -n 1 | awk '{ print $5 }'`
mycg=`awk -F: '/freezer/ { print $3 }' /proc/self/cgroup`
cat ${mountpoint}/${bindpath}/${mycg}/cgroup.procs

and see its own task.

> Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
> Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
> 
> (I did no review of the patch itself though.)

Thanks, Michael.  I'll resend with corrections and a test script of
some sort.

> Cheers,
> 
> Michael
> 
> 
> > Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
> > ---
> >  fs/kernfs/mount.c      | 14 +++++++++++
> >  include/linux/kernfs.h |  2 ++
> >  kernel/cgroup.c        | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 79 insertions(+)
> > 
> > diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
> > index f73541f..3b78724 100644
> > --- a/fs/kernfs/mount.c
> > +++ b/fs/kernfs/mount.c
> > @@ -15,6 +15,7 @@
> >  #include <linux/slab.h>
> >  #include <linux/pagemap.h>
> >  #include <linux/namei.h>
> > +#include <linux/seq_file.h>
> >  
> >  #include "kernfs-internal.h"
> >  
> > @@ -40,6 +41,18 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
> >  	return 0;
> >  }
> >  
> > +static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry)
> > +{
> > +	struct kernfs_node *node = dentry->d_fsdata;
> > +	struct kernfs_root *root = kernfs_root(node);
> > +	struct kernfs_syscall_ops *scops = root->syscall_ops;
> > +
> > +	if (scops && scops->show_path)
> > +		return scops->show_path(sf, node, root);
> > +
> > +	return seq_dentry(sf, dentry, " \t\n\\");
> > +}
> > +
> >  const struct super_operations kernfs_sops = {
> >  	.statfs		= simple_statfs,
> >  	.drop_inode	= generic_delete_inode,
> > @@ -47,6 +60,7 @@ const struct super_operations kernfs_sops = {
> >  
> >  	.remount_fs	= kernfs_sop_remount_fs,
> >  	.show_options	= kernfs_sop_show_options,
> > +	.show_path	= kernfs_sop_show_path,
> >  };
> >  
> >  /**
> > diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> > index c06c442..30f089e 100644
> > --- a/include/linux/kernfs.h
> > +++ b/include/linux/kernfs.h
> > @@ -152,6 +152,8 @@ struct kernfs_syscall_ops {
> >  	int (*rmdir)(struct kernfs_node *kn);
> >  	int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent,
> >  		      const char *new_name);
> > +	int (*show_path)(struct seq_file *sf, struct kernfs_node *kn,
> > +			 struct kernfs_root *root);
> >  };
> >  
> >  struct kernfs_root {
> > diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> > index 909a7d3..afea39e 100644
> > --- a/kernel/cgroup.c
> > +++ b/kernel/cgroup.c
> > @@ -1215,6 +1215,41 @@ static void cgroup_destroy_root(struct cgroup_root *root)
> >  	cgroup_free_root(root);
> >  }
> >  
> > +/*
> > + * look up cgroup associated with current task's cgroup namespace on the
> > + * specified hierarchy
> > + */
> > +static struct cgroup *
> > +current_cgns_cgroup_from_root(struct cgroup_root *root)
> > +{
> > +	struct cgroup *res = NULL;
> > +	struct css_set *cset;
> > +
> > +	lockdep_assert_held(&css_set_lock);
> > +
> > +	rcu_read_lock();
> > +
> > +	cset = current->nsproxy->cgroup_ns->root_cset;
> > +	if (cset == &init_css_set) {
> > +		res = &root->cgrp;
> > +	} else {
> > +		struct cgrp_cset_link *link;
> > +
> > +		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
> > +			struct cgroup *c = link->cgrp;
> > +
> > +			if (c->root == root) {
> > +				res = c;
> > +				break;
> > +			}
> > +		}
> > +	}
> > +	rcu_read_unlock();
> > +
> > +	BUG_ON(!res);
> > +	return res;
> > +}
> > +
> >  /* look up cgroup associated with given css_set on the specified hierarchy */
> >  static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
> >  					    struct cgroup_root *root)
> > @@ -1593,6 +1628,33 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
> >  	return 0;
> >  }
> >  
> > +static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
> > +			    struct kernfs_root *kf_root)
> > +{
> > +	int len = 0, ret = 0;
> > +	char *buf = NULL;
> > +	struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
> > +	struct cgroup *ns_cgroup;
> > +
> > +	buf = kmalloc(PATH_MAX, GFP_KERNEL);
> > +	if (!buf)
> > +		return -ENOMEM;
> > +
> > +	spin_lock_bh(&css_set_lock);
> > +	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
> > +	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
> > +	spin_unlock_bh(&css_set_lock);
> > +
> > +	if (len >= PATH_MAX)
> > +		len = -ERANGE;
> > +	else if (len > 0) {
> > +		seq_escape(sf, buf, " \t\n\\");
> > +		len = 0;
> > +	}
> > +	kfree(buf);
> > +	return len;
> > +}
> > +
> >  static int cgroup_show_options(struct seq_file *seq,
> >  			       struct kernfs_root *kf_root)
> >  {
> > @@ -5433,6 +5495,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
> >  	.mkdir			= cgroup_mkdir,
> >  	.rmdir			= cgroup_rmdir,
> >  	.rename			= cgroup_rename,
> > +	.show_path		= cgroup_show_path,
> >  };
> >  
> >  static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
> > 
> 
> 
> -- 
> Michael Kerrisk
> Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
> Linux/UNIX System Programming Training: http://man7.org/training/

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] mountinfo: implement show_path for kernfs and cgroup
  2016-05-06 17:33   ` Serge E. Hallyn
@ 2016-05-06 17:42     ` Tejun Heo
  2016-05-06 17:42     ` Michael Kerrisk (man-pages)
  1 sibling, 0 replies; 8+ messages in thread
From: Tejun Heo @ 2016-05-06 17:42 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Michael Kerrisk (man-pages),
	Serge E. Hallyn, linux-kernel, linux-api, containers, hannes,
	ebiederm, gregkh, cgroups, akpm

Hello,

On Fri, May 06, 2016 at 12:33:56PM -0500, Serge E. Hallyn wrote:
> Right.  So in particular, docker wants to do something like:
> 
> bindpath=`grep freezer /proc/self/mountinfo | tail -n 1 | awk '{ print $4 }'`
> mountpoint=`grep freezer /proc/self/mountinfo | tail -n 1 | awk '{ print $5 }'`
> mycg=`awk -F: '/freezer/ { print $3 }' /proc/self/cgroup`
> cat ${mountpoint}/${bindpath}/${mycg}/cgroup.procs
> 
> and see its own task.

Can you please make the example more concrete?  e.g. Use actual paths
and show what they show now vs. what they should.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] mountinfo: implement show_path for kernfs and cgroup
  2016-05-06 17:33   ` Serge E. Hallyn
  2016-05-06 17:42     ` Tejun Heo
@ 2016-05-06 17:42     ` Michael Kerrisk (man-pages)
  2016-05-06 19:32       ` Serge E. Hallyn
  1 sibling, 1 reply; 8+ messages in thread
From: Michael Kerrisk (man-pages) @ 2016-05-06 17:42 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Serge E. Hallyn, Tejun Heo, lkml, Linux API, Containers,
	Johannes Weiner, Eric W. Biederman, Greg KH, cgroups,
	Andrew Morton

Hi Serge,

On 6 May 2016 at 19:33, Serge E. Hallyn <serge@hallyn.com> wrote:
> Quoting Michael Kerrisk (man-pages) (mtk.manpages@gmail.com):
>> Hi Serge,
>>
>> I'll add my own notes below, as much as anything in order to convince
>> myself that I understand what's going on.
>>
>> On 05/05/2016 05:20 PM, Serge E. Hallyn wrote:
>> > Short explanation:
>> >
>> > When showing a cgroupfs entry in mountinfo, show the path of the mount
>> > root dentry relative to the reader's cgroup namespace root.
>>
>> As part of the commit message, I think it would be useful to add a
>> sentence here explain why this is needed / which applications need it.
>>
>> > Long version:
>> >
>> > When a uid 0 task which is in freezer cgroup /a/b, unshares a new cgroup
>> > namespace, and then mounts a new instance of the freezer cgroup, the new
>> > mount will be rooted at /a/b.  The root dentry field of the mountinfo
>> > entry will show '/a/b'.
>>
>> So, the point is that if we create a new cgroup namespace,
>> then we want both /proc/self/cgroup and /proc/self/mountinfo
>> to show cgroup paths that are correctly virtualized with
>> respect to the cgroup mount point. Previous to this patch,
>> /proc/self/cgroup shows the right info, but
>> /proc/self/mountinfo does not. (Walk through in a moment.)
>>
>> Is the above a correct summary?

Feel free to add that piece to the commit message :-).

[...]

>> So, I applied your patch against a current (i.e., 4.6-rc6) kernel.
>> Same steps as before, and here's what I see:
>>
>> # mkdir -p /sys/fs/cgroup/freezer/a/b
>> # echo $$ > /sys/fs/cgroup/freezer/a/b/cgroup.procs
>> # ./cgroup_info.sh
>>       /proc/self/cgroup:      10:freezer:/a/b
>>       mountinfo:              /       /sys/fs/cgroup/freezer
>> # ~mtk/tlpi/code/ns/unshare -Cm bash
>> # ./cgroup_info.sh
>>       /proc/self/cgroup:      10:freezer:/
>>       mountinfo:              /../..  /sys/fs/cgroup/freezer
>> # mount --make-rslave /
>> # mkdir -p /mnt/freezer
>> # umount /sys/fs/cgroup/freezer
>> # mount -t cgroup -o freezer freezer /mnt/freezer/
>> # ./cgroup_info.sh
>>       /proc/self/cgroup:      10:freezer:/
>>       mountinfo:              /       /mnt/freezer
>>
>> Now the root directory path shown by mountinfo is correct,
>> and when we look inside the mount point, we see that things
>> look "right" (i.e., a cgroup root directory with no
>> subdirectories, and the PID of the shell run by unshare is
>> in the cgroup.procs file of this cgroup):
>>
>> # ls /mnt/freezer/
>> cgroup.clone_children  freezer.parent_freezing  freezer.state      tasks
>> cgroup.procs           freezer.self_freezing    notify_on_release
>> # echo $$
>> 3164
>> # cat /mnt/freezer/cgroup.procs
>> 2653                   # First shell that placed in this cgroup
>> 3164                   # Shell started by 'unshare'
>> 14197                  # cat(1)
>>
>> All makes sense to me.
>
> Right.  So in particular, docker wants to do something like:
>
> bindpath=`grep freezer /proc/self/mountinfo | tail -n 1 | awk '{ print $4 }'`
> mountpoint=`grep freezer /proc/self/mountinfo | tail -n 1 | awk '{ print $5 }'`
> mycg=`awk -F: '/freezer/ { print $3 }' /proc/self/cgroup`
> cat ${mountpoint}/${bindpath}/${mycg}/cgroup.procs
>
> and see its own task.

I think that'd be a great piece to include in the commit message, near
the top, as rationale for the patch

>> Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
>> Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
>>
>> (I did no review of the patch itself though.)
>
> Thanks, Michael.

You're welcome.

> I'll resend with corrections and a test script of
> some sort.

I think including some version of the two walk thoughs (without + with
patch) would also make for a great commit message :-).

Cheers,

Michael

[...]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] mountinfo: implement show_path for kernfs and cgroup
  2016-05-06 17:42     ` Michael Kerrisk (man-pages)
@ 2016-05-06 19:32       ` Serge E. Hallyn
  0 siblings, 0 replies; 8+ messages in thread
From: Serge E. Hallyn @ 2016-05-06 19:32 UTC (permalink / raw)
  To: Michael Kerrisk (man-pages)
  Cc: Serge E. Hallyn, Serge E. Hallyn, Tejun Heo, lkml, Linux API,
	Containers, Johannes Weiner, Eric W. Biederman, Greg KH, cgroups,
	Andrew Morton

Quoting Michael Kerrisk (man-pages) (mtk.manpages@gmail.com):
> Hi Serge,
> 
> On 6 May 2016 at 19:33, Serge E. Hallyn <serge@hallyn.com> wrote:
> > Quoting Michael Kerrisk (man-pages) (mtk.manpages@gmail.com):
> >> Hi Serge,
> >>
> >> I'll add my own notes below, as much as anything in order to convince
> >> myself that I understand what's going on.
> >>
> >> On 05/05/2016 05:20 PM, Serge E. Hallyn wrote:
> >> > Short explanation:
> >> >
> >> > When showing a cgroupfs entry in mountinfo, show the path of the mount
> >> > root dentry relative to the reader's cgroup namespace root.
> >>
> >> As part of the commit message, I think it would be useful to add a
> >> sentence here explain why this is needed / which applications need it.
> >>
> >> > Long version:
> >> >
> >> > When a uid 0 task which is in freezer cgroup /a/b, unshares a new cgroup
> >> > namespace, and then mounts a new instance of the freezer cgroup, the new
> >> > mount will be rooted at /a/b.  The root dentry field of the mountinfo
> >> > entry will show '/a/b'.
> >>
> >> So, the point is that if we create a new cgroup namespace,
> >> then we want both /proc/self/cgroup and /proc/self/mountinfo
> >> to show cgroup paths that are correctly virtualized with
> >> respect to the cgroup mount point. Previous to this patch,
> >> /proc/self/cgroup shows the right info, but
> >> /proc/self/mountinfo does not. (Walk through in a moment.)
> >>
> >> Is the above a correct summary?
> 
> Feel free to add that piece to the commit message :-).
> 
> [...]
> 
> >> So, I applied your patch against a current (i.e., 4.6-rc6) kernel.
> >> Same steps as before, and here's what I see:
> >>
> >> # mkdir -p /sys/fs/cgroup/freezer/a/b
> >> # echo $$ > /sys/fs/cgroup/freezer/a/b/cgroup.procs
> >> # ./cgroup_info.sh
> >>       /proc/self/cgroup:      10:freezer:/a/b
> >>       mountinfo:              /       /sys/fs/cgroup/freezer
> >> # ~mtk/tlpi/code/ns/unshare -Cm bash
> >> # ./cgroup_info.sh
> >>       /proc/self/cgroup:      10:freezer:/
> >>       mountinfo:              /../..  /sys/fs/cgroup/freezer
> >> # mount --make-rslave /
> >> # mkdir -p /mnt/freezer
> >> # umount /sys/fs/cgroup/freezer
> >> # mount -t cgroup -o freezer freezer /mnt/freezer/
> >> # ./cgroup_info.sh
> >>       /proc/self/cgroup:      10:freezer:/
> >>       mountinfo:              /       /mnt/freezer
> >>
> >> Now the root directory path shown by mountinfo is correct,
> >> and when we look inside the mount point, we see that things
> >> look "right" (i.e., a cgroup root directory with no
> >> subdirectories, and the PID of the shell run by unshare is
> >> in the cgroup.procs file of this cgroup):
> >>
> >> # ls /mnt/freezer/
> >> cgroup.clone_children  freezer.parent_freezing  freezer.state      tasks
> >> cgroup.procs           freezer.self_freezing    notify_on_release
> >> # echo $$
> >> 3164
> >> # cat /mnt/freezer/cgroup.procs
> >> 2653                   # First shell that placed in this cgroup
> >> 3164                   # Shell started by 'unshare'
> >> 14197                  # cat(1)
> >>
> >> All makes sense to me.
> >
> > Right.  So in particular, docker wants to do something like:
> >
> > bindpath=`grep freezer /proc/self/mountinfo | tail -n 1 | awk '{ print $4 }'`
> > mountpoint=`grep freezer /proc/self/mountinfo | tail -n 1 | awk '{ print $5 }'`
> > mycg=`awk -F: '/freezer/ { print $3 }' /proc/self/cgroup`
> > cat ${mountpoint}/${bindpath}/${mycg}/cgroup.procs
> >
> > and see its own task.
> 
> I think that'd be a great piece to include in the commit message, near
> the top, as rationale for the patch

Hang on I've messed that up :)  The above is not actually what docker
does.  Rather it expects the $bindpath to be the same as the first part
of $mycg, and and so appends the rest of $mycg to the $mountpoint and
uses that.

> >> Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
> >> Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
> >>
> >> (I did no review of the patch itself though.)
> >
> > Thanks, Michael.
> 
> You're welcome.
> 
> > I'll resend with corrections and a test script of
> > some sort.
> 
> I think including some version of the two walk thoughs (without + with
> patch) would also make for a great commit message :-).
> 
> Cheers,
> 
> Michael
> 
> [...]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] mountinfo: implement show_path for kernfs and cgroup
  2016-05-09 14:59 Serge E. Hallyn
@ 2016-05-09 16:19 ` Tejun Heo
  0 siblings, 0 replies; 8+ messages in thread
From: Tejun Heo @ 2016-05-09 16:19 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: linux-kernel, Michael Kerrisk (man-pages),
	serge, linux-api, containers, hannes, ebiederm, gregkh, cgroups,
	akpm

On Mon, May 09, 2016 at 09:59:55AM -0500, Serge E. Hallyn wrote:
> Patch summary:
> 
> When showing a cgroupfs entry in mountinfo, show the path of the mount
> root dentry relative to the reader's cgroup namespace root.

Applied to cgroup/for-4.6-fixes w/ subject updated.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH] mountinfo: implement show_path for kernfs and cgroup
@ 2016-05-09 14:59 Serge E. Hallyn
  2016-05-09 16:19 ` Tejun Heo
  0 siblings, 1 reply; 8+ messages in thread
From: Serge E. Hallyn @ 2016-05-09 14:59 UTC (permalink / raw)
  To: Tejun Heo, linux-kernel, Michael Kerrisk (man-pages)
  Cc: Serge E. Hallyn, serge, linux-api, containers, hannes, ebiederm,
	gregkh, cgroups, akpm

Patch summary:

When showing a cgroupfs entry in mountinfo, show the path of the mount
root dentry relative to the reader's cgroup namespace root.

Short explanation (courtesy of mkerrisk):

If we create a new cgroup namespace, then we want both /proc/self/cgroup
and /proc/self/mountinfo to show cgroup paths that are correctly
virtualized with respect to the cgroup mount point.  Previous to this
patch, /proc/self/cgroup shows the right info, but /proc/self/mountinfo
does not.

Long version:

When a uid 0 task which is in freezer cgroup /a/b, unshares a new cgroup
namespace, and then mounts a new instance of the freezer cgroup, the new
mount will be rooted at /a/b.  The root dentry field of the mountinfo
entry will show '/a/b'.

 cat > /tmp/do1 << EOF
 mount -t cgroup -o freezer freezer /mnt
 grep freezer /proc/self/mountinfo
 EOF

 unshare -Gm  bash /tmp/do1
 > 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
 > 355 133 0:34 /a/b /mnt rw,relatime - cgroup freezer rw,freezer

The task's freezer cgroup entry in /proc/self/cgroup will simply show
'/':

 grep freezer /proc/self/cgroup
 9:freezer:/

If instead the same task simply bind mounts the /a/b cgroup directory,
the resulting mountinfo entry will again show /a/b for the dentry root.
However in this case the task will find its own cgroup at /mnt/a/b,
not at /mnt:

 mount --bind /sys/fs/cgroup/freezer/a/b /mnt
 130 25 0:34 /a/b /mnt rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,freezer

In other words, there is no way for the task to know, based on what is
in mountinfo, which cgroup directory is its own.

Example (by mkerrisk):

First, a little script to save some typing and verbiage:

# cat cgroup_info.sh
#!/bin/sh
echo -e "\t/proc/self/cgroup:\t$(cat /proc/self/cgroup | grep freezer)"
cat /proc/self/mountinfo | grep freezer |
        awk '{print "\tmountinfo:\t\t" $4 "\t" $5}'
#

Create cgroup, place this shell into the cgroup, and look at the state
of the /proc files:

# mkdir -p /sys/fs/cgroup/freezer/a/b
# echo $$ > /sys/fs/cgroup/freezer/a/b/cgroup.procs
# echo $$
2653
# cat /sys/fs/cgroup/freezer/a/b/cgroup.procs
2653                         # Our shell
14254                        # cat(1)
# ./cgroup_info.sh
        /proc/self/cgroup:      10:freezer:/a/b
        mountinfo:              /       /sys/fs/cgroup/freezer

Create a shell in new cgroup and mount namespaces. The act of creating
a new cgroup namespace causes the process's current cgroups directories
to become its cgroup root directories. (Here, I'm using my own version
of the "unshare" utility, which takes the same options as the util-linux
version):

# ~mtk/tlpi/code/ns/unshare -Cm bash

Look at the state of the /proc files:

# ./cgroup_info.sh
        /proc/self/cgroup:      10:freezer:/
        mountinfo:              /       /sys/fs/cgroup/freezer

The third entry in /proc/self/cgroup (the pathname of the cgroup inside
the hierarchy) is correctly virtualized w.r.t. the cgroup namespace, which
is rooted at /a/b in the outer namespace.

However, the info in /proc/self/mountinfo is not for this cgroup
namespace, since we are seeing a duplicate of the mount from the
old mount namespace, and the info there does not correspond to the
new cgroup namespace. However, trying to create a new mount still
doesn't show us the right information in mountinfo:

# mount --make-rslave /               # Prevent our mount operations
                                      # propagating to other mountns
# mkdir -p /mnt/freezer               # Create a new mount point
# umount /sys/fs/cgroup/freezer       # Discard old mount
# mount -t cgroup -o freezer freezer /mnt/freezer/
# ./cgroup_info.sh
        /proc/self/cgroup:      7:freezer:/
        mountinfo:              /a/b    /mnt/freezer

The act of creating a new cgroup namespace caused the process's
current freezer directory, "/a/b", to become its cgroup freezer root
directory. In other words, the pathname directory of the directory
within the newly mounted cgroup filesystem should be "/",
but mountinfo wrongly shows us "/a/b". The consequence of this is
that the process in the cgroup namespace cannot correctly construct
the pathname of its cgroup root directory from the information in
/proc/PID/mountinfo.

With this patch, the dentry root field in mountinfo is shown relative
to the reader's cgroup namespace.  So the same steps as above:

# mkdir -p /sys/fs/cgroup/freezer/a/b
# echo $$ > /sys/fs/cgroup/freezer/a/b/cgroup.procs
# ./cgroup_info.sh
        /proc/self/cgroup:      10:freezer:/a/b
        mountinfo:              /       /sys/fs/cgroup/freezer
# ~mtk/tlpi/code/ns/unshare -Cm bash
# ./cgroup_info.sh
        /proc/self/cgroup:      10:freezer:/
        mountinfo:              /../..  /sys/fs/cgroup/freezer
# mount --make-rslave /
# mkdir -p /mnt/freezer
# umount /sys/fs/cgroup/freezer
# mount -t cgroup -o freezer freezer /mnt/freezer/
# ./cgroup_info.sh
        /proc/self/cgroup:      10:freezer:/
        mountinfo:              /       /mnt/freezer

# ls /mnt/freezer/
cgroup.clone_children  freezer.parent_freezing  freezer.state      tasks
cgroup.procs           freezer.self_freezing    notify_on_release
# echo $$
3164
# cat /mnt/freezer/cgroup.procs
2653                   # First shell that placed in this cgroup
3164                   # Shell started by 'unshare'
14197                  # cat(1)

Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
---
 fs/kernfs/mount.c      | 14 +++++++++++
 include/linux/kernfs.h |  2 ++
 kernel/cgroup.c        | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+)

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f73541f..3b78724 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/namei.h>
+#include <linux/seq_file.h>
 
 #include "kernfs-internal.h"
 
@@ -40,6 +41,18 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
 	return 0;
 }
 
+static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry)
+{
+	struct kernfs_node *node = dentry->d_fsdata;
+	struct kernfs_root *root = kernfs_root(node);
+	struct kernfs_syscall_ops *scops = root->syscall_ops;
+
+	if (scops && scops->show_path)
+		return scops->show_path(sf, node, root);
+
+	return seq_dentry(sf, dentry, " \t\n\\");
+}
+
 const struct super_operations kernfs_sops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
@@ -47,6 +60,7 @@ const struct super_operations kernfs_sops = {
 
 	.remount_fs	= kernfs_sop_remount_fs,
 	.show_options	= kernfs_sop_show_options,
+	.show_path	= kernfs_sop_show_path,
 };
 
 /**
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index c06c442..30f089e 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -152,6 +152,8 @@ struct kernfs_syscall_ops {
 	int (*rmdir)(struct kernfs_node *kn);
 	int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent,
 		      const char *new_name);
+	int (*show_path)(struct seq_file *sf, struct kernfs_node *kn,
+			 struct kernfs_root *root);
 };
 
 struct kernfs_root {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 909a7d3..afea39e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1215,6 +1215,41 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 	cgroup_free_root(root);
 }
 
+/*
+ * look up cgroup associated with current task's cgroup namespace on the
+ * specified hierarchy
+ */
+static struct cgroup *
+current_cgns_cgroup_from_root(struct cgroup_root *root)
+{
+	struct cgroup *res = NULL;
+	struct css_set *cset;
+
+	lockdep_assert_held(&css_set_lock);
+
+	rcu_read_lock();
+
+	cset = current->nsproxy->cgroup_ns->root_cset;
+	if (cset == &init_css_set) {
+		res = &root->cgrp;
+	} else {
+		struct cgrp_cset_link *link;
+
+		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+			struct cgroup *c = link->cgrp;
+
+			if (c->root == root) {
+				res = c;
+				break;
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	BUG_ON(!res);
+	return res;
+}
+
 /* look up cgroup associated with given css_set on the specified hierarchy */
 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 					    struct cgroup_root *root)
@@ -1593,6 +1628,33 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 	return 0;
 }
 
+static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+			    struct kernfs_root *kf_root)
+{
+	int len = 0, ret = 0;
+	char *buf = NULL;
+	struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
+	struct cgroup *ns_cgroup;
+
+	buf = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	spin_lock_bh(&css_set_lock);
+	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
+	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
+	spin_unlock_bh(&css_set_lock);
+
+	if (len >= PATH_MAX)
+		len = -ERANGE;
+	else if (len > 0) {
+		seq_escape(sf, buf, " \t\n\\");
+		len = 0;
+	}
+	kfree(buf);
+	return len;
+}
+
 static int cgroup_show_options(struct seq_file *seq,
 			       struct kernfs_root *kf_root)
 {
@@ -5433,6 +5495,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
 	.mkdir			= cgroup_mkdir,
 	.rmdir			= cgroup_rmdir,
 	.rename			= cgroup_rename,
+	.show_path		= cgroup_show_path,
 };
 
 static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2016-05-09 16:19 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-05-05 15:20 [PATCH] mountinfo: implement show_path for kernfs and cgroup Serge E. Hallyn
2016-05-06 13:39 ` Michael Kerrisk (man-pages)
2016-05-06 17:33   ` Serge E. Hallyn
2016-05-06 17:42     ` Tejun Heo
2016-05-06 17:42     ` Michael Kerrisk (man-pages)
2016-05-06 19:32       ` Serge E. Hallyn
2016-05-09 14:59 Serge E. Hallyn
2016-05-09 16:19 ` Tejun Heo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).