* [PATCH] [v3] mount: don't execute propagate_umount() many times for the same mounts
@ 2016-10-10 23:26 ` Andrei Vagin
From: Andrei Vagin @ 2016-10-10 23:26 UTC
  To: Alexander Viro
  Cc: Eric W. Biederman, containers, linux-fsdevel, linux-kernel, Andrei Vagin

The reason for this optimization is that umount() can hold namespace_sem
for a long time; this semaphore is global, so it affects all users.
Recently Eric W. Biederman added a per mount namespace limit on the
number of mounts. The default limit is 100,000 mounts per mount
namespace, which is enough to construct a tree that takes hours to be
unmounted.

In the worst case the current complexity of umount_tree() is O(n^3):
* Enumerate all mounts in a target tree (propagate_umount)
* Enumerate mounts to find where these changes have to
  be propagated (mark_umount_candidates)
* Enumerate mounts to find the required mount by parent and dentry
  (__lookup_mnt_last)

The worst case is when all mounts from the tree live in the same shared
group; then we have to enumerate all mounts on each step.

Here we can optimize the second step: we don't need to repeat it for
mounts which we already met when doing this step for previous mounts.
This reduces the complexity of umount_tree() to O(n^2).
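
To make the saving concrete, here is a minimal standalone model (an
illustration only: fake_mount, naive_walk and marked_walk are invented
names, not the kernel data structures). All n mounts live in one peer
group; without the mark every mount rescans the whole group, while with
the mark only the first walk does real work:

$ cat model.c
/* Build and run: cc -O2 -o model model.c && ./model */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_mount {
	bool visited;			/* stands in for the mark bit */
};

/* Without the mark: every unmounted mount rescans the whole group. */
static long naive_walk(size_t n)
{
	long steps = 0;
	for (size_t i = 0; i < n; i++)
		steps++;
	return steps;
}

/*
 * With the mark: the first walk marks every peer it meets, so the
 * walks triggered by the remaining peers return immediately.
 */
static long marked_walk(struct fake_mount *peers, size_t n, size_t self)
{
	long steps = 0;

	if (peers[self].visited)
		return 0;
	for (size_t i = 0; i < n; i++) {
		peers[i].visited = true;
		steps++;
	}
	return steps;
}

int main(void)
{
	size_t n = 1 << 14;		/* 16384 mounts, as in the table */
	struct fake_mount *peers = calloc(n, sizeof(*peers));
	long naive = 0, marked = 0;

	if (!peers)
		return 1;
	/* one walk per mount, as propagate_umount() effectively does */
	for (size_t i = 0; i < n; i++) {
		naive += naive_walk(n);
		marked += marked_walk(peers, n, i);
	}
	printf("naive: %ld steps, marked: %ld steps\n", naive, marked);
	free(peers);
	return 0;
}

For n = 16384 it prints 268435456 naive steps against 16384 marked
steps.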

Here is a script to generate such a mount tree:
$ cat run.sh
mount -t tmpfs xxx /mnt
mount --make-shared /mnt
for i in `seq $1`; do
	mount --bind /mnt `mktemp -d /mnt/test.XXXXXX`
done
time umount -l /mnt
$ for i in `seq 10 16`; do echo $i; unshare -Urm bash ./run.sh $i; done

Here are performance measurements with and without this patch:

mounts | before | after (sec)
-----------------------------
1024   |  0.07  |
2048   |  0.23  |
4096   |  1.0   |
8912   |  8.7   | 0.11
16384  | 75     | 0.26
32768  |        | 0.75
65536  |        | 3.0
131072 |        | 13.7

This patch is a second step to fix CVE-2016-6213.

v2: fix mark_umount_candidates() to not change the existing behaviour.
v3: mark umounted mounts in mark_umount_candidates() and
__propagate_umount() separately, because they enumerate mounts in
opposite directions.

Cc: Eric W Biederman <ebiederm@xmission.com>
Signed-off-by: Andrei Vagin <avagin@openvz.org>
---
 fs/mount.h     |  2 ++
 fs/namespace.c | 19 ++++++++++++++++---
 fs/pnode.c     | 48 ++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 62 insertions(+), 7 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index d2e25d7..741c8a7 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -90,6 +90,8 @@ static inline int is_mounted(struct vfsmount *mnt)
 
 extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
 extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
+extern struct mount *__lookup_mnt_cont(struct mount *,
+					struct vfsmount *, struct dentry *);
 
 extern int __legitimize_mnt(struct vfsmount *, unsigned);
 extern bool legitimize_mnt(struct vfsmount *, unsigned);
diff --git a/fs/namespace.c b/fs/namespace.c
index 704a1fe..b454660 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -652,9 +652,7 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
 		goto out;
 	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
 		res = p;
-	hlist_for_each_entry_continue(p, mnt_hash) {
-		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
-			break;
+	for (; p != NULL; p = __lookup_mnt_cont(p, mnt, dentry)) {
 		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
 			res = p;
 	}
@@ -662,6 +660,21 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
 	return res;
 }
 
+struct mount *__lookup_mnt_cont(struct mount *p,
+				struct vfsmount *mnt, struct dentry *dentry)
+{
+	struct hlist_node *node = p->mnt_hash.next;
+
+	if (!node)
+		return NULL;
+
+	p = hlist_entry(node, struct mount, mnt_hash);
+	if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
+		return NULL;
+
+	return p;
+}
+
 /*
  * lookup_mnt - Return the first child mount mounted at path
  *
diff --git a/fs/pnode.c b/fs/pnode.c
index 234a9ac..b28f4fd 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -399,10 +399,28 @@ static void mark_umount_candidates(struct mount *mnt)
 
 	BUG_ON(parent == mnt);
 
+	if (IS_MNT_MARKED(mnt))
+		return;
+
+	SET_MNT_MARK(mnt);
+
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
-		struct mount *child = __lookup_mnt_last(&m->mnt,
-						mnt->mnt_mountpoint);
+		struct mount *child = NULL, *p;
+
+		for (p = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); p;
+		     p = __lookup_mnt_cont(p, &m->mnt, mnt->mnt_mountpoint)) {
+			/*
+			 * Do this work only once for mounts from
+			 * the same propagation chain.
+			 */
+			if (p->mnt.mnt_flags & MNT_UMOUNT) {
+				SET_MNT_MARK(p);
+				continue;
+			}
+			child = p;
+		}
+
 		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
 			SET_MNT_MARK(child);
 		}
@@ -420,11 +438,33 @@ static void __propagate_umount(struct mount *mnt)
 
 	BUG_ON(parent == mnt);
 
+	/*
+	 * All mounts have been marked in mark_umount_candidates(), so
+	 * here the absence of the mark means that it has been handled
+	 * already.
+	 */
+	if (!IS_MNT_MARKED(mnt))
+		return;
+
+	CLEAR_MNT_MARK(mnt);
+
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
 
-		struct mount *child = __lookup_mnt_last(&m->mnt,
-						mnt->mnt_mountpoint);
+		struct mount *child = NULL, *p;
+
+		for (p = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); p;
+		     p = __lookup_mnt_cont(p, &m->mnt, mnt->mnt_mountpoint)) {
+			/*
+			 * Do this work only once for mounts from
+			 * the same propagation chain.
+			 */
+			if (p->mnt.mnt_flags & MNT_UMOUNT) {
+				CLEAR_MNT_MARK(p);
+				continue;
+			}
+			child = p;
+		}
 		/*
 		 * umount the child only if the child has no children
 		 * and the child is marked safe to unmount.
-- 
2.5.5

* Re: [PATCH] [v3] mount: don't execute propagate_umount() many times for the same mounts
@ 2016-10-13 17:14 ` Eric W. Biederman
From: Eric W. Biederman @ 2016-10-13 17:14 UTC
  To: Andrei Vagin; +Cc: Alexander Viro, containers, linux-fsdevel, linux-kernel

Andrei Vagin <avagin@openvz.org> writes:

> The reason for this optimization is that umount() can hold namespace_sem
> for a long time; this semaphore is global, so it affects all users.
> Recently Eric W. Biederman added a per mount namespace limit on the
> number of mounts. The default limit is 100,000 mounts per mount
> namespace, which is enough to construct a tree that takes hours to be
> unmounted.
>
> In the worst case the current complexity of umount_tree() is O(n^3):
> * Enumerate all mounts in a target tree (propagate_umount)
> * Enumerate mounts to find where these changes have to
>   be propagated (mark_umount_candidates)
> * Enumerate mounts to find the required mount by parent and dentry
>   (__lookup_mnt_last)
>
> The worst case is when all mounts from the tree live in the same shared
> group; then we have to enumerate all mounts on each step.
>
> Here we can optimize the second step: we don't need to repeat it for
> mounts which we already met when doing this step for previous mounts.
> This reduces the complexity of umount_tree() to O(n^2).

To O(n), not O(n^2).

A hash table lookup (aka __lookup_mnt() and friends) is O(1), or the
hash table is malfunctioning, so please don't count it as another
factor of n.

Arguably we are getting into sizes where the mount hash table fills up
and is on the edge of malfunctioning, but that is not particularly
relevant to this case.

What your patch is aiming to do is to take an O(n^2) algorithm and
make it O(n).  That is very much worth doing.
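
A rough count of where those bounds come from, assuming all n mounts
sit in a single shared peer group:

  one mark_umount_candidates() walk:  n propagation peers x O(1) lookup = O(n)
  propagate_umount() over the tree:   n mounts x one O(n) walk          = O(n^2)
  with a visited mark:                each peer is walked only once     = O(n)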

However, your patch conflates two separate issues: marking mounts that
may be unmounted, and marking pieces of the propagation tree that have
already been traversed.

I do not see anything requiring propagation trees to intersect at the
set of mounts that are unmounted in umount_tree before propagate_umount
is called, which means there are topologies where we can and should do
better than your patch.

I am also bothered that your patch changes how we look up the mount
mounted on a mount point (aka playing with __lookup_mnt_last).  There
is no reason to do that to solve the problem, and I think it obscures
what is actually going on.

I am going to see if I can rework your basic concept with explicit
marking of the propagation tree.  In the meantime, for people who want
to see what your patch is doing, the version below does essentially
the same thing without the extra, essentially meaningless loop.

Eric


diff --git a/fs/namespace.c b/fs/namespace.c
index 8183fba9ab4d..33a76ee1b76b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -650,13 +650,11 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
 	p = __lookup_mnt(mnt, dentry);
 	if (!p)
 		goto out;
-	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
-		res = p;
+	res = p;
 	hlist_for_each_entry_continue(p, mnt_hash) {
 		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
 			break;
-		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
-			res = p;
+		res = p;
 	}
 out:
 	return res;
diff --git a/fs/pnode.c b/fs/pnode.c
index 234a9ac49958..ade5e7d8308c 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -399,11 +399,18 @@ static void mark_umount_candidates(struct mount *mnt)
 
 	BUG_ON(parent == mnt);
 
+	if (IS_MNT_MARKED(mnt))
+		return;
+
+	SET_MNT_MARK(mnt);
+
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
 		struct mount *child = __lookup_mnt_last(&m->mnt,
 						mnt->mnt_mountpoint);
-		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
+		if (child && ((child->mnt.mnt_flags & MNT_UMOUNT) ||
+			      !IS_MNT_LOCKED(child) ||
+			      IS_MNT_MARKED(m))) {
 			SET_MNT_MARK(child);
 		}
 	}
@@ -420,6 +427,11 @@ static void __propagate_umount(struct mount *mnt)
 
 	BUG_ON(parent == mnt);
 
+	if (!IS_MNT_MARKED(mnt))
+		return;
+
+	CLEAR_MNT_MARK(mnt);
+
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
 
@@ -432,7 +444,8 @@ static void __propagate_umount(struct mount *mnt)
 		if (!child || !IS_MNT_MARKED(child))
 			continue;
 		CLEAR_MNT_MARK(child);
-		if (list_empty(&child->mnt_mounts)) {
+		if (!(child->mnt.mnt_flags & MNT_UMOUNT) &&
+		    list_empty(&child->mnt_mounts)) {
 			list_del_init(&child->mnt_child);
 			child->mnt.mnt_flags |= MNT_UMOUNT;
 			list_move_tail(&child->mnt_list, &mnt->mnt_list);

* [RFC][PATCH] mount: In mark_umount_candidates and __propagate_umount visit each mount once
@ 2016-10-13 19:53   ` Eric W. Biederman
From: Eric W. Biederman @ 2016-10-13 19:53 UTC
  To: Andrei Vagin; +Cc: Alexander Viro, containers, linux-fsdevel, linux-kernel


Andrei Vagin pointed out that the time to execute propagate_umount can
go non-linear (and take a ludicrous amount of time) when the mount
propagation trees of the mounts to be unmounted by a lazy unmount
overlap.

Solve this in the most straightforward way possible, by adding a new
mount flag to mark parts of the mount propagation tree that have been
visited, and use that mark to skip parts of the mount propagation tree
that have already been visited during an unmount.  This guarantees
that each mountpoint in the possibly overlapping mount propagation
trees will be visited exactly once.

Add the functions propagation_visit_next and propagation_revisit_next
to coordinate setting and clearing the visited mount mark.
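
As a standalone sketch of the visit/revisit pairing (invented struct
node and NNODES, not the kernel types): the first pass sets a bit as it
hands out each node, and the second pass returns only nodes the first
pass claimed, clearing the bit as it goes. After both passes the flag
word is back to zero, which is what lets a single mnt_flags bit be
reused by mark_umount_candidates() and __propagate_umount():

$ cat twopass.c
/* Build and run: cc -O2 -o twopass twopass.c && ./twopass */
#include <assert.h>
#include <stdio.h>

#define NNODES 8
#define VISITED 0x1		/* stands in for MNT_VISITED */

struct node {
	unsigned flags;
	struct node *next;	/* next mount in the propagation order */
};

/* First pass: hand out each node exactly once, setting the bit. */
static struct node *visit_next(struct node *n)
{
	for (; n; n = n->next) {
		if (!(n->flags & VISITED)) {
			n->flags |= VISITED;
			return n;
		}
	}
	return NULL;
}

/* Second pass: only nodes the first pass claimed, clearing the bit. */
static struct node *revisit_next(struct node *n)
{
	for (; n; n = n->next) {
		if (n->flags & VISITED) {
			n->flags &= ~VISITED;
			return n;
		}
	}
	return NULL;
}

int main(void)
{
	struct node nodes[NNODES] = {0};

	for (int i = 0; i < NNODES - 1; i++)
		nodes[i].next = &nodes[i + 1];

	for (struct node *n = visit_next(nodes); n; n = visit_next(n))
		printf("visit   node %ld\n", (long)(n - nodes));
	for (struct node *n = revisit_next(nodes); n; n = revisit_next(n))
		printf("revisit node %ld\n", (long)(n - nodes));

	/* after both passes every flag is back to zero */
	for (int i = 0; i < NNODES; i++)
		assert(nodes[i].flags == 0);
	return 0;
}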

Here is a script to generate such a mount tree:
$ cat run.sh
mount -t tmpfs test-mount /mnt
mount --make-shared /mnt
for i in `seq $1`; do
        mkdir /mnt/test.$i
        mount --bind /mnt /mnt/test.$i
done
cat /proc/mounts | grep test-mount | wc -l
time umount -l /mnt
$ for i in `seq 10 16`; do echo $i; unshare -Urm bash ./run.sh $i; done

Here are the performance numbers with and without the patch:

mounts | before | after (real sec)
-----------------------------
  1024 |  0.071 | 0.024
  2048 |  0.184 | 0.030
  4096 |  0.604 | 0.040
  8912 |  4.471 | 0.043
 16384 | 34.826 | 0.082
 32768 |        | 0.151
 65536 |        | 0.289
131072 |        | 0.659

Fixing this performance problem is part of Andrei Vagin's work to
fix CVE-2016-6213.

Cc: stable@vger.kernel.org
Reported-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---

Andrei, can you take a look at this patch and see if you can see any
problems?  My limited testing suggests this approach does a much better
job of solving the problem you were seeing, with the time now looking
almost linear in the number of mounts.

 fs/pnode.c            | 125 ++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/pnode.h            |   4 ++
 include/linux/mount.h |   2 +
 3 files changed, 126 insertions(+), 5 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index 234a9ac49958..3acce0c75f94 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -164,6 +164,120 @@ static struct mount *propagation_next(struct mount *m,
 	}
 }
 
+/*
+ * get the next mount in the propagation tree (that has not been visited)
+ * @m: the mount seen last
+ * @origin: the original mount from where the tree walk initiated
+ *
+ * Note that peer groups form contiguous segments of slave lists.
+ * We rely on that in get_source() to be able to find out if
+ * vfsmount found while iterating with propagation_next() is
+ * a peer of one we'd found earlier.
+ */
+static struct mount *propagation_visit_next(struct mount *m,
+					    struct mount *origin)
+{
+	/* Has this part of the propagation tree already been visited? */
+	if (IS_MNT_VISITED(m))
+		return NULL;
+
+	SET_MNT_VISITED(m);
+
+	/* are there any slaves of this mount? */
+	if (!list_empty(&m->mnt_slave_list)) {
+		struct mount *slave = first_slave(m);
+		while (1) {
+			if (!IS_MNT_VISITED(slave))
+				return slave;
+			if (slave->mnt_slave.next == &m->mnt_slave_list)
+				break;
+			slave = next_slave(slave);
+		}
+	}
+	while (1) {
+		struct mount *master = m->mnt_master;
+
+		if (master == origin->mnt_master) {
+			struct mount *next = next_peer(m);
+			while (1) {
+				if (next == origin)
+					return NULL;
+				if (!IS_MNT_VISITED(next))
+					return next;
+				next = next_peer(next);
+			}
+		} else {
+			while (1) {
+				if (m->mnt_slave.next == &master->mnt_slave_list)
+					break;
+				m = next_slave(m);
+				if (!IS_MNT_VISITED(m))
+					return m;
+			}
+		}
+
+		/* back at master */
+		m = master;
+	}
+}
+
+/*
+ * get the next mount in the propagation tree (that has not been revisited)
+ * @m: the mount seen last
+ * @origin: the original mount from where the tree walk initiated
+ *
+ * Note that peer groups form contiguous segments of slave lists.
+ * We rely on that in get_source() to be able to find out if
+ * vfsmount found while iterating with propagation_next() is
+ * a peer of one we'd found earlier.
+ */
+static struct mount *propagation_revisit_next(struct mount *m,
+					      struct mount *origin)
+{
+	/* Has this part of the propagation tree already been revisited? */
+	if (!IS_MNT_VISITED(m))
+		return NULL;
+
+	CLEAR_MNT_VISITED(m);
+
+	/* are there any slaves of this mount? */
+	if (!list_empty(&m->mnt_slave_list)) {
+		struct mount *slave = first_slave(m);
+		while (1) {
+			if (IS_MNT_VISITED(slave))
+				return slave;
+			if (slave->mnt_slave.next == &m->mnt_slave_list)
+				break;
+			slave = next_slave(slave);
+		}
+	}
+	while (1) {
+		struct mount *master = m->mnt_master;
+
+		if (master == origin->mnt_master) {
+			struct mount *next = next_peer(m);
+			while (1) {
+				if (next == origin)
+					return NULL;
+				if (IS_MNT_VISITED(next))
+					return next;
+				next = next_peer(next);
+			}
+		} else {
+			while (1) {
+				if (m->mnt_slave.next == &master->mnt_slave_list)
+					break;
+				m = next_slave(m);
+				if (IS_MNT_VISITED(m))
+					return m;
+			}
+		}
+
+		/* back at master */
+		m = master;
+	}
+}
+
 static struct mount *next_group(struct mount *m, struct mount *origin)
 {
 	while (1) {
@@ -399,11 +513,12 @@ static void mark_umount_candidates(struct mount *mnt)
 
 	BUG_ON(parent == mnt);
 
-	for (m = propagation_next(parent, parent); m;
-			m = propagation_next(m, parent)) {
+	for (m = propagation_visit_next(parent, parent); m;
+			m = propagation_visit_next(m, parent)) {
 		struct mount *child = __lookup_mnt_last(&m->mnt,
 						mnt->mnt_mountpoint);
-		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
+		if (child && (!IS_MNT_LOCKED(child) ||
+			      IS_MNT_MARKED(m))) {
 			SET_MNT_MARK(child);
 		}
 	}
@@ -420,8 +535,8 @@ static void __propagate_umount(struct mount *mnt)
 
 	BUG_ON(parent == mnt);
 
-	for (m = propagation_next(parent, parent); m;
-			m = propagation_next(m, parent)) {
+	for (m = propagation_revisit_next(parent, parent); m;
+			m = propagation_revisit_next(m, parent)) {
 
 		struct mount *child = __lookup_mnt_last(&m->mnt,
 						mnt->mnt_mountpoint);
diff --git a/fs/pnode.h b/fs/pnode.h
index 550f5a8b4fcf..988ea4945764 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -21,6 +21,10 @@
 #define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
 #define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
 
+#define IS_MNT_VISITED(m) ((m)->mnt.mnt_flags & MNT_VISITED)
+#define SET_MNT_VISITED(m) ((m)->mnt.mnt_flags |= MNT_VISITED)
+#define CLEAR_MNT_VISITED(m) ((m)->mnt.mnt_flags &= ~MNT_VISITED)
+
 #define CL_EXPIRE    		0x01
 #define CL_SLAVE     		0x02
 #define CL_COPY_UNBINDABLE	0x04
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 9227b190fdf2..6048045b96c3 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -52,6 +52,8 @@ struct mnt_namespace;
 
 #define MNT_INTERNAL	0x4000
 
+#define MNT_VISITED		0x010000
+
 #define MNT_LOCK_ATIME		0x040000
 #define MNT_LOCK_NOEXEC		0x080000
 #define MNT_LOCK_NOSUID		0x100000
-- 
2.8.3

* Re: [RFC][PATCH] mount: In mark_umount_candidates and __propagate_umount visit each mount once
@ 2016-10-13 21:46     ` Andrei Vagin
From: Andrei Vagin @ 2016-10-13 21:46 UTC
  To: Eric W. Biederman
  Cc: Andrei Vagin, Alexander Viro, containers, linux-fsdevel, linux-kernel

On Thu, Oct 13, 2016 at 02:53:46PM -0500, Eric W. Biederman wrote:
> 
> Andrei Vagin pointed out that the time to execute propagate_umount can
> go non-linear (and take a ludicrous amount of time) when the mount
> propagation trees of the mounts to be unmounted by a lazy unmount
> overlap.
> 
> Solve this in the most straightforward way possible, by adding a new
> mount flag to mark parts of the mount propagation tree that have been
> visited, and use that mark to skip parts of the mount propagation tree
> that have already been visited during an unmount.  This guarantees
> that each mountpoint in the possibly overlapping mount propagation
> trees will be visited exactly once.
> 
> Add the functions propagation_visit_next and propagation_revisit_next
> to coordinate setting and clearing the visited mount mark.
> 
> Here is a script to generate such a mount tree:
> $ cat run.sh
> mount -t tmpfs test-mount /mnt
> mount --make-shared /mnt
> for i in `seq $1`; do
>         mkdir /mnt/test.$i
>         mount --bind /mnt /mnt/test.$i
> done
> cat /proc/mounts | grep test-mount | wc -l
> time umount -l /mnt
> $ for i in `seq 10 16`; do echo $i; unshare -Urm bash ./run.sh $i; done
> 
> Here are the performance numbers with and without the patch:
> 
> mounts | before | after (real sec)
> -----------------------------
>   1024 |  0.071 | 0.024
>   2048 |  0.184 | 0.030
>   4096 |  0.604 | 0.040
>   8912 |  4.471 | 0.043
>  16384 | 34.826 | 0.082
>  32768 |        | 0.151
>  65536 |        | 0.289
> 131072 |        | 0.659
> 
> Andrei Vagin fixing this performance problem is part of the
> work to fix CVE-2016-6213.
> 
> Cc: stable@vger.kernel.org
> Reported-by: Andrei Vagin <avagin@openvz.org>
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
> 
> Andrei, can you take a look at this patch and see if you can see any
> problems?  My limited testing suggests this approach does a much better
> job of solving the problem you were seeing, with the time now looking
> almost linear in the number of mounts.

I read this patch and I like the idea.

Then I ran my tests, and one of them doesn't work with this patch.
I haven't found a reason yet.

Here is the test:

[root@fc24 mounts]# cat run.sh
set -e
mount -t tmpfs zdtm /mnt
mkdir -p /mnt/1 /mnt/2
mount -t tmpfs zdtm /mnt/1
mount --make-shared /mnt/1
for i in `seq $1`; do
	mount --bind /mnt/1 `mktemp -d /mnt/1/test.XXXXXX`
done
mount --rbind /mnt/1 /mnt/2
cat /proc/self/mountinfo | grep zdtm | wc -l
time umount -l /mnt/1
cat /proc/self/mountinfo | grep zdtm | wc -l
umount /mnt/2


[root@fc24 mounts]# unshare -Urm ./run.sh  5
65

real	0m0.014s
user	0m0.000s
sys	0m0.004s
33
umount: /mnt/2: target is busy
        (In some cases useful info about processes that
         use the device is found by lsof(8) or fuser(1).)

> 
>  fs/pnode.c            | 125 ++++++++++++++++++++++++++++++++++++++++++++++++--
>  fs/pnode.h            |   4 ++
>  include/linux/mount.h |   2 +
>  3 files changed, 126 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 234a9ac49958..3acce0c75f94 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -164,6 +164,120 @@ static struct mount *propagation_next(struct mount *m,
>  	}
>  }
>  
> +/*
> + * get the next mount in the propagation tree (that has not been visited)
> + * @m: the mount seen last
> + * @origin: the original mount from where the tree walk initiated
> + *
> + * Note that peer groups form contiguous segments of slave lists.
> + * We rely on that in get_source() to be able to find out if
> + * vfsmount found while iterating with propagation_next() is
> + * a peer of one we'd found earlier.
> + */
> +static struct mount *propagation_visit_next(struct mount *m,
> +					    struct mount *origin)
> +{
> +	/* Has this part of the propagation tree already been visited? */
> +	if (IS_MNT_VISITED(m))
> +		return NULL;
> +
> +	SET_MNT_VISITED(m);
> +
> +	/* are there any slaves of this mount? */
> +	if (!list_empty(&m->mnt_slave_list)) {
> +		struct mount *slave = first_slave(m);
> +		while (1) {
> +			if (!IS_MNT_VISITED(slave))
> +				return slave;
> +			if (slave->mnt_slave.next == &m->mnt_slave_list)
> +				break;
> +			slave = next_slave(slave);
> +		}
> +	}
> +	while (1) {
> +		struct mount *master = m->mnt_master;
> +
> +		if (master == origin->mnt_master) {
> +			struct mount *next = next_peer(m);
> +			while (1) {
> +				if (next == origin)
> +					return NULL;
> +				if (!IS_MNT_VISITED(next))
> +					return next;
> +				next = next_peer(next);
> +			}
> +		} else {
> +			while (1) {
> +				if (m->mnt_slave.next == &master->mnt_slave_list)
> +					break;
> +				m = next_slave(m);
> +				if (!IS_MNT_VISITED(m))
> +					return m;
> +			}
> +		}
> +
> +		/* back at master */
> +		m = master;
> +	}
> +}
> +
> +/*
> + * get the next mount in the propagation tree (that has not been revisited)
> + * @m: the mount seen last
> + * @origin: the original mount from where the tree walk initiated
> + *
> + * Note that peer groups form contiguous segments of slave lists.
> + * We rely on that in get_source() to be able to find out if
> + * vfsmount found while iterating with propagation_next() is
> + * a peer of one we'd found earlier.
> + */
> +static struct mount *propagation_revisit_next(struct mount *m,
> +					      struct mount *origin)
> +{
> +	/* Has this part of the propagation tree already been revisited? */
> +	if (!IS_MNT_VISITED(m))
> +		return NULL;
> +
> +	CLEAR_MNT_VISITED(m);
> +
> +	/* are there any slaves of this mount? */
> +	if (!list_empty(&m->mnt_slave_list)) {
> +		struct mount *slave = first_slave(m);
> +		while (1) {
> +			if (IS_MNT_VISITED(slave))
> +				return slave;
> +			if (slave->mnt_slave.next == &m->mnt_slave_list)
> +				break;
> +			slave = next_slave(slave);
> +		}
> +	}
> +	while (1) {
> +		struct mount *master = m->mnt_master;
> +
> +		if (master == origin->mnt_master) {
> +			struct mount *next = next_peer(m);
> +			while (1) {
> +				if (next == origin)
> +					return NULL;
> +				if (IS_MNT_VISITED(next))
> +					return next;
> +				next = next_peer(next);
> +			}
> +		} else {
> +			while (1) {
> +				if (m->mnt_slave.next == &master->mnt_slave_list)
> +					break;
> +				m = next_slave(m);
> +				if (IS_MNT_VISITED(m))
> +					return m;
> +			}
> +		}
> +
> +		/* back at master */
> +		m = master;
> +	}
> +}
> +
>  static struct mount *next_group(struct mount *m, struct mount *origin)
>  {
>  	while (1) {
> @@ -399,11 +513,12 @@ static void mark_umount_candidates(struct mount *mnt)
>  
>  	BUG_ON(parent == mnt);
>  
> -	for (m = propagation_next(parent, parent); m;
> -			m = propagation_next(m, parent)) {
> +	for (m = propagation_visit_next(parent, parent); m;
> +			m = propagation_visit_next(m, parent)) {
>  		struct mount *child = __lookup_mnt_last(&m->mnt,
>  						mnt->mnt_mountpoint);
> -		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
> +		if (child && (!IS_MNT_LOCKED(child) ||
> +			      IS_MNT_MARKED(m))) {
>  			SET_MNT_MARK(child);
>  		}
>  	}
> @@ -420,8 +535,8 @@ static void __propagate_umount(struct mount *mnt)
>  
>  	BUG_ON(parent == mnt);
>  
> -	for (m = propagation_next(parent, parent); m;
> -			m = propagation_next(m, parent)) {
> +	for (m = propagation_revisit_next(parent, parent); m;
> +			m = propagation_revisit_next(m, parent)) {
>  
>  		struct mount *child = __lookup_mnt_last(&m->mnt,
>  						mnt->mnt_mountpoint);
> diff --git a/fs/pnode.h b/fs/pnode.h
> index 550f5a8b4fcf..988ea4945764 100644
> --- a/fs/pnode.h
> +++ b/fs/pnode.h
> @@ -21,6 +21,10 @@
>  #define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
>  #define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
>  
> +#define IS_MNT_VISITED(m) ((m)->mnt.mnt_flags & MNT_VISITED)
> +#define SET_MNT_VISITED(m) ((m)->mnt.mnt_flags |= MNT_VISITED)
> +#define CLEAR_MNT_VISITED(m) ((m)->mnt.mnt_flags &= ~MNT_VISITED)
> +
>  #define CL_EXPIRE    		0x01
>  #define CL_SLAVE     		0x02
>  #define CL_COPY_UNBINDABLE	0x04
> diff --git a/include/linux/mount.h b/include/linux/mount.h
> index 9227b190fdf2..6048045b96c3 100644
> --- a/include/linux/mount.h
> +++ b/include/linux/mount.h
> @@ -52,6 +52,8 @@ struct mnt_namespace;
>  
>  #define MNT_INTERNAL	0x4000
>  
> +#define MNT_VISITED		0x010000
> +
>  #define MNT_LOCK_ATIME		0x040000
>  #define MNT_LOCK_NOEXEC		0x080000
>  #define MNT_LOCK_NOSUID		0x100000
> -- 
> 2.8.3
> 

* Re: [RFC][PATCH] mount: In mark_umount_candidates and __propagate_umount visit each mount once
@ 2016-10-14  2:31           ` Andrey Vagin
From: Andrey Vagin @ 2016-10-14  2:31 UTC
  To: Andrei Vagin
  Cc: linux-fsdevel, LKML, Linux Containers, Eric W. Biederman, Alexander Viro

On Thu, Oct 13, 2016 at 2:46 PM, Andrei Vagin <avagin@virtuozzo.com> wrote:
> On Thu, Oct 13, 2016 at 02:53:46PM -0500, Eric W. Biederman wrote:
>>
>> Adrei Vagin pointed out that time to executue propagate_umount can go
>> non-linear (and take a ludicrious amount of time) when the mount
>> propogation trees of the mounts to be unmunted by a lazy unmount
>> overlap.
>>
>> Solve this in the most straight forward way possible, by adding a new
>> mount flag to mark parts of the mount propagation tree that have been
>> visited, and use that mark to skip parts of the mount propagation tree
>> that have already been visited during an unmount.  This guarantees
>> that each mountpoint in the possibly overlapping mount propagation
>> trees will be visited exactly once.
>>
>> Add the functions propagation_visit_next and propagation_revisit_next
>> to coordinate setting and clearing the visited mount mark.
>>
>> Here is a script to generate such mount tree:
>> $ cat run.sh
>> mount -t tmpfs test-mount /mnt
>> mount --make-shared /mnt
>> for i in `seq $1`; do
>>         mkdir /mnt/test.$i
>>         mount --bind /mnt /mnt/test.$i
>> done
>> cat /proc/mounts | grep test-mount | wc -l
>> time umount -l /mnt
>> $ for i in `seq 10 16`; do echo $i; unshare -Urm bash ./run.sh $i; done
>>
>> Here are the performance numbers with and without the patch:
>>
>> mounts | before | after (real sec)
>> -----------------------------
>>   1024 |  0.071 | 0.024
>>   2048 |  0.184 | 0.030
>>   4096 |  0.604 | 0.040
>>   8912 |  4.471 | 0.043
>>  16384 | 34.826 | 0.082
>>  32768 |        | 0.151
>>  65536 |        | 0.289
>> 131072 |        | 0.659
>>
>> Andrei Vagin reports fixing this performance problem is part of the
>> work to fix CVE-2016-6213.
>>
>> Cc: stable-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>> Reported-by: Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
>> Signed-off-by: "Eric W. Biederman" <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
>> ---
>>
>> Andrei, can you take a look at this patch and see if you can see any
>> problems?  My limited testing suggests this approach does a much better
>> job of solving the problem you were seeing, with the time now looking
>> almost linear in the number of mounts.
>
> I read this patch and I like the idea.
>
> Then I ran my tests and one of them doesn't work with this patch.
> I haven't found a reason yet.

>> +     for (m = propagation_visit_next(parent, parent); m;
>> +                     m = propagation_visit_next(m, parent)) {
>>               struct mount *child = __lookup_mnt_last(&m->mnt,
>>                                               mnt->mnt_mountpoint);

The reason is that this loop is called for each "mnt" being unmounted,
but with this optimization it is executed only once.

So I think the idea of marking the parent will not work, because one
parent can have several children which have to be unmounted.
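
As a toy model of the problem (plain userspace C, not kernel code; the
names are made up for illustration), marking the parent makes the walk
for the second child terminate immediately:

#include <stdbool.h>
#include <stdio.h>

struct mnt { const char *name; bool visited; struct mnt *parent; };

/* simplified stand-in for the propagation walk: the visited flag
 * effectively lives on the parent propagation node */
static bool visit(struct mnt *child)
{
	if (child->parent->visited)
		return false;		/* a sibling already walked here */
	child->parent->visited = true;
	return true;
}

int main(void)
{
	struct mnt parent = { .name = "parent" };
	struct mnt c1 = { .name = "c1", .parent = &parent };
	struct mnt c2 = { .name = "c2", .parent = &parent };

	printf("%s %s\n", c1.name, visit(&c1) ? "visited" : "skipped");
	printf("%s %s\n", c2.name, visit(&c2) ? "visited" : "skipped");
	/* prints "c1 visited" then "c2 skipped": the walk for the
	 * second child never runs, so c2 is never marked as a umount
	 * candidate */
	return 0;
}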

>
> Here is the test:
>
> [root@fc24 mounts]# cat run.sh
> set -e
> mount -t tmpfs zdtm /mnt
> mkdir -p /mnt/1 /mnt/2
> mount -t tmpfs zdtm /mnt/1
> mount --make-shared /mnt/1
> for i in `seq $1`; do
>         mount --bind /mnt/1 `mktemp -d /mnt/1/test.XXXXXX`
> done
> mount --rbind /mnt/1 /mnt/2
> cat /proc/self/mountinfo | grep zdtm | wc -l
> time umount -l /mnt/1
> cat /proc/self/mountinfo | grep zdtm | wc -l
> umount /mnt/2
>
>
> [root@fc24 mounts]# unshare -Urm ./run.sh  5
> 65
>
> real    0m0.014s
> user    0m0.000s
> sys     0m0.004s
> 33
> umount: /mnt/2: target is busy
>         (In some cases useful info about processes that
>          use the device is found by lsof(8) or fuser(1).)
>
>>

Thanks,
Andrei

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH] mount: In mark_umount_candidates and __propogate_umount visit each mount once
  2016-10-14  2:31           ` Andrey Vagin
@ 2016-10-14  2:45               ` Eric W. Biederman
  -1 siblings, 0 replies; 36+ messages in thread
From: Eric W. Biederman @ 2016-10-14  2:45 UTC (permalink / raw)
  To: Andrey Vagin
  Cc: linux-fsdevel, LKML, Linux Containers, Andrei Vagin, Alexander Viro

Andrey Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org> writes:

> On Thu, Oct 13, 2016 at 2:46 PM, Andrei Vagin <avagin-5HdwGun5lf+gSpxsJD1C4w@public.gmane.org> wrote:
>> On Thu, Oct 13, 2016 at 02:53:46PM -0500, Eric W. Biederman wrote:
>>>
>>> Andrei Vagin pointed out that the time to execute propagate_umount can go
>>> non-linear (and take a ludicrous amount of time) when the mount
>>> propagation trees of the mounts to be unmounted by a lazy unmount
>>> overlap.
>>>
>>> Solve this in the most straightforward way possible, by adding a new
>>> mount flag to mark parts of the mount propagation tree that have been
>>> visited, and use that mark to skip parts of the mount propagation tree
>>> that have already been visited during an unmount.  This guarantees
>>> that each mountpoint in the possibly overlapping mount propagation
>>> trees will be visited exactly once.
>>>
>>> Add the functions propagation_visit_next and propagation_revisit_next
>>> to coordinate setting and clearing the visited mount mark.
>>>
>>> Here is a script to generate such mount tree:
>>> $ cat run.sh
>>> mount -t tmpfs test-mount /mnt
>>> mount --make-shared /mnt
>>> for i in `seq $1`; do
>>>         mkdir /mnt/test.$i
>>>         mount --bind /mnt /mnt/test.$i
>>> done
>>> cat /proc/mounts | grep test-mount | wc -l
>>> time umount -l /mnt
>>> $ for i in `seq 10 16`; do echo $i; unshare -Urm bash ./run.sh $i; done
>>>
>>> Here are the performance numbers with and without the patch:
>>>
>>> mounts | before | after (real sec)
>>> -----------------------------
>>>   1024 |  0.071 | 0.024
>>>   2048 |  0.184 | 0.030
>>>   4096 |  0.604 | 0.040
>>>   8912 |  4.471 | 0.043
>>>  16384 | 34.826 | 0.082
>>>  32768 |        | 0.151
>>>  65536 |        | 0.289
>>> 131072 |        | 0.659
>>>
>>> Andrei Vagin reports fixing this performance problem is part of the
>>> work to fix CVE-2016-6213.
>>>
>>> Cc: stable-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>>> Reported-by: Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
>>> Signed-off-by: "Eric W. Biederman" <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
>>> ---
>>>
>>> Andrei, can you take a look at this patch and see if you can see any
>>> problems?  My limited testing suggests this approach does a much better
>>> job of solving the problem you were seeing, with the time now looking
>>> almost linear in the number of mounts.
>>
>> I read this patch and I like the idea.
>>
>> Then I ran my tests and one of them doesn't work with this patch.
>> I haven't found a reason yet.
>
>>> +     for (m = propagation_visit_next(parent, parent); m;
>>> +                     m = propagation_visit_next(m, parent)) {
>>>               struct mount *child = __lookup_mnt_last(&m->mnt,
>>>                                               mnt->mnt_mountpoint);
>
> The reason is that this loop is called for each "mnt" being unmounted,
> but with this optimization it is executed only once.
>
> So I think the idea of marking the parent will not work, because one
> parent can have several children which have to be unmounted.

Good catch.  So what needs to be marked is the parent mount and
mountpoint combination, which is effectively the child mount.

I still think replacing propagation_next and fixing the propagation
walk is the way to go.  But it sounds like, to make things work, the
__lookup_mnt_last call needs to be moved into the propagation walk function.

That doesn't feel too hard.  I will have to see what the code looks like.
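
A minimal sketch of the loop shape this implies, assuming a
propagation_visit_child() helper that returns each not-yet-visited
child in the propagation tree (the v2 patch later in this thread
fleshes out the actual helpers):

	struct mount *child;

	/* sketch only: visit each (parent, mountpoint) combination once
	 * by marking the child mount it resolves to, so siblings that
	 * share a parent are still each seen exactly once */
	for (child = propagation_visit_child(mnt, mnt); child;
	     child = propagation_visit_child(child, mnt)) {
		if (child->mnt.mnt_flags & MNT_UMOUNT)
			continue;
		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(child->mnt_parent))
			SET_MNT_MARK(child);
	}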

Eric

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [RFC][PATCH v2] mount: In mark_umount_candidates and __propogate_umount visit each mount once
  2016-10-14  2:45               ` Eric W. Biederman
@ 2016-10-14 18:29                   ` Eric W. Biederman
  -1 siblings, 0 replies; 36+ messages in thread
From: Eric W. Biederman @ 2016-10-14 18:29 UTC (permalink / raw)
  To: Andrey Vagin
  Cc: linux-fsdevel, LKML, Linux Containers, Andrei Vagin, Alexander Viro


Andrei Vagin pointed out that the time to execute propagate_umount can go
non-linear (and take a ludicrous amount of time) when the mount
propagation trees of the mounts to be unmounted by a lazy unmount
overlap.

Solve this in the most straightforward way possible, by adding a new
mount flag to mark parts of the mount propagation tree that have been
visited, and use that mark to skip parts of the mount propagation tree
that have already been visited during an unmount.  This guarantees
that each mountpoint in the possibly overlapping mount propagation
trees will be visited exactly once.

Add the functions propagation_visit_next and propagation_revisit_next
to coordinate setting and clearing the visited mount mark.

The skipping of already unmounted mounts has been moved from
__lookup_mnt_last to mark_umount_candidates, so that the new
propagation functions can record when the propagation tree
passes through the initial set of unmounted mounts.  Except in
umount_tree, as part of the unmounting process, the only place where
unmounted mounts should be found is in unmounted subtrees.  All of
the other callers of __lookup_mnt_last are from mounted subtrees, so
not checking for unmounted mounts should not affect them.

Here is a script to generate such mount tree:
$ cat run.sh
mount -t tmpfs test-mount /mnt
mount --make-shared /mnt
for i in `seq $1`; do
        mkdir /mnt/test.$i
        mount --bind /mnt /mnt/test.$i
done
cat /proc/mounts | grep test-mount | wc -l
time umount -l /mnt
$ for i in `seq 10 16`; do echo $i; unshare -Urm bash ./run.sh $i; done

Here are the performance numbers with and without the patch:

mhash  |  8192   |  8192  |  8192       | 131072 | 131072      | 104857 | 104857
mounts | before  | after  | after (sys) | after  | after (sys) |  after | after (sys)
-------------------------------------------------------------------------------------
  1024 |  0.071s | 0.023s | 0.008s      | 0.026s | 0.000s      | 0.024s | 0.008s
  2048 |  0.184s | 0.030s | 0.012s      | 0.035s | 0.008s      | 0.030s | 0.012s
  4096 |  0.604s | 0.047s | 0.012s      | 0.042s | 0.016s      | 0.032s | 0.016s
  8912 |  4.471s | 0.085s | 0.020s      | 0.059s | 0.059s      | 0.050s | 0.036s
 16384 | 34.826s | 0.105s | 0.092s      | 0.109s | 0.060s      | 0.087s | 0.068s
 32768 |         | 0.245s | 0.168s      | 0.192s | 0.144s      | 0.167s | 0.156s
 65536 |         | 0.833s | 0.716s      | 0.485s | 0.276s      | 0.468s | 0.316s
131072 |         | 4.628s | 4.108s      | 0.758s | 0.632s      | 0.736s | 0.612s

Andrei Vagin reports fixing this performance problem is part of the
work to fix CVE-2016-6213.

Cc: stable-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Reported-by: Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
Signed-off-by: "Eric W. Biederman" <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
---

I think this version is very close.  I had to modify __lookup_mnt_last
to not skip MNT_UMOUNT mounts or we would never see when the mount
propagation trees intersected.

This doesn't look as good as the previous buggy version, but it still
looks good.  When the hash table isn't getting full, the times look
pretty linear, so it may be necessary to do some hash table resizing.

That said, there remains one issue I need to think about some more.

In mark_umount_candidates I don't mark mounts that are locked to their
parent when their parent is not marked as a umount candidate.  Given
that we now skip processing mounts multiple times, a mount whose parent
only gets marked as unmountable after the first time we see that mount
might never get marked as unmountable at all.

Anyway, Andrei, if you could check this out and see if you can see
anything I missed, please let me know.

Eric

 fs/namespace.c        |   6 +--
 fs/pnode.c            | 147 ++++++++++++++++++++++++++++++++++++++++++++------
 fs/pnode.h            |   4 ++
 include/linux/mount.h |   2 +
 4 files changed, 138 insertions(+), 21 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index db1b5a38864e..1ca99fa2e0f4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -650,13 +650,11 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
 	p = __lookup_mnt(mnt, dentry);
 	if (!p)
 		goto out;
-	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
-		res = p;
+	res = p;
 	hlist_for_each_entry_continue(p, mnt_hash) {
 		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
 			break;
-		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
-			res = p;
+		res = p;
 	}
 out:
 	return res;
diff --git a/fs/pnode.c b/fs/pnode.c
index 234a9ac49958..025e3d9339b0 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -390,20 +390,137 @@ void propagate_mount_unlock(struct mount *mnt)
 }
 
 /*
+ * get the next mount in the propagation tree (that has not been visited)
+ * @last_child: the mount seen last
+ * @origin_child: the original mount from where the tree walk initiated
+ *
+ * Note that peer groups form contiguous segments of slave lists.
+ * We rely on that in get_source() to be able to find out if
+ * vfsmount found while iterating with propagation_next() is
+ * a peer of one we'd found earlier.
+ */
+static struct mount *propagation_visit_child(struct mount *last_child,
+					    struct mount *origin_child)
+{
+	struct mount *m = last_child->mnt_parent;
+	struct mount *origin = origin_child->mnt_parent;
+	struct dentry *mountpoint = origin_child->mnt_mountpoint;
+	struct mount *child;
+
+	/* Has this part of the propagation tree already been visited? */
+	if (IS_MNT_VISITED(last_child))
+		return NULL;
+
+	SET_MNT_VISITED(last_child);
+
+	/* are there any slaves of this mount? */
+	if (!list_empty(&m->mnt_slave_list)) {
+		m = first_slave(m);
+		goto check_slave;
+	}
+	while (1) {
+		struct mount *master = m->mnt_master;
+
+		if (master == origin->mnt_master) {
+			struct mount *next = next_peer(m);
+			while (1) {
+				if (next == origin)
+					return NULL;
+				child = __lookup_mnt_last(&next->mnt, mountpoint);
+				if (child && !IS_MNT_VISITED(child))
+					return child;
+				next = next_peer(next);
+			}
+		} else {
+			while (1) {
+				if (m->mnt_slave.next == &master->mnt_slave_list)
+					break;
+				m = next_slave(m);
+			check_slave:
+				child = __lookup_mnt_last(&m->mnt, mountpoint);
+				if (child && !IS_MNT_VISITED(child))
+					return child;
+			}
+		}
+
+		/* back at master */
+		m = master;
+	}
+}
+
+/*
+ * get the next mount in the propagation tree (that has not been revisited)
+ * @last_child: the mount seen last
+ * @origin_child: the original mount from where the tree walk initiated
+ *
+ * Note that peer groups form contiguous segments of slave lists.
+ * We rely on that in get_source() to be able to find out if
+ * vfsmount found while iterating with propagation_next() is
+ * a peer of one we'd found earlier.
+ */
+static struct mount *propagation_revisit_child(struct mount *last_child,
+					       struct mount *origin_child)
+{
+	struct mount *m = last_child->mnt_parent;
+	struct mount *origin = origin_child->mnt_parent;
+	struct dentry *mountpoint = origin_child->mnt_mountpoint;
+	struct mount *child;
+
+	/* Has this part of the propagation tree already been revisited? */
+	if (!IS_MNT_VISITED(last_child))
+		return NULL;
+
+	CLEAR_MNT_VISITED(last_child);
+
+	/* are there any slaves of this mount? */
+	if (!list_empty(&m->mnt_slave_list)) {
+		m = first_slave(m);
+		goto check_slave;
+	}
+	while (1) {
+		struct mount *master = m->mnt_master;
+
+		if (master == origin->mnt_master) {
+			struct mount *next = next_peer(m);
+			while (1) {
+				if (next == origin)
+					return NULL;
+				child = __lookup_mnt_last(&next->mnt, mountpoint);
+				if (child && IS_MNT_VISITED(child))
+					return child;
+				next = next_peer(next);
+			}
+		} else {
+			while (1) {
+				if (m->mnt_slave.next == &master->mnt_slave_list)
+					break;
+				m = next_slave(m);
+			check_slave:
+				child = __lookup_mnt_last(&m->mnt, mountpoint);
+				if (child && IS_MNT_VISITED(child))
+					return child;
+			}
+		}
+
+		/* back at master */
+		m = master;
+	}
+}
+
+/*
  * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
  */
 static void mark_umount_candidates(struct mount *mnt)
 {
-	struct mount *parent = mnt->mnt_parent;
-	struct mount *m;
+	struct mount *child;
 
-	BUG_ON(parent == mnt);
+	BUG_ON(mnt->mnt_parent == mnt);
 
-	for (m = propagation_next(parent, parent); m;
-			m = propagation_next(m, parent)) {
-		struct mount *child = __lookup_mnt_last(&m->mnt,
-						mnt->mnt_mountpoint);
-		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
+	for (child = propagation_visit_child(mnt, mnt); child;
+	     child = propagation_visit_child(child, mnt)) {
+		if (child->mnt.mnt_flags & MNT_UMOUNT)
+			continue;
+		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(child->mnt_parent)) {
 			SET_MNT_MARK(child);
 		}
 	}
@@ -415,21 +532,17 @@ static void mark_umount_candidates(struct mount *mnt)
  */
 static void __propagate_umount(struct mount *mnt)
 {
-	struct mount *parent = mnt->mnt_parent;
-	struct mount *m;
-
-	BUG_ON(parent == mnt);
+	struct mount *child;
 
-	for (m = propagation_next(parent, parent); m;
-			m = propagation_next(m, parent)) {
+	BUG_ON(mnt->mnt_parent == mnt);
 
-		struct mount *child = __lookup_mnt_last(&m->mnt,
-						mnt->mnt_mountpoint);
+	for (child = propagation_revisit_child(mnt, mnt); child;
+	     child = propagation_revisit_child(child, mnt)) {
 		/*
 		 * umount the child only if the child has no children
 		 * and the child is marked safe to unmount.
 		 */
-		if (!child || !IS_MNT_MARKED(child))
+		if (!IS_MNT_MARKED(child))
 			continue;
 		CLEAR_MNT_MARK(child);
 		if (list_empty(&child->mnt_mounts)) {
diff --git a/fs/pnode.h b/fs/pnode.h
index 550f5a8b4fcf..988ea4945764 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -21,6 +21,10 @@
 #define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
 #define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
 
+#define IS_MNT_VISITED(m) ((m)->mnt.mnt_flags & MNT_VISITED)
+#define SET_MNT_VISITED(m) ((m)->mnt.mnt_flags |= MNT_VISITED)
+#define CLEAR_MNT_VISITED(m) ((m)->mnt.mnt_flags &= ~MNT_VISITED)
+
 #define CL_EXPIRE    		0x01
 #define CL_SLAVE     		0x02
 #define CL_COPY_UNBINDABLE	0x04
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 1172cce949a4..773464f85f93 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -52,6 +52,8 @@ struct mnt_namespace;
 
 #define MNT_INTERNAL	0x4000
 
+#define MNT_VISITED		0x010000
+
 #define MNT_LOCK_ATIME		0x040000
 #define MNT_LOCK_NOEXEC		0x080000
 #define MNT_LOCK_NOSUID		0x100000
-- 
2.8.3

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH v2] mount: In mark_umount_candidates and __propogate_umount visit each mount once
  2016-10-14 18:29                   ` Eric W. Biederman
@ 2016-10-18  2:40                       ` Andrei Vagin
  -1 siblings, 0 replies; 36+ messages in thread
From: Andrei Vagin @ 2016-10-18  2:40 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-fsdevel, LKML, Linux Containers, Andrey Vagin, Alexander Viro

On Fri, Oct 14, 2016 at 01:29:18PM -0500, Eric W. Biederman wrote:
> 
> Andrei Vagin pointed out that the time to execute propagate_umount can go
> non-linear (and take a ludicrous amount of time) when the mount
> propagation trees of the mounts to be unmounted by a lazy unmount
> overlap.
> 
> Solve this in the most straightforward way possible, by adding a new
> mount flag to mark parts of the mount propagation tree that have been
> visited, and use that mark to skip parts of the mount propagation tree
> that have already been visited during an unmount.  This guarantees
> that each mountpoint in the possibly overlapping mount propagation
> trees will be visited exactly once.
> 
> Add the functions propagation_visit_next and propagation_revisit_next
> to coordinate setting and clearing the visited mount mark.
> 
> The skipping of already unmounted mounts has been moved from
> __lookup_mnt_last to mark_umount_candidates, so that the new
> propagation functions can record when the propagation tree
> passes through the initial set of unmounted mounts.  Except in
> umount_tree, as part of the unmounting process, the only place where
> unmounted mounts should be found is in unmounted subtrees.  All of
> the other callers of __lookup_mnt_last are from mounted subtrees, so
> not checking for unmounted mounts should not affect them.
> 
> Here is a script to generate such mount tree:
> $ cat run.sh
> mount -t tmpfs test-mount /mnt
> mount --make-shared /mnt
> for i in `seq $1`; do
>         mkdir /mnt/test.$i
>         mount --bind /mnt /mnt/test.$i
> done
> cat /proc/mounts | grep test-mount | wc -l
> time umount -l /mnt
> $ for i in `seq 10 16`; do echo $i; unshare -Urm bash ./run.sh $i; done
> 
> Here are the performance numbers with and without the patch:
> 
> mhash  |  8192   |  8192  |  8192       | 131072 | 131072      | 104857 | 104857
> mounts | before  | after  | after (sys) | after  | after (sys) |  after | after (sys)
> -------------------------------------------------------------------------------------
>   1024 |  0.071s | 0.023s | 0.008s      | 0.026s | 0.000s      | 0.024s | 0.008s
>   2048 |  0.184s | 0.030s | 0.012s      | 0.035s | 0.008s      | 0.030s | 0.012s
>   4096 |  0.604s | 0.047s | 0.012s      | 0.042s | 0.016s      | 0.032s | 0.016s
>   8912 |  4.471s | 0.085s | 0.020s      | 0.059s | 0.059s      | 0.050s | 0.036s
>  16384 | 34.826s | 0.105s | 0.092s      | 0.109s | 0.060s      | 0.087s | 0.068s
>  32768 |         | 0.245s | 0.168s      | 0.192s | 0.144s      | 0.167s | 0.156s
>  65536 |         | 0.833s | 0.716s      | 0.485s | 0.276s      | 0.468s | 0.316s
> 131072 |         | 4.628s | 4.108s      | 0.758s | 0.632s      | 0.736s | 0.612s
> 
> Andrei Vagin reports fixing this performance problem is part of the
> work to fix CVE-2016-6213.
> 
> Cc: stable-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Reported-by: Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
> Signed-off-by: "Eric W. Biederman" <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
> ---
> 
> I think this version is very close.  I had to modify __lookup_mnt_last
> to not skip MNT_UMOUNT mounts or we would never see when the mount
> propagation trees intersected.
> 
> This doesn't look as good as the previous buggy version, but it still
> looks good.  When the hash table isn't getting full, the times look
> pretty linear, so it may be necessary to do some hash table resizing.
> 
> That said, there remains one issue I need to think about some more.
> 
> In mark_umount_candidates I don't mark mounts that are locked to their
> parent when their parent is not marked as a umount candidate.  Given
> that we now skip processing mounts multiple times, a mount whose parent
> only gets marked as unmountable after the first time we see that mount
> might never get marked as unmountable at all.
> 
> Anyway, Andrei, if you could check this out and see if you can see
> anything I missed, please let me know.

I've tested this patch today and it works for me. The idea of this
patch looks good to me too. Thanks! There is one inline comment below.

> 
> Eric
> 
>  fs/namespace.c        |   6 +--
>  fs/pnode.c            | 147 ++++++++++++++++++++++++++++++++++++++++++++------
>  fs/pnode.h            |   4 ++
>  include/linux/mount.h |   2 +
>  4 files changed, 138 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index db1b5a38864e..1ca99fa2e0f4 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -650,13 +650,11 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
>  	p = __lookup_mnt(mnt, dentry);
>  	if (!p)
>  		goto out;
> -	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
> -		res = p;
> +	res = p;
>  	hlist_for_each_entry_continue(p, mnt_hash) {
>  		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
>  			break;
> -		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
> -			res = p;

__lookup_mnt_last is used in propagate_mount_busy and
attach_recursive_mnt. Should we do something to preserve the old
behaviour of these functions?
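
For example (a hedged sketch, not tested; the helper name
__lookup_mnt_last_mounted is made up), the old semantics could be kept
for those callers with a variant that still filters out MNT_UMOUNT
mounts, reusing the loop this patch removes:

/* Hypothetical helper restoring the pre-patch behaviour of
 * __lookup_mnt_last for callers such as propagate_mount_busy and
 * attach_recursive_mnt: return the last mount on (mnt, dentry) that
 * is not already going away (MNT_UMOUNT). */
static struct mount *__lookup_mnt_last_mounted(struct vfsmount *mnt,
					       struct dentry *dentry)
{
	struct mount *p, *res = NULL;

	p = __lookup_mnt(mnt, dentry);
	if (!p)
		return NULL;
	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
		res = p;
	hlist_for_each_entry_continue(p, mnt_hash) {
		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
			break;
		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
			res = p;
	}
	return res;
}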

> +		res = p;
>  	}
>  out:
>  	return res;
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 234a9ac49958..025e3d9339b0 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -390,20 +390,137 @@ void propagate_mount_unlock(struct mount *mnt)
>  }
>  
>  /*
> + * get the next mount in the propagation tree (that has not been visited)
> + * @last_child: the mount seen last
> + * @origin_child: the original mount from where the tree walk initiated
> + *
> + * Note that peer groups form contiguous segments of slave lists.
> + * We rely on that in get_source() to be able to find out if
> + * vfsmount found while iterating with propagation_next() is
> + * a peer of one we'd found earlier.
> + */
> +static struct mount *propagation_visit_child(struct mount *last_child,
> +					    struct mount *origin_child)
> +{
> +	struct mount *m = last_child->mnt_parent;
> +	struct mount *origin = origin_child->mnt_parent;
> +	struct dentry *mountpoint = origin_child->mnt_mountpoint;
> +	struct mount *child;
> +
> +	/* Has this part of the propagation tree already been visited? */
> +	if (IS_MNT_VISITED(last_child))
> +		return NULL;
> +
> +	SET_MNT_VISITED(last_child);
> +
> +	/* are there any slaves of this mount? */
> +	if (!list_empty(&m->mnt_slave_list)) {
> +		m = first_slave(m);
> +		goto check_slave;
> +	}
> +	while (1) {
> +		struct mount *master = m->mnt_master;
> +
> +		if (master == origin->mnt_master) {
> +			struct mount *next = next_peer(m);
> +			while (1) {
> +				if (next == origin)
> +					return NULL;
> +				child = __lookup_mnt_last(&next->mnt, mountpoint);
> +				if (child && !IS_MNT_VISITED(child))
> +					return child;
> +				next = next_peer(next);
> +			}
> +		} else {
> +			while (1) {
> +				if (m->mnt_slave.next == &master->mnt_slave_list)
> +					break;
> +				m = next_slave(m);
> +			check_slave:
> +				child = __lookup_mnt_last(&m->mnt, mountpoint);
> +				if (child && !IS_MNT_VISITED(child))
> +					return child;
> +			}
> +		}
> +
> +		/* back at master */
> +		m = master;
> +	}
> +}
> +
> +/*
> + * get the next mount in the propagation tree (that has not been revisited)
> + * @last_child: the mount seen last
> + * @origin_child: the original mount from where the tree walk initiated
> + *
> + * Note that peer groups form contiguous segments of slave lists.
> + * We rely on that in get_source() to be able to find out if
> + * vfsmount found while iterating with propagation_next() is
> + * a peer of one we'd found earlier.
> + */
> +static struct mount *propagation_revisit_child(struct mount *last_child,
> +					       struct mount *origin_child)
> +{
> +	struct mount *m = last_child->mnt_parent;
> +	struct mount *origin = origin_child->mnt_parent;
> +	struct dentry *mountpoint = origin_child->mnt_mountpoint;
> +	struct mount *child;
> +
> +	/* Has this part of the propagation tree already been revisited? */
> +	if (!IS_MNT_VISITED(last_child))
> +		return NULL;
> +
> +	CLEAR_MNT_VISITED(last_child);
> +
> +	/* are there any slaves of this mount? */
> +	if (!list_empty(&m->mnt_slave_list)) {
> +		m = first_slave(m);
> +		goto check_slave;
> +	}
> +	while (1) {
> +		struct mount *master = m->mnt_master;
> +
> +		if (master == origin->mnt_master) {
> +			struct mount *next = next_peer(m);
> +			while (1) {
> +				if (next == origin)
> +					return NULL;
> +				child = __lookup_mnt_last(&next->mnt, mountpoint);
> +				if (child && IS_MNT_VISITED(child))
> +					return child;
> +				next = next_peer(next);
> +			}
> +		} else {
> +			while (1) {
> +				if (m->mnt_slave.next == &master->mnt_slave_list)
> +					break;
> +				m = next_slave(m);
> +			check_slave:
> +				child = __lookup_mnt_last(&m->mnt, mountpoint);
> +				if (child && IS_MNT_VISITED(child))
> +					return child;
> +			}
> +		}
> +
> +		/* back at master */
> +		m = master;
> +	}
> +}
> +
> +/*
>   * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
>   */
>  static void mark_umount_candidates(struct mount *mnt)
>  {
> -	struct mount *parent = mnt->mnt_parent;
> -	struct mount *m;
> +	struct mount *child;
>  
> -	BUG_ON(parent == mnt);
> +	BUG_ON(mnt->mnt_parent == mnt);
>  
> -	for (m = propagation_next(parent, parent); m;
> -			m = propagation_next(m, parent)) {
> -		struct mount *child = __lookup_mnt_last(&m->mnt,
> -						mnt->mnt_mountpoint);
> -		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
> +	for (child = propagation_visit_child(mnt, mnt); child;
> +	     child = propagation_visit_child(child, mnt)) {
> +		if (child->mnt.mnt_flags & MNT_UMOUNT)
> +			continue;
> +		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(child->mnt_parent)) {
>  			SET_MNT_MARK(child);
>  		}
>  	}
> @@ -415,21 +532,17 @@ static void mark_umount_candidates(struct mount *mnt)
>   */
>  static void __propagate_umount(struct mount *mnt)
>  {
> -	struct mount *parent = mnt->mnt_parent;
> -	struct mount *m;
> -
> -	BUG_ON(parent == mnt);
> +	struct mount *child;
>  
> -	for (m = propagation_next(parent, parent); m;
> -			m = propagation_next(m, parent)) {
> +	BUG_ON(mnt->mnt_parent == mnt);
>  
> -		struct mount *child = __lookup_mnt_last(&m->mnt,
> -						mnt->mnt_mountpoint);
> +	for (child = propagation_revisit_child(mnt, mnt); child;
> +	     child = propagation_revisit_child(child, mnt)) {
>  		/*
>  		 * umount the child only if the child has no children
>  		 * and the child is marked safe to unmount.
>  		 */
> -		if (!child || !IS_MNT_MARKED(child))
> +		if (!IS_MNT_MARKED(child))
>  			continue;
>  		CLEAR_MNT_MARK(child);
>  		if (list_empty(&child->mnt_mounts)) {
> diff --git a/fs/pnode.h b/fs/pnode.h
> index 550f5a8b4fcf..988ea4945764 100644
> --- a/fs/pnode.h
> +++ b/fs/pnode.h
> @@ -21,6 +21,10 @@
>  #define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
>  #define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
>  
> +#define IS_MNT_VISITED(m) ((m)->mnt.mnt_flags & MNT_VISITED)
> +#define SET_MNT_VISITED(m) ((m)->mnt.mnt_flags |= MNT_VISITED)
> +#define CLEAR_MNT_VISITED(m) ((m)->mnt.mnt_flags &= ~MNT_VISITED)
> +
>  #define CL_EXPIRE    		0x01
>  #define CL_SLAVE     		0x02
>  #define CL_COPY_UNBINDABLE	0x04
> diff --git a/include/linux/mount.h b/include/linux/mount.h
> index 1172cce949a4..773464f85f93 100644
> --- a/include/linux/mount.h
> +++ b/include/linux/mount.h
> @@ -52,6 +52,8 @@ struct mnt_namespace;
>  
>  #define MNT_INTERNAL	0x4000
>  
> +#define MNT_VISITED		0x010000
> +
>  #define MNT_LOCK_ATIME		0x040000
>  #define MNT_LOCK_NOEXEC		0x080000
>  #define MNT_LOCK_NOSUID		0x100000
> -- 
> 2.8.3
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH v2] mount: In mark_umount_candidates and __propagate_umount visit each mount once
  2016-10-18  2:40                       ` Andrei Vagin
@ 2016-10-18  6:49                           ` Eric W. Biederman
  -1 siblings, 0 replies; 36+ messages in thread
From: Eric W. Biederman @ 2016-10-18  6:49 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: linux-fsdevel, LKML, Linux Containers, Andrey Vagin, Alexander Viro

Andrei Vagin <avagin-5HdwGun5lf+gSpxsJD1C4w@public.gmane.org> writes:

> On Fri, Oct 14, 2016 at 01:29:18PM -0500, Eric W. Biederman wrote:
>> 
>> Andrei Vagin pointed out that the time to execute propagate_umount
>> can go non-linear (and take a ludicrous amount of time) when the mount
>> propagation trees of the mounts to be unmounted by a lazy unmount
>> overlap.
>> 
>> Solve this in the most straightforward way possible, by adding a new
>> mount flag to mark parts of the mount propagation tree that have been
>> visited, and use that mark to skip parts of the mount propagation tree
>> that have already been visited during an unmount.  This guarantees
>> that each mountpoint in the possibly overlapping mount propagation
>> trees will be visited exactly once.
>> 
>> Add the functions propagation_visit_child and propagation_revisit_child
>> to coordinate setting and clearing the visited mount mark.
>> 
>> The skipping of already unmounted mounts has been moved from
>> __lookup_mnt_last to mark_umount_candidates, so that the new
>> propagation functions can notice when the propagation tree
>> passes through the initial set of unmounted mounts.  Except in
>> umount_tree as part of the unmounting process, the only place where
>> unmounted mounts should be found is in unmounted subtrees.  All of
>> the other callers of __lookup_mnt_last are from mounted subtrees, so
>> not checking for unmounted mounts should not affect them.
>> 
>> Here is a script to generate such mount tree:
>> $ cat run.sh
>> mount -t tmpfs test-mount /mnt
>> mount --make-shared /mnt
>> for i in `seq $1`; do
>>         mkdir /mnt/test.$i
>>         mount --bind /mnt /mnt/test.$i
>> done
>> cat /proc/mounts | grep test-mount | wc -l
>> time umount -l /mnt
>> $ for i in `seq 10 16`; do echo $i; unshare -Urm bash ./run.sh $i; done
>> 
>> Here are the performance numbers with and without the patch:
>> 
>> mhash  |  8192   |  8192  |  8192       | 131072 | 131072      | 104857 | 104857
>> mounts | before  | after  | after (sys) | after  | after (sys) |  after | after (sys)
>> -------------------------------------------------------------------------------------
>>   1024 |  0.071s | 0.023s | 0.008s      | 0.026s | 0.000s      | 0.024s | 0.008s
>>   2048 |  0.184s | 0.030s | 0.012s      | 0.035s | 0.008s      | 0.030s | 0.012s
>>   4096 |  0.604s | 0.047s | 0.012s      | 0.042s | 0.016s      | 0.032s | 0.016s
>>   8192 |  4.471s | 0.085s | 0.020s      | 0.059s | 0.059s      | 0.050s | 0.036s
>>  16384 | 34.826s | 0.105s | 0.092s      | 0.109s | 0.060s      | 0.087s | 0.068s
>>  32768 |         | 0.245s | 0.168s      | 0.192s | 0.144s      | 0.167s | 0.156s
>>  65536 |         | 0.833s | 0.716s      | 0.485s | 0.276s      | 0.468s | 0.316s
>> 131072 |         | 4.628s | 4.108s      | 0.758s | 0.632s      | 0.736s | 0.612s
>> 
>> Andrei Vagin reports that fixing this performance problem is part of the
>> work to fix CVE-2016-6213.
>> 
>> Cc: stable-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>> Reported-by: Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
>> Signed-off-by: "Eric W. Biederman" <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
>> ---
>> 
>> I think this version is very close.  I had to modify __lookup_mnt_last
>> to not skip MNT_UMOUNT or we would never see when the mount
>> propagation trees intersected.
>> 
>> This doesn't look as good as the previous buggy version but it looks
>> good.  When the hash table isn't getting full the times look pretty
>> linear.  So it may be necessary to do some hash table resizing.
>> 
>> That said there remains one issue I need to think about some more.
>> 
>> In mark_umount_candidates I don't mark mounts that are locked to their
>> parent when their parent is not marked as a umount candidate.  Given
>> that we skip processing mounts multiple times, a mount whose parent
>> only gets marked as unmountable after the first time we see the mount
>> might never get marked as unmountable itself.

Unfortunately my fears are borne out, as demonstrated by the script
below.
    $ cat pathology.sh
    #!/bin/sh
    set -e
    set -x
    
    mount -t tmpfs base /mnt
    mount --make-shared /mnt
    mkdir -p /mnt/b
    
    mount -t tmpfs test1 /mnt/b
    mount --make-shared /mnt/b
    mkdir -p /mnt/b/10
    
    mount -t tmpfs test2 /mnt/b/10
    mount --make-shared /mnt/b/10
    mkdir -p /mnt/b/10/20
    
    mount --rbind /mnt/b /mnt/b/10/20
    
    cat /proc/self/mountinfo
    ls /mnt /mnt/b /mnt/b/10  /mnt/b/10/20  /mnt/b/10/20/10  /mnt/b/10/20/10/20 || true
    
    unshare -Urm --propagation unchanged /bin/bash -c 'cat /proc/self/mountinfo; sleep 5; ls /mnt /mnt/b /mnt/b/10 /mnt/b/10/20 /mnt/b/10/20/10 \
    /mnt/b/10/20/10/20 || true; cat /proc/self/mountinfo' &
    sleep 1
    umount -l /mnt/b/
    wait %%
    $ unshare -Urm ./pathology.sh
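
For reference, my reading of the propagation topology the script builds
(a sketch, not program output):

    /mnt              tmpfs "base",  shared
    `-- /mnt/b        tmpfs "test1", shared
        `-- /mnt/b/10 tmpfs "test2", shared
            `-- /mnt/b/10/20   rbind copy of /mnt/b, whose mounts are
                               peers of test1/test2, so the propagation
                               tree of /mnt/b overlaps itself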


>> Anyway Andrei if you could check this out and see if you can see
>> anything I missed please let me know.
>
> I've tested this patch today and it works for me. The idea of this patch
> looks good to me too. Thanks! There is one inline comment.

It is definitely close but there is an ordering problem (see above)
that needs some more attention.  I have just finished building
myself a reproducer and am going to go sleep on it.

The little script above demonstrates that the locked mount handling
(of preventing umounts) is too conservative today, and is even worse
with these changes.

Even worse, locked mounts are not even necessary for a single pass
through the propagation tree to fail to unmount everything.  My script
above demonstrates one such topology where there will be problems.

Now, that bug already exists today, so I don't expect this change makes
anything practically worse.  But I would really like to know if it is
possible to do better before we merge this change.
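
To make the hazard concrete, here is a minimal sketch (mine, not a
quote from the patch) of the marking rule from the v2 patch quoted
earlier in the thread, which becomes order dependent once every mount
is visited only once:

	/* The marking rule from mark_umount_candidates() in v2. */
	if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(child->mnt_parent))
		SET_MNT_MARK(child);
	/*
	 * With visit-once semantics, whether a locked child is marked
	 * depends on whether its parent was already marked when the
	 * child was first (and only) visited.  A parent that is marked
	 * later in the walk leaves the child unmarked forever.
	 */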

>>  fs/namespace.c        |   6 +--
>>  fs/pnode.c            | 147 ++++++++++++++++++++++++++++++++++++++++++++------
>>  fs/pnode.h            |   4 ++
>>  include/linux/mount.h |   2 +
>>  4 files changed, 138 insertions(+), 21 deletions(-)
>> 
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index db1b5a38864e..1ca99fa2e0f4 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -650,13 +650,11 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
>>  	p = __lookup_mnt(mnt, dentry);
>>  	if (!p)
>>  		goto out;
>> -	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
>> -		res = p;
>> +	res = p;
>>  	hlist_for_each_entry_continue(p, mnt_hash) {
>>  		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
>>  			break;
>> -		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
>> -			res = p;
>
> __lookup_mnt_last is used in propagate_mount_busy and
> attach_recursive_mnt. Should we do smth to save old
> behaviour of these functions.

Reasonable question. I am actually reverting __lookup_mnt_last to a
fairly recent behavior.   I added the MNT_UMOUNT test when I started
leaving things in the hash table to keep lazy unmounts from having an
information disclosure issue.

Mounts with MNT_UMOUNT will only be seen connected to mounted mounts
during propagate_umount.  attach_recursive_mnt has no chance of
seeing that condition, and propagate_mount_busy is called before
propagate_umount.  Similarly propagate_mount_unlock is also called before any
mounts get into a visible halfway unmounted state.
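
To summarize the ordering being relied on (my sketch, based on the call
sites described above):

	/*
	 * Rough call order during an unmount:
	 *   propagate_mount_busy()    - no hashed mount is MNT_UMOUNT yet
	 *   propagate_mount_unlock()  - likewise
	 *   propagate_umount()        - the only walk that can encounter
	 *                               MNT_UMOUNT mounts left hashed
	 *
	 * attach_recursive_mnt() operates on mounted subtrees only, so
	 * the reverted __lookup_mnt_last() is safe for all callers.
	 */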

So no.  I don't see any reason to preserve the extra MNT_UMOUNT test.

Eric

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [REVIEW][PATCH] mount: In propagate_umount handle overlapping mount propagation trees
       [not found]                           ` <87r37e9mnj.fsf-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
@ 2016-10-19  3:46                             ` Eric W. Biederman
  0 siblings, 0 replies; 36+ messages in thread
From: Eric W. Biederman @ 2016-10-19  3:46 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: linux-fsdevel, LKML, Linux Containers, Andrey Vagin, Alexander Viro


Andrei Vagin pointed out that the time to execute propagate_umount can
go non-linear (and take a ludicrous amount of time) when the mount
propagation trees of the mounts to be unmounted by a lazy unmount
overlap.

While investigating the horrible performance I realized that in
the case of overlapping mount trees, since the addition of locked
mount support, the code has been failing to unmount all of the
mounts it should have been unmounting.

Make the walk of the mount propagation trees nearly linear by using
MNT_MARK to mark pieces of the mount propagation trees that have
already been visited, allowing subsequent walks to skip over
subtrees.

Make the processing of mounts order independent by adding a list of
mount entries that need to be unmounted, and simply adding a mount to
that list when it becomes apparent the mount can safely be unmounted.
For mounts that are locked on other mounts but otherwise could be
unmounted, move them from their parent's mnt_mounts to mnt_umounts so
that if and when their parent becomes unmounted these mounts can be
added to the list of mounts to unmount.

Add a final pass to clear MNT_MARK and to restore mnt_mounts
from mnt_umounts for anything that did not get unmounted.

Add the functions propagation_visit_child and propagation_revisit_child
to coordinate walking of the mount tree and setting and clearing the
mount mark.
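
In outline the new propagate_umount makes three passes (my sketch of
the control flow; the real code is in the diff below):

	/* Pass 1: visit each propagation tree once, parking children
	 * on to_umount, or on their parent's mnt_umounts if locked. */
	for each mnt on list:
		for (child = propagation_visit_child(mnt, mnt); child;
		     child = propagation_visit_child(child, mnt))
			start_umount_propagation(child, &to_umount);

	/* Pass 2: commit the unmounts; a mount leaving to_umount
	 * releases any children parked on its mnt_umounts. */
	while to_umount is non-empty:
		move the first entry to tmp_list, setting MNT_UMOUNT;
		splice its mnt_umounts onto to_umount;

	/* Pass 3: clear the marks and restore mnt_mounts from
	 * mnt_umounts for anything that was not unmounted. */
	for each mnt on list:
		for (child = propagation_revisit_child(mnt, mnt); child;
		     child = propagation_revisit_child(child, mnt))
			end_umount_propagation(child);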

The skipping of already unmounted mounts has been moved from
__lookup_mnt_last to mark_umount_candidates, so that the new
propagation functions can notice when the propagation tree passes
through the initial set of unmounted mounts.  Except in umount_tree as
part of the unmounting process, the only place where unmounted mounts
should be found is in unmounted subtrees.  All of the other callers
of __lookup_mnt_last are from mounted subtrees, so not checking for
unmounted mounts should not affect them.

A script to generate overlapping mount propagation trees:
$ cat run.sh
mount -t tmpfs test-mount /mnt
mount --make-shared /mnt
for i in `seq $1`; do
        mkdir /mnt/test.$i
        mount --bind /mnt /mnt/test.$i
done
cat /proc/mounts | grep test-mount | wc -l
time umount -l /mnt
$ for i in `seq 10 16`; do echo $i; unshare -Urm bash ./run.sh $i; done

Here are the performance numbers with and without the patch:

mhash  |  8192   |  8192  |  8192       | 131072 | 131072      | 104857 | 104857
mounts | before  | after  | after (sys) | after  | after (sys) |  after | after (sys)
-------------------------------------------------------------------------------------
  1024 |  0.071s | 0.020s | 0.000s      | 0.022s | 0.004s      | 0.020s | 0.004s
  2048 |  0.184s | 0.022s | 0.004s      | 0.023s | 0.004s      | 0.022s | 0.008s
  4096 |  0.604s | 0.025s | 0.020s      | 0.029s | 0.008s      | 0.026s | 0.004s
  8192 |  4.471s | 0.053s | 0.020s      | 0.051s | 0.024s      | 0.047s | 0.016s
 16384 | 34.826s | 0.088s | 0.060s      | 0.081s | 0.048s      | 0.082s | 0.052s
 32768 |         | 0.216s | 0.172s      | 0.160s | 0.124s      | 0.160s | 0.096s
 65536 |         | 0.819s | 0.726s      | 0.330s | 0.260s      | 0.338s | 0.256s
131072 |         | 4.502s | 4.168s      | 0.707s | 0.580s      | 0.709s | 0.592s
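
The mhash column is the mount hash table size.  As a sketch (assuming
the table was sized with the standard mhash_entries= boot parameter):

# kernel command line addition used to grow the mount hash table
mhash_entries=131072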

Andrei Vagin reports that fixing the performance problem is part of the
work to fix CVE-2016-6213.

A script for a pathological set of mounts:

$ cat pathological.sh

mount -t tmpfs base /mnt
mount --make-shared /mnt
mkdir -p /mnt/b

mount -t tmpfs test1 /mnt/b
mount --make-shared /mnt/b
mkdir -p /mnt/b/10

mount -t tmpfs test2 /mnt/b/10
mount --make-shared /mnt/b/10
mkdir -p /mnt/b/10/20

mount --rbind /mnt/b /mnt/b/10/20

unshare -Urm sleep 2 &
umount -l /mnt/b
wait %%

$ unshare -Urm pathological.sh

Cc: stable-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
Fixes: 0c56fe31420c ("mnt: Don't propagate unmounts to locked mounts")
Reported-by: Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
Signed-off-by: "Eric W. Biederman" <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
---

Barring some stupid mistake this looks like it fixes both the performance
and the correctness issues I was able to spot earlier.  Andrei, if you
could give this version a look over, I would appreciate it.

Unless we can find a problem I am going to call this the final version.

 fs/mount.h     |   1 +
 fs/namespace.c |   7 +-
 fs/pnode.c     | 198 ++++++++++++++++++++++++++++++++++++++++++++++-----------
 fs/pnode.h     |   2 +-
 4 files changed, 165 insertions(+), 43 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index d2e25d7b64b3..00fe0d1d6ba7 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -58,6 +58,7 @@ struct mount {
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
 	struct mountpoint *mnt_mp;	/* where is it mounted */
 	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
+	struct list_head mnt_umounts;	/* list of children that are being unmounted */
 #ifdef CONFIG_FSNOTIFY
 	struct hlist_head mnt_fsnotify_marks;
 	__u32 mnt_fsnotify_mask;
diff --git a/fs/namespace.c b/fs/namespace.c
index e6c234b1a645..73801391bb00 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -237,6 +237,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
+		INIT_LIST_HEAD(&mnt->mnt_umounts);
 #ifdef CONFIG_FSNOTIFY
 		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
 #endif
@@ -650,13 +651,11 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
 	p = __lookup_mnt(mnt, dentry);
 	if (!p)
 		goto out;
-	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
-		res = p;
+	res = p;
 	hlist_for_each_entry_continue(p, mnt_hash) {
 		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
 			break;
-		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
-			res = p;
+		res = p;
 	}
 out:
 	return res;
diff --git a/fs/pnode.c b/fs/pnode.c
index 234a9ac49958..15e30e861a14 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -390,56 +390,153 @@ void propagate_mount_unlock(struct mount *mnt)
 }
 
 /*
- * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
+ * get the next mount in the propagation tree (that has not been visited)
+ * @last_child: the child mount seen last
+ * @origin_child: the original mount from where the tree walk initiated
+ *
+ * Note that peer groups form contiguous segments of slave lists.
+ * We rely on that in get_source() to be able to find out if
+ * vfsmount found while iterating with propagation_next() is
+ * a peer of one we'd found earlier.
  */
-static void mark_umount_candidates(struct mount *mnt)
+static struct mount *propagation_visit_child(struct mount *last_child,
+					    struct mount *origin_child)
 {
-	struct mount *parent = mnt->mnt_parent;
-	struct mount *m;
+	struct mount *m = last_child->mnt_parent;
+	struct mount *origin = origin_child->mnt_parent;
+	struct dentry *mountpoint = origin_child->mnt_mountpoint;
+	struct mount *child;
 
-	BUG_ON(parent == mnt);
+	/* Has this part of the propagation tree already been visited? */
+	if (IS_MNT_MARKED(last_child))
+		return NULL;
 
-	for (m = propagation_next(parent, parent); m;
-			m = propagation_next(m, parent)) {
-		struct mount *child = __lookup_mnt_last(&m->mnt,
-						mnt->mnt_mountpoint);
-		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
-			SET_MNT_MARK(child);
+	SET_MNT_MARK(last_child);
+
+	/* are there any slaves of this mount? */
+	if (!list_empty(&m->mnt_slave_list)) {
+		m = first_slave(m);
+		goto check_slave;
+	}
+	while (1) {
+		struct mount *master = m->mnt_master;
+
+		if (master == origin->mnt_master) {
+			struct mount *next = next_peer(m);
+			while (1) {
+				if (next == origin)
+					return NULL;
+				child = __lookup_mnt_last(&next->mnt, mountpoint);
+				if (child && !IS_MNT_MARKED(child))
+					return child;
+				next = next_peer(next);
+			}
+		} else {
+			while (1) {
+				if (m->mnt_slave.next == &master->mnt_slave_list)
+					break;
+				m = next_slave(m);
+			check_slave:
+				child = __lookup_mnt_last(&m->mnt, mountpoint);
+				if (child && !IS_MNT_MARKED(child))
+					return child;
+			}
 		}
+
+		/* back at master */
+		m = master;
 	}
 }
 
 /*
- * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
- * parent propagates to.
+ * get the next mount in the propagation tree (that has not been revisited)
+ * @last_child: the child mount seen last
+ * @origin_child: the original mount from where the tree walk initiated
+ *
+ * Note that peer groups form contiguous segments of slave lists.
+ * We rely on that in get_source() to be able to find out if
+ * vfsmount found while iterating with propagation_next() is
+ * a peer of one we'd found earlier.
  */
-static void __propagate_umount(struct mount *mnt)
+static struct mount *propagation_revisit_child(struct mount *last_child,
+					       struct mount *origin_child)
 {
-	struct mount *parent = mnt->mnt_parent;
-	struct mount *m;
+	struct mount *m = last_child->mnt_parent;
+	struct mount *origin = origin_child->mnt_parent;
+	struct dentry *mountpoint = origin_child->mnt_mountpoint;
+	struct mount *child;
 
-	BUG_ON(parent == mnt);
+	/* Has this part of the propagation tree already been revisited? */
+	if (!IS_MNT_MARKED(last_child))
+		return NULL;
 
-	for (m = propagation_next(parent, parent); m;
-			m = propagation_next(m, parent)) {
+	CLEAR_MNT_MARK(last_child);
 
-		struct mount *child = __lookup_mnt_last(&m->mnt,
-						mnt->mnt_mountpoint);
-		/*
-		 * umount the child only if the child has no children
-		 * and the child is marked safe to unmount.
-		 */
-		if (!child || !IS_MNT_MARKED(child))
-			continue;
-		CLEAR_MNT_MARK(child);
-		if (list_empty(&child->mnt_mounts)) {
-			list_del_init(&child->mnt_child);
-			child->mnt.mnt_flags |= MNT_UMOUNT;
-			list_move_tail(&child->mnt_list, &mnt->mnt_list);
+	/* are there any slaves of this mount? */
+	if (!list_empty(&m->mnt_slave_list)) {
+		m = first_slave(m);
+		goto check_slave;
+	}
+	while (1) {
+		struct mount *master = m->mnt_master;
+
+		if (master == origin->mnt_master) {
+			struct mount *next = next_peer(m);
+			while (1) {
+				if (next == origin)
+					return NULL;
+				child = __lookup_mnt_last(&next->mnt, mountpoint);
+				if (child && IS_MNT_MARKED(child))
+					return child;
+				next = next_peer(next);
+			}
+		} else {
+			while (1) {
+				if (m->mnt_slave.next == &master->mnt_slave_list)
+					break;
+				m = next_slave(m);
+			check_slave:
+				child = __lookup_mnt_last(&m->mnt, mountpoint);
+				if (child && IS_MNT_MARKED(child))
+					return child;
+			}
 		}
+
+		/* back at master */
+		m = master;
 	}
 }
 
+static void start_umount_propagation(struct mount *child,
+				     struct list_head *to_umount)
+{
+	do {
+		struct mount *parent = child->mnt_parent;
+
+		if ((child->mnt.mnt_flags & MNT_UMOUNT) ||
+		    !list_empty(&child->mnt_mounts))
+			return;
+
+		if (!IS_MNT_LOCKED(child))
+			list_move_tail(&child->mnt_child, to_umount);
+		else
+			list_move_tail(&child->mnt_child, &parent->mnt_umounts);
+
+		child = NULL;
+		if (IS_MNT_MARKED(parent))
+			child = parent;
+	} while (child);
+}
+
+static void end_umount_propagation(struct mount *child)
+{
+	struct mount *parent = child->mnt_parent;
+
+	if (!list_empty(&parent->mnt_umounts))
+		list_splice_tail_init(&parent->mnt_umounts, &parent->mnt_mounts);
+}
+
+
 /*
  * collect all mounts that receive propagation from the mount in @list,
  * and return these additional mounts in the same list.
@@ -447,14 +544,39 @@ static void __propagate_umount(struct mount *mnt)
  *
  * vfsmount lock must be held for write
  */
-int propagate_umount(struct list_head *list)
+void propagate_umount(struct list_head *list)
 {
 	struct mount *mnt;
+	LIST_HEAD(to_umount);
+	LIST_HEAD(tmp_list);
+
+	/* Find candidates for unmounting */
+	list_for_each_entry(mnt, list, mnt_list) {
+		struct mount *child;
+		for (child = propagation_visit_child(mnt, mnt); child;
+		     child = propagation_visit_child(child, mnt))
+			start_umount_propagation(child, &to_umount);
+	}
 
-	list_for_each_entry_reverse(mnt, list, mnt_list)
-		mark_umount_candidates(mnt);
+	/* Begin unmounting */
+	while (!list_empty(&to_umount)) {
+		mnt = list_first_entry(&to_umount, struct mount, mnt_child);
 
-	list_for_each_entry(mnt, list, mnt_list)
-		__propagate_umount(mnt);
-	return 0;
+		list_del_init(&mnt->mnt_child);
+		mnt->mnt.mnt_flags |= MNT_UMOUNT;
+		list_move_tail(&mnt->mnt_list, &tmp_list);
+
+		if (!list_empty(&mnt->mnt_umounts))
+			list_splice_tail_init(&mnt->mnt_umounts, &to_umount);
+	}
+
+	/* Cleanup the mount propagation tree */
+	list_for_each_entry(mnt, list, mnt_list) {
+		struct mount *child;
+		for (child = propagation_revisit_child(mnt, mnt); child;
+		     child = propagation_revisit_child(child, mnt))
+			end_umount_propagation(child);
+	}
+
+	list_splice_tail(&tmp_list, list);
 }
diff --git a/fs/pnode.h b/fs/pnode.h
index 550f5a8b4fcf..38c6cdb96b34 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -41,7 +41,7 @@ static inline void set_mnt_shared(struct mount *mnt)
 void change_mnt_propagation(struct mount *, int);
 int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
 		struct hlist_head *);
-int propagate_umount(struct list_head *);
+void propagate_umount(struct list_head *);
 int propagate_mount_busy(struct mount *, int);
 void propagate_mount_unlock(struct mount *);
 void mnt_release_group_id(struct mount *);
-- 
2.10.1

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [REVIEW][PATCH] mount: In propagate_umount handle overlapping mount propagation trees
  2016-10-19  3:46                           ` [REVIEW][PATCH] mount: In propagate_umount handle overlapping mount propagation trees Eric W. Biederman
@ 2016-10-20 21:30                                 ` Andrei Vagin
  0 siblings, 0 replies; 36+ messages in thread
From: Andrei Vagin @ 2016-10-20 21:30 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-fsdevel, LKML, Linux Containers, Andrey Vagin, Alexander Viro

On Tue, Oct 18, 2016 at 10:46:40PM -0500, Eric W. Biederman wrote:
> 
> Andrei Vagin pointed out that the time to execute propagate_umount can
> go non-linear (and take a ludicrous amount of time) when the mount
> propagation trees of the mounts to be unmounted by a lazy unmount
> overlap.
> 
> While investigating the horrible performance I realized that in
> the case of overlapping mount trees, since the addition of locked
> mount support, the code has been failing to unmount all of the
> mounts it should have been unmounting.
> 
> Make the walk of the mount propagation trees nearly linear by using
> MNT_MARK to mark pieces of the mount propagation trees that have
> already been visited, allowing subsequent walks to skip over
> subtrees.
> 
> Make the processing of mounts order independent by adding a list of
> mount entries that need to be unmounted, and simply adding a mount to
> that list when it becomes apparent the mount can safely be unmounted.
> For mounts that are locked on other mounts but otherwise could be
> unmounted, move them from their parent's mnt_mounts to mnt_umounts so
> that if and when their parent becomes unmounted these mounts can be
> added to the list of mounts to unmount.
> 
> Add a final pass to clear MNT_MARK and to restore mnt_mounts
> from mnt_umounts for anything that did not get unmounted.
> 
> Add the functions propagation_visit_child and propagation_revisit_child
> to coordinate walking of the mount tree and setting and clearing the
> mount mark.
> 
> The skipping of already unmounted mounts has been moved from
> __lookup_mnt_last to mark_umount_candidates, so that the new
> propagation functions can notice when the propagation tree passes
> through the initial set of unmounted mounts.  Except in umount_tree as
> part of the unmounting process, the only place where unmounted mounts
> should be found is in unmounted subtrees.  All of the other callers
> of __lookup_mnt_last are from mounted subtrees, so not checking for
> unmounted mounts should not affect them.
> 
> A script to generate overlapping mount propagation trees:
> $ cat run.sh
> mount -t tmpfs test-mount /mnt
> mount --make-shared /mnt
> for i in `seq $1`; do
>         mkdir /mnt/test.$i
>         mount --bind /mnt /mnt/test.$i
> done
> cat /proc/mounts | grep test-mount | wc -l
> time umount -l /mnt
> $ for i in `seq 10 16`; do echo $i; unshare -Urm bash ./run.sh $i; done
> 
> Here are the performance numbers with and without the patch:
> 
> mhash  |  8192   |  8192  |  8192       | 131072 | 131072      | 104857 | 104857
> mounts | before  | after  | after (sys) | after  | after (sys) |  after | after (sys)
> -------------------------------------------------------------------------------------
>   1024 |  0.071s | 0.020s | 0.000s      | 0.022s | 0.004s      | 0.020s | 0.004s
>   2048 |  0.184s | 0.022s | 0.004s      | 0.023s | 0.004s      | 0.022s | 0.008s
>   4096 |  0.604s | 0.025s | 0.020s      | 0.029s | 0.008s      | 0.026s | 0.004s
>   8192 |  4.471s | 0.053s | 0.020s      | 0.051s | 0.024s      | 0.047s | 0.016s
>  16384 | 34.826s | 0.088s | 0.060s      | 0.081s | 0.048s      | 0.082s | 0.052s
>  32768 |         | 0.216s | 0.172s      | 0.160s | 0.124s      | 0.160s | 0.096s
>  65536 |         | 0.819s | 0.726s      | 0.330s | 0.260s      | 0.338s | 0.256s
> 131072 |         | 4.502s | 4.168s      | 0.707s | 0.580s      | 0.709s | 0.592s
> 
> Andrei Vagin reports that fixing the performance problem is part of the
> work to fix CVE-2016-6213.
> 
> A script for a pathological set of mounts:
> 
> $ cat pathological.sh
> 
> mount -t tmpfs base /mnt
> mount --make-shared /mnt
> mkdir -p /mnt/b
> 
> mount -t tmpfs test1 /mnt/b
> mount --make-shared /mnt/b
> mkdir -p /mnt/b/10
> 
> mount -t tmpfs test2 /mnt/b/10
> mount --make-shared /mnt/b/10
> mkdir -p /mnt/b/10/20
> 
> mount --rbind /mnt/b /mnt/b/10/20
> 
> unshare -Urm sleep 2 &
> umount -l /mnt/b
> wait %%
> 
> $ unshare -Urm pathological.sh
> 
> Cc: stable-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
> Fixes: 0c56fe31420c ("mnt: Don't propagate unmounts to locked mounts")
> Reported-by: Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
> Signed-off-by: "Eric W. Biederman" <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
> ---
> 
> Barring some stupid mistake this looks like it fixes both the performance
> and the correctness issues I was able to spot earlier.  Andrei, if you
> could give this version a look over, I would appreciate it.

Eric, could you try out this script:

[root@fc24 mounts]# cat run.sh 
set -e -m

mount -t tmpfs zdtm /mnt
mkdir -p /mnt/1 /mnt/2
mount -t tmpfs zdtm /mnt/1
mount --make-shared /mnt/1
mkdir /mnt/1/1

iteration=30
if [ -n "$1" ]; then
	iteration=$1
fi

for i in `seq $iteration`; do
	mount --bind /mnt/1/1 /mnt/1/1 &
done

ret=0
for i in `seq $iteration`; do
	wait -n || ret=1
done

[ "$ret" -ne 0 ] && {
	time umount -l /mnt/1
	exit 0
}

mount --rbind /mnt/1 /mnt/2
mount --make-slave /mnt/2
mount -t tmpfs zzz /mnt/2/1

nr=`cat /proc/self/mountinfo | grep zdtm | wc -l`
echo -n "umount -l /mnt/1 -> $nr	"
/usr/bin/time -f '%E' umount -l /mnt/1

nr=`cat /proc/self/mountinfo | grep zdtm | wc -l`
echo -n "umount -l /mnt/2 -> $nr	"
/usr/bin/time -f '%E' umount -l /mnt/2

[root@fc24 mounts]# unshare -Urm sh run.sh 4

It hangs on my host with this patch.

NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [umount:789]
Modules linked in: nfsv3 nfs fscache bridge stp llc ebtable_filter
ebtables ip6table_filter ip6_tables ppdev crct10dif_pclmul crc32_pclmul
ghash_clmulni_intel virtio_balloon i2c_piix4 parport_pc parport
acpi_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc binfmt_misc
virtio_net virtio_blk virtio_console crc32c_intel serio_raw virtio_pci
virtio_ring virtio ata_generic pata_acpi
CPU: 0 PID: 789 Comm: umount Not tainted 4.9.0-rc1+ #137
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.1-1.fc24
04/01/2014
task: ffff88007c11c100 task.stack: ffffc900007c0000
RIP: 0010:[<ffffffff8125bd87>]  [<ffffffff8125bd87>]
__lookup_mnt_last+0x67/0x80
RSP: 0018:ffffc900007c3db0  EFLAGS: 00000286
RAX: ffff88007a5f0900 RBX: ffff88007b136620 RCX: ffff88007a3e2900
RDX: ffff88007a3e2900 RSI: ffff88007b136600 RDI: ffff88007b136600
RBP: ffffc900007c3dc0 R08: ffff880036df5850 R09: ffffffff81249664
R10: ffff88007bd84c38 R11: 0000000100000000 R12: ffff88007bce3f00
R13: ffffc900007c3e00 R14: ffff88007bce3f00 R15: 00007ffe54245328
FS:  00007ff465de0840(0000) GS:ffff88007fc00000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000055f86b328128 CR3: 000000007ba23000 CR4: 00000000003406f0
Stack:
 ffff88007b136600 ffff88007b136480 ffffc900007c3df0 ffffffff8126a70a
 ffffc900007c3e58 ffffc900007c3e10 ffffc900007c3e00 ffff880079f99980
 ffffc900007c3e48 ffffffff8126b0d5 ffff88007a3e2660 ffff88007a3e24e0
Call Trace:
 [<ffffffff8126a70a>] propagation_visit_child.isra.8+0x5a/0xd0
 [<ffffffff8126b0d5>] propagate_umount+0x65/0x2e0
 [<ffffffff8125a76e>] umount_tree+0x2be/0x2d0
 [<ffffffff8125b75f>] do_umount+0x13f/0x340
 [<ffffffff8125c3ce>] SyS_umount+0x10e/0x120
 [<ffffffff817ba837>] entry_SYSCALL_64_fastpath+0x1a/0xa9

Thanks,
Andrei

> 
> Unless we can find a problem I am going to call this the final version.
> 
>  fs/mount.h     |   1 +
>  fs/namespace.c |   7 +-
>  fs/pnode.c     | 198 ++++++++++++++++++++++++++++++++++++++++++++++-----------
>  fs/pnode.h     |   2 +-
>  4 files changed, 165 insertions(+), 43 deletions(-)
> 
> diff --git a/fs/mount.h b/fs/mount.h
> index d2e25d7b64b3..00fe0d1d6ba7 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -58,6 +58,7 @@ struct mount {
>  	struct mnt_namespace *mnt_ns;	/* containing namespace */
>  	struct mountpoint *mnt_mp;	/* where is it mounted */
>  	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
> +	struct list_head mnt_umounts;	/* list of children that are being unmounted */
>  #ifdef CONFIG_FSNOTIFY
>  	struct hlist_head mnt_fsnotify_marks;
>  	__u32 mnt_fsnotify_mask;
> diff --git a/fs/namespace.c b/fs/namespace.c
> index e6c234b1a645..73801391bb00 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -237,6 +237,7 @@ static struct mount *alloc_vfsmnt(const char *name)
>  		INIT_LIST_HEAD(&mnt->mnt_slave_list);
>  		INIT_LIST_HEAD(&mnt->mnt_slave);
>  		INIT_HLIST_NODE(&mnt->mnt_mp_list);
> +		INIT_LIST_HEAD(&mnt->mnt_umounts);
>  #ifdef CONFIG_FSNOTIFY
>  		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
>  #endif
> @@ -650,13 +651,11 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
>  	p = __lookup_mnt(mnt, dentry);
>  	if (!p)
>  		goto out;
> -	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
> -		res = p;
> +	res = p;
>  	hlist_for_each_entry_continue(p, mnt_hash) {
>  		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
>  			break;
> -		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
> -			res = p;
> +		res = p;
>  	}
>  out:
>  	return res;
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 234a9ac49958..15e30e861a14 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -390,56 +390,153 @@ void propagate_mount_unlock(struct mount *mnt)
>  }
>  
>  /*
> - * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
> + * get the next mount in the propagation tree (that has not been visited)
> + * @m: the mount seen last
> + * @origin: the original mount from where the tree walk initiated
> + *
> + * Note that peer groups form contiguous segments of slave lists.
> + * We rely on that in get_source() to be able to find out if
> + * vfsmount found while iterating with propagation_next() is
> + * a peer of one we'd found earlier.
>   */
> -static void mark_umount_candidates(struct mount *mnt)
> +static struct mount *propagation_visit_child(struct mount *last_child,
> +					    struct mount *origin_child)
>  {
> -	struct mount *parent = mnt->mnt_parent;
> -	struct mount *m;
> +	struct mount *m = last_child->mnt_parent;
> +	struct mount *origin = origin_child->mnt_parent;
> +	struct dentry *mountpoint = origin_child->mnt_mountpoint;
> +	struct mount *child;
>  
> -	BUG_ON(parent == mnt);
> +	/* Has this part of the propagation tree already been visited? */
> +	if (IS_MNT_MARKED(last_child))
> +		return NULL;
>  
> -	for (m = propagation_next(parent, parent); m;
> -			m = propagation_next(m, parent)) {
> -		struct mount *child = __lookup_mnt_last(&m->mnt,
> -						mnt->mnt_mountpoint);
> -		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
> -			SET_MNT_MARK(child);
> +	SET_MNT_MARK(last_child);
> +
> +	/* are there any slaves of this mount? */
> +	if (!list_empty(&m->mnt_slave_list)) {
> +		m = first_slave(m);
> +		goto check_slave;
> +	}
> +	while (1) {
> +		struct mount *master = m->mnt_master;
> +
> +		if (master == origin->mnt_master) {
> +			struct mount *next = next_peer(m);
> +			while (1) {
> +				if (next == origin)
> +					return NULL;
> +				child = __lookup_mnt_last(&next->mnt, mountpoint);
> +				if (child && !IS_MNT_MARKED(child))
> +					return child;
> +				next = next_peer(next);
> +			}
> +		} else {
> +			while (1) {
> +				if (m->mnt_slave.next == &master->mnt_slave_list)
> +					break;
> +				m = next_slave(m);
> +			check_slave:
> +				child = __lookup_mnt_last(&m->mnt, mountpoint);
> +				if (child && !IS_MNT_MARKED(child))
> +					return child;
> +			}
>  		}
> +
> +		/* back at master */
> +		m = master;
>  	}
>  }
>  
>  /*
> - * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
> - * parent propagates to.
> + * get the next mount in the propagation tree (that has not been revisited)
> + * @m: the mount seen last
> + * @origin: the original mount from where the tree walk initiated
> + *
> + * Note that peer groups form contiguous segments of slave lists.
> + * We rely on that in get_source() to be able to find out if
> + * vfsmount found while iterating with propagation_next() is
> + * a peer of one we'd found earlier.
>   */
> -static void __propagate_umount(struct mount *mnt)
> +static struct mount *propagation_revisit_child(struct mount *last_child,
> +					       struct mount *origin_child)
>  {
> -	struct mount *parent = mnt->mnt_parent;
> -	struct mount *m;
> +	struct mount *m = last_child->mnt_parent;
> +	struct mount *origin = origin_child->mnt_parent;
> +	struct dentry *mountpoint = origin_child->mnt_mountpoint;
> +	struct mount *child;
>  
> -	BUG_ON(parent == mnt);
> +	/* Has this part of the propagation tree already been revisited? */
> +	if (!IS_MNT_MARKED(last_child))
> +		return NULL;
>  
> -	for (m = propagation_next(parent, parent); m;
> -			m = propagation_next(m, parent)) {
> +	CLEAR_MNT_MARK(last_child);
>  
> -		struct mount *child = __lookup_mnt_last(&m->mnt,
> -						mnt->mnt_mountpoint);
> -		/*
> -		 * umount the child only if the child has no children
> -		 * and the child is marked safe to unmount.
> -		 */
> -		if (!child || !IS_MNT_MARKED(child))
> -			continue;
> -		CLEAR_MNT_MARK(child);
> -		if (list_empty(&child->mnt_mounts)) {
> -			list_del_init(&child->mnt_child);
> -			child->mnt.mnt_flags |= MNT_UMOUNT;
> -			list_move_tail(&child->mnt_list, &mnt->mnt_list);
> +	/* are there any slaves of this mount? */
> +	if (!list_empty(&m->mnt_slave_list)) {
> +		m = first_slave(m);
> +		goto check_slave;
> +	}
> +	while (1) {
> +		struct mount *master = m->mnt_master;
> +
> +		if (master == origin->mnt_master) {
> +			struct mount *next = next_peer(m);
> +			while (1) {
> +				if (next == origin)
> +					return NULL;
> +				child = __lookup_mnt_last(&next->mnt, mountpoint);
> +				if (child && IS_MNT_MARKED(child))
> +					return child;
> +				next = next_peer(next);
> +			}
> +		} else {
> +			while (1) {
> +				if (m->mnt_slave.next == &master->mnt_slave_list)
> +					break;
> +				m = next_slave(m);
> +			check_slave:
> +				child = __lookup_mnt_last(&m->mnt, mountpoint);
> +				if (child && IS_MNT_MARKED(child))
> +					return child;
> +			}
>  		}
> +
> +		/* back at master */
> +		m = master;
>  	}
>  }
>  
> +static void start_umount_propagation(struct mount *child,
> +				     struct list_head *to_umount)
> +{
> +	do {
> +		struct mount *parent = child->mnt_parent;
> +
> +		if ((child->mnt.mnt_flags & MNT_UMOUNT) ||
> +		    !list_empty(&child->mnt_mounts))
> +			return;
> +
> +		if (!IS_MNT_LOCKED(child))
> +			list_move_tail(&child->mnt_child, to_umount);
> +		else
> +			list_move_tail(&child->mnt_child, &parent->mnt_umounts);
> +
> +		child = NULL;
> +		if (IS_MNT_MARKED(parent))
> +			child = parent;
> +	} while (child);
> +}
> +
> +static void end_umount_propagation(struct mount *child)
> +{
> +	struct mount *parent = child->mnt_parent;
> +
> +	if (!list_empty(&parent->mnt_umounts))
> +		list_splice_tail_init(&parent->mnt_umounts, &parent->mnt_mounts);
> +}
> +
> +
>  /*
>   * collect all mounts that receive propagation from the mount in @list,
>   * and return these additional mounts in the same list.
> @@ -447,14 +544,39 @@ static void __propagate_umount(struct mount *mnt)
>   *
>   * vfsmount lock must be held for write
>   */
> -int propagate_umount(struct list_head *list)
> +void propagate_umount(struct list_head *list)
>  {
>  	struct mount *mnt;
> +	LIST_HEAD(to_umount);
> +	LIST_HEAD(tmp_list);
> +
> +	/* Find candidates for unmounting */
> +	list_for_each_entry(mnt, list, mnt_list) {
> +		struct mount *child;
> +		for (child = propagation_visit_child(mnt, mnt); child;
> +		     child = propagation_visit_child(child, mnt))
> +			start_umount_propagation(child, &to_umount);
> +	}
>  
> -	list_for_each_entry_reverse(mnt, list, mnt_list)
> -		mark_umount_candidates(mnt);
> +	/* Begin unmounting */
> +	while (!list_empty(&to_umount)) {
> +		mnt = list_first_entry(&to_umount, struct mount, mnt_child);
>  
> -	list_for_each_entry(mnt, list, mnt_list)
> -		__propagate_umount(mnt);
> -	return 0;
> +		list_del_init(&mnt->mnt_child);
> +		mnt->mnt.mnt_flags |= MNT_UMOUNT;
> +		list_move_tail(&mnt->mnt_list, &tmp_list);
> +
> +		if (!list_empty(&mnt->mnt_umounts))
> +			list_splice_tail_init(&mnt->mnt_umounts, &to_umount);
> +	}
> +
> +	/* Cleanup the mount propagation tree */
> +	list_for_each_entry(mnt, list, mnt_list) {
> +		struct mount *child;
> +		for (child = propagation_revisit_child(mnt, mnt); child;
> +		     child = propagation_revisit_child(child, mnt))
> +			end_umount_propagation(child);
> +	}
> +
> +	list_splice_tail(&tmp_list, list);
>  }
> diff --git a/fs/pnode.h b/fs/pnode.h
> index 550f5a8b4fcf..38c6cdb96b34 100644
> --- a/fs/pnode.h
> +++ b/fs/pnode.h
> @@ -41,7 +41,7 @@ static inline void set_mnt_shared(struct mount *mnt)
>  void change_mnt_propagation(struct mount *, int);
>  int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
>  		struct hlist_head *);
> -int propagate_umount(struct list_head *);
> +void propagate_umount(struct list_head *);
>  int propagate_mount_busy(struct mount *, int);
>  void propagate_mount_unlock(struct mount *);
>  void mnt_release_group_id(struct mount *);
> -- 
> 2.10.1
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [REVIEW][PATCH] mount: In propagate_umount handle overlapping mount propagation trees
  2016-10-20 21:30                                 ` Andrei Vagin
@ 2016-10-21 19:26                                     ` Eric W. Biederman
  -1 siblings, 0 replies; 36+ messages in thread
From: Eric W. Biederman @ 2016-10-21 19:26 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: linux-fsdevel, LKML, Linux Containers, Andrey Vagin, Alexander Viro

Andrei Vagin <avagin-5HdwGun5lf+gSpxsJD1C4w@public.gmane.org> writes:
>> Barring some stupid mistake this looks like it fixes both the performance
>> and the correctness issues I was able to spot earlier.  Andrei if you
>> could give this version a look over I would appreciate it.
>
> Eric, could you try out this script:
[snip script]

> It hangs up on my host with this patch.

Ugh.  I am seeing the hang as well.  Digging into it.

Thanks for keeping me honest.

Eric

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [RFC][PATCH v2] mount: In propagate_umount handle overlapping mount propagation trees
  2016-10-21 19:26                                     ` Eric W. Biederman
@ 2016-10-22 19:42                                         ` Eric W. Biederman
  -1 siblings, 0 replies; 36+ messages in thread
From: Eric W. Biederman @ 2016-10-22 19:42 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: linux-fsdevel, LKML, Linux Containers, Andrey Vagin, Alexander Viro


Andrei,

This fixes the issue you have reported and, through a refactoring,
makes the code simpler and easier to verify.  That said, I find your
last test case very interesting.  While looking at it in detail
I have realized I don't fully understand why we have both lookup_mnt and
lookup_mnt_last, so I can't say that this change is fully correct.

Outside of propagate_umount I don't have concerns, but I am not 100%
convinced that my change to lookup_mnt_last does the right thing
in the case of propagate_umount.

I do see why your last test case scales badly: long chains of shared
mounts that we can't skip.  At the same time I don't really understand
that case.  Part of it has to do with multiple child mounts of the same
mount on the same mountpoint.

So I am working through my concerns.  In the meantime I figured it
would be useful to post this version, as this version is clearly better
than the versions of this change that have come before it.

Eric

From: "Eric W. Biederman" <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
Date: Thu, 13 Oct 2016 13:27:19 -0500

Andrei Vagin pointed out that the time to execute propagate_umount can
go non-linear (and take a ludicrous amount of time) when the mount
propagation trees of the mounts to be unmounted by a lazy unmount
overlap.

While investigating the horrible performance I realized that in the
case of overlapping mount trees, since the addition of locked mount
support, the code has been failing to unmount all of the mounts it
should have been unmounting.

Make the walk of the mount propagation trees nearly linear by using
MNT_MARK to mark pieces of the mount propagation trees that have
already been visited, allowing subsequent walks to skip over
subtrees.

Make the processing of mounts order independent by adding a list of
mount entries that need to be unmounted, and simply adding a mount to
that list when it becomes apparent the mount can safely be unmounted.
For mounts that are locked on other mounts but otherwise could be
unmounted, move them from their parent's mnt_mounts to mnt_umounts so
that if and when their parent becomes unmounted these mounts can be
added to the list of mounts to unmount.

Add a final pass to clear MNT_MARK and to restore mnt_mounts
from mnt_umounts for anything that did not get unmounted.

Add the functions propagation_visit_child and propagation_revisit_child
to coordinate walking of the mount tree and setting and clearing the
mount mark.

The skipping of already unmounted mounts has been moved from
__lookup_mnt_last to mark_umount_candidates, so that the new
propagation functions can notice when the propagation tree passes
through the initial set of unmounted mounts.  Except in umount_tree as
part of the unmounting process, the only place where unmounted mounts
should be found is in unmounted subtrees.  All of the other callers
of __lookup_mnt_last are from mounted subtrees, so not checking for
unmounted mounts should not affect them.

A script to generate overlapping mount propagation trees:
$ cat run.sh
mount -t tmpfs test-mount /mnt
mount --make-shared /mnt
for i in `seq $1`; do
        mkdir /mnt/test.$i
        mount --bind /mnt /mnt/test.$i
done
cat /proc/mounts | grep test-mount | wc -l
time umount -l /mnt
$ for i in `seq 10 16`; do echo $i; unshare -Urm bash ./run.sh $i; done

Here are the performance numbers with and without the patch (mhash is
the size of the mount hash table):

mhash  |  8192   |  8192  |  8192       | 131072 | 131072      | 104857 | 104857
mounts | before  | after  | after (sys) | after  | after (sys) |  after | after (sys)
-------------------------------------------------------------------------------------
  1024 |  0.071s | 0.020s | 0.000s      | 0.022s | 0.004s      | 0.020s | 0.004s
  2048 |  0.184s | 0.022s | 0.004s      | 0.023s | 0.004s      | 0.022s | 0.008s
  4096 |  0.604s | 0.025s | 0.020s      | 0.029s | 0.008s      | 0.026s | 0.004s
  8912 |  4.471s | 0.053s | 0.020s      | 0.051s | 0.024s      | 0.047s | 0.016s
 16384 | 34.826s | 0.088s | 0.060s      | 0.081s | 0.048s      | 0.082s | 0.052s
 32768 |         | 0.216s | 0.172s      | 0.160s | 0.124s      | 0.160s | 0.096s
 65536 |         | 0.819s | 0.726s      | 0.330s | 0.260s      | 0.338s | 0.256s
131072 |         | 4.502s | 4.168s      | 0.707s | 0.580s      | 0.709s | 0.592s

Andrei Vagin reports fixing the performance problem is part of the
work to fix CVE-2016-6213.

A script for a pathological set of mounts:

$ cat pathological.sh

mount -t tmpfs base /mnt
mount --make-shared /mnt
mkdir -p /mnt/b

mount -t tmpfs test1 /mnt/b
mount --make-shared /mnt/b
mkdir -p /mnt/b/10

mount -t tmpfs test2 /mnt/b/10
mount --make-shared /mnt/b/10
mkdir -p /mnt/b/10/20

mount --rbind /mnt/b /mnt/b/10/20

unshare -Urm sleep 2
umount -l /mnt/b
wait %%

$ unshare -Urm pathological.sh

Cc: stable-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
Fixes: 0c56fe31420c ("mnt: Don't propagate unmounts to locked mounts")
Reported-by: Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
Signed-off-by: "Eric W. Biederman" <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
---
 fs/mount.h     |   1 +
 fs/namespace.c |   7 +--
 fs/pnode.c     | 179 +++++++++++++++++++++++++++++++++++++++++----------------
 fs/pnode.h     |   2 +-
 4 files changed, 133 insertions(+), 56 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index d2e25d7b64b3..00fe0d1d6ba7 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -58,6 +58,7 @@ struct mount {
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
 	struct mountpoint *mnt_mp;	/* where is it mounted */
 	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
+	struct list_head mnt_umounts;	/* list of children that are being unmounted */
 #ifdef CONFIG_FSNOTIFY
 	struct hlist_head mnt_fsnotify_marks;
 	__u32 mnt_fsnotify_mask;
diff --git a/fs/namespace.c b/fs/namespace.c
index e6c234b1a645..73801391bb00 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -237,6 +237,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
+		INIT_LIST_HEAD(&mnt->mnt_umounts);
 #ifdef CONFIG_FSNOTIFY
 		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
 #endif
@@ -650,13 +651,11 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
 	p = __lookup_mnt(mnt, dentry);
 	if (!p)
 		goto out;
-	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
-		res = p;
+	res = p;
 	hlist_for_each_entry_continue(p, mnt_hash) {
 		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
 			break;
-		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
-			res = p;
+		res = p;
 	}
 out:
 	return res;
diff --git a/fs/pnode.c b/fs/pnode.c
index 234a9ac49958..8fd1a3fb420c 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -134,7 +134,8 @@ void change_mnt_propagation(struct mount *mnt, int type)
 }
 
 /*
- * get the next mount in the propagation tree.
+ * get the next mount that is not a slave of the current mount in the
+ * propagation tree.
  * @m: the mount seen last
  * @origin: the original mount from where the tree walk initiated
  *
@@ -143,13 +144,9 @@ void change_mnt_propagation(struct mount *mnt, int type)
  * vfsmount found while iterating with propagation_next() is
  * a peer of one we'd found earlier.
  */
-static struct mount *propagation_next(struct mount *m,
-					 struct mount *origin)
+static struct mount *propagation_next_sib(struct mount *m,
+						struct mount *origin)
 {
-	/* are there any slaves of this mount? */
-	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
-		return first_slave(m);
-
 	while (1) {
 		struct mount *master = m->mnt_master;
 
@@ -164,6 +161,26 @@ static struct mount *propagation_next(struct mount *m,
 	}
 }
 
+/*
+ * get the next mount in the propagation tree.
+ * @m: the mount seen last
+ * @origin: the original mount from where the tree walk initiated
+ *
+ * Note that peer groups form contiguous segments of slave lists.
+ * We rely on that in get_source() to be able to find out if
+ * vfsmount found while iterating with propagation_next() is
+ * a peer of one we'd found earlier.
+ */
+static struct mount *propagation_next(struct mount *m,
+					 struct mount *origin)
+{
+	/* are there any slaves of this mount? */
+	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+		return first_slave(m);
+
+	return propagation_next_sib(m, origin);
+}
+
 static struct mount *next_group(struct mount *m, struct mount *origin)
 {
 	while (1) {
@@ -389,57 +406,92 @@ void propagate_mount_unlock(struct mount *mnt)
 	}
 }
 
-/*
- * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
- */
-static void mark_umount_candidates(struct mount *mnt)
+static struct mount *propagation_visit_child(struct mount *last_child,
+					    struct mount *origin_child)
 {
-	struct mount *parent = mnt->mnt_parent;
-	struct mount *m;
+	struct mount *m = last_child->mnt_parent;
+	struct mount *origin = origin_child->mnt_parent;
+	struct dentry *mountpoint = origin_child->mnt_mountpoint;
+	struct mount *child;
 
-	BUG_ON(parent == mnt);
+	/* Has this part of the propagation tree already been visited? */
+	if (IS_MNT_MARKED(last_child))
+		return NULL;
 
-	for (m = propagation_next(parent, parent); m;
-			m = propagation_next(m, parent)) {
-		struct mount *child = __lookup_mnt_last(&m->mnt,
-						mnt->mnt_mountpoint);
-		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
-			SET_MNT_MARK(child);
-		}
+	SET_MNT_MARK(last_child);
+
+	m = propagation_next(m, origin);
+	while (m) {
+		child = __lookup_mnt_last(&m->mnt, mountpoint);
+		if (child && !IS_MNT_MARKED(child))
+			return child;
+
+		if (!child)
+			m = propagation_next(m, origin);
+		else
+			m = propagation_next_sib(m, origin);
 	}
+	return NULL;
 }
 
-/*
- * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
- * parent propagates to.
- */
-static void __propagate_umount(struct mount *mnt)
+static struct mount *propagation_revisit_child(struct mount *last_child,
+					       struct mount *origin_child)
 {
-	struct mount *parent = mnt->mnt_parent;
-	struct mount *m;
+	struct mount *m = last_child->mnt_parent;
+	struct mount *origin = origin_child->mnt_parent;
+	struct dentry *mountpoint = origin_child->mnt_mountpoint;
+	struct mount *child;
 
-	BUG_ON(parent == mnt);
+	/* Has this part of the propagation tree already been revisited? */
+	if (!IS_MNT_MARKED(last_child))
+		return NULL;
 
-	for (m = propagation_next(parent, parent); m;
-			m = propagation_next(m, parent)) {
+	CLEAR_MNT_MARK(last_child);
 
-		struct mount *child = __lookup_mnt_last(&m->mnt,
-						mnt->mnt_mountpoint);
-		/*
-		 * umount the child only if the child has no children
-		 * and the child is marked safe to unmount.
-		 */
-		if (!child || !IS_MNT_MARKED(child))
-			continue;
-		CLEAR_MNT_MARK(child);
-		if (list_empty(&child->mnt_mounts)) {
-			list_del_init(&child->mnt_child);
-			child->mnt.mnt_flags |= MNT_UMOUNT;
-			list_move_tail(&child->mnt_list, &mnt->mnt_list);
-		}
+	m = propagation_next(m, origin);
+	while (m) {
+		child = __lookup_mnt_last(&m->mnt, mountpoint);
+		if (child && IS_MNT_MARKED(child))
+			return child;
+
+		if (!child)
+			m = propagation_next(m, origin);
+		else
+			m = propagation_next_sib(m, origin);
 	}
+	return NULL;
 }
 
+static void start_umount_propagation(struct mount *child,
+				     struct list_head *to_umount)
+{
+	do {
+		struct mount *parent = child->mnt_parent;
+
+		if ((child->mnt.mnt_flags & MNT_UMOUNT) ||
+		    !list_empty(&child->mnt_mounts))
+			return;
+
+		if (!IS_MNT_LOCKED(child))
+			list_move_tail(&child->mnt_child, to_umount);
+		else
+			list_move_tail(&child->mnt_child, &parent->mnt_umounts);
+
+		child = NULL;
+		if (IS_MNT_MARKED(parent))
+			child = parent;
+	} while (child);
+}
+
+static void end_umount_propagation(struct mount *child)
+{
+	struct mount *parent = child->mnt_parent;
+
+	if (!list_empty(&parent->mnt_umounts))
+		list_splice_tail_init(&parent->mnt_umounts, &parent->mnt_mounts);
+}
+
+
 /*
  * collect all mounts that receive propagation from the mount in @list,
  * and return these additional mounts in the same list.
@@ -447,14 +499,39 @@ static void __propagate_umount(struct mount *mnt)
  *
  * vfsmount lock must be held for write
  */
-int propagate_umount(struct list_head *list)
+void propagate_umount(struct list_head *list)
 {
 	struct mount *mnt;
+	LIST_HEAD(to_umount);
+	LIST_HEAD(tmp_list);
+
+	/* Find candidates for unmounting */
+	list_for_each_entry(mnt, list, mnt_list) {
+		struct mount *child;
+		for (child = propagation_visit_child(mnt, mnt); child;
+		     child = propagation_visit_child(child, mnt))
+			start_umount_propagation(child, &to_umount);
+	}
 
-	list_for_each_entry_reverse(mnt, list, mnt_list)
-		mark_umount_candidates(mnt);
+	/* Begin unmounting */
+	while (!list_empty(&to_umount)) {
+		mnt = list_first_entry(&to_umount, struct mount, mnt_child);
 
-	list_for_each_entry(mnt, list, mnt_list)
-		__propagate_umount(mnt);
-	return 0;
+		list_del_init(&mnt->mnt_child);
+		mnt->mnt.mnt_flags |= MNT_UMOUNT;
+		list_move_tail(&mnt->mnt_list, &tmp_list);
+
+		if (!list_empty(&mnt->mnt_umounts))
+			list_splice_tail_init(&mnt->mnt_umounts, &to_umount);
+	}
+
+	/* Cleanup the mount propagation tree */
+	list_for_each_entry(mnt, list, mnt_list) {
+		struct mount *child;
+		for (child = propagation_revisit_child(mnt, mnt); child;
+		     child = propagation_revisit_child(child, mnt))
+			end_umount_propagation(child);
+	}
+
+	list_splice_tail(&tmp_list, list);
 }
diff --git a/fs/pnode.h b/fs/pnode.h
index 550f5a8b4fcf..38c6cdb96b34 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -41,7 +41,7 @@ static inline void set_mnt_shared(struct mount *mnt)
 void change_mnt_propagation(struct mount *, int);
 int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
 		struct hlist_head *);
-int propagate_umount(struct list_head *);
+void propagate_umount(struct list_head *);
 int propagate_mount_busy(struct mount *, int);
 void propagate_mount_unlock(struct mount *);
 void mnt_release_group_id(struct mount *);
-- 
2.10.1

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH v2] mount: In propagate_umount handle overlapping mount propagation trees
  2016-10-22 19:42                                         ` Eric W. Biederman
@ 2016-10-25 20:58                                             ` Andrei Vagin
  -1 siblings, 0 replies; 36+ messages in thread
From: Andrei Vagin @ 2016-10-25 20:58 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-fsdevel, LKML, Linux Containers, Andrey Vagin, Alexander Viro

[-- Attachment #1: Type: text/plain, Size: 15259 bytes --]

On Sat, Oct 22, 2016 at 02:42:03PM -0500, Eric W. Biederman wrote:
> 
> Andrei,
> 
> This fixes the issue you have reported and, through a refactoring,
> makes the code simpler and easier to verify.  That said, I find your
> last test case very interesting.  While looking at it in detail
> I have realized I don't fully understand why we have both lookup_mnt and
> lookup_mnt_last, so I can't say that this change is fully correct.
> 
> Outside of propagate_umount I don't have concerns, but I am not 100%
> convinced that my change to lookup_mnt_last does the right thing
> in the case of propagate_umount.
> 
> I do see why your last test case scales badly.  Long chains of shared
> mounts that we can't skip.  At the same time I don't really understand
> that case.  Part of it has to do with multiple child mounts of the same
> mount on the same mountpoint.
> 
> So I am working through my concerns.  In the meantime I figured it
> would be useful to post this version.  As this version is clearly better
> than the versions of this change that have come before it.

Hi Eric,

I have tested this version and it works fine.

As for the last test case, could you look at the attached patch?
The idea is that we can skip all mounts from a shared group if one
of them is already marked; a toy model of the walk is sketched below.
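
As a toy model of that walk (plain userspace C, not kernel code; it only
assumes that the peers of one propagation group form a contiguous
segment of the walk order, as the comment above propagation_next()
describes):

#include <stdio.h>

struct mnt { int group_id; int marked; };

/*
 * Return the index of the next mount to visit, or -1 when the walk is
 * done: an unmarked mount is visited, a marked mount from the origin's
 * group ends the walk, and a marked mount from any other group lets us
 * skip the remainder of that group without visiting its members.
 */
static int next_to_visit(struct mnt *walk, int n, int i, int origin_group)
{
	while (i < n) {
		int g;

		if (!walk[i].marked)
			return i;
		if (walk[i].group_id == origin_group)
			return -1;
		g = walk[i].group_id;
		while (i < n && walk[i].group_id == g)
			i++;
	}
	return -1;
}

int main(void)
{
	/* group 2 is already handled, group 3 is not; group 1 is the origin */
	struct mnt walk[] = { {2, 1}, {2, 1}, {3, 0}, {3, 0}, {1, 1} };
	int i = 0;

	while ((i = next_to_visit(walk, 5, i, 1)) != -1) {
		printf("visit a mount of group %d\n", walk[i].group_id);
		walk[i].marked = 1;
		i++;
	}
	return 0;
}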

> 
> Eric
> 
> From: "Eric W. Biederman" <ebiederm@xmission.com>
> Date: Thu, 13 Oct 2016 13:27:19 -0500
> 
> Andrei Vagin pointed out that the time to execute propagate_umount can go
> non-linear (and take a ludicrous amount of time) when the mount
> propagation trees of the mounts to be unmounted by a lazy unmount
> overlap.
> 
> While investigating the horrible performance I realized that in
> the case of overlapping mount trees, since the addition of locked
> mount support, the code has been failing to unmount all of the
> mounts it should have been unmounting.
> 
> Make the walk of the mount propagation trees nearly linear by using
> MNT_MARK to mark pieces of the mount propagation trees that have
> already been visited, allowing subsequent walks to skip over
> subtrees.
> 
> Make the processing of mounts order independent by adding a list of
> mount entries that need to be unmounted, and simply adding a mount to
> that list when it becomes apparent the mount can safely be unmounted.
> For mounts that are locked on other mounts but otherwise could be
> unmounted, move them from their parent's mnt_mounts to mnt_umounts so
> that if and when their parent becomes unmounted these mounts can be
> added to the list of mounts to unmount.
> 
> Add a final pass to clear MNT_MARK and to restore mnt_mounts
> from mnt_umounts for anything that did not get unmounted.
> 
> Add the functions propagation_visit_child and propagation_revisit_child
> to coordinate walking of the mount tree and setting and clearing the
> mount mark.
> 
> The skipping of already unmounted mounts has been moved from
> __lookup_mnt_last to mark_umount_candidates, so that the new
> propagation functions can notice when the propagation tree passes
> through the initial set of unmounted mounts.  Except in umount_tree as
> part of the unmounting process the only place where unmounted mounts
> should be found are in unmounted subtrees.  All of the other callers
> of __lookup_mnt_last are from mounted subtrees, so not checking for
> unmounted mounts should not affect them.
> 
> A script to generate overlapping mount propagation trees:
> $ cat run.sh
> mount -t tmpfs test-mount /mnt
> mount --make-shared /mnt
> for i in `seq $1`; do
>         mkdir /mnt/test.$i
>         mount --bind /mnt /mnt/test.$i
> done
> cat /proc/mounts | grep test-mount | wc -l
> time umount -l /mnt
> $ for i in `seq 10 16`; do echo $i; unshare -Urm bash ./run.sh $i; done
> 
> Here are the performance numbers with and without the patch:
> 
> mhash  |  8192   |  8192  |  8192       | 131072 | 131072      | 104857 | 104857
> mounts | before  | after  | after (sys) | after  | after (sys) |  after | after (sys)
> -------------------------------------------------------------------------------------
>   1024 |  0.071s | 0.020s | 0.000s      | 0.022s | 0.004s      | 0.020s | 0.004s
>   2048 |  0.184s | 0.022s | 0.004s      | 0.023s | 0.004s      | 0.022s | 0.008s
>   4096 |  0.604s | 0.025s | 0.020s      | 0.029s | 0.008s      | 0.026s | 0.004s
>   8912 |  4.471s | 0.053s | 0.020s      | 0.051s | 0.024s      | 0.047s | 0.016s
>  16384 | 34.826s | 0.088s | 0.060s      | 0.081s | 0.048s      | 0.082s | 0.052s
>  32768 |         | 0.216s | 0.172s      | 0.160s | 0.124s      | 0.160s | 0.096s
>  65536 |         | 0.819s | 0.726s      | 0.330s | 0.260s      | 0.338s | 0.256s
> 131072 |         | 4.502s | 4.168s      | 0.707s | 0.580s      | 0.709s | 0.592s
> 
> Andrei Vagin reports fixing the performance problem is part of the
> work to fix CVE-2016-6213.
> 
> A script for a pathological set of mounts:
> 
> $ cat pathological.sh
> 
> mount -t tmpfs base /mnt
> mount --make-shared /mnt
> mkdir -p /mnt/b
> 
> mount -t tmpfs test1 /mnt/b
> mount --make-shared /mnt/b
> mkdir -p /mnt/b/10
> 
> mount -t tmpfs test2 /mnt/b/10
> mount --make-shared /mnt/b/10
> mkdir -p /mnt/b/10/20
> 
> mount --rbind /mnt/b /mnt/b/10/20
> 
> unshare -Urm sleep 2
> umount -l /mnt/b
> wait %%
> 
> $ unshare -Urm pathological.sh
> 
> Cc: stable@vger.kernel.org
> Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
> Fixes: 0c56fe31420c ("mnt: Don't propagate unmounts to locked mounts")
> Reported-by: Andrei Vagin <avagin@openvz.org>
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
>  fs/mount.h     |   1 +
>  fs/namespace.c |   7 +--
>  fs/pnode.c     | 179 +++++++++++++++++++++++++++++++++++++++++----------------
>  fs/pnode.h     |   2 +-
>  4 files changed, 133 insertions(+), 56 deletions(-)
> 
> diff --git a/fs/mount.h b/fs/mount.h
> index d2e25d7b64b3..00fe0d1d6ba7 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -58,6 +58,7 @@ struct mount {
>  	struct mnt_namespace *mnt_ns;	/* containing namespace */
>  	struct mountpoint *mnt_mp;	/* where is it mounted */
>  	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
> +	struct list_head mnt_umounts;	/* list of children that are being unmounted */
>  #ifdef CONFIG_FSNOTIFY
>  	struct hlist_head mnt_fsnotify_marks;
>  	__u32 mnt_fsnotify_mask;
> diff --git a/fs/namespace.c b/fs/namespace.c
> index e6c234b1a645..73801391bb00 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -237,6 +237,7 @@ static struct mount *alloc_vfsmnt(const char *name)
>  		INIT_LIST_HEAD(&mnt->mnt_slave_list);
>  		INIT_LIST_HEAD(&mnt->mnt_slave);
>  		INIT_HLIST_NODE(&mnt->mnt_mp_list);
> +		INIT_LIST_HEAD(&mnt->mnt_umounts);
>  #ifdef CONFIG_FSNOTIFY
>  		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
>  #endif
> @@ -650,13 +651,11 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
>  	p = __lookup_mnt(mnt, dentry);
>  	if (!p)
>  		goto out;
> -	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
> -		res = p;
> +	res = p;
>  	hlist_for_each_entry_continue(p, mnt_hash) {
>  		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
>  			break;
> -		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
> -			res = p;
> +		res = p;
>  	}
>  out:
>  	return res;
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 234a9ac49958..8fd1a3fb420c 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -134,7 +134,8 @@ void change_mnt_propagation(struct mount *mnt, int type)
>  }
>  
>  /*
> - * get the next mount in the propagation tree.
> + * get the next mount that is not a slave of the current mount in the
> + * propagation tree.
>   * @m: the mount seen last
>   * @origin: the original mount from where the tree walk initiated
>   *
> @@ -143,13 +144,9 @@ void change_mnt_propagation(struct mount *mnt, int type)
>   * vfsmount found while iterating with propagation_next() is
>   * a peer of one we'd found earlier.
>   */
> -static struct mount *propagation_next(struct mount *m,
> -					 struct mount *origin)
> +static struct mount *propagation_next_sib(struct mount *m,
> +						struct mount *origin)
>  {
> -	/* are there any slaves of this mount? */
> -	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
> -		return first_slave(m);
> -
>  	while (1) {
>  		struct mount *master = m->mnt_master;
>  
> @@ -164,6 +161,26 @@ static struct mount *propagation_next(struct mount *m,
>  	}
>  }
>  
> +/*
> + * get the next mount in the propagation tree.
> + * @m: the mount seen last
> + * @origin: the original mount from where the tree walk initiated
> + *
> + * Note that peer groups form contiguous segments of slave lists.
> + * We rely on that in get_source() to be able to find out if
> + * vfsmount found while iterating with propagation_next() is
> + * a peer of one we'd found earlier.
> + */
> +static struct mount *propagation_next(struct mount *m,
> +					 struct mount *origin)
> +{
> +	/* are there any slaves of this mount? */
> +	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
> +		return first_slave(m);
> +
> +	return propagation_next_sib(m, origin);
> +}
> +
>  static struct mount *next_group(struct mount *m, struct mount *origin)
>  {
>  	while (1) {
> @@ -389,57 +406,92 @@ void propagate_mount_unlock(struct mount *mnt)
>  	}
>  }
>  
> -/*
> - * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
> - */
> -static void mark_umount_candidates(struct mount *mnt)
> +static struct mount *propagation_visit_child(struct mount *last_child,
> +					    struct mount *origin_child)
>  {
> -	struct mount *parent = mnt->mnt_parent;
> -	struct mount *m;
> +	struct mount *m = last_child->mnt_parent;
> +	struct mount *origin = origin_child->mnt_parent;
> +	struct dentry *mountpoint = origin_child->mnt_mountpoint;
> +	struct mount *child;
>  
> -	BUG_ON(parent == mnt);
> +	/* Has this part of the propagation tree already been visited? */
> +	if (IS_MNT_MARKED(last_child))
> +		return NULL;
>  
> -	for (m = propagation_next(parent, parent); m;
> -			m = propagation_next(m, parent)) {
> -		struct mount *child = __lookup_mnt_last(&m->mnt,
> -						mnt->mnt_mountpoint);
> -		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
> -			SET_MNT_MARK(child);
> -		}
> +	SET_MNT_MARK(last_child);
> +
> +	m = propagation_next(m, origin);
> +	while (m) {
> +		child = __lookup_mnt_last(&m->mnt, mountpoint);
> +		if (child && !IS_MNT_MARKED(child))
> +			return child;
> +
> +		if (!child)
> +			m = propagation_next(m, origin);
> +		else
> +			m = propagation_next_sib(m, origin);
>  	}
> +	return NULL;
>  }
>  
> -/*
> - * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
> - * parent propagates to.
> - */
> -static void __propagate_umount(struct mount *mnt)
> +static struct mount *propagation_revisit_child(struct mount *last_child,
> +					       struct mount *origin_child)
>  {
> -	struct mount *parent = mnt->mnt_parent;
> -	struct mount *m;
> +	struct mount *m = last_child->mnt_parent;
> +	struct mount *origin = origin_child->mnt_parent;
> +	struct dentry *mountpoint = origin_child->mnt_mountpoint;
> +	struct mount *child;
>  
> -	BUG_ON(parent == mnt);
> +	/* Has this part of the propagation tree already been revisited? */
> +	if (!IS_MNT_MARKED(last_child))
> +		return NULL;
>  
> -	for (m = propagation_next(parent, parent); m;
> -			m = propagation_next(m, parent)) {
> +	CLEAR_MNT_MARK(last_child);
>  
> -		struct mount *child = __lookup_mnt_last(&m->mnt,
> -						mnt->mnt_mountpoint);
> -		/*
> -		 * umount the child only if the child has no children
> -		 * and the child is marked safe to unmount.
> -		 */
> -		if (!child || !IS_MNT_MARKED(child))
> -			continue;
> -		CLEAR_MNT_MARK(child);
> -		if (list_empty(&child->mnt_mounts)) {
> -			list_del_init(&child->mnt_child);
> -			child->mnt.mnt_flags |= MNT_UMOUNT;
> -			list_move_tail(&child->mnt_list, &mnt->mnt_list);
> -		}
> +	m = propagation_next(m, origin);
> +	while (m) {
> +		child = __lookup_mnt_last(&m->mnt, mountpoint);
> +		if (child && IS_MNT_MARKED(child))
> +			return child;
> +
> +		if (!child)
> +			m = propagation_next(m, origin);
> +		else
> +			m = propagation_next_sib(m, origin);
>  	}
> +	return NULL;
>  }
>  
> +static void start_umount_propagation(struct mount *child,
> +				     struct list_head *to_umount)
> +{
> +	do {
> +		struct mount *parent = child->mnt_parent;
> +
> +		if ((child->mnt.mnt_flags & MNT_UMOUNT) ||
> +		    !list_empty(&child->mnt_mounts))
> +			return;
> +
> +		if (!IS_MNT_LOCKED(child))
> +			list_move_tail(&child->mnt_child, to_umount);
> +		else
> +			list_move_tail(&child->mnt_child, &parent->mnt_umounts);
> +
> +		child = NULL;
> +		if (IS_MNT_MARKED(parent))
> +			child = parent;
> +	} while (child);
> +}
> +
> +static void end_umount_propagation(struct mount *child)
> +{
> +	struct mount *parent = child->mnt_parent;
> +
> +	if (!list_empty(&parent->mnt_umounts))
> +		list_splice_tail_init(&parent->mnt_umounts, &parent->mnt_mounts);
> +}
> +
> +
>  /*
>   * collect all mounts that receive propagation from the mount in @list,
>   * and return these additional mounts in the same list.
> @@ -447,14 +499,39 @@ static void __propagate_umount(struct mount *mnt)
>   *
>   * vfsmount lock must be held for write
>   */
> -int propagate_umount(struct list_head *list)
> +void propagate_umount(struct list_head *list)
>  {
>  	struct mount *mnt;
> +	LIST_HEAD(to_umount);
> +	LIST_HEAD(tmp_list);
> +
> +	/* Find candidates for unmounting */
> +	list_for_each_entry(mnt, list, mnt_list) {
> +		struct mount *child;
> +		for (child = propagation_visit_child(mnt, mnt); child;
> +		     child = propagation_visit_child(child, mnt))
> +			start_umount_propagation(child, &to_umount);
> +	}
>  
> -	list_for_each_entry_reverse(mnt, list, mnt_list)
> -		mark_umount_candidates(mnt);
> +	/* Begin unmounting */
> +	while (!list_empty(&to_umount)) {
> +		mnt = list_first_entry(&to_umount, struct mount, mnt_child);
>  
> -	list_for_each_entry(mnt, list, mnt_list)
> -		__propagate_umount(mnt);
> -	return 0;
> +		list_del_init(&mnt->mnt_child);
> +		mnt->mnt.mnt_flags |= MNT_UMOUNT;
> +		list_move_tail(&mnt->mnt_list, &tmp_list);
> +
> +		if (!list_empty(&mnt->mnt_umounts))
> +			list_splice_tail_init(&mnt->mnt_umounts, &to_umount);
> +	}
> +
> +	/* Cleanup the mount propagation tree */
> +	list_for_each_entry(mnt, list, mnt_list) {
> +		struct mount *child;
> +		for (child = propagation_revisit_child(mnt, mnt); child;
> +		     child = propagation_revisit_child(child, mnt))
> +			end_umount_propagation(child);
> +	}
> +
> +	list_splice_tail(&tmp_list, list);
>  }
> diff --git a/fs/pnode.h b/fs/pnode.h
> index 550f5a8b4fcf..38c6cdb96b34 100644
> --- a/fs/pnode.h
> +++ b/fs/pnode.h
> @@ -41,7 +41,7 @@ static inline void set_mnt_shared(struct mount *mnt)
>  void change_mnt_propagation(struct mount *, int);
>  int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
>  		struct hlist_head *);
> -int propagate_umount(struct list_head *);
> +void propagate_umount(struct list_head *);
>  int propagate_mount_busy(struct mount *, int);
>  void propagate_mount_unlock(struct mount *);
>  void mnt_release_group_id(struct mount *);
> -- 
> 2.10.1
> 

[-- Attachment #2: 0001-mount-skip-all-mounts-from-a-shared-group-if-one-is-.patch --]
[-- Type: text/plain, Size: 1436 bytes --]

From 8e0f45c0272aa1f789d1657a0acc98c58919dcc3 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@openvz.org>
Date: Tue, 25 Oct 2016 13:57:31 -0700
Subject: [PATCH] mount: skip all mounts from a shared group if one is marked

If we meet a marked mount, it means that all mounts from
its group have already been revised.

Signed-off-by: Andrei Vagin <avagin@openvz.org>
---
 fs/pnode.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/fs/pnode.c b/fs/pnode.c
index 8fd1a3f..ebb7134 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -426,10 +426,16 @@ static struct mount *propagation_visit_child(struct mount *last_child,
 		if (child && !IS_MNT_MARKED(child))
 			return child;
 
-		if (!child)
+		if (!child) {
 			m = propagation_next(m, origin);
-		else
+		} else {
+			if (IS_MNT_MARKED(child)) {
+				if (m->mnt_group_id == origin->mnt_group_id)
+					return NULL;
+				m = m->mnt_master;
+			}
 			m = propagation_next_sib(m, origin);
+		}
 	}
 	return NULL;
 }
@@ -456,8 +462,14 @@ static struct mount *propagation_revisit_child(struct mount *last_child,
 
 		if (!child)
 			m = propagation_next(m, origin);
-		else
+		else {
+			if (!IS_MNT_MARKED(child)) {
+				if (m->mnt_group_id == origin->mnt_group_id)
+					return NULL;
+				m = m->mnt_master;
+			}
 			m = propagation_next_sib(m, origin);
+		}
 	}
 	return NULL;
 }
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH v2] mount: In propagate_umount handle overlapping mount propagation trees
  2016-10-25 20:58                                             ` Andrei Vagin
@ 2016-10-25 21:45                                                 ` Eric W. Biederman
  0 siblings, 0 replies; 36+ messages in thread
From: Eric W. Biederman @ 2016-10-25 21:45 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: linux-fsdevel, LKML, Linux Containers, Andrey Vagin, Alexander Viro

Andrei Vagin <avagin@virtuozzo.com> writes:

> On Sat, Oct 22, 2016 at 02:42:03PM -0500, Eric W. Biederman wrote:
>> 
>> Andrei,
>> 
>> This fixes the issue you have reported and through a refactoring
>> makes the code simpler and easier to verify.  That said I find your
>> last test case very interesting.   While looking at it in detail
>> I have realized I don't fully understand why we have both lookup_mnt and
>> lookup_mnt_last, so I can't say that this change is fully correct.
>> 
>> Outside of propagate_umount I don't have concerns, but I am not 100%
>> convinced that my change to lookup_mnt_last does the right thing
>> in the case of propagate_umount.
>> 
>> I do see why your last test case scales badly.  Long chains of shared
>> mounts that we can't skip.  At the same time I don't really understand
>> that case.  Part of it has to do with multiple child mounts of the same
>> mount on the same mountpoint.
>> 
>> So I am working through my concerns.  In the meantime I figured it
>> would be useful to post this version.  As this version is clearly better
>> than the versions of this change that have come before it.
>
> Hi Eric,
>
> I have tested this version and it works fine.
>
> As for the last test case, could you look at the attached patch?
> The idea is that we can skip all mounts from a shared group if one
> of them is already marked.
>
>> 
>> Eric
>> 
>> From: "Eric W. Biederman" <ebiederm@xmission.com>
>> Date: Thu, 13 Oct 2016 13:27:19 -0500
>> 
>> Andrei Vagin pointed out that the time to execute propagate_umount can go
>> non-linear (and take a ludicrous amount of time) when the mount
>> propagation trees of the mounts to be unmounted by a lazy unmount
>> overlap.
>> 
>> While investigating the horrible performance I realized that in
>> the case of overlapping mount trees, since the addition of locked
>> mount support, the code has been failing to unmount all of the
>> mounts it should have been unmounting.
>> 
>> Make the walk of the mount propagation trees nearly linear by using
>> MNT_MARK to mark pieces of the mount propagation trees that have
>> already been visited, allowing subsequent walks to skip over
>> subtrees.
>> 
>> Make the processing of mounts order independent by adding a list of
>> mount entries that need to be unmounted, and simply adding a mount to
>> that list when it becomes apparent the mount can safely be unmounted.
>> For mounts that are locked on other mounts but otherwise could be
>> unmounted, move them from their parent's mnt_mounts to mnt_umounts so
>> that if and when their parent becomes unmounted these mounts can be
>> added to the list of mounts to unmount.
>> 
>> Add a final pass to clear MNT_MARK and to restore mnt_mounts
>> from mnt_umounts for anything that did not get unmounted.
>> 
>> Add the functions propagation_visit_child and propagation_revisit_child
>> to coordinate walking of the mount tree and setting and clearing the
>> mount mark.
>> 
>> The skipping of already unmounted mounts has been moved from
>> __lookup_mnt_last to mark_umount_candidates, so that the new
>> propagation functions can notice when the propagation tree passes
>> through the initial set of unmounted mounts.  Except in umount_tree as
>> part of the unmounting process the only place where unmounted mounts
>> should be found are in unmounted subtrees.  All of the other callers
>> of __lookup_mnt_last are from mounted subtrees, so not checking for
>> unmounted mounts should not affect them.
>> 
>> A script to generate overlapping mount propagation trees:
>> $ cat run.sh
>> mount -t tmpfs test-mount /mnt
>> mount --make-shared /mnt
>> for i in `seq $1`; do
>>         mkdir /mnt/test.$i
>>         mount --bind /mnt /mnt/test.$i
>> done
>> cat /proc/mounts | grep test-mount | wc -l
>> time umount -l /mnt
>> $ for i in `seq 10 16`; do echo $i; unshare -Urm bash ./run.sh $i; done
>> 
>> Here are the performance numbers with and without the patch:
>> 
>> mhash  |  8192   |  8192  |  8192       | 131072 | 131072      | 104857 | 104857
>> mounts | before  | after  | after (sys) | after  | after (sys) |  after | after (sys)
>> -------------------------------------------------------------------------------------
>>   1024 |  0.071s | 0.020s | 0.000s      | 0.022s | 0.004s      | 0.020s | 0.004s
>>   2048 |  0.184s | 0.022s | 0.004s      | 0.023s | 0.004s      | 0.022s | 0.008s
>>   4096 |  0.604s | 0.025s | 0.020s      | 0.029s | 0.008s      | 0.026s | 0.004s
>>   8912 |  4.471s | 0.053s | 0.020s      | 0.051s | 0.024s      | 0.047s | 0.016s
>>  16384 | 34.826s | 0.088s | 0.060s      | 0.081s | 0.048s      | 0.082s | 0.052s
>>  32768 |         | 0.216s | 0.172s      | 0.160s | 0.124s      | 0.160s | 0.096s
>>  65536 |         | 0.819s | 0.726s      | 0.330s | 0.260s      | 0.338s | 0.256s
>> 131072 |         | 4.502s | 4.168s      | 0.707s | 0.580s      | 0.709s | 0.592s
>> 
>> Andrei Vagin reports fixing the performance problem is part of the
>> work to fix CVE-2016-6213.
>> 
>> A script for a pathological set of mounts:
>> 
>> $ cat pathological.sh
>> 
>> mount -t tmpfs base /mnt
>> mount --make-shared /mnt
>> mkdir -p /mnt/b
>> 
>> mount -t tmpfs test1 /mnt/b
>> mount --make-shared /mnt/b
>> mkdir -p /mnt/b/10
>> 
>> mount -t tmpfs test2 /mnt/b/10
>> mount --make-shared /mnt/b/10
>> mkdir -p /mnt/b/10/20
>> 
>> mount --rbind /mnt/b /mnt/b/10/20
>> 
>> unshare -Urm sleep 2
>> umount -l /mnt/b
>> wait %%
>> 
>> $ unshare -Urm pathological.sh
>> 
>> Cc: stable@vger.kernel.org
>> Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
>> Fixes: 0c56fe31420c ("mnt: Don't propagate unmounts to locked mounts")
>> Reported-by: Andrei Vagin <avagin@openvz.org>
>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>> ---
>>  fs/mount.h     |   1 +
>>  fs/namespace.c |   7 +--
>>  fs/pnode.c     | 179 +++++++++++++++++++++++++++++++++++++++++----------------
>>  fs/pnode.h     |   2 +-
>>  4 files changed, 133 insertions(+), 56 deletions(-)
>> 
>> diff --git a/fs/mount.h b/fs/mount.h
>> index d2e25d7b64b3..00fe0d1d6ba7 100644
>> --- a/fs/mount.h
>> +++ b/fs/mount.h
>> @@ -58,6 +58,7 @@ struct mount {
>>  	struct mnt_namespace *mnt_ns;	/* containing namespace */
>>  	struct mountpoint *mnt_mp;	/* where is it mounted */
>>  	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
>> +	struct list_head mnt_umounts;	/* list of children that are being unmounted */
>>  #ifdef CONFIG_FSNOTIFY
>>  	struct hlist_head mnt_fsnotify_marks;
>>  	__u32 mnt_fsnotify_mask;
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index e6c234b1a645..73801391bb00 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -237,6 +237,7 @@ static struct mount *alloc_vfsmnt(const char *name)
>>  		INIT_LIST_HEAD(&mnt->mnt_slave_list);
>>  		INIT_LIST_HEAD(&mnt->mnt_slave);
>>  		INIT_HLIST_NODE(&mnt->mnt_mp_list);
>> +		INIT_LIST_HEAD(&mnt->mnt_umounts);
>>  #ifdef CONFIG_FSNOTIFY
>>  		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
>>  #endif
>> @@ -650,13 +651,11 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
>>  	p = __lookup_mnt(mnt, dentry);
>>  	if (!p)
>>  		goto out;
>> -	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
>> -		res = p;
>> +	res = p;
>>  	hlist_for_each_entry_continue(p, mnt_hash) {
>>  		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
>>  			break;
>> -		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
>> -			res = p;
>> +		res = p;
>>  	}
>>  out:
>>  	return res;
>> diff --git a/fs/pnode.c b/fs/pnode.c
>> index 234a9ac49958..8fd1a3fb420c 100644
>> --- a/fs/pnode.c
>> +++ b/fs/pnode.c
>> @@ -134,7 +134,8 @@ void change_mnt_propagation(struct mount *mnt, int type)
>>  }
>>  
>>  /*
>> - * get the next mount in the propagation tree.
>> + * get the next mount that is not a slave of the current mount in the
>> + * propagation tree.
>>   * @m: the mount seen last
>>   * @origin: the original mount from where the tree walk initiated
>>   *
>> @@ -143,13 +144,9 @@ void change_mnt_propagation(struct mount *mnt, int type)
>>   * vfsmount found while iterating with propagation_next() is
>>   * a peer of one we'd found earlier.
>>   */
>> -static struct mount *propagation_next(struct mount *m,
>> -					 struct mount *origin)
>> +static struct mount *propagation_next_sib(struct mount *m,
>> +						struct mount *origin)
>>  {
>> -	/* are there any slaves of this mount? */
>> -	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
>> -		return first_slave(m);
>> -
>>  	while (1) {
>>  		struct mount *master = m->mnt_master;
>>  
>> @@ -164,6 +161,26 @@ static struct mount *propagation_next(struct mount *m,
>>  	}
>>  }
>>  
>> +/*
>> + * get the next mount in the propagation tree.
>> + * @m: the mount seen last
>> + * @origin: the original mount from where the tree walk initiated
>> + *
>> + * Note that peer groups form contiguous segments of slave lists.
>> + * We rely on that in get_source() to be able to find out if
>> + * vfsmount found while iterating with propagation_next() is
>> + * a peer of one we'd found earlier.
>> + */
>> +static struct mount *propagation_next(struct mount *m,
>> +					 struct mount *origin)
>> +{
>> +	/* are there any slaves of this mount? */
>> +	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
>> +		return first_slave(m);
>> +
>> +	return propagation_next_sib(m, origin);
>> +}
>> +
>>  static struct mount *next_group(struct mount *m, struct mount *origin)
>>  {
>>  	while (1) {
>> @@ -389,57 +406,92 @@ void propagate_mount_unlock(struct mount *mnt)
>>  	}
>>  }
>>  
>> -/*
>> - * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
>> - */
>> -static void mark_umount_candidates(struct mount *mnt)
>> +static struct mount *propagation_visit_child(struct mount *last_child,
>> +					    struct mount *origin_child)
>>  {
>> -	struct mount *parent = mnt->mnt_parent;
>> -	struct mount *m;
>> +	struct mount *m = last_child->mnt_parent;
>> +	struct mount *origin = origin_child->mnt_parent;
>> +	struct dentry *mountpoint = origin_child->mnt_mountpoint;
>> +	struct mount *child;
>>  
>> -	BUG_ON(parent == mnt);
>> +	/* Has this part of the propgation tree already been visited? */
>> +	/* Has this part of the propagation tree already been visited? */
>> +		return NULL;
>>  
>> -	for (m = propagation_next(parent, parent); m;
>> -			m = propagation_next(m, parent)) {
>> -		struct mount *child = __lookup_mnt_last(&m->mnt,
>> -						mnt->mnt_mountpoint);
>> -		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
>> -			SET_MNT_MARK(child);
>> -		}
>> +	SET_MNT_MARK(last_child);
>> +
>> +	m = propagation_next(m, origin);
>> +	while (m) {
>> +		child = __lookup_mnt_last(&m->mnt, mountpoint);
>> +		if (child && !IS_MNT_MARKED(child))
>> +			return child;
>> +
>> +		if (!child)
>> +			m = propagation_next(m, origin);
>> +		else
>> +			m = propagation_next_sib(m, origin);
>>  	}
>> +	return NULL;
>>  }
>>  
>> -/*
>> - * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
>> - * parent propagates to.
>> - */
>> -static void __propagate_umount(struct mount *mnt)
>> +static struct mount *propagation_revisit_child(struct mount *last_child,
>> +					       struct mount *origin_child)
>>  {
>> -	struct mount *parent = mnt->mnt_parent;
>> -	struct mount *m;
>> +	struct mount *m = last_child->mnt_parent;
>> +	struct mount *origin = origin_child->mnt_parent;
>> +	struct dentry *mountpoint = origin_child->mnt_mountpoint;
>> +	struct mount *child;
>>  
>> -	BUG_ON(parent == mnt);
>> +	/* Has this part of the propagation tree already been revisited? */
>> +	if (!IS_MNT_MARKED(last_child))
>> +		return NULL;
>>  
>> -	for (m = propagation_next(parent, parent); m;
>> -			m = propagation_next(m, parent)) {
>> +	CLEAR_MNT_MARK(last_child);
>>  
>> -		struct mount *child = __lookup_mnt_last(&m->mnt,
>> -						mnt->mnt_mountpoint);
>> -		/*
>> -		 * umount the child only if the child has no children
>> -		 * and the child is marked safe to unmount.
>> -		 */
>> -		if (!child || !IS_MNT_MARKED(child))
>> -			continue;
>> -		CLEAR_MNT_MARK(child);
>> -		if (list_empty(&child->mnt_mounts)) {
>> -			list_del_init(&child->mnt_child);
>> -			child->mnt.mnt_flags |= MNT_UMOUNT;
>> -			list_move_tail(&child->mnt_list, &mnt->mnt_list);
>> -		}
>> +	m = propagation_next(m, origin);
>> +	while (m) {
>> +		child = __lookup_mnt_last(&m->mnt, mountpoint);
>> +		if (child && IS_MNT_MARKED(child))
>> +			return child;
>> +
>> +		if (!child)
>> +			m = propagation_next(m, origin);
>> +		else
>> +			m = propagation_next_sib(m, origin);
>>  	}
>> +	return NULL;
>>  }
>>  
>> +static void start_umount_propagation(struct mount *child,
>> +				     struct list_head *to_umount)
>> +{
>> +	do {
>> +		struct mount *parent = child->mnt_parent;
>> +
>> +		if ((child->mnt.mnt_flags & MNT_UMOUNT) ||
>> +		    !list_empty(&child->mnt_mounts))
>> +			return;
>> +
>> +		if (!IS_MNT_LOCKED(child))
>> +			list_move_tail(&child->mnt_child, to_umount);
>> +		else
>> +			list_move_tail(&child->mnt_child, &parent->mnt_umounts);
>> +
>> +		child = NULL;
>> +		if (IS_MNT_MARKED(parent))
>> +			child = parent;
>> +	} while (child);
>> +}
>> +
>> +static void end_umount_propagation(struct mount *child)
>> +{
>> +	struct mount *parent = child->mnt_parent;
>> +
>> +	if (!list_empty(&parent->mnt_umounts))
>> +		list_splice_tail_init(&parent->mnt_umounts, &parent->mnt_mounts);
>> +}
>> +
>> +
>>  /*
>>   * collect all mounts that receive propagation from the mount in @list,
>>   * and return these additional mounts in the same list.
>> @@ -447,14 +499,39 @@ static void __propagate_umount(struct mount *mnt)
>>   *
>>   * vfsmount lock must be held for write
>>   */
>> -int propagate_umount(struct list_head *list)
>> +void propagate_umount(struct list_head *list)
>>  {
>>  	struct mount *mnt;
>> +	LIST_HEAD(to_umount);
>> +	LIST_HEAD(tmp_list);
>> +
>> +	/* Find candidates for unmounting */
>> +	list_for_each_entry(mnt, list, mnt_list) {
>> +		struct mount *child;
>> +		for (child = propagation_visit_child(mnt, mnt); child;
>> +		     child = propagation_visit_child(child, mnt))
>> +			start_umount_propagation(child, &to_umount);
>> +	}
>>  
>> -	list_for_each_entry_reverse(mnt, list, mnt_list)
>> -		mark_umount_candidates(mnt);
>> +	/* Begin unmounting */
>> +	while (!list_empty(&to_umount)) {
>> +		mnt = list_first_entry(&to_umount, struct mount, mnt_child);
>>  
>> -	list_for_each_entry(mnt, list, mnt_list)
>> -		__propagate_umount(mnt);
>> -	return 0;
>> +		list_del_init(&mnt->mnt_child);
>> +		mnt->mnt.mnt_flags |= MNT_UMOUNT;
>> +		list_move_tail(&mnt->mnt_list, &tmp_list);
>> +
>> +		if (!list_empty(&mnt->mnt_umounts))
>> +			list_splice_tail_init(&mnt->mnt_umounts, &to_umount);
>> +	}
>> +
>> +	/* Cleanup the mount propagation tree */
>> +	list_for_each_entry(mnt, list, mnt_list) {
>> +		struct mount *child;
>> +		for (child = propagation_revisit_child(mnt, mnt); child;
>> +		     child = propagation_revisit_child(child, mnt))
>> +			end_umount_propagation(child);
>> +	}
>> +
>> +	list_splice_tail(&tmp_list, list);
>>  }
>> diff --git a/fs/pnode.h b/fs/pnode.h
>> index 550f5a8b4fcf..38c6cdb96b34 100644
>> --- a/fs/pnode.h
>> +++ b/fs/pnode.h
>> @@ -41,7 +41,7 @@ static inline void set_mnt_shared(struct mount *mnt)
>>  void change_mnt_propagation(struct mount *, int);
>>  int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
>>  		struct hlist_head *);
>> -int propagate_umount(struct list_head *);
>> +void propagate_umount(struct list_head *);
>>  int propagate_mount_busy(struct mount *, int);
>>  void propagate_mount_unlock(struct mount *);
>>  void mnt_release_group_id(struct mount *);
>> -- 
>> 2.10.1
>> 
>
> From 8e0f45c0272aa1f789d1657a0acc98c58919dcc3 Mon Sep 17 00:00:00 2001
> From: Andrei Vagin <avagin@openvz.org>
> Date: Tue, 25 Oct 2016 13:57:31 -0700
> Subject: [PATCH] mount: skip all mounts from a shared group if one is marked
>
> If we meet a marked mount, it means that all mounts from
> its group have already been revised.
>
> Signed-off-by: Andrei Vagin <avagin@openvz.org>
> ---
>  fs/pnode.c | 18 +++++++++++++++---
>  1 file changed, 15 insertions(+), 3 deletions(-)
>
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 8fd1a3f..ebb7134 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -426,10 +426,16 @@ static struct mount *propagation_visit_child(struct mount *last_child,
>  		if (child && !IS_MNT_MARKED(child))
>  			return child;
>  
> -		if (!child)
> +		if (!child) {
>  			m = propagation_next(m, origin);
> -		else
> +		} else {
> +			if (IS_MNT_MARKED(child)) {
> +				if (m->mnt_group_id == origin->mnt_group_id)
> +					return NULL;
> +				m = m->mnt_master;
> +			}
>  			m = propagation_next_sib(m, origin);
> +		}
>  	}
>  	return NULL;
>  }
> @@ -456,8 +462,14 @@ static struct mount *propagation_revisit_child(struct mount *last_child,
>  
>  		if (!child)
>  			m = propagation_next(m, origin);
> -		else
> +		else {
> +			if (!IS_MNT_MARKED(child)) {
> +				if (m->mnt_group_id == origin->mnt_group_id)
> +					return NULL;
> +				m = m->mnt_master;
> +			}
>  			m = propagation_next_sib(m, origin);
> +		}
>  	}
>  	return NULL;
>  }

That is certainly interesting.  The problem is that the reason we were
going slow is that there were in fact mounts in the shared group that
had not been traversed.

And in fact the entire idea of visiting a vfsmount mountpoint pair
exactly once is wrong in the face of shadow mounts.  For a vfsmount
mountpoint pair that has shadow mounts the number of shadow mounts needs
to be decreased by one each time the propagation tree is traversed
during unmount. Which means that as far as I can see we have to kill
shadow mounts to correctly optimize this code.  Once shadow mounts are
gone I don't know of a case where we need your optimization.

I am busily verifying my patch to kill shadow mounts but the following
patch is the minimal version.  As far as I can see propagate_one
is the only place we create shadow mounts, and holding the
namespace_lock over attach_recursive_mnt, propagate_mnt, and
propagate_one is sufficient for that __lookup_mnt to be completely safe.

diff --git a/fs/pnode.c b/fs/pnode.c
index 234a9ac49958..b14119b370d4 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -217,6 +217,9 @@ static int propagate_one(struct mount *m)
        /* skip if mountpoint isn't covered by it */
        if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
                return 0;
+       /* skip if mountpoint already has a mount on it */
+       if (__lookup_mnt(&m->mnt, mp->m_dentry))
+               return 0;
        if (peers(m, last_dest)) {
                type = CL_MAKE_SHARED;
        } else {

If you run with that patch you will see that there are go-faster stripes.
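
For anyone following along: a shadow mount is a second child mount
attached to the same parent mount at the same mountpoint dentry, which
is exactly what the __lookup_mnt() check above refuses to create.  A
hedged one-liner to look for them (as I read /proc/self/mountinfo,
field 2 is the parent mount ID and field 5 is the mount point; an
ordinary over-mount does not show up here because its parent is the
mount stacked below it):

$ awk '{ print $2, $5 }' /proc/self/mountinfo | sort | uniq -d

Any line it prints is a (parent ID, mount point) pair with more than
one mount attached to it.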

Eric

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH v2] mount: In propagate_umount handle overlapping mount propagation trees
@ 2016-10-25 21:45                                                 ` Eric W. Biederman
  0 siblings, 0 replies; 36+ messages in thread
From: Eric W. Biederman @ 2016-10-25 21:45 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: Andrey Vagin, Alexander Viro, Linux Containers, linux-fsdevel, LKML

Andrei Vagin <avagin@virtuozzo.com> writes:

> On Sat, Oct 22, 2016 at 02:42:03PM -0500, Eric W. Biederman wrote:
>> 
>> Andrei,
>> 
>> This fixes the issue you have reported and through a refactoring
>> makes the code simpler and easier to verify.  That said I find your
>> last test case very interesting.   While looking at it in detail
>> I have realized I don't fully understand why we have both lookup_mnt and
>> lookup_mnt_last, so I can't say that this change is fully correct.
>> 
>> Outside of propogate_umount I am don't have concerns but I am not 100%
>> convinced that my change to lookup_mnt_last does the right thing
>> in the case of propagate_umount.
>> 
>> I do see why your last test case scales badly.  Long chains of shared
>> mounts that we can't skip.  At the same time I don't really understand
>> that case.  Part of it has to do with multiple child mounts of the same
>> mount on the same mountpoint.
>> 
>> So I am working through my concerns.  In the mean time I figured it
>> would be useful to post this version.  As this version is clearly better
>> than the version of this change that have come before it.
>
> Hi Eric,
>
> I have tested this version and it works fine.
>
> As for the the last test case, could you look at the attached patch?
> The idea is that we can skip all mounts from a shared group, if one
> of them already marked.
>
>> 
>> Eric
>> 
>> From: "Eric W. Biederman" <ebiederm@xmission.com>
>> Date: Thu, 13 Oct 2016 13:27:19 -0500
>> 
>> Adrei Vagin pointed out that time to executue propagate_umount can go
>> non-linear (and take a ludicrious amount of time) when the mount
>> propogation trees of the mounts to be unmunted by a lazy unmount
>> overlap.
>> 
>> While investigating the horrible performance I realized that in
>> the case overlapping mount trees since the addition of locked
>> mount support the code has been failing to unmount all of the
>> mounts it should have been unmounting.
>> 
>> Make the walk of the mount propagation trees nearly linear by using
>> MNT_MARK to mark pieces of the mount propagation trees that have
>> already been visited, allowing subsequent walks to skip over
>> subtrees.
>> 
>> Make the processing of mounts order independent by adding a list of
>> mount entries that need to be unmounted, and simply adding a mount to
>> that list when it becomes apparent the mount can safely be unmounted.
>> For mounts that are locked on other mounts but otherwise could be
>> unmounted move them from their parnets mnt_mounts to mnt_umounts so
>> that if and when their parent becomes unmounted these mounts can be
>> added to the list of mounts to unmount.
>> 
>> Add a final pass to clear MNT_MARK and to restore mnt_mounts
>> from mnt_umounts for anything that did not get unmounted.
>> 
>> Add the functions propagation_visit_next and propagation_revisit_next
>> to coordinate walking of the mount tree and setting and clearing the
>> mount mark.
>> 
>> The skipping of already unmounted mounts has been moved from
>> __lookup_mnt_last to mark_umount_candidates, so that the new
>> propagation functions can notice when the propagation tree passes
>> through the initial set of unmounted mounts.  Except in umount_tree, as
>> part of the unmounting process, the only place where unmounted mounts
>> should be found is in unmounted subtrees.  All of the other callers
>> of __lookup_mnt_last are from mounted subtrees, so not checking for
>> unmounted mounts should not affect them.
>> 
>> A script to generate overlapping mount propagation trees:
>> $ cat run.sh
>> mount -t tmpfs test-mount /mnt
>> mount --make-shared /mnt
>> for i in `seq $1`; do
>>         mkdir /mnt/test.$i
>>         mount --bind /mnt /mnt/test.$i
>> done
>> cat /proc/mounts | grep test-mount | wc -l
>> time umount -l /mnt
>> $ for i in `seq 10 16`; do echo $i; unshare -Urm bash ./run.sh $i; done
>> 
>> Here are the performance numbers with and without the patch:
>> 
>> mhash  |  8192   |  8192  |  8192       | 131072 | 131072      | 104857 | 104857
>> mounts | before  | after  | after (sys) | after  | after (sys) |  after | after (sys)
>> -------------------------------------------------------------------------------------
>>   1024 |  0.071s | 0.020s | 0.000s      | 0.022s | 0.004s      | 0.020s | 0.004s
>>   2048 |  0.184s | 0.022s | 0.004s      | 0.023s | 0.004s      | 0.022s | 0.008s
>>   4096 |  0.604s | 0.025s | 0.020s      | 0.029s | 0.008s      | 0.026s | 0.004s
>>   8912 |  4.471s | 0.053s | 0.020s      | 0.051s | 0.024s      | 0.047s | 0.016s
>>  16384 | 34.826s | 0.088s | 0.060s      | 0.081s | 0.048s      | 0.082s | 0.052s
>>  32768 |         | 0.216s | 0.172s      | 0.160s | 0.124s      | 0.160s | 0.096s
>>  65536 |         | 0.819s | 0.726s      | 0.330s | 0.260s      | 0.338s | 0.256s
>> 131072 |         | 4.502s | 4.168s      | 0.707s | 0.580s      | 0.709s | 0.592s
>> 
>> Andrei Vagin reports that fixing the performance problem is part of the
>> work to fix CVE-2016-6213.
>> 
>> A script for a pathological set of mounts:
>> 
>> $ cat pathological.sh
>> 
>> mount -t tmpfs base /mnt
>> mount --make-shared /mnt
>> mkdir -p /mnt/b
>> 
>> mount -t tmpfs test1 /mnt/b
>> mount --make-shared /mnt/b
>> mkdir -p /mnt/b/10
>> 
>> mount -t tmpfs test2 /mnt/b/10
>> mount --make-shared /mnt/b/10
>> mkdir -p /mnt/b/10/20
>> 
>> mount --rbind /mnt/b /mnt/b/10/20
>> 
>> unshare -Urm sleep 2 &
>> umount -l /mnt/b
>> wait %%
>> 
>> $ unshare -Urm pathological.sh
>> 
>> Cc: stable@vger.kernel.org
>> Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
>> Fixes: 0c56fe31420c ("mnt: Don't propagate unmounts to locked mounts")
>> Reported-by: Andrei Vagin <avagin@openvz.org>
>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>> ---
>>  fs/mount.h     |   1 +
>>  fs/namespace.c |   7 +--
>>  fs/pnode.c     | 179 +++++++++++++++++++++++++++++++++++++++++----------------
>>  fs/pnode.h     |   2 +-
>>  4 files changed, 133 insertions(+), 56 deletions(-)
>> 
>> diff --git a/fs/mount.h b/fs/mount.h
>> index d2e25d7b64b3..00fe0d1d6ba7 100644
>> --- a/fs/mount.h
>> +++ b/fs/mount.h
>> @@ -58,6 +58,7 @@ struct mount {
>>  	struct mnt_namespace *mnt_ns;	/* containing namespace */
>>  	struct mountpoint *mnt_mp;	/* where is it mounted */
>>  	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
>> +	struct list_head mnt_umounts;	/* list of children that are being unmounted */
>>  #ifdef CONFIG_FSNOTIFY
>>  	struct hlist_head mnt_fsnotify_marks;
>>  	__u32 mnt_fsnotify_mask;
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index e6c234b1a645..73801391bb00 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -237,6 +237,7 @@ static struct mount *alloc_vfsmnt(const char *name)
>>  		INIT_LIST_HEAD(&mnt->mnt_slave_list);
>>  		INIT_LIST_HEAD(&mnt->mnt_slave);
>>  		INIT_HLIST_NODE(&mnt->mnt_mp_list);
>> +		INIT_LIST_HEAD(&mnt->mnt_umounts);
>>  #ifdef CONFIG_FSNOTIFY
>>  		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
>>  #endif
>> @@ -650,13 +651,11 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
>>  	p = __lookup_mnt(mnt, dentry);
>>  	if (!p)
>>  		goto out;
>> -	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
>> -		res = p;
>> +	res = p;
>>  	hlist_for_each_entry_continue(p, mnt_hash) {
>>  		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
>>  			break;
>> -		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
>> -			res = p;
>> +		res = p;
>>  	}
>>  out:
>>  	return res;
>> diff --git a/fs/pnode.c b/fs/pnode.c
>> index 234a9ac49958..8fd1a3fb420c 100644
>> --- a/fs/pnode.c
>> +++ b/fs/pnode.c
>> @@ -134,7 +134,8 @@ void change_mnt_propagation(struct mount *mnt, int type)
>>  }
>>  
>>  /*
>> - * get the next mount in the propagation tree.
>> + * get the next mount that is not a slave of the current mount in the
>> + * propagation tree.
>>   * @m: the mount seen last
>>   * @origin: the original mount from where the tree walk initiated
>>   *
>> @@ -143,13 +144,9 @@ void change_mnt_propagation(struct mount *mnt, int type)
>>   * vfsmount found while iterating with propagation_next() is
>>   * a peer of one we'd found earlier.
>>   */
>> -static struct mount *propagation_next(struct mount *m,
>> -					 struct mount *origin)
>> +static struct mount *propagation_next_sib(struct mount *m,
>> +						struct mount *origin)
>>  {
>> -	/* are there any slaves of this mount? */
>> -	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
>> -		return first_slave(m);
>> -
>>  	while (1) {
>>  		struct mount *master = m->mnt_master;
>>  
>> @@ -164,6 +161,26 @@ static struct mount *propagation_next(struct mount *m,
>>  	}
>>  }
>>  
>> +/*
>> + * get the next mount in the propagation tree.
>> + * @m: the mount seen last
>> + * @origin: the original mount from where the tree walk initiated
>> + *
>> + * Note that peer groups form contiguous segments of slave lists.
>> + * We rely on that in get_source() to be able to find out if
>> + * vfsmount found while iterating with propagation_next() is
>> + * a peer of one we'd found earlier.
>> + */
>> +static struct mount *propagation_next(struct mount *m,
>> +					 struct mount *origin)
>> +{
>> +	/* are there any slaves of this mount? */
>> +	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
>> +		return first_slave(m);
>> +
>> +	return propagation_next_sib(m, origin);
>> +}
>> +
>>  static struct mount *next_group(struct mount *m, struct mount *origin)
>>  {
>>  	while (1) {
>> @@ -389,57 +406,92 @@ void propagate_mount_unlock(struct mount *mnt)
>>  	}
>>  }
>>  
>> -/*
>> - * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
>> - */
>> -static void mark_umount_candidates(struct mount *mnt)
>> +static struct mount *propagation_visit_child(struct mount *last_child,
>> +					    struct mount *origin_child)
>>  {
>> -	struct mount *parent = mnt->mnt_parent;
>> -	struct mount *m;
>> +	struct mount *m = last_child->mnt_parent;
>> +	struct mount *origin = origin_child->mnt_parent;
>> +	struct dentry *mountpoint = origin_child->mnt_mountpoint;
>> +	struct mount *child;
>>  
>> -	BUG_ON(parent == mnt);
>> +	/* Has this part of the propagation tree already been visited? */
>> +	if (IS_MNT_MARKED(last_child))
>> +		return NULL;
>>  
>> -	for (m = propagation_next(parent, parent); m;
>> -			m = propagation_next(m, parent)) {
>> -		struct mount *child = __lookup_mnt_last(&m->mnt,
>> -						mnt->mnt_mountpoint);
>> -		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
>> -			SET_MNT_MARK(child);
>> -		}
>> +	SET_MNT_MARK(last_child);
>> +
>> +	m = propagation_next(m, origin);
>> +	while (m) {
>> +		child = __lookup_mnt_last(&m->mnt, mountpoint);
>> +		if (child && !IS_MNT_MARKED(child))
>> +			return child;
>> +
>> +		if (!child)
>> +			m = propagation_next(m, origin);
>> +		else
>> +			m = propagation_next_sib(m, origin);
>>  	}
>> +	return NULL;
>>  }
>>  
>> -/*
>> - * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
>> - * parent propagates to.
>> - */
>> -static void __propagate_umount(struct mount *mnt)
>> +static struct mount *propagation_revisit_child(struct mount *last_child,
>> +					       struct mount *origin_child)
>>  {
>> -	struct mount *parent = mnt->mnt_parent;
>> -	struct mount *m;
>> +	struct mount *m = last_child->mnt_parent;
>> +	struct mount *origin = origin_child->mnt_parent;
>> +	struct dentry *mountpoint = origin_child->mnt_mountpoint;
>> +	struct mount *child;
>>  
>> -	BUG_ON(parent == mnt);
>> +	/* Has this part of the propagation tree already been revisited? */
>> +	if (!IS_MNT_MARKED(last_child))
>> +		return NULL;
>>  
>> -	for (m = propagation_next(parent, parent); m;
>> -			m = propagation_next(m, parent)) {
>> +	CLEAR_MNT_MARK(last_child);
>>  
>> -		struct mount *child = __lookup_mnt_last(&m->mnt,
>> -						mnt->mnt_mountpoint);
>> -		/*
>> -		 * umount the child only if the child has no children
>> -		 * and the child is marked safe to unmount.
>> -		 */
>> -		if (!child || !IS_MNT_MARKED(child))
>> -			continue;
>> -		CLEAR_MNT_MARK(child);
>> -		if (list_empty(&child->mnt_mounts)) {
>> -			list_del_init(&child->mnt_child);
>> -			child->mnt.mnt_flags |= MNT_UMOUNT;
>> -			list_move_tail(&child->mnt_list, &mnt->mnt_list);
>> -		}
>> +	m = propagation_next(m, origin);
>> +	while (m) {
>> +		child = __lookup_mnt_last(&m->mnt, mountpoint);
>> +		if (child && IS_MNT_MARKED(child))
>> +			return child;
>> +
>> +		if (!child)
>> +			m = propagation_next(m, origin);
>> +		else
>> +			m = propagation_next_sib(m, origin);
>>  	}
>> +	return NULL;
>>  }
>>  
>> +static void start_umount_propagation(struct mount *child,
>> +				     struct list_head *to_umount)
>> +{
>> +	do {
>> +		struct mount *parent = child->mnt_parent;
>> +
>> +		if ((child->mnt.mnt_flags & MNT_UMOUNT) ||
>> +		    !list_empty(&child->mnt_mounts))
>> +			return;
>> +
>> +		if (!IS_MNT_LOCKED(child))
>> +			list_move_tail(&child->mnt_child, to_umount);
>> +		else
>> +			list_move_tail(&child->mnt_child, &parent->mnt_umounts);
>> +
>> +		child = NULL;
>> +		if (IS_MNT_MARKED(parent))
>> +			child = parent;
>> +	} while (child);
>> +}
>> +
>> +static void end_umount_propagation(struct mount *child)
>> +{
>> +	struct mount *parent = child->mnt_parent;
>> +
>> +	if (!list_empty(&parent->mnt_umounts))
>> +		list_splice_tail_init(&parent->mnt_umounts, &parent->mnt_mounts);
>> +}
>> +
>> +
>>  /*
>>   * collect all mounts that receive propagation from the mount in @list,
>>   * and return these additional mounts in the same list.
>> @@ -447,14 +499,39 @@ static void __propagate_umount(struct mount *mnt)
>>   *
>>   * vfsmount lock must be held for write
>>   */
>> -int propagate_umount(struct list_head *list)
>> +void propagate_umount(struct list_head *list)
>>  {
>>  	struct mount *mnt;
>> +	LIST_HEAD(to_umount);
>> +	LIST_HEAD(tmp_list);
>> +
>> +	/* Find candidates for unmounting */
>> +	list_for_each_entry(mnt, list, mnt_list) {
>> +		struct mount *child;
>> +		for (child = propagation_visit_child(mnt, mnt); child;
>> +		     child = propagation_visit_child(child, mnt))
>> +			start_umount_propagation(child, &to_umount);
>> +	}
>>  
>> -	list_for_each_entry_reverse(mnt, list, mnt_list)
>> -		mark_umount_candidates(mnt);
>> +	/* Begin unmounting */
>> +	while (!list_empty(&to_umount)) {
>> +		mnt = list_first_entry(&to_umount, struct mount, mnt_child);
>>  
>> -	list_for_each_entry(mnt, list, mnt_list)
>> -		__propagate_umount(mnt);
>> -	return 0;
>> +		list_del_init(&mnt->mnt_child);
>> +		mnt->mnt.mnt_flags |= MNT_UMOUNT;
>> +		list_move_tail(&mnt->mnt_list, &tmp_list);
>> +
>> +		if (!list_empty(&mnt->mnt_umounts))
>> +			list_splice_tail_init(&mnt->mnt_umounts, &to_umount);
>> +	}
>> +
>> +	/* Cleanup the mount propagation tree */
>> +	list_for_each_entry(mnt, list, mnt_list) {
>> +		struct mount *child;
>> +		for (child = propagation_revisit_child(mnt, mnt); child;
>> +		     child = propagation_revisit_child(child, mnt))
>> +			end_umount_propagation(child);
>> +	}
>> +
>> +	list_splice_tail(&tmp_list, list);
>>  }
>> diff --git a/fs/pnode.h b/fs/pnode.h
>> index 550f5a8b4fcf..38c6cdb96b34 100644
>> --- a/fs/pnode.h
>> +++ b/fs/pnode.h
>> @@ -41,7 +41,7 @@ static inline void set_mnt_shared(struct mount *mnt)
>>  void change_mnt_propagation(struct mount *, int);
>>  int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
>>  		struct hlist_head *);
>> -int propagate_umount(struct list_head *);
>> +void propagate_umount(struct list_head *);
>>  int propagate_mount_busy(struct mount *, int);
>>  void propagate_mount_unlock(struct mount *);
>>  void mnt_release_group_id(struct mount *);
>> -- 
>> 2.10.1
>> 
>
> From 8e0f45c0272aa1f789d1657a0acc98c58919dcc3 Mon Sep 17 00:00:00 2001
> From: Andrei Vagin <avagin@openvz.org>
> Date: Tue, 25 Oct 2016 13:57:31 -0700
> Subject: [PATCH] mount: skip all mounts from a shared group if one is marked
>
> If we meet a marked mount, it means that all mounts from
> its group have already been revised.
>
> Signed-off-by: Andrei Vagin <avagin@openvz.org>
> ---
>  fs/pnode.c | 18 +++++++++++++++---
>  1 file changed, 15 insertions(+), 3 deletions(-)
>
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 8fd1a3f..ebb7134 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -426,10 +426,16 @@ static struct mount *propagation_visit_child(struct mount *last_child,
>  		if (child && !IS_MNT_MARKED(child))
>  			return child;
>  
> -		if (!child)
> +		if (!child) {
>  			m = propagation_next(m, origin);
> -		else
> +		} else {
> +			if (IS_MNT_MARKED(child)) {
> +				if (m->mnt_group_id == origin->mnt_group_id)
> +					return NULL;
> +				m = m->mnt_master;
> +			}
>  			m = propagation_next_sib(m, origin);
> +		}
>  	}
>  	return NULL;
>  }
> @@ -456,8 +462,14 @@ static struct mount *propagation_revisit_child(struct mount *last_child,
>  
>  		if (!child)
>  			m = propagation_next(m, origin);
> -		else
> +		else {
> +			if (!IS_MNT_MARKED(child)) {
> +				if (m->mnt_group_id == origin->mnt_group_id)
> +					return NULL;
> +				m = m->mnt_master;
> +			}
>  			m = propagation_next_sib(m, origin);
> +		}
>  	}
>  	return NULL;
>  }

That is certainly interesting.  The problem is that the reason we were
going slow is that there were in fact mounts in the shared group that
had not been traversed.

And in fact the entire idea of visiting a vfsmount/mountpoint pair
exactly once is wrong in the face of shadow mounts.  For a
vfsmount/mountpoint pair that has shadow mounts, the number of shadow
mounts needs to be decreased by one each time the propagation tree is
traversed during unmount.  Which means that, as far as I can see, we
have to kill shadow mounts to correctly optimize this code.  Once
shadow mounts are gone I don't know of a case where we need your
optimization.
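
A minimal sketch of how a stacked (shadow) mount arises; the names are
made up for illustration, and it should be run under unshare -Urm like
the other test scripts in this thread:

mkdir A B
mount -t tmpfs --make-shared X A
mkdir A/a
mount -t tmpfs layer1 A/a      # first mount at mountpoint a
mount --bind A B               # B joins A's peer group
mount -t tmpfs layer2 B/a      # propagates back to A/a, stacking a
                               # second (shadow) mount over layer1

During unmount, each traversal of the propagation tree then has to
peel exactly one of the mounts stacked at A/a.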

I am busily verifying my patch to kill shadow mounts, but the following
patch is the minimal version.  As far as I can see propagate_one
is the only place we create shadow mounts, and holding the
namespace_lock over attach_recursive_mnt, propagate_mnt, and
propagate_one is sufficient for that __lookup_mnt call to be completely safe.

diff --git a/fs/pnode.c b/fs/pnode.c
index 234a9ac49958..b14119b370d4 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -217,6 +217,9 @@ static int propagate_one(struct mount *m)
        /* skip if mountpoint isn't covered by it */
        if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
                return 0;
+       /* skip if mountpoint already has a mount on it */
+       if (__lookup_mnt(&m->mnt, mp->m_dentry))
+               return 0;
        if (peers(m, last_dest)) {
                type = CL_MAKE_SHARED;
        } else {

If you run with that patch you will see that there are go-faster stripes.

Eric

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH v2] mount: In propagate_umount handle overlapping mount propagation trees
  2016-10-25 21:45                                                 ` Eric W. Biederman
@ 2016-10-25 23:41                                                     ` Andrei Vagin
  -1 siblings, 0 replies; 36+ messages in thread
From: Andrei Vagin @ 2016-10-25 23:41 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-fsdevel, LKML, Linux Containers, Andrey Vagin, Alexander Viro

On Tue, Oct 25, 2016 at 04:45:44PM -0500, Eric W. Biederman wrote:
> Andrei Vagin <avagin-5HdwGun5lf+gSpxsJD1C4w@public.gmane.org> writes:
> 
> > On Sat, Oct 22, 2016 at 02:42:03PM -0500, Eric W. Biederman wrote:
> >> 
> >> Andrei,
> >> 
> >> This fixes the issue you have reported and through a refactoring
> >> makes the code simpler and easier to verify.  That said I find your
> >> last test case very interesting.   While looking at it in detail
> >> I have realized I don't fully understand why we have both lookup_mnt and
> >> lookup_mnt_last, so I can't say that this change is fully correct.
> >> 
> >> Outside of propagate_umount I don't have concerns, but I am not 100%
> >> convinced that my change to lookup_mnt_last does the right thing
> >> in the case of propagate_umount.
> >> 
> >> I do see why your last test case scales badly.  Long chains of shared
> >> mounts that we can't skip.  At the same time I don't really understand
> >> that case.  Part of it has to do with multiple child mounts of the same
> >> mount on the same mountpoint.
> >> 
> >> So I am working through my concerns.  In the meantime I figured it
> >> would be useful to post this version.  As this version is clearly better
> >> than the versions of this change that have come before it.
> >
> > Hi Eric,
> >
> > I have tested this version and it works fine.
> >
> > As for the last test case, could you look at the attached patch?
> > The idea is that we can skip all mounts from a shared group if one
> > of them is already marked.
> >
> >> 
> >> Eric
> >> 
> >> From: "Eric W. Biederman" <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
> >> Date: Thu, 13 Oct 2016 13:27:19 -0500
> >> 
> >> Andrei Vagin pointed out that the time to execute propagate_umount can go
> >> non-linear (and take a ludicrous amount of time) when the mount
> >> propagation trees of the mounts to be unmounted by a lazy unmount
> >> overlap.
> >> 
> >> While investigating the horrible performance I realized that in
> >> the case of overlapping mount trees, since the addition of locked
> >> mount support, the code has been failing to unmount all of the
> >> mounts it should have been unmounting.
> >> 
> >> Make the walk of the mount propagation trees nearly linear by using
> >> MNT_MARK to mark pieces of the mount propagation trees that have
> >> already been visited, allowing subsequent walks to skip over
> >> subtrees.
> >> 
> >> Make the processing of mounts order independent by adding a list of
> >> mount entries that need to be unmounted, and simply adding a mount to
> >> that list when it becomes apparent the mount can safely be unmounted.
> >> For mounts that are locked on other mounts but otherwise could be
> >> unmounted, move them from their parent's mnt_mounts to mnt_umounts so
> >> that if and when their parent becomes unmounted these mounts can be
> >> added to the list of mounts to unmount.
> >> 
> >> Add a final pass to clear MNT_MARK and to restore mnt_mounts
> >> from mnt_umounts for anything that did not get unmounted.
> >> 
> >> Add the functions propagation_visit_child and propagation_revisit_child
> >> to coordinate walking of the mount tree and setting and clearing the
> >> mount mark.
> >> 
> >> The skipping of already unmounted mounts has been moved from
> >> __lookup_mnt_last to mark_umount_candidates, so that the new
> >> propagation functions can notice when the propagation tree passes
> >> through the initial set of unmounted mounts.  Except in umount_tree, as
> >> part of the unmounting process, the only place where unmounted mounts
> >> should be found is in unmounted subtrees.  All of the other callers
> >> of __lookup_mnt_last are from mounted subtrees, so not checking for
> >> unmounted mounts should not affect them.
> >> 
> >> A script to generate overlapping mount propagation trees:
> >> $ cat run.sh
> >> mount -t tmpfs test-mount /mnt
> >> mount --make-shared /mnt
> >> for i in `seq $1`; do
> >>         mkdir /mnt/test.$i
> >>         mount --bind /mnt /mnt/test.$i
> >> done
> >> cat /proc/mounts | grep test-mount | wc -l
> >> time umount -l /mnt
> >> $ for i in `seq 10 16`; do echo $i; unshare -Urm bash ./run.sh $i; done
> >> 
> >> Here are the performance numbers with and without the patch:
> >> 
> >> mhash  |  8192   |  8192  |  8192       | 131072 | 131072      | 104857 | 104857
> >> mounts | before  | after  | after (sys) | after  | after (sys) |  after | after (sys)
> >> -------------------------------------------------------------------------------------
> >>   1024 |  0.071s | 0.020s | 0.000s      | 0.022s | 0.004s      | 0.020s | 0.004s
> >>   2048 |  0.184s | 0.022s | 0.004s      | 0.023s | 0.004s      | 0.022s | 0.008s
> >>   4096 |  0.604s | 0.025s | 0.020s      | 0.029s | 0.008s      | 0.026s | 0.004s
> >>   8912 |  4.471s | 0.053s | 0.020s      | 0.051s | 0.024s      | 0.047s | 0.016s
> >>  16384 | 34.826s | 0.088s | 0.060s      | 0.081s | 0.048s      | 0.082s | 0.052s
> >>  32768 |         | 0.216s | 0.172s      | 0.160s | 0.124s      | 0.160s | 0.096s
> >>  65536 |         | 0.819s | 0.726s      | 0.330s | 0.260s      | 0.338s | 0.256s
> >> 131072 |         | 4.502s | 4.168s      | 0.707s | 0.580s      | 0.709s | 0.592s
> >> 
> >> Andrei Vagin reports that fixing the performance problem is part of the
> >> work to fix CVE-2016-6213.
> >> 
> >> A script for a pathological set of mounts:
> >> 
> >> $ cat pathological.sh
> >> 
> >> mount -t tmpfs base /mnt
> >> mount --make-shared /mnt
> >> mkdir -p /mnt/b
> >> 
> >> mount -t tmpfs test1 /mnt/b
> >> mount --make-shared /mnt/b
> >> mkdir -p /mnt/b/10
> >> 
> >> mount -t tmpfs test2 /mnt/b/10
> >> mount --make-shared /mnt/b/10
> >> mkdir -p /mnt/b/10/20
> >> 
> >> mount --rbind /mnt/b /mnt/b/10/20
> >> 
> >> unshare -Urm sleep 2 &
> >> umount -l /mnt/b
> >> wait %%
> >> 
> >> $ unshare -Urm pathological.sh
> >> 
> >> Cc: stable-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> >> Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
> >> Fixes: 0c56fe31420c ("mnt: Don't propagate unmounts to locked mounts")
> >> Reported-by: Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
> >> Signed-off-by: "Eric W. Biederman" <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
> >> ---
> >>  fs/mount.h     |   1 +
> >>  fs/namespace.c |   7 +--
> >>  fs/pnode.c     | 179 +++++++++++++++++++++++++++++++++++++++++----------------
> >>  fs/pnode.h     |   2 +-
> >>  4 files changed, 133 insertions(+), 56 deletions(-)
> >> 
> >> diff --git a/fs/mount.h b/fs/mount.h
> >> index d2e25d7b64b3..00fe0d1d6ba7 100644
> >> --- a/fs/mount.h
> >> +++ b/fs/mount.h
> >> @@ -58,6 +58,7 @@ struct mount {
> >>  	struct mnt_namespace *mnt_ns;	/* containing namespace */
> >>  	struct mountpoint *mnt_mp;	/* where is it mounted */
> >>  	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
> >> +	struct list_head mnt_umounts;	/* list of children that are being unmounted */
> >>  #ifdef CONFIG_FSNOTIFY
> >>  	struct hlist_head mnt_fsnotify_marks;
> >>  	__u32 mnt_fsnotify_mask;
> >> diff --git a/fs/namespace.c b/fs/namespace.c
> >> index e6c234b1a645..73801391bb00 100644
> >> --- a/fs/namespace.c
> >> +++ b/fs/namespace.c
> >> @@ -237,6 +237,7 @@ static struct mount *alloc_vfsmnt(const char *name)
> >>  		INIT_LIST_HEAD(&mnt->mnt_slave_list);
> >>  		INIT_LIST_HEAD(&mnt->mnt_slave);
> >>  		INIT_HLIST_NODE(&mnt->mnt_mp_list);
> >> +		INIT_LIST_HEAD(&mnt->mnt_umounts);
> >>  #ifdef CONFIG_FSNOTIFY
> >>  		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
> >>  #endif
> >> @@ -650,13 +651,11 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
> >>  	p = __lookup_mnt(mnt, dentry);
> >>  	if (!p)
> >>  		goto out;
> >> -	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
> >> -		res = p;
> >> +	res = p;
> >>  	hlist_for_each_entry_continue(p, mnt_hash) {
> >>  		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
> >>  			break;
> >> -		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
> >> -			res = p;
> >> +		res = p;
> >>  	}
> >>  out:
> >>  	return res;
> >> diff --git a/fs/pnode.c b/fs/pnode.c
> >> index 234a9ac49958..8fd1a3fb420c 100644
> >> --- a/fs/pnode.c
> >> +++ b/fs/pnode.c
> >> @@ -134,7 +134,8 @@ void change_mnt_propagation(struct mount *mnt, int type)
> >>  }
> >>  
> >>  /*
> >> - * get the next mount in the propagation tree.
> >> + * get the next mount that is not a slave of the current mount in the
> >> + * propagation tree.
> >>   * @m: the mount seen last
> >>   * @origin: the original mount from where the tree walk initiated
> >>   *
> >> @@ -143,13 +144,9 @@ void change_mnt_propagation(struct mount *mnt, int type)
> >>   * vfsmount found while iterating with propagation_next() is
> >>   * a peer of one we'd found earlier.
> >>   */
> >> -static struct mount *propagation_next(struct mount *m,
> >> -					 struct mount *origin)
> >> +static struct mount *propagation_next_sib(struct mount *m,
> >> +						struct mount *origin)
> >>  {
> >> -	/* are there any slaves of this mount? */
> >> -	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
> >> -		return first_slave(m);
> >> -
> >>  	while (1) {
> >>  		struct mount *master = m->mnt_master;
> >>  
> >> @@ -164,6 +161,26 @@ static struct mount *propagation_next(struct mount *m,
> >>  	}
> >>  }
> >>  
> >> +/*
> >> + * get the next mount in the propagation tree.
> >> + * @m: the mount seen last
> >> + * @origin: the original mount from where the tree walk initiated
> >> + *
> >> + * Note that peer groups form contiguous segments of slave lists.
> >> + * We rely on that in get_source() to be able to find out if
> >> + * vfsmount found while iterating with propagation_next() is
> >> + * a peer of one we'd found earlier.
> >> + */
> >> +static struct mount *propagation_next(struct mount *m,
> >> +					 struct mount *origin)
> >> +{
> >> +	/* are there any slaves of this mount? */
> >> +	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
> >> +		return first_slave(m);
> >> +
> >> +	return propagation_next_sib(m, origin);
> >> +}
> >> +
> >>  static struct mount *next_group(struct mount *m, struct mount *origin)
> >>  {
> >>  	while (1) {
> >> @@ -389,57 +406,92 @@ void propagate_mount_unlock(struct mount *mnt)
> >>  	}
> >>  }
> >>  
> >> -/*
> >> - * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
> >> - */
> >> -static void mark_umount_candidates(struct mount *mnt)
> >> +static struct mount *propagation_visit_child(struct mount *last_child,
> >> +					    struct mount *origin_child)
> >>  {
> >> -	struct mount *parent = mnt->mnt_parent;
> >> -	struct mount *m;
> >> +	struct mount *m = last_child->mnt_parent;
> >> +	struct mount *origin = origin_child->mnt_parent;
> >> +	struct dentry *mountpoint = origin_child->mnt_mountpoint;
> >> +	struct mount *child;
> >>  
> >> -	BUG_ON(parent == mnt);
> >> +	/* Has this part of the propagation tree already been visited? */
> >> +	if (IS_MNT_MARKED(last_child))
> >> +		return NULL;
> >>  
> >> -	for (m = propagation_next(parent, parent); m;
> >> -			m = propagation_next(m, parent)) {
> >> -		struct mount *child = __lookup_mnt_last(&m->mnt,
> >> -						mnt->mnt_mountpoint);
> >> -		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
> >> -			SET_MNT_MARK(child);
> >> -		}
> >> +	SET_MNT_MARK(last_child);
> >> +
> >> +	m = propagation_next(m, origin);
> >> +	while (m) {
> >> +		child = __lookup_mnt_last(&m->mnt, mountpoint);
> >> +		if (child && !IS_MNT_MARKED(child))
> >> +			return child;
> >> +
> >> +		if (!child)
> >> +			m = propagation_next(m, origin);
> >> +		else
> >> +			m = propagation_next_sib(m, origin);
> >>  	}
> >> +	return NULL;
> >>  }
> >>  
> >> -/*
> >> - * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
> >> - * parent propagates to.
> >> - */
> >> -static void __propagate_umount(struct mount *mnt)
> >> +static struct mount *propagation_revisit_child(struct mount *last_child,
> >> +					       struct mount *origin_child)
> >>  {
> >> -	struct mount *parent = mnt->mnt_parent;
> >> -	struct mount *m;
> >> +	struct mount *m = last_child->mnt_parent;
> >> +	struct mount *origin = origin_child->mnt_parent;
> >> +	struct dentry *mountpoint = origin_child->mnt_mountpoint;
> >> +	struct mount *child;
> >>  
> >> -	BUG_ON(parent == mnt);
> >> +	/* Has this part of the propagation tree already been revisited? */
> >> +	if (!IS_MNT_MARKED(last_child))
> >> +		return NULL;
> >>  
> >> -	for (m = propagation_next(parent, parent); m;
> >> -			m = propagation_next(m, parent)) {
> >> +	CLEAR_MNT_MARK(last_child);
> >>  
> >> -		struct mount *child = __lookup_mnt_last(&m->mnt,
> >> -						mnt->mnt_mountpoint);
> >> -		/*
> >> -		 * umount the child only if the child has no children
> >> -		 * and the child is marked safe to unmount.
> >> -		 */
> >> -		if (!child || !IS_MNT_MARKED(child))
> >> -			continue;
> >> -		CLEAR_MNT_MARK(child);
> >> -		if (list_empty(&child->mnt_mounts)) {
> >> -			list_del_init(&child->mnt_child);
> >> -			child->mnt.mnt_flags |= MNT_UMOUNT;
> >> -			list_move_tail(&child->mnt_list, &mnt->mnt_list);
> >> -		}
> >> +	m = propagation_next(m, origin);
> >> +	while (m) {
> >> +		child = __lookup_mnt_last(&m->mnt, mountpoint);
> >> +		if (child && IS_MNT_MARKED(child))
> >> +			return child;
> >> +
> >> +		if (!child)
> >> +			m = propagation_next(m, origin);
> >> +		else
> >> +			m = propagation_next_sib(m, origin);
> >>  	}
> >> +	return NULL;
> >>  }
> >>  
> >> +static void start_umount_propagation(struct mount *child,
> >> +				     struct list_head *to_umount)
> >> +{
> >> +	do {
> >> +		struct mount *parent = child->mnt_parent;
> >> +
> >> +		if ((child->mnt.mnt_flags & MNT_UMOUNT) ||
> >> +		    !list_empty(&child->mnt_mounts))
> >> +			return;
> >> +
> >> +		if (!IS_MNT_LOCKED(child))
> >> +			list_move_tail(&child->mnt_child, to_umount);
> >> +		else
> >> +			list_move_tail(&child->mnt_child, &parent->mnt_umounts);
> >> +
> >> +		child = NULL;
> >> +		if (IS_MNT_MARKED(parent))
> >> +			child = parent;
> >> +	} while (child);
> >> +}
> >> +
> >> +static void end_umount_propagation(struct mount *child)
> >> +{
> >> +	struct mount *parent = child->mnt_parent;
> >> +
> >> +	if (!list_empty(&parent->mnt_umounts))
> >> +		list_splice_tail_init(&parent->mnt_umounts, &parent->mnt_mounts);
> >> +}
> >> +
> >> +
> >>  /*
> >>   * collect all mounts that receive propagation from the mount in @list,
> >>   * and return these additional mounts in the same list.
> >> @@ -447,14 +499,39 @@ static void __propagate_umount(struct mount *mnt)
> >>   *
> >>   * vfsmount lock must be held for write
> >>   */
> >> -int propagate_umount(struct list_head *list)
> >> +void propagate_umount(struct list_head *list)
> >>  {
> >>  	struct mount *mnt;
> >> +	LIST_HEAD(to_umount);
> >> +	LIST_HEAD(tmp_list);
> >> +
> >> +	/* Find candidates for unmounting */
> >> +	list_for_each_entry(mnt, list, mnt_list) {
> >> +		struct mount *child;
> >> +		for (child = propagation_visit_child(mnt, mnt); child;
> >> +		     child = propagation_visit_child(child, mnt))
> >> +			start_umount_propagation(child, &to_umount);
> >> +	}
> >>  
> >> -	list_for_each_entry_reverse(mnt, list, mnt_list)
> >> -		mark_umount_candidates(mnt);
> >> +	/* Begin unmounting */
> >> +	while (!list_empty(&to_umount)) {
> >> +		mnt = list_first_entry(&to_umount, struct mount, mnt_child);
> >>  
> >> -	list_for_each_entry(mnt, list, mnt_list)
> >> -		__propagate_umount(mnt);
> >> -	return 0;
> >> +		list_del_init(&mnt->mnt_child);
> >> +		mnt->mnt.mnt_flags |= MNT_UMOUNT;
> >> +		list_move_tail(&mnt->mnt_list, &tmp_list);
> >> +
> >> +		if (!list_empty(&mnt->mnt_umounts))
> >> +			list_splice_tail_init(&mnt->mnt_umounts, &to_umount);
> >> +	}
> >> +
> >> +	/* Cleanup the mount propagation tree */
> >> +	list_for_each_entry(mnt, list, mnt_list) {
> >> +		struct mount *child;
> >> +		for (child = propagation_revisit_child(mnt, mnt); child;
> >> +		     child = propagation_revisit_child(child, mnt))
> >> +			end_umount_propagation(child);
> >> +	}
> >> +
> >> +	list_splice_tail(&tmp_list, list);
> >>  }
> >> diff --git a/fs/pnode.h b/fs/pnode.h
> >> index 550f5a8b4fcf..38c6cdb96b34 100644
> >> --- a/fs/pnode.h
> >> +++ b/fs/pnode.h
> >> @@ -41,7 +41,7 @@ static inline void set_mnt_shared(struct mount *mnt)
> >>  void change_mnt_propagation(struct mount *, int);
> >>  int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
> >>  		struct hlist_head *);
> >> -int propagate_umount(struct list_head *);
> >> +void propagate_umount(struct list_head *);
> >>  int propagate_mount_busy(struct mount *, int);
> >>  void propagate_mount_unlock(struct mount *);
> >>  void mnt_release_group_id(struct mount *);
> >> -- 
> >> 2.10.1
> >> 
> >
> > From 8e0f45c0272aa1f789d1657a0acc98c58919dcc3 Mon Sep 17 00:00:00 2001
> > From: Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
> > Date: Tue, 25 Oct 2016 13:57:31 -0700
> > Subject: [PATCH] mount: skip all mounts from a shared group if one is marked
> >
> > If we meet a marked mount, it means that all mounts from
> > its group have already been revised.
> >
> > Signed-off-by: Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
> > ---
> >  fs/pnode.c | 18 +++++++++++++++---
> >  1 file changed, 15 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/pnode.c b/fs/pnode.c
> > index 8fd1a3f..ebb7134 100644
> > --- a/fs/pnode.c
> > +++ b/fs/pnode.c
> > @@ -426,10 +426,16 @@ static struct mount *propagation_visit_child(struct mount *last_child,
> >  		if (child && !IS_MNT_MARKED(child))
> >  			return child;
> >  
> > -		if (!child)
> > +		if (!child) {
> >  			m = propagation_next(m, origin);
> > -		else
> > +		} else {
> > +			if (IS_MNT_MARKED(child)) {
> > +				if (m->mnt_group_id == origin->mnt_group_id)
> > +					return NULL;
> > +				m = m->mnt_master;
> > +			}
> >  			m = propagation_next_sib(m, origin);
> > +		}
> >  	}
> >  	return NULL;
> >  }
> > @@ -456,8 +462,14 @@ static struct mount *propagation_revisit_child(struct mount *last_child,
> >  
> >  		if (!child)
> >  			m = propagation_next(m, origin);
> > -		else
> > +		else {
> > +			if (!IS_MNT_MARKED(child)) {
> > +				if (m->mnt_group_id == origin->mnt_group_id)
> > +					return NULL;
> > +				m = m->mnt_master;
> > +			}
> >  			m = propagation_next_sib(m, origin);
> > +		}
> >  	}
> >  	return NULL;
> >  }
> 
> That is certainly interesting.  The problem is that the reason we were
> going slow is that there were in fact mounts in the shared group that
> had not been traversed.

You are right.

> 
> And in fact the entire idea of visiting a vfsmount/mountpoint pair
> exactly once is wrong in the face of shadow mounts.  For a
> vfsmount/mountpoint pair that has shadow mounts, the number of shadow
> mounts needs to be decreased by one each time the propagation tree is
> traversed during unmount.  Which means that, as far as I can see, we
> have to kill shadow mounts to correctly optimize this code.  Once
> shadow mounts are gone I don't know of a case where we need your
> optimization.

Without shadow mounts, it will be hard to preserve predictable behaviour
for cases like this:

$ unshare --propagation private -m sh test.sh
+ mount -t tmpfs --make-shared zzzz A
+ mkdir A/a
+ mount -t tmpfs zzzz A/a
+ mount --bind A B
+ mount -t tmpfs zzzz B/a
+ grep zzzz
+ cat /proc/self/mountinfo
155 123 0:44 / /root/tmp/A rw,relatime shared:70 - tmpfs zzzz rw
156 155 0:45 / /root/tmp/A/a rw,relatime shared:71 - tmpfs zzzz rw
157 123 0:44 / /root/tmp/B rw,relatime shared:70 - tmpfs zzzz rw
158 157 0:46 / /root/tmp/B/a rw,relatime shared:72 - tmpfs zzzz rw
159 155 0:46 / /root/tmp/A/a rw,relatime shared:72 - tmpfs zzzz rw
+ umount B/a
+ grep zzzz
+ cat /proc/self/mountinfo
155 123 0:44 / /root/tmp/A rw,relatime shared:70 - tmpfs zzzz rw
156 155 0:45 / /root/tmp/A/a rw,relatime shared:71 - tmpfs zzzz rw
157 123 0:44 / /root/tmp/B rw,relatime shared:70 - tmpfs zzzz rw

X + a - a = X, i.e. mounting a and then unmounting a should leave the
tree exactly as it was.

Maybe we need to add another ID for propagated mounts, and when we
do umount, detach only the mounts with the same propagation id.
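
A toy model of that bookkeeping, as plain userspace C with invented
names (not a proposed kernel API), using the mount ids from the
mountinfo output above:

#include <stdio.h>

/* Hypothetical: every mount created by one mount event (the mount
 * itself plus all of its propagated copies) is stamped with the id
 * of that event; pre-existing user mounts keep id 0. */
struct mnt {
	int prop_id;
	const char *desc;
};

int main(void)
{
	int event = 1;	/* id of the "mount zzzz B/a" event */

	struct mnt m156 = { 0,     "156 A/a (user mount)" };
	struct mnt m158 = { event, "158 B/a (the new mount)" };
	struct mnt m159 = { event, "159 A/a (propagated copy)" };
	struct mnt *tree[] = { &m156, &m158, &m159 };

	/* "umount B/a": detach only the mounts carrying the same
	 * propagation id, so the pre-existing 156 survives */
	for (int i = 0; i < 3; i++)
		printf("%-26s -> %s\n", tree[i]->desc,
		       tree[i]->prop_id == event ? "detach" : "keep");
	return 0;
}

With that rule, umount B/a removes 158 and 159 but leaves 156 in
place, which is the X + a - a = X behaviour from the transcript above.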

I support the idea to kill shadow mounts.  I guess it will help us to
simplify the algorithm for dumping and restoring a mount tree in CRIU.

Currently it is a big pain for us.

> 
> I am busily verifying my patch to kill shadow mounts, but the following
> patch is the minimal version.  As far as I can see propagate_one
> is the only place we create shadow mounts, and holding the
> namespace_lock over attach_recursive_mnt, propagate_mnt, and
> propagate_one is sufficient for that __lookup_mnt call to be completely safe.
> 
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 234a9ac49958..b14119b370d4 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -217,6 +217,9 @@ static int propagate_one(struct mount *m)
>         /* skip if mountpoint isn't covered by it */
>         if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
>                 return 0;
> +       /* skip if mountpoint already has a mount on it */
> +       if (__lookup_mnt(&m->mnt, mp->m_dentry))
> +               return 0;
>         if (peers(m, last_dest)) {
>                 type = CL_MAKE_SHARED;
>         } else {
> 
> If you run with that patch you will see that there are go-faster stripes.
> 
> Eric
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH v2] mount: In propagate_umount handle overlapping mount propagation trees
@ 2016-10-25 23:41                                                     ` Andrei Vagin
  0 siblings, 0 replies; 36+ messages in thread
From: Andrei Vagin @ 2016-10-25 23:41 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andrey Vagin, Alexander Viro, Linux Containers, linux-fsdevel, LKML

On Tue, Oct 25, 2016 at 04:45:44PM -0500, Eric W. Biederman wrote:
> Andrei Vagin <avagin@virtuozzo.com> writes:
> 
> > On Sat, Oct 22, 2016 at 02:42:03PM -0500, Eric W. Biederman wrote:
> >> 
> >> Andrei,
> >> 
> >> This fixes the issue you have reported and through a refactoring
> >> makes the code simpler and easier to verify.  That said I find your
> >> last test case very interesting.   While looking at it in detail
> >> I have realized I don't fully understand why we have both lookup_mnt and
> >> lookup_mnt_last, so I can't say that this change is fully correct.
> >> 
> >> Outside of propogate_umount I am don't have concerns but I am not 100%
> >> convinced that my change to lookup_mnt_last does the right thing
> >> in the case of propagate_umount.
> >> 
> >> I do see why your last test case scales badly.  Long chains of shared
> >> mounts that we can't skip.  At the same time I don't really understand
> >> that case.  Part of it has to do with multiple child mounts of the same
> >> mount on the same mountpoint.
> >> 
> >> So I am working through my concerns.  In the mean time I figured it
> >> would be useful to post this version.  As this version is clearly better
> >> than the version of this change that have come before it.
> >
> > Hi Eric,
> >
> > I have tested this version and it works fine.
> >
> > As for the the last test case, could you look at the attached patch?
> > The idea is that we can skip all mounts from a shared group, if one
> > of them already marked.
> >
> >> 
> >> Eric
> >> 
> >> From: "Eric W. Biederman" <ebiederm@xmission.com>
> >> Date: Thu, 13 Oct 2016 13:27:19 -0500
> >> 
> >> Adrei Vagin pointed out that time to executue propagate_umount can go
> >> non-linear (and take a ludicrious amount of time) when the mount
> >> propogation trees of the mounts to be unmunted by a lazy unmount
> >> overlap.
> >> 
> >> While investigating the horrible performance I realized that in
> >> the case overlapping mount trees since the addition of locked
> >> mount support the code has been failing to unmount all of the
> >> mounts it should have been unmounting.
> >> 
> >> Make the walk of the mount propagation trees nearly linear by using
> >> MNT_MARK to mark pieces of the mount propagation trees that have
> >> already been visited, allowing subsequent walks to skip over
> >> subtrees.
> >> 
> >> Make the processing of mounts order independent by adding a list of
> >> mount entries that need to be unmounted, and simply adding a mount to
> >> that list when it becomes apparent the mount can safely be unmounted.
> >> For mounts that are locked on other mounts but otherwise could be
> >> unmounted move them from their parnets mnt_mounts to mnt_umounts so
> >> that if and when their parent becomes unmounted these mounts can be
> >> added to the list of mounts to unmount.
> >> 
> >> Add a final pass to clear MNT_MARK and to restore mnt_mounts
> >> from mnt_umounts for anything that did not get unmounted.
> >> 
> >> Add the functions propagation_visit_next and propagation_revisit_next
> >> to coordinate walking of the mount tree and setting and clearing the
> >> mount mark.
> >> 
> >> The skipping of already unmounted mounts has been moved from
> >> __lookup_mnt_last to mark_umount_candidates, so that the new
> >> propagation functions can notice when the propagation tree passes
> >> through the initial set of unmounted mounts.  Except in umount_tree as
> >> part of the unmounting process the only place where unmounted mounts
> >> should be found are in unmounted subtrees.  All of the other callers
> >> of __lookup_mnt_last are from mounted subtrees so the not checking for
> >> unmounted mounts should not affect them.
> >> 
> >> A script to generate overlapping mount propagation trees:
> >> $ cat run.sh
> >> mount -t tmpfs test-mount /mnt
> >> mount --make-shared /mnt
> >> for i in `seq $1`; do
> >>         mkdir /mnt/test.$i
> >>         mount --bind /mnt /mnt/test.$i
> >> done
> >> cat /proc/mounts | grep test-mount | wc -l
> >> time umount -l /mnt
> >> $ for i in `seq 10 16`; do echo $i; unshare -Urm bash ./run.sh $i; done
> >> 
> >> Here are the performance numbers with and without the patch:
> >> 
> >> mhash  |  8192   |  8192  |  8192       | 131072 | 131072      | 104857 | 104857
> >> mounts | before  | after  | after (sys) | after  | after (sys) |  after | after (sys)
> >> -------------------------------------------------------------------------------------
> >>   1024 |  0.071s | 0.020s | 0.000s      | 0.022s | 0.004s      | 0.020s | 0.004s
> >>   2048 |  0.184s | 0.022s | 0.004s      | 0.023s | 0.004s      | 0.022s | 0.008s
> >>   4096 |  0.604s | 0.025s | 0.020s      | 0.029s | 0.008s      | 0.026s | 0.004s
> >>   8912 |  4.471s | 0.053s | 0.020s      | 0.051s | 0.024s      | 0.047s | 0.016s
> >>  16384 | 34.826s | 0.088s | 0.060s      | 0.081s | 0.048s      | 0.082s | 0.052s
> >>  32768 |         | 0.216s | 0.172s      | 0.160s | 0.124s      | 0.160s | 0.096s
> >>  65536 |         | 0.819s | 0.726s      | 0.330s | 0.260s      | 0.338s | 0.256s
> >> 131072 |         | 4.502s | 4.168s      | 0.707s | 0.580s      | 0.709s | 0.592s
> >> 
> >> Andrei Vagin reports fixing the performance problem is part of the
> >> work to fix CVE-2016-6213.
> >> 
> >> A script for a pathlogical set of mounts:
> >> 
> >> $ cat pathological.sh
> >> 
> >> mount -t tmpfs base /mnt
> >> mount --make-shared /mnt
> >> mkdir -p /mnt/b
> >> 
> >> mount -t tmpfs test1 /mnt/b
> >> mount --make-shared /mnt/b
> >> mkdir -p /mnt/b/10
> >> 
> >> mount -t tmpfs test2 /mnt/b/10
> >> mount --make-shared /mnt/b/10
> >> mkdir -p /mnt/b/10/20
> >> 
> >> mount --rbind /mnt/b /mnt/b/10/20
> >> 
> >> unshare -Urm sleep 2
> >> umount -l /mnt/b
> >> wait %%
> >> 
> >> $ unshare -Urm pathlogical.sh
> >> 
> >> Cc: stable@vger.kernel.org
> >> Fixes: a05964f3917c ("[PATCH] shared mounts handling: umount")
> >> Fixes: 0c56fe31420c ("mnt: Don't propagate unmounts to locked mounts")
> >> Reported-by: Andrei Vagin <avagin@openvz.org>
> >> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> >> ---
> >>  fs/mount.h     |   1 +
> >>  fs/namespace.c |   7 +--
> >>  fs/pnode.c     | 179 +++++++++++++++++++++++++++++++++++++++++----------------
> >>  fs/pnode.h     |   2 +-
> >>  4 files changed, 133 insertions(+), 56 deletions(-)
> >> 
> >> diff --git a/fs/mount.h b/fs/mount.h
> >> index d2e25d7b64b3..00fe0d1d6ba7 100644
> >> --- a/fs/mount.h
> >> +++ b/fs/mount.h
> >> @@ -58,6 +58,7 @@ struct mount {
> >>  	struct mnt_namespace *mnt_ns;	/* containing namespace */
> >>  	struct mountpoint *mnt_mp;	/* where is it mounted */
> >>  	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
> >> +	struct list_head mnt_umounts;	/* list of children that are being unmounted */
> >>  #ifdef CONFIG_FSNOTIFY
> >>  	struct hlist_head mnt_fsnotify_marks;
> >>  	__u32 mnt_fsnotify_mask;
> >> diff --git a/fs/namespace.c b/fs/namespace.c
> >> index e6c234b1a645..73801391bb00 100644
> >> --- a/fs/namespace.c
> >> +++ b/fs/namespace.c
> >> @@ -237,6 +237,7 @@ static struct mount *alloc_vfsmnt(const char *name)
> >>  		INIT_LIST_HEAD(&mnt->mnt_slave_list);
> >>  		INIT_LIST_HEAD(&mnt->mnt_slave);
> >>  		INIT_HLIST_NODE(&mnt->mnt_mp_list);
> >> +		INIT_LIST_HEAD(&mnt->mnt_umounts);
> >>  #ifdef CONFIG_FSNOTIFY
> >>  		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
> >>  #endif
> >> @@ -650,13 +651,11 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
> >>  	p = __lookup_mnt(mnt, dentry);
> >>  	if (!p)
> >>  		goto out;
> >> -	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
> >> -		res = p;
> >> +	res = p;
> >>  	hlist_for_each_entry_continue(p, mnt_hash) {
> >>  		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
> >>  			break;
> >> -		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
> >> -			res = p;
> >> +		res = p;
> >>  	}
> >>  out:
> >>  	return res;
> >> diff --git a/fs/pnode.c b/fs/pnode.c
> >> index 234a9ac49958..8fd1a3fb420c 100644
> >> --- a/fs/pnode.c
> >> +++ b/fs/pnode.c
> >> @@ -134,7 +134,8 @@ void change_mnt_propagation(struct mount *mnt, int type)
> >>  }
> >>  
> >>  /*
> >> - * get the next mount in the propagation tree.
> >> + * get the next mount that is not a slave of the current mount in the
> >> + * propagation tree.
> >>   * @m: the mount seen last
> >>   * @origin: the original mount from where the tree walk initiated
> >>   *
> >> @@ -143,13 +144,9 @@ void change_mnt_propagation(struct mount *mnt, int type)
> >>   * vfsmount found while iterating with propagation_next() is
> >>   * a peer of one we'd found earlier.
> >>   */
> >> -static struct mount *propagation_next(struct mount *m,
> >> -					 struct mount *origin)
> >> +static struct mount *propagation_next_sib(struct mount *m,
> >> +						struct mount *origin)
> >>  {
> >> -	/* are there any slaves of this mount? */
> >> -	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
> >> -		return first_slave(m);
> >> -
> >>  	while (1) {
> >>  		struct mount *master = m->mnt_master;
> >>  
> >> @@ -164,6 +161,26 @@ static struct mount *propagation_next(struct mount *m,
> >>  	}
> >>  }
> >>  
> >> +/*
> >> + * get the next mount in the propagation tree.
> >> + * @m: the mount seen last
> >> + * @origin: the original mount from where the tree walk initiated
> >> + *
> >> + * Note that peer groups form contiguous segments of slave lists.
> >> + * We rely on that in get_source() to be able to find out if
> >> + * vfsmount found while iterating with propagation_next() is
> >> + * a peer of one we'd found earlier.
> >> + */
> >> +static struct mount *propagation_next(struct mount *m,
> >> +					 struct mount *origin)
> >> +{
> >> +	/* are there any slaves of this mount? */
> >> +	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
> >> +		return first_slave(m);
> >> +
> >> +	return propagation_next_sib(m, origin);
> >> +}
> >> +
> >>  static struct mount *next_group(struct mount *m, struct mount *origin)
> >>  {
> >>  	while (1) {
> >> @@ -389,57 +406,92 @@ void propagate_mount_unlock(struct mount *mnt)
> >>  	}
> >>  }
> >>  
> >> -/*
> >> - * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
> >> - */
> >> -static void mark_umount_candidates(struct mount *mnt)
> >> +static struct mount *propagation_visit_child(struct mount *last_child,
> >> +					    struct mount *origin_child)
> >>  {
> >> -	struct mount *parent = mnt->mnt_parent;
> >> -	struct mount *m;
> >> +	struct mount *m = last_child->mnt_parent;
> >> +	struct mount *origin = origin_child->mnt_parent;
> >> +	struct dentry *mountpoint = origin_child->mnt_mountpoint;
> >> +	struct mount *child;
> >>  
> >> -	BUG_ON(parent == mnt);
> >> +	/* Has this part of the propgation tree already been visited? */
> >> +	if (IS_MNT_MARKED(last_child))
> >> +		return NULL;
> >>  
> >> -	for (m = propagation_next(parent, parent); m;
> >> -			m = propagation_next(m, parent)) {
> >> -		struct mount *child = __lookup_mnt_last(&m->mnt,
> >> -						mnt->mnt_mountpoint);
> >> -		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
> >> -			SET_MNT_MARK(child);
> >> -		}
> >> +	SET_MNT_MARK(last_child);
> >> +
> >> +	m = propagation_next(m, origin);
> >> +	while (m) {
> >> +		child = __lookup_mnt_last(&m->mnt, mountpoint);
> >> +		if (child && !IS_MNT_MARKED(child))
> >> +			return child;
> >> +
> >> +		if (!child)
> >> +			m = propagation_next(m, origin);
> >> +		else
> >> +			m = propagation_next_sib(m, origin);
> >>  	}
> >> +	return NULL;
> >>  }
> >>  
> >> -/*
> >> - * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
> >> - * parent propagates to.
> >> - */
> >> -static void __propagate_umount(struct mount *mnt)
> >> +static struct mount *propagation_revisit_child(struct mount *last_child,
> >> +					       struct mount *origin_child)
> >>  {
> >> -	struct mount *parent = mnt->mnt_parent;
> >> -	struct mount *m;
> >> +	struct mount *m = last_child->mnt_parent;
> >> +	struct mount *origin = origin_child->mnt_parent;
> >> +	struct dentry *mountpoint = origin_child->mnt_mountpoint;
> >> +	struct mount *child;
> >>  
> >> -	BUG_ON(parent == mnt);
> >> +	/* Has this part of the propgation tree already been revisited? */
> >> +	if (!IS_MNT_MARKED(last_child))
> >> +		return NULL;
> >>  
> >> -	for (m = propagation_next(parent, parent); m;
> >> -			m = propagation_next(m, parent)) {
> >> +	CLEAR_MNT_MARK(last_child);
> >>  
> >> -		struct mount *child = __lookup_mnt_last(&m->mnt,
> >> -						mnt->mnt_mountpoint);
> >> -		/*
> >> -		 * umount the child only if the child has no children
> >> -		 * and the child is marked safe to unmount.
> >> -		 */
> >> -		if (!child || !IS_MNT_MARKED(child))
> >> -			continue;
> >> -		CLEAR_MNT_MARK(child);
> >> -		if (list_empty(&child->mnt_mounts)) {
> >> -			list_del_init(&child->mnt_child);
> >> -			child->mnt.mnt_flags |= MNT_UMOUNT;
> >> -			list_move_tail(&child->mnt_list, &mnt->mnt_list);
> >> -		}
> >> +	m = propagation_next(m, origin);
> >> +	while (m) {
> >> +		child = __lookup_mnt_last(&m->mnt, mountpoint);
> >> +		if (child && IS_MNT_MARKED(child))
> >> +			return child;
> >> +
> >> +		if (!child)
> >> +			m = propagation_next(m, origin);
> >> +		else
> >> +			m = propagation_next_sib(m, origin);
> >>  	}
> >> +	return NULL;
> >>  }
> >>  
> >> +static void start_umount_propagation(struct mount *child,
> >> +				     struct list_head *to_umount)
> >> +{
> >> +	do {
> >> +		struct mount *parent = child->mnt_parent;
> >> +
> >> +		if ((child->mnt.mnt_flags & MNT_UMOUNT) ||
> >> +		    !list_empty(&child->mnt_mounts))
> >> +			return;
> >> +
> >> +		if (!IS_MNT_LOCKED(child))
> >> +			list_move_tail(&child->mnt_child, to_umount);
> >> +		else
> >> +			list_move_tail(&child->mnt_child, &parent->mnt_umounts);
> >> +
> >> +		child = NULL;
> >> +		if (IS_MNT_MARKED(parent))
> >> +			child = parent;
> >> +	} while (child);
> >> +}
> >> +
> >> +static void end_umount_propagation(struct mount *child)
> >> +{
> >> +	struct mount *parent = child->mnt_parent;
> >> +
> >> +	if (!list_empty(&parent->mnt_umounts))
> >> +		list_splice_tail_init(&parent->mnt_umounts, &parent->mnt_mounts);
> >> +}
> >> +
> >> +
> >>  /*
> >>   * collect all mounts that receive propagation from the mount in @list,
> >>   * and return these additional mounts in the same list.
> >> @@ -447,14 +499,39 @@ static void __propagate_umount(struct mount *mnt)
> >>   *
> >>   * vfsmount lock must be held for write
> >>   */
> >> -int propagate_umount(struct list_head *list)
> >> +void propagate_umount(struct list_head *list)
> >>  {
> >>  	struct mount *mnt;
> >> +	LIST_HEAD(to_umount);
> >> +	LIST_HEAD(tmp_list);
> >> +
> >> +	/* Find candidates for unmounting */
> >> +	list_for_each_entry(mnt, list, mnt_list) {
> >> +		struct mount *child;
> >> +		for (child = propagation_visit_child(mnt, mnt); child;
> >> +		     child = propagation_visit_child(child, mnt))
> >> +			start_umount_propagation(child, &to_umount);
> >> +	}
> >>  
> >> -	list_for_each_entry_reverse(mnt, list, mnt_list)
> >> -		mark_umount_candidates(mnt);
> >> +	/* Begin unmounting */
> >> +	while (!list_empty(&to_umount)) {
> >> +		mnt = list_first_entry(&to_umount, struct mount, mnt_child);
> >>  
> >> -	list_for_each_entry(mnt, list, mnt_list)
> >> -		__propagate_umount(mnt);
> >> -	return 0;
> >> +		list_del_init(&mnt->mnt_child);
> >> +		mnt->mnt.mnt_flags |= MNT_UMOUNT;
> >> +		list_move_tail(&mnt->mnt_list, &tmp_list);
> >> +
> >> +		if (!list_empty(&mnt->mnt_umounts))
> >> +			list_splice_tail_init(&mnt->mnt_umounts, &to_umount);
> >> +	}
> >> +
> >> +	/* Cleanup the mount propagation tree */
> >> +	list_for_each_entry(mnt, list, mnt_list) {
> >> +		struct mount *child;
> >> +		for (child = propagation_revisit_child(mnt, mnt); child;
> >> +		     child = propagation_revisit_child(child, mnt))
> >> +			end_umount_propagation(child);
> >> +	}
> >> +
> >> +	list_splice_tail(&tmp_list, list);
> >>  }
> >> diff --git a/fs/pnode.h b/fs/pnode.h
> >> index 550f5a8b4fcf..38c6cdb96b34 100644
> >> --- a/fs/pnode.h
> >> +++ b/fs/pnode.h
> >> @@ -41,7 +41,7 @@ static inline void set_mnt_shared(struct mount *mnt)
> >>  void change_mnt_propagation(struct mount *, int);
> >>  int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
> >>  		struct hlist_head *);
> >> -int propagate_umount(struct list_head *);
> >> +void propagate_umount(struct list_head *);
> >>  int propagate_mount_busy(struct mount *, int);
> >>  void propagate_mount_unlock(struct mount *);
> >>  void mnt_release_group_id(struct mount *);
> >> -- 
> >> 2.10.1
> >> 
> >
> > From 8e0f45c0272aa1f789d1657a0acc98c58919dcc3 Mon Sep 17 00:00:00 2001
> > From: Andrei Vagin <avagin@openvz.org>
> > Date: Tue, 25 Oct 2016 13:57:31 -0700
> > Subject: [PATCH] mount: skip all mounts from a shared group if one is marked
> >
> > If we meet a marked mount, it means that all mounts from
> > its group have been already revised.
> >
> > Signed-off-by: Andrei Vagin <avagin@openvz.org>
> > ---
> >  fs/pnode.c | 18 +++++++++++++++---
> >  1 file changed, 15 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/pnode.c b/fs/pnode.c
> > index 8fd1a3f..ebb7134 100644
> > --- a/fs/pnode.c
> > +++ b/fs/pnode.c
> > @@ -426,10 +426,16 @@ static struct mount *propagation_visit_child(struct mount *last_child,
> >  		if (child && !IS_MNT_MARKED(child))
> >  			return child;
> >  
> > -		if (!child)
> > +		if (!child) {
> >  			m = propagation_next(m, origin);
> > -		else
> > +		} else {
> > +			if (IS_MNT_MARKED(child)) {
> > +				if (m->mnt_group_id == origin->mnt_group_id)
> > +					return NULL;
> > +				m = m->mnt_master;
> > +			}
> >  			m = propagation_next_sib(m, origin);
> > +		}
> >  	}
> >  	return NULL;
> >  }
> > @@ -456,8 +462,14 @@ static struct mount *propagation_revisit_child(struct mount *last_child,
> >  
> >  		if (!child)
> >  			m = propagation_next(m, origin);
> > -		else
> > +		else {
> > +			if (!IS_MNT_MARKED(child)) {
> > +				if (m->mnt_group_id == origin->mnt_group_id)
> > +					return NULL;
> > +				m = m->mnt_master;
> > +			}
> >  			m = propagation_next_sib(m, origin);
> > +		}
> >  	}
> >  	return NULL;
> >  }
> 
> That is certainly interesting.  The problem is that the reason we were
> going slow is that there were in fact mounts that had not been traversed
> in the share group.

You are right.

> 
> And in fact the entire idea of visiting a vfsmount mountpoint pair
> exactly once is wrong in the face of shadow mounts.  For a vfsmount
> mountpoint pair that has shadow mounts the number of shadow mounts needs
> to be descreased by one each time the propgation tree is traversed
> during unmount. Which means that as far as I can see we have to kill
> shadow mounts to correctly optimize this code.  Once shadow mounts are
> gone I don't know of a case where need your optimization.

Without shadow mounts, it will be hard to save predictable behaviour
for cases like this:

$ unshare --propagation private -m sh test.sh
+ mount -t tmpfs --make-shared zzzz A
+ mkdir A/a
+ mount -t tmpfs zzzz A/a
+ mount --bind A B
+ mount -t tmpfs zzzz B/a
+ grep zzzz
+ cat /proc/self/mountinfo
155 123 0:44 / /root/tmp/A rw,relatime shared:70 - tmpfs zzzz rw
156 155 0:45 / /root/tmp/A/a rw,relatime shared:71 - tmpfs zzzz rw
157 123 0:44 / /root/tmp/B rw,relatime shared:70 - tmpfs zzzz rw
158 157 0:46 / /root/tmp/B/a rw,relatime shared:72 - tmpfs zzzz rw
159 155 0:46 / /root/tmp/A/a rw,relatime shared:72 - tmpfs zzzz rw
+ umount B/a
+ grep zzzz
+ cat /proc/self/mountinfo
155 123 0:44 / /root/tmp/A rw,relatime shared:70 - tmpfs zzzz rw
156 155 0:45 / /root/tmp/A/a rw,relatime shared:71 - tmpfs zzzz rw
157 123 0:44 / /root/tmp/B rw,relatime shared:70 - tmpfs zzzz rw

X + a - a = X

Maybe we need to add another ID for propagated mounts, so that on
umount we detach only the mounts with the same propagation id.
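
A rough sketch of that idea (purely illustrative: the field and the
helper below do not exist in the kernel, they are only an assumption
about what such a change could look like):

/* hypothetical: record which propagation event created a mount */
struct mount {
	/* ... existing fields ... */
	int mnt_propagation_id;		/* assumed new field */
};

/*
 * On umount, detach only the copy that was created by the same
 * propagation event as the mount being removed, so that
 * "X + a - a" still gives back X.
 */
static bool umount_same_propagation(struct mount *child, struct mount *target)
{
	return child->mnt_propagation_id == target->mnt_propagation_id;
}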

I support the idea of killing shadow mounts. I guess it will help us
to simplify the algorithm for dumping and restoring a mount tree in CRIU.

Currently it is a big pain for us.

> 
> I am busily verifying my patch to kill shadow mounts but the following
> patch is the minimal version.  As far as I can see propagate_one
> is the only place we create shadow mounts, and holding the
> namespace_lock over attach_recursive_mnt, propagate_mnt, and
> propagate_one is sufficient for that __lookup_mnt to be completely safe.
> 
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 234a9ac49958..b14119b370d4 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -217,6 +217,9 @@ static int propagate_one(struct mount *m)
>         /* skip if mountpoint isn't covered by it */
>         if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
>                 return 0;
> +       /* skip if mountpoint already has a mount on it */
> +       if (__lookup_mnt(&m->mnt, mp->m_dentry))
> +               return 0;
>         if (peers(m, last_dest)) {
>                 type = CL_MAKE_SHARED;
>         } else {
> 
> If you run with that patch you will see that there are go-faster stripes.
> 
> Eric
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH v2] mount: In propagate_umount handle overlapping mount propagation trees
       [not found]                                                     ` <20161025234125.GA20335-1ViLX0X+lBJGNQ1M2rI3KwRV3xvJKrda@public.gmane.org>
@ 2016-10-26  1:42                                                       ` Eric W. Biederman
  0 siblings, 0 replies; 36+ messages in thread
From: Eric W. Biederman @ 2016-10-26  1:42 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: linux-fsdevel, LKML, Linux Containers, Andrey Vagin, Alexander Viro

Andrei Vagin <avagin-5HdwGun5lf+gSpxsJD1C4w@public.gmane.org> writes:

> On Tue, Oct 25, 2016 at 04:45:44PM -0500, Eric W. Biederman wrote:
>> That is certainly interesting.  The problem is that the reason we were
>> going slow is that there were in fact mounts that had not been traversed
>> in the share group.
>
> You are right.
>
>> 
>> And in fact the entire idea of visiting a vfsmount mountpoint pair
>> exactly once is wrong in the face of shadow mounts.  For a vfsmount
>> mountpoint pair that has shadow mounts the number of shadow mounts needs
>> to be decreased by one each time the propagation tree is traversed
>> during unmount. Which means that as far as I can see we have to kill
>> shadow mounts to correctly optimize this code.  Once shadow mounts are
>> gone I don't know of a case where we need your optimization.
>
> Without shadow mounts, it will be hard to preserve predictable behaviour
> for cases like this:
>
> $ unshare --propagation private -m sh test.sh
> + mount -t tmpfs --make-shared zzzz A
> + mkdir A/a
> + mount -t tmpfs zzzz A/a
> + mount --bind A B
> + mount -t tmpfs zzzz B/a
> + grep zzzz
> + cat /proc/self/mountinfo
> 155 123 0:44 / /root/tmp/A rw,relatime shared:70 - tmpfs zzzz rw
> 156 155 0:45 / /root/tmp/A/a rw,relatime shared:71 - tmpfs zzzz rw
> 157 123 0:44 / /root/tmp/B rw,relatime shared:70 - tmpfs zzzz rw
> 158 157 0:46 / /root/tmp/B/a rw,relatime shared:72 - tmpfs zzzz rw
> 159 155 0:46 / /root/tmp/A/a rw,relatime shared:72 - tmpfs zzzz rw
> + umount B/a
> + grep zzzz
> + cat /proc/self/mountinfo
> 155 123 0:44 / /root/tmp/A rw,relatime shared:70 - tmpfs zzzz rw
> 156 155 0:45 / /root/tmp/A/a rw,relatime shared:71 - tmpfs zzzz rw
> 157 123 0:44 / /root/tmp/B rw,relatime shared:70 - tmpfs zzzz rw
>
> X + a - a = X
>
> Maybe we need to add another ID for propagated mounts, so that on
> umount we detach only the mounts with the same propagation id.
>
> I support the idea of killing shadow mounts. I guess it will help us
> to simplify the algorithm for dumping and restoring a mount tree in CRIU.
>
> Currently it is a big pain for us.

Killing shadow mounts is not exactly a done deal as there are some
user-visible effects.  The practical question becomes: do we break
anything anyone cares about in userspace?  Answering those practical
questions sucks.

I definitely think we should try to kill shadow mounts because they are
such a big pain to deal with, and only provide very limited value.

So far the only thing I have seen shadow mounts being good for is
preserving unmount behavior in cases where someone has constructed an
artificially evil mount tree. I haven't figured out how any of those
mount trees would actually be useful in real life.

Eric

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH v2] mount: In propagate_umount handle overlapping mount propagation trees
  2016-10-25 21:45                                                 ` Eric W. Biederman
@ 2016-11-01  6:14                                                     ` Andrei Vagin
  -1 siblings, 0 replies; 36+ messages in thread
From: Andrei Vagin @ 2016-11-01  6:14 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-fsdevel, LKML, Linux Containers, Andrey Vagin, Alexander Viro

On Tue, Oct 25, 2016 at 04:45:44PM -0500, Eric W. Biederman wrote:
> Andrei Vagin <avagin-5HdwGun5lf+gSpxsJD1C4w@public.gmane.org> writes:
> >
> > From 8e0f45c0272aa1f789d1657a0acc98c58919dcc3 Mon Sep 17 00:00:00 2001
> > From: Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
> > Date: Tue, 25 Oct 2016 13:57:31 -0700
> > Subject: [PATCH] mount: skip all mounts from a shared group if one is marked
> >
> > If we meet a marked mount, it means that all mounts from
> > its group have already been revised.
> >
> > Signed-off-by: Andrei Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
> > ---
> >  fs/pnode.c | 18 +++++++++++++++---
> >  1 file changed, 15 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/pnode.c b/fs/pnode.c
> > index 8fd1a3f..ebb7134 100644
> > --- a/fs/pnode.c
> > +++ b/fs/pnode.c
> > @@ -426,10 +426,16 @@ static struct mount *propagation_visit_child(struct mount *last_child,
> >  		if (child && !IS_MNT_MARKED(child))
> >  			return child;
> >  
> > -		if (!child)
> > +		if (!child) {
> >  			m = propagation_next(m, origin);
> > -		else
> > +		} else {
> > +			if (IS_MNT_MARKED(child)) {
> > +				if (m->mnt_group_id == origin->mnt_group_id)
> > +					return NULL;
> > +				m = m->mnt_master;
> > +			}
> >  			m = propagation_next_sib(m, origin);
> > +		}
> >  	}
> >  	return NULL;
> >  }
> > @@ -456,8 +462,14 @@ static struct mount *propagation_revisit_child(struct mount *last_child,
> >  
> >  		if (!child)
> >  			m = propagation_next(m, origin);
> > -		else
> > +		else {
> > +			if (!IS_MNT_MARKED(child)) {
> > +				if (m->mnt_group_id == origin->mnt_group_id)
> > +					return NULL;
> > +				m = m->mnt_master;
> > +			}
> >  			m = propagation_next_sib(m, origin);
> > +		}
> >  	}
> >  	return NULL;
> >  }
> 
> That is certainly interesting.  The problem is that the reason we were
> going slow is that there were in fact mounts that had not been traversed
> in the share group.
> 
> And in fact the entire idea of visiting a vfsmount mountpoint pair
> exactly once is wrong in the face of shadow mounts.  For a vfsmount
> mountpoint pair that has shadow mounts the number of shadow mounts needs
> to be decreased by one each time the propagation tree is traversed
> during unmount. Which means that as far as I can see we have to kill
> shadow mounts to correctly optimize this code.  Once shadow mounts are
> gone I don't know of a case where we need your optimization.

I am not sure that shadow mounts currently work as you described
here. start_umount_propagation() doesn't remove a mount from mnt_hash,
so the second time around we will look up the same mount again.
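
To illustrate (a sketch only, not the RFC's actual code; the calling
context and the exact signature of start_umount_propagation() are
assumptions):

	struct mount *child;

	/* first pass over the propagation tree */
	child = __lookup_mnt_last(&m->mnt, mp->m_dentry);
	start_umount_propagation(child, &to_umount);
	/* child is marked and queued, but stays on mnt_hash */

	/* a later pass over the same (parent, mountpoint) pair */
	child = __lookup_mnt_last(&m->mnt, mp->m_dentry);
	/* resolves to the same mount again, not to the shadow
	 * mount hashed behind it, because nothing unhashed the
	 * first one */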

Look at this script:

[root@fc24 mounts]# cat ./opus02.sh
set -e
mkdir -p /mnt
mount -t tmpfs zdtm /mnt
mkdir -p /mnt/A/a
mkdir -p /mnt/B/a
mount --bind --make-shared /mnt/A /mnt/A
mount --bind /mnt/A /mnt/B
mount --bind /mnt/A/a /mnt/A/a
mount --bind /mnt/A/a /mnt/A/a

umount -l /mnt/A
cat /proc/self/mountinfo | grep zdtm

[root@fc24 mounts]# unshare --propagation private -m ./opus02.sh
159 121 0:46 / /mnt rw,relatime - tmpfs zdtm rw
162 159 0:46 /A /mnt/B rw,relatime shared:67 - tmpfs zdtm rw
167 162 0:46 /A/a /mnt/B/a rw,relatime shared:67 - tmpfs zdtm rw

We mount nothing into /mnt/B, but when we umount everything from A, we
still have something in B.

Thanks,
Andrei
> 
> I am busily verifying my patch to kill shadow mounts but the following
> patch is the minimal version.  As far as I can see propagate_one
> is the only place we create shadow mounts, and holding the
> namespace_lock over attach_recursive_mnt, propagate_mnt, and
> propagate_one is sufficient for that __lookup_mnt to be completely safe.
> 
> diff --git a/fs/pnode.c b/fs/pnode.c
> index 234a9ac49958..b14119b370d4 100644
> --- a/fs/pnode.c
> +++ b/fs/pnode.c
> @@ -217,6 +217,9 @@ static int propagate_one(struct mount *m)
>         /* skip if mountpoint isn't covered by it */
>         if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
>                 return 0;
> +       /* skip if mountpoint already has a mount on it */
> +       if (__lookup_mnt(&m->mnt, mp->m_dentry))
> +               return 0;
>         if (peers(m, last_dest)) {
>                 type = CL_MAKE_SHARED;
>         } else {
> 
> If you run with that patch you will see that there are go-faster stripes.
> 
> Eric
> 

^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2016-11-01  8:49 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-10-10 23:26 [PATCH] [v3] mount: dont execute propagate_umount() many times for same mounts Andrei Vagin
2016-10-10 23:26 ` Andrei Vagin
     [not found] ` <1476141965-21429-1-git-send-email-avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
2016-10-13 17:14   ` Eric W. Biederman
2016-10-13 17:14 ` Eric W. Biederman
2016-10-13 19:53   ` [RFC][PATCH] mount: In mark_umount_candidates and __propogate_umount visit each mount once Eric W. Biederman
2016-10-13 21:46     ` Andrei Vagin
     [not found]       ` <20161013214650.GB19836-1ViLX0X+lBJGNQ1M2rI3KwRV3xvJKrda@public.gmane.org>
2016-10-14  2:31         ` Andrey Vagin
2016-10-14  2:31           ` Andrey Vagin
     [not found]           ` <CANaxB-xPkgdyeg0z6TvExMfyy4uOC+Nu4Q99WpCscNKMWz8VPg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2016-10-14  2:45             ` Eric W. Biederman
2016-10-14  2:45               ` Eric W. Biederman
     [not found]               ` <87wphb4pjn.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
2016-10-14 18:29                 ` [RFC][PATCH v2] " Eric W. Biederman
2016-10-14 18:29                   ` Eric W. Biederman
     [not found]                   ` <8737jy3htt.fsf_-_-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
2016-10-18  2:40                     ` Andrei Vagin
2016-10-18  2:40                       ` Andrei Vagin
     [not found]                       ` <20161018024000.GA4901-1ViLX0X+lBJGNQ1M2rI3KwRV3xvJKrda@public.gmane.org>
2016-10-18  6:49                         ` Eric W. Biederman
2016-10-18  6:49                           ` Eric W. Biederman
2016-10-19  3:46                           ` [REVIEW][PATCH] mount: In propagate_umount handle overlapping mount propagation trees Eric W. Biederman
     [not found]                             ` <877f95ngpr.fsf_-_-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
2016-10-20 21:30                               ` Andrei Vagin
2016-10-20 21:30                                 ` Andrei Vagin
     [not found]                                 ` <20161020213052.GA25226-1ViLX0X+lBJGNQ1M2rI3KwRV3xvJKrda@public.gmane.org>
2016-10-21 19:26                                   ` Eric W. Biederman
2016-10-21 19:26                                     ` Eric W. Biederman
     [not found]                                     ` <87pomtec6c.fsf-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
2016-10-22 19:42                                       ` [RFC][PATCH v2] " Eric W. Biederman
2016-10-22 19:42                                         ` Eric W. Biederman
     [not found]                                         ` <877f90b27o.fsf_-_-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
2016-10-25 20:58                                           ` Andrei Vagin
2016-10-25 20:58                                             ` Andrei Vagin
     [not found]                                             ` <20161025205846.GA25080-1ViLX0X+lBJGNQ1M2rI3KwRV3xvJKrda@public.gmane.org>
2016-10-25 21:45                                               ` Eric W. Biederman
2016-10-25 21:45                                                 ` Eric W. Biederman
     [not found]                                                 ` <87mvhs14s7.fsf-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
2016-10-25 23:41                                                   ` Andrei Vagin
2016-10-25 23:41                                                     ` Andrei Vagin
2016-10-26  1:42                                                     ` Eric W. Biederman
     [not found]                                                     ` <20161025234125.GA20335-1ViLX0X+lBJGNQ1M2rI3KwRV3xvJKrda@public.gmane.org>
2016-10-26  1:42                                                       ` Eric W. Biederman
2016-11-01  6:14                                                   ` Andrei Vagin
2016-11-01  6:14                                                     ` Andrei Vagin
     [not found]                           ` <87r37e9mnj.fsf-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>
2016-10-19  3:46                             ` [REVIEW][PATCH] " Eric W. Biederman
     [not found]     ` <87pon458l1.fsf_-_-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
2016-10-13 21:46       ` [RFC][PATCH] mount: In mark_umount_candidates and __propogate_umount visit each mount once Andrei Vagin
     [not found]   ` <877f9c6ui8.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
2016-10-13 19:53     ` Eric W. Biederman
