* [PATCH RFC] mnt: umount mounts one by one in umount_tree()
@ 2017-05-12  7:08 Andrei Vagin
  2017-05-12 18:56 ` [PATCH v2 " Andrei Vagin
       [not found] ` <20170512070838.5037-1-avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
  0 siblings, 2 replies; 7+ messages in thread
From: Andrei Vagin @ 2017-05-12  7:08 UTC (permalink / raw)
  To: Eric W . Biederman
  Cc: Alexander Viro, linux-fsdevel, linux-kernel, Linux Containers,
	Andrei Vagin

With this patch, we no longer try to umount all mounts of a tree at
once. Instead, we umount them one by one. This gives a significant
performance improvement in the worst case.

The reason for this optimization is that umount() can hold
namespace_sem for a long time, and this semaphore is global, so it
affects all users. Recently Eric W. Biederman added a per-mount-namespace
limit on the number of mounts. The default limit is 100,000 mounts per
namespace, which is still enough to construct a tree that takes hours
to umount.
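
For reference, a kernel that carries that limit exposes it as a sysctl,
so the limit and the current mount count can be checked before running
the test below (this assumes the fs.mount-max sysctl is present):

# assumes the fs.mount-max sysctl from the per-namespace limit patch
cat /proc/sys/fs/mount-max
# number of mounts currently visible in this mount namespace
wc -l < /proc/self/mountinfo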

In the worst case, the current complexity of umount_tree() is O(n^3):
* Enumerate all mounts in the target tree (propagate_umount)
* Enumerate mounts to find where these changes have to
  be propagated (mark_umount_candidates)
* Enumerate mounts to find the required mount by parent and dentry
  (__lookup_mnt). __lookup_mnt() looks a mount up in m_hash, but
  the number of mounts is much bigger than the size of the hash table.

The worst case is when all mounts of the tree live in the same shared
group; then all mounts have to be enumerated at each step.

CVE-2016-6213 was filed for this case.

Here are results for the kernel with this patch
$ for i in `seq 10 15`; do  unshare -m sh ./run.sh $i; done
umount -l /mnt/1 -> 	0:00.00
umount -l /mnt/1 -> 	0:00.01
umount -l /mnt/1 -> 	0:00.01
umount -l /mnt/1 -> 	0:00.03
umount -l /mnt/1 -> 	0:00.07
umount -l /mnt/1 -> 	0:00.14

Here are results for the kernel without this patch
$ for i in `seq 10 15`; do  unshare -m sh ./run.sh $i; done
umount -l /mnt/1 -> 	0:00.04
umount -l /mnt/1 -> 	0:00.17
umount -l /mnt/1 -> 	0:00.75
umount -l /mnt/1 -> 	0:05.96
umount -l /mnt/1 -> 	0:34.40
umount -l /mnt/1 -> 	3:46.27

And here is a test script:
$ cat run.sh
set -e -m

mount -t tmpfs zdtm /mnt
mkdir -p /mnt/1 /mnt/2
mount -t tmpfs zdtm /mnt/1
mount --make-shared /mnt/1
mkdir /mnt/1/1

for i in `seq $1`; do
	./mount --bind /mnt/1/1 /mnt/1/1
done

echo -n "umount -l /mnt/1 -> "
/usr/bin/time -f '%E' ./umount -l /mnt/1

We need these simple mount and umount tools because the standard ones
read /proc/self/mountinfo, which is extremely slow when there are
thousands of mounts.
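
As a rough illustration (timings are only indicative and vary by
system), a single read of that file can be timed the same way the
umount itself is timed above:

# indicative only: generating mountinfo scales with the number of mounts
/usr/bin/time -f '%E' cat /proc/self/mountinfo > /dev/null
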
$ cat mount.c
 #include <sys/mount.h>
 #include <stdlib.h>

 int main(int argc, char **argv)
 {
 	/* invoked from run.sh as "./mount --bind <src> <dst>", so the
 	 * source is argv[2] and the target is argv[3]; no error handling */
 	return mount(argv[2], argv[3], NULL, MS_BIND, NULL);
 }

$ cat umount.c
 #include <sys/mount.h>

 int main(int argc, char **argv)
 {
 	/* invoked from run.sh as "./umount -l <target>", so the target is
 	 * argv[2]; MNT_DETACH is the lazy detach requested by "-l" */
 	return umount2(argv[2], MNT_DETACH);
 }
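
A minimal way to build the helpers and reproduce one run (assuming gcc,
root, and that mount.c, umount.c and run.sh sit in the current
directory):

# build the helpers used by run.sh
gcc -o mount mount.c
gcc -o umount umount.c
# the largest case from the measurements above
unshare -m sh ./run.sh 15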

Here is a previous attempt to optimize this code:
https://lkml.org/lkml/2016/10/10/495

Signed-off-by: Andrei Vagin <avagin@openvz.org>
---
 fs/namespace.c | 81 +++++++++++++++++++++++++++++++---------------------------
 1 file changed, 43 insertions(+), 38 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 3bf0cd2..4e6f258 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1474,56 +1474,61 @@ static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
  */
 static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 {
-	LIST_HEAD(tmp_list);
 	struct mount *p;
+	int done = 0;
 
 	if (how & UMOUNT_PROPAGATE)
 		propagate_mount_unlock(mnt);
 
 	/* Gather the mounts to umount */
-	for (p = mnt; p; p = next_mnt(p, mnt)) {
+	while (!done) {
+		LIST_HEAD(tmp_list);
+
+		p = mnt;
+		while (!list_empty(&p->mnt_mounts))
+			p = list_entry(p->mnt_mounts.next, struct mount, mnt_child);
+
 		p->mnt.mnt_flags |= MNT_UMOUNT;
 		list_move(&p->mnt_list, &tmp_list);
-	}
-
-	/* Hide the mounts from mnt_mounts */
-	list_for_each_entry(p, &tmp_list, mnt_list) {
 		list_del_init(&p->mnt_child);
-	}
 
-	/* Add propogated mounts to the tmp_list */
-	if (how & UMOUNT_PROPAGATE)
-		propagate_umount(&tmp_list);
-
-	while (!list_empty(&tmp_list)) {
-		struct mnt_namespace *ns;
-		bool disconnect;
-		p = list_first_entry(&tmp_list, struct mount, mnt_list);
-		list_del_init(&p->mnt_expire);
-		list_del_init(&p->mnt_list);
-		ns = p->mnt_ns;
-		if (ns) {
-			ns->mounts--;
-			__touch_mnt_namespace(ns);
-		}
-		p->mnt_ns = NULL;
-		if (how & UMOUNT_SYNC)
-			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
-
-		disconnect = disconnect_mount(p, how);
-
-		pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
-				 disconnect ? &unmounted : NULL);
-		if (mnt_has_parent(p)) {
-			mnt_add_count(p->mnt_parent, -1);
-			if (!disconnect) {
-				/* Don't forget about p */
-				list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
-			} else {
-				umount_mnt(p);
+		/* Add propogated mounts to the tmp_list */
+		if (how & UMOUNT_PROPAGATE)
+			propagate_umount(&tmp_list);
+
+		if (p == mnt)
+			done = 1;
+
+		while (!list_empty(&tmp_list)) {
+			struct mnt_namespace *ns;
+			bool disconnect;
+			p = list_first_entry(&tmp_list, struct mount, mnt_list);
+			list_del_init(&p->mnt_expire);
+			list_del_init(&p->mnt_list);
+			ns = p->mnt_ns;
+			if (ns) {
+				ns->mounts--;
+				__touch_mnt_namespace(ns);
+			}
+			p->mnt_ns = NULL;
+			if (how & UMOUNT_SYNC)
+				p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
+
+			disconnect = disconnect_mount(p, how);
+
+			pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
+					 disconnect ? &unmounted : NULL);
+			if (mnt_has_parent(p)) {
+				mnt_add_count(p->mnt_parent, -1);
+				if (!disconnect) {
+					/* Don't forget about p */
+					list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
+				} else {
+					umount_mnt(p);
+				}
 			}
+			change_mnt_propagation(p, MS_PRIVATE);
 		}
-		change_mnt_propagation(p, MS_PRIVATE);
 	}
 }
 
-- 
2.9.3


* [PATCH v2 RFC] mnt: umount mounts one by one in umount_tree()
       [not found] ` <20170512070838.5037-1-avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
@ 2017-05-12 18:56   ` Andrei Vagin
  2017-06-14  1:53     ` Ian Kent
  1 sibling, 0 replies; 7+ messages in thread
From: Andrei Vagin @ 2017-05-12 18:56 UTC (permalink / raw)
  To: Eric W . Biederman
  Cc: Andrei Vagin, linux-api-u79uwXL29TY76Z2rM5mHXA, Linux Containers,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, criu-GEFAQzZX7r8dnm+yROfE0A,
	Alexander Viro, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA

With this patch, we no longer try to umount all mounts of a tree at
once. Instead, we umount them one by one. This gives a significant
performance improvement in the worst case.

v2: create a sorted list of mounts, so that a child is umounted before
its parent.
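
The order is simply a post-order walk of the mount tree, i.e. every
child comes before its parent. As a loose user-space analogy only
(directories standing in for mounts), find -depth prints entries in
the same child-before-parent order:

# toy illustration only: with -depth, the contents of a directory are
# printed before the directory itself, just as child mounts have to be
# umounted before their parent
mkdir -p /tmp/tree/a/b /tmp/tree/a/c
find /tmp/tree -depth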

The reason for this optimization is that umount() can hold
namespace_sem for a long time, and this semaphore is global, so it
affects all users. Recently Eric W. Biederman added a per-mount-namespace
limit on the number of mounts. The default limit is 100,000 mounts per
namespace, which is still enough to construct a tree that takes hours
to umount.

In the worst case, the current complexity of umount_tree() is O(n^3):
* Enumerate all mounts in the target tree (propagate_umount)
* Enumerate mounts to find where these changes have to
  be propagated (mark_umount_candidates)
* Enumerate mounts to find the required mount by parent and dentry
  (__lookup_mnt). __lookup_mnt() looks a mount up in m_hash, but
  the number of mounts is much bigger than the size of the hash table.

The worst case is when all mounts of the tree live in the same shared
group; then all mounts have to be enumerated at each step.

CVE-2016-6213 was filed for this case.

Here are results for the kernel with this patch
$ for i in `seq 10 15`; do  unshare -m sh ./run.sh $i; done
umount -l /mnt/1 -> 	0:00.00
umount -l /mnt/1 -> 	0:00.01
umount -l /mnt/1 -> 	0:00.01
umount -l /mnt/1 -> 	0:00.03
umount -l /mnt/1 -> 	0:00.07
umount -l /mnt/1 -> 	0:00.14

Here are results for the kernel without this patch
$ for i in `seq 10 15`; do  unshare -m sh ./run.sh $i; done
umount -l /mnt/1 -> 	0:00.04
umount -l /mnt/1 -> 	0:00.17
umount -l /mnt/1 -> 	0:00.75
umount -l /mnt/1 -> 	0:05.96
umount -l /mnt/1 -> 	0:34.40
umount -l /mnt/1 -> 	3:46.27

And here is a test script:
$ cat run.sh
set -e -m

mount -t tmpfs zdtm /mnt
mkdir -p /mnt/1 /mnt/2
mount -t tmpfs zdtm /mnt/1
mount --make-shared /mnt/1
mkdir /mnt/1/1

for i in `seq $1`; do
	./mount --bind /mnt/1/1 /mnt/1/1
done

echo -n "umount -l /mnt/1 -> "
/usr/bin/time -f '%E' ./umount -l /mnt/1

We need these simple mount and umount tools because the standard ones
read /proc/self/mountinfo, which is extremely slow when there are
thousands of mounts.
$ cat mount.c
 #include <sys/mount.h>
 #include <stdlib.h>

 int main(int argc, char **argv)
 {
 	return mount(argv[2], argv[3], NULL, MS_BIND, NULL);
 }

$ cat umount.c
 #include <sys/mount.h>

 int main(int argc, char **argv)
 {
 	return umount2(argv[2], MNT_DETACH);
 }

Here is a previous attempt to optimize this code:
https://lkml.org/lkml/2016/10/10/495

Signed-off-by: Andrei Vagin <avagin@openvz.org>
---
 fs/namespace.c | 105 ++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 67 insertions(+), 38 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 3bf0cd2..4f7e36c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1468,62 +1468,91 @@ static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
 	return true;
 }
 
+/* post-order depth-first search */
+static struct mount *next_kid(struct mount *p, struct mount *root)
+{
+	if (p == NULL) {
+		p = root;
+		goto out;
+	}
+	if (p->mnt_child.next != &p->mnt_parent->mnt_mounts) {
+		p = list_entry(p->mnt_child.next, struct mount, mnt_child);
+		goto out;
+	}
+
+	return p->mnt_parent;
+
+out:
+	while (!list_empty(&p->mnt_mounts))
+		p = list_first_entry(&p->mnt_mounts, struct mount, mnt_child);
+	return p;
+}
+
 /*
  * mount_lock must be held
  * namespace_sem must be held for write
  */
 static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 {
-	LIST_HEAD(tmp_list);
 	struct mount *p;
+	LIST_HEAD(post_order_list);
+
+	/* Create a list of mounts in the order in which to umount them */
+	p = NULL;
+	while (1) {
+		p = next_kid(p, mnt);
+		list_move_tail(&p->mnt_list, &post_order_list);
+		if (p == mnt)
+			break;
+	}
 
 	if (how & UMOUNT_PROPAGATE)
 		propagate_mount_unlock(mnt);
 
 	/* Gather the mounts to umount */
-	for (p = mnt; p; p = next_mnt(p, mnt)) {
+	while (!list_empty(&post_order_list)) {
+		LIST_HEAD(tmp_list);
+
+		p = list_first_entry(&post_order_list, struct mount, mnt_list);
+
 		p->mnt.mnt_flags |= MNT_UMOUNT;
 		list_move(&p->mnt_list, &tmp_list);
-	}
-
-	/* Hide the mounts from mnt_mounts */
-	list_for_each_entry(p, &tmp_list, mnt_list) {
 		list_del_init(&p->mnt_child);
-	}
 
-	/* Add propogated mounts to the tmp_list */
-	if (how & UMOUNT_PROPAGATE)
-		propagate_umount(&tmp_list);
-
-	while (!list_empty(&tmp_list)) {
-		struct mnt_namespace *ns;
-		bool disconnect;
-		p = list_first_entry(&tmp_list, struct mount, mnt_list);
-		list_del_init(&p->mnt_expire);
-		list_del_init(&p->mnt_list);
-		ns = p->mnt_ns;
-		if (ns) {
-			ns->mounts--;
-			__touch_mnt_namespace(ns);
-		}
-		p->mnt_ns = NULL;
-		if (how & UMOUNT_SYNC)
-			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
-
-		disconnect = disconnect_mount(p, how);
-
-		pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
-				 disconnect ? &unmounted : NULL);
-		if (mnt_has_parent(p)) {
-			mnt_add_count(p->mnt_parent, -1);
-			if (!disconnect) {
-				/* Don't forget about p */
-				list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
-			} else {
-				umount_mnt(p);
+		/* Add propogated mounts to the tmp_list */
+		if (how & UMOUNT_PROPAGATE)
+			propagate_umount(&tmp_list);
+
+		while (!list_empty(&tmp_list)) {
+			struct mnt_namespace *ns;
+			bool disconnect;
+			p = list_first_entry(&tmp_list, struct mount, mnt_list);
+			list_del_init(&p->mnt_expire);
+			list_del_init(&p->mnt_list);
+			ns = p->mnt_ns;
+			if (ns) {
+				ns->mounts--;
+				__touch_mnt_namespace(ns);
+			}
+			p->mnt_ns = NULL;
+			if (how & UMOUNT_SYNC)
+				p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
+
+			disconnect = disconnect_mount(p, how);
+
+			pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
+					 disconnect ? &unmounted : NULL);
+			if (mnt_has_parent(p)) {
+				mnt_add_count(p->mnt_parent, -1);
+				if (!disconnect) {
+					/* Don't forget about p */
+					list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
+				} else {
+					umount_mnt(p);
+				}
 			}
+			change_mnt_propagation(p, MS_PRIVATE);
 		}
-		change_mnt_propagation(p, MS_PRIVATE);
 	}
 }
 
-- 
2.9.3



* [PATCH v2 RFC] mnt: umount mounts one by one in umount_tree()
  2017-05-12  7:08 [PATCH RFC] mnt: umount mounts one by one in umount_tree() Andrei Vagin
@ 2017-05-12 18:56 ` Andrei Vagin
       [not found] ` <20170512070838.5037-1-avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
  1 sibling, 0 replies; 7+ messages in thread
From: Andrei Vagin @ 2017-05-12 18:56 UTC (permalink / raw)
  To: Eric W . Biederman
  Cc: Alexander Viro, linux-fsdevel, linux-kernel, linux-api, criu,
	Linux Containers, Andrei Vagin

With this patch, we no longer try to umount all mounts of a tree at
once. Instead, we umount them one by one. This gives a significant
performance improvement in the worst case.

v2: create a sorted list of mounts, so that a child is umounted before
its parent.

The reason for this optimization is that umount() can hold
namespace_sem for a long time, and this semaphore is global, so it
affects all users. Recently Eric W. Biederman added a per-mount-namespace
limit on the number of mounts. The default limit is 100,000 mounts per
namespace, which is still enough to construct a tree that takes hours
to umount.

In the worst case, the current complexity of umount_tree() is O(n^3):
* Enumerate all mounts in the target tree (propagate_umount)
* Enumerate mounts to find where these changes have to
  be propagated (mark_umount_candidates)
* Enumerate mounts to find the required mount by parent and dentry
  (__lookup_mnt). __lookup_mnt() looks a mount up in m_hash, but
  the number of mounts is much bigger than the size of the hash table.

The worst case is when all mounts of the tree live in the same shared
group; then all mounts have to be enumerated at each step.

CVE-2016-6213 was filed for this case.

Here are results for the kernel with this patch
$ for i in `seq 10 15`; do  unshare -m sh ./run.sh $i; done
umount -l /mnt/1 -> 	0:00.00
umount -l /mnt/1 -> 	0:00.01
umount -l /mnt/1 -> 	0:00.01
umount -l /mnt/1 -> 	0:00.03
umount -l /mnt/1 -> 	0:00.07
umount -l /mnt/1 -> 	0:00.14

Here are results for the kernel without this patch
$ for i in `seq 10 15`; do  unshare -m sh ./run.sh $i; done
umount -l /mnt/1 -> 	0:00.04
umount -l /mnt/1 -> 	0:00.17
umount -l /mnt/1 -> 	0:00.75
umount -l /mnt/1 -> 	0:05.96
umount -l /mnt/1 -> 	0:34.40
umount -l /mnt/1 -> 	3:46.27

And here is a test script:
$ cat run.sh
set -e -m

mount -t tmpfs zdtm /mnt
mkdir -p /mnt/1 /mnt/2
mount -t tmpfs zdtm /mnt/1
mount --make-shared /mnt/1
mkdir /mnt/1/1

for i in `seq $1`; do
	./mount --bind /mnt/1/1 /mnt/1/1
done

echo -n "umount -l /mnt/1 -> "
/usr/bin/time -f '%E' ./umount -l /mnt/1

We need these simple mount and umount tools because the standard ones
read /proc/self/mountinfo, which is extremely slow when there are
thousands of mounts.
$ cat mount.c
 #include <sys/mount.h>
 #include <stdlib.h>

 int main(int argc, char **argv)
 {
 	return mount(argv[2], argv[3], NULL, MS_BIND, NULL);
 }

$ cat umount.c
 #include <sys/mount.h>

 int main(int argc, char **argv)
 {
 	return umount2(argv[2], MNT_DETACH);
 }

Here is a previous attempt to optimize this code:
https://lkml.org/lkml/2016/10/10/495

Signed-off-by: Andrei Vagin <avagin@openvz.org>
---
 fs/namespace.c | 105 ++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 67 insertions(+), 38 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 3bf0cd2..4f7e36c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1468,62 +1468,91 @@ static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
 	return true;
 }
 
+/* post-order depth-first search */
+static struct mount *next_kid(struct mount *p, struct mount *root)
+{
+	if (p == NULL) {
+		p = root;
+		goto out;
+	}
+	if (p->mnt_child.next != &p->mnt_parent->mnt_mounts) {
+		p = list_entry(p->mnt_child.next, struct mount, mnt_child);
+		goto out;
+	}
+
+	return p->mnt_parent;
+
+out:
+	while (!list_empty(&p->mnt_mounts))
+		p = list_first_entry(&p->mnt_mounts, struct mount, mnt_child);
+	return p;
+}
+
 /*
  * mount_lock must be held
  * namespace_sem must be held for write
  */
 static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 {
-	LIST_HEAD(tmp_list);
 	struct mount *p;
+	LIST_HEAD(post_order_list);
+
+	/* Create a list of mounts in the order in which to umount them */
+	p = NULL;
+	while (1) {
+		p = next_kid(p, mnt);
+		list_move_tail(&p->mnt_list, &post_order_list);
+		if (p == mnt)
+			break;
+	}
 
 	if (how & UMOUNT_PROPAGATE)
 		propagate_mount_unlock(mnt);
 
 	/* Gather the mounts to umount */
-	for (p = mnt; p; p = next_mnt(p, mnt)) {
+	while (!list_empty(&post_order_list)) {
+		LIST_HEAD(tmp_list);
+
+		p = list_first_entry(&post_order_list, struct mount, mnt_list);
+
 		p->mnt.mnt_flags |= MNT_UMOUNT;
 		list_move(&p->mnt_list, &tmp_list);
-	}
-
-	/* Hide the mounts from mnt_mounts */
-	list_for_each_entry(p, &tmp_list, mnt_list) {
 		list_del_init(&p->mnt_child);
-	}
 
-	/* Add propogated mounts to the tmp_list */
-	if (how & UMOUNT_PROPAGATE)
-		propagate_umount(&tmp_list);
-
-	while (!list_empty(&tmp_list)) {
-		struct mnt_namespace *ns;
-		bool disconnect;
-		p = list_first_entry(&tmp_list, struct mount, mnt_list);
-		list_del_init(&p->mnt_expire);
-		list_del_init(&p->mnt_list);
-		ns = p->mnt_ns;
-		if (ns) {
-			ns->mounts--;
-			__touch_mnt_namespace(ns);
-		}
-		p->mnt_ns = NULL;
-		if (how & UMOUNT_SYNC)
-			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
-
-		disconnect = disconnect_mount(p, how);
-
-		pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
-				 disconnect ? &unmounted : NULL);
-		if (mnt_has_parent(p)) {
-			mnt_add_count(p->mnt_parent, -1);
-			if (!disconnect) {
-				/* Don't forget about p */
-				list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
-			} else {
-				umount_mnt(p);
+		/* Add propogated mounts to the tmp_list */
+		if (how & UMOUNT_PROPAGATE)
+			propagate_umount(&tmp_list);
+
+		while (!list_empty(&tmp_list)) {
+			struct mnt_namespace *ns;
+			bool disconnect;
+			p = list_first_entry(&tmp_list, struct mount, mnt_list);
+			list_del_init(&p->mnt_expire);
+			list_del_init(&p->mnt_list);
+			ns = p->mnt_ns;
+			if (ns) {
+				ns->mounts--;
+				__touch_mnt_namespace(ns);
+			}
+			p->mnt_ns = NULL;
+			if (how & UMOUNT_SYNC)
+				p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
+
+			disconnect = disconnect_mount(p, how);
+
+			pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
+					 disconnect ? &unmounted : NULL);
+			if (mnt_has_parent(p)) {
+				mnt_add_count(p->mnt_parent, -1);
+				if (!disconnect) {
+					/* Don't forget about p */
+					list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
+				} else {
+					umount_mnt(p);
+				}
 			}
+			change_mnt_propagation(p, MS_PRIVATE);
 		}
-		change_mnt_propagation(p, MS_PRIVATE);
 	}
 }
 
-- 
2.9.3


* Re: [PATCH RFC] mnt: umount mounts one by one in umount_tree()
  2017-05-12  7:08 [PATCH RFC] mnt: umount mounts one by one in umount_tree() Andrei Vagin
@ 2017-06-14  1:53     ` Ian Kent
       [not found] ` <20170512070838.5037-1-avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
  1 sibling, 0 replies; 7+ messages in thread
From: Ian Kent @ 2017-06-14  1:53 UTC (permalink / raw)
  To: Andrei Vagin, Eric W . Biederman
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, Linux Containers,
	Alexander Viro, linux-kernel-u79uwXL29TY76Z2rM5mHXA

On Fri, 2017-05-12 at 00:08 -0700, Andrei Vagin wrote:
> With this patch, we no longer try to umount all mounts of a tree at
> once. Instead, we umount them one by one. This gives a significant
> performance improvement in the worst case.

Indeed, umount has been very slow for a while now.
Even a moderately large number of mounts (~10000) becomes painfully slow.

Are you still pursuing this?
Anything I can do to help?

Eric, what are your thoughts on this latest attempt?

> 
> The reason for this optimization is that umount() can hold
> namespace_sem for a long time, and this semaphore is global, so it
> affects all users. Recently Eric W. Biederman added a per-mount-namespace
> limit on the number of mounts. The default limit is 100,000 mounts per
> namespace, which is still enough to construct a tree that takes hours
> to umount.
> 
> In the worst case, the current complexity of umount_tree() is O(n^3):
> * Enumerate all mounts in the target tree (propagate_umount)
> * Enumerate mounts to find where these changes have to
>   be propagated (mark_umount_candidates)
> * Enumerate mounts to find the required mount by parent and dentry
>   (__lookup_mnt). __lookup_mnt() looks a mount up in m_hash, but
>   the number of mounts is much bigger than the size of the hash table.
> 
> The worst case is when all mounts of the tree live in the same shared
> group; then all mounts have to be enumerated at each step.
> 
> CVE-2016-6213 was filed for this case.
> 
> Here are results for the kernel with this patch
> $ for i in `seq 10 15`; do  unshare -m sh ./run.sh $i; done
> umount -l /mnt/1 -> 	0:00.00
> umount -l /mnt/1 -> 	0:00.01
> umount -l /mnt/1 -> 	0:00.01
> umount -l /mnt/1 -> 	0:00.03
> umount -l /mnt/1 -> 	0:00.07
> umount -l /mnt/1 -> 	0:00.14
> 
> Here are results for the kernel without this patch
> $ for i in `seq 10 15`; do  unshare -m sh ./run.sh $i; done
> umount -l /mnt/1 -> 	0:00.04
> umount -l /mnt/1 -> 	0:00.17
> umount -l /mnt/1 -> 	0:00.75
> umount -l /mnt/1 -> 	0:05.96
> umount -l /mnt/1 -> 	0:34.40
> umount -l /mnt/1 -> 	3:46.27
> 
> And here is a test script:
> $ cat run.sh
> set -e -m
> 
> mount -t tmpfs zdtm /mnt
> mkdir -p /mnt/1 /mnt/2
> mount -t tmpfs zdtm /mnt/1
> mount --make-shared /mnt/1
> mkdir /mnt/1/1
> 
> for i in `seq $1`; do
> 	./mount --bind /mnt/1/1 /mnt/1/1
> done
> 
> echo -n "umount -l /mnt/1 -> "
> /usr/bin/time -f '%E' ./umount -l /mnt/1
> 
> We need these simple mount and umount tools because the standard ones
> read /proc/self/mountinfo, which is extremely slow when there are
> thousands of mounts.
> $ cat mount.c
>  #include <sys/mount.h>
>  #include <stdlib.h>
> 
>  int main(int argc, char **argv)
>  {
>  	return mount(argv[2], argv[3], NULL, MS_BIND, NULL);
>  }
> 
> $ cat umount.c
>  #include <sys/mount.h>
> 
>  int main(int argc, char **argv)
>  {
>  	return umount2(argv[2], MNT_DETACH);
>  }
> 
> Here is a previous attempt to optimize this code:
> https://lkml.org/lkml/2016/10/10/495
> 
> Signed-off-by: Andrei Vagin <avagin@openvz.org>
> ---
>  fs/namespace.c | 81 +++++++++++++++++++++++++++++++------------------------
> ---
>  1 file changed, 43 insertions(+), 38 deletions(-)
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 3bf0cd2..4e6f258 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1474,56 +1474,61 @@ static bool disconnect_mount(struct mount *mnt, enum
> umount_tree_flags how)
>   */
>  static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
>  {
> -	LIST_HEAD(tmp_list);
>  	struct mount *p;
> +	int done = 0;
>  
>  	if (how & UMOUNT_PROPAGATE)
>  		propagate_mount_unlock(mnt);
>  
>  	/* Gather the mounts to umount */
> -	for (p = mnt; p; p = next_mnt(p, mnt)) {
> +	while (!done) {
> +		LIST_HEAD(tmp_list);
> +
> +		p = mnt;
> +		while (!list_empty(&p->mnt_mounts))
> +			p = list_entry(p->mnt_mounts.next, struct mount,
> mnt_child);
> +
>  		p->mnt.mnt_flags |= MNT_UMOUNT;
>  		list_move(&p->mnt_list, &tmp_list);
> -	}
> -
> -	/* Hide the mounts from mnt_mounts */
> -	list_for_each_entry(p, &tmp_list, mnt_list) {
>  		list_del_init(&p->mnt_child);
> -	}
>  
> -	/* Add propogated mounts to the tmp_list */
> -	if (how & UMOUNT_PROPAGATE)
> -		propagate_umount(&tmp_list);
> -
> -	while (!list_empty(&tmp_list)) {
> -		struct mnt_namespace *ns;
> -		bool disconnect;
> -		p = list_first_entry(&tmp_list, struct mount, mnt_list);
> -		list_del_init(&p->mnt_expire);
> -		list_del_init(&p->mnt_list);
> -		ns = p->mnt_ns;
> -		if (ns) {
> -			ns->mounts--;
> -			__touch_mnt_namespace(ns);
> -		}
> -		p->mnt_ns = NULL;
> -		if (how & UMOUNT_SYNC)
> -			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
> -
> -		disconnect = disconnect_mount(p, how);
> -
> -		pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
> -				 disconnect ? &unmounted : NULL);
> -		if (mnt_has_parent(p)) {
> -			mnt_add_count(p->mnt_parent, -1);
> -			if (!disconnect) {
> -				/* Don't forget about p */
> -				list_add_tail(&p->mnt_child, &p->mnt_parent-
> >mnt_mounts);
> -			} else {
> -				umount_mnt(p);
> +		/* Add propogated mounts to the tmp_list */
> +		if (how & UMOUNT_PROPAGATE)
> +			propagate_umount(&tmp_list);
> +
> +		if (p == mnt)
> +			done = 1;
> +
> +		while (!list_empty(&tmp_list)) {
> +			struct mnt_namespace *ns;
> +			bool disconnect;
> +			p = list_first_entry(&tmp_list, struct mount,
> mnt_list);
> +			list_del_init(&p->mnt_expire);
> +			list_del_init(&p->mnt_list);
> +			ns = p->mnt_ns;
> +			if (ns) {
> +				ns->mounts--;
> +				__touch_mnt_namespace(ns);
> +			}
> +			p->mnt_ns = NULL;
> +			if (how & UMOUNT_SYNC)
> +				p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
> +
> +			disconnect = disconnect_mount(p, how);
> +
> +			pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
> +					 disconnect ? &unmounted : NULL);
> +			if (mnt_has_parent(p)) {
> +				mnt_add_count(p->mnt_parent, -1);
> +				if (!disconnect) {
> +					/* Don't forget about p */
> +					list_add_tail(&p->mnt_child, &p-
> >mnt_parent->mnt_mounts);
> +				} else {
> +					umount_mnt(p);
> +				}
>  			}
> +			change_mnt_propagation(p, MS_PRIVATE);
>  		}
> -		change_mnt_propagation(p, MS_PRIVATE);
>  	}
>  }
>  


* Re: [PATCH RFC] mnt: umount mounts one by one in umount_tree()
@ 2017-06-14  1:53     ` Ian Kent
  0 siblings, 0 replies; 7+ messages in thread
From: Ian Kent @ 2017-06-14  1:53 UTC (permalink / raw)
  To: Andrei Vagin, Eric W . Biederman
  Cc: Alexander Viro, linux-fsdevel, linux-kernel, Linux Containers

On Fri, 2017-05-12 at 00:08 -0700, Andrei Vagin wrote:
> With this patch, we no longer try to umount all mounts of a tree at
> once. Instead, we umount them one by one. This gives a significant
> performance improvement in the worst case.

Indeed, umount has been very slow for a while now.
Even a moderately large number of mounts (~10000) becomes painfully slow.

Are you still pursuing this?
Anything I can do to help?

Eric, what are your thoughts on this latest attempt?

> 
> The reason for this optimization is that umount() can hold
> namespace_sem for a long time, and this semaphore is global, so it
> affects all users. Recently Eric W. Biederman added a per-mount-namespace
> limit on the number of mounts. The default limit is 100,000 mounts per
> namespace, which is still enough to construct a tree that takes hours
> to umount.
> 
> In the worst case, the current complexity of umount_tree() is O(n^3):
> * Enumerate all mounts in the target tree (propagate_umount)
> * Enumerate mounts to find where these changes have to
>   be propagated (mark_umount_candidates)
> * Enumerate mounts to find the required mount by parent and dentry
>   (__lookup_mnt). __lookup_mnt() looks a mount up in m_hash, but
>   the number of mounts is much bigger than the size of the hash table.
> 
> The worst case is when all mounts of the tree live in the same shared
> group; then all mounts have to be enumerated at each step.
> 
> CVE-2016-6213 was filed for this case.
> 
> Here are results for the kernel with this patch
> $ for i in `seq 10 15`; do  unshare -m sh ./run.sh $i; done
> umount -l /mnt/1 -> 	0:00.00
> umount -l /mnt/1 -> 	0:00.01
> umount -l /mnt/1 -> 	0:00.01
> umount -l /mnt/1 -> 	0:00.03
> umount -l /mnt/1 -> 	0:00.07
> umount -l /mnt/1 -> 	0:00.14
> 
> Here are results for the kernel without this patch
> $ for i in `seq 10 15`; do  unshare -m sh ./run.sh $i; done
> umount -l /mnt/1 -> 	0:00.04
> umount -l /mnt/1 -> 	0:00.17
> umount -l /mnt/1 -> 	0:00.75
> umount -l /mnt/1 -> 	0:05.96
> umount -l /mnt/1 -> 	0:34.40
> umount -l /mnt/1 -> 	3:46.27
> 
> And here is a test script:
> $ cat run.sh
> set -e -m
> 
> mount -t tmpfs zdtm /mnt
> mkdir -p /mnt/1 /mnt/2
> mount -t tmpfs zdtm /mnt/1
> mount --make-shared /mnt/1
> mkdir /mnt/1/1
> 
> for i in `seq $1`; do
> 	./mount --bind /mnt/1/1 /mnt/1/1
> done
> 
> echo -n "umount -l /mnt/1 -> "
> /usr/bin/time -f '%E' ./umount -l /mnt/1
> 
> We need these simple mount and umount tools because the standard ones
> read /proc/self/mountinfo, which is extremely slow when there are
> thousands of mounts.
> $ cat mount.c
>  #include <sys/mount.h>
>  #include <stdlib.h>
> 
>  int main(int argc, char **argv)
>  {
>  	return mount(argv[2], argv[3], NULL, MS_BIND, NULL);
>  }
> 
> $ cat umount.c
>  #include <sys/mount.h>
> 
>  int main(int argc, char **argv)
>  {
>  	return umount2(argv[2], MNT_DETACH);
>  }
> 
> Here is a previous attempt to optimize this code:
> https://lkml.org/lkml/2016/10/10/495
> 
> Signed-off-by: Andrei Vagin <avagin@openvz.org>
> ---
>  fs/namespace.c | 81 +++++++++++++++++++++++++++++++------------------------
> ---
>  1 file changed, 43 insertions(+), 38 deletions(-)
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 3bf0cd2..4e6f258 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1474,56 +1474,61 @@ static bool disconnect_mount(struct mount *mnt, enum
> umount_tree_flags how)
>   */
>  static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
>  {
> -	LIST_HEAD(tmp_list);
>  	struct mount *p;
> +	int done = 0;
>  
>  	if (how & UMOUNT_PROPAGATE)
>  		propagate_mount_unlock(mnt);
>  
>  	/* Gather the mounts to umount */
> -	for (p = mnt; p; p = next_mnt(p, mnt)) {
> +	while (!done) {
> +		LIST_HEAD(tmp_list);
> +
> +		p = mnt;
> +		while (!list_empty(&p->mnt_mounts))
> +			p = list_entry(p->mnt_mounts.next, struct mount,
> mnt_child);
> +
>  		p->mnt.mnt_flags |= MNT_UMOUNT;
>  		list_move(&p->mnt_list, &tmp_list);
> -	}
> -
> -	/* Hide the mounts from mnt_mounts */
> -	list_for_each_entry(p, &tmp_list, mnt_list) {
>  		list_del_init(&p->mnt_child);
> -	}
>  
> -	/* Add propogated mounts to the tmp_list */
> -	if (how & UMOUNT_PROPAGATE)
> -		propagate_umount(&tmp_list);
> -
> -	while (!list_empty(&tmp_list)) {
> -		struct mnt_namespace *ns;
> -		bool disconnect;
> -		p = list_first_entry(&tmp_list, struct mount, mnt_list);
> -		list_del_init(&p->mnt_expire);
> -		list_del_init(&p->mnt_list);
> -		ns = p->mnt_ns;
> -		if (ns) {
> -			ns->mounts--;
> -			__touch_mnt_namespace(ns);
> -		}
> -		p->mnt_ns = NULL;
> -		if (how & UMOUNT_SYNC)
> -			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
> -
> -		disconnect = disconnect_mount(p, how);
> -
> -		pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
> -				 disconnect ? &unmounted : NULL);
> -		if (mnt_has_parent(p)) {
> -			mnt_add_count(p->mnt_parent, -1);
> -			if (!disconnect) {
> -				/* Don't forget about p */
> -				list_add_tail(&p->mnt_child, &p->mnt_parent-
> >mnt_mounts);
> -			} else {
> -				umount_mnt(p);
> +		/* Add propogated mounts to the tmp_list */
> +		if (how & UMOUNT_PROPAGATE)
> +			propagate_umount(&tmp_list);
> +
> +		if (p == mnt)
> +			done = 1;
> +
> +		while (!list_empty(&tmp_list)) {
> +			struct mnt_namespace *ns;
> +			bool disconnect;
> +			p = list_first_entry(&tmp_list, struct mount,
> mnt_list);
> +			list_del_init(&p->mnt_expire);
> +			list_del_init(&p->mnt_list);
> +			ns = p->mnt_ns;
> +			if (ns) {
> +				ns->mounts--;
> +				__touch_mnt_namespace(ns);
> +			}
> +			p->mnt_ns = NULL;
> +			if (how & UMOUNT_SYNC)
> +				p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
> +
> +			disconnect = disconnect_mount(p, how);
> +
> +			pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
> +					 disconnect ? &unmounted : NULL);
> +			if (mnt_has_parent(p)) {
> +				mnt_add_count(p->mnt_parent, -1);
> +				if (!disconnect) {
> +					/* Don't forget about p */
> +					list_add_tail(&p->mnt_child, &p-
> >mnt_parent->mnt_mounts);
> +				} else {
> +					umount_mnt(p);
> +				}
>  			}
> +			change_mnt_propagation(p, MS_PRIVATE);
>  		}
> -		change_mnt_propagation(p, MS_PRIVATE);
>  	}
>  }
>  


* Re: [PATCH RFC] mnt: umount mounts one by one in umount_tree()
  2017-06-14  1:53     ` Ian Kent
@ 2017-06-14  9:37         ` Eric W. Biederman
  -1 siblings, 0 replies; 7+ messages in thread
From: Eric W. Biederman @ 2017-06-14  9:37 UTC (permalink / raw)
  To: Ian Kent
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux Containers,
	Andrei Vagin, Alexander Viro

Ian Kent <raven@themaw.net> writes:

> On Fri, 2017-05-12 at 00:08 -0700, Andrei Vagin wrote:
>> With this patch, we no longer try to umount all mounts of a tree at
>> once. Instead, we umount them one by one. This gives a significant
>> performance improvement in the worst case.
>
> Indeed, umount has been very slow for a while now.
> Even a moderately large number of mounts (~10000) becomes painfully slow.
>
> Are you still pursuing this?
> Anything I can do to help?
>
> Eric, what are your thoughts on this latest attempt?

I have something slightly more recent.  Please check out the for-next
branch of my userns tree:

git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git for-next
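
For anyone who wants to test it, one way to fetch that branch into an
existing kernel checkout might be (remote and local branch names are
arbitrary; URL and branch as above):

git remote add ebiederm \
    git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git
git fetch ebiederm
git checkout -b userns-for-next ebiederm/for-next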

There is one open area of semantics that I am looking at with Ram Pai
in the hope that we can reach consensus before we take any patches for
better checkpoint-restart support.

Eric



* Re: [PATCH RFC] mnt: umount mounts one by one in umount_tree()
@ 2017-06-14  9:37         ` Eric W. Biederman
  0 siblings, 0 replies; 7+ messages in thread
From: Eric W. Biederman @ 2017-06-14  9:37 UTC (permalink / raw)
  To: Ian Kent
  Cc: Andrei Vagin, Alexander Viro, linux-fsdevel, linux-kernel,
	Linux Containers

Ian Kent <raven@themaw.net> writes:

> On Fri, 2017-05-12 at 00:08 -0700, Andrei Vagin wrote:
>> With this patch, we no longer try to umount all mounts of a tree at
>> once. Instead, we umount them one by one. This gives a significant
>> performance improvement in the worst case.
>
> Indeed, umount has been very slow for a while now.
> Even a moderately large number of mounts (~10000) becomes painfully slow.
>
> Are you still pursuing this?
> Anything I can do to help?
>
> Eric, what are your thoughts on this latest attempt?

I have something slightly more recent.  Please check out the for-next
branch of my userns tree:

git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git for-next

There is one open area of semantics that I am looking at with Ram Pai
in the hope that we can reach consensus before we take any patches for
better checkpoint-restart support.

Eric


end of thread, other threads:[~2017-06-14  9:44 UTC | newest]

Thread overview: 7+ messages
-- links below jump to the message on this page --
2017-05-12  7:08 [PATCH RFC] mnt: umount mounts one by one in umount_tree() Andrei Vagin
2017-05-12 18:56 ` [PATCH v2 " Andrei Vagin
     [not found] ` <20170512070838.5037-1-avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
2017-05-12 18:56   ` Andrei Vagin
2017-06-14  1:53   ` [PATCH " Ian Kent
2017-06-14  1:53     ` Ian Kent
     [not found]     ` <1497405190.2595.3.camel-PKsaG3nR2I+sTnJN9+BGXg@public.gmane.org>
2017-06-14  9:37       ` Eric W. Biederman
2017-06-14  9:37         ` Eric W. Biederman
