linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] [RFC] mnt: restrict a number of "struct mnt"
@ 2013-06-17  8:24 Andrey Vagin
  2013-06-17 19:58 ` Eric W. Biederman
  0 siblings, 1 reply; 7+ messages in thread
From: Andrey Vagin @ 2013-06-17  8:24 UTC (permalink / raw)
  To: Alexander Viro
  Cc: linux-fsdevel, linux-kernel, Andrey Vagin, Eric W. Biederman,
	Serge E. Hallyn, Andrew Morton, Ingo Molnar, Kees Cook,
	Mel Gorman, Rik van Riel

I found that a few processes can eat all host memory and nobody can kill them.
$ mount -t tmpfs xxx /mnt
$ mount --make-shared /mnt
$ for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XXXXXX` & done

All these processes are unkillable, because they take i_mutex and wait on
namespace_lock.

...
21715 pts/0    D      0:00          \_ mount --bind /mnt /mnt/test.ht6jzO
21716 pts/0    D      0:00          \_ mount --bind /mnt /mnt/test.97K4mI
21717 pts/0    R      0:01          \_ mount --bind /mnt /mnt/test.gO2CD9
...

Each of these processes doubles the number of mounts, so at the end we will
have about 2^32 mounts, and since the size of struct mnt is 256 bytes, we
need about 1TB of RAM.

Another problem is that “umount” of a big tree is a very expensive operation
and requires a lot of time.
E.g.:
16411
umount("/tmp/xxx", MNT_DETACH)          = 0 <7.852066> (7.8 sec)
32795
umount("/tmp/xxx", MNT_DETACH)          = 0 <34.485501> ( 34 sec)

For all this time sys_umount takes namespace_sem and vfsmount_lock...

For all these reasons I suggest restricting the number of mounts.
Probably we can optimize this code in the future, but for now this
restriction can help.

Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "Serge E. Hallyn" <serge@hallyn.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrey Vagin <avagin@openvz.org>
---
 fs/namespace.c                | 66 +++++++++++++++++++++++++------------------
 include/linux/mnt_namespace.h |  2 ++
 kernel/sysctl.c               |  8 ++++++
 3 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 7b1ca9b..d22e54c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -41,6 +41,9 @@ static struct list_head *mountpoint_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
 static struct rw_semaphore namespace_sem;
 
+unsigned int sysctl_mount_nr __read_mostly = 16384;
+static atomic_t mount_nr = ATOMIC_INIT(0);
+
 /* /sys/fs */
 struct kobject *fs_kobj;
 EXPORT_SYMBOL_GPL(fs_kobj);
@@ -164,43 +167,49 @@ unsigned int mnt_get_count(struct mount *mnt)
 
 static struct mount *alloc_vfsmnt(const char *name)
 {
-	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
-	if (mnt) {
-		int err;
+	struct mount *mnt;
+	int err;
 
-		err = mnt_alloc_id(mnt);
-		if (err)
-			goto out_free_cache;
+	if (atomic_inc_return(&mount_nr) > sysctl_mount_nr)
+		goto out_dec_mount_nr;
 
-		if (name) {
-			mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
-			if (!mnt->mnt_devname)
-				goto out_free_id;
-		}
+	mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
+	if (!mnt)
+		goto out_dec_mount_nr;
+
+	err = mnt_alloc_id(mnt);
+	if (err)
+		goto out_free_cache;
+
+	if (name) {
+		mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
+		if (!mnt->mnt_devname)
+			goto out_free_id;
+	}
 
 #ifdef CONFIG_SMP
-		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
-		if (!mnt->mnt_pcp)
-			goto out_free_devname;
+	mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
+	if (!mnt->mnt_pcp)
+		goto out_free_devname;
 
-		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
+	this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
 #else
-		mnt->mnt_count = 1;
-		mnt->mnt_writers = 0;
+	mnt->mnt_count = 1;
+	mnt->mnt_writers = 0;
 #endif
 
-		INIT_LIST_HEAD(&mnt->mnt_hash);
-		INIT_LIST_HEAD(&mnt->mnt_child);
-		INIT_LIST_HEAD(&mnt->mnt_mounts);
-		INIT_LIST_HEAD(&mnt->mnt_list);
-		INIT_LIST_HEAD(&mnt->mnt_expire);
-		INIT_LIST_HEAD(&mnt->mnt_share);
-		INIT_LIST_HEAD(&mnt->mnt_slave_list);
-		INIT_LIST_HEAD(&mnt->mnt_slave);
+	INIT_LIST_HEAD(&mnt->mnt_hash);
+	INIT_LIST_HEAD(&mnt->mnt_child);
+	INIT_LIST_HEAD(&mnt->mnt_mounts);
+	INIT_LIST_HEAD(&mnt->mnt_list);
+	INIT_LIST_HEAD(&mnt->mnt_expire);
+	INIT_LIST_HEAD(&mnt->mnt_share);
+	INIT_LIST_HEAD(&mnt->mnt_slave_list);
+	INIT_LIST_HEAD(&mnt->mnt_slave);
 #ifdef CONFIG_FSNOTIFY
-		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
+	INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
 #endif
-	}
+
 	return mnt;
 
 #ifdef CONFIG_SMP
@@ -211,6 +220,8 @@ out_free_id:
 	mnt_free_id(mnt);
 out_free_cache:
 	kmem_cache_free(mnt_cache, mnt);
+out_dec_mount_nr:
+	atomic_dec(&mount_nr);
 	return NULL;
 }
 
@@ -546,6 +557,7 @@ static void free_vfsmnt(struct mount *mnt)
 #ifdef CONFIG_SMP
 	free_percpu(mnt->mnt_pcp);
 #endif
+	atomic_dec(&mount_nr);
 	kmem_cache_free(mnt_cache, mnt);
 }
 
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 12b2ab5..d8e5ec9 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -2,6 +2,8 @@
 #define _NAMESPACE_H_
 #ifdef __KERNEL__
 
+extern unsigned int sysctl_mount_nr;
+
 struct mnt_namespace;
 struct fs_struct;
 struct user_namespace;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9edcf45..bebfdd7 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -61,6 +61,7 @@
 #include <linux/kmod.h>
 #include <linux/capability.h>
 #include <linux/binfmts.h>
+#include <linux/mnt_namespace.h>
 #include <linux/sched/sysctl.h>
 
 #include <asm/uaccess.h>
@@ -1616,6 +1617,13 @@ static struct ctl_table fs_table[] = {
 		.proc_handler	= &pipe_proc_fn,
 		.extra1		= &pipe_min_size,
 	},
+	{
+		.procname	= "mount-nr",
+		.data		= &sysctl_mount_nr,
+		.maxlen		= sizeof(sysctl_mount_nr),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{ }
 };
 
-- 
1.8.1.4


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] [RFC] mnt: restrict a number of "struct mnt"
  2013-06-17  8:24 [PATCH] [RFC] mnt: restrict a number of "struct mnt" Andrey Vagin
@ 2013-06-17 19:58 ` Eric W. Biederman
  2013-06-17 22:56   ` Andrew Morton
  2013-06-17 22:56   ` Andrey Wagin
  0 siblings, 2 replies; 7+ messages in thread
From: Eric W. Biederman @ 2013-06-17 19:58 UTC (permalink / raw)
  To: Andrey Vagin
  Cc: Alexander Viro, linux-fsdevel, linux-kernel, Serge E. Hallyn,
	Andrew Morton, Ingo Molnar, Kees Cook, Mel Gorman, Rik van Riel

Andrey Vagin <avagin@openvz.org> writes:

> I found that a few processes can eat all host memory and nobody can kill them.
> $ mount -t tmpfs xxx /mnt
> $ mount --make-shared /mnt
> $ for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XXXXXX` & done
>
> All this processes are unkillable, because they took i_mutex and waits
> namespace_lock.
>
> ...
> 21715 pts/0    D      0:00          \_ mount --bind /mnt /mnt/test.ht6jzO
> 21716 pts/0    D      0:00          \_ mount --bind /mnt /mnt/test.97K4mI
> 21717 pts/0    R      0:01          \_ mount --bind /mnt /mnt/test.gO2CD9
> ...
>
> Each of this process doubles a number of mounts, so at the end we will
> have about 2^32 mounts and the size of struct mnt is 256 bytes, so we
> need about 1TB of RAM.
>
> Another problem is that “umount” of a big tree is very hard operation
> and it requires a lot of time.
> E.g.:
> 16411
> umount("/tmp/xxx", MNT_DETACH)          = 0 <7.852066> (7.8 sec)
> 32795
> umount("/tmp/xxx", MNT_DETACH)          = 0 <34.485501> ( 34 sec)
>
> For all this time sys_umount takes namespace_sem and vfsmount_lock...
>
> Due to all this reasons I suggest to restrict a number of mounts.
> Probably we can optimize this code in a future, but now this restriction
> can help.

So for anyone seriously worried about this kind of thing in general we
already have the memory control group, which is quite capable of
limiting this kind of thing, and it limits all memory allocations not
just mount.

Is there some reason we want to go down the path of adding and tuning
static limits all over the kernel?  As opposed to streamlining the memory
control group so it is low overhead and everyone that cares can use it?

Should we reach the point where we automatically enable the memory
control group to prevent abuse of the kernel?

Eric

> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> Cc: "Eric W. Biederman" <ebiederm@xmission.com>
> Cc: "Serge E. Hallyn" <serge@hallyn.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Rik van Riel <riel@redhat.com>
> Signed-off-by: Andrey Vagin <avagin@openvz.org>
> ---
>  fs/namespace.c                | 66 +++++++++++++++++++++++++------------------
>  include/linux/mnt_namespace.h |  2 ++
>  kernel/sysctl.c               |  8 ++++++
>  3 files changed, 49 insertions(+), 27 deletions(-)
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 7b1ca9b..d22e54c 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -41,6 +41,9 @@ static struct list_head *mountpoint_hashtable __read_mostly;
>  static struct kmem_cache *mnt_cache __read_mostly;
>  static struct rw_semaphore namespace_sem;
>  
> +unsigned int sysctl_mount_nr __read_mostly = 16384;
> +static atomic_t mount_nr = ATOMIC_INIT(0);
> +
>  /* /sys/fs */
>  struct kobject *fs_kobj;
>  EXPORT_SYMBOL_GPL(fs_kobj);
> @@ -164,43 +167,49 @@ unsigned int mnt_get_count(struct mount *mnt)
>  
>  static struct mount *alloc_vfsmnt(const char *name)
>  {
> -	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
> -	if (mnt) {
> -		int err;
> +	struct mount *mnt;
> +	int err;
>  
> -		err = mnt_alloc_id(mnt);
> -		if (err)
> -			goto out_free_cache;
> +	if (atomic_inc_return(&mount_nr) > sysctl_mount_nr)
> +		goto out_dec_mount_nr;
>  
> -		if (name) {
> -			mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
> -			if (!mnt->mnt_devname)
> -				goto out_free_id;
> -		}
> +	mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
> +	if (!mnt)
> +		goto out_dec_mount_nr;
> +
> +	err = mnt_alloc_id(mnt);
> +	if (err)
> +		goto out_free_cache;
> +
> +	if (name) {
> +		mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
> +		if (!mnt->mnt_devname)
> +			goto out_free_id;
> +	}
>  
>  #ifdef CONFIG_SMP
> -		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
> -		if (!mnt->mnt_pcp)
> -			goto out_free_devname;
> +	mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
> +	if (!mnt->mnt_pcp)
> +		goto out_free_devname;
>  
> -		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
> +	this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
>  #else
> -		mnt->mnt_count = 1;
> -		mnt->mnt_writers = 0;
> +	mnt->mnt_count = 1;
> +	mnt->mnt_writers = 0;
>  #endif
>  
> -		INIT_LIST_HEAD(&mnt->mnt_hash);
> -		INIT_LIST_HEAD(&mnt->mnt_child);
> -		INIT_LIST_HEAD(&mnt->mnt_mounts);
> -		INIT_LIST_HEAD(&mnt->mnt_list);
> -		INIT_LIST_HEAD(&mnt->mnt_expire);
> -		INIT_LIST_HEAD(&mnt->mnt_share);
> -		INIT_LIST_HEAD(&mnt->mnt_slave_list);
> -		INIT_LIST_HEAD(&mnt->mnt_slave);
> +	INIT_LIST_HEAD(&mnt->mnt_hash);
> +	INIT_LIST_HEAD(&mnt->mnt_child);
> +	INIT_LIST_HEAD(&mnt->mnt_mounts);
> +	INIT_LIST_HEAD(&mnt->mnt_list);
> +	INIT_LIST_HEAD(&mnt->mnt_expire);
> +	INIT_LIST_HEAD(&mnt->mnt_share);
> +	INIT_LIST_HEAD(&mnt->mnt_slave_list);
> +	INIT_LIST_HEAD(&mnt->mnt_slave);
>  #ifdef CONFIG_FSNOTIFY
> -		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
> +	INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
>  #endif
> -	}
> +
>  	return mnt;
>  
>  #ifdef CONFIG_SMP
> @@ -211,6 +220,8 @@ out_free_id:
>  	mnt_free_id(mnt);
>  out_free_cache:
>  	kmem_cache_free(mnt_cache, mnt);
> +out_dec_mount_nr:
> +	atomic_dec(&mount_nr);
>  	return NULL;
>  }
>  
> @@ -546,6 +557,7 @@ static void free_vfsmnt(struct mount *mnt)
>  #ifdef CONFIG_SMP
>  	free_percpu(mnt->mnt_pcp);
>  #endif
> +	atomic_dec(&mount_nr);
>  	kmem_cache_free(mnt_cache, mnt);
>  }
>  
> diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
> index 12b2ab5..d8e5ec9 100644
> --- a/include/linux/mnt_namespace.h
> +++ b/include/linux/mnt_namespace.h
> @@ -2,6 +2,8 @@
>  #define _NAMESPACE_H_
>  #ifdef __KERNEL__
>  
> +extern unsigned int sysctl_mount_nr;
> +
>  struct mnt_namespace;
>  struct fs_struct;
>  struct user_namespace;
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 9edcf45..bebfdd7 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -61,6 +61,7 @@
>  #include <linux/kmod.h>
>  #include <linux/capability.h>
>  #include <linux/binfmts.h>
> +#include <linux/mnt_namespace.h>
>  #include <linux/sched/sysctl.h>
>  
>  #include <asm/uaccess.h>
> @@ -1616,6 +1617,13 @@ static struct ctl_table fs_table[] = {
>  		.proc_handler	= &pipe_proc_fn,
>  		.extra1		= &pipe_min_size,
>  	},
> +	{
> +		.procname	= "mount-nr",
> +		.data		= &sysctl_mount_nr,
> +		.maxlen		= sizeof(sysctl_mount_nr),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
>  	{ }
>  };

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] [RFC] mnt: restrict a number of "struct mnt"
  2013-06-17 19:58 ` Eric W. Biederman
@ 2013-06-17 22:56   ` Andrew Morton
  2013-06-18  6:09     ` Andrew Vagin
  2013-06-17 22:56   ` Andrey Wagin
  1 sibling, 1 reply; 7+ messages in thread
From: Andrew Morton @ 2013-06-17 22:56 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andrey Vagin, Alexander Viro, linux-fsdevel, linux-kernel,
	Serge E. Hallyn, Ingo Molnar, Kees Cook, Mel Gorman,
	Rik van Riel

On Mon, 17 Jun 2013 12:58:00 -0700 ebiederm@xmission.com (Eric W. Biederman) wrote:

> > I found that a few processes can eat all host memory and nobody can kill them.
> > $ mount -t tmpfs xxx /mnt
> > $ mount --make-shared /mnt
> > $ for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XXXXXX` & done
> >
> > All this processes are unkillable, because they took i_mutex and waits
> > namespace_lock.
> >
> > ...
> > 21715 pts/0 ______D __________0:00 __________________\_ mount --bind /mnt /mnt/test.ht6jzO
> > 21716 pts/0 ______D __________0:00 __________________\_ mount --bind /mnt /mnt/test.97K4mI
> > 21717 pts/0 ______R __________0:01 __________________\_ mount --bind /mnt /mnt/test.gO2CD9
> > ...
> >
> > Each of this process doubles a number of mounts, so at the end we will
> > have about 2^32 mounts and the size of struct mnt is 256 bytes, so we
> > need about 1TB of RAM.
> >
> > Another problem is that ___umount___ of a big tree is very hard operation
> > and it requires a lot of time.
> > E.g.:
> > 16411
> > umount("/tmp/xxx", MNT_DETACH) __________________= 0 <7.852066> (7.8 sec)
> > 32795
> > umount("/tmp/xxx", MNT_DETACH) __________________= 0 <34.485501> ( 34 sec)
> >
> > For all this time sys_umoun takes namespace_sem and vfsmount_lock...
> >
> > Due to all this reasons I suggest to restrict a number of mounts.
> > Probably we can optimize this code in a future, but now this restriction
> > can help.
> 
> So for anyone seriously worried about this kind of thing in general we
> already have the memory control group, which is quite capable of
> limiting this kind of thing, and it limits all memory allocations not
> just mount.

What is the exposure here?  By what means can a non-CAP_SYS_ADMIN user
run sys_mount() under the namespace system?

IOW, what does the sysadmin have to do to permit this?  Is that a
typical thing to do, or did the sysadmin make a mistake?

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] [RFC] mnt: restrict a number of "struct mnt"
  2013-06-17 19:58 ` Eric W. Biederman
  2013-06-17 22:56   ` Andrew Morton
@ 2013-06-17 22:56   ` Andrey Wagin
  2013-06-19 21:35     ` Andrey Wagin
  1 sibling, 1 reply; 7+ messages in thread
From: Andrey Wagin @ 2013-06-17 22:56 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Alexander Viro, linux-fsdevel, LKML, Serge E. Hallyn,
	Andrew Morton, Ingo Molnar, Kees Cook, Mel Gorman, Rik van Riel

2013/6/17 Eric W. Biederman <ebiederm@xmission.com>:
> Andrey Vagin <avagin@openvz.org> writes:
>
>> I found that a few processes can eat all host memory and nobody can kill them.
>> $ mount -t tmpfs xxx /mnt
>> $ mount --make-shared /mnt
>> $ for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XXXXXX` & done
>>
>> All this processes are unkillable, because they took i_mutex and waits
>> namespace_lock.
>>
>> ...
>> 21715 pts/0    D      0:00          \_ mount --bind /mnt /mnt/test.ht6jzO
>> 21716 pts/0    D      0:00          \_ mount --bind /mnt /mnt/test.97K4mI
>> 21717 pts/0    R      0:01          \_ mount --bind /mnt /mnt/test.gO2CD9
>> ...
>>
>> Each of this process doubles a number of mounts, so at the end we will
>> have about 2^32 mounts and the size of struct mnt is 256 bytes, so we
>> need about 1TB of RAM.
>>
>> Another problem is that “umount” of a big tree is very hard operation
>> and it requires a lot of time.
>> E.g.:
>> 16411
>> umount("/tmp/xxx", MNT_DETACH)          = 0 <7.852066> (7.8 sec)
>> 32795
>> umount("/tmp/xxx", MNT_DETACH)          = 0 <34.485501> ( 34 sec)
>>
>> For all this time sys_umount takes namespace_sem and vfsmount_lock...
>>
>> Due to all this reasons I suggest to restrict a number of mounts.
>> Probably we can optimize this code in a future, but now this restriction
>> can help.
>
> So for anyone seriously worried about this kind of thing in general we
> already have the memory control group, which is quite capable of
> limiting this kind of thing,

> and it limits all memory allocations not just mount.

And that is the problem: we can't limit a particular slab. Let's
imagine a real container with 4GB of RAM. What kernel memory
limit is reasonable for it? I set up 64 MB (it may not be enough for a
real CT, but it's enough to make the host inaccessible for some minutes).

$ mkdir /sys/fs/cgroup/memory/test
$ echo $((64 << 20)) > /sys/fs/cgroup/memory/test/memory.kmem.limit_in_bytes
$ unshare -m
$ echo $$ > /sys/fs/cgroup/memory/test/tasks
$ mount --make-rprivate /
$ mount -t tmpfs xxx /mnt
$ mount --make-shared /mnt
$ time bash -c 'set -m; for i in `seq 30`; do mount --bind /mnt
`mktemp -d /mnt/test.XXXXXX` & done;  for i in `seq 30`; do wait;
done'
real 0m23.141s
user 0m0.016s
sys 0m22.881s

While the last script is working, nobody can read /proc/mounts or
mount anything. I don't think users from other containers will
be glad. This problem is not so significant compared with umounting
this tree.

$ strace -T umount -l /mnt
umount("/mnt", MNT_DETACH)              = 0 <548.898244>
The host is inaccessible; it writes messages about soft lockups to the
kernel log and eats 100% CPU.


>
> Is there some reason we want to go down the path of adding and tuning
> static limits all over the kernel?  As opposed to streamlining the memory
> control group so it is low overhead and everyone that cares can use it?

The memory control group doesn't help in this case... I need to look
at this code in more detail; maybe we can limit the depth of nested
mount points.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] [RFC] mnt: restrict a number of "struct mnt"
  2013-06-17 22:56   ` Andrew Morton
@ 2013-06-18  6:09     ` Andrew Vagin
  0 siblings, 0 replies; 7+ messages in thread
From: Andrew Vagin @ 2013-06-18  6:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Eric W. Biederman, Andrey Vagin, Alexander Viro, linux-fsdevel,
	linux-kernel, Serge E. Hallyn, Ingo Molnar, Kees Cook,
	Mel Gorman, Rik van Riel

On Mon, Jun 17, 2013 at 03:56:14PM -0700, Andrew Morton wrote:
> On Mon, 17 Jun 2013 12:58:00 -0700 ebiederm@xmission.com (Eric W. Biederman) wrote:
> 
> > > I found that a few processes can eat all host memory and nobody can kill them.
> > > $ mount -t tmpfs xxx /mnt
> > > $ mount --make-shared /mnt
> > > $ for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XXXXXX` & done
> > >
> > > All this processes are unkillable, because they took i_mutex and waits
> > > namespace_lock.
> > >
> > > ...
> > > 21715 pts/0 ______D __________0:00 __________________\_ mount --bind /mnt /mnt/test.ht6jzO
> > > 21716 pts/0 ______D __________0:00 __________________\_ mount --bind /mnt /mnt/test.97K4mI
> > > 21717 pts/0 ______R __________0:01 __________________\_ mount --bind /mnt /mnt/test.gO2CD9
> > > ...
> > >
> > > Each of this process doubles a number of mounts, so at the end we will
> > > have about 2^32 mounts and the size of struct mnt is 256 bytes, so we
> > > need about 1TB of RAM.
> > >
> > > Another problem is that ___umount___ of a big tree is very hard operation
> > > and it requires a lot of time.
> > > E.g.:
> > > 16411
> > > umount("/tmp/xxx", MNT_DETACH) __________________= 0 <7.852066> (7.8 sec)
> > > 32795
> > > umount("/tmp/xxx", MNT_DETACH) __________________= 0 <34.485501> ( 34 sec)
> > >
> > > For all this time sys_umount takes namespace_sem and vfsmount_lock...
> > >
> > > Due to all this reasons I suggest to restrict a number of mounts.
> > > Probably we can optimize this code in a future, but now this restriction
> > > can help.
> > 
> > So for anyone seriously worried about this kind of thing in general we
> > already have the memory control group, which is quite capable of
> > limiting this kind of thing, and it limits all memory allocations not
> > just mount.
> 
> What is the exposure here?  By what means can a non-CAP_SYS_ADMIN user
> run sys_mount() under the namespace system?
> 
> IOW, what does the sysadmin have to do to permit this?  Is that a
> typical thing to do, or did the sysadmin make a mistake?

It's a problem for Linux containers, because usually the root user in a
container should have enough rights to mount something (tmpfs,
bind mounts, etc.). Our goal is to make containers completely isolated.

A container is isolated with help of namespaces. The user namespace
creates a new sets of capabilities and users.


> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] [RFC] mnt: restrict a number of "struct mnt"
  2013-06-17 22:56   ` Andrey Wagin
@ 2013-06-19 21:35     ` Andrey Wagin
  2013-06-21  1:04       ` Eric W. Biederman
  0 siblings, 1 reply; 7+ messages in thread
From: Andrey Wagin @ 2013-06-19 21:35 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Alexander Viro, linux-fsdevel, LKML, Serge E. Hallyn,
	Andrew Morton, Ingo Molnar, Kees Cook, Mel Gorman, Rik van Riel

On Tue, Jun 18, 2013 at 02:56:51AM +0400, Andrey Wagin wrote:
> 2013/6/17 Eric W. Biederman <ebiederm@xmission.com>:
> > So for anyone seriously worried about this kind of thing in general we
> > already have the memory control group, which is quite capable of
> > limiting this kind of thing,
> 
> > and it limits all memory allocations not just mount.
> 
> And that is problem, we can't to limit a particular slab. Let's
> imagine a real container with 4Gb of RAM. What is a kernel memory
> limit resonable for it? I setup 64 Mb (it may be not enough for real
> CT, but it's enough to make host inaccessible for some minutes).
> 
> $ mkdir /sys/fs/cgroup/memory/test
> $ echo $((64 << 20)) > /sys/fs/cgroup/memory/test/memory.kmem.limit_in_bytes
> $ unshare -m
> $ echo $$ > /sys/fs/cgroup/memory/test/tasks
> $ mount --make-rprivate /
> $ mount -t tmpfs xxx /mnt
> $ mount --make-shared /mnt
> $ time bash -c 'set -m; for i in `seq 30`; do mount --bind /mnt
> `mktemp -d /mnt/test.XXXXXX` & done;  for i in `seq 30`; do wait;
> done'
> real 0m23.141s
> user 0m0.016s
> sys 0m22.881s
> 
> While the last script is working, nobody can't to read /proc/mounts or
> mount something. I don't think that users from other containers will
> be glad. This problem is not so significant in compared with umounting
> of this tree.
> 
> $ strace -T umount -l /mnt
> umount("/mnt", MNT_DETACH)              = 0 <548.898244>
> The host is inaccessible, it writes messages about soft lockup in
> kernel log and eats 100% cpu.

Eric, do you agree that
* It is a problem
* Currently we don't have a mechanism to prevent this problem
* We need to find a way to prevent this problem

> 
> 
> >
> > Is there some reason we want to go down the path of adding and tuning
> > static limits all over the kernel?  As opposed to streamlining the memory
> > control group so it is low overhead and everyone that cares can use it?
> 
> The memory control group doesn't help in this case... I need to look
> at this code in more details, maybe we can limit a depth of nested
> mount points.

The complexity of the umount algorithm does not depend on the depth of
nested mounts; it depends on the number of mounts, and sometimes the
complexity is O(n^2).

For example:

	mount -t tmpfs xxx /mnt
	mount --make-shared /mnt

	mkdir /mnt/tmp
	mount -t tmpfs xxx /mnt/tmp
	mkdir /mnt/d

	for ((i = 0; i < $1; i++)); do
		d=`mktemp -d /mnt/d/xxx.XXXXXX`
		mount --bind /mnt/tmp $d || break
	done

	mkdir /mnt/tmp/d
	for ((i = 0; i < $1; i++)); do
		d=`mktemp -d /mnt/tmp/xxx.XXXXXX`
		mount --bind /mnt/tmp/d $d || break
	done

perf data for umount -l /mnt
    29.60%     dbus-daemon  [kernel.kallsyms]        [k] __ticket_spin_lock
               |
               --- __ticket_spin_lock
                   lg_local_lock
                   path_init
                   path_openat
                   do_filp_open
                   do_sys_open
                   SyS_openat
                   system_call_fastpath
                   __openat64_nocancel
                   0x747379732f312d73

    20.20%          umount  [kernel.kallsyms]        [k] propagation_next
                    |
                    --- propagation_next
                       |
                       |--65.35%-- umount_tree
                       |          SyS_umount
                       |          system_call_fastpath
                       |          __umount2
                       |          __libc_start_main
                       |
                        --34.65%-- propagate_umount
                                  umount_tree
                                  SyS_umount
                                  system_call_fastpath
                                  __umount2
                                  __libc_start_main

    17.81%          umount  [kernel.kallsyms]        [k] __lookup_mnt
                    |
                    --- __lookup_mnt
                       |
                       |--82.78%-- propagate_umount
                       |          umount_tree
                       |          SyS_umount
                       |          system_call_fastpath
                       |          __umount2
                       |          __libc_start_main
                       |
                        --17.22%-- umount_tree
                                  SyS_umount
                                  system_call_fastpath
                                  __umount2
                                  __libc_start_main


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] [RFC] mnt: restrict a number of "struct mnt"
  2013-06-19 21:35     ` Andrey Wagin
@ 2013-06-21  1:04       ` Eric W. Biederman
  0 siblings, 0 replies; 7+ messages in thread
From: Eric W. Biederman @ 2013-06-21  1:04 UTC (permalink / raw)
  To: Andrey Wagin
  Cc: Alexander Viro, linux-fsdevel, LKML, Serge E. Hallyn,
	Andrew Morton, Ingo Molnar, Kees Cook, Mel Gorman, Rik van Riel

Andrey Wagin <avagin@gmail.com> writes:

> On Tue, Jun 18, 2013 at 02:56:51AM +0400, Andrey Wagin wrote:
>> 2013/6/17 Eric W. Biederman <ebiederm@xmission.com>:
>> > So for anyone seriously worried about this kind of thing in general we
>> > already have the memory control group, which is quite capable of
>> > limiting this kind of thing,
>> 
>> > and it limits all memory allocations not just mount.
>> 
>> And that is problem, we can't to limit a particular slab. Let's
>> imagine a real container with 4Gb of RAM. What is a kernel memory
>> limit resonable for it? I setup 64 Mb (it may be not enough for real
>> CT, but it's enough to make host inaccessible for some minutes).
>> 
>> $ mkdir /sys/fs/cgroup/memory/test
>> $ echo $((64 << 20)) > /sys/fs/cgroup/memory/test/memory.kmem.limit_in_bytes
>> $ unshare -m
>> $ echo $$ > /sys/fs/cgroup/memory/test/tasks
>> $ mount --make-rprivate /
>> $ mount -t tmpfs xxx /mnt
>> $ mount --make-shared /mnt
>> $ time bash -c 'set -m; for i in `seq 30`; do mount --bind /mnt
>> `mktemp -d /mnt/test.XXXXXX` & done;  for i in `seq 30`; do wait;
>> done'
>> real 0m23.141s
>> user 0m0.016s
>> sys 0m22.881s
>> 
>> While the last script is working, nobody can't to read /proc/mounts or
>> mount something. I don't think that users from other containers will
>> be glad. This problem is not so significant in compared with umounting
>> of this tree.
>> 
>> $ strace -T umount -l /mnt
>> umount("/mnt", MNT_DETACH)              = 0 <548.898244>
>> The host is inaccessible, it writes messages about soft lockup in
>> kernel log and eats 100% cpu.
>
> Eric, do you agree that
> * It is a problem
> * Currently we don't have a mechanism to prevent this problem
> * We need to find a way to prevent this problem

Ugh.  I knew mount propagation was annoying semantically but I had not
realized the implementation was quite so bad.

This doesn't happen in normal operation to normal folks.  So I don't
think this is something we need to rush in a fix at the last moment to
prevent the entire world from melting down.  Even people using mount
namespaces in containers.

I do think it is worth looking at.  Which kernel were you testing?  I
haven't gotten as far as looking too closely, but I just noticed that Al
Viro has been busy rewriting the locking of this.  So if you aren't testing
at least 3.10-rcX you probably need to retest.

My thoughts would be.  Improve the locking as much as possible,
and if that is not enough keep a measure of how many mounts will be
affected at least for the umount.  Possibly for the umount -l case.
Then just don't allow the complexity to exceed some limit so we know
things will happen in a timely manner.

Eric

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2013-06-21  1:05 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-06-17  8:24 [PATCH] [RFC] mnt: restrict a number of "struct mnt" Andrey Vagin
2013-06-17 19:58 ` Eric W. Biederman
2013-06-17 22:56   ` Andrew Morton
2013-06-18  6:09     ` Andrew Vagin
2013-06-17 22:56   ` Andrey Wagin
2013-06-19 21:35     ` Andrey Wagin
2013-06-21  1:04       ` Eric W. Biederman

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).