linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] new cgroup controller "fork"
@ 2011-11-03 16:22 Max Kellermann
  2011-11-03 16:43 ` Frederic Weisbecker
                   ` (2 more replies)
  0 siblings, 3 replies; 31+ messages in thread
From: Max Kellermann @ 2011-11-03 16:22 UTC (permalink / raw)
  To: linux-kernel; +Cc: containers, max, menage

Can limit the number of fork()/clone() calls in a cgroup.  It is
useful as a safeguard against fork bombs.

Signed-off-by: Max Kellermann <mk@cm4all.com>
---
 Documentation/cgroups/fork.txt |   30 ++++++
 include/linux/cgroup_fork.h    |   26 +++++
 include/linux/cgroup_subsys.h  |    6 +
 init/Kconfig                   |    6 +
 kernel/Makefile                |    1 
 kernel/cgroup_fork.c           |  197 ++++++++++++++++++++++++++++++++++++++++
 kernel/fork.c                  |    5 +
 7 files changed, 271 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/cgroups/fork.txt
 create mode 100644 include/linux/cgroup_fork.h
 create mode 100644 kernel/cgroup_fork.c

diff --git a/Documentation/cgroups/fork.txt b/Documentation/cgroups/fork.txt
new file mode 100644
index 0000000..dfbf291
--- /dev/null
+++ b/Documentation/cgroups/fork.txt
@@ -0,0 +1,30 @@
+The "fork" Controller
+---------------------
+
+The "fork" controller limits the number of times a new child process
+or thread can be created.  It maintains a per-group counter which gets
+decremented on each fork() / clone().  When the counter reaches zero,
+no process in the cgroup is allowed to create new child
+processes/threads, even if existing ones quit.
+
+This has been proven useful in a shared hosting environment.  A new
+temporary cgroup is created for each CGI process, and the maximum fork
+count is configured to a sensible value.  Since CGIs are expected to
+run for only a short time with predictable resource usage, this may be
+an appropriate tool to limit the damage that a freaked CGI can do.
+
+Initially, the counter is set to -1, which is a magic value for
+"disabled" - no limits are imposed on the processes in the group.  To
+set a new value, type (in the working directory of that control
+group):
+
+ echo 16 > fork.remaining
+
+This examples allows 16 forks in the control group.  0 means no
+further forks are allowed.  The limit may be lowered or increased or
+even disabled at any time by a process with write permissions to the
+attribute.
+
+To check if a fork is allowed, the controller walks the cgroup
+hierarchy up, and verifies all ancestors.  The counter of all
+ancestors is decreased.
diff --git a/include/linux/cgroup_fork.h b/include/linux/cgroup_fork.h
new file mode 100644
index 0000000..4ac66b3
--- /dev/null
+++ b/include/linux/cgroup_fork.h
@@ -0,0 +1,26 @@
+#ifndef _LINUX_CGROUP_FORK_H
+#define _LINUX_CGROUP_FORK_H
+
+#ifdef CONFIG_CGROUP_FORK
+
+/**
+ * Checks if another fork is allowed.  Call this before creating a new
+ * child process.
+ *
+ * @return 0 on success, a negative errno value if forking should be
+ * denied
+ */
+int
+cgroup_fork_pre_fork(void);
+
+#else /* !CONFIG_CGROUP_FORK */
+
+static inline int
+cgroup_fork_pre_fork(void)
+{
+	return 0;
+}
+
+#endif /* !CONFIG_CGROUP_FORK */
+
+#endif /* !_LINUX_CGROUP_FORK_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ac663c1..e2dbd65 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -64,3 +64,9 @@ SUBSYS(perf)
 #endif
 
 /* */
+
+#ifdef CONFIG_CGROUP_FORK
+SUBSYS(fork)
+#endif
+
+/* */
diff --git a/init/Kconfig b/init/Kconfig
index 31ba0fd..7a2fe2e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -603,6 +603,12 @@ config CGROUP_FREEZER
 	  Provides a way to freeze and unfreeze all tasks in a
 	  cgroup.
 
+config CGROUP_FORK
+	bool "fork controller for cgroups"
+	help
+	  Limits the number of fork() calls in a cgroup.  An application
+	  for this is to make a cgroup safe against fork bombs.
+
 config CGROUP_DEVICE
 	bool "Device controller for cgroups"
 	help
diff --git a/kernel/Makefile b/kernel/Makefile
index e898c5b..2aab192 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -60,6 +60,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_FORK) += cgroup_fork.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
diff --git a/kernel/cgroup_fork.c b/kernel/cgroup_fork.c
new file mode 100644
index 0000000..e9aa650
--- /dev/null
+++ b/kernel/cgroup_fork.c
@@ -0,0 +1,197 @@
+/*
+ * A cgroup implementation which limits the number of fork() calls.
+ * See Documentation/cgroups/fork.txt for more information.
+ *
+ * Copyright 2011 Content Management AG
+ * Author: Max Kellermann <mk@cm4all.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/cgroup.h>
+#include <linux/cgroup_fork.h>
+#include <linux/slab.h>
+
+struct cgroup_fork {
+	struct cgroup_subsys_state css;
+
+	/** protect the "remaining" attribute */
+	spinlock_t lock;
+
+	/**
+	 * The remaining number of forks allowed.  -1 is the magic
+	 * value for "unlimited".
+	 */
+	int remaining;
+};
+
+/**
+ * Get the #cgroup_fork instance of the specified #cgroup.
+ */
+static inline struct cgroup_fork *
+cgroup_fork_group(struct cgroup *cgroup)
+{
+	return container_of(cgroup_subsys_state(cgroup, fork_subsys_id),
+			    struct cgroup_fork, css);
+}
+
+/**
+ * Get the #cgroup_fork instance of the specified task.
+ */
+static inline struct cgroup_fork *
+cgroup_fork_task(struct task_struct *task)
+{
+	return container_of(task_subsys_state(task, fork_subsys_id),
+			    struct cgroup_fork, css);
+}
+
+/**
+ * Get the #cgroup_fork instance of the current task.
+ */
+static inline struct cgroup_fork *
+cgroup_fork_current(void)
+{
+	return cgroup_fork_task(current);
+}
+
+static __pure int
+cgroup_fork_lock_get_remaining(struct cgroup_fork *t)
+{
+	unsigned remaining;
+
+	spin_lock(&t->lock);
+	remaining = t->remaining;
+	spin_unlock(&t->lock);
+
+	return remaining;
+}
+
+static struct cgroup_subsys_state *
+cgroup_fork_create(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+	struct cgroup_fork *t = kzalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock_init(&t->lock);
+
+	t->remaining = -1;
+
+	return &t->css;
+}
+
+static void
+cgroup_fork_destroy(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+	struct cgroup_fork *t = cgroup_fork_group(cgroup);
+
+	kfree(t);
+}
+
+static void
+cgroup_fork_fork(struct cgroup_subsys *ss, struct task_struct *task)
+{
+	struct cgroup_fork *t;
+
+	rcu_read_lock();
+
+	/* decrement the counters in the cgroup and all of its
+	   ancestors (except for the root cgroup) */
+
+	t = cgroup_fork_current();
+	while (t->css.cgroup->parent != NULL) {
+		spin_lock(&t->lock);
+		if (t->remaining > 0)
+			--t->remaining;
+		spin_unlock(&t->lock);
+
+		t = cgroup_fork_group(t->css.cgroup->parent);
+	}
+
+	rcu_read_unlock();
+}
+
+static s64
+cgroup_fork_remaining_read(struct cgroup *cgroup, struct cftype *cft)
+{
+	struct cgroup_fork *t = cgroup_fork_group(cgroup);
+
+	return cgroup_fork_lock_get_remaining(t);
+}
+
+static int
+cgroup_fork_remaining_write(struct cgroup *cgroup, struct cftype *cft,
+			    s64 value)
+{
+	struct cgroup_fork *t = cgroup_fork_group(cgroup);
+
+	if (value < -1 || value > (1L << 30))
+		return -EINVAL;
+
+	spin_lock(&t->lock);
+	t->remaining = (int)value;
+	spin_unlock(&t->lock);
+
+	return 0;
+}
+
+static const struct cftype cgroup_fork_files[] =  {
+	{
+		.name = "remaining",
+		.read_s64 = cgroup_fork_remaining_read,
+		.write_s64 = cgroup_fork_remaining_write,
+	},
+};
+
+static int
+cgroup_fork_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+	if (cgroup->parent == NULL)
+		/* cannot limit the root cgroup */
+		return 0;
+
+	return cgroup_add_files(cgroup, ss, cgroup_fork_files,
+				ARRAY_SIZE(cgroup_fork_files));
+}
+
+struct cgroup_subsys fork_subsys = {
+	.name = "fork",
+	.create = cgroup_fork_create,
+	.destroy = cgroup_fork_destroy,
+	.fork = cgroup_fork_fork,
+	.populate = cgroup_fork_populate,
+	.subsys_id = fork_subsys_id,
+};
+
+int
+cgroup_fork_pre_fork(void)
+{
+	struct cgroup_fork *t;
+	int err = 0;
+
+	if (unlikely(current == &init_task))
+		/* ignore the kernel's fork request while booting; the
+		   cgroup subsystem doesn't get initialized by
+		   INIT_TASK(), so we need this check */
+		return err;
+
+	BUG_ON(current->cgroups == NULL);
+
+	rcu_read_lock();
+
+	t = cgroup_fork_current();
+	while (t->css.cgroup->parent != NULL && err == 0) {
+		if (unlikely(cgroup_fork_lock_get_remaining(t) == 0)) {
+			err = -EPERM;
+			break;
+		}
+
+		t = cgroup_fork_group(t->css.cgroup->parent);
+	}
+
+	rcu_read_unlock();
+
+	return err;
+}
diff --git a/kernel/fork.c b/kernel/fork.c
index 70d7619..c8cba7d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -32,6 +32,7 @@
 #include <linux/capability.h>
 #include <linux/cpu.h>
 #include <linux/cgroup.h>
+#include <linux/cgroup_fork.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
 #include <linux/swap.h>
@@ -1084,6 +1085,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 				current->signal->flags & SIGNAL_UNKILLABLE)
 		return ERR_PTR(-EINVAL);
 
+	retval = cgroup_fork_pre_fork();
+	if (retval)
+		goto fork_out;
+
 	retval = security_task_create(clone_flags);
 	if (retval)
 		goto fork_out;


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 16:22 [PATCH] new cgroup controller "fork" Max Kellermann
@ 2011-11-03 16:43 ` Frederic Weisbecker
  2011-11-03 17:16   ` Max Kellermann
  2011-11-03 16:43 ` Glauber Costa
  2011-11-03 17:31 ` richard -rw- weinberger
  2 siblings, 1 reply; 31+ messages in thread
From: Frederic Weisbecker @ 2011-11-03 16:43 UTC (permalink / raw)
  To: Max Kellermann, Andrew Morton
  Cc: linux-kernel, containers, max, menage, Li Zefan, Johannes Weiner,
	Tim Hockin

On Thu, Nov 03, 2011 at 05:22:38PM +0100, Max Kellermann wrote:
> Can limit the number of fork()/clone() calls in a cgroup.  It is
> useful as a safeguard against fork bombs.
> 
> Signed-off-by: Max Kellermann <mk@cm4all.com>

Please have a look at the task counter subsystem: https://lwn.net/Articles/461462/

It's in the -mm tree. I'm glad to hear about another user who wants
this feature in cgroups. We need to hear about you and whether this
meets your requirements in order to get it merged upstream.

Thanks.

	Frédéric.

> ---
>  Documentation/cgroups/fork.txt |   30 ++++++
>  include/linux/cgroup_fork.h    |   26 +++++
>  include/linux/cgroup_subsys.h  |    6 +
>  init/Kconfig                   |    6 +
>  kernel/Makefile                |    1 
>  kernel/cgroup_fork.c           |  197 ++++++++++++++++++++++++++++++++++++++++
>  kernel/fork.c                  |    5 +
>  7 files changed, 271 insertions(+), 0 deletions(-)
>  create mode 100644 Documentation/cgroups/fork.txt
>  create mode 100644 include/linux/cgroup_fork.h
>  create mode 100644 kernel/cgroup_fork.c
> 
> diff --git a/Documentation/cgroups/fork.txt b/Documentation/cgroups/fork.txt
> new file mode 100644
> index 0000000..dfbf291
> --- /dev/null
> +++ b/Documentation/cgroups/fork.txt
> @@ -0,0 +1,30 @@
> +The "fork" Controller
> +---------------------
> +
> +The "fork" controller limits the number of times a new child process
> +or thread can be created.  It maintains a per-group counter which gets
> +decremented on each fork() / clone().  When the counter reaches zero,
> +no process in the cgroup is allowed to create new child
> +processes/threads, even if existing ones quit.
> +
> +This has been proven useful in a shared hosting environment.  A new
> +temporary cgroup is created for each CGI process, and the maximum fork
> +count is configured to a sensible value.  Since CGIs are expected to
> +run for only a short time with predictable resource usage, this may be
> +an appropriate tool to limit the damage that a freaked CGI can do.
> +
> +Initially, the counter is set to -1, which is a magic value for
> +"disabled" - no limits are imposed on the processes in the group.  To
> +set a new value, type (in the working directory of that control
> +group):
> +
> + echo 16 > fork.remaining
> +
> +This examples allows 16 forks in the control group.  0 means no
> +further forks are allowed.  The limit may be lowered or increased or
> +even disabled at any time by a process with write permissions to the
> +attribute.
> +
> +To check if a fork is allowed, the controller walks the cgroup
> +hierarchy up, and verifies all ancestors.  The counter of all
> +ancestors is decreased.
> diff --git a/include/linux/cgroup_fork.h b/include/linux/cgroup_fork.h
> new file mode 100644
> index 0000000..4ac66b3
> --- /dev/null
> +++ b/include/linux/cgroup_fork.h
> @@ -0,0 +1,26 @@
> +#ifndef _LINUX_CGROUP_FORK_H
> +#define _LINUX_CGROUP_FORK_H
> +
> +#ifdef CONFIG_CGROUP_FORK
> +
> +/**
> + * Checks if another fork is allowed.  Call this before creating a new
> + * child process.
> + *
> + * @return 0 on success, a negative errno value if forking should be
> + * denied
> + */
> +int
> +cgroup_fork_pre_fork(void);
> +
> +#else /* !CONFIG_CGROUP_FORK */
> +
> +static inline int
> +cgroup_fork_pre_fork(void)
> +{
> +	return 0;
> +}
> +
> +#endif /* !CONFIG_CGROUP_FORK */
> +
> +#endif /* !_LINUX_CGROUP_FORK_H */
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index ac663c1..e2dbd65 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -64,3 +64,9 @@ SUBSYS(perf)
>  #endif
>  
>  /* */
> +
> +#ifdef CONFIG_CGROUP_FORK
> +SUBSYS(fork)
> +#endif
> +
> +/* */
> diff --git a/init/Kconfig b/init/Kconfig
> index 31ba0fd..7a2fe2e 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -603,6 +603,12 @@ config CGROUP_FREEZER
>  	  Provides a way to freeze and unfreeze all tasks in a
>  	  cgroup.
>  
> +config CGROUP_FORK
> +	bool "fork controller for cgroups"
> +	help
> +	  Limits the number of fork() calls in a cgroup.  An application
> +	  for this is to make a cgroup safe against fork bombs.
> +
>  config CGROUP_DEVICE
>  	bool "Device controller for cgroups"
>  	help
> diff --git a/kernel/Makefile b/kernel/Makefile
> index e898c5b..2aab192 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -60,6 +60,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
>  obj-$(CONFIG_COMPAT) += compat.o
>  obj-$(CONFIG_CGROUPS) += cgroup.o
>  obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
> +obj-$(CONFIG_CGROUP_FORK) += cgroup_fork.o
>  obj-$(CONFIG_CPUSETS) += cpuset.o
>  obj-$(CONFIG_UTS_NS) += utsname.o
>  obj-$(CONFIG_USER_NS) += user_namespace.o
> diff --git a/kernel/cgroup_fork.c b/kernel/cgroup_fork.c
> new file mode 100644
> index 0000000..e9aa650
> --- /dev/null
> +++ b/kernel/cgroup_fork.c
> @@ -0,0 +1,197 @@
> +/*
> + * A cgroup implementation which limits the number of fork() calls.
> + * See Documentation/cgroups/fork.txt for more information.
> + *
> + * Copyright 2011 Content Management AG
> + * Author: Max Kellermann <mk@cm4all.com>
> + *
> + * This file is subject to the terms and conditions of the GNU General
> + * Public License.  See the file COPYING in the main directory of the
> + * Linux distribution for more details.
> + */
> +
> +#include <linux/cgroup.h>
> +#include <linux/cgroup_fork.h>
> +#include <linux/slab.h>
> +
> +struct cgroup_fork {
> +	struct cgroup_subsys_state css;
> +
> +	/** protect the "remaining" attribute */
> +	spinlock_t lock;
> +
> +	/**
> +	 * The remaining number of forks allowed.  -1 is the magic
> +	 * value for "unlimited".
> +	 */
> +	int remaining;
> +};
> +
> +/**
> + * Get the #cgroup_fork instance of the specified #cgroup.
> + */
> +static inline struct cgroup_fork *
> +cgroup_fork_group(struct cgroup *cgroup)
> +{
> +	return container_of(cgroup_subsys_state(cgroup, fork_subsys_id),
> +			    struct cgroup_fork, css);
> +}
> +
> +/**
> + * Get the #cgroup_fork instance of the specified task.
> + */
> +static inline struct cgroup_fork *
> +cgroup_fork_task(struct task_struct *task)
> +{
> +	return container_of(task_subsys_state(task, fork_subsys_id),
> +			    struct cgroup_fork, css);
> +}
> +
> +/**
> + * Get the #cgroup_fork instance of the current task.
> + */
> +static inline struct cgroup_fork *
> +cgroup_fork_current(void)
> +{
> +	return cgroup_fork_task(current);
> +}
> +
> +static __pure int
> +cgroup_fork_lock_get_remaining(struct cgroup_fork *t)
> +{
> +	unsigned remaining;
> +
> +	spin_lock(&t->lock);
> +	remaining = t->remaining;
> +	spin_unlock(&t->lock);
> +
> +	return remaining;
> +}
> +
> +static struct cgroup_subsys_state *
> +cgroup_fork_create(struct cgroup_subsys *ss, struct cgroup *cgroup)
> +{
> +	struct cgroup_fork *t = kzalloc(sizeof(*t), GFP_KERNEL);
> +	if (!t)
> +		return ERR_PTR(-ENOMEM);
> +
> +	spin_lock_init(&t->lock);
> +
> +	t->remaining = -1;
> +
> +	return &t->css;
> +}
> +
> +static void
> +cgroup_fork_destroy(struct cgroup_subsys *ss, struct cgroup *cgroup)
> +{
> +	struct cgroup_fork *t = cgroup_fork_group(cgroup);
> +
> +	kfree(t);
> +}
> +
> +static void
> +cgroup_fork_fork(struct cgroup_subsys *ss, struct task_struct *task)
> +{
> +	struct cgroup_fork *t;
> +
> +	rcu_read_lock();
> +
> +	/* decrement the counters in the cgroup and all of its
> +	   ancestors (except for the root cgroup) */
> +
> +	t = cgroup_fork_current();
> +	while (t->css.cgroup->parent != NULL) {
> +		spin_lock(&t->lock);
> +		if (t->remaining > 0)
> +			--t->remaining;
> +		spin_unlock(&t->lock);
> +
> +		t = cgroup_fork_group(t->css.cgroup->parent);
> +	}
> +
> +	rcu_read_unlock();
> +}
> +
> +static s64
> +cgroup_fork_remaining_read(struct cgroup *cgroup, struct cftype *cft)
> +{
> +	struct cgroup_fork *t = cgroup_fork_group(cgroup);
> +
> +	return cgroup_fork_lock_get_remaining(t);
> +}
> +
> +static int
> +cgroup_fork_remaining_write(struct cgroup *cgroup, struct cftype *cft,
> +			    s64 value)
> +{
> +	struct cgroup_fork *t = cgroup_fork_group(cgroup);
> +
> +	if (value < -1 || value > (1L << 30))
> +		return -EINVAL;
> +
> +	spin_lock(&t->lock);
> +	t->remaining = (int)value;
> +	spin_unlock(&t->lock);
> +
> +	return 0;
> +}
> +
> +static const struct cftype cgroup_fork_files[] =  {
> +	{
> +		.name = "remaining",
> +		.read_s64 = cgroup_fork_remaining_read,
> +		.write_s64 = cgroup_fork_remaining_write,
> +	},
> +};
> +
> +static int
> +cgroup_fork_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
> +{
> +	if (cgroup->parent == NULL)
> +		/* cannot limit the root cgroup */
> +		return 0;
> +
> +	return cgroup_add_files(cgroup, ss, cgroup_fork_files,
> +				ARRAY_SIZE(cgroup_fork_files));
> +}
> +
> +struct cgroup_subsys fork_subsys = {
> +	.name = "fork",
> +	.create = cgroup_fork_create,
> +	.destroy = cgroup_fork_destroy,
> +	.fork = cgroup_fork_fork,
> +	.populate = cgroup_fork_populate,
> +	.subsys_id = fork_subsys_id,
> +};
> +
> +int
> +cgroup_fork_pre_fork(void)
> +{
> +	struct cgroup_fork *t;
> +	int err = 0;
> +
> +	if (unlikely(current == &init_task))
> +		/* ignore the kernel's fork request while booting; the
> +		   cgroup subsystem doesn't get initialized by
> +		   INIT_TASK(), so we need this check */
> +		return err;
> +
> +	BUG_ON(current->cgroups == NULL);
> +
> +	rcu_read_lock();
> +
> +	t = cgroup_fork_current();
> +	while (t->css.cgroup->parent != NULL && err == 0) {
> +		if (unlikely(cgroup_fork_lock_get_remaining(t) == 0)) {
> +			err = -EPERM;
> +			break;
> +		}
> +
> +		t = cgroup_fork_group(t->css.cgroup->parent);
> +	}
> +
> +	rcu_read_unlock();
> +
> +	return err;
> +}
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 70d7619..c8cba7d 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -32,6 +32,7 @@
>  #include <linux/capability.h>
>  #include <linux/cpu.h>
>  #include <linux/cgroup.h>
> +#include <linux/cgroup_fork.h>
>  #include <linux/security.h>
>  #include <linux/hugetlb.h>
>  #include <linux/swap.h>
> @@ -1084,6 +1085,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>  				current->signal->flags & SIGNAL_UNKILLABLE)
>  		return ERR_PTR(-EINVAL);
>  
> +	retval = cgroup_fork_pre_fork();
> +	if (retval)
> +		goto fork_out;
> +
>  	retval = security_task_create(clone_flags);
>  	if (retval)
>  		goto fork_out;
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 16:22 [PATCH] new cgroup controller "fork" Max Kellermann
  2011-11-03 16:43 ` Frederic Weisbecker
@ 2011-11-03 16:43 ` Glauber Costa
  2011-11-03 16:59   ` Max Kellermann
  2011-11-03 17:31 ` richard -rw- weinberger
  2 siblings, 1 reply; 31+ messages in thread
From: Glauber Costa @ 2011-11-03 16:43 UTC (permalink / raw)
  To: Max Kellermann; +Cc: linux-kernel, containers, max, menage, Frederic Weisbecker

On 11/03/2011 02:22 PM, Max Kellermann wrote:
> Can limit the number of fork()/clone() calls in a cgroup.  It is
> useful as a safeguard against fork bombs.

I do have a couple of questions about this, but the most important one 
is: Is this a competing implementation, or a cooperative effort with 
Frederic's ?

> Signed-off-by: Max Kellermann<mk@cm4all.com>
> ---
>   Documentation/cgroups/fork.txt |   30 ++++++
>   include/linux/cgroup_fork.h    |   26 +++++
>   include/linux/cgroup_subsys.h  |    6 +
>   init/Kconfig                   |    6 +
>   kernel/Makefile                |    1
>   kernel/cgroup_fork.c           |  197 ++++++++++++++++++++++++++++++++++++++++
>   kernel/fork.c                  |    5 +
>   7 files changed, 271 insertions(+), 0 deletions(-)
>   create mode 100644 Documentation/cgroups/fork.txt
>   create mode 100644 include/linux/cgroup_fork.h
>   create mode 100644 kernel/cgroup_fork.c
>
> diff --git a/Documentation/cgroups/fork.txt b/Documentation/cgroups/fork.txt
> new file mode 100644
> index 0000000..dfbf291
> --- /dev/null
> +++ b/Documentation/cgroups/fork.txt
> @@ -0,0 +1,30 @@
> +The "fork" Controller
> +---------------------
> +
> +The "fork" controller limits the number of times a new child process
> +or thread can be created.  It maintains a per-group counter which gets
> +decremented on each fork() / clone().  When the counter reaches zero,
> +no process in the cgroup is allowed to create new child
> +processes/threads, even if existing ones quit.
> +
> +This has been proven useful in a shared hosting environment.  A new
> +temporary cgroup is created for each CGI process, and the maximum fork
> +count is configured to a sensible value.  Since CGIs are expected to
> +run for only a short time with predictable resource usage, this may be
> +an appropriate tool to limit the damage that a freaked CGI can do.
> +
> +Initially, the counter is set to -1, which is a magic value for
> +"disabled" - no limits are imposed on the processes in the group.  To
> +set a new value, type (in the working directory of that control
> +group):
> +
> + echo 16>  fork.remaining
> +
> +This examples allows 16 forks in the control group.  0 means no
> +further forks are allowed.  The limit may be lowered or increased or
> +even disabled at any time by a process with write permissions to the
> +attribute.
> +
> +To check if a fork is allowed, the controller walks the cgroup
> +hierarchy up, and verifies all ancestors.  The counter of all
> +ancestors is decreased.
> diff --git a/include/linux/cgroup_fork.h b/include/linux/cgroup_fork.h
> new file mode 100644
> index 0000000..4ac66b3
> --- /dev/null
> +++ b/include/linux/cgroup_fork.h
> @@ -0,0 +1,26 @@
> +#ifndef _LINUX_CGROUP_FORK_H
> +#define _LINUX_CGROUP_FORK_H
> +
> +#ifdef CONFIG_CGROUP_FORK
> +
> +/**
> + * Checks if another fork is allowed.  Call this before creating a new
> + * child process.
> + *
> + * @return 0 on success, a negative errno value if forking should be
> + * denied
> + */
> +int
> +cgroup_fork_pre_fork(void);
> +
> +#else /* !CONFIG_CGROUP_FORK */
> +
> +static inline int
> +cgroup_fork_pre_fork(void)
> +{
> +	return 0;
> +}
> +
> +#endif /* !CONFIG_CGROUP_FORK */
> +
> +#endif /* !_LINUX_CGROUP_FORK_H */
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index ac663c1..e2dbd65 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -64,3 +64,9 @@ SUBSYS(perf)
>   #endif
>
>   /* */
> +
> +#ifdef CONFIG_CGROUP_FORK
> +SUBSYS(fork)
> +#endif
> +
> +/* */
> diff --git a/init/Kconfig b/init/Kconfig
> index 31ba0fd..7a2fe2e 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -603,6 +603,12 @@ config CGROUP_FREEZER
>   	  Provides a way to freeze and unfreeze all tasks in a
>   	  cgroup.
>
> +config CGROUP_FORK
> +	bool "fork controller for cgroups"
> +	help
> +	  Limits the number of fork() calls in a cgroup.  An application
> +	  for this is to make a cgroup safe against fork bombs.
> +
>   config CGROUP_DEVICE
>   	bool "Device controller for cgroups"
>   	help
> diff --git a/kernel/Makefile b/kernel/Makefile
> index e898c5b..2aab192 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -60,6 +60,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
>   obj-$(CONFIG_COMPAT) += compat.o
>   obj-$(CONFIG_CGROUPS) += cgroup.o
>   obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
> +obj-$(CONFIG_CGROUP_FORK) += cgroup_fork.o
>   obj-$(CONFIG_CPUSETS) += cpuset.o
>   obj-$(CONFIG_UTS_NS) += utsname.o
>   obj-$(CONFIG_USER_NS) += user_namespace.o
> diff --git a/kernel/cgroup_fork.c b/kernel/cgroup_fork.c
> new file mode 100644
> index 0000000..e9aa650
> --- /dev/null
> +++ b/kernel/cgroup_fork.c
> @@ -0,0 +1,197 @@
> +/*
> + * A cgroup implementation which limits the number of fork() calls.
> + * See Documentation/cgroups/fork.txt for more information.
> + *
> + * Copyright 2011 Content Management AG
> + * Author: Max Kellermann<mk@cm4all.com>
> + *
> + * This file is subject to the terms and conditions of the GNU General
> + * Public License.  See the file COPYING in the main directory of the
> + * Linux distribution for more details.
> + */
> +
> +#include<linux/cgroup.h>
> +#include<linux/cgroup_fork.h>
> +#include<linux/slab.h>
> +
> +struct cgroup_fork {
> +	struct cgroup_subsys_state css;
> +
> +	/** protect the "remaining" attribute */
> +	spinlock_t lock;
> +
> +	/**
> +	 * The remaining number of forks allowed.  -1 is the magic
> +	 * value for "unlimited".
> +	 */
> +	int remaining;
> +};
> +
> +/**
> + * Get the #cgroup_fork instance of the specified #cgroup.
> + */
> +static inline struct cgroup_fork *
> +cgroup_fork_group(struct cgroup *cgroup)
> +{
> +	return container_of(cgroup_subsys_state(cgroup, fork_subsys_id),
> +			    struct cgroup_fork, css);
> +}
> +
> +/**
> + * Get the #cgroup_fork instance of the specified task.
> + */
> +static inline struct cgroup_fork *
> +cgroup_fork_task(struct task_struct *task)
> +{
> +	return container_of(task_subsys_state(task, fork_subsys_id),
> +			    struct cgroup_fork, css);
> +}
> +
> +/**
> + * Get the #cgroup_fork instance of the current task.
> + */
> +static inline struct cgroup_fork *
> +cgroup_fork_current(void)
> +{
> +	return cgroup_fork_task(current);
> +}
> +
> +static __pure int
> +cgroup_fork_lock_get_remaining(struct cgroup_fork *t)
> +{
> +	unsigned remaining;
> +
> +	spin_lock(&t->lock);
> +	remaining = t->remaining;
> +	spin_unlock(&t->lock);
> +
> +	return remaining;
> +}
> +
> +static struct cgroup_subsys_state *
> +cgroup_fork_create(struct cgroup_subsys *ss, struct cgroup *cgroup)
> +{
> +	struct cgroup_fork *t = kzalloc(sizeof(*t), GFP_KERNEL);
> +	if (!t)
> +		return ERR_PTR(-ENOMEM);
> +
> +	spin_lock_init(&t->lock);
> +
> +	t->remaining = -1;
> +
> +	return&t->css;
> +}
> +
> +static void
> +cgroup_fork_destroy(struct cgroup_subsys *ss, struct cgroup *cgroup)
> +{
> +	struct cgroup_fork *t = cgroup_fork_group(cgroup);
> +
> +	kfree(t);
> +}
> +
> +static void
> +cgroup_fork_fork(struct cgroup_subsys *ss, struct task_struct *task)
> +{
> +	struct cgroup_fork *t;
> +
> +	rcu_read_lock();
> +
> +	/* decrement the counters in the cgroup and all of its
> +	   ancestors (except for the root cgroup) */
> +
> +	t = cgroup_fork_current();
> +	while (t->css.cgroup->parent != NULL) {
> +		spin_lock(&t->lock);
> +		if (t->remaining>  0)
> +			--t->remaining;
> +		spin_unlock(&t->lock);
> +
> +		t = cgroup_fork_group(t->css.cgroup->parent);
> +	}
> +
> +	rcu_read_unlock();
> +}
> +
> +static s64
> +cgroup_fork_remaining_read(struct cgroup *cgroup, struct cftype *cft)
> +{
> +	struct cgroup_fork *t = cgroup_fork_group(cgroup);
> +
> +	return cgroup_fork_lock_get_remaining(t);
> +}
> +
> +static int
> +cgroup_fork_remaining_write(struct cgroup *cgroup, struct cftype *cft,
> +			    s64 value)
> +{
> +	struct cgroup_fork *t = cgroup_fork_group(cgroup);
> +
> +	if (value<  -1 || value>  (1L<<  30))
> +		return -EINVAL;
> +
> +	spin_lock(&t->lock);
> +	t->remaining = (int)value;
> +	spin_unlock(&t->lock);
> +
> +	return 0;
> +}
> +
> +static const struct cftype cgroup_fork_files[] =  {
> +	{
> +		.name = "remaining",
> +		.read_s64 = cgroup_fork_remaining_read,
> +		.write_s64 = cgroup_fork_remaining_write,
> +	},
> +};
> +
> +static int
> +cgroup_fork_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
> +{
> +	if (cgroup->parent == NULL)
> +		/* cannot limit the root cgroup */
> +		return 0;
> +
> +	return cgroup_add_files(cgroup, ss, cgroup_fork_files,
> +				ARRAY_SIZE(cgroup_fork_files));
> +}
> +
> +struct cgroup_subsys fork_subsys = {
> +	.name = "fork",
> +	.create = cgroup_fork_create,
> +	.destroy = cgroup_fork_destroy,
> +	.fork = cgroup_fork_fork,
> +	.populate = cgroup_fork_populate,
> +	.subsys_id = fork_subsys_id,
> +};
> +
> +int
> +cgroup_fork_pre_fork(void)
> +{
> +	struct cgroup_fork *t;
> +	int err = 0;
> +
> +	if (unlikely(current ==&init_task))
> +		/* ignore the kernel's fork request while booting; the
> +		   cgroup subsystem doesn't get initialized by
> +		   INIT_TASK(), so we need this check */
> +		return err;
> +
> +	BUG_ON(current->cgroups == NULL);
> +
> +	rcu_read_lock();
> +
> +	t = cgroup_fork_current();
> +	while (t->css.cgroup->parent != NULL&&  err == 0) {
> +		if (unlikely(cgroup_fork_lock_get_remaining(t) == 0)) {
> +			err = -EPERM;
> +			break;
> +		}
> +
> +		t = cgroup_fork_group(t->css.cgroup->parent);
> +	}
> +
> +	rcu_read_unlock();
> +
> +	return err;
> +}
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 70d7619..c8cba7d 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -32,6 +32,7 @@
>   #include<linux/capability.h>
>   #include<linux/cpu.h>
>   #include<linux/cgroup.h>
> +#include<linux/cgroup_fork.h>
>   #include<linux/security.h>
>   #include<linux/hugetlb.h>
>   #include<linux/swap.h>
> @@ -1084,6 +1085,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>   				current->signal->flags&  SIGNAL_UNKILLABLE)
>   		return ERR_PTR(-EINVAL);
>
> +	retval = cgroup_fork_pre_fork();
> +	if (retval)
> +		goto fork_out;
> +
>   	retval = security_task_create(clone_flags);
>   	if (retval)
>   		goto fork_out;
>
> _______________________________________________
> Containers mailing list
> Containers@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/containers


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 16:43 ` Glauber Costa
@ 2011-11-03 16:59   ` Max Kellermann
  2011-11-03 17:05     ` Frederic Weisbecker
  2011-11-03 18:21     ` Alan Cox
  0 siblings, 2 replies; 31+ messages in thread
From: Max Kellermann @ 2011-11-03 16:59 UTC (permalink / raw)
  To: Glauber Costa; +Cc: linux-kernel, containers, Frederic Weisbecker

On 2011/11/03 17:43, Glauber Costa <glommer@parallels.com> wrote:
> I do have a couple of questions about this, but the most important
> one is: Is this a competing implementation, or a cooperative effort
> with Frederic's ?

Short answer: competing.

Long answer: I do not know Frederic's patch.  This is a repost of a
patch that I submitted 9 months ago:

 https://lkml.org/lkml/2011/2/17/116

After little discussion, nobody seemed to be interested in it, and
nobody merged it.  I reposted it today, not knowing somebody else had
come up with a similar idea meanwhile.

Max

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 16:59   ` Max Kellermann
@ 2011-11-03 17:05     ` Frederic Weisbecker
  2011-11-03 18:21     ` Alan Cox
  1 sibling, 0 replies; 31+ messages in thread
From: Frederic Weisbecker @ 2011-11-03 17:05 UTC (permalink / raw)
  To: Glauber Costa, linux-kernel, containers

On Thu, Nov 03, 2011 at 05:59:03PM +0100, Max Kellermann wrote:
> On 2011/11/03 17:43, Glauber Costa <glommer@parallels.com> wrote:
> > I do have a couple of questions about this, but the most important
> > one is: Is this a competing implementation, or a cooperative effort
> > with Frederic's ?
> 
> Short answer: competing.
> 
> Long answer: I do not know Frederic's patch.  This is a repost of a
> patch that I submitted 9 months ago:
> 
>  https://lkml.org/lkml/2011/2/17/116
> 
> After little discussion, nobody seemed to be interested in it, and
> nobody merged it.  I reposted it today, not knowing somebody else had
> come up with a similar idea meanwhile.

Woops, sorry I never heard about it before I started working on this.
Otherwise I would have tried to work with you.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 16:43 ` Frederic Weisbecker
@ 2011-11-03 17:16   ` Max Kellermann
  2011-11-03 17:26     ` Glauber Costa
  0 siblings, 1 reply; 31+ messages in thread
From: Max Kellermann @ 2011-11-03 17:16 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Andrew Morton, linux-kernel, containers, Li Zefan,
	Johannes Weiner, Tim Hockin

On 2011/11/03 17:43, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> Please have a look at the task counter subsystem: https://lwn.net/Articles/461462/
> 
> It's in the -mm tree. I'm glad to hear about another user who wants
> this feature in cgroups. We need to hear about you and whether this
> meets your requirements in order to get it merged upstream.

Had a quick look at your patch set.  No, it does not seem to meet my
requirements.  It limits the number of processes in a cgroup - that is
also very useful, but is different from my controller.

My controller limits the number of fork() calls, not the number of
processes running at a time.

I've been using it for 7 years to put an upper bound on CGI resource
usage in a shared hosting environment at my employer (initially based
on a proprietary cgroup-like subsystem I wrote, when cgroups were not
available yet).

Max

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 17:16   ` Max Kellermann
@ 2011-11-03 17:26     ` Glauber Costa
  2011-11-03 17:48       ` Max Kellermann
  0 siblings, 1 reply; 31+ messages in thread
From: Glauber Costa @ 2011-11-03 17:26 UTC (permalink / raw)
  To: Max Kellermann
  Cc: Frederic Weisbecker, Tim Hockin, containers, linux-kernel,
	Johannes Weiner, Andrew Morton

On 11/03/2011 03:16 PM, Max Kellermann wrote:
> On 2011/11/03 17:43, Frederic Weisbecker<fweisbec@gmail.com>  wrote:
>> Please have a look at the task counter subsystem: https://lwn.net/Articles/461462/
>>
>> It's in the -mm tree. I'm glad to hear about another user who wants
>> this feature in cgroups. We need to hear about you and whether this
>> meets your requirements in order to get it merged upstream.
>
> Had a quick look at your patch set.  No, it does not seem to meet my
> requirements.  It limits the number of processes in a cgroup - that is
> also very useful, but is different from my controller.
>
How so?
If you never move tasks to a cgroup except at setup time - which is the 
common case for almost everybody, I imagine, you end up achieving the 
same thing.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 16:22 [PATCH] new cgroup controller "fork" Max Kellermann
  2011-11-03 16:43 ` Frederic Weisbecker
  2011-11-03 16:43 ` Glauber Costa
@ 2011-11-03 17:31 ` richard -rw- weinberger
  2 siblings, 0 replies; 31+ messages in thread
From: richard -rw- weinberger @ 2011-11-03 17:31 UTC (permalink / raw)
  To: Max Kellermann; +Cc: linux-kernel, containers, max, menage

On Thu, Nov 3, 2011 at 5:22 PM, Max Kellermann <mk@cm4all.com> wrote:
> +struct cgroup_fork {
> +       struct cgroup_subsys_state css;
> +
> +       /** protect the "remaining" attribute */
> +       spinlock_t lock;
> +
> +       /**
> +        * The remaining number of forks allowed.  -1 is the magic
> +        * value for "unlimited".
> +        */
> +       int remaining;
> +};

Wouldn't an atomic_t also do it?

-- 
Thanks,
//richard

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 17:26     ` Glauber Costa
@ 2011-11-03 17:48       ` Max Kellermann
  2011-11-03 17:50         ` Glauber Costa
  0 siblings, 1 reply; 31+ messages in thread
From: Max Kellermann @ 2011-11-03 17:48 UTC (permalink / raw)
  To: Glauber Costa
  Cc: Frederic Weisbecker, Tim Hockin, containers, linux-kernel,
	Johannes Weiner, Andrew Morton

On 2011/11/03 18:26, Glauber Costa <glommer@parallels.com> wrote:
> On 11/03/2011 03:16 PM, Max Kellermann wrote:
> >but is different from my controller.
> >
> How so?

Once the "remaining" counter has reached zero, no further forks are
possible, no matter how many processes are left.  It is a fork
counter, not a process counter.

Let's say: Frederic's controller counts "things" that exists
(processes), and my controller counts "verbs" or "ations" (fork()).

Max

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 17:48       ` Max Kellermann
@ 2011-11-03 17:50         ` Glauber Costa
  2011-11-03 18:30           ` Max Kellermann
  0 siblings, 1 reply; 31+ messages in thread
From: Glauber Costa @ 2011-11-03 17:50 UTC (permalink / raw)
  To: Max Kellermann
  Cc: Frederic Weisbecker, Tim Hockin, containers, linux-kernel,
	Johannes Weiner, Andrew Morton

On 11/03/2011 03:48 PM, Max Kellermann wrote:
> On 2011/11/03 18:26, Glauber Costa<glommer@parallels.com>  wrote:
>> On 11/03/2011 03:16 PM, Max Kellermann wrote:
>>> but is different from my controller.
>>>
>> How so?
>
> Once the "remaining" counter has reached zero, no further forks are
> possible, no matter how many processes are left.  It is a fork
> counter, not a process counter.
>
> Let's say: Frederic's controller counts "things" that exists
> (processes), and my controller counts "verbs" or "ations" (fork()).
>
> Max
That still seems to be up to admin. If no processes are removed from the 
cgroup or included in the cgroup, the only action/verb the counter
is concerned about is to fork. Under this circumstance, both seem 
equivalent from my PoV.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 16:59   ` Max Kellermann
  2011-11-03 17:05     ` Frederic Weisbecker
@ 2011-11-03 18:21     ` Alan Cox
  2011-11-03 18:51       ` Max Kellermann
  1 sibling, 1 reply; 31+ messages in thread
From: Alan Cox @ 2011-11-03 18:21 UTC (permalink / raw)
  To: Max Kellermann
  Cc: Glauber Costa, linux-kernel, containers, Frederic Weisbecker

> After little discussion, nobody seemed to be interested in it, and
> nobody merged it.  I reposted it today, not knowing somebody else had
> come up with a similar idea meanwhile.

I don't really see a meaningful use case for this. Why should millions of
users have this stuff in their kernel. What's the general purpose use
case we should all be excited about ?

Alan

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 17:50         ` Glauber Costa
@ 2011-11-03 18:30           ` Max Kellermann
  2011-11-03 18:34             ` Glauber Costa
  0 siblings, 1 reply; 31+ messages in thread
From: Max Kellermann @ 2011-11-03 18:30 UTC (permalink / raw)
  To: Glauber Costa
  Cc: Frederic Weisbecker, Tim Hockin, containers, linux-kernel,
	Johannes Weiner, Andrew Morton

On 2011/11/03 18:50, Glauber Costa <glommer@parallels.com> wrote:
> That still seems to be up to admin. If no processes are removed from
> the cgroup or included in the cgroup, the only action/verb the
> counter
> is concerned about is to fork. Under this circumstance, both seem
> equivalent from my PoV.

I'm confused.  One of us misunderstands the whole thing.

Examples of both controllers:

task_counter: task.limit=2.  Let's say the only process in that group
forks, then you have two processes.  Forking is disallowed from now
on.  The child process exits, and there's only one left - which is
allowed to fork!  The group may bounce between 0 and 2 processes
forever.

cgroup_fork: fork.remaining=2.  Now let's say we have one thousand
processes in that group!  One of those forks (allowed).  And it forks
again (allowed).  And tries again - blocked because "fork.remaining"
has reached zero.  We have 1002 processes; when 1001 of those
processes exit, one remains, but it is still disallowed to fork,
because "fork.remaining" is still zero.  It will remaing zero until
somebody with write permissions raises it again.

Did I get it wrong?  To me, that is not look equivalent at all.

Max

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 18:30           ` Max Kellermann
@ 2011-11-03 18:34             ` Glauber Costa
  0 siblings, 0 replies; 31+ messages in thread
From: Glauber Costa @ 2011-11-03 18:34 UTC (permalink / raw)
  To: Max Kellermann
  Cc: Frederic Weisbecker, Tim Hockin, containers, linux-kernel,
	Johannes Weiner, Andrew Morton

On 11/03/2011 04:30 PM, Max Kellermann wrote:
> On 2011/11/03 18:50, Glauber Costa<glommer@parallels.com>  wrote:
>> That still seems to be up to admin. If no processes are removed from
>> the cgroup or included in the cgroup, the only action/verb the
>> counter
>> is concerned about is to fork. Under this circumstance, both seem
>> equivalent from my PoV.
>
> I'm confused.  One of us misunderstands the whole thing.
>
> Examples of both controllers:
>
> task_counter: task.limit=2.  Let's say the only process in that group
> forks, then you have two processes.  Forking is disallowed from now
> on.  The child process exits, and there's only one left - which is
> allowed to fork!  The group may bounce between 0 and 2 processes
> forever.

Ah, I see. I assumed you were decrementing the counter when a process 
exited.

> cgroup_fork: fork.remaining=2.  Now let's say we have one thousand
> processes in that group!  One of those forks (allowed).  And it forks
> again (allowed).  And tries again - blocked because "fork.remaining"
> has reached zero.  We have 1002 processes; when 1001 of those
> processes exit, one remains, but it is still disallowed to fork,
> because "fork.remaining" is still zero.  It will remaing zero until
> somebody with write permissions raises it again.
>
> Did I get it wrong?  To me, that is not look equivalent at all.
No, I was not careful enough when reading the patch, and as already 
stated, I made an assumption that seemed reasonable.

Which raises the question: If you don't decrement on exit, and only 
count how many forks happened in the past, what is your use case for this?

Please note that both of you seem to target the same goal: prevent fork 
bombs. If a child exits, I see no reason to not open up room for a new 
process inside the group.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 18:21     ` Alan Cox
@ 2011-11-03 18:51       ` Max Kellermann
  2011-11-03 18:56         ` Glauber Costa
  2011-11-03 19:03         ` Alan Cox
  0 siblings, 2 replies; 31+ messages in thread
From: Max Kellermann @ 2011-11-03 18:51 UTC (permalink / raw)
  To: Alan Cox; +Cc: Glauber Costa, linux-kernel, containers, Frederic Weisbecker

On 2011/11/03 19:21, Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:
> > After little discussion, nobody seemed to be interested in it, and
> > nobody merged it.  I reposted it today, not knowing somebody else had
> > come up with a similar idea meanwhile.
> 
> I don't really see a meaningful use case for this. Why should millions of
> users have this stuff in their kernel. What's the general purpose use
> case we should all be excited about ?

Putting a reasonable limit on jobs that are expected to run only for a
limited amount of time, with a limited amount of total resources.  For
example: CGI, cron jobs, backup, munin plugins, virus scanners and
other email filters, procmail, ... - when the job is done, the group
can be deleted, and new instances will run in a new group.

With just RLIMIT_NPROC or task_counter, you can limit the total number
of processes, but it will not stop a fork bomb - it will only slow it
down.  The fork bomb will still bounce between 1 and the limit, and
consume lots of resources for forking and exiting.

(Glauber: the above should answer your last email, too)

Similar existing feature: RLIMIT_CPU.  Millions of users have it in
their kernels, but nobody uses it nowadays.  And it's not even
optional.

Btw. I have no problem with maintaining this patch (and a whole bunch
of others) in my proprietary git repository at work forever.  They're
very useful for my employer.  I'm just trying to be a good citizen by
sharing them.

Max

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 18:51       ` Max Kellermann
@ 2011-11-03 18:56         ` Glauber Costa
  2011-11-03 20:08           ` Matt Helsley
  2011-11-03 19:03         ` Alan Cox
  1 sibling, 1 reply; 31+ messages in thread
From: Glauber Costa @ 2011-11-03 18:56 UTC (permalink / raw)
  To: Alan Cox, linux-kernel, containers, Frederic Weisbecker

On 11/03/2011 04:51 PM, Max Kellermann wrote:
> On 2011/11/03 19:21, Alan Cox<alan@lxorguk.ukuu.org.uk>  wrote:
>>> After little discussion, nobody seemed to be interested in it, and
>>> nobody merged it.  I reposted it today, not knowing somebody else had
>>> come up with a similar idea meanwhile.
>>
>> I don't really see a meaningful use case for this. Why should millions of
>> users have this stuff in their kernel. What's the general purpose use
>> case we should all be excited about ?
>
> Putting a reasonable limit on jobs that are expected to run only for a
> limited amount of time, with a limited amount of total resources.  For
> example: CGI, cron jobs, backup, munin plugins, virus scanners and
> other email filters, procmail, ... - when the job is done, the group
> can be deleted, and new instances will run in a new group.
>
> With just RLIMIT_NPROC or task_counter, you can limit the total number
> of processes, but it will not stop a fork bomb - it will only slow it
> down.  The fork bomb will still bounce between 1 and the limit, and
> consume lots of resources for forking and exiting.
>
> (Glauber: the above should answer your last email, too)

Yet, the damage a fork bomb can pose into the system this way is 
severely limited. Combined with the cpu controller to guarantee that 
this group of process will never take the whole cpu for themselves,
you have almost everything you need, if not everything.

> Similar existing feature: RLIMIT_CPU.  Millions of users have it in
> their kernels, but nobody uses it nowadays.  And it's not even
> optional.
>
> Btw. I have no problem with maintaining this patch (and a whole bunch
> of others) in my proprietary git repository at work forever.  They're
> very useful for my employer.  I'm just trying to be a good citizen by
> sharing them.

Well, one alternative is to try to rebase your work on top of -mm, 
taking Frederic's work into account. What we really don't need, is 
another cgroup for that. So if you manage to convince people that this 
is really a win - haven't convinced me so far - the way to go is 
enhancing the existing fork cgroup.

> Max


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 18:51       ` Max Kellermann
  2011-11-03 18:56         ` Glauber Costa
@ 2011-11-03 19:03         ` Alan Cox
  2011-11-03 19:20           ` Max Kellermann
  1 sibling, 1 reply; 31+ messages in thread
From: Alan Cox @ 2011-11-03 19:03 UTC (permalink / raw)
  To: Max Kellermann
  Cc: Glauber Costa, linux-kernel, containers, Frederic Weisbecker

> Putting a reasonable limit on jobs that are expected to run only for a
> limited amount of time, with a limited amount of total resources.  For

fork is an almost irrelevant resource. Address space (ie memory), file
handles and the like are actual constrained resources.

I can't help thinking this is focussing on a completely irrelevant
cornercase issue. It belongs as part of a general resource limiting
cgroup that also deals with memory, I/O and the like. In fact most of
these resources are a balancing act.

Who cares if you have 10,000 threads. We can handle that without trying.
10,000 different mappings is a whole different ball game, and 100,000 file
handles in some configurations might also matter.

In short in your specific environment a fork runaway is a symptom you can
use to detect and sometimes halt the failure case. It's not the actual
resource problem and it doesn't solve the general case.

> Similar existing feature: RLIMIT_CPU.  Millions of users have it in
> their kernels, but nobody uses it nowadays.  And it's not even
> optional.

It's required by the standards, and basically unmeasurable overhead.

> Btw. I have no problem with maintaining this patch (and a whole bunch
> of others) in my proprietary git repository at work forever.  They're
> very useful for my employer.  I'm just trying to be a good citizen by
> sharing them.

Sure - I'm just not seeing that a whole separate cgroup for it is
appropriate or a good plan. Anyone doing real resource management needs
the rest of the stuff anyway.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 19:03         ` Alan Cox
@ 2011-11-03 19:20           ` Max Kellermann
  2011-11-03 19:25             ` Glauber Costa
  0 siblings, 1 reply; 31+ messages in thread
From: Max Kellermann @ 2011-11-03 19:20 UTC (permalink / raw)
  To: Alan Cox; +Cc: Glauber Costa, linux-kernel, containers, Frederic Weisbecker

On 2011/11/03 20:03, Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:
> Sure - I'm just not seeing that a whole separate cgroup for it is
> appropriate or a good plan. Anyone doing real resource management needs
> the rest of the stuff anyway.

Right.  When I saw Frederic's controller today, my first thought was
that one could move the fork limit code over into that controller.  If
we reach a consensus that this would be a good idea, and would have
chances to get merged, I could probably take some time to refactor my
code.

Max

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 19:20           ` Max Kellermann
@ 2011-11-03 19:25             ` Glauber Costa
  2011-11-03 20:13               ` Brian K. White
  0 siblings, 1 reply; 31+ messages in thread
From: Glauber Costa @ 2011-11-03 19:25 UTC (permalink / raw)
  To: Alan Cox, linux-kernel, containers, Frederic Weisbecker

On 11/03/2011 05:20 PM, Max Kellermann wrote:
> On 2011/11/03 20:03, Alan Cox<alan@lxorguk.ukuu.org.uk>  wrote:
>> Sure - I'm just not seeing that a whole separate cgroup for it is
>> appropriate or a good plan. Anyone doing real resource management needs
>> the rest of the stuff anyway.
>
> Right.  When I saw Frederic's controller today, my first thought was
> that one could move the fork limit code over into that controller.  If
> we reach a consensus that this would be a good idea, and would have
> chances to get merged, I could probably take some time to refactor my
> code.
>
> Max
I'd advise you to take a step back and think if this is really needed. 
As Alan pointed out, the really expensive resource here is already being
constrained by Frederic's controller.

But ultimately, you're the only one that knows about your real 
requirements.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 18:56         ` Glauber Costa
@ 2011-11-03 20:08           ` Matt Helsley
  0 siblings, 0 replies; 31+ messages in thread
From: Matt Helsley @ 2011-11-03 20:08 UTC (permalink / raw)
  To: Glauber Costa; +Cc: Alan Cox, linux-kernel, containers, Frederic Weisbecker

On Thu, Nov 03, 2011 at 04:56:38PM -0200, Glauber Costa wrote:
> On 11/03/2011 04:51 PM, Max Kellermann wrote:
> >On 2011/11/03 19:21, Alan Cox<alan@lxorguk.ukuu.org.uk>  wrote:
> >>>After little discussion, nobody seemed to be interested in it, and
> >>>nobody merged it.  I reposted it today, not knowing somebody else had
> >>>come up with a similar idea meanwhile.
> >>
> >>I don't really see a meaningful use case for this. Why should millions of
> >>users have this stuff in their kernel. What's the general purpose use
> >>case we should all be excited about ?
> >
> >Putting a reasonable limit on jobs that are expected to run only for a
> >limited amount of time, with a limited amount of total resources.  For
> >example: CGI, cron jobs, backup, munin plugins, virus scanners and
> >other email filters, procmail, ... - when the job is done, the group
> >can be deleted, and new instances will run in a new group.
> >
> >With just RLIMIT_NPROC or task_counter, you can limit the total number
> >of processes, but it will not stop a fork bomb - it will only slow it
> >down.  The fork bomb will still bounce between 1 and the limit, and
> >consume lots of resources for forking and exiting.
> >
> >(Glauber: the above should answer your last email, too)
> 
> Yet, the damage a fork bomb can pose into the system this way is
> severely limited. Combined with the cpu controller to guarantee that
> this group of process will never take the whole cpu for themselves,
> you have almost everything you need, if not everything.

Assuming we're only talking about fork bombs, I tend to agree.
Using Frederic's cgroup subsystem we can adjust the limit to the number of
legitimate tasks in the cgroup (0 if you can't distinguish them) and then
start killing the fork bombs. If the forkbomb goes into a fork-then-exit
loop in order to eat cpu once it's reached the task limit the cpu
controller becomes more useful.

However, isn't it possible that forking an extra task could be a sign of
a security issue other than a fork bomb? Imagine a CGI module that could
set the fork limit (not number of tasks) to the precise or maximum
number of tasks that CGI script should create (perhaps a per-script
limit known and configured somewhere by an admin). If the CGI script
attempts to go over that limit it could be a sign of an exploit attempt.
The fork limit could prevent the CGI script from creating a shell with
unintended privileges. Or the shell might be created but no non-builtin
commands could be executed. The exploit would not be able to kill an
existing task to make room. Anyhow, that's purely hypothetical --
I can imagine a use for this feature but I don't know that it's
been implemented or how practical it really is. Also, depending on how
such a CGI module+script drops privileges, there still may be acceptable
alternatives like syscall filtering...

Cheers,
	-Matt Helsley


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 19:25             ` Glauber Costa
@ 2011-11-03 20:13               ` Brian K. White
  2011-11-03 21:54                 ` Glauber Costa
  0 siblings, 1 reply; 31+ messages in thread
From: Brian K. White @ 2011-11-03 20:13 UTC (permalink / raw)
  To: containers; +Cc: linux-kernel, cgroups

On 11/3/2011 3:25 PM, Glauber Costa wrote:
> On 11/03/2011 05:20 PM, Max Kellermann wrote:
>> On 2011/11/03 20:03, Alan Cox<alan@lxorguk.ukuu.org.uk> wrote:
>>> Sure - I'm just not seeing that a whole separate cgroup for it is
>>> appropriate or a good plan. Anyone doing real resource management needs
>>> the rest of the stuff anyway.
>>
>> Right. When I saw Frederic's controller today, my first thought was
>> that one could move the fork limit code over into that controller. If
>> we reach a consensus that this would be a good idea, and would have
>> chances to get merged, I could probably take some time to refactor my
>> code.
>>
>> Max
> I'd advise you to take a step back and think if this is really needed.
> As Alan pointed out, the really expensive resource here is already being
> constrained by Frederic's controller.

I think this really is a different knob that is nice to have as long as 
it doesn't cost much. It's a way to set a max lifespan in a way that 
isn't really addressed by the other controls. (I could absolutely be 
missing something.)

I think Max explained the issue clearly enough.

It doesn't matter that the fork itself is supposedly so cheap.

It's still nice to have a way to say, you may not fork/die/fork/die/fork 
in a race.

What's so unimaginable about having a process that you know needs a lot 
of cpu and ram or other resources to do it's job, and you expressly want 
to allow it to take as much of those resources as it can, but you know 
it has no need to fork, so if it forks, _that_ is the only indication of 
a problem, so you may only want to block it based on that.

Sure many other processes would legitimately fork/die/fork/die a lot 
while never exceeding a few total concurrent tasks, and for them you 
would not want to set any such fork limit. So what?

-- 
bkw

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 20:13               ` Brian K. White
@ 2011-11-03 21:54                 ` Glauber Costa
  2011-11-04  3:03                   ` Li Zefan
  0 siblings, 1 reply; 31+ messages in thread
From: Glauber Costa @ 2011-11-03 21:54 UTC (permalink / raw)
  To: Brian K. White; +Cc: containers, cgroups, linux-kernel

On 11/03/2011 06:13 PM, Brian K. White wrote:
> On 11/3/2011 3:25 PM, Glauber Costa wrote:
>> On 11/03/2011 05:20 PM, Max Kellermann wrote:
>>> On 2011/11/03 20:03, Alan Cox<alan@lxorguk.ukuu.org.uk> wrote:
>>>> Sure - I'm just not seeing that a whole separate cgroup for it is
>>>> appropriate or a good plan. Anyone doing real resource management needs
>>>> the rest of the stuff anyway.
>>>
>>> Right. When I saw Frederic's controller today, my first thought was
>>> that one could move the fork limit code over into that controller. If
>>> we reach a consensus that this would be a good idea, and would have
>>> chances to get merged, I could probably take some time to refactor my
>>> code.
>>>
>>> Max
>> I'd advise you to take a step back and think if this is really needed.
>> As Alan pointed out, the really expensive resource here is already being
>> constrained by Frederic's controller.
>
> I think this really is a different knob that is nice to have as long as
> it doesn't cost much. It's a way to set a max lifespan in a way that
> isn't really addressed by the other controls. (I could absolutely be
> missing something.)
>
> I think Max explained the issue clearly enough.

He did, indeed.

> It doesn't matter that the fork itself is supposedly so cheap.
>
> It's still nice to have a way to say, you may not fork/die/fork/die/fork
> in a race.
>
> What's so unimaginable about having a process that you know needs a lot
> of cpu and ram or other resources to do it's job, and you expressly want
> to allow it to take as much of those resources as it can, but you know
> it has no need to fork, so if it forks, _that_ is the only indication of
> a problem, so you may only want to block it based on that.
>
> Sure many other processes would legitimately fork/die/fork/die a lot
> while never exceeding a few total concurrent tasks, and for them you
> would not want to set any such fork limit. So what?
>
As I said previously, he knows his use cases better than anyone else.
If a use case can be found in which the summation of cpu+task 
controllers is not enough, and if this is implemented as an option to 
the task controller, and does not make it:
1) confusing,
2) more expensive,

then I don't see why not we shouldn't take it.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-03 21:54                 ` Glauber Costa
@ 2011-11-04  3:03                   ` Li Zefan
  2011-11-04  4:37                     ` KAMEZAWA Hiroyuki
                                       ` (2 more replies)
  0 siblings, 3 replies; 31+ messages in thread
From: Li Zefan @ 2011-11-04  3:03 UTC (permalink / raw)
  To: Glauber Costa; +Cc: Brian K. White, cgroups, containers, linux-kernel

于 2011年11月04日 05:54, Glauber Costa 写道:
> On 11/03/2011 06:13 PM, Brian K. White wrote:
>> On 11/3/2011 3:25 PM, Glauber Costa wrote:
>>> On 11/03/2011 05:20 PM, Max Kellermann wrote:
>>>> On 2011/11/03 20:03, Alan Cox<alan@lxorguk.ukuu.org.uk> wrote:
>>>>> Sure - I'm just not seeing that a whole separate cgroup for it is
>>>>> appropriate or a good plan. Anyone doing real resource management needs
>>>>> the rest of the stuff anyway.
>>>>
>>>> Right. When I saw Frederic's controller today, my first thought was
>>>> that one could move the fork limit code over into that controller. If
>>>> we reach a consensus that this would be a good idea, and would have
>>>> chances to get merged, I could probably take some time to refactor my
>>>> code.
>>>>
>>>> Max
>>> I'd advise you to take a step back and think if this is really needed.
>>> As Alan pointed out, the really expensive resource here is already being
>>> constrained by Frederic's controller.
>>
>> I think this really is a different knob that is nice to have as long as
>> it doesn't cost much. It's a way to set a max lifespan in a way that
>> isn't really addressed by the other controls. (I could absolutely be
>> missing something.)
>>
>> I think Max explained the issue clearly enough.
> 
> He did, indeed.
> 
>> It doesn't matter that the fork itself is supposedly so cheap.
>>
>> It's still nice to have a way to say, you may not fork/die/fork/die/fork
>> in a race.
>>
>> What's so unimaginable about having a process that you know needs a lot
>> of cpu and ram or other resources to do it's job, and you expressly want
>> to allow it to take as much of those resources as it can, but you know
>> it has no need to fork, so if it forks, _that_ is the only indication of
>> a problem, so you may only want to block it based on that.
>>
>> Sure many other processes would legitimately fork/die/fork/die a lot
>> while never exceeding a few total concurrent tasks, and for them you
>> would not want to set any such fork limit. So what?
>>
> As I said previously, he knows his use cases better than anyone else.
> If a use case can be found in which the summation of cpu+task controllers is not enough, and if this is implemented as an option to the task controller, and does not make it:
> 1) confusing,
> 2) more expensive,
> 
> then I don't see why not we shouldn't take it.

Quoted from Lennart's reply in another mail thread:

"Given that shutting down some services might involve forking off a few
things (think: a shell script handling shutdown which forks off a couple
of shell utilities) we'd want something that is between "from now on no
forking at all" and "unlimited forking". This could be done in many
different ways: we'd be happy if we could do time-based rate limiting,
but we'd also be fine with defining a certain budget of additional forks
a cgroup can do (i.e. "from now on you can do 50 more forks, then you'll
get EPERM)."

(http://lkml.org/lkml/2011/10/19/468)

The last sentence suggests he might like this fork controller.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-04  3:03                   ` Li Zefan
@ 2011-11-04  4:37                     ` KAMEZAWA Hiroyuki
  2011-11-04 13:11                     ` Glauber Costa
  2011-11-04 13:59                     ` Lennart Poettering
  2 siblings, 0 replies; 31+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-11-04  4:37 UTC (permalink / raw)
  To: Li Zefan; +Cc: Glauber Costa, cgroups, containers, linux-kernel

On Fri, 04 Nov 2011 11:03:41 +0800
Li Zefan <lizf@cn.fujitsu.com> wrote:

> 于 2011年11月04日 05:54, Glauber Costa 写道:
> > On 11/03/2011 06:13 PM, Brian K. White wrote:
> >> On 11/3/2011 3:25 PM, Glauber Costa wrote:
> >>> On 11/03/2011 05:20 PM, Max Kellermann wrote:
> >>>> On 2011/11/03 20:03, Alan Cox<alan@lxorguk.ukuu.org.uk> wrote:
> >>>>> Sure - I'm just not seeing that a whole separate cgroup for it is
> >>>>> appropriate or a good plan. Anyone doing real resource management needs
> >>>>> the rest of the stuff anyway.
> >>>>
> >>>> Right. When I saw Frederic's controller today, my first thought was
> >>>> that one could move the fork limit code over into that controller. If
> >>>> we reach a consensus that this would be a good idea, and would have
> >>>> chances to get merged, I could probably take some time to refactor my
> >>>> code.
> >>>>
> >>>> Max
> >>> I'd advise you to take a step back and think if this is really needed.
> >>> As Alan pointed out, the really expensive resource here is already being
> >>> constrained by Frederic's controller.
> >>
> >> I think this really is a different knob that is nice to have as long as
> >> it doesn't cost much. It's a way to set a max lifespan in a way that
> >> isn't really addressed by the other controls. (I could absolutely be
> >> missing something.)
> >>
> >> I think Max explained the issue clearly enough.
> > 
> > He did, indeed.
> > 
> >> It doesn't matter that the fork itself is supposedly so cheap.
> >>
> >> It's still nice to have a way to say, you may not fork/die/fork/die/fork
> >> in a race.
> >>
> >> What's so unimaginable about having a process that you know needs a lot
> >> of cpu and ram or other resources to do it's job, and you expressly want
> >> to allow it to take as much of those resources as it can, but you know
> >> it has no need to fork, so if it forks, _that_ is the only indication of
> >> a problem, so you may only want to block it based on that.
> >>
> >> Sure many other processes would legitimately fork/die/fork/die a lot
> >> while never exceeding a few total concurrent tasks, and for them you
> >> would not want to set any such fork limit. So what?
> >>
> > As I said previously, he knows his use cases better than anyone else.
> > If a use case can be found in which the summation of cpu+task controllers is not enough, and if this is implemented as an option to the task controller, and does not make it:
> > 1) confusing,
> > 2) more expensive,
> > 
> > then I don't see why not we shouldn't take it.
> 
> Quoted from Lennart's reply in another mail thread:
> 
> "Given that shutting down some services might involve forking off a few
> things (think: a shell script handling shutdown which forks off a couple
> of shell utilities) we'd want something that is between "from now on no
> forking at all" and "unlimited forking". This could be done in many
> different ways: we'd be happy if we could do time-based rate limiting,
> but we'd also be fine with defining a certain budget of additional forks
> a cgroup can do (i.e. "from now on you can do 50 more forks, then you'll
> get EPERM)."
> 
> (http://lkml.org/lkml/2011/10/19/468)
> 
> The last sentence suggests he might like this fork controller.
> 

Hmm, IMHO, this feature may have some use case. But I don't like to have
both of fork/task controller at the same time and need to mount 2 of them.

How about accounting the number of fork or fork-speed in 'task'
controller and  add 'notifier' as memcg's memory usage notification ?
(Or fork-limit in task controller.)

BTW, what is performance impact to add lock/counter in fork/die path ?

Thanks,
-Kame




^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-04  3:03                   ` Li Zefan
  2011-11-04  4:37                     ` KAMEZAWA Hiroyuki
@ 2011-11-04 13:11                     ` Glauber Costa
  2011-11-04 13:38                       ` Max Kellermann
  2011-11-04 13:59                     ` Lennart Poettering
  2 siblings, 1 reply; 31+ messages in thread
From: Glauber Costa @ 2011-11-04 13:11 UTC (permalink / raw)
  To: Li Zefan; +Cc: Brian K. White, cgroups, containers, linux-kernel, max

On 11/04/2011 01:03 AM, Li Zefan wrote:
> 于 2011年11月04日 05:54, Glauber Costa 写道:
>> On 11/03/2011 06:13 PM, Brian K. White wrote:
>>> On 11/3/2011 3:25 PM, Glauber Costa wrote:
>>>> On 11/03/2011 05:20 PM, Max Kellermann wrote:
>>>>> On 2011/11/03 20:03, Alan Cox<alan@lxorguk.ukuu.org.uk>  wrote:
>>>>>> Sure - I'm just not seeing that a whole separate cgroup for it is
>>>>>> appropriate or a good plan. Anyone doing real resource management needs
>>>>>> the rest of the stuff anyway.
>>>>>
>>>>> Right. When I saw Frederic's controller today, my first thought was
>>>>> that one could move the fork limit code over into that controller. If
>>>>> we reach a consensus that this would be a good idea, and would have
>>>>> chances to get merged, I could probably take some time to refactor my
>>>>> code.
>>>>>
>>>>> Max
>>>> I'd advise you to take a step back and think if this is really needed.
>>>> As Alan pointed out, the really expensive resource here is already being
>>>> constrained by Frederic's controller.
>>>
>>> I think this really is a different knob that is nice to have as long as
>>> it doesn't cost much. It's a way to set a max lifespan in a way that
>>> isn't really addressed by the other controls. (I could absolutely be
>>> missing something.)
>>>
>>> I think Max explained the issue clearly enough.
>>
>> He did, indeed.
>>
>>> It doesn't matter that the fork itself is supposedly so cheap.
>>>
>>> It's still nice to have a way to say, you may not fork/die/fork/die/fork
>>> in a race.
>>>
>>> What's so unimaginable about having a process that you know needs a lot
>>> of cpu and ram or other resources to do it's job, and you expressly want
>>> to allow it to take as much of those resources as it can, but you know
>>> it has no need to fork, so if it forks, _that_ is the only indication of
>>> a problem, so you may only want to block it based on that.
>>>
>>> Sure many other processes would legitimately fork/die/fork/die a lot
>>> while never exceeding a few total concurrent tasks, and for them you
>>> would not want to set any such fork limit. So what?
>>>
>> As I said previously, he knows his use cases better than anyone else.
>> If a use case can be found in which the summation of cpu+task controllers is not enough, and if this is implemented as an option to the task controller, and does not make it:
>> 1) confusing,
>> 2) more expensive,
>>
>> then I don't see why not we shouldn't take it.
>
> Quoted from Lennart's reply in another mail thread:
>
> "Given that shutting down some services might involve forking off a few
> things (think: a shell script handling shutdown which forks off a couple
> of shell utilities) we'd want something that is between "from now on no
> forking at all" and "unlimited forking". This could be done in many
> different ways: we'd be happy if we could do time-based rate limiting,
> but we'd also be fine with defining a certain budget of additional forks
> a cgroup can do (i.e. "from now on you can do 50 more forks, then you'll
> get EPERM)."
>
> (http://lkml.org/lkml/2011/10/19/468)
>
> The last sentence suggests he might like this fork controller.

Well, If I understand Frederic's work well enough, this can be achieved 
by setting the task limit to 0 in his controller. No?

Because being lower than your limit won't kick tasks out, the practical 
effect is that no forks will be allowed in the group with this setting.

So for time-based rate limiting, it is trivial to just set it to 0 after 
x seconds.

For other uses, we can watch the task counter increase until a certain 
value, and then set the limit to 0.

Max, wouldn't it be enough for your use?

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-04 13:11                     ` Glauber Costa
@ 2011-11-04 13:38                       ` Max Kellermann
  0 siblings, 0 replies; 31+ messages in thread
From: Max Kellermann @ 2011-11-04 13:38 UTC (permalink / raw)
  To: Glauber Costa; +Cc: Li Zefan, Brian K. White, cgroups, containers, linux-kernel

On 2011/11/04 14:11, Glauber Costa <glommer@parallels.com> wrote:
> For other uses, we can watch the task counter increase until a
> certain value, and then set the limit to 0.
> 
> Max, wouldn't it be enough for your use?

No.  We do have a process limit already (I didn't publish it yet), but
we might adopt Frederic's new controller as soon as it hits our
servers.  The fork controller complements it, and we have many others.
We run a shared CGI hosting platform with millions of accounts, and
many users have badly designed or even vulnerable PHP scripts.  The
fork controller is very effective at stopping certain kinds of those.
Other controllers shall keep other problems small.  This mix of many
different measures has been working very well for quite a few years.

We'll just keep that code on our private git repository .. rebasing on
new kernel releases is easy enough for me.

Max

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-11-04  3:03                   ` Li Zefan
  2011-11-04  4:37                     ` KAMEZAWA Hiroyuki
  2011-11-04 13:11                     ` Glauber Costa
@ 2011-11-04 13:59                     ` Lennart Poettering
  2 siblings, 0 replies; 31+ messages in thread
From: Lennart Poettering @ 2011-11-04 13:59 UTC (permalink / raw)
  To: Li Zefan; +Cc: Glauber Costa, Brian K. White, cgroups, containers, linux-kernel

On Fri, 04.11.11 11:03, Li Zefan (lizf@cn.fujitsu.com) wrote:

> >> Sure many other processes would legitimately fork/die/fork/die a lot
> >> while never exceeding a few total concurrent tasks, and for them you
> >> would not want to set any such fork limit. So what?
> >>
> > As I said previously, he knows his use cases better than anyone else.
> > If a use case can be found in which the summation of cpu+task controllers is not enough, and if this is implemented as an option to the task controller, and does not make it:
> > 1) confusing,
> > 2) more expensive,
> > 
> > then I don't see why not we shouldn't take it.
> 
> Quoted from Lennart's reply in another mail thread:
> 
> "Given that shutting down some services might involve forking off a few
> things (think: a shell script handling shutdown which forks off a couple
> of shell utilities) we'd want something that is between "from now on no
> forking at all" and "unlimited forking". This could be done in many
> different ways: we'd be happy if we could do time-based rate limiting,
> but we'd also be fine with defining a certain budget of additional forks
> a cgroup can do (i.e. "from now on you can do 50 more forks, then you'll
> get EPERM)."
> 
> (http://lkml.org/lkml/2011/10/19/468)
> 
> The last sentence suggests he might like this fork controller.

Yes, indeed. Limiting forks like this would be pretty much exactly what
we need in systemd to make the shutdown of services robust towards
code which tries to fork faster than we could kill it. I am all in
favour of this code, especially due to its simplicity.

However, I'd like to see this implemented as part of the core cgroup
interfaces, instead of an independent controller. Otherwise we might
have multiple userspace frameworks fighting over it: LXC might want to
take posession of it, systemd too, and Max' own CGI tool might as
well. I believe limiting forks like this is an important part of the
basic cgroup management that userspace needs, independently of what a
specific software actually wants to do with it (i.e. which cgroup
controller it wants to use, if any), and it hence should be available in
all hierarchies, including the named ones that are useful to ensure that
a specific controller is not monopolized by one userspace consumer.

Lennart

-- 
Lennart Poettering - Red Hat, Inc.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-02-18  0:59 ` Paul Menage
@ 2011-02-18  9:26   ` Max Kellermann
  0 siblings, 0 replies; 31+ messages in thread
From: Max Kellermann @ 2011-02-18  9:26 UTC (permalink / raw)
  To: Paul Menage, KAMEZAWA Hiroyuki; +Cc: lizf, containers, linux-kernel

On 2011/02/18 01:59, Paul Menage <menage@google.com> wrote:
> I'd be inclined to simplify this a bit - avoid impacting the fork()
> path twice, with cgroup_fork_pre_fork() and cgroup_fork_fork() and
> just do the checks and decrements in a single pass. (In the event of
> hitting a limit, you may need to make another partial pass up the tree
> to restore the charged fork attempts)

I have implemented it, but I don't like your idea.  It actually
complicates the code.  It tries to do two things at once, and running
again until it hits the failed cgroup seems somewhat fragile.  I
believe the overhead for doing two separate runs in case of success is
negligible compared to the cost of sys_fork().

(Documentation not adjusted yet in the new patch)

> Also, it would be slightly clearer to use fork_cgroup_* rather than
> cgroup_fork_* - this makes it clearer that it's part of a cgroups
> subsystem called "fork", rather than part of the cgroups core
> framework.

Changed, but I've preserved the file name cgroup_fork.c.  Do you want
me to change that, too?  (What about cgroup_freezer.c and the config
option names CONFIG_CGROUP_*?)

> I don't think that you need to make your spinlock IRQ-safe - AFAICS
> nothing accesses it from the interrupt path.

Changed.

On 2011/02/17 14:50, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> How about -EAGAIN here ? I think it's not good to add new error code
> for system calls.

Changed that, but that got me a funny quirk while testing:

 bear:~# ls
 -bash: fork: retry: Resource temporarily unavailable
 -bash: fork: retry: Resource temporarily unavailable
 -bash: fork: retry: Resource temporarily unavailable
 -bash: fork: retry: Resource temporarily unavailable
 -bash: fork: Resource temporarily unavailable
 bear:~# 

Generally, I don't think EAGAIN is a good errno code for
"adminstrative limit exceeded".  EAGAIN's meaning is "try again
later".  Usually there is something like poll() to wait until the
resource is available - but a process cannot wait for the adminstrator
to raise the configured limits.  You could blame that quirk on bash,
because it does not consider that divergent definition of EAGAIN for
fork()..

The changed patch follows for further discussion; I'll repost the
complete patch with description again once we agree that it's
finished.

Max


diff --git a/Documentation/cgroups/fork.txt b/Documentation/cgroups/fork.txt
new file mode 100644
index 0000000..dfbf291
--- /dev/null
+++ b/Documentation/cgroups/fork.txt
@@ -0,0 +1,30 @@
+The "fork" Controller
+---------------------
+
+The "fork" controller limits the number of times a new child process
+or thread can be created.  It maintains a per-group counter which gets
+decremented on each fork() / clone().  When the counter reaches zero,
+no process in the cgroup is allowed to create new child
+processes/threads, even if existing ones quit.
+
+This has been proven useful in a shared hosting environment.  A new
+temporary cgroup is created for each CGI process, and the maximum fork
+count is configured to a sensible value.  Since CGIs are expected to
+run for only a short time with predictable resource usage, this may be
+an appropriate tool to limit the damage that a freaked CGI can do.
+
+Initially, the counter is set to -1, which is a magic value for
+"disabled" - no limits are imposed on the processes in the group.  To
+set a new value, type (in the working directory of that control
+group):
+
+ echo 16 > fork.remaining
+
+This examples allows 16 forks in the control group.  0 means no
+further forks are allowed.  The limit may be lowered or increased or
+even disabled at any time by a process with write permissions to the
+attribute.
+
+To check if a fork is allowed, the controller walks the cgroup
+hierarchy up, and verifies all ancestors.  The counter of all
+ancestors is decreased.
diff --git a/include/linux/cgroup_fork.h b/include/linux/cgroup_fork.h
new file mode 100644
index 0000000..aef1dbd
--- /dev/null
+++ b/include/linux/cgroup_fork.h
@@ -0,0 +1,26 @@
+#ifndef _LINUX_CGROUP_FORK_H
+#define _LINUX_CGROUP_FORK_H
+
+#ifdef CONFIG_CGROUP_FORK
+
+/**
+ * Checks if another fork is allowed.  Call this before creating a new
+ * child process.
+ *
+ * @return 0 on success, a negative errno value if forking should be
+ * denied
+ */
+int
+fork_cgroup_pre_fork(void);
+
+#else /* !CONFIG_CGROUP_FORK */
+
+static inline int
+fork_cgroup_pre_fork(void)
+{
+	return 0;
+}
+
+#endif /* !CONFIG_CGROUP_FORK */
+
+#endif /* !_LINUX_CGROUP_FORK_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ccefff0..8ead7f9 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -66,3 +66,9 @@ SUBSYS(blkio)
 #endif
 
 /* */
+
+#ifdef CONFIG_CGROUP_FORK
+SUBSYS(fork)
+#endif
+
+/* */
diff --git a/init/Kconfig b/init/Kconfig
index 17e2cfb..ef53a85 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -596,6 +596,12 @@ config CGROUP_FREEZER
 	  Provides a way to freeze and unfreeze all tasks in a
 	  cgroup.
 
+config CGROUP_FORK
+	bool "fork controller for cgroups"
+	help
+	  Limits the number of fork() calls in a cgroup.  An application
+	  for this is to make a cgroup safe against fork bombs.
+
 config CGROUP_DEVICE
 	bool "Device controller for cgroups"
 	help
diff --git a/kernel/Makefile b/kernel/Makefile
index 353d3fe..b58cc01 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -61,6 +61,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_FORK) += cgroup_fork.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
 obj-$(CONFIG_UTS_NS) += utsname.o
diff --git a/kernel/cgroup_fork.c b/kernel/cgroup_fork.c
new file mode 100644
index 0000000..e56b2c6
--- /dev/null
+++ b/kernel/cgroup_fork.c
@@ -0,0 +1,186 @@
+/*
+ *  A cgroup implementation which limits the number of fork() calls.
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#include <linux/cgroup.h>
+#include <linux/cgroup_fork.h>
+#include <linux/slab.h>
+
+struct cgroup_fork {
+	struct cgroup_subsys_state css;
+
+	/** protect the "remaining" attribute */
+	spinlock_t lock;
+
+	/**
+	 * The remaining number of forks allowed.  -1 is the magic
+	 * value for "unlimited".
+	 */
+	int remaining;
+};
+
+/**
+ * Get the #cgrou_fork instance of the specified #cgroup.
+ */
+static inline struct cgroup_fork *
+fork_cgroup_group(struct cgroup *cgroup)
+{
+	return container_of(cgroup_subsys_state(cgroup, fork_subsys_id),
+			    struct cgroup_fork, css);
+}
+
+/**
+ * Get the #cgroup_fork instance of the specified task.
+ */
+static inline struct cgroup_fork *
+fork_cgroup_task(struct task_struct *task)
+{
+	return container_of(task_subsys_state(current_task, fork_subsys_id),
+			    struct cgroup_fork, css);
+}
+
+/**
+ * Get the #cgroup_fork instance of the current task.
+ */
+static inline struct cgroup_fork *
+fork_cgroup_current(void)
+{
+	return fork_cgroup_task(current_task);
+}
+
+static struct cgroup_subsys_state *
+fork_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+	struct cgroup_fork *t = kzalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock_init(&t->lock);
+
+	t->remaining = -1;
+
+	return &t->css;
+}
+
+static void
+fork_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+	struct cgroup_fork *t = fork_cgroup_group(cgroup);
+
+	kfree(t);
+}
+
+static s64
+fork_cgroup_remaining_read(struct cgroup *cgroup, struct cftype *cft)
+{
+	struct cgroup_fork *t = fork_cgroup_group(cgroup);
+	int value;
+
+	spin_lock(&t->lock);
+	value = t->remaining;
+	spin_unlock(&t->lock);
+
+	return value;
+}
+
+static int
+fork_cgroup_remaining_write(struct cgroup *cgroup, struct cftype *cft,
+			    s64 value)
+{
+	struct cgroup_fork *t = fork_cgroup_group(cgroup);
+
+	if (value < -1 || value > (1L << 30))
+		return -EINVAL;
+
+	spin_lock(&t->lock);
+	t->remaining = (int)value;
+	spin_unlock(&t->lock);
+
+	return 0;
+}
+
+static const struct cftype fork_cgroup_files[] =  {
+	{
+		.name = "remaining",
+		.read_s64 = fork_cgroup_remaining_read,
+		.write_s64 = fork_cgroup_remaining_write,
+	},
+};
+
+static int
+fork_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+	if (cgroup->parent == NULL)
+		/* cannot limit the root cgroup */
+		return 0;
+
+	return cgroup_add_files(cgroup, ss, fork_cgroup_files,
+				ARRAY_SIZE(fork_cgroup_files));
+}
+
+struct cgroup_subsys fork_subsys = {
+	.name = "fork",
+	.create = fork_cgroup_create,
+	.destroy = fork_cgroup_destroy,
+	.populate = fork_cgroup_populate,
+	.subsys_id = fork_subsys_id,
+};
+
+/**
+ * After a failure, restore the "remaining" counter in all cgroups
+ * from the task_current's one up to the failed one.
+ */
+static void
+fork_cgroup_restore(struct cgroup_fork *until_excluding)
+{
+	struct cgroup_fork *t;
+
+	for (t = fork_cgroup_current(); t != until_excluding;
+	     t = fork_cgroup_group(t->css.cgroup->parent)) {
+		spin_lock(&t->lock);
+
+		if (t->remaining >= 0)
+			++t->remaining;
+
+		spin_unlock(&t->lock);
+	}
+}
+
+int
+fork_cgroup_pre_fork(void)
+{
+	struct cgroup_fork *t;
+	int err = 0;
+
+	rcu_read_lock();
+
+	for (t = fork_cgroup_current(); t->css.cgroup->parent != NULL;
+	     t = fork_cgroup_group(t->css.cgroup->parent)) {
+		spin_lock(&t->lock);
+
+		if (t->remaining > 0)
+			/* decrement the counter */
+			--t->remaining;
+		else if (t->remaining == 0) {
+			/* fork manpage: "[...] RLIMIT_NPROC resource
+			   limit was encountered." - should be close
+			   enough to this condition */
+			spin_unlock(&t->lock);
+			err = -EAGAIN;
+
+			/* restore the decremented counters */
+			fork_cgroup_restore(t);
+			break;
+		}
+
+		spin_unlock(&t->lock);
+	}
+
+	rcu_read_unlock();
+
+	return err;
+}
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e4291..0f06202 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -32,6 +32,7 @@
 #include <linux/capability.h>
 #include <linux/cpu.h>
 #include <linux/cgroup.h>
+#include <linux/cgroup_fork.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
 #include <linux/swap.h>
@@ -1024,6 +1025,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 				current->signal->flags & SIGNAL_UNKILLABLE)
 		return ERR_PTR(-EINVAL);
 
+	retval = fork_cgroup_pre_fork();
+	if (retval)
+		goto fork_out;
+
 	retval = security_task_create(clone_flags);
 	if (retval)
 		goto fork_out;

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-02-17 13:31 Max Kellermann
  2011-02-17 13:50 ` KAMEZAWA Hiroyuki
@ 2011-02-18  0:59 ` Paul Menage
  2011-02-18  9:26   ` Max Kellermann
  1 sibling, 1 reply; 31+ messages in thread
From: Paul Menage @ 2011-02-18  0:59 UTC (permalink / raw)
  To: Max Kellermann; +Cc: lizf, containers, linux-kernel

On Thu, Feb 17, 2011 at 5:31 AM, Max Kellermann <mk@cm4all.com> wrote:
> Can limit the number of fork()/clone() calls in a cgroup.  It is
> useful as a safeguard against fork bombs.

I'd be inclined to simplify this a bit - avoid impacting the fork()
path twice, with cgroup_fork_pre_fork() and cgroup_fork_fork() and
just do the checks and decrements in a single pass. (In the event of
hitting a limit, you may need to make another partial pass up the tree
to restore the charged fork attempts)

Yes, it's true that you might charge for a fork() that later failed
for some other reason, but this will very rare (except on a machine
that's already screwed for other reasons) so that I don't think anyone
would complain about it. Especially if you explicitly document
"fork.remaining" as number of permitted "fork attempts".

Also, it would be slightly clearer to use fork_cgroup_* rather than
cgroup_fork_* - this makes it clearer that it's part of a cgroups
subsystem called "fork", rather than part of the cgroups core
framework.

I don't think that you need to make your spinlock IRQ-safe - AFAICS
nothing accesses it from the interrupt path.

Paul

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-02-17 13:50 ` KAMEZAWA Hiroyuki
@ 2011-02-17 14:09   ` Max Kellermann
  0 siblings, 0 replies; 31+ messages in thread
From: Max Kellermann @ 2011-02-17 14:09 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: menage, lizf, containers, linux-kernel

On 2011/02/17 14:50, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> I wonder allowing to set the limit to Root cgroup may imply the system death.
> How about disabling to set value to Root cgroup ?

That is taken care of already:

> > +static int
> > +cgroup_fork_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
> > +{
> > +	if (cgroup->parent == NULL)
> > +		/* cannot limit the root cgroup */
> > +		return 0;

The attribute simply doesn't exist in the root cgroup.

Also watch the loop condition in cgroup_fork_pre_fork() closely, the
root cgroup isn't checked (even if you could find a way to configure
it):

> > +	t = cgroup_fork_current();
> > +	while (t->css.cgroup->parent != NULL && err == 0) {


> IIRC, fork()'s error code is EAGAIN or ENOMEM. The exisiting limit of
> rlimit() returns EAGAIN.
> 
> How about -EAGAIN here ? I think it's not good to add new error code for
> system calls.

EPERM seemed appropriate to me, because the administrator disallows
more than N forks.  If there are practical reasons for changing it to
EAGAIN or ENOMEM, I'm ok with that.  Thanks for the hint.

Max

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] new cgroup controller "fork"
  2011-02-17 13:31 Max Kellermann
@ 2011-02-17 13:50 ` KAMEZAWA Hiroyuki
  2011-02-17 14:09   ` Max Kellermann
  2011-02-18  0:59 ` Paul Menage
  1 sibling, 1 reply; 31+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-02-17 13:50 UTC (permalink / raw)
  To: Max Kellermann; +Cc: menage, lizf, containers, linux-kernel

On Thu, 17 Feb 2011 14:31:52 +0100
Max Kellermann <mk@cm4all.com> wrote:

> Can limit the number of fork()/clone() calls in a cgroup.  It is
> useful as a safeguard against fork bombs.
> 

brief comments below.


> Signed-off-by: Max Kellermann <mk@cm4all.com>
<snip>

> +static int
> +cgroup_fork_remaining_write(struct cgroup *cgroup, struct cftype *cft,
> +			    s64 value)
> +{
> +	struct cgroup_fork *t = cgroup_fork_group(cgroup);
> +
> +	if (value < -1 || value > (1L << 30))
> +		return -EINVAL;
> +
> +	spin_lock_irq(&t->lock);
> +	t->remaining = (int)value;
> +	spin_unlock_irq(&t->lock);
> +
> +	return 0;
> +}

I wonder allowing to set the limit to Root cgroup may imply the system death.
How about disabling to set value to Root cgroup ?


> +
> +static const struct cftype cgroup_fork_files[] =  {
> +	{
> +		.name = "remaining",
> +		.read_s64 = cgroup_fork_remaining_read,
> +		.write_s64 = cgroup_fork_remaining_write,
> +	},
> +};
> +
> +static int
> +cgroup_fork_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
> +{
> +	if (cgroup->parent == NULL)
> +		/* cannot limit the root cgroup */
> +		return 0;
> +
> +	return cgroup_add_files(cgroup, ss, cgroup_fork_files,
> +				ARRAY_SIZE(cgroup_fork_files));
> +}
> +
> +struct cgroup_subsys fork_subsys = {
> +	.name = "fork",
> +	.create = cgroup_fork_create,
> +	.destroy = cgroup_fork_destroy,
> +	.fork = cgroup_fork_fork,
> +	.populate = cgroup_fork_populate,
> +	.subsys_id = fork_subsys_id,
> +};
> +
> +int
> +cgroup_fork_pre_fork(void)
> +{
> +	struct cgroup_fork *t;
> +	int err = 0;
> +
> +	rcu_read_lock();
> +
> +	t = cgroup_fork_current();
> +	while (t->css.cgroup->parent != NULL && err == 0) {
> +		spin_lock_irq(&t->lock);
> +
> +		if (t->remaining == 0)
> +			err = -EPERM;

IIRC, fork()'s error code is EAGAIN or ENOMEM. The exisiting limit of
rlimit() returns EAGAIN.

How about -EAGAIN here ? I think it's not good to add new error code for
system calls.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH] new cgroup controller "fork"
@ 2011-02-17 13:31 Max Kellermann
  2011-02-17 13:50 ` KAMEZAWA Hiroyuki
  2011-02-18  0:59 ` Paul Menage
  0 siblings, 2 replies; 31+ messages in thread
From: Max Kellermann @ 2011-02-17 13:31 UTC (permalink / raw)
  To: menage, lizf, containers; +Cc: linux-kernel

Can limit the number of fork()/clone() calls in a cgroup.  It is
useful as a safeguard against fork bombs.

Signed-off-by: Max Kellermann <mk@cm4all.com>
---
 Documentation/cgroups/fork.txt |   30 +++++++
 include/linux/cgroup_fork.h    |   26 ++++++
 include/linux/cgroup_subsys.h  |    6 +
 init/Kconfig                   |    6 +
 kernel/Makefile                |    1 
 kernel/cgroup_fork.c           |  180 ++++++++++++++++++++++++++++++++++++++++
 kernel/fork.c                  |    5 +
 7 files changed, 254 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/cgroups/fork.txt
 create mode 100644 include/linux/cgroup_fork.h
 create mode 100644 kernel/cgroup_fork.c

diff --git a/Documentation/cgroups/fork.txt b/Documentation/cgroups/fork.txt
new file mode 100644
index 0000000..dfbf291
--- /dev/null
+++ b/Documentation/cgroups/fork.txt
@@ -0,0 +1,30 @@
+The "fork" Controller
+---------------------
+
+The "fork" controller limits the number of times a new child process
+or thread can be created.  It maintains a per-group counter which gets
+decremented on each fork() / clone().  When the counter reaches zero,
+no process in the cgroup is allowed to create new child
+processes/threads, even if existing ones quit.
+
+This has been proven useful in a shared hosting environment.  A new
+temporary cgroup is created for each CGI process, and the maximum fork
+count is configured to a sensible value.  Since CGIs are expected to
+run for only a short time with predictable resource usage, this may be
+an appropriate tool to limit the damage that a freaked CGI can do.
+
+Initially, the counter is set to -1, which is a magic value for
+"disabled" - no limits are imposed on the processes in the group.  To
+set a new value, type (in the working directory of that control
+group):
+
+ echo 16 > fork.remaining
+
+This examples allows 16 forks in the control group.  0 means no
+further forks are allowed.  The limit may be lowered or increased or
+even disabled at any time by a process with write permissions to the
+attribute.
+
+To check if a fork is allowed, the controller walks the cgroup
+hierarchy up, and verifies all ancestors.  The counter of all
+ancestors is decreased.
diff --git a/include/linux/cgroup_fork.h b/include/linux/cgroup_fork.h
new file mode 100644
index 0000000..4ac66b3
--- /dev/null
+++ b/include/linux/cgroup_fork.h
@@ -0,0 +1,26 @@
+#ifndef _LINUX_CGROUP_FORK_H
+#define _LINUX_CGROUP_FORK_H
+
+#ifdef CONFIG_CGROUP_FORK
+
+/**
+ * Checks if another fork is allowed.  Call this before creating a new
+ * child process.
+ *
+ * @return 0 on success, a negative errno value if forking should be
+ * denied
+ */
+int
+cgroup_fork_pre_fork(void);
+
+#else /* !CONFIG_CGROUP_FORK */
+
+static inline int
+cgroup_fork_pre_fork(void)
+{
+	return 0;
+}
+
+#endif /* !CONFIG_CGROUP_FORK */
+
+#endif /* !_LINUX_CGROUP_FORK_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ccefff0..8ead7f9 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -66,3 +66,9 @@ SUBSYS(blkio)
 #endif
 
 /* */
+
+#ifdef CONFIG_CGROUP_FORK
+SUBSYS(fork)
+#endif
+
+/* */
diff --git a/init/Kconfig b/init/Kconfig
index 17e2cfb..ef53a85 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -596,6 +596,12 @@ config CGROUP_FREEZER
 	  Provides a way to freeze and unfreeze all tasks in a
 	  cgroup.
 
+config CGROUP_FORK
+	bool "fork controller for cgroups"
+	help
+	  Limits the number of fork() calls in a cgroup.  An application
+	  for this is to make a cgroup safe against fork bombs.
+
 config CGROUP_DEVICE
 	bool "Device controller for cgroups"
 	help
diff --git a/kernel/Makefile b/kernel/Makefile
index 353d3fe..b58cc01 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -61,6 +61,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_FORK) += cgroup_fork.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
 obj-$(CONFIG_UTS_NS) += utsname.o
diff --git a/kernel/cgroup_fork.c b/kernel/cgroup_fork.c
new file mode 100644
index 0000000..24c4b16
--- /dev/null
+++ b/kernel/cgroup_fork.c
@@ -0,0 +1,180 @@
+/*
+ *  A cgroup implementation which limits the number of fork() calls.
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#include <linux/cgroup.h>
+#include <linux/cgroup_fork.h>
+#include <linux/slab.h>
+
+struct cgroup_fork {
+	struct cgroup_subsys_state css;
+
+	/** protect the "remaining" attribute */
+	spinlock_t lock;
+
+	/**
+	 * The remaining number of forks allowed.  -1 is the magic
+	 * value for "unlimited".
+	 */
+	int remaining;
+};
+
+/**
+ * Get the #cgrou_fork instance of the specified #cgroup.
+ */
+static inline struct cgroup_fork *
+cgroup_fork_group(struct cgroup *cgroup)
+{
+	return container_of(cgroup_subsys_state(cgroup, fork_subsys_id),
+			    struct cgroup_fork, css);
+}
+
+/**
+ * Get the #cgroup_fork instance of the specified task.
+ */
+static inline struct cgroup_fork *
+cgroup_fork_task(struct task_struct *task)
+{
+	return container_of(task_subsys_state(current_task, fork_subsys_id),
+			    struct cgroup_fork, css);
+}
+
+/**
+ * Get the #cgroup_fork instance of the current task.
+ */
+static inline struct cgroup_fork *
+cgroup_fork_current(void)
+{
+	return cgroup_fork_task(current_task);
+}
+
+static struct cgroup_subsys_state *
+cgroup_fork_create(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+	struct cgroup_fork *t = kzalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock_init(&t->lock);
+
+	t->remaining = -1;
+
+	return &t->css;
+}
+
+static void
+cgroup_fork_destroy(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+	struct cgroup_fork *t = cgroup_fork_group(cgroup);
+
+	kfree(t);
+}
+
+static void
+cgroup_fork_fork(struct cgroup_subsys *ss, struct task_struct *task)
+{
+	struct cgroup_fork *t;
+
+	rcu_read_lock();
+
+	/* decrement the counters in the cgroup and all of its
+	   ancestors (except for the root cgroup) */
+
+	t = cgroup_fork_current();
+	while (t->css.cgroup->parent != NULL) {
+		spin_lock_irq(&t->lock);
+		if (t->remaining > 0)
+			--t->remaining;
+		spin_unlock_irq(&t->lock);
+
+		t = cgroup_fork_group(t->css.cgroup->parent);
+	}
+
+	rcu_read_unlock();
+}
+
+static s64
+cgroup_fork_remaining_read(struct cgroup *cgroup, struct cftype *cft)
+{
+	struct cgroup_fork *t = cgroup_fork_group(cgroup);
+	int value;
+
+	spin_lock_irq(&t->lock);
+	value = t->remaining;
+	spin_unlock_irq(&t->lock);
+
+	return value;
+}
+
+static int
+cgroup_fork_remaining_write(struct cgroup *cgroup, struct cftype *cft,
+			    s64 value)
+{
+	struct cgroup_fork *t = cgroup_fork_group(cgroup);
+
+	if (value < -1 || value > (1L << 30))
+		return -EINVAL;
+
+	spin_lock_irq(&t->lock);
+	t->remaining = (int)value;
+	spin_unlock_irq(&t->lock);
+
+	return 0;
+}
+
+static const struct cftype cgroup_fork_files[] =  {
+	{
+		.name = "remaining",
+		.read_s64 = cgroup_fork_remaining_read,
+		.write_s64 = cgroup_fork_remaining_write,
+	},
+};
+
+static int
+cgroup_fork_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+	if (cgroup->parent == NULL)
+		/* cannot limit the root cgroup */
+		return 0;
+
+	return cgroup_add_files(cgroup, ss, cgroup_fork_files,
+				ARRAY_SIZE(cgroup_fork_files));
+}
+
+struct cgroup_subsys fork_subsys = {
+	.name = "fork",
+	.create = cgroup_fork_create,
+	.destroy = cgroup_fork_destroy,
+	.fork = cgroup_fork_fork,
+	.populate = cgroup_fork_populate,
+	.subsys_id = fork_subsys_id,
+};
+
+int
+cgroup_fork_pre_fork(void)
+{
+	struct cgroup_fork *t;
+	int err = 0;
+
+	rcu_read_lock();
+
+	t = cgroup_fork_current();
+	while (t->css.cgroup->parent != NULL && err == 0) {
+		spin_lock_irq(&t->lock);
+
+		if (t->remaining == 0)
+			err = -EPERM;
+
+		spin_unlock_irq(&t->lock);
+
+		t = cgroup_fork_group(t->css.cgroup->parent);
+	}
+
+	rcu_read_unlock();
+
+	return err;
+}
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e4291..35836e6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -32,6 +32,7 @@
 #include <linux/capability.h>
 #include <linux/cpu.h>
 #include <linux/cgroup.h>
+#include <linux/cgroup_fork.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
 #include <linux/swap.h>
@@ -1024,6 +1025,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 				current->signal->flags & SIGNAL_UNKILLABLE)
 		return ERR_PTR(-EINVAL);
 
+	retval = cgroup_fork_pre_fork();
+	if (retval)
+		goto fork_out;
+
 	retval = security_task_create(clone_flags);
 	if (retval)
 		goto fork_out;


^ permalink raw reply related	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2011-11-04 13:59 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-11-03 16:22 [PATCH] new cgroup controller "fork" Max Kellermann
2011-11-03 16:43 ` Frederic Weisbecker
2011-11-03 17:16   ` Max Kellermann
2011-11-03 17:26     ` Glauber Costa
2011-11-03 17:48       ` Max Kellermann
2011-11-03 17:50         ` Glauber Costa
2011-11-03 18:30           ` Max Kellermann
2011-11-03 18:34             ` Glauber Costa
2011-11-03 16:43 ` Glauber Costa
2011-11-03 16:59   ` Max Kellermann
2011-11-03 17:05     ` Frederic Weisbecker
2011-11-03 18:21     ` Alan Cox
2011-11-03 18:51       ` Max Kellermann
2011-11-03 18:56         ` Glauber Costa
2011-11-03 20:08           ` Matt Helsley
2011-11-03 19:03         ` Alan Cox
2011-11-03 19:20           ` Max Kellermann
2011-11-03 19:25             ` Glauber Costa
2011-11-03 20:13               ` Brian K. White
2011-11-03 21:54                 ` Glauber Costa
2011-11-04  3:03                   ` Li Zefan
2011-11-04  4:37                     ` KAMEZAWA Hiroyuki
2011-11-04 13:11                     ` Glauber Costa
2011-11-04 13:38                       ` Max Kellermann
2011-11-04 13:59                     ` Lennart Poettering
2011-11-03 17:31 ` richard -rw- weinberger
  -- strict thread matches above, loose matches on Subject: below --
2011-02-17 13:31 Max Kellermann
2011-02-17 13:50 ` KAMEZAWA Hiroyuki
2011-02-17 14:09   ` Max Kellermann
2011-02-18  0:59 ` Paul Menage
2011-02-18  9:26   ` Max Kellermann

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).