linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v12 0/8] cgroup: add pids subsystem
@ 2015-05-18 14:50 Aleksa Sarai
  2015-05-18 14:51 ` [PATCH v12 1/8] cgroup: switch to unsigned long for bitmasks Aleksa Sarai
                   ` (7 more replies)
  0 siblings, 8 replies; 27+ messages in thread
From: Aleksa Sarai @ 2015-05-18 14:50 UTC (permalink / raw)
  To: tj, lizefan, mingo, peterz
  Cc: richard, fweisbec, linux-kernel, cgroups, Aleksa Sarai

This is a small update to v11 of the pids patchset[1], including:

* Fix up (incorrect) commit messages.
* Remove defunct code.
* Modify subsys_canfork_private{,p} signatures to use
  (void *[SUBSYS_CANFORK_COUNT]).

[1]: https://lkml.org/lkml/2015/5/16/1

Aleksa Sarai (7):
  cgroup: switch to unsigned long for bitmasks
  cgroup: use bitmask to filter for_each_subsys
  cgroup: replace explicit ss_mask checking with for_each_subsys_which
  cgroup: move enum cgroup_subsys_id definition
  cgroup: allow a cgroup subsystem to reject a fork
  cgroup: add a tset_get_css macro
  cgroup: implement the PIDs subsystem

Tejun Heo (1):
  cgroup, block: implement task_get_css() and use it in
    bio_associate_current()

 CREDITS                       |   5 +
 block/bio.c                   |  11 +-
 include/linux/cgroup.h        |  80 ++++++++--
 include/linux/cgroup_subsys.h |  22 +++
 init/Kconfig                  |  16 ++
 kernel/Makefile               |   1 +
 kernel/cgroup.c               | 216 +++++++++++++++++--------
 kernel/cgroup_freezer.c       |   2 +-
 kernel/cgroup_pids.c          | 355 ++++++++++++++++++++++++++++++++++++++++++
 kernel/fork.c                 |  17 +-
 kernel/sched/core.c           |   2 +-
 11 files changed, 633 insertions(+), 94 deletions(-)
 create mode 100644 kernel/cgroup_pids.c

-- 
2.4.1


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v12 1/8] cgroup: switch to unsigned long for bitmasks
  2015-05-18 14:50 [PATCH v12 0/8] cgroup: add pids subsystem Aleksa Sarai
@ 2015-05-18 14:51 ` Aleksa Sarai
  2015-05-18 14:51 ` [PATCH v12 2/8] cgroup: use bitmask to filter for_each_subsys Aleksa Sarai
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 27+ messages in thread
From: Aleksa Sarai @ 2015-05-18 14:51 UTC (permalink / raw)
  To: tj, lizefan, mingo, peterz
  Cc: richard, fweisbec, linux-kernel, cgroups, Aleksa Sarai

Switch the type of all internal cgroup masks to (unsigned long), which
is the correct type for bitmasks. This is in preparation for the
for_each_subsys_which patch.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 kernel/cgroup.c | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 469dd54..15896ed 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -156,7 +156,7 @@ static bool cgrp_dfl_root_visible;
 static bool cgroup_legacy_files_on_dfl;
 
 /* some controllers are not supported in the default hierarchy */
-static unsigned int cgrp_dfl_root_inhibit_ss_mask;
+static unsigned long cgrp_dfl_root_inhibit_ss_mask;
 
 /* The list of hierarchy roots */
 
@@ -186,7 +186,7 @@ static struct cftype cgroup_dfl_base_files[];
 static struct cftype cgroup_legacy_base_files[];
 
 static int rebind_subsystems(struct cgroup_root *dst_root,
-			     unsigned int ss_mask);
+			     unsigned long ss_mask);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 		      bool visible);
@@ -998,7 +998,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * update of a tasks cgroup pointer by cgroup_attach_task()
  */
 
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
 static const struct file_operations proc_cgroupstats_operations;
 
@@ -1068,11 +1068,11 @@ static void cgroup_put(struct cgroup *cgrp)
  * @subtree_control is to be applied to @cgrp.  The returned mask is always
  * a superset of @subtree_control and follows the usual hierarchy rules.
  */
-static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
-						  unsigned int subtree_control)
+static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
+						  unsigned long subtree_control)
 {
 	struct cgroup *parent = cgroup_parent(cgrp);
-	unsigned int cur_ss_mask = subtree_control;
+	unsigned long cur_ss_mask = subtree_control;
 	struct cgroup_subsys *ss;
 	int ssid;
 
@@ -1082,7 +1082,7 @@ static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
 		return cur_ss_mask;
 
 	while (true) {
-		unsigned int new_ss_mask = cur_ss_mask;
+		unsigned long new_ss_mask = cur_ss_mask;
 
 		for_each_subsys(ss, ssid)
 			if (cur_ss_mask & (1 << ssid))
@@ -1200,7 +1200,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
  * @cgrp: target cgroup
  * @subsys_mask: mask of the subsystem ids whose files should be removed
  */
-static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 {
 	struct cgroup_subsys *ss;
 	int i;
@@ -1215,10 +1215,11 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
 	}
 }
 
-static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
+static int rebind_subsystems(struct cgroup_root *dst_root,
+			     unsigned long ss_mask)
 {
 	struct cgroup_subsys *ss;
-	unsigned int tmp_ss_mask;
+	unsigned long tmp_ss_mask;
 	int ssid, i, ret;
 
 	lockdep_assert_held(&cgroup_mutex);
@@ -1253,7 +1254,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
 		 * Just warn about it and continue.
 		 */
 		if (cgrp_dfl_root_visible) {
-			pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
+			pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
 				ret, ss_mask);
 			pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
 		}
@@ -1338,7 +1339,7 @@ static int cgroup_show_options(struct seq_file *seq,
 }
 
 struct cgroup_sb_opts {
-	unsigned int subsys_mask;
+	unsigned long subsys_mask;
 	unsigned int flags;
 	char *release_agent;
 	bool cpuset_clone_children;
@@ -1351,7 +1352,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
 	char *token, *o = data;
 	bool all_ss = false, one_ss = false;
-	unsigned int mask = -1U;
+	unsigned long mask = -1UL;
 	struct cgroup_subsys *ss;
 	int nr_opts = 0;
 	int i;
@@ -1495,7 +1496,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
 	int ret = 0;
 	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
 	struct cgroup_sb_opts opts;
-	unsigned int added_mask, removed_mask;
+	unsigned long added_mask, removed_mask;
 
 	if (root == &cgrp_dfl_root) {
 		pr_err("remount is not allowed\n");
@@ -1641,7 +1642,7 @@ static void init_cgroup_root(struct cgroup_root *root,
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }
 
-static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
+static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 {
 	LIST_HEAD(tmp_links);
 	struct cgroup *root_cgrp = &root->cgrp;
@@ -2542,7 +2543,7 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
+static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask)
 {
 	struct cgroup_subsys *ss;
 	bool printed = false;
@@ -2689,8 +2690,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 					    char *buf, size_t nbytes,
 					    loff_t off)
 {
-	unsigned int enable = 0, disable = 0;
-	unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
+	unsigned long enable = 0, disable = 0;
+	unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
 	struct cgroup *cgrp, *child;
 	struct cgroup_subsys *ss;
 	char *tok;
@@ -4322,7 +4323,7 @@ static struct cftype cgroup_legacy_base_files[] = {
  *
  * On failure, no file is added.
  */
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 {
 	struct cgroup_subsys *ss;
 	int i, ret = 0;
-- 
2.4.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v12 2/8] cgroup: use bitmask to filter for_each_subsys
  2015-05-18 14:50 [PATCH v12 0/8] cgroup: add pids subsystem Aleksa Sarai
  2015-05-18 14:51 ` [PATCH v12 1/8] cgroup: switch to unsigned long for bitmasks Aleksa Sarai
@ 2015-05-18 14:51 ` Aleksa Sarai
  2015-05-18 14:51 ` [PATCH v12 3/8] cgroup: replace explicit ss_mask checking with for_each_subsys_which Aleksa Sarai
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 27+ messages in thread
From: Aleksa Sarai @ 2015-05-18 14:51 UTC (permalink / raw)
  To: tj, lizefan, mingo, peterz
  Cc: richard, fweisbec, linux-kernel, cgroups, Aleksa Sarai

Add a new macro for_each_subsys_which that allows all enabled cgroup
subsystems to be filtered by a bitmask, such that mask & (1 << ssid)
determines if the subsystem is to be processed in the loop body (where
ssid is the unique id of the subsystem).

Also replace the need_forkexit_callback with two separate bitmasks for
each callback to make (ss->{fork,exit}) checks unnecessary.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 kernel/cgroup.c | 51 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 15896ed..d66fb20 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -175,12 +175,13 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
  */
 static u64 css_serial_nr_next = 1;
 
-/* This flag indicates whether tasks in the fork and exit paths should
- * check for fork/exit handlers to call. This avoids us having to do
- * extra work in the fork/exit path if none of the subsystems need to
- * be called.
+/*
+ * These bitmask flags indicate whether tasks in the fork and exit paths have
+ * fork/exit handlers to call. This avoids us having to do extra work in the
+ * fork/exit path to check which subsystems have fork/exit callbacks.
  */
-static int need_forkexit_callback __read_mostly;
+static unsigned long have_fork_callback __read_mostly;
+static unsigned long have_exit_callback __read_mostly;
 
 static struct cftype cgroup_dfl_base_files[];
 static struct cftype cgroup_legacy_base_files[];
@@ -409,6 +410,22 @@ static int notify_on_release(const struct cgroup *cgrp)
 	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
 	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
 
+
+/**
+ * for_each_subsys_which - filter for_each_subsys with a bitmask
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ * @ss_maskp: a pointer to the bitmask
+ *
+ * The block will only run for cases where the ssid-th bit (1 << ssid) of
+ * mask is set to 1.
+ */
+#define for_each_subsys_which(ss, ssid, ss_maskp)			\
+	for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT)		\
+		if (((ss) = cgroup_subsys[ssid]) && false)		\
+			;						\
+		else
+
 /* iterate across the hierarchies */
 #define for_each_root(root)						\
 	list_for_each_entry((root), &cgroup_roots, root_list)
@@ -4932,7 +4949,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
 	 * init_css_set is in the subsystem's root cgroup. */
 	init_css_set.subsys[ss->id] = css;
 
-	need_forkexit_callback |= ss->fork || ss->exit;
+	have_fork_callback |= (bool)ss->fork << ss->id;
+	have_exit_callback |= (bool)ss->exit << ss->id;
 
 	/* At system boot, before all subsystems have been
 	 * registered, no tasks have been forked, so we don't
@@ -5242,11 +5260,8 @@ void cgroup_post_fork(struct task_struct *child)
 	 * css_set; otherwise, @child might change state between ->fork()
 	 * and addition to css_set.
 	 */
-	if (need_forkexit_callback) {
-		for_each_subsys(ss, i)
-			if (ss->fork)
-				ss->fork(child);
-	}
+	for_each_subsys_which(ss, i, &have_fork_callback)
+		ss->fork(child);
 }
 
 /**
@@ -5290,16 +5305,12 @@ void cgroup_exit(struct task_struct *tsk)
 	cset = task_css_set(tsk);
 	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
 
-	if (need_forkexit_callback) {
-		/* see cgroup_post_fork() for details */
-		for_each_subsys(ss, i) {
-			if (ss->exit) {
-				struct cgroup_subsys_state *old_css = cset->subsys[i];
-				struct cgroup_subsys_state *css = task_css(tsk, i);
+	/* see cgroup_post_fork() for details */
+	for_each_subsys_which(ss, i, &have_exit_callback) {
+		struct cgroup_subsys_state *old_css = cset->subsys[i];
+		struct cgroup_subsys_state *css = task_css(tsk, i);
 
-				ss->exit(css, old_css, tsk);
-			}
-		}
+		ss->exit(css, old_css, tsk);
 	}
 
 	if (put_cset)
-- 
2.4.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v12 3/8] cgroup: replace explicit ss_mask checking with for_each_subsys_which
  2015-05-18 14:50 [PATCH v12 0/8] cgroup: add pids subsystem Aleksa Sarai
  2015-05-18 14:51 ` [PATCH v12 1/8] cgroup: switch to unsigned long for bitmasks Aleksa Sarai
  2015-05-18 14:51 ` [PATCH v12 2/8] cgroup: use bitmask to filter for_each_subsys Aleksa Sarai
@ 2015-05-18 14:51 ` Aleksa Sarai
  2015-05-18 14:51 ` [PATCH v12 4/8] cgroup, block: implement task_get_css() and use it in bio_associate_current() Aleksa Sarai
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 27+ messages in thread
From: Aleksa Sarai @ 2015-05-18 14:51 UTC (permalink / raw)
  To: tj, lizefan, mingo, peterz
  Cc: richard, fweisbec, linux-kernel, cgroups, Aleksa Sarai

Replace the explicit checking against ss_masks inside a for_each_subsys
block with for_each_subsys_which(..., ss_mask), to take advantage of the
more readable macro.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 kernel/cgroup.c | 44 ++++++++++++++++----------------------------
 1 file changed, 16 insertions(+), 28 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d66fb20..a9dfdf3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1101,9 +1101,8 @@ static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
 	while (true) {
 		unsigned long new_ss_mask = cur_ss_mask;
 
-		for_each_subsys(ss, ssid)
-			if (cur_ss_mask & (1 << ssid))
-				new_ss_mask |= ss->depends_on;
+		for_each_subsys_which(ss, ssid, &cur_ss_mask)
+			new_ss_mask |= ss->depends_on;
 
 		/*
 		 * Mask out subsystems which aren't available.  This can
@@ -1241,10 +1240,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	for_each_subsys(ss, ssid) {
-		if (!(ss_mask & (1 << ssid)))
-			continue;
-
+	for_each_subsys_which(ss, ssid, &ss_mask) {
 		/* if @ss has non-root csses attached to it, can't move */
 		if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
 			return -EBUSY;
@@ -1281,18 +1277,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
 	 * Nothing can fail from this point on.  Remove files for the
 	 * removed subsystems and rebind each subsystem.
 	 */
-	for_each_subsys(ss, ssid)
-		if (ss_mask & (1 << ssid))
-			cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
+	for_each_subsys_which(ss, ssid, &ss_mask)
+		cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
 
-	for_each_subsys(ss, ssid) {
+	for_each_subsys_which(ss, ssid, &ss_mask) {
 		struct cgroup_root *src_root;
 		struct cgroup_subsys_state *css;
 		struct css_set *cset;
 
-		if (!(ss_mask & (1 << ssid)))
-			continue;
-
 		src_root = ss->root;
 		css = cgroup_css(&src_root->cgrp, ss);
 
@@ -2566,13 +2558,11 @@ static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask)
 	bool printed = false;
 	int ssid;
 
-	for_each_subsys(ss, ssid) {
-		if (ss_mask & (1 << ssid)) {
-			if (printed)
-				seq_putc(seq, ' ');
-			seq_printf(seq, "%s", ss->name);
-			printed = true;
-		}
+	for_each_subsys_which(ss, ssid, &ss_mask) {
+		if (printed)
+			seq_putc(seq, ' ');
+		seq_printf(seq, "%s", ss->name);
+		printed = true;
 	}
 	if (printed)
 		seq_putc(seq, '\n');
@@ -2720,11 +2710,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 	 */
 	buf = strstrip(buf);
 	while ((tok = strsep(&buf, " "))) {
+		unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask;
+
 		if (tok[0] == '\0')
 			continue;
-		for_each_subsys(ss, ssid) {
-			if (ss->disabled || strcmp(tok + 1, ss->name) ||
-			    ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
+		for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
+			if (ss->disabled || strcmp(tok + 1, ss->name))
 				continue;
 
 			if (*tok == '+') {
@@ -2811,10 +2802,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 	 * still around.  In such cases, wait till it's gone using
 	 * offline_waitq.
 	 */
-	for_each_subsys(ss, ssid) {
-		if (!(css_enable & (1 << ssid)))
-			continue;
-
+	for_each_subsys_which(ss, ssid, &css_enable) {
 		cgroup_for_each_live_child(child, cgrp) {
 			DEFINE_WAIT(wait);
 
-- 
2.4.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v12 4/8] cgroup, block: implement task_get_css() and use it in bio_associate_current()
  2015-05-18 14:50 [PATCH v12 0/8] cgroup: add pids subsystem Aleksa Sarai
                   ` (2 preceding siblings ...)
  2015-05-18 14:51 ` [PATCH v12 3/8] cgroup: replace explicit ss_mask checking with for_each_subsys_which Aleksa Sarai
@ 2015-05-18 14:51 ` Aleksa Sarai
  2015-05-18 21:59   ` Tejun Heo
  2015-05-18 14:51 ` [PATCH v12 5/8] cgroup: move enum cgroup_subsys_id definition Aleksa Sarai
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 27+ messages in thread
From: Aleksa Sarai @ 2015-05-18 14:51 UTC (permalink / raw)
  To: tj, lizefan, mingo, peterz
  Cc: richard, fweisbec, linux-kernel, cgroups, Jens Axboe, Vivek Goyal

From: Tejun Heo <tj@kernel.org>

bio_associate_current() currently open codes task_css() and
css_tryget_online() to find and pin $current's blkcg css.  Abstract it
into task_get_css() which is implemented from cgroup side.  As a task
is always associated with an online css for every subsystem except
while the css_set update is propagating, task_get_css() retries till
css_tryget_online() succeeds.

This is a cleanup and shouldn't lead to noticeable behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Vivek Goyal <vgoyal@redhat.com>
---
 block/bio.c            | 11 +----------
 include/linux/cgroup.h | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index f66a4ea..968683e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1987,7 +1987,6 @@ EXPORT_SYMBOL(bioset_create_nobvec);
 int bio_associate_current(struct bio *bio)
 {
 	struct io_context *ioc;
-	struct cgroup_subsys_state *css;
 
 	if (bio->bi_ioc)
 		return -EBUSY;
@@ -1996,17 +1995,9 @@ int bio_associate_current(struct bio *bio)
 	if (!ioc)
 		return -ENOENT;
 
-	/* acquire active ref on @ioc and associate */
 	get_io_context_active(ioc);
 	bio->bi_ioc = ioc;
-
-	/* associate blkcg if exists */
-	rcu_read_lock();
-	css = task_css(current, blkio_cgrp_id);
-	if (css && css_tryget_online(css))
-		bio->bi_css = css;
-	rcu_read_unlock();
-
+	bio->bi_css = task_get_css(current, blkio_cgrp_id);
 	return 0;
 }
 
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b9cb94c..e7da0aa 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -774,6 +774,31 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
 }
 
 /**
+ * task_get_css - find and get the css for (task, subsys)
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ *
+ * Find the css for the (@task, @subsys_id) combination, increment a
+ * reference on and return it.  This function is guaranteed to return a
+ * valid css.
+ */
+static inline struct cgroup_subsys_state *
+task_get_css(struct task_struct *task, int subsys_id)
+{
+	struct cgroup_subsys_state *css;
+
+	rcu_read_lock();
+	while (true) {
+		css = task_css(task, subsys_id);
+		if (likely(css_tryget_online(css)))
+			break;
+		cpu_relax();
+	}
+	rcu_read_unlock();
+	return css;
+}
+
+/**
  * task_css_is_root - test whether a task belongs to the root css
  * @task: the target task
  * @subsys_id: the target subsystem ID
-- 
2.4.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v12 5/8] cgroup: move enum cgroup_subsys_id definition
  2015-05-18 14:50 [PATCH v12 0/8] cgroup: add pids subsystem Aleksa Sarai
                   ` (3 preceding siblings ...)
  2015-05-18 14:51 ` [PATCH v12 4/8] cgroup, block: implement task_get_css() and use it in bio_associate_current() Aleksa Sarai
@ 2015-05-18 14:51 ` Aleksa Sarai
  2015-05-18 21:59   ` Tejun Heo
  2015-05-18 14:51 ` [PATCH v12 6/8] cgroup: allow a cgroup subsystem to reject a fork Aleksa Sarai
                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 27+ messages in thread
From: Aleksa Sarai @ 2015-05-18 14:51 UTC (permalink / raw)
  To: tj, lizefan, mingo, peterz
  Cc: richard, fweisbec, linux-kernel, cgroups, Aleksa Sarai

This is in preparation for implementing the pids cgroup subsystem. It is
not a functional change and should not change any behavior.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 include/linux/cgroup.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e7da0aa..35ba593 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -25,6 +25,14 @@
 
 #ifdef CONFIG_CGROUPS
 
+/* define the enumeration of all cgroup subsystems */
+#define SUBSYS(_x) _x ## _cgrp_id,
+enum cgroup_subsys_id {
+#include <linux/cgroup_subsys.h>
+	CGROUP_SUBSYS_COUNT,
+};
+#undef SUBSYS
+
 struct cgroup_root;
 struct cgroup_subsys;
 struct cgroup;
@@ -40,14 +48,6 @@ extern int cgroupstats_build(struct cgroupstats *stats,
 extern int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 			    struct pid *pid, struct task_struct *tsk);
 
-/* define the enumeration of all cgroup subsystems */
-#define SUBSYS(_x) _x ## _cgrp_id,
-enum cgroup_subsys_id {
-#include <linux/cgroup_subsys.h>
-	CGROUP_SUBSYS_COUNT,
-};
-#undef SUBSYS
-
 /*
  * Per-subsystem/per-cgroup state maintained by the system.  This is the
  * fundamental structural building block that controllers deal with.
-- 
2.4.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v12 6/8] cgroup: allow a cgroup subsystem to reject a fork
  2015-05-18 14:50 [PATCH v12 0/8] cgroup: add pids subsystem Aleksa Sarai
                   ` (4 preceding siblings ...)
  2015-05-18 14:51 ` [PATCH v12 5/8] cgroup: move enum cgroup_subsys_id definition Aleksa Sarai
@ 2015-05-18 14:51 ` Aleksa Sarai
  2015-05-18 22:12   ` Tejun Heo
  2015-05-18 14:51 ` [PATCH v12 7/8] cgroup: add a tset_get_css macro Aleksa Sarai
  2015-05-18 14:51 ` [PATCH v12 8/8] cgroup: implement the PIDs subsystem Aleksa Sarai
  7 siblings, 1 reply; 27+ messages in thread
From: Aleksa Sarai @ 2015-05-18 14:51 UTC (permalink / raw)
  To: tj, lizefan, mingo, peterz
  Cc: richard, fweisbec, linux-kernel, cgroups, Aleksa Sarai

Add a new cgroup subsystem callback can_fork that conditionally
states whether or not the fork is accepted or rejected by a cgroup
policy. In addition, add a cancel_fork callback so that if an error
occurs later in the forking process, any state modified by can_fork can
be reverted.

Allow for a private opaque pointer to be passed from the cgroup_can_fork
to cgroup_post_fork, allowing for the fork state to be stored by each
subsystem separately.

Also add a tagging system for cgroup_subsys.h to allow for CGROUP_<TAG>
enumerations to be be defined and used. Also explicitly add a
CGROUP_CANFORK_COUNT macro to make arrays easier to define.

This is in preparation for implementing the pids cgroup subsystem.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 include/linux/cgroup.h        | 27 ++++++++++++--
 include/linux/cgroup_subsys.h | 17 +++++++++
 kernel/cgroup.c               | 84 +++++++++++++++++++++++++++++++++++++++++--
 kernel/cgroup_freezer.c       |  2 +-
 kernel/fork.c                 | 17 +++++++--
 kernel/sched/core.c           |  2 +-
 6 files changed, 139 insertions(+), 10 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 35ba593..886a883 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -28,11 +28,16 @@
 /* define the enumeration of all cgroup subsystems */
 #define SUBSYS(_x) _x ## _cgrp_id,
 enum cgroup_subsys_id {
+#define SUBSYS_TAG(_t) CGROUP_ ## _t, \
+	__unused_tag_ ## _t = CGROUP_ ## _t - 1,
 #include <linux/cgroup_subsys.h>
+#undef SUBSYS_TAG
 	CGROUP_SUBSYS_COUNT,
 };
 #undef SUBSYS
 
+#define CGROUP_CANFORK_COUNT (CGROUP_CANFORK_END - CGROUP_CANFORK_START)
+
 struct cgroup_root;
 struct cgroup_subsys;
 struct cgroup;
@@ -40,7 +45,12 @@ struct cgroup;
 extern int cgroup_init_early(void);
 extern int cgroup_init(void);
 extern void cgroup_fork(struct task_struct *p);
-extern void cgroup_post_fork(struct task_struct *p);
+extern int cgroup_can_fork(struct task_struct *p,
+			   void *ss_private[CGROUP_CANFORK_COUNT]);
+extern void cgroup_cancel_fork(struct task_struct *p,
+			       void *ss_private[CGROUP_CANFORK_COUNT]);
+extern void cgroup_post_fork(struct task_struct *p,
+			     void *old_ss_private[CGROUP_CANFORK_COUNT]);
 extern void cgroup_exit(struct task_struct *p);
 extern int cgroupstats_build(struct cgroupstats *stats,
 				struct dentry *dentry);
@@ -649,7 +659,9 @@ struct cgroup_subsys {
 			      struct cgroup_taskset *tset);
 	void (*attach)(struct cgroup_subsys_state *css,
 		       struct cgroup_taskset *tset);
-	void (*fork)(struct task_struct *task);
+	int (*can_fork)(struct task_struct *task, void **privatep);
+	void (*cancel_fork)(struct task_struct *task, void *private);
+	void (*fork)(struct task_struct *task, void *private);
 	void (*exit)(struct cgroup_subsys_state *css,
 		     struct cgroup_subsys_state *old_css,
 		     struct task_struct *task);
@@ -970,10 +982,19 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
 
 struct cgroup_subsys_state;
 
+#define CGROUP_CANFORK_COUNT 0
+
 static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
 static inline void cgroup_fork(struct task_struct *p) {}
-static inline void cgroup_post_fork(struct task_struct *p) {}
+static inline int cgroup_can_fork(struct task_struct *p,
+				  void *ss_private[CGROUP_CANFORK_COUNT])
+{ return 0; }
+static inline void cgroup_cancel_fork(struct task_struct *p,
+				      void *ss_private[CGROUP_CANFORK_COUNT]) {}
+static inline void cgroup_post_fork(struct task_struct *p,
+				    void *ss_private[CGROUP_CANFORK_COUNT]) {}
+
 static inline void cgroup_exit(struct task_struct *p) {}
 
 static inline int cgroupstats_build(struct cgroupstats *stats,
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index e4a96fb..81b7bdd 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -3,6 +3,11 @@
  *
  * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
  */
+#ifndef SUBSYS_TAG
+#	define __TMP_SUBSYS_TAG
+#	define SUBSYS_TAG(_x)
+#endif
+
 #if IS_ENABLED(CONFIG_CPUSETS)
 SUBSYS(cpuset)
 #endif
@@ -48,11 +53,23 @@ SUBSYS(hugetlb)
 #endif
 
 /*
+ * Subsystems that implement the can_fork() family of callbacks.
+ */
+SUBSYS_TAG(CANFORK_START)
+SUBSYS_TAG(CANFORK_END)
+
+/*
  * The following subsystems are not supported on the default hierarchy.
  */
 #if IS_ENABLED(CONFIG_CGROUP_DEBUG)
 SUBSYS(debug)
 #endif
+
+#ifdef __TMP_SUBSYS_TAG
+#	undef __TMP_SUBSYS_TAG
+#	undef SUBSYS_TAG
+#endif
+
 /*
  * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
  */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a9dfdf3..ce54b78 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -183,6 +183,9 @@ static u64 css_serial_nr_next = 1;
 static unsigned long have_fork_callback __read_mostly;
 static unsigned long have_exit_callback __read_mostly;
 
+/* Ditto for the can_fork callback. */
+static unsigned long have_canfork_callback __read_mostly;
+
 static struct cftype cgroup_dfl_base_files[];
 static struct cftype cgroup_legacy_base_files[];
 
@@ -2324,9 +2327,10 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
 	 */
 	tset.csets = &tset.dst_csets;
 
-	for_each_e_css(css, i, cgrp)
+	for_each_e_css(css, i, cgrp) {
 		if (css->ss->attach)
 			css->ss->attach(css, &tset);
+	}
 
 	ret = 0;
 	goto out_release_tset;
@@ -4939,6 +4943,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
 
 	have_fork_callback |= (bool)ss->fork << ss->id;
 	have_exit_callback |= (bool)ss->exit << ss->id;
+	have_canfork_callback |= (bool)ss->can_fork << ss->id;
 
 	/* At system boot, before all subsystems have been
 	 * registered, no tasks have been forked, so we don't
@@ -5180,6 +5185,23 @@ static const struct file_operations proc_cgroupstats_operations = {
 	.release = single_release,
 };
 
+static void **subsys_canfork_privatep(void *ss_private[CGROUP_CANFORK_COUNT],
+				      int i)
+{
+	if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
+		return &ss_private[i - CGROUP_CANFORK_START];
+	return NULL;
+}
+
+static void *subsys_canfork_private(void *ss_private[CGROUP_CANFORK_COUNT],
+				    int i)
+{
+	void **private;
+	if ((private = subsys_canfork_privatep(ss_private, i)) != NULL)
+		return *private;
+	return NULL;
+}
+
 /**
  * cgroup_fork - initialize cgroup related fields during copy_process()
  * @child: pointer to task_struct of forking parent process.
@@ -5195,6 +5217,61 @@ void cgroup_fork(struct task_struct *child)
 }
 
 /**
+ * cgroup_can_fork - called on a new task before the process is exposed
+ * @child: the task in question.
+ *
+ * This calls the subsystem can_fork() callbacks. If the can_fork() callback
+ * returns an error, the fork aborts with that error code. This allows for
+ * a cgroup subsystem to conditionally allow or deny new forks.
+ */
+int cgroup_can_fork(struct task_struct *child,
+		    void *ss_private[CGROUP_CANFORK_COUNT])
+{
+	struct cgroup_subsys *ss;
+	int i, j, retval;
+
+	for_each_subsys_which(ss, i, &have_canfork_callback) {
+		retval = ss->can_fork(child,
+				      subsys_canfork_privatep(ss_private, i));
+		if (retval)
+			goto out_revert;
+	}
+
+	return 0;
+
+out_revert:
+	for_each_subsys(ss, j) {
+		if (j >= i)
+			break;
+
+		if (ss->cancel_fork)
+			ss->cancel_fork(child,
+					subsys_canfork_private(ss_private, j));
+	}
+
+	return retval;
+}
+
+/**
+ * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
+ * @child: the task in question
+ *
+ * This calls the cancel_fork() callbacks if a fork failed *after*
+ * cgroup_can_fork() succeded.
+ */
+void cgroup_cancel_fork(struct task_struct *child,
+			void *ss_private[CGROUP_CANFORK_COUNT])
+{
+	struct cgroup_subsys *ss;
+	int i;
+
+	for_each_subsys(ss, i)
+		if(ss->cancel_fork)
+			ss->cancel_fork(child,
+					subsys_canfork_private(ss_private, i));
+}
+
+/**
  * cgroup_post_fork - called on a new task after adding it to the task list
  * @child: the task in question
  *
@@ -5204,7 +5281,8 @@ void cgroup_fork(struct task_struct *child)
  * cgroup_task_iter_start() - to guarantee that the new task ends up on its
  * list.
  */
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child,
+		      void *old_ss_private[CGROUP_CANFORK_COUNT])
 {
 	struct cgroup_subsys *ss;
 	int i;
@@ -5249,7 +5327,7 @@ void cgroup_post_fork(struct task_struct *child)
 	 * and addition to css_set.
 	 */
 	for_each_subsys_which(ss, i, &have_fork_callback)
-		ss->fork(child);
+		ss->fork(child, subsys_canfork_private(old_ss_private, i));
 }
 
 /**
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 92b98cc..f1b30ad 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -203,7 +203,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
  * to do anything as freezer_attach() will put @task into the appropriate
  * state.
  */
-static void freezer_fork(struct task_struct *task)
+static void freezer_fork(struct task_struct *task, void *private)
 {
 	struct freezer *freezer;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 03c1eaa..7377698 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1245,6 +1245,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 {
 	int retval;
 	struct task_struct *p;
+	void *cgrp_ss_private[CGROUP_CANFORK_COUNT] = {};
 
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
@@ -1516,6 +1517,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->task_works = NULL;
 
 	/*
+	 * Ensure that the cgroup subsystem policies allow the new process to be
+	 * forked. It should be noted the the new process's css_set can be changed
+	 * between here and cgroup_post_fork() if an organisation operation is in
+	 * progress.
+	 */
+	retval = cgroup_can_fork(p, cgrp_ss_private);
+	if (retval)
+		goto bad_fork_free_pid;
+
+	/*
 	 * Make it visible to the rest of the system, but dont wake it up yet.
 	 * Need tasklist lock for parent etc handling!
 	 */
@@ -1551,7 +1562,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		spin_unlock(&current->sighand->siglock);
 		write_unlock_irq(&tasklist_lock);
 		retval = -ERESTARTNOINTR;
-		goto bad_fork_free_pid;
+		goto bad_fork_cancel_cgroup;
 	}
 
 	if (likely(p->pid)) {
@@ -1593,7 +1604,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	write_unlock_irq(&tasklist_lock);
 
 	proc_fork_connector(p);
-	cgroup_post_fork(p);
+	cgroup_post_fork(p, cgrp_ss_private);
 	if (clone_flags & CLONE_THREAD)
 		threadgroup_change_end(current);
 	perf_event_fork(p);
@@ -1603,6 +1614,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	return p;
 
+bad_fork_cancel_cgroup:
+	cgroup_cancel_fork(p, cgrp_ss_private);
 bad_fork_free_pid:
 	if (pid != &init_struct_pid)
 		free_pid(pid);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f9123a8..050936e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8007,7 +8007,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
 	sched_offline_group(tg);
 }
 
-static void cpu_cgroup_fork(struct task_struct *task)
+static void cpu_cgroup_fork(struct task_struct *task, void *private)
 {
 	sched_move_task(task);
 }
-- 
2.4.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v12 7/8] cgroup: add a tset_get_css macro
  2015-05-18 14:50 [PATCH v12 0/8] cgroup: add pids subsystem Aleksa Sarai
                   ` (5 preceding siblings ...)
  2015-05-18 14:51 ` [PATCH v12 6/8] cgroup: allow a cgroup subsystem to reject a fork Aleksa Sarai
@ 2015-05-18 14:51 ` Aleksa Sarai
  2015-05-18 21:34   ` Tejun Heo
  2015-05-18 14:51 ` [PATCH v12 8/8] cgroup: implement the PIDs subsystem Aleksa Sarai
  7 siblings, 1 reply; 27+ messages in thread
From: Aleksa Sarai @ 2015-05-18 14:51 UTC (permalink / raw)
  To: tj, lizefan, mingo, peterz
  Cc: richard, fweisbec, linux-kernel, cgroups, Aleksa Sarai

This adds a macro to get the css of a tset (using task_get_css()) by
just grabbing a ref to the first item in the tset (since there is a
guarantee that all tasks in a tset share a css).

This is in preparation for implementing the pids cgroup subsystem.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 include/linux/cgroup.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 886a883..773846d 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -639,6 +639,18 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
 	for ((task) = cgroup_taskset_first((tset)); (task);		\
 	     (task) = cgroup_taskset_next((tset)))
 
+/**
+ * tset_get_css - obtain and get css for (tset, subsys_id)
+ * @tset: target taskset
+ * @subsys_id: target subsystem id
+ *
+ * Since all of the tasks in a taskset are guaranteed to have the same css, it's
+ * safe to grab the ref of just the first task's css and treat it as though you
+ * have a ref on the taskset's "collective" css.
+ */
+#define tset_get_css(tset, subsys_id)					\
+	task_get_css(cgroup_taskset_first(tset), subsys_id)
+
 /*
  * Control Group subsystem type.
  * See Documentation/cgroups/cgroups.txt for details
-- 
2.4.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v12 8/8] cgroup: implement the PIDs subsystem
  2015-05-18 14:50 [PATCH v12 0/8] cgroup: add pids subsystem Aleksa Sarai
                   ` (6 preceding siblings ...)
  2015-05-18 14:51 ` [PATCH v12 7/8] cgroup: add a tset_get_css macro Aleksa Sarai
@ 2015-05-18 14:51 ` Aleksa Sarai
  2015-05-19  8:00   ` Peter Zijlstra
  7 siblings, 1 reply; 27+ messages in thread
From: Aleksa Sarai @ 2015-05-18 14:51 UTC (permalink / raw)
  To: tj, lizefan, mingo, peterz
  Cc: richard, fweisbec, linux-kernel, cgroups, Aleksa Sarai

Adds a new single-purpose PIDs subsystem to limit the number of
tasks that can be forked inside a cgroup. Essentially this is an
implementation of RLIMIT_NPROC that applies to a cgroup rather than a
process tree.

However, it should be noted that organisational operations (adding and
removing tasks from a PIDs hierarchy) will *not* be prevented. Rather,
the number of tasks in the hierarchy cannot exceed the limit through
forking. This is due to the fact that, in the unified hierarchy, attach
cannot fail (and it is not possible for a task to overcome its PIDs
cgroup policy limit by attaching to a child cgroup).

PIDs are fundamentally a global resource, and it is possible to reach
PID exhaustion inside a cgroup without hitting any reasonable kmemcg
policy. Once you've hit PID exhaustion, you're only in a marginally
better state than OOM. This subsystem allows PID exhaustion inside a
cgroup to be prevented.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 CREDITS                       |   5 +
 include/linux/cgroup_subsys.h |   5 +
 init/Kconfig                  |  16 ++
 kernel/Makefile               |   1 +
 kernel/cgroup_pids.c          | 355 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 382 insertions(+)
 create mode 100644 kernel/cgroup_pids.c

diff --git a/CREDITS b/CREDITS
index 40cc4bf..0727426 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3215,6 +3215,11 @@ S: 69 rue Dunois
 S: 75013 Paris
 S: France
 
+N: Aleksa Sarai
+E: cyphar@cyphar.com
+W: https://www.cyphar.com/
+D: `pids` cgroup subsystem
+
 N: Dipankar Sarma
 E: dipankar@in.ibm.com
 D: RCU
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 81b7bdd..32becaf 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -56,6 +56,11 @@ SUBSYS(hugetlb)
  * Subsystems that implement the can_fork() family of callbacks.
  */
 SUBSYS_TAG(CANFORK_START)
+
+#if IS_ENABLED(CONFIG_CGROUP_PIDS)
+SUBSYS(pids)
+#endif
+
 SUBSYS_TAG(CANFORK_END)
 
 /*
diff --git a/init/Kconfig b/init/Kconfig
index dc24dec..24b2563 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -967,6 +967,22 @@ config CGROUP_FREEZER
 	  Provides a way to freeze and unfreeze all tasks in a
 	  cgroup.
 
+config CGROUP_PIDS
+	bool "PIDs cgroup subsystem"
+	help
+	  Provides enforcement of process number limits in the scope of a
+	  cgroup. Any attempt to fork more processes than is allowed in the
+	  cgroup will fail. PIDs are fundamentally a global resource because it
+	  is fairly trivial to reach PID exhaustion before you reach even a
+	  conservative kmemcg limit. As a result, it is possible to grind a
+	  system to halt without being limited by other cgroup policies. The
+	  PIDs cgroup subsystem is designed to stop this from happening.
+
+	  It should be noted that organisational operations (such as attaching
+	  to a cgroup hierarchy will *not* be blocked by the PIDs subsystem),
+	  since the PIDs limit only affects a process's ability to fork, not to
+	  attach to a cgroup.
+
 config CGROUP_DEVICE
 	bool "Device controller for cgroups"
 	help
diff --git a/kernel/Makefile b/kernel/Makefile
index 0f8f8b0..df5406c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
new file mode 100644
index 0000000..309d0e5
--- /dev/null
+++ b/kernel/cgroup_pids.c
@@ -0,0 +1,355 @@
+/*
+ * Process number limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
+ * after a certain limit is reached.
+ *
+ * Since it is trivial to hit the task limit without hitting any kmemcg limits
+ * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
+ * preventable in the scope of a cgroup hierarchy by allowing resource limiting
+ * of the number of tasks in a cgroup.
+ *
+ * In order to use the `pids` controller, set the maximum number of tasks in
+ * pids.max (this is not available in the root cgroup for obvious reasons). The
+ * number of processes currently in the cgroup is given by pids.current.
+ * Organisational operations are not blocked by cgroup policies, so it is
+ * possible to have pids.current > pids.max. However, it is not possible to
+ * violate a cgroup policy through fork(). fork() will retrun -EAGAIN if forking
+ * would cause a cgroup policy to be violated.
+ *
+ * To set a cgroup to have no limit, set pids.max to "max". This is the default
+ * for all new cgroups (NB that PID limits are hierarchical, so the most
+ * stringent limit in the hierarchy is followed).
+ *
+ * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
+ * a superset of parent/child/pids.current.
+ *
+ * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+
+#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
+#define PIDS_MAX_STR "max"
+
+struct pids_cgroup {
+	struct cgroup_subsys_state	css;
+
+	/*
+	 * Use 64-bit types so that we can safely represent "max" as
+	 * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
+	 */
+	atomic64_t			counter;
+	int64_t				limit;
+};
+
+static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
+{
+	return container_of(css, struct pids_cgroup, css);
+}
+
+static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
+{
+	return css_pids(pids->css.parent);
+}
+
+static struct cgroup_subsys_state *
+pids_css_alloc(struct cgroup_subsys_state *parent)
+{
+	struct pids_cgroup *pids;
+
+	pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
+	if (!pids)
+		return ERR_PTR(-ENOMEM);
+
+	pids->limit = PIDS_MAX;
+	atomic64_set(&pids->counter, 0);
+	return &pids->css;
+}
+
+static void pids_css_free(struct cgroup_subsys_state *css)
+{
+	kfree(css_pids(css));
+}
+
+/**
+ * pids_cancel - uncharge the local pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to cancel
+ *
+ * This function will WARN if the pid count goes under 0, because such a case is
+ * a bug in the pids controller proper.
+ */
+static void pids_cancel(struct pids_cgroup *pids, int num)
+{
+	/*
+	 * A negative count (or overflow for that matter) is invalid,
+	 * and indicates a bug in the `pids` controller proper.
+	 */
+	WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
+}
+
+/**
+ * pids_uncharge - hierarchically uncharge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to uncharge
+ */
+static void pids_uncharge(struct pids_cgroup *pids, int num)
+{
+	struct pids_cgroup *p;
+
+	for (p = pids; p; p = parent_pids(p))
+		pids_cancel(p, num);
+}
+
+/**
+ * pids_charge - hierarchically charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function does *not* follow the pid limit set. It cannot fail and the new
+ * pid count may exceed the limit, because organisational operations cannot fail
+ * in the unified hierarchy.
+ */
+static void pids_charge(struct pids_cgroup *pids, int num)
+{
+	struct pids_cgroup *p;
+
+	for (p = pids; p; p = parent_pids(p))
+		atomic64_add(num, &p->counter);
+}
+
+/**
+ * pids_try_charge - hierarchically try to charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function follows the set limit. It will fail if the charge would cause
+ * the new value to exceed the hierarchical limit. Returns 0 if the charge
+ * succeded, otherwise -EAGAIN.
+ */
+static int pids_try_charge(struct pids_cgroup *pids, int num)
+{
+	struct pids_cgroup *p, *q;
+
+	for (p = pids; p; p = parent_pids(p)) {
+		int64_t new = atomic64_add_return(num, &p->counter);
+
+		/*
+		 * Since new is capped to the maximum number of pid_t, if
+		 * p->limit is %PIDS_MAX then we know that this test will never
+		 * fail.
+		 */
+		if (new > p->limit)
+			goto revert;
+	}
+
+	return 0;
+
+revert:
+	for (q = pids; q != p; q = parent_pids(q))
+		pids_cancel(q, num);
+	pids_cancel(p, num);
+
+	return -EAGAIN;
+}
+
+static int pids_can_attach(struct cgroup_subsys_state *css,
+			   struct cgroup_taskset *tset)
+{
+	struct pids_cgroup *pids = css_pids(css);
+	struct cgroup_subsys_state *old_css;
+	struct pids_cgroup *old_pids;
+	struct task_struct *task;
+	int64_t num = 0;
+
+	old_css = tset_get_css(tset, pids_cgrp_id);
+	old_pids = css_pids(old_css);
+
+	cgroup_taskset_for_each(task, tset)
+		num++;
+
+	/*
+	 * Attaching to a cgroup is allowed to overcome the
+	 * the PID limit, so that organisation operations aren't
+	 * blocked by the `pids` cgroup controller.
+	 */
+	pids_charge(pids, num);
+	pids_uncharge(old_pids, num);
+	css_put(old_css);
+	return 0;
+}
+
+static void pids_cancel_attach(struct cgroup_subsys_state *css,
+			       struct cgroup_taskset *tset)
+{
+	struct pids_cgroup *pids = css_pids(css);
+	struct cgroup_subsys_state *old_css;
+	struct pids_cgroup *old_pids;
+	struct task_struct *task;
+	int64_t num = 0;
+
+	old_css = tset_get_css(tset, pids_cgrp_id);
+	old_pids = css_pids(old_css);
+
+	cgroup_taskset_for_each(task, tset)
+		num++;
+
+	pids_charge(old_pids, num);
+	pids_uncharge(pids, num);
+	css_put(old_css);
+}
+
+static int pids_can_fork(struct task_struct *task, void **private)
+{
+	struct cgroup_subsys_state *css;
+	struct pids_cgroup *pids;
+	int err;
+
+	/*
+	 * Use the "current" task_css for the pids subsystem as the tentative
+	 * css. It is possible we will charge the wrong hierarchy, in which
+	 * case we will forcefully revert/reapply the charge on the right
+	 * hierarchy after it is committed to the task proper.
+	 */
+	css = task_get_css(current, pids_cgrp_id);
+	pids = css_pids(css);
+
+	err = pids_try_charge(pids, 1);
+	if (err)
+		goto err_css_put;
+
+	*private = css;
+	return 0;
+
+err_css_put:
+	css_put(css);
+	return err;
+}
+
+static void pids_cancel_fork(struct task_struct *task, void *private)
+{
+	struct cgroup_subsys_state *css = private;
+	struct pids_cgroup *pids = css_pids(css);
+
+	pids_uncharge(pids, 1);
+	css_put(css);
+}
+
+static void pids_fork(struct task_struct *task, void *private)
+{
+	struct cgroup_subsys_state *css;
+	struct cgroup_subsys_state *old_css = private;
+	struct pids_cgroup *pids;
+	struct pids_cgroup *old_pids = css_pids(old_css);
+
+	css = task_get_css(task, pids_cgrp_id);
+	pids = css_pids(css);
+
+	/*
+	 * If the association has changed, we have to revert and reapply the
+	 * charge/uncharge on the wrong hierarchy to the current one. Since
+	 * the association can only change due to an organisation event, its
+	 * okay for us to ignore the limit in this case.
+	 */
+	if (pids != old_pids) {
+		pids_uncharge(old_pids, 1);
+		pids_charge(pids, 1);
+	}
+
+	css_put(css);
+	css_put(old_css);
+}
+
+static void pids_exit(struct cgroup_subsys_state *css,
+		      struct cgroup_subsys_state *old_css,
+		      struct task_struct *task)
+{
+	struct pids_cgroup *pids = css_pids(old_css);
+
+	pids_uncharge(pids, 1);
+}
+
+static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
+			      size_t nbytes, loff_t off)
+{
+	struct cgroup_subsys_state *css = of_css(of);
+	struct pids_cgroup *pids = css_pids(css);
+	int64_t limit;
+	int err;
+
+	buf = strstrip(buf);
+	if (!strcmp(buf, PIDS_MAX_STR)) {
+		limit = PIDS_MAX;
+		goto set_limit;
+	}
+
+	err = kstrtoll(buf, 0, &limit);
+	if (err)
+		return err;
+
+	if (limit < 0 || limit >= PIDS_MAX)
+		return -EINVAL;
+
+set_limit:
+	/*
+	 * Limit updates don't need to be mutex'd, since it isn't
+	 * critical that any racing fork()s follow the new limit.
+	 */
+	pids->limit = limit;
+	return nbytes;
+}
+
+static int pids_max_show(struct seq_file *sf, void *v)
+{
+	struct cgroup_subsys_state *css = seq_css(sf);
+	struct pids_cgroup *pids = css_pids(css);
+	int64_t limit = pids->limit;
+
+	if (limit >= PIDS_MAX)
+		seq_printf(sf, "%s\n", PIDS_MAX_STR);
+	else
+		seq_printf(sf, "%lld\n", limit);
+
+	return 0;
+}
+
+static s64 pids_current_read(struct cgroup_subsys_state *css,
+			     struct cftype *cft)
+{
+	struct pids_cgroup *pids = css_pids(css);
+
+	return atomic64_read(&pids->counter);
+}
+
+static struct cftype files[] = {
+	{
+		.name = "max",
+		.write = pids_max_write,
+		.seq_show = pids_max_show,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "current",
+		.read_s64 = pids_current_read,
+	},
+	{ }	/* terminate */
+};
+
+struct cgroup_subsys pids_cgrp_subsys = {
+	.css_alloc	= pids_css_alloc,
+	.css_free	= pids_css_free,
+	.can_attach	= pids_can_attach,
+	.cancel_attach	= pids_cancel_attach,
+	.can_fork	= pids_can_fork,
+	.cancel_fork	= pids_cancel_fork,
+	.fork		= pids_fork,
+	.exit		= pids_exit,
+	.legacy_cftypes	= files,
+	.early_init	= 0,
+};
-- 
2.4.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH v12 7/8] cgroup: add a tset_get_css macro
  2015-05-18 14:51 ` [PATCH v12 7/8] cgroup: add a tset_get_css macro Aleksa Sarai
@ 2015-05-18 21:34   ` Tejun Heo
  2015-05-19  1:18     ` Aleksa Sarai
  0 siblings, 1 reply; 27+ messages in thread
From: Tejun Heo @ 2015-05-18 21:34 UTC (permalink / raw)
  To: Aleksa Sarai
  Cc: lizefan, mingo, peterz, richard, fweisbec, linux-kernel, cgroups

On Tue, May 19, 2015 at 12:51:06AM +1000, Aleksa Sarai wrote:
> This adds a macro to get the css of a tset (using task_get_css()) by
> just grabbing a ref to the first item in the tset (since there is a
> guarantee that all tasks in a tset share a css).
> 
> This is in preparation for implementing the pids cgroup subsystem.
> 
> Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
> ---
>  include/linux/cgroup.h | 12 ++++++++++++
>  1 file changed, 12 insertions(+)
> 
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 886a883..773846d 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -639,6 +639,18 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
>  	for ((task) = cgroup_taskset_first((tset)); (task);		\
>  	     (task) = cgroup_taskset_next((tset)))
>  
> +/**
> + * tset_get_css - obtain and get css for (tset, subsys_id)
> + * @tset: target taskset
> + * @subsys_id: target subsystem id
> + *
> + * Since all of the tasks in a taskset are guaranteed to have the same css, it's
> + * safe to grab the ref of just the first task's css and treat it as though you
> + * have a ref on the taskset's "collective" css.
> + */
> +#define tset_get_css(tset, subsys_id)					\
> +	task_get_css(cgroup_taskset_first(tset), subsys_id)

They aren't tho.  It depends on the hierarchy configuration.  csses
belonging to hierarchies which aren't the current migration target may
differ.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v12 4/8] cgroup, block: implement task_get_css() and use it in bio_associate_current()
  2015-05-18 14:51 ` [PATCH v12 4/8] cgroup, block: implement task_get_css() and use it in bio_associate_current() Aleksa Sarai
@ 2015-05-18 21:59   ` Tejun Heo
  2015-05-19  3:05     ` Aleksa Sarai
  0 siblings, 1 reply; 27+ messages in thread
From: Tejun Heo @ 2015-05-18 21:59 UTC (permalink / raw)
  To: Aleksa Sarai
  Cc: lizefan, mingo, peterz, richard, fweisbec, linux-kernel, cgroups,
	Jens Axboe, Vivek Goyal

On Tue, May 19, 2015 at 12:51:03AM +1000, Aleksa Sarai wrote:
> From: Tejun Heo <tj@kernel.org>
> 
> bio_associate_current() currently open codes task_css() and
> css_tryget_online() to find and pin $current's blkcg css.  Abstract it
> into task_get_css() which is implemented from cgroup side.  As a task
> is always associated with an online css for every subsystem except
> while the css_set update is propagating, task_get_css() retries till
> css_tryget_online() succeeds.
> 
> This is a cleanup and shouldn't lead to noticeable behavior changes.

Applied 1-4 but skipped bio_associate_current() changes from this
patch.  I'll route that part through a different patchset.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v12 5/8] cgroup: move enum cgroup_subsys_id definition
  2015-05-18 14:51 ` [PATCH v12 5/8] cgroup: move enum cgroup_subsys_id definition Aleksa Sarai
@ 2015-05-18 21:59   ` Tejun Heo
  2015-05-19  1:25     ` Aleksa Sarai
  0 siblings, 1 reply; 27+ messages in thread
From: Tejun Heo @ 2015-05-18 21:59 UTC (permalink / raw)
  To: Aleksa Sarai
  Cc: lizefan, mingo, peterz, richard, fweisbec, linux-kernel, cgroups

On Tue, May 19, 2015 at 12:51:04AM +1000, Aleksa Sarai wrote:
> This is in preparation for implementing the pids cgroup subsystem. It is
> not a functional change and should not change any behavior.
> 
> Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>

I did a big reorg of cgroup.h and split of cgroup-defs.h from it and
this patch doesn't seem necessary anymore.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v12 6/8] cgroup: allow a cgroup subsystem to reject a fork
  2015-05-18 14:51 ` [PATCH v12 6/8] cgroup: allow a cgroup subsystem to reject a fork Aleksa Sarai
@ 2015-05-18 22:12   ` Tejun Heo
  0 siblings, 0 replies; 27+ messages in thread
From: Tejun Heo @ 2015-05-18 22:12 UTC (permalink / raw)
  To: Aleksa Sarai
  Cc: lizefan, mingo, peterz, richard, fweisbec, linux-kernel, cgroups

Hello,

On Tue, May 19, 2015 at 12:51:05AM +1000, Aleksa Sarai wrote:
> Add a new cgroup subsystem callback can_fork that conditionally
> states whether or not the fork is accepted or rejected by a cgroup
> policy. In addition, add a cancel_fork callback so that if an error
> occurs later in the forking process, any state modified by can_fork can
> be reverted.
> 
> Allow for a private opaque pointer to be passed from the cgroup_can_fork
> to cgroup_post_fork, allowing for the fork state to be stored by each
> subsystem separately.
> 
> Also add a tagging system for cgroup_subsys.h to allow for CGROUP_<TAG>
> enumerations to be be defined and used. Also explicitly add a
> CGROUP_CANFORK_COUNT macro to make arrays easier to define.
> 
> This is in preparation for implementing the pids cgroup subsystem.
> 
> Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>

This generally looks good to me now.

Ingo, Peter, I'm planning on routing this through cgroup branch after
minor revisions.  If there's any objection, please let me know.

> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 35ba593..886a883 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -28,11 +28,16 @@
>  /* define the enumeration of all cgroup subsystems */
>  #define SUBSYS(_x) _x ## _cgrp_id,
>  enum cgroup_subsys_id {
> +#define SUBSYS_TAG(_t) CGROUP_ ## _t, \
> +	__unused_tag_ ## _t = CGROUP_ ## _t - 1,

Why not put this together with SUBSYS() def?

>  #include <linux/cgroup_subsys.h>
> +#undef SUBSYS_TAG

And this with undef?

>  	CGROUP_SUBSYS_COUNT,
>  };
>  #undef SUBSYS
>  
> +#define CGROUP_CANFORK_COUNT (CGROUP_CANFORK_END - CGROUP_CANFORK_START)
> +
>  struct cgroup_root;
>  struct cgroup_subsys;
>  struct cgroup;
...
> @@ -3,6 +3,11 @@
>   *
>   * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
>   */
> +#ifndef SUBSYS_TAG
> +#	define __TMP_SUBSYS_TAG
> +#	define SUBSYS_TAG(_x)

It'd be nice if there's a comment explaining inclusion rules for this
file.  Also, let's not do the indenting thing.  I don't think it adds
much and it tends to get inconsistent and weird (e.g. why isn't
#include indented when inside #ifdef?) over time.  These aren't really
eye dazzling definitions.

> @@ -2324,9 +2327,10 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
>  	 */
>  	tset.csets = &tset.dst_csets;
>  
> -	for_each_e_css(css, i, cgrp)
> +	for_each_e_css(css, i, cgrp) {
>  		if (css->ss->attach)
>  			css->ss->attach(css, &tset);
> +	}

I don't object but is there a reason for this change?  If it's just
for stylistic consistency, please mention it in the description.

> @@ -5180,6 +5185,23 @@ static const struct file_operations proc_cgroupstats_operations = {
>  	.release = single_release,
>  };
>  
> +static void **subsys_canfork_privatep(void *ss_private[CGROUP_CANFORK_COUNT],
> +				      int i)

Heh, how about subsys_canfork_priv_p()?  privatep and private are
kinda tricky to tell apart.

> +{
> +	if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
> +		return &ss_private[i - CGROUP_CANFORK_START];
> +	return NULL;
> +}
> +
> +static void *subsys_canfork_private(void *ss_private[CGROUP_CANFORK_COUNT],
> +				    int i)

and subsys_canfork_priv() here.

> +{
> +	void **private;
> +	if ((private = subsys_canfork_privatep(ss_private, i)) != NULL)
> +		return *private;
> +	return NULL;
> +}
...
> @@ -5195,6 +5217,61 @@ void cgroup_fork(struct task_struct *child)
>  }
>  
>  /**
> + * cgroup_can_fork - called on a new task before the process is exposed
> + * @child: the task in question.
> + *
> + * This calls the subsystem can_fork() callbacks. If the can_fork() callback
> + * returns an error, the fork aborts with that error code. This allows for
> + * a cgroup subsystem to conditionally allow or deny new forks.
> + */
> +int cgroup_can_fork(struct task_struct *child,
> +		    void *ss_private[CGROUP_CANFORK_COUNT])
> +{
> +	struct cgroup_subsys *ss;
> +	int i, j, retval;
> +
> +	for_each_subsys_which(ss, i, &have_canfork_callback) {
> +		retval = ss->can_fork(child,
> +				      subsys_canfork_privatep(ss_private, i));

How about shortening things a bit?  It doesn't lose any clarity and we
don't have to do ugly line splits.

		ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));

> +		if (retval)
> +			goto out_revert;
> +	}
> +
> +	return 0;
> +
> +out_revert:
> +	for_each_subsys(ss, j) {
> +		if (j >= i)
> +			break;
> +
> +		if (ss->cancel_fork)
> +			ss->cancel_fork(child,
> +					subsys_canfork_private(ss_private, j));
> +	}
> +
> +	return retval;
> +}
...
> @@ -1516,6 +1517,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>  	p->task_works = NULL;
>  
>  	/*
> +	 * Ensure that the cgroup subsystem policies allow the new process to be
> +	 * forked. It should be noted the the new process's css_set can be changed
> +	 * between here and cgroup_post_fork() if an organisation operation is in
> +	 * progress.
> +	 */

Let's move the latter half of the comment to cgroup_can_fork().  fork
path doesn't need to care about this level of cgroup-specific details.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v12 7/8] cgroup: add a tset_get_css macro
  2015-05-18 21:34   ` Tejun Heo
@ 2015-05-19  1:18     ` Aleksa Sarai
  0 siblings, 0 replies; 27+ messages in thread
From: Aleksa Sarai @ 2015-05-19  1:18 UTC (permalink / raw)
  To: Tejun Heo
  Cc: lizefan, mingo, Peter Zijlstra, richard,
	Frédéric Weisbecker, linux-kernel, cgroups

Hi Tejun,

>> +/**
>> + * tset_get_css - obtain and get css for (tset, subsys_id)
>> + * @tset: target taskset
>> + * @subsys_id: target subsystem id
>> + *
>> + * Since all of the tasks in a taskset are guaranteed to have the same css, it's
>> + * safe to grab the ref of just the first task's css and treat it as though you
>> + * have a ref on the taskset's "collective" css.
>> + */
>> +#define tset_get_css(tset, subsys_id)                                        \
>> +     task_get_css(cgroup_taskset_first(tset), subsys_id)
>
> They aren't tho.  It depends on the hierarchy configuration.  csses
> belonging to hierarchies which aren't the current migration target may
> differ.

Okay, I was confused by what you said earlier. In this case, we might
as well just move all of the attach accounting to ->attach() and not
deal with reverting it (unless we're guaranteed that the css of each
task in the tset doesn't change between ->can_attach() and
->cancel_attach()).

--
Aleksa Sarai (cyphar)
www.cyphar.com

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v12 5/8] cgroup: move enum cgroup_subsys_id definition
  2015-05-18 21:59   ` Tejun Heo
@ 2015-05-19  1:25     ` Aleksa Sarai
  2015-05-28 20:44       ` Tejun Heo
  0 siblings, 1 reply; 27+ messages in thread
From: Aleksa Sarai @ 2015-05-19  1:25 UTC (permalink / raw)
  To: Tejun Heo
  Cc: lizefan, mingo, Peter Zijlstra, richard,
	Frédéric Weisbecker, linux-kernel, cgroups

Hi Tejun,

>> This is in preparation for implementing the pids cgroup subsystem. It is
>> not a functional change and should not change any behavior.
>>
>> Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
>
> I did a big reorg of cgroup.h and split of cgroup-defs.h from it and
> this patch doesn't seem necessary anymore.

Is it in your tree?

--
Aleksa Sarai (cyphar)
www.cyphar.com

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v12 4/8] cgroup, block: implement task_get_css() and use it in bio_associate_current()
  2015-05-18 21:59   ` Tejun Heo
@ 2015-05-19  3:05     ` Aleksa Sarai
  2015-05-28 20:47       ` Tejun Heo
  0 siblings, 1 reply; 27+ messages in thread
From: Aleksa Sarai @ 2015-05-19  3:05 UTC (permalink / raw)
  To: Tejun Heo
  Cc: lizefan, mingo, Peter Zijlstra, richard,
	Frédéric Weisbecker, linux-kernel, cgroups, Jens Axboe,
	Vivek Goyal

Hi Tejun,

> Applied 1-4 but skipped bio_associate_current() changes from this
> patch.  I'll route that part through a different patchset.

Do you want the updated versions of 6-8 of the patchset to be based on
your tree's for-next?

--
Aleksa Sarai (cyphar)
www.cyphar.com

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v12 8/8] cgroup: implement the PIDs subsystem
  2015-05-18 14:51 ` [PATCH v12 8/8] cgroup: implement the PIDs subsystem Aleksa Sarai
@ 2015-05-19  8:00   ` Peter Zijlstra
  2015-05-19  8:44     ` Aleksa Sarai
  0 siblings, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2015-05-19  8:00 UTC (permalink / raw)
  To: Aleksa Sarai
  Cc: tj, lizefan, mingo, richard, fweisbec, linux-kernel, cgroups,
	Thomas Gleixner

On Tue, May 19, 2015 at 12:51:07AM +1000, Aleksa Sarai wrote:

> However, it should be noted that organisational operations (adding and
> removing tasks from a PIDs hierarchy) will *not* be prevented.

This is how you spell: broken controller.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v12 8/8] cgroup: implement the PIDs subsystem
  2015-05-19  8:00   ` Peter Zijlstra
@ 2015-05-19  8:44     ` Aleksa Sarai
  2015-05-19 10:56       ` Peter Zijlstra
  2015-05-19 13:11       ` Thomas Gleixner
  0 siblings, 2 replies; 27+ messages in thread
From: Aleksa Sarai @ 2015-05-19  8:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Tejun Heo, lizefan, mingo, richard,
	Frédéric Weisbecker, linux-kernel, cgroups,
	Thomas Gleixner

>> However, it should be noted that organisational operations (adding and
>> removing tasks from a PIDs hierarchy) will *not* be prevented.
>
> This is how you spell: broken controller.

This has been discussed before. Organisational operations (i.e.
attaching to a cgroup) are not to be blocked by a cgroup controller in
the unified hierarchy. You simply can't escape out of a parent
cgroup's limit through attaching to a child cgroup (because you will
attach either before the fork checks against the cgroup [in which case
the child's limit is followed -- which means you also follow the
parent's limit] or after it checks [which means you'll hit the
parent's limit and won't be able to fork]).

--
Aleksa Sarai (cyphar)
www.cyphar.com

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v12 8/8] cgroup: implement the PIDs subsystem
  2015-05-19  8:44     ` Aleksa Sarai
@ 2015-05-19 10:56       ` Peter Zijlstra
  2015-05-28 20:33         ` Tejun Heo
  2015-05-19 13:11       ` Thomas Gleixner
  1 sibling, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2015-05-19 10:56 UTC (permalink / raw)
  To: Aleksa Sarai
  Cc: Tejun Heo, lizefan, mingo, richard,
	Frédéric Weisbecker, linux-kernel, cgroups,
	Thomas Gleixner

On Tue, May 19, 2015 at 06:44:39PM +1000, Aleksa Sarai wrote:
> >> However, it should be noted that organisational operations (adding and
> >> removing tasks from a PIDs hierarchy) will *not* be prevented.
> >
> > This is how you spell: broken controller.
> 
> This has been discussed before. Organisational operations (i.e.
> attaching to a cgroup) are not to be blocked by a cgroup controller in
> the unified hierarchy. 

That's utterly insane. As argued at length in threads like:

  lkml.kernel.org/r/alpine.DEB.2.11.1505061100040.4225@nanos

This breaks fundamental control rules and makes life for a number of
controllers impossible.

Also, I'll NAK each and every patch that will attempt to remove failing
can_attach from the cgroup core as it will fundamentally break some
scheduler controllers.

So please use it, it doesn't make any bloody sense to 'control' the
number of PIDs but then allow it to overrun the set point.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v12 8/8] cgroup: implement the PIDs subsystem
  2015-05-19  8:44     ` Aleksa Sarai
  2015-05-19 10:56       ` Peter Zijlstra
@ 2015-05-19 13:11       ` Thomas Gleixner
  2015-05-19 13:43         ` Aleksa Sarai
  2015-05-28 20:40         ` Tejun Heo
  1 sibling, 2 replies; 27+ messages in thread
From: Thomas Gleixner @ 2015-05-19 13:11 UTC (permalink / raw)
  To: Aleksa Sarai
  Cc: Peter Zijlstra, Tejun Heo, lizefan, mingo, richard,
	Frédéric Weisbecker, linux-kernel, cgroups

On Tue, 19 May 2015, Aleksa Sarai wrote:

> >> However, it should be noted that organisational operations (adding and
> >> removing tasks from a PIDs hierarchy) will *not* be prevented.
> >
> > This is how you spell: broken controller.
> 
> This has been discussed before. Organisational operations (i.e.
> attaching to a cgroup) are not to be blocked by a cgroup controller in
> the unified hierarchy. You simply can't escape out of a parent
> cgroup's limit through attaching to a child cgroup (because you will
> attach either before the fork checks against the cgroup [in which case
> the child's limit is followed -- which means you also follow the
> parent's limit] or after it checks [which means you'll hit the
> parent's limit and won't be able to fork]).

That's complete and utter nonsense. What has the parent limit to do
with the overflow of the child limit?

parent:	 limit 100   usecnt 80
child:	 limit 10    usecnt 10

So moving anything into child is violating the constraints and has to
be refused. Anything else is just dirty hackery.

Thanks,

	tglx





^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v12 8/8] cgroup: implement the PIDs subsystem
  2015-05-19 13:11       ` Thomas Gleixner
@ 2015-05-19 13:43         ` Aleksa Sarai
  2015-06-03  2:44           ` Aleksa Sarai
  2015-05-28 20:40         ` Tejun Heo
  1 sibling, 1 reply; 27+ messages in thread
From: Aleksa Sarai @ 2015-05-19 13:43 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Tejun Heo, lizefan, mingo, richard,
	Frédéric Weisbecker, linux-kernel, cgroups

>> >> However, it should be noted that organisational operations (adding and
>> >> removing tasks from a PIDs hierarchy) will *not* be prevented.
>> >
>> > This is how you spell: broken controller.
>>
>> This has been discussed before. Organisational operations (i.e.
>> attaching to a cgroup) are not to be blocked by a cgroup controller in
>> the unified hierarchy. You simply can't escape out of a parent
>> cgroup's limit through attaching to a child cgroup (because you will
>> attach either before the fork checks against the cgroup [in which case
>> the child's limit is followed -- which means you also follow the
>> parent's limit] or after it checks [which means you'll hit the
>> parent's limit and won't be able to fork]).
>
> That's complete and utter nonsense. What has the parent limit to do
> with the overflow of the child limit?
>
> parent:  limit 100   usecnt 80
> child:   limit 10    usecnt 10
>
> So moving anything into child is violating the constraints and has to
> be refused. Anything else is just dirty hackery.

Whoops. Yes, you're completely right. All right, I'll fix up the
patchset in a few days.

--
Aleksa Sarai (cyphar)
www.cyphar.com

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v12 8/8] cgroup: implement the PIDs subsystem
  2015-05-19 10:56       ` Peter Zijlstra
@ 2015-05-28 20:33         ` Tejun Heo
  0 siblings, 0 replies; 27+ messages in thread
From: Tejun Heo @ 2015-05-28 20:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Aleksa Sarai, lizefan, mingo, richard,
	Frédéric Weisbecker, linux-kernel, cgroups,
	Thomas Gleixner

Hello, sorry about the delay.

On Tue, May 19, 2015 at 12:56:31PM +0200, Peter Zijlstra wrote:
> > This has been discussed before. Organisational operations (i.e.
> > attaching to a cgroup) are not to be blocked by a cgroup controller in
> > the unified hierarchy. 
> 
> That's utterly insane. As argued at length in threads like:
> 
>   lkml.kernel.org/r/alpine.DEB.2.11.1505061100040.4225@nanos
> 
> This breaks fundamental control rules and makes life for a number of
> controllers impossible.

I didn't chase that dicussion because it was rather off-topic for
scheduler.

There are several classes of distribution schemes that cgroups deal
with.

A. Ratio-based.  Usually used to distributed resources which are
   replenished over time.  IO time, CPU cycles and so on.  This
   primarily doesn't deal with persistent state.

B. Limiting over-committable resources.  This applies to persistent
   resources like memory but also to transient ones like IO bandwidth
   and iops.  These all operate by limiting how much resources are
   newly given out and thus their neutral state is the overcommitted
   no-limit state.

C. Non-over-committable "hard" resources.  Currently, scheduler RT
   slices are the only one.  These actually should be distributed by
   carving out a finite whole and thus its limits can't be
   over-committed.  They have to behave as allocators rather than
   limiters.

Most persistent resources fall in the B category and we have a very
clear precedences in dealing with configurations of these limits.
Just think about the NPROC ulimit or quota.  They all operate by
suppressing distribution of new resources and allow new limit
configuration to be lower than the current consumption.

There's a clear reason for this.  it allows closing the race window
between configuration change and increasing resource consumption in a
very simple way - lowering the limit and checking the existing usage.

While what Thomas suggested - building a whole new transaction model
on top - can also close the race window.  This breaks from the
convention for no good reason.  It doesn't provide anything beyond
what's what's possible with the established model and it's outright
silly to have NPROC controller to behave so differently from the
existing mechanism which controls exactly the same resource.

> Also, I'll NAK each and every patch that will attempt to remove failing
> can_attach from the cgroup core as it will fundamentally break some
> scheduler controllers.

I was struggling with C above because it was just a single resource
type which belongs to that category but given that cgroups have to
support it ->can_attach() will have to be able to fail for those
resource types, but only for that resource type.

> So please use it, it doesn't make any bloody sense to 'control' the
> number of PIDs but then allow it to overrun the set point.

Again, it's not about ->can_attach() can fail or not in terms of
implementation at all.  It's about following consistent resource
distribution model.  Please don't conflate different resource types.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v12 8/8] cgroup: implement the PIDs subsystem
  2015-05-19 13:11       ` Thomas Gleixner
  2015-05-19 13:43         ` Aleksa Sarai
@ 2015-05-28 20:40         ` Tejun Heo
  2015-05-30  6:50           ` Aleksa Sarai
  1 sibling, 1 reply; 27+ messages in thread
From: Tejun Heo @ 2015-05-28 20:40 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Aleksa Sarai, Peter Zijlstra, lizefan, mingo, richard,
	Frédéric Weisbecker, linux-kernel, cgroups

Hello, Thomas.

On Tue, May 19, 2015 at 03:11:34PM +0200, Thomas Gleixner wrote:
> That's complete and utter nonsense. What has the parent limit to do
> with the overflow of the child limit?
> 
> parent:	 limit 100   usecnt 80
> child:	 limit 10    usecnt 10
> 
> So moving anything into child is violating the constraints and has to
> be refused. Anything else is just dirty hackery.

And the one who's moving the process there might as well raise the
limit in the child all the same.  It doesn't make any difference
without delegation and with delegation we need to restrict migration
at the exactly same junctions.  We can't delegate otherwise.  And the
resource limit for the delegated subtree is enforced from its parent
which delegatee can't escape how it changes the configuration or moves
processes around.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v12 5/8] cgroup: move enum cgroup_subsys_id definition
  2015-05-19  1:25     ` Aleksa Sarai
@ 2015-05-28 20:44       ` Tejun Heo
  0 siblings, 0 replies; 27+ messages in thread
From: Tejun Heo @ 2015-05-28 20:44 UTC (permalink / raw)
  To: Aleksa Sarai
  Cc: lizefan, mingo, Peter Zijlstra, richard,
	Frédéric Weisbecker, linux-kernel, cgroups

On Tue, May 19, 2015 at 11:25:41AM +1000, Aleksa Sarai wrote:
> Hi Tejun,
> 
> >> This is in preparation for implementing the pids cgroup subsystem. It is
> >> not a functional change and should not change any behavior.
> >>
> >> Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
> >
> > I did a big reorg of cgroup.h and split of cgroup-defs.h from it and
> > this patch doesn't seem necessary anymore.
> 
> Is it in your tree?

Yeap, in for-4.2 branch.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v12 4/8] cgroup, block: implement task_get_css() and use it in bio_associate_current()
  2015-05-19  3:05     ` Aleksa Sarai
@ 2015-05-28 20:47       ` Tejun Heo
  0 siblings, 0 replies; 27+ messages in thread
From: Tejun Heo @ 2015-05-28 20:47 UTC (permalink / raw)
  To: Aleksa Sarai
  Cc: lizefan, mingo, Peter Zijlstra, richard,
	Frédéric Weisbecker, linux-kernel, cgroups, Jens Axboe,
	Vivek Goyal

On Tue, May 19, 2015 at 01:05:01PM +1000, Aleksa Sarai wrote:
> Hi Tejun,
> 
> > Applied 1-4 but skipped bio_associate_current() changes from this
> > patch.  I'll route that part through a different patchset.
> 
> Do you want the updated versions of 6-8 of the patchset to be based on
> your tree's for-next?

Can you please base changes on cgroups/for-4.2?  New features usually
should be based on cgroups branch headed for the next merge window.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v12 8/8] cgroup: implement the PIDs subsystem
  2015-05-28 20:40         ` Tejun Heo
@ 2015-05-30  6:50           ` Aleksa Sarai
  0 siblings, 0 replies; 27+ messages in thread
From: Aleksa Sarai @ 2015-05-30  6:50 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Thomas Gleixner, Peter Zijlstra, lizefan, mingo, richard,
	Frédéric Weisbecker, linux-kernel, cgroups

Hello,

>> That's complete and utter nonsense. What has the parent limit to do
>> with the overflow of the child limit?
>>
>> parent:        limit 100   usecnt 80
>> child:         limit 10    usecnt 10
>>
>> So moving anything into child is violating the constraints and has to
>> be refused. Anything else is just dirty hackery.
>
> And the one who's moving the process there might as well raise the
> limit in the child all the same.  It doesn't make any difference
> without delegation and with delegation we need to restrict migration
> at the exactly same junctions.  We can't delegate otherwise.  And the
> resource limit for the delegated subtree is enforced from its parent
> which delegatee can't escape how it changes the configuration or moves
> processes around.

Here's a case where we've delegated a subtree, for an example of how a
delegated subtree can't overcome `subtree_parent`'s limit -- and by
extension `parent`'s limit:

parent: limit=128 usage=64
-- subtree_parent: limit=64 usage=32
---- subtree_child: limit=2 usage=1

If you delegate a subtree (such that a process cannot attach processes
to `parent`), then it is not possible for the subtree to violate
`subtree_parent`'s limit. This is because the ability to migrate a
process mid-fork relies on the ability to *actually* fork in the
_original_ cgroup (`subtree_parent` or `subtree_child` [which requires
the ability to fork in `subtree_parent`]). Once you've hit
subtree_parent's limit, there's no way for you to violate that limit.
The only other method I can think of is if you do the mid-fork thing
to migrate into `subtree_child`, then you migrate the two processes
into `subtree_parent`. This won't help you either, because if you then
continue and try to fork in `subtree_child` and then migrate, you'll
be blocked if the fork would violate `subtree_parent`'s limit.

If you try to attach to `subtree_child` a process that is mid-fork,
you'll bump the usage count to 3 (while this is bad, I can't really
think of any way we can tell can_attach() that the process is
mid-fork). If you do it again (because we don't stop can_attach()),
you aren't blocked by the fact that you're attaching to a cgroup that
has already exceeded its usage count, so you'll bump the count to 5 --
this I can understand would _seem_ to indicate a broken controller.
And you /can/ continue this ad infinitum -- up _until_ you run out of
the ability to make new processes inside `subtree_parent` (which
*will* happen). At that point, can_fork() will fail the fork on
`subtree_parent`, before you can attempt to migrate mid-fork.

And I just want to point out that if you have the ability to attach
processes to `subtree_child`, then you *already* have the right to
violate its set limit through attach anyway (or just changing the
limit) -- so the fact you can do this mid-fork isn't untoward at all.
If a user has the ability to just disable the cgroup's limit, then why
should that same user be hampered when attempting to attach processes
that said cgroup (which is an administrative operation -- so you'd
assume that they're clever enough to know that migration into a cgroup
may bump usage so it's greater than the limit [or that they just
RTFM'd])?

--
Aleksa Sarai (cyphar)
www.cyphar.com

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v12 8/8] cgroup: implement the PIDs subsystem
  2015-05-19 13:43         ` Aleksa Sarai
@ 2015-06-03  2:44           ` Aleksa Sarai
  0 siblings, 0 replies; 27+ messages in thread
From: Aleksa Sarai @ 2015-06-03  2:44 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Tejun Heo, lizefan, mingo, richard,
	Frédéric Weisbecker, linux-kernel, cgroups

>> That's complete and utter nonsense. What has the parent limit to do
>> with the overflow of the child limit?

I didn't read this sentence properly. It's because you're migrating
*from* the parent to the child. If you have the right to attach to the
child from the parent, then you also have the right to change the
limit of the child.

--
Aleksa Sarai (cyphar)
www.cyphar.com

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2015-06-03  2:44 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-05-18 14:50 [PATCH v12 0/8] cgroup: add pids subsystem Aleksa Sarai
2015-05-18 14:51 ` [PATCH v12 1/8] cgroup: switch to unsigned long for bitmasks Aleksa Sarai
2015-05-18 14:51 ` [PATCH v12 2/8] cgroup: use bitmask to filter for_each_subsys Aleksa Sarai
2015-05-18 14:51 ` [PATCH v12 3/8] cgroup: replace explicit ss_mask checking with for_each_subsys_which Aleksa Sarai
2015-05-18 14:51 ` [PATCH v12 4/8] cgroup, block: implement task_get_css() and use it in bio_associate_current() Aleksa Sarai
2015-05-18 21:59   ` Tejun Heo
2015-05-19  3:05     ` Aleksa Sarai
2015-05-28 20:47       ` Tejun Heo
2015-05-18 14:51 ` [PATCH v12 5/8] cgroup: move enum cgroup_subsys_id definition Aleksa Sarai
2015-05-18 21:59   ` Tejun Heo
2015-05-19  1:25     ` Aleksa Sarai
2015-05-28 20:44       ` Tejun Heo
2015-05-18 14:51 ` [PATCH v12 6/8] cgroup: allow a cgroup subsystem to reject a fork Aleksa Sarai
2015-05-18 22:12   ` Tejun Heo
2015-05-18 14:51 ` [PATCH v12 7/8] cgroup: add a tset_get_css macro Aleksa Sarai
2015-05-18 21:34   ` Tejun Heo
2015-05-19  1:18     ` Aleksa Sarai
2015-05-18 14:51 ` [PATCH v12 8/8] cgroup: implement the PIDs subsystem Aleksa Sarai
2015-05-19  8:00   ` Peter Zijlstra
2015-05-19  8:44     ` Aleksa Sarai
2015-05-19 10:56       ` Peter Zijlstra
2015-05-28 20:33         ` Tejun Heo
2015-05-19 13:11       ` Thomas Gleixner
2015-05-19 13:43         ` Aleksa Sarai
2015-06-03  2:44           ` Aleksa Sarai
2015-05-28 20:40         ` Tejun Heo
2015-05-30  6:50           ` Aleksa Sarai

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).