linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [-mm][PATCH 0/4] Add rlimit controller to cgroups (v3)
@ 2008-05-03 21:37 Balbir Singh
  2008-05-03 21:37 ` [-mm][PATCH 1/4] Setup the rlimit controller Balbir Singh
                   ` (5 more replies)
  0 siblings, 6 replies; 32+ messages in thread
From: Balbir Singh @ 2008-05-03 21:37 UTC (permalink / raw)
  To: linux-mm
  Cc: Sudhir Kumar, YAMAMOTO Takashi, Paul Menage, lizf, linux-kernel,
	David Rientjes, Pavel Emelianov, Balbir Singh, Andrew Morton,
	KAMEZAWA Hiroyuki

This is the third version of the address space control patches. These
patches are against 2.6.25-mm1  and have been tested using KVM in SMP mode,
both with and without the config enabled.

The first patch adds the user interface. The second patch fixes the
cgroup mm_owner_changed callback to pass the task struct, so that
accounting can be adjusted on owner changes. The thrid patch adds accounting
and control. The fourth patch updates documentation.

An earlier post of the patchset can be found at
http://lwn.net/Articles/275143/

This patch is built on top of the mm owner patches and utilizes that feature
to virtually group tasks by mm_struct.

Reviews, Comments?

Series

rlimit-controller-setup.patch
cgroup-add-task-to-mm--owner-callbacks.patch
rlimit-controller-address-space-accounting.patch
rlimit-controller-add-documentation.patch

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [-mm][PATCH 1/4] Setup the rlimit controller
  2008-05-03 21:37 [-mm][PATCH 0/4] Add rlimit controller to cgroups (v3) Balbir Singh
@ 2008-05-03 21:37 ` Balbir Singh
  2008-05-05 22:11   ` Andrew Morton
  2008-05-06  1:31   ` Li Zefan
  2008-05-03 21:38 ` [-mm][PATCH 2/4] Enhance cgroup mm_owner_changed callback to add task information Balbir Singh
                   ` (4 subsequent siblings)
  5 siblings, 2 replies; 32+ messages in thread
From: Balbir Singh @ 2008-05-03 21:37 UTC (permalink / raw)
  To: linux-mm
  Cc: Sudhir Kumar, YAMAMOTO Takashi, Paul Menage, lizf, linux-kernel,
	David Rientjes, Pavel Emelianov, Balbir Singh, Andrew Morton,
	KAMEZAWA Hiroyuki

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain, Size: 7374 bytes --]



This patch sets up the rlimit cgroup controller. It adds the basic create,
destroy and populate functionality. The user interface provided is very
similar to the memory resource controller. The rlimit controller can be
enhanced easily in the future to control mlocked pages.

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---

 include/linux/cgroup_subsys.h |    4 +
 include/linux/rlimitcgroup.h  |   19 +++++
 init/Kconfig                  |   10 ++
 mm/Makefile                   |    1 
 mm/rlimitcgroup.c             |  141 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 175 insertions(+)

diff -puN /dev/null mm/rlimitcgroup.c
--- /dev/null	2008-05-03 22:12:13.033285313 +0530
+++ linux-2.6.25-balbir/mm/rlimitcgroup.c	2008-05-04 02:52:51.000000000 +0530
@@ -0,0 +1,141 @@
+/*
+ * Copyright © International Business Machines  Corp., 2008
+ *
+ * Author: Balbir Singh <balbir@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Provide resource limits for tasks in a control group. A lot of code is
+ * duplicated from the memory controller (this code is common to almost
+ * all controllers). TODO: Consider writing a tool that can generate this
+ * code.
+ */
+#include <linux/cgroup.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/res_counter.h>
+#include <linux/rlimitcgroup.h>
+
+struct cgroup_subsys rlimit_cgroup_subsys;
+
+struct rlimit_cgroup {
+	struct cgroup_subsys_state css;
+	struct res_counter as_res;	/* address space counter */
+};
+
+static struct rlimit_cgroup init_rlimit_cgroup;
+
+struct rlimit_cgroup *rlimit_cgroup_from_cgrp(struct cgroup *cgrp)
+{
+	return container_of(cgroup_subsys_state(cgrp, rlimit_cgroup_subsys_id),
+				struct rlimit_cgroup, css);
+}
+
+static struct cgroup_subsys_state *
+rlimit_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	struct rlimit_cgroup *rcg;
+
+	if (unlikely(cgrp->parent == NULL))
+		rcg = &init_rlimit_cgroup;
+	else {
+		rcg = kzalloc(sizeof(*rcg), GFP_KERNEL);
+		if (!rcg)
+			return ERR_PTR(-ENOMEM);
+	}
+	res_counter_init(&rcg->as_res);
+	return &rcg->css;
+}
+
+static void rlimit_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	kfree(rlimit_cgroup_from_cgrp(cgrp));
+}
+
+static int rlimit_cgroup_reset(struct cgroup *cgrp, unsigned int event)
+{
+	struct rlimit_cgroup *rcg;
+
+	rcg = rlimit_cgroup_from_cgrp(cgrp);
+	switch (event) {
+	case RES_FAILCNT:
+		res_counter_reset_failcnt(&rcg->as_res);
+		break;
+	}
+	return 0;
+}
+
+static u64 rlimit_cgroup_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	return res_counter_read_u64(&rlimit_cgroup_from_cgrp(cgrp)->as_res,
+					cft->private);
+}
+
+static int rlimit_cgroup_write_strategy(char *buf, unsigned long long *tmp)
+{
+	*tmp = memparse(buf, &buf);
+	if (*buf != '\0')
+		return -EINVAL;
+
+	*tmp = ((*tmp + PAGE_SIZE) >> PAGE_SHIFT) << PAGE_SHIFT;
+	return 0;
+}
+
+static ssize_t rlimit_cgroup_write(struct cgroup *cgrp, struct cftype *cft,
+					struct file *file,
+					const char __user *userbuf,
+					size_t nbytes,
+					loff_t *ppos)
+{
+	return res_counter_write(&rlimit_cgroup_from_cgrp(cgrp)->as_res,
+					cft->private, userbuf, nbytes, ppos,
+					rlimit_cgroup_write_strategy);
+}
+
+static struct cftype rlimit_cgroup_files[] = {
+	{
+		.name = "usage_in_bytes",
+		.private = RES_USAGE,
+		.read_u64 = rlimit_cgroup_read,
+	},
+	{
+		.name = "limit_in_bytes",
+		.private = RES_LIMIT,
+		.write = rlimit_cgroup_write,
+		.read_u64 = rlimit_cgroup_read,
+	},
+	{
+		.name = "failcnt",
+		.private = RES_FAILCNT,
+		.trigger = rlimit_cgroup_reset,
+		.read_u64 = rlimit_cgroup_read,
+	},
+};
+
+static int rlimit_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	return cgroup_add_files(cgrp, ss, rlimit_cgroup_files,
+				ARRAY_SIZE(rlimit_cgroup_files));
+}
+
+struct cgroup_subsys rlimit_cgroup_subsys = {
+	.name = "rlimit",
+	.subsys_id = rlimit_cgroup_subsys_id,
+	.create = rlimit_cgroup_create,
+	.destroy = rlimit_cgroup_destroy,
+	.populate = rlimit_cgroup_populate,
+	.early_init = 0,
+};
diff -puN /dev/null include/linux/rlimitcgroup.h
--- /dev/null	2008-05-03 22:12:13.033285313 +0530
+++ linux-2.6.25-balbir/include/linux/rlimitcgroup.h	2008-05-04 02:53:02.000000000 +0530
@@ -0,0 +1,19 @@
+/*
+ * Copyright © International Business Machines  Corp., 2008
+ *
+ * Author: Balbir Singh <balbir@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef LINUX_RLIMITCGROUP_H
+#define LINUX_RLIMITCGROUP_H
+
+#endif /* LINUX_RLIMITCGROUP_H */
diff -puN include/linux/cgroup_subsys.h~rlimit-controller-setup include/linux/cgroup_subsys.h
--- linux-2.6.25/include/linux/cgroup_subsys.h~rlimit-controller-setup	2008-05-04 02:52:51.000000000 +0530
+++ linux-2.6.25-balbir/include/linux/cgroup_subsys.h	2008-05-04 02:52:51.000000000 +0530
@@ -47,4 +47,8 @@ SUBSYS(mem_cgroup)
 SUBSYS(devices)
 #endif
 
+#ifdef CONFIG_CGROUP_RLIMIT_CTLR
+SUBSYS(rlimit_cgroup)
+#endif
+
 /* */
diff -puN init/Kconfig~rlimit-controller-setup init/Kconfig
--- linux-2.6.25/init/Kconfig~rlimit-controller-setup	2008-05-04 02:52:51.000000000 +0530
+++ linux-2.6.25-balbir/init/Kconfig	2008-05-04 02:52:51.000000000 +0530
@@ -393,6 +393,16 @@ config CGROUP_MEM_RES_CTLR
 	  This config option also selects MM_OWNER config option, which
 	  could in turn add some fork/exit overhead.
 
+config CGROUP_RLIMIT_CTLR
+	bool "rlimit controls for cgroups"
+	depends on CGROUPS && RESOURCE_COUNTERS && MMU
+	select MM_OWNER
+	help
+	  Provides resource limits for all the tasks belonging to a
+	  control group. CGROUP_MEM_RES_CTLR provides support for physical
+	  memory RSS and Page Cache control. Virtual address space control
+	  is provided by this controller.
+
 config SYSFS_DEPRECATED
 	bool
 
diff -puN mm/Makefile~rlimit-controller-setup mm/Makefile
--- linux-2.6.25/mm/Makefile~rlimit-controller-setup	2008-05-04 02:52:51.000000000 +0530
+++ linux-2.6.25-balbir/mm/Makefile	2008-05-04 02:52:51.000000000 +0530
@@ -33,4 +33,5 @@ obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
+obj-$(CONFIG_CGROUP_RLIMIT_CTLR) += rlimitcgroup.o
 
_

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [-mm][PATCH 2/4] Enhance cgroup mm_owner_changed callback to add task information
  2008-05-03 21:37 [-mm][PATCH 0/4] Add rlimit controller to cgroups (v3) Balbir Singh
  2008-05-03 21:37 ` [-mm][PATCH 1/4] Setup the rlimit controller Balbir Singh
@ 2008-05-03 21:38 ` Balbir Singh
  2008-05-05 22:15   ` Andrew Morton
  2008-05-05 23:00   ` Paul Menage
  2008-05-03 21:38 ` [-mm][PATCH 3/4] Add rlimit controller accounting and control Balbir Singh
                   ` (3 subsequent siblings)
  5 siblings, 2 replies; 32+ messages in thread
From: Balbir Singh @ 2008-05-03 21:38 UTC (permalink / raw)
  To: linux-mm
  Cc: Sudhir Kumar, YAMAMOTO Takashi, Paul Menage, lizf, linux-kernel,
	David Rientjes, Pavel Emelianov, Balbir Singh, Andrew Morton,
	KAMEZAWA Hiroyuki



This patch adds an additional field to the mm_owner callbacks. This field
is required to get to the mm that changed.

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---

 include/linux/cgroup.h |    3 ++-
 kernel/cgroup.c        |    2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff -puN kernel/cgroup.c~cgroup-add-task-to-mm--owner-callbacks kernel/cgroup.c
--- linux-2.6.25/kernel/cgroup.c~cgroup-add-task-to-mm--owner-callbacks	2008-05-04 02:53:05.000000000 +0530
+++ linux-2.6.25-balbir/kernel/cgroup.c	2008-05-04 02:53:05.000000000 +0530
@@ -2772,7 +2772,7 @@ void cgroup_mm_owner_callbacks(struct ta
 			if (oldcgrp == newcgrp)
 				continue;
 			if (ss->mm_owner_changed)
-				ss->mm_owner_changed(ss, oldcgrp, newcgrp);
+				ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
 		}
 	}
 }
diff -puN include/linux/cgroup.h~cgroup-add-task-to-mm--owner-callbacks include/linux/cgroup.h
--- linux-2.6.25/include/linux/cgroup.h~cgroup-add-task-to-mm--owner-callbacks	2008-05-04 02:53:05.000000000 +0530
+++ linux-2.6.25-balbir/include/linux/cgroup.h	2008-05-04 02:53:05.000000000 +0530
@@ -310,7 +310,8 @@ struct cgroup_subsys {
 	 */
 	void (*mm_owner_changed)(struct cgroup_subsys *ss,
 					struct cgroup *old,
-					struct cgroup *new);
+					struct cgroup *new,
+					struct task_struct *p);
 	int subsys_id;
 	int active;
 	int disabled;
_

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [-mm][PATCH 3/4] Add rlimit controller accounting and control
  2008-05-03 21:37 [-mm][PATCH 0/4] Add rlimit controller to cgroups (v3) Balbir Singh
  2008-05-03 21:37 ` [-mm][PATCH 1/4] Setup the rlimit controller Balbir Singh
  2008-05-03 21:38 ` [-mm][PATCH 2/4] Enhance cgroup mm_owner_changed callback to add task information Balbir Singh
@ 2008-05-03 21:38 ` Balbir Singh
  2008-05-05 22:24   ` Andrew Morton
                     ` (2 more replies)
  2008-05-03 21:38 ` [-mm][PATCH 4/4] Add rlimit controller documentation Balbir Singh
                   ` (2 subsequent siblings)
  5 siblings, 3 replies; 32+ messages in thread
From: Balbir Singh @ 2008-05-03 21:38 UTC (permalink / raw)
  To: linux-mm
  Cc: Sudhir Kumar, YAMAMOTO Takashi, Paul Menage, lizf, linux-kernel,
	David Rientjes, Pavel Emelianov, Balbir Singh, Andrew Morton,
	KAMEZAWA Hiroyuki



This patch adds support for accounting and control of virtual address space
limits. The accounting is done via the rlimit_cgroup_(un)charge_as functions.
The core of the accounting takes place during fork time in copy_process(),
may_expand_vm(), remove_vma_list() and exit_mmap(). There are some special
cases that are handled here as well (arch/ia64/kernel/perform.c,
arch/x86/kernel/ptrace.c, insert_special_mapping())

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---

 arch/ia64/kernel/perfmon.c   |    6 ++
 arch/x86/kernel/ds.c         |   10 ++++
 fs/exec.c                    |    4 +
 include/linux/rlimitcgroup.h |   20 +++++++++
 kernel/fork.c                |   12 +++++
 mm/mmap.c                    |   11 +++++
 mm/rlimitcgroup.c            |   87 +++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 149 insertions(+), 1 deletion(-)

diff -puN mm/rlimitcgroup.c~rlimit-controller-address-space-accounting mm/rlimitcgroup.c
--- linux-2.6.25/mm/rlimitcgroup.c~rlimit-controller-address-space-accounting	2008-05-04 02:53:20.000000000 +0530
+++ linux-2.6.25-balbir/mm/rlimitcgroup.c	2008-05-04 02:53:20.000000000 +0530
@@ -44,6 +44,40 @@ struct rlimit_cgroup *rlimit_cgroup_from
 				struct rlimit_cgroup, css);
 }
 
+struct rlimit_cgroup *rlimit_cgroup_from_task(struct task_struct *p)
+{
+	return container_of(task_subsys_state(p, rlimit_cgroup_subsys_id),
+				struct rlimit_cgroup, css);
+}
+
+int rlimit_cgroup_charge_as(struct mm_struct *mm, unsigned long nr_pages)
+{
+	int ret;
+	struct rlimit_cgroup *rcg;
+
+	rcu_read_lock();
+	rcg = rlimit_cgroup_from_task(rcu_dereference(mm->owner));
+	css_get(&rcg->css);
+	rcu_read_unlock();
+
+	ret = res_counter_charge(&rcg->as_res, (nr_pages << PAGE_SHIFT));
+	css_put(&rcg->css);
+	return ret;
+}
+
+void rlimit_cgroup_uncharge_as(struct mm_struct *mm, unsigned long nr_pages)
+{
+	struct rlimit_cgroup *rcg;
+
+	rcu_read_lock();
+	rcg = rlimit_cgroup_from_task(rcu_dereference(mm->owner));
+	css_get(&rcg->css);
+	rcu_read_unlock();
+
+	res_counter_uncharge(&rcg->as_res, (nr_pages << PAGE_SHIFT));
+	css_put(&rcg->css);
+}
+
 static struct cgroup_subsys_state *
 rlimit_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
@@ -65,6 +99,39 @@ static void rlimit_cgroup_destroy(struct
 	kfree(rlimit_cgroup_from_cgrp(cgrp));
 }
 
+/*
+ * TODO: get the attach callbacks to fail and disallow task movement.
+ */
+static void rlimit_cgroup_move_task(struct cgroup_subsys *ss,
+					struct cgroup *cgrp,
+					struct cgroup *old_cgrp,
+					struct task_struct *p)
+{
+	struct mm_struct *mm;
+	struct rlimit_cgroup *rcg, *old_rcg;
+
+	mm = get_task_mm(p);
+	if (mm == NULL)
+		return;
+
+	rcu_read_lock();
+	if (p != rcu_dereference(mm->owner))
+		goto out;
+
+	rcg = rlimit_cgroup_from_cgrp(cgrp);
+	old_rcg = rlimit_cgroup_from_cgrp(old_cgrp);
+
+	if (rcg == old_rcg)
+		goto out;
+
+	if (res_counter_charge(&rcg->as_res, (mm->total_vm << PAGE_SHIFT)))
+		goto out;
+	res_counter_uncharge(&old_rcg->as_res, (mm->total_vm << PAGE_SHIFT));
+out:
+	rcu_read_unlock();
+	mmput(mm);
+}
+
 static int rlimit_cgroup_reset(struct cgroup *cgrp, unsigned int event)
 {
 	struct rlimit_cgroup *rcg;
@@ -131,11 +198,31 @@ static int rlimit_cgroup_populate(struct
 				ARRAY_SIZE(rlimit_cgroup_files));
 }
 
+static void rlimit_cgroup_mm_owner_changed(struct cgroup_subsys *ss,
+						struct cgroup *cgrp,
+						struct cgroup *old_cgrp,
+						struct task_struct *p)
+{
+	struct rlimit_cgroup *rcg, *old_rcg;
+	struct mm_struct *mm = get_task_mm(p);
+
+	BUG_ON(!mm);
+	rcg = rlimit_cgroup_from_cgrp(cgrp);
+	old_rcg = rlimit_cgroup_from_cgrp(old_cgrp);
+	if (res_counter_charge(&rcg->as_res, (mm->total_vm << PAGE_SHIFT)))
+		goto out;
+	res_counter_uncharge(&old_rcg->as_res, (mm->total_vm << PAGE_SHIFT));
+out:
+	mmput(mm);
+}
+
 struct cgroup_subsys rlimit_cgroup_subsys = {
 	.name = "rlimit",
 	.subsys_id = rlimit_cgroup_subsys_id,
 	.create = rlimit_cgroup_create,
 	.destroy = rlimit_cgroup_destroy,
 	.populate = rlimit_cgroup_populate,
+	.attach = rlimit_cgroup_move_task,
+	.mm_owner_changed = rlimit_cgroup_mm_owner_changed,
 	.early_init = 0,
 };
diff -puN include/linux/rlimitcgroup.h~rlimit-controller-address-space-accounting include/linux/rlimitcgroup.h
--- linux-2.6.25/include/linux/rlimitcgroup.h~rlimit-controller-address-space-accounting	2008-05-04 02:53:20.000000000 +0530
+++ linux-2.6.25-balbir/include/linux/rlimitcgroup.h	2008-05-04 02:54:07.000000000 +0530
@@ -16,4 +16,24 @@
 #ifndef LINUX_RLIMITCGROUP_H
 #define LINUX_RLIMITCGROUP_H
 
+#ifdef CONFIG_CGROUP_RLIMIT_CTLR
+
+int rlimit_cgroup_charge_as(struct mm_struct *mm, unsigned long nr_pages);
+void rlimit_cgroup_uncharge_as(struct mm_struct *mm, unsigned long nr_pages);
+
+#else /* !CONFIG_CGROUP_RLIMIT_CTLR */
+
+static inline int
+rlimit_cgroup_charge_as(struct mm_struct *mm, unsigned long nr_pages)
+{
+	return 0;
+}
+
+static inline void
+rlimit_cgroup_uncharge_as(struct mm_struct *mm, unsigned long nr_pages)
+{
+}
+
+#endif /* CONFIG_CGROUP_RLIMIT_CTLR */
+
 #endif /* LINUX_RLIMITCGROUP_H */
diff -puN mm/mmap.c~rlimit-controller-address-space-accounting mm/mmap.c
--- linux-2.6.25/mm/mmap.c~rlimit-controller-address-space-accounting	2008-05-04 02:53:20.000000000 +0530
+++ linux-2.6.25-balbir/mm/mmap.c	2008-05-04 02:53:20.000000000 +0530
@@ -26,6 +26,7 @@
 #include <linux/mount.h>
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
+#include <linux/rlimitcgroup.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1730,6 +1731,7 @@ static void remove_vma_list(struct mm_st
 		long nrpages = vma_pages(vma);
 
 		mm->total_vm -= nrpages;
+		rlimit_cgroup_uncharge_as(mm, nrpages);
 		if (vma->vm_flags & VM_LOCKED)
 			mm->locked_vm -= nrpages;
 		vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
@@ -2056,6 +2058,7 @@ void exit_mmap(struct mm_struct *mm)
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
+	rlimit_cgroup_uncharge_as(mm, mm->total_vm);
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
 
@@ -2174,6 +2177,10 @@ int may_expand_vm(struct mm_struct *mm, 
 
 	if (cur + npages > lim)
 		return 0;
+
+	if (rlimit_cgroup_charge_as(mm, npages))
+		return 0;
+
 	return 1;
 }
 
@@ -2236,6 +2243,9 @@ int install_special_mapping(struct mm_st
 	if (unlikely(vma == NULL))
 		return -ENOMEM;
 
+	if (rlimit_cgroup_charge_as(mm, len >> PAGE_SHIFT))
+		return -ENOMEM;
+
 	vma->vm_mm = mm;
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
@@ -2248,6 +2258,7 @@ int install_special_mapping(struct mm_st
 
 	if (unlikely(insert_vm_struct(mm, vma))) {
 		kmem_cache_free(vm_area_cachep, vma);
+		rlimit_cgroup_uncharge_as(mm, len >> PAGE_SHIFT);
 		return -ENOMEM;
 	}
 
diff -puN arch/ia64/kernel/perfmon.c~rlimit-controller-address-space-accounting arch/ia64/kernel/perfmon.c
--- linux-2.6.25/arch/ia64/kernel/perfmon.c~rlimit-controller-address-space-accounting	2008-05-04 02:53:20.000000000 +0530
+++ linux-2.6.25-balbir/arch/ia64/kernel/perfmon.c	2008-05-04 02:53:20.000000000 +0530
@@ -40,6 +40,7 @@
 #include <linux/capability.h>
 #include <linux/rcupdate.h>
 #include <linux/completion.h>
+#include <linux/rlimitcgroup.h>
 
 #include <asm/errno.h>
 #include <asm/intrinsics.h>
@@ -2300,6 +2301,9 @@ pfm_smpl_buffer_alloc(struct task_struct
 	 * if ((mm->total_vm << PAGE_SHIFT) + len> task->rlim[RLIMIT_AS].rlim_cur)
 	 * 	return -ENOMEM;
 	 */
+	if (rlimit_cgroup_charge_as(mm, size >> PAGE_SHIFT))
+		return -ENOMEM;
+
 	if (size > task->signal->rlim[RLIMIT_MEMLOCK].rlim_cur)
 		return -ENOMEM;
 
@@ -2311,6 +2315,7 @@ pfm_smpl_buffer_alloc(struct task_struct
 	smpl_buf = pfm_rvmalloc(size);
 	if (smpl_buf == NULL) {
 		DPRINT(("Can't allocate sampling buffer\n"));
+		rlimit_cgroup_uncharge_as(mm, size >> PAGE_SHIFT);
 		return -ENOMEM;
 	}
 
@@ -2390,6 +2395,7 @@ pfm_smpl_buffer_alloc(struct task_struct
 error:
 	kmem_cache_free(vm_area_cachep, vma);
 error_kmem:
+	rlimit_cgroup_uncharge_as(mm, size >> PAGE_SHIFT);
 	pfm_rvfree(smpl_buf, size);
 
 	return -ENOMEM;
diff -puN arch/x86/kernel/ptrace.c~rlimit-controller-address-space-accounting arch/x86/kernel/ptrace.c
diff -puN fs/exec.c~rlimit-controller-address-space-accounting fs/exec.c
--- linux-2.6.25/fs/exec.c~rlimit-controller-address-space-accounting	2008-05-04 02:53:20.000000000 +0530
+++ linux-2.6.25-balbir/fs/exec.c	2008-05-04 02:53:20.000000000 +0530
@@ -51,6 +51,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/audit.h>
+#include <linux/rlimitcgroup.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -229,6 +230,9 @@ static int __bprm_mm_init(struct linux_b
 	if (!vma)
 		goto err;
 
+	if (rlimit_cgroup_charge_as(mm, 1))
+		goto err;
+
 	down_write(&mm->mmap_sem);
 	vma->vm_mm = mm;
 
diff -puN kernel/fork.c~rlimit-controller-address-space-accounting kernel/fork.c
--- linux-2.6.25/kernel/fork.c~rlimit-controller-address-space-accounting	2008-05-04 02:53:20.000000000 +0530
+++ linux-2.6.25-balbir/kernel/fork.c	2008-05-04 02:53:20.000000000 +0530
@@ -53,6 +53,7 @@
 #include <linux/tty.h>
 #include <linux/proc_fs.h>
 #include <linux/blkdev.h>
+#include <linux/rlimitcgroup.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -243,6 +244,7 @@ static int dup_mmap(struct mm_struct *mm
 			mm->total_vm -= pages;
 			vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
 								-pages);
+			rlimit_cgroup_uncharge_as(mm, pages);
 			continue;
 		}
 		charge = 0;
@@ -1053,6 +1055,15 @@ static struct task_struct *copy_process(
 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
+
+	/*
+	 * We need to duplicate the address space charges on fork
+	 */
+	if (current->mm && !(clone_flags & CLONE_VM)) {
+		if (rlimit_cgroup_charge_as(current->mm, current->mm->total_vm))
+			goto bad_fork_free;
+	}
+
 	retval = -EAGAIN;
 	if (atomic_read(&p->user->processes) >=
 			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
@@ -1406,6 +1417,7 @@ bad_fork_cleanup_count:
 	put_group_info(p->group_info);
 	atomic_dec(&p->user->processes);
 	free_uid(p->user);
+	rlimit_cgroup_uncharge_as(current->mm, current->mm->total_vm);
 bad_fork_free:
 	free_task(p);
 fork_out:
diff -puN mm/mremap.c~rlimit-controller-address-space-accounting mm/mremap.c
diff -puN arch/x86/kernel/ds.c~rlimit-controller-address-space-accounting arch/x86/kernel/ds.c
--- linux-2.6.25/arch/x86/kernel/ds.c~rlimit-controller-address-space-accounting	2008-05-04 02:53:20.000000000 +0530
+++ linux-2.6.25-balbir/arch/x86/kernel/ds.c	2008-05-04 02:53:20.000000000 +0530
@@ -29,6 +29,7 @@
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/rlimitcgroup.h>
 
 
 /*
@@ -348,9 +349,14 @@ static inline void *ds_allocate_buffer(s
 	if (rlim < vm)
 		return 0;
 
+	if (rlimit_cgroup_charge_as(current->mm, pgsz))
+		return 0;
+
 	buffer = kzalloc(size, GFP_KERNEL);
-	if (!buffer)
+	if (!buffer) {
+		rlimit_cgroup_uncharge_as(current->mm, pgsz);
 		return 0;
+	}
 
 	current->mm->total_vm  += pgsz;
 	current->mm->locked_vm += pgsz;
@@ -480,6 +486,8 @@ static int ds_release(struct task_struct
 	kfree(context->buffer[qual]);
 	context->buffer[qual] = 0;
 
+	rlimit_cgroup_uncharge_as(current->mm, context->pages[qual]);
+
 	current->mm->total_vm  -= context->pages[qual];
 	current->mm->locked_vm -= context->pages[qual];
 	context->pages[qual] = 0;
_

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [-mm][PATCH 4/4] Add rlimit controller documentation
  2008-05-03 21:37 [-mm][PATCH 0/4] Add rlimit controller to cgroups (v3) Balbir Singh
                   ` (2 preceding siblings ...)
  2008-05-03 21:38 ` [-mm][PATCH 3/4] Add rlimit controller accounting and control Balbir Singh
@ 2008-05-03 21:38 ` Balbir Singh
  2008-05-05 22:35   ` Andrew Morton
  2008-05-04 15:24 ` [-mm][PATCH 0/4] Add rlimit controller to cgroups (v3) kamezawa.hiroyu
  2008-05-04 15:27 ` kamezawa.hiroyu
  5 siblings, 1 reply; 32+ messages in thread
From: Balbir Singh @ 2008-05-03 21:38 UTC (permalink / raw)
  To: linux-mm
  Cc: Sudhir Kumar, YAMAMOTO Takashi, Paul Menage, lizf, linux-kernel,
	David Rientjes, Pavel Emelianov, Balbir Singh, Andrew Morton,
	KAMEZAWA Hiroyuki



This is the documentation patch. It describes the rlimit controller and how
to build and use it.

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---

 Documentation/controllers/rlimit.txt |   29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff -puN /dev/null Documentation/controllers/rlimit.txt
--- /dev/null	2008-05-03 22:12:13.033285313 +0530
+++ linux-2.6.25-balbir/Documentation/controllers/rlimit.txt	2008-05-04 03:06:06.000000000 +0530
@@ -0,0 +1,29 @@
+This controller is enabled by the CONFIG_CGROUP_RLIMIT_CTLR option. Prior
+to reading this documentation please read Documentation/cgroups.txt and
+Documentation/controllers/memory.txt. Several of the principles of this
+controller are similar to the memory resource controller.
+
+This controller framework is designed to be extensible to control any
+resource limit (memory related) with little effort.
+
+This new controller, controls the address space expansion of the tasks
+belonging to a cgroup. Address space control is provided along the same lines as
+RLIMIT_AS control, which is available via getrlimit(2)/setrlimit(2).
+The interface for controlling address space is provided through
+"rlimit.limit_in_bytes". The file is similar to "limit_in_bytes" w.r.t. the user
+interface. Please see section 3 of the memory resource controller documentation
+for more details on how to use the user interface to get and set values.
+
+The "rlimit.usage_in_bytes" file provides information about the total address
+space usage of the tasks in the cgroup, in bytes.
+
+Advantages of providing this feature
+
+1. Control over virtual address space allows for a cgroup to fail gracefully
+   i.e., via a malloc or mmap failure as compared to OOM kill when no
+   pages can be reclaimed.
+2. It provides better control over how many pages can be swapped out when
+   the cgroup goes over its limit. A badly setup cgroup can cause excessive
+   swapping. Providing control over the address space allocations ensures
+   that the system administrator has control over the total swapping that
+   can take place.
_

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 0/4] Add rlimit controller to cgroups (v3)
  2008-05-03 21:37 [-mm][PATCH 0/4] Add rlimit controller to cgroups (v3) Balbir Singh
                   ` (3 preceding siblings ...)
  2008-05-03 21:38 ` [-mm][PATCH 4/4] Add rlimit controller documentation Balbir Singh
@ 2008-05-04 15:24 ` kamezawa.hiroyu
  2008-05-05  4:21   ` Balbir Singh
  2008-05-04 15:27 ` kamezawa.hiroyu
  5 siblings, 1 reply; 32+ messages in thread
From: kamezawa.hiroyu @ 2008-05-04 15:24 UTC (permalink / raw)
  To: Balbir Singh
  Cc: linux-mm, Sudhir Kumar, YAMAMOTO Takashi, Paul Menage, lizf,
	linux-kernel, David Rientjes, Pavel Emelianov, Balbir Singh,
	Andrew Morton, KAMEZAWA Hiroyuki

>Subject: [-mm][PATCH 0/4] Add rlimit controller to cgroups (v3)
>
>
>This is the third version of the address space control patches. These
>patches are against 2.6.25-mm1  and have been tested using KVM in SMP mode,
>both with and without the config enabled.
>
>The first patch adds the user interface. The second patch fixes the
>cgroup mm_owner_changed callback to pass the task struct, so that
>accounting can be adjusted on owner changes. The thrid patch adds accounting
>and control. The fourth patch updates documentation.
>
>An earlier post of the patchset can be found at
>http://lwn.net/Articles/275143/
>
>This patch is built on top of the mm owner patches and utilizes that feature
>to virtually group tasks by mm_struct.
>
>Reviews, Comments?
>

I can't read the whole patch deeply now but this new concept "rlimit-controlle
r" seems make sense to me.

At quick glance, I have some thoughts.

1. kerner/rlimit_cgroup.c is better for future expansion.
2. why 
   "+This controller framework is designed to be extensible to control any
   "+resource limit (memory related) with little effort."
   memory only ? Ok, all you want to do is related to memory, but someone
   may want to limit RLIMIT_CPU by group or RLIMIT_CORE by group or....
   (I have no plan but they seems useful.;)
   So, could you add design hint of rlimit contoller to the documentation ?
   
3. Rleated to 2. Showing what kind of "rlimit" params are supported by
   cgroup will be good.

I don't think you have to implement all things at once. Staring from
"only RLIMIT_AS is supported now" is good. Someone will expand it if
he needs. But showing basic view of "gerenal purpose rlimit contoller" in _doc
ument_ or _comments_ or _codes_ is a good thing to do.

If you don't want to provide RLIMIT feature other than address space,
it's better to avoid using the name of RLIMIT. It's confusing.

Thanks,
-Kame






^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: Re: [-mm][PATCH 0/4] Add rlimit controller to cgroups (v3)
  2008-05-03 21:37 [-mm][PATCH 0/4] Add rlimit controller to cgroups (v3) Balbir Singh
                   ` (4 preceding siblings ...)
  2008-05-04 15:24 ` [-mm][PATCH 0/4] Add rlimit controller to cgroups (v3) kamezawa.hiroyu
@ 2008-05-04 15:27 ` kamezawa.hiroyu
  2008-05-05  4:24   ` Balbir Singh
  5 siblings, 1 reply; 32+ messages in thread
From: kamezawa.hiroyu @ 2008-05-04 15:27 UTC (permalink / raw)
  To: kamezawa.hiroyu
  Cc: Balbir Singh, linux-mm, Sudhir Kumar, YAMAMOTO Takashi,
	Paul Menage, lizf, linux-kernel, David Rientjes, Pavel Emelianov,
	Balbir Singh, Andrew Morton, KAMEZAWA Hiroyuki


>   "+This controller framework is designed to be extensible to control any
>   "+resource limit (memory related) with little effort."
>   memory only ? Ok, all you want to do is related to memory, but someone
>   may want to limit RLIMIT_CPU by group or RLIMIT_CORE by group or....
>   (I have no plan but they seems useful.;)
...RLIMIT_MEMLOCK is in my want-to-do-list ;)

Thanks,
-Kame

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 0/4] Add rlimit controller to cgroups (v3)
  2008-05-04 15:24 ` [-mm][PATCH 0/4] Add rlimit controller to cgroups (v3) kamezawa.hiroyu
@ 2008-05-05  4:21   ` Balbir Singh
  2008-05-07  1:09     ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 32+ messages in thread
From: Balbir Singh @ 2008-05-05  4:21 UTC (permalink / raw)
  To: kamezawa.hiroyu
  Cc: linux-mm, Sudhir Kumar, YAMAMOTO Takashi, Paul Menage, lizf,
	linux-kernel, David Rientjes, Pavel Emelianov, Andrew Morton

kamezawa.hiroyu@jp.fujitsu.com wrote:
>>
>> This is the third version of the address space control patches. These
>> patches are against 2.6.25-mm1  and have been tested using KVM in SMP mode,
>> both with and without the config enabled.
>>
>> The first patch adds the user interface. The second patch fixes the
>> cgroup mm_owner_changed callback to pass the task struct, so that
>> accounting can be adjusted on owner changes. The thrid patch adds accounting
>> and control. The fourth patch updates documentation.
>>
>> An earlier post of the patchset can be found at
>> http://lwn.net/Articles/275143/
>>
>> This patch is built on top of the mm owner patches and utilizes that feature
>> to virtually group tasks by mm_struct.
>>
>> Reviews, Comments?
>>
> 
> I can't read the whole patch deeply now but this new concept "rlimit-controlle
> r" seems make sense to me.
> 
> At quick glance, I have some thoughts.
> 
> 1. kerner/rlimit_cgroup.c is better for future expansion.

I have no problem with that name, I can rename the files.

> 2. why 
>    "+This controller framework is designed to be extensible to control any
>    "+resource limit (memory related) with little effort."
>    memory only ? Ok, all you want to do is related to memory, but someone
>    may want to limit RLIMIT_CPU by group or RLIMIT_CORE by group or....
>    (I have no plan but they seems useful.;)

I currently mentioned memory, since we have the infrastructure to group using
mm->owner infrastructure. For other purposes, we'll need to enhance the
controller quite a bit. That is why I put memory related in brackets.

>    So, could you add design hint of rlimit contoller to the documentation ?
>    

OK, I'll update the documentation

> 3. Rleated to 2. Showing what kind of "rlimit" params are supported by
>    cgroup will be good.
> 

Do you mean in init/Kconfig or documentation?. I should probably rename
limit_in_bytes and usage_in_bytes to add an as_ prefix, so that the UI clearly
shows what is supported as well.

> I don't think you have to implement all things at once. Staring from
> "only RLIMIT_AS is supported now" is good. Someone will expand it if
> he needs. But showing basic view of "gerenal purpose rlimit contoller" in _doc
> ument_ or _comments_ or _codes_ is a good thing to do.
> 

I can add to the documentation

> If you don't want to provide RLIMIT feature other than address space,
> it's better to avoid using the name of RLIMIT. It's confusing.
> 

I used RLIMIT since I want to extend it later to control memory locked pages :)
I open to other names as well.

> Thanks,
> -Kame
> 
> 
> 
> 
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>


-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 0/4] Add rlimit controller to cgroups (v3)
  2008-05-04 15:27 ` kamezawa.hiroyu
@ 2008-05-05  4:24   ` Balbir Singh
  0 siblings, 0 replies; 32+ messages in thread
From: Balbir Singh @ 2008-05-05  4:24 UTC (permalink / raw)
  To: kamezawa.hiroyu
  Cc: linux-mm, Sudhir Kumar, YAMAMOTO Takashi, Paul Menage, lizf,
	linux-kernel, David Rientjes, Pavel Emelianov, Andrew Morton

kamezawa.hiroyu@jp.fujitsu.com wrote:
>>   "+This controller framework is designed to be extensible to control any
>>   "+resource limit (memory related) with little effort."
>>   memory only ? Ok, all you want to do is related to memory, but someone
>>   may want to limit RLIMIT_CPU by group or RLIMIT_CORE by group or....
>>   (I have no plan but they seems useful.;)
> ...RLIMIT_MEMLOCK is in my want-to-do-list ;)
> 

Mine too, but I want to get to it after the hierarchy and soft limit patches.

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 1/4] Setup the rlimit controller
  2008-05-03 21:37 ` [-mm][PATCH 1/4] Setup the rlimit controller Balbir Singh
@ 2008-05-05 22:11   ` Andrew Morton
  2008-05-06  3:40     ` Balbir Singh
  2008-05-06  1:31   ` Li Zefan
  1 sibling, 1 reply; 32+ messages in thread
From: Andrew Morton @ 2008-05-05 22:11 UTC (permalink / raw)
  To: Balbir Singh
  Cc: linux-mm, skumar, yamamoto, menage, lizf, linux-kernel, rientjes,
	xemul, balbir, kamezawa.hiroyu

On Sun, 04 May 2008 03:07:36 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> +	*tmp = ((*tmp + PAGE_SIZE) >> PAGE_SHIFT) << PAGE_SHIFT;

Whatever this is doing, it should not be doing it this way ;)

perhaps

	*tmp = ALIGN(*tmp, PAGE_SIZE);

or even

	*tmp = PAGE_ALIGN(*tmp);

?


<looks at PAGE_ALIGN>

Each architecture implements its own version and they of course do it
differently.  It's crying out for a consolidated implementation but we have
no include/linux/page.h into which to consolidate it.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 2/4] Enhance cgroup mm_owner_changed callback to add task information
  2008-05-03 21:38 ` [-mm][PATCH 2/4] Enhance cgroup mm_owner_changed callback to add task information Balbir Singh
@ 2008-05-05 22:15   ` Andrew Morton
  2008-05-06  3:43     ` Balbir Singh
  2008-05-05 23:00   ` Paul Menage
  1 sibling, 1 reply; 32+ messages in thread
From: Andrew Morton @ 2008-05-05 22:15 UTC (permalink / raw)
  To: Balbir Singh
  Cc: linux-mm, skumar, yamamoto, menage, lizf, linux-kernel, rientjes,
	xemul, balbir, kamezawa.hiroyu

On Sun, 04 May 2008 03:08:04 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> 
> 
> This patch adds an additional field to the mm_owner callbacks. This field
> is required to get to the mm that changed.
> 
> Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
> ---
> 
>  include/linux/cgroup.h |    3 ++-
>  kernel/cgroup.c        |    2 +-
>  2 files changed, 3 insertions(+), 2 deletions(-)
> 
> diff -puN kernel/cgroup.c~cgroup-add-task-to-mm--owner-callbacks kernel/cgroup.c
> --- linux-2.6.25/kernel/cgroup.c~cgroup-add-task-to-mm--owner-callbacks	2008-05-04 02:53:05.000000000 +0530
> +++ linux-2.6.25-balbir/kernel/cgroup.c	2008-05-04 02:53:05.000000000 +0530
> @@ -2772,7 +2772,7 @@ void cgroup_mm_owner_callbacks(struct ta
>  			if (oldcgrp == newcgrp)
>  				continue;
>  			if (ss->mm_owner_changed)
> -				ss->mm_owner_changed(ss, oldcgrp, newcgrp);
> +				ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
>  		}
>  	}
>  }
> diff -puN include/linux/cgroup.h~cgroup-add-task-to-mm--owner-callbacks include/linux/cgroup.h
> --- linux-2.6.25/include/linux/cgroup.h~cgroup-add-task-to-mm--owner-callbacks	2008-05-04 02:53:05.000000000 +0530
> +++ linux-2.6.25-balbir/include/linux/cgroup.h	2008-05-04 02:53:05.000000000 +0530
> @@ -310,7 +310,8 @@ struct cgroup_subsys {
>  	 */
>  	void (*mm_owner_changed)(struct cgroup_subsys *ss,
>  					struct cgroup *old,
> -					struct cgroup *new);
> +					struct cgroup *new,
> +					struct task_struct *p);

If mm_owner_changed() had any documentation I'd suggest that it be updated.
Sneaky.

The existing comment:

	/*
	 * This routine is called with the task_lock of mm->owner held
	 */
	void (*mm_owner_changed)(struct cgroup_subsys *ss,
					struct cgroup *old,
					struct cgroup *new);

Is rather mysterious.  To what mm does it refer?

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 3/4] Add rlimit controller accounting and control
  2008-05-03 21:38 ` [-mm][PATCH 3/4] Add rlimit controller accounting and control Balbir Singh
@ 2008-05-05 22:24   ` Andrew Morton
  2008-05-05 22:32     ` David Rientjes
  2008-05-06  5:34     ` Balbir Singh
  2008-05-07  3:17   ` Paul Menage
  2008-05-07  3:29   ` Paul Menage
  2 siblings, 2 replies; 32+ messages in thread
From: Andrew Morton @ 2008-05-05 22:24 UTC (permalink / raw)
  To: Balbir Singh
  Cc: linux-mm, skumar, yamamoto, menage, lizf, linux-kernel, rientjes,
	xemul, balbir, kamezawa.hiroyu

On Sun, 04 May 2008 03:08:14 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> +	if (res_counter_charge(&rcg->as_res, (mm->total_vm << PAGE_SHIFT)))

I worry a bit about all the conversion between page-counts and byte-counts
in this code.

For example, what happens if a process sits there increasing its rss with
sbrk(4095) or sbrk(4097) or all sorts of other scenarios?  Do we get in a
situation in which the accounting is systematically wrong?

Worse, do we risk getting into that situation in the future, as unrelated
changes are made to the surrounding code?

IOW, have we chosen the best, most maintainable representation for these
things?


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 3/4] Add rlimit controller accounting and control
  2008-05-05 22:24   ` Andrew Morton
@ 2008-05-05 22:32     ` David Rientjes
  2008-05-06  5:34     ` Balbir Singh
  1 sibling, 0 replies; 32+ messages in thread
From: David Rientjes @ 2008-05-05 22:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Balbir Singh, linux-mm, skumar, yamamoto, menage, lizf,
	linux-kernel, xemul, kamezawa.hiroyu

On Mon, 5 May 2008, Andrew Morton wrote:

> IOW, have we chosen the best, most maintainable representation for these
> things?
> 

There was discussion early on in the development of the memory controller 
that bytes were going to be used as the base unit for accounting.  I had 
disagreed in favor of kB since page sizes are always in these increments 
and historically the kernel has exported statistics this way via 
/proc/meminfo.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 4/4] Add rlimit controller documentation
  2008-05-03 21:38 ` [-mm][PATCH 4/4] Add rlimit controller documentation Balbir Singh
@ 2008-05-05 22:35   ` Andrew Morton
  2008-05-06  5:39     ` Balbir Singh
  0 siblings, 1 reply; 32+ messages in thread
From: Andrew Morton @ 2008-05-05 22:35 UTC (permalink / raw)
  To: Balbir Singh
  Cc: linux-mm, skumar, yamamoto, menage, lizf, linux-kernel, rientjes,
	xemul, balbir, kamezawa.hiroyu

On Sun, 04 May 2008 03:08:25 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> 
> 
> This is the documentation patch. It describes the rlimit controller and how
> to build and use it.
> 
> Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
> ---
> 
>  Documentation/controllers/rlimit.txt |   29 +++++++++++++++++++++++++++++
>  1 file changed, 29 insertions(+)
> 
> diff -puN /dev/null Documentation/controllers/rlimit.txt
> --- /dev/null	2008-05-03 22:12:13.033285313 +0530
> +++ linux-2.6.25-balbir/Documentation/controllers/rlimit.txt	2008-05-04 03:06:06.000000000 +0530
> @@ -0,0 +1,29 @@
> +This controller is enabled by the CONFIG_CGROUP_RLIMIT_CTLR option. Prior
> +to reading this documentation please read Documentation/cgroups.txt and
> +Documentation/controllers/memory.txt. Several of the principles of this
> +controller are similar to the memory resource controller.
> +
> +This controller framework is designed to be extensible to control any
> +resource limit (memory related) with little effort.
> +
> +This new controller, controls the address space expansion of the tasks
> +belonging to a cgroup. Address space control is provided along the same lines as
> +RLIMIT_AS control, which is available via getrlimit(2)/setrlimit(2).
> +The interface for controlling address space is provided through
> +"rlimit.limit_in_bytes". The file is similar to "limit_in_bytes" w.r.t. the user
> +interface. Please see section 3 of the memory resource controller documentation
> +for more details on how to use the user interface to get and set values.
> +
> +The "rlimit.usage_in_bytes" file provides information about the total address
> +space usage of the tasks in the cgroup, in bytes.

Finally, with a bit of between-the-line reading, I begin to understand what
this stuff is actually supposed to do.

It puts an upper limit upon the _total_ address-space size of all the mms
which are contained within the resource group, yes?

(can am mm be shared by two threads whcih are in different resource groups,
btw?)

> +Advantages of providing this feature
> +
> +1. Control over virtual address space allows for a cgroup to fail gracefully
> +   i.e., via a malloc or mmap failure as compared to OOM kill when no
> +   pages can be reclaimed.
> +2. It provides better control over how many pages can be swapped out when
> +   the cgroup goes over its limit. A badly setup cgroup can cause excessive
> +   swapping. Providing control over the address space allocations ensures
> +   that the system administrator has control over the total swapping that
> +   can take place.

Here's another missing piece: what is the kernel's behaviour when such a
limit is increased?  Seems that the sole option is a failure return from
mmap/brk/sbrk/etc, yes?

This should be spelled out in careful detail, please.  This is a
newly-proposed kernel<->userspace interface and we care about those very
much.

Finally, I worry about overflows.  afacit the
sum-of-address-space-sizes-for-a-cgroup is accounted for in an unsigned
long?

If so, a 32-bit machine could easily overflow it.

And a 64-bit machine could possibly do so with a bit of effort, perhaps? 
That's assuming that the code doesn't attempt to avoid duplicate accounting
due to multiple-mms-mapping-the-same-pages, which afaict appears to be the
case.  (Then again, perhaps no machine will ever have the pagetable space
to get that far).



Ho hum, I had to do rather a lot of guesswork here to try to understand
your proposed overall design for this feature.  I'd prefer to hear about
your design via more direct means.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 2/4] Enhance cgroup mm_owner_changed callback to add task information
  2008-05-03 21:38 ` [-mm][PATCH 2/4] Enhance cgroup mm_owner_changed callback to add task information Balbir Singh
  2008-05-05 22:15   ` Andrew Morton
@ 2008-05-05 23:00   ` Paul Menage
  1 sibling, 0 replies; 32+ messages in thread
From: Paul Menage @ 2008-05-05 23:00 UTC (permalink / raw)
  To: Balbir Singh
  Cc: linux-mm, Sudhir Kumar, YAMAMOTO Takashi, lizf, linux-kernel,
	David Rientjes, Pavel Emelianov, Andrew Morton,
	KAMEZAWA Hiroyuki

As Andrew suggested, can you improve the documentation? Ideally, there
should be a paragraph in Documentation/cgroups.txt that describes the
circumstances (including locking state) in which the callback is
called.

Paul

On Sat, May 3, 2008 at 2:38 PM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
>
>  This patch adds an additional field to the mm_owner callbacks. This field
>  is required to get to the mm that changed.
>
>  Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
>  ---
>
>   include/linux/cgroup.h |    3 ++-
>   kernel/cgroup.c        |    2 +-
>   2 files changed, 3 insertions(+), 2 deletions(-)
>
>  diff -puN kernel/cgroup.c~cgroup-add-task-to-mm--owner-callbacks kernel/cgroup.c
>  --- linux-2.6.25/kernel/cgroup.c~cgroup-add-task-to-mm--owner-callbacks 2008-05-04 02:53:05.000000000 +0530
>  +++ linux-2.6.25-balbir/kernel/cgroup.c 2008-05-04 02:53:05.000000000 +0530
>  @@ -2772,7 +2772,7 @@ void cgroup_mm_owner_callbacks(struct ta
>                         if (oldcgrp == newcgrp)
>                                 continue;
>                         if (ss->mm_owner_changed)
>  -                               ss->mm_owner_changed(ss, oldcgrp, newcgrp);
>  +                               ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
>                 }
>         }
>   }
>  diff -puN include/linux/cgroup.h~cgroup-add-task-to-mm--owner-callbacks include/linux/cgroup.h
>  --- linux-2.6.25/include/linux/cgroup.h~cgroup-add-task-to-mm--owner-callbacks  2008-05-04 02:53:05.000000000 +0530
>  +++ linux-2.6.25-balbir/include/linux/cgroup.h  2008-05-04 02:53:05.000000000 +0530
>  @@ -310,7 +310,8 @@ struct cgroup_subsys {
>          */
>         void (*mm_owner_changed)(struct cgroup_subsys *ss,
>                                         struct cgroup *old,
>  -                                       struct cgroup *new);
>  +                                       struct cgroup *new,
>  +                                       struct task_struct *p);
>         int subsys_id;
>         int active;
>         int disabled;
>  _
>
>  --
>         Warm Regards,
>         Balbir Singh
>         Linux Technology Center
>         IBM, ISTL
>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 1/4] Setup the rlimit controller
  2008-05-03 21:37 ` [-mm][PATCH 1/4] Setup the rlimit controller Balbir Singh
  2008-05-05 22:11   ` Andrew Morton
@ 2008-05-06  1:31   ` Li Zefan
  2008-05-06  8:15     ` Balbir Singh
  1 sibling, 1 reply; 32+ messages in thread
From: Li Zefan @ 2008-05-06  1:31 UTC (permalink / raw)
  To: Balbir Singh
  Cc: linux-mm, Sudhir Kumar, YAMAMOTO Takashi, Paul Menage,
	linux-kernel, David Rientjes, Pavel Emelianov, Andrew Morton,
	KAMEZAWA Hiroyuki

Balbir Singh wrote:
> +struct cgroup_subsys rlimit_cgroup_subsys;
> +
> +struct rlimit_cgroup {
> +	struct cgroup_subsys_state css;
> +	struct res_counter as_res;	/* address space counter */
> +};
> +
> +static struct rlimit_cgroup init_rlimit_cgroup;
> +
> +struct rlimit_cgroup *rlimit_cgroup_from_cgrp(struct cgroup *cgrp)

It can be static if I don't miss anything.

> +{
> +	return container_of(cgroup_subsys_state(cgrp, rlimit_cgroup_subsys_id),
> +				struct rlimit_cgroup, css);
> +}
> +


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 1/4] Setup the rlimit controller
  2008-05-05 22:11   ` Andrew Morton
@ 2008-05-06  3:40     ` Balbir Singh
  0 siblings, 0 replies; 32+ messages in thread
From: Balbir Singh @ 2008-05-06  3:40 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, skumar, yamamoto, menage, lizf, linux-kernel, rientjes,
	xemul, kamezawa.hiroyu

Andrew Morton wrote:
> On Sun, 04 May 2008 03:07:36 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> 
>> +	*tmp = ((*tmp + PAGE_SIZE) >> PAGE_SHIFT) << PAGE_SHIFT;
> 
> Whatever this is doing, it should not be doing it this way ;)
> 
> perhaps
> 
> 	*tmp = ALIGN(*tmp, PAGE_SIZE);
> 
> or even
> 
> 	*tmp = PAGE_ALIGN(*tmp);
> 
> ?
> 

Good point, thanks for catching this.

> 
> <looks at PAGE_ALIGN>
> 
> Each architecture implements its own version and they of course do it
> differently.  It's crying out for a consolidated implementation but we have
> no include/linux/page.h into which to consolidate it.

May be we can move this to asm-generic/page.h?


-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 2/4] Enhance cgroup mm_owner_changed callback to add task information
  2008-05-05 22:15   ` Andrew Morton
@ 2008-05-06  3:43     ` Balbir Singh
  0 siblings, 0 replies; 32+ messages in thread
From: Balbir Singh @ 2008-05-06  3:43 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, skumar, yamamoto, menage, lizf, linux-kernel, rientjes,
	xemul, kamezawa.hiroyu

Andrew Morton wrote:
> On Sun, 04 May 2008 03:08:04 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> 
>>
>> This patch adds an additional field to the mm_owner callbacks. This field
>> is required to get to the mm that changed.
>>
>> Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
>> ---
>>
>>  include/linux/cgroup.h |    3 ++-
>>  kernel/cgroup.c        |    2 +-
>>  2 files changed, 3 insertions(+), 2 deletions(-)
>>
>> diff -puN kernel/cgroup.c~cgroup-add-task-to-mm--owner-callbacks kernel/cgroup.c
>> --- linux-2.6.25/kernel/cgroup.c~cgroup-add-task-to-mm--owner-callbacks	2008-05-04 02:53:05.000000000 +0530
>> +++ linux-2.6.25-balbir/kernel/cgroup.c	2008-05-04 02:53:05.000000000 +0530
>> @@ -2772,7 +2772,7 @@ void cgroup_mm_owner_callbacks(struct ta
>>  			if (oldcgrp == newcgrp)
>>  				continue;
>>  			if (ss->mm_owner_changed)
>> -				ss->mm_owner_changed(ss, oldcgrp, newcgrp);
>> +				ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
>>  		}
>>  	}
>>  }
>> diff -puN include/linux/cgroup.h~cgroup-add-task-to-mm--owner-callbacks include/linux/cgroup.h
>> --- linux-2.6.25/include/linux/cgroup.h~cgroup-add-task-to-mm--owner-callbacks	2008-05-04 02:53:05.000000000 +0530
>> +++ linux-2.6.25-balbir/include/linux/cgroup.h	2008-05-04 02:53:05.000000000 +0530
>> @@ -310,7 +310,8 @@ struct cgroup_subsys {
>>  	 */
>>  	void (*mm_owner_changed)(struct cgroup_subsys *ss,
>>  					struct cgroup *old,
>> -					struct cgroup *new);
>> +					struct cgroup *new,
>> +					struct task_struct *p);
> 
> If mm_owner_changed() had any documentation I'd suggest that it be updated.
> Sneaky.
> 

No, there's no documentation besides the comments. I'll go ahead and update
cgroups.txt with some documentation.

> The existing comment:
> 
> 	/*
> 	 * This routine is called with the task_lock of mm->owner held
> 	 */
> 	void (*mm_owner_changed)(struct cgroup_subsys *ss,
> 					struct cgroup *old,
> 					struct cgroup *new);
> 
> Is rather mysterious.  To what mm does it refer?

This callback is called when the mm->owner field that points to/owns a cgroup
changes as a result of the owner exiting.

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 3/4] Add rlimit controller accounting and control
  2008-05-05 22:24   ` Andrew Morton
  2008-05-05 22:32     ` David Rientjes
@ 2008-05-06  5:34     ` Balbir Singh
  1 sibling, 0 replies; 32+ messages in thread
From: Balbir Singh @ 2008-05-06  5:34 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, skumar, yamamoto, menage, lizf, linux-kernel, rientjes,
	xemul, kamezawa.hiroyu

Andrew Morton wrote:
> On Sun, 04 May 2008 03:08:14 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> 
>> +	if (res_counter_charge(&rcg->as_res, (mm->total_vm << PAGE_SHIFT)))
> 
> I worry a bit about all the conversion between page-counts and byte-counts
> in this code.
> 
> For example, what happens if a process sits there increasing its rss with
> sbrk(4095) or sbrk(4097) or all sorts of other scenarios?  Do we get in a
> situation in which the accounting is systematically wrong?
> 

We already do all our accounting in pages for total_vm (field of mm_struct).
task_vsize() for example multiplies PAGE_SIZE with total_vm before returning the
result.

> Worse, do we risk getting into that situation in the future, as unrelated
> changes are made to the surrounding code?
> 

I can't see that happening, but I'll look again and request reviewers to help me
identify any such problems that can occur.

> IOW, have we chosen the best, most maintainable representation for these
> things?
> 

That's a good question. From the sustenance point of view, resource counters
have worked really well so far. Abstracting accounting and tracking from the
controllers has been a good thing. One of the goals of the rlimit controller is
to keep it open for extension, so that others can add their own control for
other resources like mlock'ed pages.

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 4/4] Add rlimit controller documentation
  2008-05-05 22:35   ` Andrew Morton
@ 2008-05-06  5:39     ` Balbir Singh
  2008-05-06  5:54       ` Andrew Morton
  0 siblings, 1 reply; 32+ messages in thread
From: Balbir Singh @ 2008-05-06  5:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, skumar, yamamoto, menage, lizf, linux-kernel, rientjes,
	xemul, kamezawa.hiroyu

Andrew Morton wrote:
> On Sun, 04 May 2008 03:08:25 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> 
>>
>> This is the documentation patch. It describes the rlimit controller and how
>> to build and use it.
>>
>> Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
>> ---
>>
>>  Documentation/controllers/rlimit.txt |   29 +++++++++++++++++++++++++++++
>>  1 file changed, 29 insertions(+)
>>
>> diff -puN /dev/null Documentation/controllers/rlimit.txt
>> --- /dev/null	2008-05-03 22:12:13.033285313 +0530
>> +++ linux-2.6.25-balbir/Documentation/controllers/rlimit.txt	2008-05-04 03:06:06.000000000 +0530
>> @@ -0,0 +1,29 @@
>> +This controller is enabled by the CONFIG_CGROUP_RLIMIT_CTLR option. Prior
>> +to reading this documentation please read Documentation/cgroups.txt and
>> +Documentation/controllers/memory.txt. Several of the principles of this
>> +controller are similar to the memory resource controller.
>> +
>> +This controller framework is designed to be extensible to control any
>> +resource limit (memory related) with little effort.
>> +
>> +This new controller, controls the address space expansion of the tasks
>> +belonging to a cgroup. Address space control is provided along the same lines as
>> +RLIMIT_AS control, which is available via getrlimit(2)/setrlimit(2).
>> +The interface for controlling address space is provided through
>> +"rlimit.limit_in_bytes". The file is similar to "limit_in_bytes" w.r.t. the user
>> +interface. Please see section 3 of the memory resource controller documentation
>> +for more details on how to use the user interface to get and set values.
>> +
>> +The "rlimit.usage_in_bytes" file provides information about the total address
>> +space usage of the tasks in the cgroup, in bytes.
> 
> Finally, with a bit of between-the-line reading, I begin to understand what
> this stuff is actually supposed to do.
> 
> It puts an upper limit upon the _total_ address-space size of all the mms
> which are contained within the resource group, yes?
> 

Yesm true

> (can am mm be shared by two threads whcih are in different resource groups,
> btw?)
> 

Yes, that can happen, but all the charges go to mm->owner (the thread group leader).

>> +Advantages of providing this feature
>> +
>> +1. Control over virtual address space allows for a cgroup to fail gracefully
>> +   i.e., via a malloc or mmap failure as compared to OOM kill when no
>> +   pages can be reclaimed.
>> +2. It provides better control over how many pages can be swapped out when
>> +   the cgroup goes over its limit. A badly setup cgroup can cause excessive
>> +   swapping. Providing control over the address space allocations ensures
>> +   that the system administrator has control over the total swapping that
>> +   can take place.
> 
> Here's another missing piece: what is the kernel's behaviour when such a
> limit is increased?  Seems that the sole option is a failure return from

Do you mean limit is exceeded?

> mmap/brk/sbrk/etc, yes?
> 

If so, Yes, true.

> This should be spelled out in careful detail, please.  This is a
> newly-proposed kernel<->userspace interface and we care about those very
> much.
> 

Sure, I'll document that better.

> Finally, I worry about overflows.  afacit the
> sum-of-address-space-sizes-for-a-cgroup is accounted for in an unsigned
> long?
> 
> If so, a 32-bit machine could easily overflow it.
> 

We use an unsigned long long on all architectures to avoid overflow.

> And a 64-bit machine could possibly do so with a bit of effort, perhaps? 
> That's assuming that the code doesn't attempt to avoid duplicate accounting
> due to multiple-mms-mapping-the-same-pages, which afaict appears to be the
> case.  (Then again, perhaps no machine will ever have the pagetable space
> to get that far).
> 
> 

True

> 
> Ho hum, I had to do rather a lot of guesswork here to try to understand
> your proposed overall design for this feature.  I'd prefer to hear about
> your design via more direct means.

Do you have any suggestions on how to do that better. Would you like
documentation to be the first patch in the series? I had sent out two RFC's
earlier and got comments and feedback from several people.

Having said that, I do agree that design is the most vital thing of any patchset
and communicating that up front and better is critical. I am open to any
suggestions to help make that process better.


-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 4/4] Add rlimit controller documentation
  2008-05-06  5:39     ` Balbir Singh
@ 2008-05-06  5:54       ` Andrew Morton
  2008-05-06  7:59         ` Balbir Singh
  0 siblings, 1 reply; 32+ messages in thread
From: Andrew Morton @ 2008-05-06  5:54 UTC (permalink / raw)
  To: balbir
  Cc: linux-mm, skumar, yamamoto, menage, lizf, linux-kernel, rientjes,
	xemul, kamezawa.hiroyu

On Tue, 06 May 2008 11:09:52 +0530 Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> > Ho hum, I had to do rather a lot of guesswork here to try to understand
> > your proposed overall design for this feature.  I'd prefer to hear about
> > your design via more direct means.
> 
> Do you have any suggestions on how to do that better. Would you like
> documentation to be the first patch in the series? I had sent out two RFC's
> earlier and got comments and feedback from several people.
> 

I do like to see the overall what-i-am-setting-out-to-do description in
there somewhere - sometimes a Docuemtation/ file is appropriate, other
times do it via changelog.

But the first part of the review is reviewing whatever it is which you set
out to achieve.  Once that's understood and sounds like a good idea then we
can start looking at how you did it.



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 4/4] Add rlimit controller documentation
  2008-05-06  5:54       ` Andrew Morton
@ 2008-05-06  7:59         ` Balbir Singh
  0 siblings, 0 replies; 32+ messages in thread
From: Balbir Singh @ 2008-05-06  7:59 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, skumar, yamamoto, menage, lizf, linux-kernel, rientjes,
	xemul, kamezawa.hiroyu

Andrew Morton wrote:
> On Tue, 06 May 2008 11:09:52 +0530 Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> 
>>> Ho hum, I had to do rather a lot of guesswork here to try to understand
>>> your proposed overall design for this feature.  I'd prefer to hear about
>>> your design via more direct means.
>> Do you have any suggestions on how to do that better. Would you like
>> documentation to be the first patch in the series? I had sent out two RFC's
>> earlier and got comments and feedback from several people.
>>
> 
> I do like to see the overall what-i-am-setting-out-to-do description in
> there somewhere - sometimes a Docuemtation/ file is appropriate, other
> times do it via changelog.
> 

I think having documentation upfront does make sense in that case. I'll also try
and make the changelogs more verbose. I usually try to point to the previous
discussions in the introduction patch.

> But the first part of the review is reviewing whatever it is which you set
> out to achieve.  Once that's understood and sounds like a good idea then we
> can start looking at how you did it.
> 
> 

Yes, I agree.

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 1/4] Setup the rlimit controller
  2008-05-06  1:31   ` Li Zefan
@ 2008-05-06  8:15     ` Balbir Singh
  0 siblings, 0 replies; 32+ messages in thread
From: Balbir Singh @ 2008-05-06  8:15 UTC (permalink / raw)
  To: Li Zefan
  Cc: linux-mm, Sudhir Kumar, YAMAMOTO Takashi, Paul Menage,
	linux-kernel, David Rientjes, Pavel Emelianov, Andrew Morton,
	KAMEZAWA Hiroyuki

Li Zefan wrote:
> Balbir Singh wrote:
>> +struct cgroup_subsys rlimit_cgroup_subsys;
>> +
>> +struct rlimit_cgroup {
>> +	struct cgroup_subsys_state css;
>> +	struct res_counter as_res;	/* address space counter */
>> +};
>> +
>> +static struct rlimit_cgroup init_rlimit_cgroup;
>> +
>> +struct rlimit_cgroup *rlimit_cgroup_from_cgrp(struct cgroup *cgrp)
> 
> It can be static if I don't miss anything.

Yes, it can be. Thanks!

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 0/4] Add rlimit controller to cgroups (v3)
  2008-05-05  4:21   ` Balbir Singh
@ 2008-05-07  1:09     ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 32+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-05-07  1:09 UTC (permalink / raw)
  To: balbir
  Cc: linux-mm, Sudhir Kumar, YAMAMOTO Takashi, Paul Menage, lizf,
	linux-kernel, David Rientjes, Pavel Emelianov, Andrew Morton

On Mon, 05 May 2008 09:51:19 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > 3. Rleated to 2. Showing what kind of "rlimit" params are supported by
> >    cgroup will be good.
> > 
> 
> Do you mean in init/Kconfig or documentation?. I should probably rename
> limit_in_bytes and usage_in_bytes to add an as_ prefix, so that the UI clearly
> shows what is supported as well.
I see.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 3/4] Add rlimit controller accounting and control
  2008-05-03 21:38 ` [-mm][PATCH 3/4] Add rlimit controller accounting and control Balbir Singh
  2008-05-05 22:24   ` Andrew Morton
@ 2008-05-07  3:17   ` Paul Menage
  2008-05-07  5:59     ` Pavel Emelyanov
  2008-05-08 14:54     ` Balbir Singh
  2008-05-07  3:29   ` Paul Menage
  2 siblings, 2 replies; 32+ messages in thread
From: Paul Menage @ 2008-05-07  3:17 UTC (permalink / raw)
  To: Balbir Singh
  Cc: linux-mm, Sudhir Kumar, YAMAMOTO Takashi, lizf, linux-kernel,
	David Rientjes, Pavel Emelianov, Andrew Morton,
	KAMEZAWA Hiroyuki

On Sat, May 3, 2008 at 2:38 PM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>  +
>  +int rlimit_cgroup_charge_as(struct mm_struct *mm, unsigned long nr_pages)
>  +{
>  +       int ret;
>  +       struct rlimit_cgroup *rcg;
>  +
>  +       rcu_read_lock();
>  +       rcg = rlimit_cgroup_from_task(rcu_dereference(mm->owner));
>  +       css_get(&rcg->css);
>  +       rcu_read_unlock();
>  +
>  +       ret = res_counter_charge(&rcg->as_res, (nr_pages << PAGE_SHIFT));
>  +       css_put(&rcg->css);
>  +       return ret;
>  +}

You need to synchronize against mm->owner changing, or
mm->owner->cgroups changing. How about:

int rlimit_cgroup_charge_as(struct mm_struct *mm, unsigned long nr_pages)
{
  int ret;
  struct rlimit_cgroup *rcg;
  struct task_struct *owner;

  rcu_read_lock();
 again:

  /* Find and lock the owner of the mm */
  owner = rcu_dereference(mm->owner);
  task_lock(owner);
  if (mm->owner != owner) {
    task_unlock(owner);
    goto again;
  }

  /* Charge the owner's cgroup with the new memory */
  rcg = rlimit_cgroup_from_task(owner);
  ret = res_counter_charge(&rcg->as_res, (nr_pages << PAGE_SHIFT));
  task_unlock(owner);
  rcu_read_unlock();
  return ret;
}

>  +
>  +void rlimit_cgroup_uncharge_as(struct mm_struct *mm, unsigned long nr_pages)
>  +{
>  +       struct rlimit_cgroup *rcg;
>  +
>  +       rcu_read_lock();
>  +       rcg = rlimit_cgroup_from_task(rcu_dereference(mm->owner));
>  +       css_get(&rcg->css);
>  +       rcu_read_unlock();
>  +
>  +       res_counter_uncharge(&rcg->as_res, (nr_pages << PAGE_SHIFT));
>  +       css_put(&rcg->css);
>  +}

Can't this be implemented as just a call to charge() with a negative
value? (Possibly fixing res_counter_charge() to handle negative values
if necessary) Seems simpler.

>
>  +/*
>  + * TODO: get the attach callbacks to fail and disallow task movement.
>  + */

You mean disallow all movement within a hierarchy that has this cgroup
mounted? Doesn't that make it rather hard to use?

>  +static void rlimit_cgroup_move_task(struct cgroup_subsys *ss,
>  +                                       struct cgroup *cgrp,
>  +                                       struct cgroup *old_cgrp,
>  +                                       struct task_struct *p)
>  +{
>  +       struct mm_struct *mm;
>  +       struct rlimit_cgroup *rcg, *old_rcg;
>  +
>  +       mm = get_task_mm(p);
>  +       if (mm == NULL)
>  +               return;
>  +
>  +       rcu_read_lock();
>  +       if (p != rcu_dereference(mm->owner))
>  +               goto out;
>  +
>  +       rcg = rlimit_cgroup_from_cgrp(cgrp);
>  +       old_rcg = rlimit_cgroup_from_cgrp(old_cgrp);
>  +
>  +       if (rcg == old_rcg)
>  +               goto out;
>  +
>  +       if (res_counter_charge(&rcg->as_res, (mm->total_vm << PAGE_SHIFT)))
>  +               goto out;
>  +       res_counter_uncharge(&old_rcg->as_res, (mm->total_vm << PAGE_SHIFT));
>  +out:
>  +       rcu_read_unlock();
>  +       mmput(mm);
>  +}
>  +

Since you need to protect against concurrent charges, and against
concurrent mm ownership changes, I think you should just do this under
task_lock(p).

>  +static void rlimit_cgroup_mm_owner_changed(struct cgroup_subsys *ss,
>  +                                               struct cgroup *cgrp,
>  +                                               struct cgroup *old_cgrp,
>  +                                               struct task_struct *p)
>  +{
>  +       struct rlimit_cgroup *rcg, *old_rcg;
>  +       struct mm_struct *mm = get_task_mm(p);
>  +
>  +       BUG_ON(!mm);
>  +       rcg = rlimit_cgroup_from_cgrp(cgrp);
>  +       old_rcg = rlimit_cgroup_from_cgrp(old_cgrp);
>  +       if (res_counter_charge(&rcg->as_res, (mm->total_vm << PAGE_SHIFT)))
>  +               goto out;
>  +       res_counter_uncharge(&old_rcg->as_res, (mm->total_vm << PAGE_SHIFT));
>  +out:
>  +       mmput(mm);
>  +}
>  +

Also needs to task_lock(p) to prevent concurrent charges or cgroup
reassignments?

Paul

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 3/4] Add rlimit controller accounting and control
  2008-05-03 21:38 ` [-mm][PATCH 3/4] Add rlimit controller accounting and control Balbir Singh
  2008-05-05 22:24   ` Andrew Morton
  2008-05-07  3:17   ` Paul Menage
@ 2008-05-07  3:29   ` Paul Menage
  2008-05-08 14:35     ` Balbir Singh
  2 siblings, 1 reply; 32+ messages in thread
From: Paul Menage @ 2008-05-07  3:29 UTC (permalink / raw)
  To: Balbir Singh
  Cc: linux-mm, Sudhir Kumar, YAMAMOTO Takashi, lizf, linux-kernel,
	David Rientjes, Pavel Emelianov, Andrew Morton,
	KAMEZAWA Hiroyuki

On Sat, May 3, 2008 at 2:38 PM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
>
>  This patch adds support for accounting and control of virtual address space
>  limits. The accounting is done via the rlimit_cgroup_(un)charge_as functions.
>  The core of the accounting takes place during fork time in copy_process(),
>  may_expand_vm(), remove_vma_list() and exit_mmap(). There are some special
>  cases that are handled here as well (arch/ia64/kernel/perform.c,
>  arch/x86/kernel/ptrace.c, insert_special_mapping())
>

The basic idea of the patches looks fine (apart from some
synchronization issues) but Is calling this the "rlimit" controller a
great idea? That implies that it handles all (or at least many) of the
things that setrlimit()/getrlimit() handle.

While some of the other rlimit things definitely do make sense as
cgroup controllers, putting them all in the same controller doesn't
really - paying for the address-space tracking overhead just to get,
say, the equivalent of RLIMIT_NPROC (max tasks) isn't a great idea.

Can you instead give this a name that somehow refers to virtual
address space limits, e.g. "va" or "as". That would still fit if you
expanded it to deal with locked virtual address space limits too.

I think that an "rlimit" controller would probably be best for
representing just those limits that don't really make sense when
aggregated across different tasks, but apply separately to each task
(e.g. RLIMIT_FSIZE, RLIMIT_CORE, RLIMIT_NICE, RLIMIT_NOFILE,
RLIMIT_RTPRIO, RLIMIT_STACK, RLIMIT_SIGPENDING, and maybe RLIMIT_CPU),
in order to provide an easy way to change these limits on a group of
running tasks.

On a separate note for the address-space tracking, ideally the
subsystem would track whether or not it was bound to a hierarchy, and
skip charging/uncharging if not. That way there's no (noticeable)
overhead for compiling in the subsystem but not using it. At the point
when the subsystem was bound to a hierarchy, it could at that point
run through all mms and charge each one's existing address space to
the appropriate cgroup. (Currently that would only be the root cgroup
in the hierarchy).

Paul

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 3/4] Add rlimit controller accounting and control
  2008-05-07  3:17   ` Paul Menage
@ 2008-05-07  5:59     ` Pavel Emelyanov
  2008-05-08 14:54     ` Balbir Singh
  1 sibling, 0 replies; 32+ messages in thread
From: Pavel Emelyanov @ 2008-05-07  5:59 UTC (permalink / raw)
  To: Paul Menage
  Cc: Balbir Singh, linux-mm, Sudhir Kumar, YAMAMOTO Takashi, lizf,
	linux-kernel, David Rientjes, Andrew Morton, KAMEZAWA Hiroyuki

Paul Menage wrote:
> On Sat, May 3, 2008 at 2:38 PM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>>  +
>>  +int rlimit_cgroup_charge_as(struct mm_struct *mm, unsigned long nr_pages)
>>  +{
>>  +       int ret;
>>  +       struct rlimit_cgroup *rcg;
>>  +
>>  +       rcu_read_lock();
>>  +       rcg = rlimit_cgroup_from_task(rcu_dereference(mm->owner));
>>  +       css_get(&rcg->css);
>>  +       rcu_read_unlock();
>>  +
>>  +       ret = res_counter_charge(&rcg->as_res, (nr_pages << PAGE_SHIFT));
>>  +       css_put(&rcg->css);
>>  +       return ret;
>>  +}
> 
> You need to synchronize against mm->owner changing, or
> mm->owner->cgroups changing. How about:
> 
> int rlimit_cgroup_charge_as(struct mm_struct *mm, unsigned long nr_pages)
> {
>   int ret;
>   struct rlimit_cgroup *rcg;
>   struct task_struct *owner;
> 
>   rcu_read_lock();
>  again:
> 
>   /* Find and lock the owner of the mm */
>   owner = rcu_dereference(mm->owner);
>   task_lock(owner);
>   if (mm->owner != owner) {
>     task_unlock(owner);
>     goto again;
>   }
> 
>   /* Charge the owner's cgroup with the new memory */
>   rcg = rlimit_cgroup_from_task(owner);
>   ret = res_counter_charge(&rcg->as_res, (nr_pages << PAGE_SHIFT));
>   task_unlock(owner);
>   rcu_read_unlock();
>   return ret;
> }
> 
>>  +
>>  +void rlimit_cgroup_uncharge_as(struct mm_struct *mm, unsigned long nr_pages)
>>  +{
>>  +       struct rlimit_cgroup *rcg;
>>  +
>>  +       rcu_read_lock();
>>  +       rcg = rlimit_cgroup_from_task(rcu_dereference(mm->owner));
>>  +       css_get(&rcg->css);
>>  +       rcu_read_unlock();
>>  +
>>  +       res_counter_uncharge(&rcg->as_res, (nr_pages << PAGE_SHIFT));
>>  +       css_put(&rcg->css);
>>  +}
> 
> Can't this be implemented as just a call to charge() with a negative
> value? (Possibly fixing res_counter_charge() to handle negative values
> if necessary) Seems simpler.

No, I'd keep two calls - charge and uncharge. This makes you sure that
the code xxx_charge(value) is a charge, regardless of what the "value"
is. Besides, xxx_charge returns an error code, you need to check (BTW, I
think we should add a __must_check attribute there), while uncharge does
not.

>>  +/*
>>  + * TODO: get the attach callbacks to fail and disallow task movement.
>>  + */
> 
> You mean disallow all movement within a hierarchy that has this cgroup
> mounted? Doesn't that make it rather hard to use?
> 
>>  +static void rlimit_cgroup_move_task(struct cgroup_subsys *ss,
>>  +                                       struct cgroup *cgrp,
>>  +                                       struct cgroup *old_cgrp,
>>  +                                       struct task_struct *p)
>>  +{
>>  +       struct mm_struct *mm;
>>  +       struct rlimit_cgroup *rcg, *old_rcg;
>>  +
>>  +       mm = get_task_mm(p);
>>  +       if (mm == NULL)
>>  +               return;
>>  +
>>  +       rcu_read_lock();
>>  +       if (p != rcu_dereference(mm->owner))
>>  +               goto out;
>>  +
>>  +       rcg = rlimit_cgroup_from_cgrp(cgrp);
>>  +       old_rcg = rlimit_cgroup_from_cgrp(old_cgrp);
>>  +
>>  +       if (rcg == old_rcg)
>>  +               goto out;
>>  +
>>  +       if (res_counter_charge(&rcg->as_res, (mm->total_vm << PAGE_SHIFT)))
>>  +               goto out;
>>  +       res_counter_uncharge(&old_rcg->as_res, (mm->total_vm << PAGE_SHIFT));
>>  +out:
>>  +       rcu_read_unlock();
>>  +       mmput(mm);
>>  +}
>>  +
> 
> Since you need to protect against concurrent charges, and against
> concurrent mm ownership changes, I think you should just do this under
> task_lock(p).
> 
>>  +static void rlimit_cgroup_mm_owner_changed(struct cgroup_subsys *ss,
>>  +                                               struct cgroup *cgrp,
>>  +                                               struct cgroup *old_cgrp,
>>  +                                               struct task_struct *p)
>>  +{
>>  +       struct rlimit_cgroup *rcg, *old_rcg;
>>  +       struct mm_struct *mm = get_task_mm(p);
>>  +
>>  +       BUG_ON(!mm);
>>  +       rcg = rlimit_cgroup_from_cgrp(cgrp);
>>  +       old_rcg = rlimit_cgroup_from_cgrp(old_cgrp);
>>  +       if (res_counter_charge(&rcg->as_res, (mm->total_vm << PAGE_SHIFT)))
>>  +               goto out;
>>  +       res_counter_uncharge(&old_rcg->as_res, (mm->total_vm << PAGE_SHIFT));
>>  +out:
>>  +       mmput(mm);
>>  +}
>>  +
> 
> Also needs to task_lock(p) to prevent concurrent charges or cgroup
> reassignments?
> 
> Paul
> 


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 3/4] Add rlimit controller accounting and control
  2008-05-07  3:29   ` Paul Menage
@ 2008-05-08 14:35     ` Balbir Singh
  2008-05-08 21:45       ` Paul Menage
  0 siblings, 1 reply; 32+ messages in thread
From: Balbir Singh @ 2008-05-08 14:35 UTC (permalink / raw)
  To: Paul Menage
  Cc: linux-mm, Sudhir Kumar, YAMAMOTO Takashi, lizf, linux-kernel,
	David Rientjes, Pavel Emelianov, Andrew Morton,
	KAMEZAWA Hiroyuki

Paul Menage wrote:
> On Sat, May 3, 2008 at 2:38 PM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>>
>>  This patch adds support for accounting and control of virtual address space
>>  limits. The accounting is done via the rlimit_cgroup_(un)charge_as functions.
>>  The core of the accounting takes place during fork time in copy_process(),
>>  may_expand_vm(), remove_vma_list() and exit_mmap(). There are some special
>>  cases that are handled here as well (arch/ia64/kernel/perform.c,
>>  arch/x86/kernel/ptrace.c, insert_special_mapping())
>>
> 
> The basic idea of the patches looks fine (apart from some
> synchronization issues) but Is calling this the "rlimit" controller a
> great idea? That implies that it handles all (or at least many) of the
> things that setrlimit()/getrlimit() handle.
> 
> While some of the other rlimit things definitely do make sense as
> cgroup controllers, putting them all in the same controller doesn't
> really - paying for the address-space tracking overhead just to get,
> say, the equivalent of RLIMIT_NPROC (max tasks) isn't a great idea.
> 
> Can you instead give this a name that somehow refers to virtual
> address space limits, e.g. "va" or "as". That would still fit if you
> expanded it to deal with locked virtual address space limits too.
> 
> I think that an "rlimit" controller would probably be best for
> representing just those limits that don't really make sense when
> aggregated across different tasks, but apply separately to each task
> (e.g. RLIMIT_FSIZE, RLIMIT_CORE, RLIMIT_NICE, RLIMIT_NOFILE,
> RLIMIT_RTPRIO, RLIMIT_STACK, RLIMIT_SIGPENDING, and maybe RLIMIT_CPU),
> in order to provide an easy way to change these limits on a group of
> running tasks.
> 

I currently intend to use this controller for controlling memory related
rlimits, like address space and mlock'ed memory. How about we use something like
"memrlimit"?


> On a separate note for the address-space tracking, ideally the
> subsystem would track whether or not it was bound to a hierarchy, and
> skip charging/uncharging if not. That way there's no (noticeable)
> overhead for compiling in the subsystem but not using it. At the point
> when the subsystem was bound to a hierarchy, it could at that point
> run through all mms and charge each one's existing address space to
> the appropriate cgroup. (Currently that would only be the root cgroup
> in the hierarchy).

Good suggestion, but it will be hard if not impossible to account the data
correctly as it changes, if we do the accounting/summation at bind time. We'll
need a really big lock to do it, something I want to avoid. Did you have
something else in mind?


-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 3/4] Add rlimit controller accounting and control
  2008-05-07  3:17   ` Paul Menage
  2008-05-07  5:59     ` Pavel Emelyanov
@ 2008-05-08 14:54     ` Balbir Singh
  2008-05-08 23:22       ` Paul Menage
  1 sibling, 1 reply; 32+ messages in thread
From: Balbir Singh @ 2008-05-08 14:54 UTC (permalink / raw)
  To: Paul Menage
  Cc: linux-mm, Sudhir Kumar, YAMAMOTO Takashi, lizf, linux-kernel,
	David Rientjes, Pavel Emelianov, Andrew Morton,
	KAMEZAWA Hiroyuki

Paul Menage wrote:
> On Sat, May 3, 2008 at 2:38 PM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>>  +
>>  +int rlimit_cgroup_charge_as(struct mm_struct *mm, unsigned long nr_pages)
>>  +{
>>  +       int ret;
>>  +       struct rlimit_cgroup *rcg;
>>  +
>>  +       rcu_read_lock();
>>  +       rcg = rlimit_cgroup_from_task(rcu_dereference(mm->owner));
>>  +       css_get(&rcg->css);
>>  +       rcu_read_unlock();
>>  +
>>  +       ret = res_counter_charge(&rcg->as_res, (nr_pages << PAGE_SHIFT));
>>  +       css_put(&rcg->css);
>>  +       return ret;
>>  +}
> 
> You need to synchronize against mm->owner changing, or
> mm->owner->cgroups changing. How about:
> 
> int rlimit_cgroup_charge_as(struct mm_struct *mm, unsigned long nr_pages)
> {
>   int ret;
>   struct rlimit_cgroup *rcg;
>   struct task_struct *owner;
> 
>   rcu_read_lock();
>  again:
> 
>   /* Find and lock the owner of the mm */
>   owner = rcu_dereference(mm->owner);
>   task_lock(owner);
>   if (mm->owner != owner) {
>     task_unlock(owner);
>     goto again;
>   }
> 
>   /* Charge the owner's cgroup with the new memory */
>   rcg = rlimit_cgroup_from_task(owner);
>   ret = res_counter_charge(&rcg->as_res, (nr_pages << PAGE_SHIFT));
>   task_unlock(owner);
>   rcu_read_unlock();
>   return ret;
> }
> 

My mind goes blank at times, so forgive me asking, what happens if we don't use
task_lock(). mm->owner cannot be freed, even if it changes, we get the callback
in mm_owner_changed(). The locations from where we call _charge and _uncharge,
we know that the mm is not going to change either.

>>  +
>>  +void rlimit_cgroup_uncharge_as(struct mm_struct *mm, unsigned long nr_pages)
>>  +{
>>  +       struct rlimit_cgroup *rcg;
>>  +
>>  +       rcu_read_lock();
>>  +       rcg = rlimit_cgroup_from_task(rcu_dereference(mm->owner));
>>  +       css_get(&rcg->css);
>>  +       rcu_read_unlock();
>>  +
>>  +       res_counter_uncharge(&rcg->as_res, (nr_pages << PAGE_SHIFT));
>>  +       css_put(&rcg->css);
>>  +}
> 
> Can't this be implemented as just a call to charge() with a negative
> value? (Possibly fixing res_counter_charge() to handle negative values
> if necessary) Seems simpler.
> 

I did that earlier, but then Pavel suggested splitting it up as charge/uncharge.
I found that his suggestion helped make the code more readable.

>>  +/*
>>  + * TODO: get the attach callbacks to fail and disallow task movement.
>>  + */
> 
> You mean disallow all movement within a hierarchy that has this cgroup
> mounted? Doesn't that make it rather hard to use?
> 

Consider the following scenario

We try to move task "t1" from cgroup "A" to cgroup "B".
Doing so, causes "B" to go over it's limit, what do we do?
Ideally, we would like to be able to go back to cgroups and say, please fail
attach, since that causes "B" to go over it's specified limit.

>>  +static void rlimit_cgroup_move_task(struct cgroup_subsys *ss,
>>  +                                       struct cgroup *cgrp,
>>  +                                       struct cgroup *old_cgrp,
>>  +                                       struct task_struct *p)
>>  +{
>>  +       struct mm_struct *mm;
>>  +       struct rlimit_cgroup *rcg, *old_rcg;
>>  +
>>  +       mm = get_task_mm(p);
>>  +       if (mm == NULL)
>>  +               return;
>>  +
>>  +       rcu_read_lock();
>>  +       if (p != rcu_dereference(mm->owner))
>>  +               goto out;
>>  +
>>  +       rcg = rlimit_cgroup_from_cgrp(cgrp);
>>  +       old_rcg = rlimit_cgroup_from_cgrp(old_cgrp);
>>  +
>>  +       if (rcg == old_rcg)
>>  +               goto out;
>>  +
>>  +       if (res_counter_charge(&rcg->as_res, (mm->total_vm << PAGE_SHIFT)))
>>  +               goto out;
>>  +       res_counter_uncharge(&old_rcg->as_res, (mm->total_vm << PAGE_SHIFT));
>>  +out:
>>  +       rcu_read_unlock();
>>  +       mmput(mm);
>>  +}
>>  +
> 
> Since you need to protect against concurrent charges, and against
> concurrent mm ownership changes, I think you should just do this under
> task_lock(p).

Please see my comment above on task_lock(). I'll draw some diagrams on the
whiteboard and check the sanity of my argument.

> 
>>  +static void rlimit_cgroup_mm_owner_changed(struct cgroup_subsys *ss,
>>  +                                               struct cgroup *cgrp,
>>  +                                               struct cgroup *old_cgrp,
>>  +                                               struct task_struct *p)
>>  +{
>>  +       struct rlimit_cgroup *rcg, *old_rcg;
>>  +       struct mm_struct *mm = get_task_mm(p);
>>  +
>>  +       BUG_ON(!mm);
>>  +       rcg = rlimit_cgroup_from_cgrp(cgrp);
>>  +       old_rcg = rlimit_cgroup_from_cgrp(old_cgrp);
>>  +       if (res_counter_charge(&rcg->as_res, (mm->total_vm << PAGE_SHIFT)))
>>  +               goto out;
>>  +       res_counter_uncharge(&old_rcg->as_res, (mm->total_vm << PAGE_SHIFT));
>>  +out:
>>  +       mmput(mm);
>>  +}
>>  +
> 
> Also needs to task_lock(p) to prevent concurrent charges or cgroup
> reassignments?
> 

The callbacks are called with task_lock() held. Please see
mm_update_next_owner->cgroup_mm_owner_callbacks() in kernel/exit.c

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 3/4] Add rlimit controller accounting and control
  2008-05-08 14:35     ` Balbir Singh
@ 2008-05-08 21:45       ` Paul Menage
  2008-05-09 13:35         ` Balbir Singh
  0 siblings, 1 reply; 32+ messages in thread
From: Paul Menage @ 2008-05-08 21:45 UTC (permalink / raw)
  To: balbir
  Cc: linux-mm, Sudhir Kumar, YAMAMOTO Takashi, lizf, linux-kernel,
	David Rientjes, Pavel Emelianov, Andrew Morton,
	KAMEZAWA Hiroyuki

On Thu, May 8, 2008 at 7:35 AM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
>  I currently intend to use this controller for controlling memory related
>  rlimits, like address space and mlock'ed memory. How about we use something like
>  "memrlimit"?

Sounds reasonable.

>
>  Good suggestion, but it will be hard if not impossible to account the data
>  correctly as it changes, if we do the accounting/summation at bind time. We'll
>  need a really big lock to do it, something I want to avoid. Did you have
>  something else in mind?

Yes, it'll be tricky but I think worthwhile. I believe it can be done
without the charge/uncharge code needing to take a global lock, except
for when we're actually binding/unbinding, with careful use of RCU.

My first thought for how to do this was that we have a field
"bind_transition" that indicates whether we're transitioning between
bound and unbound, and a bind_mutex. By default the charge/unpath uses
RCU, but by marking that we're in a transition state, the charge path
will use the mutex instead. By waiting for all existing chargers that
are using RCU to exit, we can then take the lock and synchronize with
the chargers.

So the charge/uncharge path would do:

  rcu_read_lock();
  if (ss->tranistioning) {
    rcu_read_unlock();
    locked = 1;
    mutex_lock(&ss->bind_mutex);
  }
  if (ss->active) {
    /* do charge/uncharge stuff, which must not block */
  }
  if (locked) {
    mutex_unlock(&ss->bind_mutex);
  } else {
    rcu_read_unlock();
  }

and the bind path would do something like:

ss->transitioning = 1;
synchronize_rcu();
mutex_lock(&ss->bind_mutex);
for_each_mm(mm) {
  down_read(&mm->mmap_sem);
  add_charge_for_mm();
  up_read(&mm->mmap_sem);
}
mutex_unlock(&ss->bind_mutex);
ss->transitioning = 0;

But this would break because we're nesting mmap_sem inside bind_mutex
in the bind path, but in the charge path we're nesting bind_mutex
inside mmap_sem. So we'd probably need to define a new bit
MMF_RLIMIT_ACCOUNTED in mm->flags to indicate whether that mm's
address space usage is accounted for. Once we've done that, we can use
mmap_sem to synchronize changes to the per-mm charged status for free,
since we already hold mmap_sem whenever we're doing the charging,
right? So it becomes simple:

charge path:

if (!test_bit(MMF_RLIMIT_ACCOUNTED, &mm->flags))
  return 0;
/* do charge/uncharge stuff */

bind path:

while((mm = find_unaccounted_mm()) {
  down_write(&mm->mmap_sem);
  add_charge_for_mm();
  set_bit(MMF_RLIMIT_ACCOUNTED, &mm->flags);
  up_write(&mm->mmap_sem);
}

Paul

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 3/4] Add rlimit controller accounting and control
  2008-05-08 14:54     ` Balbir Singh
@ 2008-05-08 23:22       ` Paul Menage
  0 siblings, 0 replies; 32+ messages in thread
From: Paul Menage @ 2008-05-08 23:22 UTC (permalink / raw)
  To: balbir
  Cc: linux-mm, Sudhir Kumar, YAMAMOTO Takashi, lizf, linux-kernel,
	David Rientjes, Pavel Emelianov, Andrew Morton,
	KAMEZAWA Hiroyuki

On Thu, May 8, 2008 at 7:54 AM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
> Paul Menage wrote:
>  > On Sat, May 3, 2008 at 2:38 PM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>  >>  +
>  >>  +int rlimit_cgroup_charge_as(struct mm_struct *mm, unsigned long nr_pages)
>  >>  +{
>  >>  +       int ret;
>  >>  +       struct rlimit_cgroup *rcg;
>  >>  +
>  >>  +       rcu_read_lock();
>  >>  +       rcg = rlimit_cgroup_from_task(rcu_dereference(mm->owner));
>  >>  +       css_get(&rcg->css);
>  >>  +       rcu_read_unlock();
>  >>  +
>  >>  +       ret = res_counter_charge(&rcg->as_res, (nr_pages << PAGE_SHIFT));
>  >>  +       css_put(&rcg->css);
>  >>  +       return ret;
>  >>  +}
>  >
>  > You need to synchronize against mm->owner changing, or
>  > mm->owner->cgroups changing. How about:
>  >
>
>  My mind goes blank at times, so forgive me asking, what happens if we don't use
>  task_lock(). mm->owner cannot be freed, even if it changes, we get the callback
>  in mm_owner_changed(). The locations from where we call _charge and _uncharge,
>  we know that the mm is not going to change either.

I guess I'm concerned about a race like:

A and B are threads in cgroup G, and C is a different process
A->mm->owner == B

A: enter rlimit_cgroup_charge_as()
A: charge new page to G
C: enter attach_task(newG, B)
C: update B->cgroup to point to newG
C: call memrlimit->attach(G, newG, B)
C: charge mm->total_vm to newG
C: uncharge mm->total_vm from G
A: add new page to mm->total_vm

Maybe this can be solved very simply by just taking mm->mmap_sem in
rlimit_cgroup_move_task() and rlimit_cgroup_mm_owner_changed() ? Since
mmap_sem is (I hope) held across all operations that change
mm->total_vm

>
>>  Consider the following scenario
>
>  We try to move task "t1" from cgroup "A" to cgroup "B".
>  Doing so, causes "B" to go over it's limit, what do we do?
>  Ideally, we would like to be able to go back to cgroups and say, please fail
>  attach, since that causes "B" to go over it's specified limit.
>

OK, that sounds reasonable - that's what the can_attach() callback is for.

Paul

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [-mm][PATCH 3/4] Add rlimit controller accounting and control
  2008-05-08 21:45       ` Paul Menage
@ 2008-05-09 13:35         ` Balbir Singh
  0 siblings, 0 replies; 32+ messages in thread
From: Balbir Singh @ 2008-05-09 13:35 UTC (permalink / raw)
  To: Paul Menage
  Cc: linux-mm, Sudhir Kumar, YAMAMOTO Takashi, lizf, linux-kernel,
	David Rientjes, Pavel Emelianov, Andrew Morton,
	KAMEZAWA Hiroyuki

Paul Menage wrote:
> On Thu, May 8, 2008 at 7:35 AM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>>  I currently intend to use this controller for controlling memory related
>>  rlimits, like address space and mlock'ed memory. How about we use something like
>>  "memrlimit"?
> 
> Sounds reasonable.
> 
>>  Good suggestion, but it will be hard if not impossible to account the data
>>  correctly as it changes, if we do the accounting/summation at bind time. We'll
>>  need a really big lock to do it, something I want to avoid. Did you have
>>  something else in mind?
> 
> Yes, it'll be tricky but I think worthwhile. I believe it can be done
> without the charge/uncharge code needing to take a global lock, except
> for when we're actually binding/unbinding, with careful use of RCU.
> 

[snip]

This is an optimization that I am willing to consider later in the project. At
first I want to focus on functionality. I would like to optimize once I know
that the functionality has been well tested by a good base of users and make
sure that the optimization is real.

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2008-05-09 13:35 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-05-03 21:37 [-mm][PATCH 0/4] Add rlimit controller to cgroups (v3) Balbir Singh
2008-05-03 21:37 ` [-mm][PATCH 1/4] Setup the rlimit controller Balbir Singh
2008-05-05 22:11   ` Andrew Morton
2008-05-06  3:40     ` Balbir Singh
2008-05-06  1:31   ` Li Zefan
2008-05-06  8:15     ` Balbir Singh
2008-05-03 21:38 ` [-mm][PATCH 2/4] Enhance cgroup mm_owner_changed callback to add task information Balbir Singh
2008-05-05 22:15   ` Andrew Morton
2008-05-06  3:43     ` Balbir Singh
2008-05-05 23:00   ` Paul Menage
2008-05-03 21:38 ` [-mm][PATCH 3/4] Add rlimit controller accounting and control Balbir Singh
2008-05-05 22:24   ` Andrew Morton
2008-05-05 22:32     ` David Rientjes
2008-05-06  5:34     ` Balbir Singh
2008-05-07  3:17   ` Paul Menage
2008-05-07  5:59     ` Pavel Emelyanov
2008-05-08 14:54     ` Balbir Singh
2008-05-08 23:22       ` Paul Menage
2008-05-07  3:29   ` Paul Menage
2008-05-08 14:35     ` Balbir Singh
2008-05-08 21:45       ` Paul Menage
2008-05-09 13:35         ` Balbir Singh
2008-05-03 21:38 ` [-mm][PATCH 4/4] Add rlimit controller documentation Balbir Singh
2008-05-05 22:35   ` Andrew Morton
2008-05-06  5:39     ` Balbir Singh
2008-05-06  5:54       ` Andrew Morton
2008-05-06  7:59         ` Balbir Singh
2008-05-04 15:24 ` [-mm][PATCH 0/4] Add rlimit controller to cgroups (v3) kamezawa.hiroyu
2008-05-05  4:21   ` Balbir Singh
2008-05-07  1:09     ` KAMEZAWA Hiroyuki
2008-05-04 15:27 ` kamezawa.hiroyu
2008-05-05  4:24   ` Balbir Singh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).