linux-kernel.vger.kernel.org archive mirror
* [PATCH][RFC] dirty balancing for cgroups
@ 2008-07-09  6:00 YAMAMOTO Takashi
  2008-07-10 23:54 ` KAMEZAWA Hiroyuki
  2008-07-14 13:37 ` Peter Zijlstra
  0 siblings, 2 replies; 19+ messages in thread
From: YAMAMOTO Takashi @ 2008-07-09  6:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: containers, a.p.zijlstra, menage, kamezawa.hiroyu

hi,

the following patch is a simple implementation of
dirty balancing for cgroups.  any comments?

it depends on the following fix:
	http://lkml.org/lkml/2008/7/8/428

YAMAMOTO Takashi


Signed-off-by: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
---

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 23c02e2..f5453cc 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -52,3 +52,9 @@ SUBSYS(memrlimit_cgroup)
 #endif
 
 /* */
+
+#ifdef CONFIG_CGROUP_MEMDIRTYLIMIT_CTLR
+SUBSYS(memdirtylimit_cgroup)
+#endif
+
+/* */
diff --git a/include/linux/memdirtylimitcgroup.h b/include/linux/memdirtylimitcgroup.h
new file mode 100644
index 0000000..667d312
--- /dev/null
+++ b/include/linux/memdirtylimitcgroup.h
@@ -0,0 +1,47 @@
+
+/*
+ * memdirtylimitcgroup.h COPYRIGHT FUJITSU LIMITED 2008
+ *
+ * Author: yamamoto@valinux.co.jp
+ */
+
+struct task_struct;
+
+#if defined(CONFIG_CGROUP_MEMDIRTYLIMIT_CTLR)
+
+void memdirtylimitcgroup_dirty_inc(struct task_struct *);
+void memdirtylimitcgroup_dirty_limit(struct task_struct *, long *);
+void memdirtylimitcgroup_change_shift(int);
+void memdirtylimitcgroup_init(int);
+
+#else /* defined(CONFIG_CGROUP_MEMDIRTYLIMIT_CTLR) */
+
+static inline void
+memdirtylimitcgroup_dirty_inc(struct task_struct *t)
+{
+
+	/* nothing */
+}
+
+static inline void
+memdirtylimitcgroup_dirty_limit(struct task_struct *t, long *dirtyp)
+{
+
+	/* nothing */
+}
+
+static inline void
+memdirtylimitcgroup_change_shift(int shift)
+{
+
+	/* nothing */
+}
+
+static inline void
+memdirtylimitcgroup_init(int shift)
+{
+
+	/* nothing */
+}
+
+#endif /* defined(CONFIG_CGROUP_MEMDIRTYLIMIT_CTLR) */
diff --git a/init/Kconfig b/init/Kconfig
index 162d462..985bac8 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -418,6 +418,12 @@ config CGROUP_MEMRLIMIT_CTLR
 	  memory RSS and Page Cache control. Virtual address space control
 	  is provided by this controller.
 
+config CGROUP_MEMDIRTYLIMIT_CTLR
+	bool "Memory Dirty Limit Controller for Control Groups"
+	depends on CGROUPS && RESOURCE_COUNTERS
+	help
+	  XXX TBD
+
 config SYSFS_DEPRECATED
 	bool
 
diff --git a/mm/Makefile b/mm/Makefile
index f54232d..8603d19 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -35,4 +35,5 @@ obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
 obj-$(CONFIG_CGROUP_MEMRLIMIT_CTLR) += memrlimitcgroup.o
+obj-$(CONFIG_CGROUP_MEMDIRTYLIMIT_CTLR) += memdirtylimitcgroup.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
diff --git a/mm/memdirtylimitcgroup.c b/mm/memdirtylimitcgroup.c
new file mode 100644
index 0000000..b70b33d
--- /dev/null
+++ b/mm/memdirtylimitcgroup.c
@@ -0,0 +1,179 @@
+
+/*
+ * memdirtylimitcgroup.c COPYRIGHT FUJITSU LIMITED 2008
+ *
+ * Author: yamamoto@valinux.co.jp
+ */
+
+#include <linux/err.h>
+#include <linux/cgroup.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+#include <linux/memdirtylimitcgroup.h>
+
+#include <asm/div64.h>
+
+static struct prop_descriptor vm_cgroup_dirties;
+
+struct memdirtylimit_cgroup {
+	struct cgroup_subsys_state dlcg_css;
+	spinlock_t dlcg_lock;
+	struct prop_local_single dlcg_dirties;
+};
+
+static struct cgroup_subsys_state *
+task_to_css(struct task_struct *task)
+{
+
+	return task_subsys_state(task, memdirtylimit_cgroup_subsys_id);
+}
+
+static struct memdirtylimit_cgroup *
+css_to_dlcg(struct cgroup_subsys_state *css)
+{
+
+	return container_of(css, struct memdirtylimit_cgroup, dlcg_css);
+}
+
+static struct cgroup_subsys_state *
+cg_to_css(struct cgroup *cg)
+{
+
+	return cgroup_subsys_state(cg, memdirtylimit_cgroup_subsys_id);
+}
+
+static struct memdirtylimit_cgroup *
+cg_to_dlcg(struct cgroup *cg)
+{
+
+	return css_to_dlcg(cg_to_css(cg));
+}
+
+/* ---------------------------------------- */
+
+static void
+getfraction(struct memdirtylimit_cgroup *dlcg, long *numeratorp,
+    long *denominatorp)
+{
+
+	spin_lock(&dlcg->dlcg_lock);
+	prop_fraction_single(&vm_cgroup_dirties, &dlcg->dlcg_dirties,
+	   numeratorp, denominatorp);
+	spin_unlock(&dlcg->dlcg_lock);
+}
+
+/* ---------------------------------------- */
+
+void
+memdirtylimitcgroup_dirty_inc(struct task_struct *t)
+{
+	struct memdirtylimit_cgroup *dlcg;
+
+	rcu_read_lock();
+	dlcg = css_to_dlcg(task_to_css(t));
+	spin_lock(&dlcg->dlcg_lock);
+	prop_inc_single(&vm_cgroup_dirties, &dlcg->dlcg_dirties);
+	spin_unlock(&dlcg->dlcg_lock);
+	rcu_read_unlock();
+}
+
+void
+memdirtylimitcgroup_dirty_limit(struct task_struct *t, long *dirtyp)
+{
+	struct memdirtylimit_cgroup *dlcg;
+	unsigned long dirty = *dirtyp;
+	uint64_t tmp;
+	long numerator;
+	long denominator;
+
+	BUG_ON(*dirtyp < 0);
+
+	rcu_read_lock();
+	dlcg = css_to_dlcg(task_to_css(t));
+	getfraction(dlcg, &numerator, &denominator);
+	rcu_read_unlock();
+
+	tmp = (uint64_t)(dirty >> 1) * numerator;
+	do_div(tmp, denominator);
+	*dirtyp = dirty - (unsigned long)tmp;
+}
+
+void
+memdirtylimitcgroup_change_shift(int shift)
+{
+
+	prop_change_shift(&vm_cgroup_dirties, shift);
+}
+
+void
+memdirtylimitcgroup_init(int shift)
+{
+
+	prop_descriptor_init(&vm_cgroup_dirties, shift);
+}
+
+/* ---------------------------------------- */
+
+static u64
+memdirtylimit_cgroup_read_fraction(struct cgroup *cg, struct cftype *cft)
+{
+	struct memdirtylimit_cgroup *dlcg;
+	uint64_t result;
+	long numerator;
+	long denominator;
+
+	dlcg = cg_to_dlcg(cg);
+	getfraction(dlcg, &numerator, &denominator);
+	result = (uint64_t)100 * numerator;
+	do_div(result, denominator);
+	return result;
+}
+
+static const struct cftype files[] = {
+	{
+		.name = "fraction",
+		.read_u64 = memdirtylimit_cgroup_read_fraction,
+	},
+};
+
+static int
+memdirtylimit_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cg)
+{
+
+	return cgroup_add_files(cg, ss, files, ARRAY_SIZE(files));
+}
+
+static struct cgroup_subsys_state *
+memdirtylimit_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cg)
+{
+	struct memdirtylimit_cgroup *dlcg;
+	int error;
+
+	dlcg = kzalloc(sizeof(*dlcg), GFP_KERNEL);
+	if (dlcg == NULL)
+		return ERR_PTR(-ENOMEM);
+	error = prop_local_init_single(&dlcg->dlcg_dirties);
+	if (error != 0) {
+		kfree(dlcg);
+		return ERR_PTR(error);
+	}
+	spin_lock_init(&dlcg->dlcg_lock);
+	return &dlcg->dlcg_css;
+}
+
+static void
+memdirtylimit_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cg)
+{
+	struct memdirtylimit_cgroup *dlcg = cg_to_dlcg(cg);
+
+	prop_local_destroy_single(&dlcg->dlcg_dirties);
+	kfree(dlcg);
+}
+
+struct cgroup_subsys memdirtylimit_cgroup_subsys = {
+	.name = "memdirtylimit",
+	.subsys_id = memdirtylimit_cgroup_subsys_id,
+	.create = memdirtylimit_cgroup_create,
+	.destroy = memdirtylimit_cgroup_destroy,
+	.populate = memdirtylimit_cgroup_populate,
+};
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e6fa69e..f971532 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h>
 #include <linux/pagevec.h>
+#include <linux/memdirtylimitcgroup.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -152,6 +153,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
 		int shift = calc_period_shift();
 		prop_change_shift(&vm_completions, shift);
 		prop_change_shift(&vm_dirties, shift);
+		memdirtylimitcgroup_change_shift(shift);
 	}
 	return ret;
 }
@@ -393,6 +395,8 @@ get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
 	if (bdi) {
 		u64 bdi_dirty;
 		long numerator, denominator;
+		long task_dirty;
+		long cgroup_dirty;
 
 		/*
 		 * Calculate this BDI's share of the dirty ratio.
@@ -408,7 +412,11 @@ get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
 
 		*pbdi_dirty = bdi_dirty;
 		clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
-		task_dirty_limit(current, pbdi_dirty);
+		task_dirty = *pbdi_dirty;
+		task_dirty_limit(current, &task_dirty);
+		cgroup_dirty = *pbdi_dirty;
+		memdirtylimitcgroup_dirty_limit(current, &cgroup_dirty);
+		*pbdi_dirty = min(task_dirty, cgroup_dirty);
 	}
 }
 
@@ -842,6 +850,7 @@ void __init page_writeback_init(void)
 	shift = calc_period_shift();
 	prop_descriptor_init(&vm_completions, shift);
 	prop_descriptor_init(&vm_dirties, shift);
+	memdirtylimitcgroup_init(shift);
 }
 
 /**
@@ -1105,6 +1114,7 @@ int __set_page_dirty_nobuffers(struct page *page)
 	}
 
 	task_dirty_inc(current);
+	memdirtylimitcgroup_dirty_inc(current);
 
 	return 1;
 }


* Re: [PATCH][RFC] dirty balancing for cgroups
  2008-07-09  6:00 [PATCH][RFC] dirty balancing for cgroups YAMAMOTO Takashi
@ 2008-07-10 23:54 ` KAMEZAWA Hiroyuki
  2008-07-11  4:06   ` YAMAMOTO Takashi
  2008-07-14 13:37 ` Peter Zijlstra
  1 sibling, 1 reply; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-07-10 23:54 UTC (permalink / raw)
  To: YAMAMOTO Takashi; +Cc: linux-kernel, containers, a.p.zijlstra, menage

On Wed,  9 Jul 2008 15:00:34 +0900 (JST)
yamamoto@valinux.co.jp (YAMAMOTO Takashi) wrote:

> hi,
> 
> the following patch is a simple implementation of
> dirty balancing for cgroups.  any comments?
> 
> it depends on the following fix:
> 	http://lkml.org/lkml/2008/7/8/428
> 

A few comments  ;)

 - This looks simple, but could you merge this into the memory resource controller?
   (if it conflicts, I'll queue it on my stack.)
 - Do you have some numbers? Or how can we test that this works well?
 - Please CC linux-mm.

Thanks,
-Kame




* Re: [PATCH][RFC] dirty balancing for cgroups
  2008-07-10 23:54 ` KAMEZAWA Hiroyuki
@ 2008-07-11  4:06   ` YAMAMOTO Takashi
  2008-07-11  5:15     ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 19+ messages in thread
From: YAMAMOTO Takashi @ 2008-07-11  4:06 UTC (permalink / raw)
  To: kamezawa.hiroyu; +Cc: menage, containers, linux-kernel, a.p.zijlstra, linux-mm

hi,

> On Wed,  9 Jul 2008 15:00:34 +0900 (JST)
> yamamoto@valinux.co.jp (YAMAMOTO Takashi) wrote:
> 
> > hi,
> > 
> > the following patch is a simple implementation of
> > dirty balancing for cgroups.  any comments?
> > 
> > it depends on the following fix:
> > 	http://lkml.org/lkml/2008/7/8/428
> > 
> 
> A few comments  ;)

thanks for comments.

>  - This looks simple but, could you merge this into memory resource controller ?

why?

>    (if conflict, I'll queue on my stack.)
>  - Do you have some number ? or How we can test this works well ?

i have no numbers yet.
it should work as well as the one for tasks.  (ie. i don't know. :)

>  - please CC to linux-mm.

ok, i will.

YAMAMOTO Takashi



* Re: [PATCH][RFC] dirty balancing for cgroups
  2008-07-11  4:06   ` YAMAMOTO Takashi
@ 2008-07-11  5:15     ` KAMEZAWA Hiroyuki
  2008-07-11  5:59       ` YAMAMOTO Takashi
  0 siblings, 1 reply; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-07-11  5:15 UTC (permalink / raw)
  To: YAMAMOTO Takashi; +Cc: menage, containers, linux-kernel, a.p.zijlstra, linux-mm

On Fri, 11 Jul 2008 13:06:57 +0900 (JST)
yamamoto@valinux.co.jp (YAMAMOTO Takashi) wrote:

> hi,
> 
> > On Wed,  9 Jul 2008 15:00:34 +0900 (JST)
> > yamamoto@valinux.co.jp (YAMAMOTO Takashi) wrote:
> > 
> > > hi,
> > > 
> > > the following patch is a simple implementation of
> > > dirty balancing for cgroups.  any comments?
> > > 
> > > it depends on the following fix:
> > > 	http://lkml.org/lkml/2008/7/8/428
> > > 
> > 
> > A few comments  ;)
> 
> thanks for comments.
> 
> >  - This looks simple but, could you merge this into memory resource controller ?
> 
> why?
> 
3 points.
 1. Is this useful if used alone?
 2. memcg requires this kind of feature, basically.
 3. I wonder whether I need more work to make this work well under memcg.

If chasing page->cgroup and memcg would make this patch much more complex,
I think this style of implementation is one choice.

 About 3.
    Does this work well if I change get_dirty_limit()'s
    determine_dirtyable_memory() calculation under memcg?
    But doing that does not seem valid if the dirty_ratio cgroup and the
    memcg cgroup contain different sets of tasks.
 
Thanks,
-Kame



* Re: [PATCH][RFC] dirty balancing for cgroups
  2008-07-11  5:15     ` KAMEZAWA Hiroyuki
@ 2008-07-11  5:59       ` YAMAMOTO Takashi
  2008-07-11  7:13         ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 19+ messages in thread
From: YAMAMOTO Takashi @ 2008-07-11  5:59 UTC (permalink / raw)
  To: kamezawa.hiroyu; +Cc: a.p.zijlstra, linux-mm, menage, containers, linux-kernel

> > >  - This looks simple but, could you merge this into memory resource controller ?
> > 
> > why?
> > 
> 3 points.
>  1. Is this useful if used alone ?

it can be.  why not?

>  2. memcg requires this kind of feature, basically.
> 
>  3. I wonder I need more work to make this work well under memcg.

i'm not sure if i understand these points.  can you explain a bit?

my patch penalizes heavy-writer cgroups as task_dirty_limit does
for heavy-writer tasks.  i don't think that it's necessary to be
tied to the memory subsystem because i merely want to group writers.

otoh, if you want to limit the number (or percentage or whatever) of
dirty pages in a memory cgroup, it can't be done independently from
the memory subsystem, of course.  it's another story, tho.

YAMAMOTO Takashi

> 
> If chasing page->cgroup and memcg make this patch much more complex,
> I think this style of implimentation is a choice.
> 
>  About 3. 
>     Does this works well if I changes get_dirty_limit()'s
>     determine_dirtyable_memory() calculation under memcg ?
>     But to do this seems not valid if dirty_ratio cgroup and memcg cgroup 
>     containes different set of tasks.
>  
> Thanks,
> -Kame


* Re: [PATCH][RFC] dirty balancing for cgroups
  2008-07-11  5:59       ` YAMAMOTO Takashi
@ 2008-07-11  7:13         ` KAMEZAWA Hiroyuki
  2008-07-11  8:34           ` YAMAMOTO Takashi
                             ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-07-11  7:13 UTC (permalink / raw)
  To: YAMAMOTO Takashi; +Cc: a.p.zijlstra, linux-mm, menage, containers, linux-kernel

On Fri, 11 Jul 2008 14:59:26 +0900 (JST)
yamamoto@valinux.co.jp (YAMAMOTO Takashi) wrote:

> > > >  - This looks simple but, could you merge this into memory resource controller ?
> > > 
> > > why?
> > > 
> > 3 points.
> >  1. Is this useful if used alone ?
> 
> it can be.  why not?
> 
> >  2. memcg requires this kind of feature, basically.
> > 
> >  3. I wonder I need more work to make this work well under memcg.
> 
> i'm not sure if i understand these points.  can you explain a bit?
> 
In my understanding, dirty_ratio is for helping the memory (reclaim) subsystem.

See the comments in mm/page-writeback.c::determine_dirtyable_memory()
==
/*
 * Work out the current dirty-memory clamping and background writeout
 * thresholds.
 *
 * The main aim here is to lower them aggressively if there is a lot of mapped
 * memory around.  To avoid stressing page reclaim with lots of unreclaimable
 * pages.  It is better to clamp down on writers than to start swapping, and
 * performing lots of scanning.
 *
 * We only allow 1/2 of the currently-unmapped memory to be dirtied.
 *
 * We don't permit the clamping level to fall below 5% - that is getting rather
 * excessive.
 *
 * We make sure that the background writeout level is below the adjusted
 * clamping level.
==

"To avoid stressing page reclaim with lots of unreclaimable pages"

Then, I think memcg should support this for helping reclaim under memcg.

> my patch penalizes heavy-writer cgroups as task_dirty_limit does
> for heavy-writer tasks.  i don't think that it's necessary to be
> tied to the memory subsystem because i merely want to group writers.
> 
Hmm, maybe what I need is different from this ;)
It does not seem to help memory reclaim under memcg.


Thanks,
-Kame



* Re: [PATCH][RFC] dirty balancing for cgroups
  2008-07-11  7:13         ` KAMEZAWA Hiroyuki
@ 2008-07-11  8:34           ` YAMAMOTO Takashi
  2008-07-11  8:52             ` KAMEZAWA Hiroyuki
  2008-07-14 13:49           ` Peter Zijlstra
  2008-07-14 14:38           ` kamezawa.hiroyu
  2 siblings, 1 reply; 19+ messages in thread
From: YAMAMOTO Takashi @ 2008-07-11  8:34 UTC (permalink / raw)
  To: kamezawa.hiroyu; +Cc: linux-mm, menage, containers, a.p.zijlstra, linux-kernel

hi,

> > my patch penalizes heavy-writer cgroups as task_dirty_limit does
> > for heavy-writer tasks.  i don't think that it's necessary to be
> > tied to the memory subsystem because i merely want to group writers.
> > 
> Hmm, maybe what I need is different from this ;)
> Does not seem to be a help for memory reclaim under memcg.

to implement what you need, i think that we need to keep track of
the number of dirty pages in each memory cgroup as a first step.
do you agree?

YAMAMOTO Takashi


* Re: [PATCH][RFC] dirty balancing for cgroups
  2008-07-11  8:34           ` YAMAMOTO Takashi
@ 2008-07-11  8:52             ` KAMEZAWA Hiroyuki
  2008-08-06  8:20               ` YAMAMOTO Takashi
  0 siblings, 1 reply; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-07-11  8:52 UTC (permalink / raw)
  To: YAMAMOTO Takashi; +Cc: linux-mm, menage, containers, a.p.zijlstra, linux-kernel

On Fri, 11 Jul 2008 17:34:46 +0900 (JST)
yamamoto@valinux.co.jp (YAMAMOTO Takashi) wrote:

> hi,
> 
> > > my patch penalizes heavy-writer cgroups as task_dirty_limit does
> > > for heavy-writer tasks.  i don't think that it's necessary to be
> > > tied to the memory subsystem because i merely want to group writers.
> > > 
> > Hmm, maybe what I need is different from this ;)
> > Does not seem to be a help for memory reclaim under memcg.
> 
> to implement what you need, i think that we need to keep track of
> the numbers of dirty-pages in each memory cgroups as a first step.
> do you agree?
> 
yes, I think so, now.

maybe not difficult, but it will add extra overhead ;( Sigh..



Thanks,
-Kame



* Re: [PATCH][RFC] dirty balancing for cgroups
  2008-07-09  6:00 [PATCH][RFC] dirty balancing for cgroups YAMAMOTO Takashi
  2008-07-10 23:54 ` KAMEZAWA Hiroyuki
@ 2008-07-14 13:37 ` Peter Zijlstra
  1 sibling, 0 replies; 19+ messages in thread
From: Peter Zijlstra @ 2008-07-14 13:37 UTC (permalink / raw)
  To: YAMAMOTO Takashi
  Cc: linux-kernel, containers, menage, kamezawa.hiroyu, linux-mm

On Wed, 2008-07-09 at 15:00 +0900, YAMAMOTO Takashi wrote:
> hi,
> 
> the following patch is a simple implementation of
> dirty balancing for cgroups.  any comments?
> 
> it depends on the following fix:
> 	http://lkml.org/lkml/2008/7/8/428
> 
> YAMAMOTO Takashi
> 
> 
> Signed-off-by: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
> ---

Yamamoto-san,

> @@ -408,7 +412,11 @@ get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
>  
>  		*pbdi_dirty = bdi_dirty;
>  		clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
> -		task_dirty_limit(current, pbdi_dirty);
> +		task_dirty = *pbdi_dirty;
> +		task_dirty_limit(current, &task_dirty);
> +		cgroup_dirty = *pbdi_dirty;
> +		memdirtylimitcgroup_dirty_limit(current, &cgroup_dirty);
> +		*pbdi_dirty = min(task_dirty, cgroup_dirty);
>  	}
>  }

I think this is wrong - it basically breaks task dirty throttling within
groups. You'd need a multiplicative operation, something like:

  bdi_dirty = dirty * p(bdi) * p(cgroup) * (1 - p(task))
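
As a rough sketch (illustration only, not code from the patch or from the
kernel; the fraction arguments are assumed to come from helpers like the
prop_fraction_single() calls in the patch), the multiplicative combination
could look like this:

	/*
	 * Sketch: bdi_dirty = dirty * p(bdi) * p(cgroup) * (1 - p(task)).
	 * bdi_dirty is assumed to be already scaled by p(bdi); the cgroup
	 * and task fractions are numerator/denominator pairs.  A real
	 * implementation would probably also keep a floor so the limit
	 * never collapses to zero.
	 */
	static long combined_dirty_limit(long bdi_dirty,
					 long cg_num, long cg_den,
					 long task_num, long task_den)
	{
		u64 limit = bdi_dirty;
		u64 tmp;

		limit *= cg_num;		/* ... * p(cgroup) */
		do_div(limit, cg_den);

		tmp = limit * task_num;		/* limit * p(task) */
		do_div(tmp, task_den);

		return (long)(limit - tmp);	/* ... * (1 - p(task)) */
	}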

However then we still have problems... see the next email further down
the thread.



* Re: [PATCH][RFC] dirty balancing for cgroups
  2008-07-11  7:13         ` KAMEZAWA Hiroyuki
  2008-07-11  8:34           ` YAMAMOTO Takashi
@ 2008-07-14 13:49           ` Peter Zijlstra
  2008-07-17  1:43             ` YAMAMOTO Takashi
  2008-08-14  8:38             ` Paul Menage
  2008-07-14 14:38           ` kamezawa.hiroyu
  2 siblings, 2 replies; 19+ messages in thread
From: Peter Zijlstra @ 2008-07-14 13:49 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: YAMAMOTO Takashi, linux-mm, menage, containers, linux-kernel

On Fri, 2008-07-11 at 16:13 +0900, KAMEZAWA Hiroyuki wrote:
> On Fri, 11 Jul 2008 14:59:26 +0900 (JST)
> yamamoto@valinux.co.jp (YAMAMOTO Takashi) wrote:
> 
> > > > >  - This looks simple but, could you merge this into memory resource controller ?
> > > > 
> > > > why?
> > > > 
> > > 3 points.
> > >  1. Is this useful if used alone ?
> > 
> > it can be.  why not?
> > 
> > >  2. memcg requires this kind of feature, basically.
> > > 
> > >  3. I wonder I need more work to make this work well under memcg.
> > 
> > i'm not sure if i understand these points.  can you explain a bit?
> > 
> In my understanding, dirty_ratio is for helping memory (reclaim) subsystem.
> 
> See comments in fs/page-writeback.c:: determin_dirtyable_memory()
> ==
> /*
>  * Work out the current dirty-memory clamping and background writeout
>  * thresholds.
>  *
>  * The main aim here is to lower them aggressively if there is a lot of mapped
>  * memory around.  To avoid stressing page reclaim with lots of unreclaimable
>  * pages.  It is better to clamp down on writers than to start swapping, and
>  * performing lots of scanning.
>  *
>  * We only allow 1/2 of the currently-unmapped memory to be dirtied.
>  *
>  * We don't permit the clamping level to fall below 5% - that is getting rather
>  * excessive.
>  *
>  * We make sure that the background writeout level is below the adjusted
>  * clamping level.
> ==
> 
> "To avoid stressing page reclaim with lots of unreclaimable pages"
> 
> Then, I think memcg should support this for helping relcaim under memcg.

That comment is unclear at best.

The dirty page limit avoids deadlocks under certain situations; the per-BDI
dirty limit avoids even more deadlocks by providing isolation between BDIs.


The fundamental deadlock solved by the dirty page limit is the typical
reclaim deadlock - needing memory to free memory. It does this by
ensuring only some part of the total memory used for the page-cache can
be dirty, thus we always have clean pages around that can be reclaimed
so we can launder the dirty pages.

This on its own generates a new deadlock for stacked devices: imagine
device A on top of B. When A generates loads of dirty pages it will
eventually hit the dirty limit and we'd start to launder them. However,
in order to launder A's dirty pages we'd need to dirty pages for B, but
we can't since we're at the global limit.

This problem is solved by introducing a per-BDI dirty limit; by
assigning each BDI an individual dirty limit (whose sum is the total
dirty limit) we avoid that deadlock. Take the previous example: A would
start laundering its pages when it hits its own limit, and B's operation
isn't hampered by that.

[ even when B's limit is 0 we're able to make progress, since we'll only
  wait for B's dirty page count to decrease - effectively reducing to
  sync writes. ]

Of course this raises the question of how to assign the various dirty
limits - any fixed distribution is hard to maintain and suboptimal for
most workloads.

We solve this by assigning each BDI a fraction proportional to its
current launder speed. That is to say, if A launders pages twice as fast
as B does, then A will get 2/3-rd of the total dirty page limit, versus
1/3-rd for B.
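
In code terms the split is simply proportional (a sketch for illustration,
not the actual get_dirty_limits() code; the fraction is assumed to come
from the vm_completions floating proportions):

	/*
	 * Sketch: a BDI laundering twice as fast as the other has a
	 * completion fraction of 2/3, and therefore gets 2/3 of the
	 * global dirty threshold.
	 */
	static unsigned long bdi_dirty_share(unsigned long dirty_thresh,
					     long numerator, long denominator)
	{
		u64 share = (u64)dirty_thresh * numerator;

		do_div(share, denominator);
		return (unsigned long)share;
	}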


Then there is the task dirty stuff - this is basically a 'fix' for the
problem where a slow writer gets starved by a fast writer. Imagine two
tasks competing for bandwidth: 1 the fast writer and 2 the slow writer.

1 will dirty loads of pages but all things being equal 2 will have to
wait for 1's dirty pages.

So what we do is lower the dirty limit for fast writers - so these get
to wait sooner and slow writers have a little room to make progress
before they too have to wait.
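
Using the same arithmetic style as the patch above (a sketch only; the
exact scale factor used by the kernel's task_dirty_limit() differs in
detail), lowering the limit for a heavy dirtier looks roughly like:

	/*
	 * Sketch: shave off a slice of the limit proportional to the
	 * task's share of recent dirtying (numerator/denominator), so
	 * fast writers reach their limit sooner; at most half of the
	 * limit is taken away.
	 */
	static long lowered_task_limit(long dirty, long numerator, long denominator)
	{
		u64 cut = (u64)(dirty >> 1) * numerator;

		do_div(cut, denominator);
		return dirty - (long)cut;
	}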

To properly solve this we'd need to track p_{bdi,task}. However that's
intractable. Therefore we approximate that with p_bdi * p_task. This
approximation loses detail.

Imagine two tasks: 1 and 2, and two BDIs A and B (independent this
time). If 1 is a (fast) writer to A and 2 is a (slow) writer to B, we
need not throttle 2 sooner as there is no actual competition.

The full proportional tensor p_{bdi,task} can express this, but the
simple approximation p_bdi * p_task can not.

The approximation will reduce 1's bandwidth a little even though there
is no actual competition.


Now the problem this patch tries to address...

As you can see you'd need p_{bdi,cgroup,task} for it to work, and the
obvious approximation p_bdi * p_cgroup * p_task will get even more
coarse.

You could possibly attempt to do p_{bdi,cgroup} * p_task since the bdi
and cgroup set are pretty static, but still that would be painful.

So, could you please give some more justification for this work? I'm not
seeing the value in complicating all this just yet.


Thanks for reading this far,

Peter



* Re: Re: [PATCH][RFC] dirty balancing for cgroups
  2008-07-11  7:13         ` KAMEZAWA Hiroyuki
  2008-07-11  8:34           ` YAMAMOTO Takashi
  2008-07-14 13:49           ` Peter Zijlstra
@ 2008-07-14 14:38           ` kamezawa.hiroyu
  2 siblings, 0 replies; 19+ messages in thread
From: kamezawa.hiroyu @ 2008-07-14 14:38 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: KAMEZAWA Hiroyuki, YAMAMOTO Takashi, linux-mm, menage,
	containers, linux-kernel

----- Original Message -----
> [...]
Thank you for the explanation. Maybe I misunderstood something.

What I thought was to keep some amount of clean pages under memcg
for smooth reclaiming. But after reading your explanation, I'm now
reconsidering...

about DEADLOCK:

memory cgroup's reclaim path currently works as follows:
==
mem_cgroup_charge() or others.
  -> try_to_free_mem_cgroup_pages()
      -> shrink_zone()...
          -> subsystem may call alloc_pages()
               -> ....(*)

Unlike the global LRU, memory_cgroup does not affect (*), because
add_to_page_cache() is called here, so maybe there is no deadlock.

about Memory Reclaim's health:

It doesn't seem good to allow users to make all pages under memcg
dirty without any penalty (throttling).

Dirty/writeback pages are not reclaimed by the first LRU scan, because
they have to be written out (and rotated to the tail of the LRU), while
clean pages, which are easy to drop, are dropped soon.

Under the following situations
 - large file copy under memcg.
 - big coredump under memcg.

I think too many (clean) pages are swapped out by memcg's LRU because
of tons of dirty pages. (but I don't have enough data now; just my concern.)

What can I do against this? Could you give me a hint?

Thanks,
-Kame



* Re: [PATCH][RFC] dirty balancing for cgroups
  2008-07-14 13:49           ` Peter Zijlstra
@ 2008-07-17  1:43             ` YAMAMOTO Takashi
  2008-08-14  8:38             ` Paul Menage
  1 sibling, 0 replies; 19+ messages in thread
From: YAMAMOTO Takashi @ 2008-07-17  1:43 UTC (permalink / raw)
  To: a.p.zijlstra; +Cc: kamezawa.hiroyu, linux-mm, menage, containers, linux-kernel

hi,

> Now the problem this patch tries to address...
> 
> As you can see you'd need p_{bdi,cgroup,task} for it to work, and the
> obvious approximation p_bdi * p_cgroup * p_task will get even more
> coarse.
> 
> You could possibly attempt to do p_{bdi,cgroup} * p_task since the bdi
> and cgroup set are pretty static, but still that would be painful.

i chose min(p_bdi * p_cgroup, p_bdi * p_task) because i couldn't imagine
a case where p_bdi * p_cgroup * p_task is better.

> So, could you please give some more justification for this work, I'm not
> seeing the value in complicating all this just yet.

a simple example for which my patch can make some sense is:

	while :;do dd if=/dev/zero of=file conv=notrunc bs=4096 count=1;done

YAMAMOTO Takashi


* Re: [PATCH][RFC] dirty balancing for cgroups
  2008-07-11  8:52             ` KAMEZAWA Hiroyuki
@ 2008-08-06  8:20               ` YAMAMOTO Takashi
  2008-08-06  8:53                 ` KAMEZAWA Hiroyuki
  2008-08-07 13:36                 ` Peter Zijlstra
  0 siblings, 2 replies; 19+ messages in thread
From: YAMAMOTO Takashi @ 2008-08-06  8:20 UTC (permalink / raw)
  To: kamezawa.hiroyu; +Cc: linux-mm, menage, containers, linux-kernel, a.p.zijlstra

hi,

> On Fri, 11 Jul 2008 17:34:46 +0900 (JST)
> yamamoto@valinux.co.jp (YAMAMOTO Takashi) wrote:
> 
> > hi,
> > 
> > > > my patch penalizes heavy-writer cgroups as task_dirty_limit does
> > > > for heavy-writer tasks.  i don't think that it's necessary to be
> > > > tied to the memory subsystem because i merely want to group writers.
> > > > 
> > > Hmm, maybe what I need is different from this ;)
> > > Does not seem to be a help for memory reclaim under memcg.
> > 
> > to implement what you need, i think that we need to keep track of
> > the numbers of dirty-pages in each memory cgroups as a first step.
> > do you agree?
> > 
> yes, I think so, now.
> 
> may be not difficult but will add extra overhead ;( Sigh..

the following is a patch to add the overhead. :)
any comments?

YAMAMOTO Takashi


Signed-off-by: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
---

diff --git a/fs/buffer.c b/fs/buffer.c
index 9749a90..a2dc642 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,6 +41,7 @@
 #include <linux/bitops.h>
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
+#include <linux/memcontrol.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
@@ -708,12 +709,19 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
 static int __set_page_dirty(struct page *page,
 		struct address_space *mapping, int warn)
 {
-	if (unlikely(!mapping))
-		return !TestSetPageDirty(page);
+	if (unlikely(!mapping)) {
+		if (TestSetPageDirty(page)) {
+			return 0;
+		} else {
+			mem_cgroup_set_page_dirty(page);
+			return 1;
+		}
+	}
 
 	if (TestSetPageDirty(page))
 		return 0;
 
+	mem_cgroup_set_page_dirty(page);
 	spin_lock_irq(&mapping->tree_lock);
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
@@ -762,8 +770,14 @@ int __set_page_dirty_buffers(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
 
-	if (unlikely(!mapping))
-		return !TestSetPageDirty(page);
+	if (unlikely(!mapping)) {
+		if (TestSetPageDirty(page)) {
+			return 0;
+		} else {
+			mem_cgroup_set_page_dirty(page);
+			return 1;
+		}
+	}
 
 	spin_lock(&mapping->private_lock);
 	if (page_has_buffers(page)) {
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ee1b2fc..f04441f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -57,6 +57,9 @@ extern int
 mem_cgroup_prepare_migration(struct page *page, struct page *newpage);
 extern void mem_cgroup_end_migration(struct page *page);
 
+extern void mem_cgroup_set_page_dirty(struct page *page);
+extern void mem_cgroup_clear_page_dirty(struct page *page);
+
 /*
  * For memory reclaim.
  */
@@ -132,6 +135,14 @@ static inline void mem_cgroup_end_migration(struct page *page)
 {
 }
 
+static inline void mem_cgroup_set_page_dirty(struct page *page)
+{
+}
+
+static inline void mem_cgroup_clear_page_dirty(struct page *page)
+{
+}
+
 static inline int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
 {
 	return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 344a477..33d14b7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -49,6 +49,7 @@ enum mem_cgroup_stat_index {
 	 */
 	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
 	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as rss */
+	MEM_CGROUP_STAT_DIRTY,     /* # of dirty pages */
 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
 
@@ -73,6 +74,14 @@ static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat,
 	stat->cpustat[cpu].count[idx] += val;
 }
 
+static void __mem_cgroup_stat_add(struct mem_cgroup_stat *stat,
+		enum mem_cgroup_stat_index idx, int val)
+{
+	int cpu = get_cpu();
+	stat->cpustat[cpu].count[idx] += val;
+	put_cpu();
+}
+
 static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
 		enum mem_cgroup_stat_index idx)
 {
@@ -164,6 +173,7 @@ struct page_cgroup {
 #define PAGE_CGROUP_FLAG_ACTIVE    (0x2)	/* page is active in this cgroup */
 #define PAGE_CGROUP_FLAG_FILE	   (0x4)	/* page is file system backed */
 #define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8)	/* page is unevictableable */
+#define PAGE_CGROUP_FLAG_DIRTY     (0x10)	/* accounted as dirty */
 
 static int page_cgroup_nid(struct page_cgroup *pc)
 {
@@ -196,6 +206,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
 	else
 		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
 
+	if (flags & PAGE_CGROUP_FLAG_DIRTY)
+		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_DIRTY, val);
+
 	if (charge)
 		__mem_cgroup_stat_add_safe(stat,
 				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
@@ -284,6 +297,7 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 {
 	int lru = LRU_BASE;
 
+	VM_BUG_ON(!page_cgroup_locked(pc->page));
 	if (pc->flags & PAGE_CGROUP_FLAG_UNEVICTABLE)
 		lru = LRU_UNEVICTABLE;
 	else {
@@ -304,6 +318,7 @@ static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
 {
 	int lru = LRU_BASE;
 
+	VM_BUG_ON(!page_cgroup_locked(pc->page));
 	if (pc->flags & PAGE_CGROUP_FLAG_UNEVICTABLE)
 		lru = LRU_UNEVICTABLE;
 	else {
@@ -328,6 +343,8 @@ static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
 	enum lru_list from = unevictable ? LRU_UNEVICTABLE :
 				(LRU_FILE * !!file + !!active);
 
+	VM_BUG_ON(!page_cgroup_locked(pc->page));
+
 	if (lru == from)
 		return;
 
@@ -485,7 +502,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 		if (PageUnevictable(page) ||
 		    (PageActive(page) && !active) ||
 		    (!PageActive(page) && active)) {
-			__mem_cgroup_move_lists(pc, page_lru(page));
+			if (try_lock_page_cgroup(page)) {
+				__mem_cgroup_move_lists(pc, page_lru(page));
+				unlock_page_cgroup(page);
+			}
 			continue;
 		}
 
@@ -772,6 +792,38 @@ void mem_cgroup_end_migration(struct page *newpage)
 		mem_cgroup_uncharge_page(newpage);
 }
 
+void mem_cgroup_set_page_dirty(struct page *pg)
+{
+	struct page_cgroup *pc;
+
+	lock_page_cgroup(pg);
+	pc = page_get_page_cgroup(pg);
+	if (pc != NULL && (pc->flags & PAGE_CGROUP_FLAG_DIRTY) == 0) {
+		struct mem_cgroup *mem = pc->mem_cgroup;
+		struct mem_cgroup_stat *stat = &mem->stat;
+
+		pc->flags |= PAGE_CGROUP_FLAG_DIRTY;
+		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_DIRTY, 1);
+	}
+	unlock_page_cgroup(pg);
+}
+
+void mem_cgroup_clear_page_dirty(struct page *pg)
+{
+	struct page_cgroup *pc;
+
+	lock_page_cgroup(pg);
+	pc = page_get_page_cgroup(pg);
+	if (pc != NULL && (pc->flags & PAGE_CGROUP_FLAG_DIRTY) != 0) {
+		struct mem_cgroup *mem = pc->mem_cgroup;
+		struct mem_cgroup_stat *stat = &mem->stat;
+
+		pc->flags &= ~PAGE_CGROUP_FLAG_DIRTY;
+		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_DIRTY, -1);
+	}
+	unlock_page_cgroup(pg);
+}
+
 /*
  * A call to try to shrink memory usage under specified resource controller.
  * This is typically used for page reclaiming for shmem for reducing side
@@ -957,6 +1009,7 @@ static const struct mem_cgroup_stat_desc {
 } mem_cgroup_stat_desc[] = {
 	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
 	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
+	[MEM_CGROUP_STAT_DIRTY] = { "dirty", 1, },
 	[MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
 	[MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
 };
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c6d6088..14dc9af 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h>
 #include <linux/pagevec.h>
+#include <linux/memcontrol.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -1081,6 +1082,8 @@ int __set_page_dirty_nobuffers(struct page *page)
 		struct address_space *mapping = page_mapping(page);
 		struct address_space *mapping2;
 
+		mem_cgroup_set_page_dirty(page);
+
 		if (!mapping)
 			return 1;
 
@@ -1138,8 +1141,10 @@ static int __set_page_dirty(struct page *page)
 		return (*spd)(page);
 	}
 	if (!PageDirty(page)) {
-		if (!TestSetPageDirty(page))
+		if (!TestSetPageDirty(page)) {
+			mem_cgroup_set_page_dirty(page);
 			return 1;
+		}
 	}
 	return 0;
 }
@@ -1234,6 +1239,7 @@ int clear_page_dirty_for_io(struct page *page)
 		 * for more comments.
 		 */
 		if (TestClearPageDirty(page)) {
+			mem_cgroup_clear_page_dirty(page);
 			dec_zone_page_state(page, NR_FILE_DIRTY);
 			dec_bdi_stat(mapping->backing_dev_info,
 					BDI_RECLAIMABLE);
@@ -1241,7 +1247,11 @@ int clear_page_dirty_for_io(struct page *page)
 		}
 		return 0;
 	}
-	return TestClearPageDirty(page);
+	if (TestClearPageDirty(page)) {
+		mem_cgroup_clear_page_dirty(page);
+		return 1;
+	}
+	return 0;
 }
 EXPORT_SYMBOL(clear_page_dirty_for_io);
 
diff --git a/mm/truncate.c b/mm/truncate.c
index 4d129a5..9b1e215 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -18,6 +18,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/buffer_head.h>	/* grr. try_to_release_page,
 				   do_invalidatepage */
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 
@@ -72,6 +73,8 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
 {
 	if (TestClearPageDirty(page)) {
 		struct address_space *mapping = page->mapping;
+
+		mem_cgroup_clear_page_dirty(page);
 		if (mapping && mapping_cap_account_dirty(mapping)) {
 			dec_zone_page_state(page, NR_FILE_DIRTY);
 			dec_bdi_stat(mapping->backing_dev_info,

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH][RFC] dirty balancing for cgroups
  2008-08-06  8:20               ` YAMAMOTO Takashi
@ 2008-08-06  8:53                 ` KAMEZAWA Hiroyuki
  2008-08-06  9:10                   ` YAMAMOTO Takashi
  2008-08-07 13:36                 ` Peter Zijlstra
  1 sibling, 1 reply; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-08-06  8:53 UTC (permalink / raw)
  To: YAMAMOTO Takashi; +Cc: linux-mm, menage, containers, linux-kernel, a.p.zijlstra

On Wed,  6 Aug 2008 17:20:46 +0900 (JST)
yamamoto@valinux.co.jp (YAMAMOTO Takashi) wrote:

> hi,
> 
> > On Fri, 11 Jul 2008 17:34:46 +0900 (JST)
> > yamamoto@valinux.co.jp (YAMAMOTO Takashi) wrote:
> > 
> > > hi,
> > > 
> > > > > my patch penalizes heavy-writer cgroups as task_dirty_limit does
> > > > > for heavy-writer tasks.  i don't think that it's necessary to be
> > > > > tied to the memory subsystem because i merely want to group writers.
> > > > > 
> > > > Hmm, maybe what I need is different from this ;)
> > > > Does not seem to be a help for memory reclaim under memcg.
> > > 
> > > to implement what you need, i think that we need to keep track of
> > > the numbers of dirty-pages in each memory cgroups as a first step.
> > > do you agree?
> > > 
> > yes, I think so, now.
> > 
> > may be not difficult but will add extra overhead ;( Sigh..
> 
> the following is a patch to add the overhead. :)
> any comments?
> 
Do you have some numbers ? ;) 
I like this because this seems very straightforward. thank you.


> @@ -485,7 +502,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
>  		if (PageUnevictable(page) ||
>  		    (PageActive(page) && !active) ||
>  		    (!PageActive(page) && active)) {
> -			__mem_cgroup_move_lists(pc, page_lru(page));
> +			if (try_lock_page_cgroup(page)) {
> +				__mem_cgroup_move_lists(pc, page_lru(page));
> +				unlock_page_cgroup(page);
> +			}
>  			continue;
>  		}
>  

Hmm..ok, there will be a new race between the dirty bit and the LRU bits.


> @@ -772,6 +792,38 @@ void mem_cgroup_end_migration(struct page *newpage)
>  		mem_cgroup_uncharge_page(newpage);
>  }
>  
> +void mem_cgroup_set_page_dirty(struct page *pg)
> +{
> +	struct page_cgroup *pc;
> +
> +	lock_page_cgroup(pg);
> +	pc = page_get_page_cgroup(pg);
> +	if (pc != NULL && (pc->flags & PAGE_CGROUP_FLAG_DIRTY) == 0) {
> +		struct mem_cgroup *mem = pc->mem_cgroup;
> +		struct mem_cgroup_stat *stat = &mem->stat;
> +
> +		pc->flags |= PAGE_CGROUP_FLAG_DIRTY;
> +		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_DIRTY, 1);
> +	}
> +	unlock_page_cgroup(pg);
> +}
> +
> +void mem_cgroup_clear_page_dirty(struct page *pg)
> +{
> +	struct page_cgroup *pc;
> +
> +	lock_page_cgroup(pg);
> +	pc = page_get_page_cgroup(pg);
> +	if (pc != NULL && (pc->flags & PAGE_CGROUP_FLAG_DIRTY) != 0) {
> +		struct mem_cgroup *mem = pc->mem_cgroup;
> +		struct mem_cgroup_stat *stat = &mem->stat;
> +
> +		pc->flags &= ~PAGE_CGROUP_FLAG_DIRTY;
> +		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_DIRTY, -1);
> +	}
> +	unlock_page_cgroup(pg);
> +}
> +

How about changing these to be

==
void mem_cgroup_test_set_page_dirty()
{
	if (try_lock_page_cgroup(pg)) {
		pc = page_get_page_cgroup(pg);
		if (pc ......) {
		}
		unlock_page_cgroup(pg)
	}
}
==
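
For reference, a minimal sketch of how that trylock variant could look,
reusing the helpers and the statistics update from the patch above
(a rough sketch only, not a tested change):

==
void mem_cgroup_test_set_page_dirty(struct page *pg)
{
	struct page_cgroup *pc;

	/* best effort: if the lock is contended, skip this update */
	if (!try_lock_page_cgroup(pg))
		return;
	pc = page_get_page_cgroup(pg);
	if (pc != NULL && (pc->flags & PAGE_CGROUP_FLAG_DIRTY) == 0) {
		pc->flags |= PAGE_CGROUP_FLAG_DIRTY;
		__mem_cgroup_stat_add(&pc->mem_cgroup->stat,
				      MEM_CGROUP_STAT_DIRTY, 1);
	}
	unlock_page_cgroup(pg);
}
==

The trade-off is that a contended lock now means a lost statistics
update; how large that error can grow is discussed below.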


Off-topic: I wonder whether we can delete this "lock" in the future.

Because page->page_cgroup is:
 1. attached at first use (obviously no race with set_dirty), and
 2. deleted at removal (force_empty is problematic here..).

But, now, we need this lock.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH][RFC] dirty balancing for cgroups
  2008-08-06  8:53                 ` KAMEZAWA Hiroyuki
@ 2008-08-06  9:10                   ` YAMAMOTO Takashi
  0 siblings, 0 replies; 19+ messages in thread
From: YAMAMOTO Takashi @ 2008-08-06  9:10 UTC (permalink / raw)
  To: kamezawa.hiroyu; +Cc: linux-mm, menage, containers, linux-kernel, a.p.zijlstra

hi,

> On Wed,  6 Aug 2008 17:20:46 +0900 (JST)
> yamamoto@valinux.co.jp (YAMAMOTO Takashi) wrote:
> 
> > hi,
> > 
> > > On Fri, 11 Jul 2008 17:34:46 +0900 (JST)
> > > yamamoto@valinux.co.jp (YAMAMOTO Takashi) wrote:
> > > 
> > > > hi,
> > > > 
> > > > > > my patch penalizes heavy-writer cgroups as task_dirty_limit does
> > > > > > for heavy-writer tasks.  i don't think that it's necessary to be
> > > > > > tied to the memory subsystem because i merely want to group writers.
> > > > > > 
> > > > > Hmm, maybe what I need is different from this ;)
> > > > > Does not seem to be a help for memory reclaim under memcg.
> > > > 
> > > > to implement what you need, i think that we need to keep track of
> > > > the numbers of dirty-pages in each memory cgroups as a first step.
> > > > do you agree?
> > > > 
> > > yes, I think so, now.
> > > 
> > > may be not difficult but will add extra overhead ;( Sigh..
> > 
> > the following is a patch to add the overhead. :)
> > any comments?
> > 
> Do you have some numbers ? ;) 

not yet.

> I like this because this seems very straightforward. thank you.

good to hear.

> How about changing these to be
> 
> ==
> void mem_cgroup_test_set_page_dirty()
> {
> 	if (try_lock_page_cgroup(pg)) {
> 		pc = page_get_page_cgroup(pg);
> 		if (pc ......) {
> 		}
> 		unlock_page_cgroup(pg)
> 	}
> }
> ==

i'm not sure how many opportunities to update the statistics
we would lose due to trylock failures.
although the statistics don't need to be very precise,
their error should have a reasonable upper limit to be useful.

> Off-topic: I wonder whether we can delete this "lock" in the future.
> 
> Because page->page_cgroup is:
>  1. attached at first use (obviously no race with set_dirty), and
>  2. deleted at removal (force_empty is problematic here..).

i hope it's possible. :)

YAMAMOTO Takashi

> 
> But, now, we need this lock.
> 
> Thanks,
> -Kame

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH][RFC] dirty balancing for cgroups
  2008-08-06  8:20               ` YAMAMOTO Takashi
  2008-08-06  8:53                 ` KAMEZAWA Hiroyuki
@ 2008-08-07 13:36                 ` Peter Zijlstra
  2008-08-13  7:15                   ` YAMAMOTO Takashi
  1 sibling, 1 reply; 19+ messages in thread
From: Peter Zijlstra @ 2008-08-07 13:36 UTC (permalink / raw)
  To: YAMAMOTO Takashi
  Cc: kamezawa.hiroyu, linux-mm, menage, containers, linux-kernel

On Wed, 2008-08-06 at 17:20 +0900, YAMAMOTO Takashi wrote:
> hi,
> 
> > On Fri, 11 Jul 2008 17:34:46 +0900 (JST)
> > yamamoto@valinux.co.jp (YAMAMOTO Takashi) wrote:
> > 
> > > hi,
> > > 
> > > > > my patch penalizes heavy-writer cgroups as task_dirty_limit does
> > > > > for heavy-writer tasks.  i don't think that it's necessary to be
> > > > > tied to the memory subsystem because i merely want to group writers.
> > > > > 
> > > > Hmm, maybe what I need is different from this ;)
> > > > Does not seem to be a help for memory reclaim under memcg.
> > > 
> > > to implement what you need, i think that we need to keep track of
> > > the numbers of dirty-pages in each memory cgroups as a first step.
> > > do you agree?
> > > 
> > yes, I think so, now.
> > 
> > may be not difficult but will add extra overhead ;( Sigh..
> 
> the following is a patch to add the overhead. :)
> any comments?
> 
> YAMAMOTO Takashi

It _might_ (depending on the ugliness of the result) make sense to try and
stick the mem_cgroup_*_page_dirty() stuff into the *PageDirty() macros.
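
A rough illustration of that direction (the *Accounted wrapper names below
are made up for this example; the mem_cgroup_*_page_dirty() helpers are the
ones from the patch):

==
static inline int TestSetPageDirtyAccounted(struct page *page)
{
	/* returns the old dirty state, like TestSetPageDirty() */
	if (test_and_set_bit(PG_dirty, &page->flags))
		return 1;	/* was already dirty, nothing to account */
	mem_cgroup_set_page_dirty(page);
	return 0;
}

static inline int TestClearPageDirtyAccounted(struct page *page)
{
	if (!test_and_clear_bit(PG_dirty, &page->flags))
		return 0;	/* was already clean */
	mem_cgroup_clear_page_dirty(page);
	return 1;
}
==

That would let the call sites in fs/buffer.c and mm/page-writeback.c stay
one-liners instead of open-coding the accounting after every
TestSetPageDirty()/TestClearPageDirty().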


> @@ -485,7 +502,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
>  		if (PageUnevictable(page) ||
>  		    (PageActive(page) && !active) ||
>  		    (!PageActive(page) && active)) {
> -			__mem_cgroup_move_lists(pc, page_lru(page));
> +			if (try_lock_page_cgroup(page)) {
> +				__mem_cgroup_move_lists(pc, page_lru(page));
> +				unlock_page_cgroup(page);
> +			}
>  			continue;
>  		}

This chunk seems unrelated and lost....


> @@ -772,6 +792,38 @@ void mem_cgroup_end_migration(struct page *newpage)
>  		mem_cgroup_uncharge_page(newpage);
>  }
>  
> +void mem_cgroup_set_page_dirty(struct page *pg)
> +{
> +	struct page_cgroup *pc;
> +
> +	lock_page_cgroup(pg);
> +	pc = page_get_page_cgroup(pg);
> +	if (pc != NULL && (pc->flags & PAGE_CGROUP_FLAG_DIRTY) == 0) {
> +		struct mem_cgroup *mem = pc->mem_cgroup;
> +		struct mem_cgroup_stat *stat = &mem->stat;
> +
> +		pc->flags |= PAGE_CGROUP_FLAG_DIRTY;
> +		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_DIRTY, 1);
> +	}
> +	unlock_page_cgroup(pg);
> +}
> +
> +void mem_cgroup_clear_page_dirty(struct page *pg)
> +{
> +	struct page_cgroup *pc;
> +
> +	lock_page_cgroup(pg);
> +	pc = page_get_page_cgroup(pg);
> +	if (pc != NULL && (pc->flags & PAGE_CGROUP_FLAG_DIRTY) != 0) {
> +		struct mem_cgroup *mem = pc->mem_cgroup;
> +		struct mem_cgroup_stat *stat = &mem->stat;
> +
> +		pc->flags &= ~PAGE_CGROUP_FLAG_DIRTY;
> +		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_DIRTY, -1);
> +	}
> +	unlock_page_cgroup(pg);
> +}
> +
>  /*
>   * A call to try to shrink memory usage under specified resource controller.
>   * This is typically used for page reclaiming for shmem for reducing side


I personally dislike the != 0 and == 0 comparisons for bitmask operations;
they seem to make it harder to read somehow. I prefer to write !(flags
& mask) and (flags & mask) instead.
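
For instance, the dirty check in mem_cgroup_set_page_dirty() above would
then read (same logic, just restyled):

==
	if (pc != NULL && !(pc->flags & PAGE_CGROUP_FLAG_DIRTY)) {
		pc->flags |= PAGE_CGROUP_FLAG_DIRTY;
		__mem_cgroup_stat_add(&pc->mem_cgroup->stat,
				      MEM_CGROUP_STAT_DIRTY, 1);
	}
==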

I guess taste differs,...


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH][RFC] dirty balancing for cgroups
  2008-08-07 13:36                 ` Peter Zijlstra
@ 2008-08-13  7:15                   ` YAMAMOTO Takashi
  2008-08-18  7:58                     ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 19+ messages in thread
From: YAMAMOTO Takashi @ 2008-08-13  7:15 UTC (permalink / raw)
  To: a.p.zijlstra; +Cc: kamezawa.hiroyu, linux-mm, menage, containers, linux-kernel

hi,

> > @@ -485,7 +502,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
> >  		if (PageUnevictable(page) ||
> >  		    (PageActive(page) && !active) ||
> >  		    (!PageActive(page) && active)) {
> > -			__mem_cgroup_move_lists(pc, page_lru(page));
> > +			if (try_lock_page_cgroup(page)) {
> > +				__mem_cgroup_move_lists(pc, page_lru(page));
> > +				unlock_page_cgroup(page);
> > +			}
> >  			continue;
> >  		}
> 
> This chunk seems unrelated and lost....

it's necessary to protect against races with mem_cgroup_{set,clear}_page_dirty,
which modify pc->flags without holding mz->lru_lock.

> I personally dislike the != 0 and == 0 comparisons for bitmask operations;
> they seem to make it harder to read somehow. I prefer to write !(flags
> & mask) and (flags & mask) instead.
> 
> I guess taste differs,...

yes, it seems different. :)

YAMAMOTO Takashi

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH][RFC] dirty balancing for cgroups
  2008-07-14 13:49           ` Peter Zijlstra
  2008-07-17  1:43             ` YAMAMOTO Takashi
@ 2008-08-14  8:38             ` Paul Menage
  1 sibling, 0 replies; 19+ messages in thread
From: Paul Menage @ 2008-08-14  8:38 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: KAMEZAWA Hiroyuki, YAMAMOTO Takashi, linux-mm, containers, linux-kernel

On Mon, Jul 14, 2008 at 6:49 AM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>
> The dirty page limit avoids deadlocks under certain situations; the per-BDI
> dirty limit avoids even more deadlocks by providing isolation
> between BDIs.
>

As well as avoiding deadlocks, in the case of cgroups a big advantage of
dirty limits is that they make it easier to "loan" memory to groups above
and beyond what they have been guaranteed. As long as we limit the
dirty/locked memory for a cgroup to its guarantee, and require any
extra memory to be clean and unlocked, we can reclaim it in a
hurry if another cgroup (that had been guaranteed that memory) needs
it.

Paul

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH][RFC] dirty balancing for cgroups
  2008-08-13  7:15                   ` YAMAMOTO Takashi
@ 2008-08-18  7:58                     ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 19+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-08-18  7:58 UTC (permalink / raw)
  To: YAMAMOTO Takashi; +Cc: a.p.zijlstra, linux-mm, menage, containers, linux-kernel

On Wed, 13 Aug 2008 16:15:05 +0900 (JST)
yamamoto@valinux.co.jp (YAMAMOTO Takashi) wrote:

> hi,
> 
> > > @@ -485,7 +502,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
> > >  		if (PageUnevictable(page) ||
> > >  		    (PageActive(page) && !active) ||
> > >  		    (!PageActive(page) && active)) {
> > > -			__mem_cgroup_move_lists(pc, page_lru(page));
> > > +			if (try_lock_page_cgroup(page)) {
> > > +				__mem_cgroup_move_lists(pc, page_lru(page));
> > > +				unlock_page_cgroup(page);
> > > +			}
> > >  			continue;
> > >  		}
> > 
> > This chunk seems unrelated and lost....
> 
> it's necessary to protect against races with mem_cgroup_{set,clear}_page_dirty,
> which modify pc->flags without holding mz->lru_lock.
> 

I'm now writing a patch to make page_cgroup->flags use atomic ops.
Don't worry about this.
(With the remove-page-lock-cgroup patch, the atomic-ops patch's performance is
 quite good.)
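
As a very rough sketch of that direction (the PCG_DIRTY bit name, the
helper names and the change of pc->flags to an unsigned long bit field
are all assumptions here, not taken from that patch):

==
enum {
	PCG_DIRTY,	/* accounted as dirty */
};

static inline void page_cgroup_set_dirty(struct page_cgroup *pc)
{
	/* atomic bitop, so no lock_page_cgroup() needed for the flag */
	if (!test_and_set_bit(PCG_DIRTY, &pc->flags))
		__mem_cgroup_stat_add(&pc->mem_cgroup->stat,
				      MEM_CGROUP_STAT_DIRTY, 1);
}

static inline void page_cgroup_clear_dirty(struct page_cgroup *pc)
{
	if (test_and_clear_bit(PCG_DIRTY, &pc->flags))
		__mem_cgroup_stat_add(&pc->mem_cgroup->stat,
				      MEM_CGROUP_STAT_DIRTY, -1);
}
==

The stability of pc->mem_cgroup without the lock (force_empty) remains
the open problem mentioned earlier in the thread.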

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2008-08-18  7:52 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-07-09  6:00 [PATCH][RFC] dirty balancing for cgroups YAMAMOTO Takashi
2008-07-10 23:54 ` KAMEZAWA Hiroyuki
2008-07-11  4:06   ` YAMAMOTO Takashi
2008-07-11  5:15     ` KAMEZAWA Hiroyuki
2008-07-11  5:59       ` YAMAMOTO Takashi
2008-07-11  7:13         ` KAMEZAWA Hiroyuki
2008-07-11  8:34           ` YAMAMOTO Takashi
2008-07-11  8:52             ` KAMEZAWA Hiroyuki
2008-08-06  8:20               ` YAMAMOTO Takashi
2008-08-06  8:53                 ` KAMEZAWA Hiroyuki
2008-08-06  9:10                   ` YAMAMOTO Takashi
2008-08-07 13:36                 ` Peter Zijlstra
2008-08-13  7:15                   ` YAMAMOTO Takashi
2008-08-18  7:58                     ` KAMEZAWA Hiroyuki
2008-07-14 13:49           ` Peter Zijlstra
2008-07-17  1:43             ` YAMAMOTO Takashi
2008-08-14  8:38             ` Paul Menage
2008-07-14 14:38           ` kamezawa.hiroyu
2008-07-14 13:37 ` Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).