All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-11 12:20 ` Kautuk Consul
  0 siblings, 0 replies; 47+ messages in thread
From: Kautuk Consul @ 2011-08-11 12:20 UTC (permalink / raw)
  To: linux-mm, linux-kernel

Hi,

Currently the /proc/sys/vm/dirty_writeback_centisecs and
/proc/sys/vm/dirty_expire_centisecs values are
global to the system.
All the BDI flush-* threads are controlled by these central values.

However, the user/admin might want to set different writeback speeds
for different block devices based on
their page write-back performance.
For example, the user might want to write-back pages in smaller
intervals to a block device which has a
faster known writeback speed.

This patch creates 3 new counters (in centisecs) for all the BDI
threads that were controlled centrally by these
2 counters:
i)   /sys/block/<block_dev>/bdi/dirty_writeback_interval,
ii)  /sys/block/<block_dev>/bdi/dirty_expire_interval,
iii) /proc/sys/vm/sync_supers_centisecs.

Although these new counters can be tuned individually, I have taken
care that they be centrally reset by changes
to the /proc/sys/vm/dirty_expire_centisecs and
/proc/sys/vm/dirty_writeback_centisecs so that the earlier
functionality is not broken by distributions using these central values.
After resetting all values centrally, these values can be tuned
individually without altering the central values.

Signed-off-by: Kautuk Consul <consul.kautuk@gmail.com>
---

diff -uprN a/fs/fs-writeback.c b/fs/fs-writeback.c
--- a/fs/fs-writeback.c	2011-08-05 10:29:21.000000000 +0530
+++ b/fs/fs-writeback.c	2011-08-09 09:15:37.093041675 +0530
@@ -638,8 +638,8 @@ static inline bool over_bground_thresh(v
  * just walks the superblock inode list, writing back any inodes which are
  * older than a specific point in time.
  *
- * Try to run once per dirty_writeback_interval.  But if a writeback event
- * takes longer than a dirty_writeback_interval interval, then leave a
+ * Try to run once per bdi->dirty_writeback_interval.  But if a writeback event
+ * takes longer than a bdi->dirty_writeback_interval interval, then leave a
  * one-second gap.
  *
  * older_than_this takes precedence over nr_to_write.  So we'll only write back
@@ -663,7 +663,7 @@ static long wb_writeback(struct bdi_writ
 	if (wbc.for_kupdate) {
 		wbc.older_than_this = &oldest_jif;
 		oldest_jif = jiffies -
-				msecs_to_jiffies(dirty_expire_interval * 10);
+				msecs_to_jiffies(wb->bdi->dirty_expire_interval * 10);
 	}
 	if (!wbc.range_cyclic) {
 		wbc.range_start = 0;
@@ -811,15 +811,16 @@ static long wb_check_old_data_flush(stru
 {
 	unsigned long expired;
 	long nr_pages;
+	struct backing_dev_info *bdi = wb->bdi;

 	/*
 	 * When set to zero, disable periodic writeback
 	 */
-	if (!dirty_writeback_interval)
+	if (!bdi->dirty_writeback_interval)
 		return 0;

 	expired = wb->last_old_flush +
-			msecs_to_jiffies(dirty_writeback_interval * 10);
+			msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
 	if (time_before(jiffies, expired))
 		return 0;

@@ -923,8 +924,8 @@ int bdi_writeback_thread(void *data)
 			continue;
 		}

-		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
+		if (wb_has_dirty_io(wb) && bdi->dirty_writeback_interval)
+			schedule_timeout(msecs_to_jiffies(bdi->dirty_writeback_interval * 10));
 		else {
 			/*
 			 * We have nothing to do, so can go sleep without any
diff -uprN a/include/linux/backing-dev.h b/include/linux/backing-dev.h
--- a/include/linux/backing-dev.h	2011-08-05 10:29:21.000000000 +0530
+++ b/include/linux/backing-dev.h	2011-08-09 09:15:37.094041619 +0530
@@ -76,6 +76,8 @@ struct backing_dev_info {

 	unsigned int min_ratio;
 	unsigned int max_ratio, max_prop_frac;
+	unsigned int dirty_writeback_interval;
+	unsigned int dirty_expire_interval;

 	struct bdi_writeback wb;  /* default writeback info for this bdi */
 	spinlock_t wb_lock;	  /* protects work_list */
@@ -333,4 +335,5 @@ static inline int bdi_sched_wait(void *w
 	return 0;
 }

+extern unsigned int shortest_dirty_writeback_interval;
 #endif		/* _LINUX_BACKING_DEV_H */
diff -uprN a/include/linux/writeback.h b/include/linux/writeback.h
--- a/include/linux/writeback.h	2011-08-05 10:29:21.000000000 +0530
+++ b/include/linux/writeback.h	2011-08-09 10:09:23.581268260 +0530
@@ -100,6 +100,7 @@ extern unsigned long dirty_background_by
 extern int vm_dirty_ratio;
 extern unsigned long vm_dirty_bytes;
 extern unsigned int dirty_writeback_interval;
+extern unsigned int sync_supers_interval;
 extern unsigned int dirty_expire_interval;
 extern int vm_highmem_is_dirtyable;
 extern int block_dump;
@@ -123,6 +124,10 @@ extern int dirty_bytes_handler(struct ct
 struct ctl_table;
 int dirty_writeback_centisecs_handler(struct ctl_table *, int,
 				      void __user *, size_t *, loff_t *);
+int sync_supers_centisecs_handler(struct ctl_table *, int,
+				      void __user *, size_t *, loff_t *);
+int dirty_expire_centisecs_handler(struct ctl_table *, int,
+				      void __user *, size_t *, loff_t *);

 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
 unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
diff -uprN a/kernel/sysctl.c b/kernel/sysctl.c
--- a/kernel/sysctl.c	2011-08-05 10:29:21.000000000 +0530
+++ b/kernel/sysctl.c	2011-08-09 12:39:43.453087554 +0530
@@ -1076,12 +1076,19 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= dirty_writeback_centisecs_handler,
 	},
+    {
+        .procname   = "sync_supers_centisecs",
+        .data       = &sync_supers_interval,
+        .maxlen     = sizeof(sync_supers_interval),
+        .mode       = 0644,
+        .proc_handler   = sync_supers_centisecs_handler,
+    },
 	{
 		.procname	= "dirty_expire_centisecs",
 		.data		= &dirty_expire_interval,
 		.maxlen		= sizeof(dirty_expire_interval),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= dirty_expire_centisecs_handler,
 		.extra1		= &zero,
 	},
 	{
diff -uprN a/mm/backing-dev.c b/mm/backing-dev.c
--- a/mm/backing-dev.c	2011-08-05 10:29:21.000000000 +0530
+++ b/mm/backing-dev.c	2011-08-09 12:08:06.287079027 +0530
@@ -39,6 +39,10 @@ DEFINE_SPINLOCK(bdi_lock);
 LIST_HEAD(bdi_list);
 LIST_HEAD(bdi_pending_list);

+/* Starts at the default dirty_writeback_interval, since that is the
+ * initial shortest_dirty_writeback_interval. */
+unsigned int shortest_dirty_writeback_interval = 5 * 100;
+
 static struct task_struct *sync_supers_tsk;
 static struct timer_list sync_supers_timer;

@@ -204,12 +208,50 @@ static ssize_t max_ratio_store(struct de
 }
 BDI_SHOW(max_ratio, bdi->max_ratio)

+static ssize_t dirty_writeback_interval_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	char *end;
+	unsigned int interval;
+	ssize_t ret = -EINVAL;
+
+	interval = simple_strtoul(buf, &end, 10);
+	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
+		bdi->dirty_writeback_interval = interval;
+		shortest_dirty_writeback_interval =
+						min(shortest_dirty_writeback_interval,interval);
+		ret = count;
+	}
+	return ret;
+}
+BDI_SHOW(dirty_writeback_interval, bdi->dirty_writeback_interval)
+
+static ssize_t dirty_expire_interval_store (struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	char *end;
+	unsigned int interval;
+	ssize_t ret = -EINVAL;
+
+	interval = simple_strtoul(buf, &end, 10);
+	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
+		bdi->dirty_expire_interval = interval;
+		ret = count;
+	}
+	return ret;
+}
+BDI_SHOW(dirty_expire_interval, bdi->dirty_expire_interval)
+
 #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)

 static struct device_attribute bdi_dev_attrs[] = {
 	__ATTR_RW(read_ahead_kb),
 	__ATTR_RW(min_ratio),
 	__ATTR_RW(max_ratio),
+	__ATTR_RW(dirty_writeback_interval),
+	__ATTR_RW(dirty_expire_interval),
 	__ATTR_NULL,
 };

@@ -291,7 +333,7 @@ void bdi_arm_supers_timer(void)
 	if (!dirty_writeback_interval)
 		return;

-	next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
+	next = msecs_to_jiffies(sync_supers_interval* 10) + jiffies;
 	mod_timer(&sync_supers_timer, round_jiffies_up(next));
 }

@@ -336,7 +378,7 @@ void bdi_wakeup_thread_delayed(struct ba
 {
 	unsigned long timeout;

-	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+	timeout = msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
 	mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
 }

@@ -348,7 +390,19 @@ static unsigned long bdi_longest_inactiv
 {
 	unsigned long interval;

-	interval = msecs_to_jiffies(dirty_writeback_interval * 10);
+	interval = msecs_to_jiffies(shortest_dirty_writeback_interval * 10);
+	return max(5UL * 60 * HZ, interval);
+}
+
+/*
+ * Calculate the longest interval (jiffies) this bdi thread is allowed to be
+ * inactive.
+ */
+static unsigned long bdi_longest_inactive_this(struct backing_dev_info *bdi)
+{
+	unsigned long interval;
+
+	interval = msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
 	return max(5UL * 60 * HZ, interval);
 }

@@ -422,7 +476,7 @@ static int bdi_forker_thread(void *ptr)
 			 */
 			if (bdi->wb.task && !have_dirty_io &&
 			    time_after(jiffies, bdi->wb.last_active +
-						bdi_longest_inactive())) {
+						bdi_longest_inactive_this(bdi))) {
 				task = bdi->wb.task;
 				bdi->wb.task = NULL;
 				spin_unlock(&bdi->wb_lock);
@@ -469,7 +523,7 @@ static int bdi_forker_thread(void *ptr)
 			break;

 		case NO_ACTION:
-			if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
+			if (!wb_has_dirty_io(me) || !me->bdi->dirty_writeback_interval)
 				/*
 				 * There are no dirty data. The only thing we
 				 * should now care about is checking for
@@ -479,7 +533,7 @@ static int bdi_forker_thread(void *ptr)
 				 */
 				schedule_timeout(bdi_longest_inactive());
 			else
-				schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
+				schedule_timeout(msecs_to_jiffies(me->bdi->dirty_writeback_interval * 10));
 			try_to_freeze();
 			/* Back to the main loop */
 			continue;
@@ -641,6 +695,8 @@ int bdi_init(struct backing_dev_info *bd
 	bdi->min_ratio = 0;
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = PROP_FRAC_BASE;
+	bdi->dirty_writeback_interval = dirty_writeback_interval;
+	bdi->dirty_expire_interval = dirty_expire_interval;
 	spin_lock_init(&bdi->wb_lock);
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	INIT_LIST_HEAD(&bdi->work_list);
diff -uprN a/mm/page-writeback.c b/mm/page-writeback.c
--- a/mm/page-writeback.c	2011-08-05 10:29:21.000000000 +0530
+++ b/mm/page-writeback.c	2011-08-09 13:09:37.985919961 +0530
@@ -92,6 +92,11 @@ unsigned long vm_dirty_bytes;
 unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */

 /*
+ * The interval between sync_supers thread writebacks
+ */
+unsigned int sync_supers_interval = 5 * 100; /* centiseconds */
+
+/*
  * The longest time for which data is allowed to remain dirty
  */
 unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
@@ -686,8 +691,60 @@ void throttle_vm_writeout(gfp_t gfp_mask
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
+	struct backing_dev_info *bdi;
+
+	proc_dointvec(table, write, buffer, length, ppos);
+
+	if (write) {
+		/* Traverse all the BDIs registered to the BDI list and reset their
+		 * bdi->dirty_writeback_interval to this value. */
+	    spin_lock_bh(&bdi_lock);
+		list_for_each_entry(bdi, &bdi_list, bdi_list)
+			bdi->dirty_writeback_interval = dirty_writeback_interval;
+	    spin_unlock_bh(&bdi_lock);
+
+		sync_supers_interval =
+			shortest_dirty_writeback_interval = dirty_writeback_interval;
+
+	}
+
+	bdi_arm_supers_timer();
+
+	return 0;
+}
+
+/*
+ * sysctl handler for /proc/sys/vm/sync_supers_centisecs
+ */
+int sync_supers_centisecs_handler(ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
 	proc_dointvec(table, write, buffer, length, ppos);
+
 	bdi_arm_supers_timer();
+
+	return 0;
+}
+
+/*
+ * sysctl handler for /proc/sys/vm/dirty_expire_centisecs
+ */
+int dirty_expire_centisecs_handler(ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct backing_dev_info *bdi;
+
+	proc_dointvec_minmax(table, write, buffer, length, ppos);
+
+	if (write) {
+		/* Traverse all the BDIs registered to the BDI list and reset their
+		 * bdi->dirty_expire_interval to this value. */
+	    spin_lock_bh(&bdi_lock);
+		list_for_each_entry(bdi, &bdi_list, bdi_list)
+			bdi->dirty_expire_interval = dirty_expire_interval;
+	    spin_unlock_bh(&bdi_lock);
+	}
+
 	return 0;
 }

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-11 12:20 ` Kautuk Consul
  0 siblings, 0 replies; 47+ messages in thread
From: Kautuk Consul @ 2011-08-11 12:20 UTC (permalink / raw)
  To: linux-mm, linux-kernel

Hi,

Currently the /proc/sys/vm/dirty_writeback_centisecs and
/proc/sys/vm/dirty_expire_centisecs values are
global to the system.
All the BDI flush-* threads are controlled by these central values.

However, the user/admin might want to set different writeback speeds
for different block devices based on
their page write-back performance.
For example, the user might want to write-back pages in smaller
intervals to a block device which has a
faster known writeback speed.

This patch creates 3 new counters (in centisecs) for all the BDI
threads that were controlled centrally by these
2 counters:
i)   /sys/block/<block_dev>/bdi/dirty_writeback_interval,
ii)  /sys/block/<block_dev>/bdi/dirty_expire_interval,
iii) /proc/sys/vm/sync_supers_centisecs.

Although these new counters can be tuned individually, I have taken
care that they be centrally reset by changes
to the /proc/sys/vm/dirty_expire_centisecs and
/proc/sys/vm/dirty_writeback_centisecs so that the earlier
functionality is not broken by distributions using these central values.
After resetting all values centrally, these values can be tuned
individually without altering the central values.

Signed-off-by: Kautuk Consul <consul.kautuk@gmail.com>
---

diff -uprN a/fs/fs-writeback.c b/fs/fs-writeback.c
--- a/fs/fs-writeback.c	2011-08-05 10:29:21.000000000 +0530
+++ b/fs/fs-writeback.c	2011-08-09 09:15:37.093041675 +0530
@@ -638,8 +638,8 @@ static inline bool over_bground_thresh(v
  * just walks the superblock inode list, writing back any inodes which are
  * older than a specific point in time.
  *
- * Try to run once per dirty_writeback_interval.  But if a writeback event
- * takes longer than a dirty_writeback_interval interval, then leave a
+ * Try to run once per bdi->dirty_writeback_interval.  But if a writeback event
+ * takes longer than a bdi->dirty_writeback_interval interval, then leave a
  * one-second gap.
  *
  * older_than_this takes precedence over nr_to_write.  So we'll only write back
@@ -663,7 +663,7 @@ static long wb_writeback(struct bdi_writ
 	if (wbc.for_kupdate) {
 		wbc.older_than_this = &oldest_jif;
 		oldest_jif = jiffies -
-				msecs_to_jiffies(dirty_expire_interval * 10);
+				msecs_to_jiffies(wb->bdi->dirty_expire_interval * 10);
 	}
 	if (!wbc.range_cyclic) {
 		wbc.range_start = 0;
@@ -811,15 +811,16 @@ static long wb_check_old_data_flush(stru
 {
 	unsigned long expired;
 	long nr_pages;
+	struct backing_dev_info *bdi = wb->bdi;

 	/*
 	 * When set to zero, disable periodic writeback
 	 */
-	if (!dirty_writeback_interval)
+	if (!bdi->dirty_writeback_interval)
 		return 0;

 	expired = wb->last_old_flush +
-			msecs_to_jiffies(dirty_writeback_interval * 10);
+			msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
 	if (time_before(jiffies, expired))
 		return 0;

@@ -923,8 +924,8 @@ int bdi_writeback_thread(void *data)
 			continue;
 		}

-		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
+		if (wb_has_dirty_io(wb) && bdi->dirty_writeback_interval)
+			schedule_timeout(msecs_to_jiffies(bdi->dirty_writeback_interval * 10));
 		else {
 			/*
 			 * We have nothing to do, so can go sleep without any
diff -uprN a/include/linux/backing-dev.h b/include/linux/backing-dev.h
--- a/include/linux/backing-dev.h	2011-08-05 10:29:21.000000000 +0530
+++ b/include/linux/backing-dev.h	2011-08-09 09:15:37.094041619 +0530
@@ -76,6 +76,8 @@ struct backing_dev_info {

 	unsigned int min_ratio;
 	unsigned int max_ratio, max_prop_frac;
+	unsigned int dirty_writeback_interval;
+	unsigned int dirty_expire_interval;

 	struct bdi_writeback wb;  /* default writeback info for this bdi */
 	spinlock_t wb_lock;	  /* protects work_list */
@@ -333,4 +335,5 @@ static inline int bdi_sched_wait(void *w
 	return 0;
 }

+extern unsigned int shortest_dirty_writeback_interval;
 #endif		/* _LINUX_BACKING_DEV_H */
diff -uprN a/include/linux/writeback.h b/include/linux/writeback.h
--- a/include/linux/writeback.h	2011-08-05 10:29:21.000000000 +0530
+++ b/include/linux/writeback.h	2011-08-09 10:09:23.581268260 +0530
@@ -100,6 +100,7 @@ extern unsigned long dirty_background_by
 extern int vm_dirty_ratio;
 extern unsigned long vm_dirty_bytes;
 extern unsigned int dirty_writeback_interval;
+extern unsigned int sync_supers_interval;
 extern unsigned int dirty_expire_interval;
 extern int vm_highmem_is_dirtyable;
 extern int block_dump;
@@ -123,6 +124,10 @@ extern int dirty_bytes_handler(struct ct
 struct ctl_table;
 int dirty_writeback_centisecs_handler(struct ctl_table *, int,
 				      void __user *, size_t *, loff_t *);
+int sync_supers_centisecs_handler(struct ctl_table *, int,
+				      void __user *, size_t *, loff_t *);
+int dirty_expire_centisecs_handler(struct ctl_table *, int,
+				      void __user *, size_t *, loff_t *);

 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
 unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
diff -uprN a/kernel/sysctl.c b/kernel/sysctl.c
--- a/kernel/sysctl.c	2011-08-05 10:29:21.000000000 +0530
+++ b/kernel/sysctl.c	2011-08-09 12:39:43.453087554 +0530
@@ -1076,12 +1076,19 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= dirty_writeback_centisecs_handler,
 	},
+    {
+        .procname   = "sync_supers_centisecs",
+        .data       = &sync_supers_interval,
+        .maxlen     = sizeof(sync_supers_interval),
+        .mode       = 0644,
+        .proc_handler   = sync_supers_centisecs_handler,
+    },
 	{
 		.procname	= "dirty_expire_centisecs",
 		.data		= &dirty_expire_interval,
 		.maxlen		= sizeof(dirty_expire_interval),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= dirty_expire_centisecs_handler,
 		.extra1		= &zero,
 	},
 	{
diff -uprN a/mm/backing-dev.c b/mm/backing-dev.c
--- a/mm/backing-dev.c	2011-08-05 10:29:21.000000000 +0530
+++ b/mm/backing-dev.c	2011-08-09 12:08:06.287079027 +0530
@@ -39,6 +39,10 @@ DEFINE_SPINLOCK(bdi_lock);
 LIST_HEAD(bdi_list);
 LIST_HEAD(bdi_pending_list);

+/* Starts at the default dirty_writeback_interval, since that is the
+ * initial shortest_dirty_writeback_interval. */
+unsigned int shortest_dirty_writeback_interval = 5 * 100;
+
 static struct task_struct *sync_supers_tsk;
 static struct timer_list sync_supers_timer;

@@ -204,12 +208,50 @@ static ssize_t max_ratio_store(struct de
 }
 BDI_SHOW(max_ratio, bdi->max_ratio)

+static ssize_t dirty_writeback_interval_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	char *end;
+	unsigned int interval;
+	ssize_t ret = -EINVAL;
+
+	interval = simple_strtoul(buf, &end, 10);
+	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
+		bdi->dirty_writeback_interval = interval;
+		shortest_dirty_writeback_interval =
+						min(shortest_dirty_writeback_interval,interval);
+		ret = count;
+	}
+	return ret;
+}
+BDI_SHOW(dirty_writeback_interval, bdi->dirty_writeback_interval)
+
+static ssize_t dirty_expire_interval_store (struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	char *end;
+	unsigned int interval;
+	ssize_t ret = -EINVAL;
+
+	interval = simple_strtoul(buf, &end, 10);
+	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
+		bdi->dirty_expire_interval = interval;
+		ret = count;
+	}
+	return ret;
+}
+BDI_SHOW(dirty_expire_interval, bdi->dirty_expire_interval)
+
 #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)

 static struct device_attribute bdi_dev_attrs[] = {
 	__ATTR_RW(read_ahead_kb),
 	__ATTR_RW(min_ratio),
 	__ATTR_RW(max_ratio),
+	__ATTR_RW(dirty_writeback_interval),
+	__ATTR_RW(dirty_expire_interval),
 	__ATTR_NULL,
 };

@@ -291,7 +333,7 @@ void bdi_arm_supers_timer(void)
 	if (!dirty_writeback_interval)
 		return;

-	next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
+	next = msecs_to_jiffies(sync_supers_interval* 10) + jiffies;
 	mod_timer(&sync_supers_timer, round_jiffies_up(next));
 }

@@ -336,7 +378,7 @@ void bdi_wakeup_thread_delayed(struct ba
 {
 	unsigned long timeout;

-	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+	timeout = msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
 	mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
 }

@@ -348,7 +390,19 @@ static unsigned long bdi_longest_inactiv
 {
 	unsigned long interval;

-	interval = msecs_to_jiffies(dirty_writeback_interval * 10);
+	interval = msecs_to_jiffies(shortest_dirty_writeback_interval * 10);
+	return max(5UL * 60 * HZ, interval);
+}
+
+/*
+ * Calculate the longest interval (jiffies) this bdi thread is allowed to be
+ * inactive.
+ */
+static unsigned long bdi_longest_inactive_this(struct backing_dev_info *bdi)
+{
+	unsigned long interval;
+
+	interval = msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
 	return max(5UL * 60 * HZ, interval);
 }

@@ -422,7 +476,7 @@ static int bdi_forker_thread(void *ptr)
 			 */
 			if (bdi->wb.task && !have_dirty_io &&
 			    time_after(jiffies, bdi->wb.last_active +
-						bdi_longest_inactive())) {
+						bdi_longest_inactive_this(bdi))) {
 				task = bdi->wb.task;
 				bdi->wb.task = NULL;
 				spin_unlock(&bdi->wb_lock);
@@ -469,7 +523,7 @@ static int bdi_forker_thread(void *ptr)
 			break;

 		case NO_ACTION:
-			if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
+			if (!wb_has_dirty_io(me) || !me->bdi->dirty_writeback_interval)
 				/*
 				 * There are no dirty data. The only thing we
 				 * should now care about is checking for
@@ -479,7 +533,7 @@ static int bdi_forker_thread(void *ptr)
 				 */
 				schedule_timeout(bdi_longest_inactive());
 			else
-				schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
+				schedule_timeout(msecs_to_jiffies(me->bdi->dirty_writeback_interval * 10));
 			try_to_freeze();
 			/* Back to the main loop */
 			continue;
@@ -641,6 +695,8 @@ int bdi_init(struct backing_dev_info *bd
 	bdi->min_ratio = 0;
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = PROP_FRAC_BASE;
+	bdi->dirty_writeback_interval = dirty_writeback_interval;
+	bdi->dirty_expire_interval = dirty_expire_interval;
 	spin_lock_init(&bdi->wb_lock);
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	INIT_LIST_HEAD(&bdi->work_list);
diff -uprN a/mm/page-writeback.c b/mm/page-writeback.c
--- a/mm/page-writeback.c	2011-08-05 10:29:21.000000000 +0530
+++ b/mm/page-writeback.c	2011-08-09 13:09:37.985919961 +0530
@@ -92,6 +92,11 @@ unsigned long vm_dirty_bytes;
 unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */

 /*
+ * The interval between sync_supers thread writebacks
+ */
+unsigned int sync_supers_interval = 5 * 100; /* centiseconds */
+
+/*
  * The longest time for which data is allowed to remain dirty
  */
 unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
@@ -686,8 +691,60 @@ void throttle_vm_writeout(gfp_t gfp_mask
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
+	struct backing_dev_info *bdi;
+
+	proc_dointvec(table, write, buffer, length, ppos);
+
+	if (write) {
+		/* Traverse all the BDIs registered to the BDI list and reset their
+		 * bdi->dirty_writeback_interval to this value. */
+	    spin_lock_bh(&bdi_lock);
+		list_for_each_entry(bdi, &bdi_list, bdi_list)
+			bdi->dirty_writeback_interval = dirty_writeback_interval;
+	    spin_unlock_bh(&bdi_lock);
+
+		sync_supers_interval =
+			shortest_dirty_writeback_interval = dirty_writeback_interval;
+
+	}
+
+	bdi_arm_supers_timer();
+
+	return 0;
+}
+
+/*
+ * sysctl handler for /proc/sys/vm/sync_supers_centisecs
+ */
+int sync_supers_centisecs_handler(ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
 	proc_dointvec(table, write, buffer, length, ppos);
+
 	bdi_arm_supers_timer();
+
+	return 0;
+}
+
+/*
+ * sysctl handler for /proc/sys/vm/dirty_expire_centisecs
+ */
+int dirty_expire_centisecs_handler(ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct backing_dev_info *bdi;
+
+	proc_dointvec_minmax(table, write, buffer, length, ppos);
+
+	if (write) {
+		/* Traverse all the BDIs registered to the BDI list and reset their
+		 * bdi->dirty_expire_interval to this value. */
+	    spin_lock_bh(&bdi_lock);
+		list_for_each_entry(bdi, &bdi_list, bdi_list)
+			bdi->dirty_expire_interval = dirty_expire_interval;
+	    spin_unlock_bh(&bdi_lock);
+	}
+
 	return 0;
 }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-11 12:20 ` Kautuk Consul
@ 2011-08-18  9:48   ` Wu Fengguang
  -1 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-18  9:48 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Mel Gorman, KOSAKI Motohiro, linux-mm, linux-kernel, linux-fsdevel

Hi Kautuk,

Add CC to fsdevel and Mel and KOSAKI.

When submitting patches you can find the relevant mailing list and
developers to CC with this command under the kernel source tree:

        scripts/get_maintainer.pl YOUR-PATCH-FILE

On Thu, Aug 11, 2011 at 05:50:56PM +0530, Kautuk Consul wrote:
> Hi,
> 
> Currently the /proc/sys/vm/dirty_writeback_centisecs and
> /proc/sys/vm/dirty_expire_centisecs values are
> global to the system.
> All the BDI flush-* threads are controlled by these central values.

Yes.

> However, the user/admin might want to set different writeback speeds
> for different block devices based on
> their page write-back performance.

How can the above two sysctl values impact "writeback speeds"?
In particular, what's the "speed" you mean?

> For example, the user might want to write-back pages in smaller
> intervals to a block device which has a
> faster known writeback speed.

That's not a complete rationale. What does the user ultimately want by
setting a smaller interval? What would be the problems to the other
slow devices if the user does so by simply setting a small value
_globally_?

We need strong use cases for doing such user interface changes.
Would you detail the problem and the pains that can only (or best)
be addressed by this patch?

Thanks,
Fengguang

> This patch creates 3 new counters (in centisecs) for all the BDI
> threads that were controlled centrally by these
> 2 counters:
> i)   /sys/block/<block_dev>/bdi/dirty_writeback_interval,
> ii)  /sys/block/<block_dev>/bdi/dirty_expire_interval,
> iii) /proc/sys/vm/sync_supers_centisecs.
> 
> Although these new counters can be tuned individually, I have taken
> care that they be centrally reset by changes
> to the /proc/sys/vm/dirty_expire_centisecs and
> /proc/sys/vm/dirty_writeback_centisecs so that the earlier
> functionality is not broken by distributions using these central values.
> After resetting all values centrally, these values can be tuned
> individually without altering the central values.
> 
> Signed-off-by: Kautuk Consul <consul.kautuk@gmail.com>
> ---
> 
> diff -uprN a/fs/fs-writeback.c b/fs/fs-writeback.c
> --- a/fs/fs-writeback.c	2011-08-05 10:29:21.000000000 +0530
> +++ b/fs/fs-writeback.c	2011-08-09 09:15:37.093041675 +0530
> @@ -638,8 +638,8 @@ static inline bool over_bground_thresh(v
>   * just walks the superblock inode list, writing back any inodes which are
>   * older than a specific point in time.
>   *
> - * Try to run once per dirty_writeback_interval.  But if a writeback event
> - * takes longer than a dirty_writeback_interval interval, then leave a
> + * Try to run once per bdi->dirty_writeback_interval.  But if a writeback event
> + * takes longer than a bdi->dirty_writeback_interval interval, then leave a
>   * one-second gap.
>   *
>   * older_than_this takes precedence over nr_to_write.  So we'll only write back
> @@ -663,7 +663,7 @@ static long wb_writeback(struct bdi_writ
>  	if (wbc.for_kupdate) {
>  		wbc.older_than_this = &oldest_jif;
>  		oldest_jif = jiffies -
> -				msecs_to_jiffies(dirty_expire_interval * 10);
> +				msecs_to_jiffies(wb->bdi->dirty_expire_interval * 10);
>  	}
>  	if (!wbc.range_cyclic) {
>  		wbc.range_start = 0;
> @@ -811,15 +811,16 @@ static long wb_check_old_data_flush(stru
>  {
>  	unsigned long expired;
>  	long nr_pages;
> +	struct backing_dev_info *bdi = wb->bdi;
> 
>  	/*
>  	 * When set to zero, disable periodic writeback
>  	 */
> -	if (!dirty_writeback_interval)
> +	if (!bdi->dirty_writeback_interval)
>  		return 0;
> 
>  	expired = wb->last_old_flush +
> -			msecs_to_jiffies(dirty_writeback_interval * 10);
> +			msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
>  	if (time_before(jiffies, expired))
>  		return 0;
> 
> @@ -923,8 +924,8 @@ int bdi_writeback_thread(void *data)
>  			continue;
>  		}
> 
> -		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
> -			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
> +		if (wb_has_dirty_io(wb) && bdi->dirty_writeback_interval)
> +			schedule_timeout(msecs_to_jiffies(bdi->dirty_writeback_interval * 10));
>  		else {
>  			/*
>  			 * We have nothing to do, so can go sleep without any
> diff -uprN a/include/linux/backing-dev.h b/include/linux/backing-dev.h
> --- a/include/linux/backing-dev.h	2011-08-05 10:29:21.000000000 +0530
> +++ b/include/linux/backing-dev.h	2011-08-09 09:15:37.094041619 +0530
> @@ -76,6 +76,8 @@ struct backing_dev_info {
> 
>  	unsigned int min_ratio;
>  	unsigned int max_ratio, max_prop_frac;
> +	unsigned int dirty_writeback_interval;
> +	unsigned int dirty_expire_interval;
> 
>  	struct bdi_writeback wb;  /* default writeback info for this bdi */
>  	spinlock_t wb_lock;	  /* protects work_list */
> @@ -333,4 +335,5 @@ static inline int bdi_sched_wait(void *w
>  	return 0;
>  }
> 
> +extern unsigned int shortest_dirty_writeback_interval;
>  #endif		/* _LINUX_BACKING_DEV_H */
> diff -uprN a/include/linux/writeback.h b/include/linux/writeback.h
> --- a/include/linux/writeback.h	2011-08-05 10:29:21.000000000 +0530
> +++ b/include/linux/writeback.h	2011-08-09 10:09:23.581268260 +0530
> @@ -100,6 +100,7 @@ extern unsigned long dirty_background_by
>  extern int vm_dirty_ratio;
>  extern unsigned long vm_dirty_bytes;
>  extern unsigned int dirty_writeback_interval;
> +extern unsigned int sync_supers_interval;
>  extern unsigned int dirty_expire_interval;
>  extern int vm_highmem_is_dirtyable;
>  extern int block_dump;
> @@ -123,6 +124,10 @@ extern int dirty_bytes_handler(struct ct
>  struct ctl_table;
>  int dirty_writeback_centisecs_handler(struct ctl_table *, int,
>  				      void __user *, size_t *, loff_t *);
> +int sync_supers_centisecs_handler(struct ctl_table *, int,
> +				      void __user *, size_t *, loff_t *);
> +int dirty_expire_centisecs_handler(struct ctl_table *, int,
> +				      void __user *, size_t *, loff_t *);
> 
>  void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
>  unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
> diff -uprN a/kernel/sysctl.c b/kernel/sysctl.c
> --- a/kernel/sysctl.c	2011-08-05 10:29:21.000000000 +0530
> +++ b/kernel/sysctl.c	2011-08-09 12:39:43.453087554 +0530
> @@ -1076,12 +1076,19 @@ static struct ctl_table vm_table[] = {
>  		.mode		= 0644,
>  		.proc_handler	= dirty_writeback_centisecs_handler,
>  	},
> +    {
> +        .procname   = "sync_supers_centisecs",
> +        .data       = &sync_supers_interval,
> +        .maxlen     = sizeof(sync_supers_interval),
> +        .mode       = 0644,
> +        .proc_handler   = sync_supers_centisecs_handler,
> +    },
>  	{
>  		.procname	= "dirty_expire_centisecs",
>  		.data		= &dirty_expire_interval,
>  		.maxlen		= sizeof(dirty_expire_interval),
>  		.mode		= 0644,
> -		.proc_handler	= proc_dointvec_minmax,
> +		.proc_handler	= dirty_expire_centisecs_handler,
>  		.extra1		= &zero,
>  	},
>  	{
> diff -uprN a/mm/backing-dev.c b/mm/backing-dev.c
> --- a/mm/backing-dev.c	2011-08-05 10:29:21.000000000 +0530
> +++ b/mm/backing-dev.c	2011-08-09 12:08:06.287079027 +0530
> @@ -39,6 +39,10 @@ DEFINE_SPINLOCK(bdi_lock);
>  LIST_HEAD(bdi_list);
>  LIST_HEAD(bdi_pending_list);
> 
> +/* Same value as the dirty_writeback_interval as this is what our
> + * initial shortest_dirty_writeback_interval. */
> +unsigned int shortest_dirty_writeback_interval = 5 * 100;
> +
>  static struct task_struct *sync_supers_tsk;
>  static struct timer_list sync_supers_timer;
> 
> @@ -204,12 +208,50 @@ static ssize_t max_ratio_store(struct de
>  }
>  BDI_SHOW(max_ratio, bdi->max_ratio)
> 
> +static ssize_t dirty_writeback_interval_store(struct device *dev,
> +		struct device_attribute *attr, const char *buf, size_t count)
> +{
> +	struct backing_dev_info *bdi = dev_get_drvdata(dev);
> +	char *end;
> +	unsigned int interval;
> +	ssize_t ret = -EINVAL;
> +
> +	interval = simple_strtoul(buf, &end, 10);
> +	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
> +		bdi->dirty_writeback_interval = interval;
> +		shortest_dirty_writeback_interval =
> +						min(shortest_dirty_writeback_interval,interval);
> +		ret = count;
> +	}
> +	return ret;
> +}
> +BDI_SHOW(dirty_writeback_interval, bdi->dirty_writeback_interval)
> +
> +static ssize_t dirty_expire_interval_store (struct device *dev,
> +		struct device_attribute *attr, const char *buf, size_t count)
> +{
> +	struct backing_dev_info *bdi = dev_get_drvdata(dev);
> +	char *end;
> +	unsigned int interval;
> +	ssize_t ret = -EINVAL;
> +
> +	interval = simple_strtoul(buf, &end, 10);
> +	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
> +		bdi->dirty_expire_interval = interval;
> +		ret = count;
> +	}
> +	return ret;
> +}
> +BDI_SHOW(dirty_expire_interval, bdi->dirty_expire_interval)
> +
>  #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
> 
>  static struct device_attribute bdi_dev_attrs[] = {
>  	__ATTR_RW(read_ahead_kb),
>  	__ATTR_RW(min_ratio),
>  	__ATTR_RW(max_ratio),
> +	__ATTR_RW(dirty_writeback_interval),
> +	__ATTR_RW(dirty_expire_interval),
>  	__ATTR_NULL,
>  };
> 
> @@ -291,7 +333,7 @@ void bdi_arm_supers_timer(void)
>  	if (!dirty_writeback_interval)
>  		return;
> 
> -	next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
> +	next = msecs_to_jiffies(sync_supers_interval* 10) + jiffies;
>  	mod_timer(&sync_supers_timer, round_jiffies_up(next));
>  }
> 
> @@ -336,7 +378,7 @@ void bdi_wakeup_thread_delayed(struct ba
>  {
>  	unsigned long timeout;
> 
> -	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
> +	timeout = msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
>  	mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
>  }
> 
> @@ -348,7 +390,19 @@ static unsigned long bdi_longest_inactiv
>  {
>  	unsigned long interval;
> 
> -	interval = msecs_to_jiffies(dirty_writeback_interval * 10);
> +	interval = msecs_to_jiffies(shortest_dirty_writeback_interval * 10);
> +	return max(5UL * 60 * HZ, interval);
> +}
> +
> +/*
> + * Calculate the longest interval (jiffies) this bdi thread is allowed to be
> + * inactive.
> + */
> +static unsigned long bdi_longest_inactive_this(struct backing_dev_info *bdi)
> +{
> +	unsigned long interval;
> +
> +	interval = msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
>  	return max(5UL * 60 * HZ, interval);
>  }
> 
> @@ -422,7 +476,7 @@ static int bdi_forker_thread(void *ptr)
>  			 */
>  			if (bdi->wb.task && !have_dirty_io &&
>  			    time_after(jiffies, bdi->wb.last_active +
> -						bdi_longest_inactive())) {
> +						bdi_longest_inactive_this(bdi))) {
>  				task = bdi->wb.task;
>  				bdi->wb.task = NULL;
>  				spin_unlock(&bdi->wb_lock);
> @@ -469,7 +523,7 @@ static int bdi_forker_thread(void *ptr)
>  			break;
> 
>  		case NO_ACTION:
> -			if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
> +			if (!wb_has_dirty_io(me) || !me->bdi->dirty_writeback_interval)
>  				/*
>  				 * There are no dirty data. The only thing we
>  				 * should now care about is checking for
> @@ -479,7 +533,7 @@ static int bdi_forker_thread(void *ptr)
>  				 */
>  				schedule_timeout(bdi_longest_inactive());
>  			else
> -				schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
> +				schedule_timeout(msecs_to_jiffies(me->bdi->dirty_writeback_interval * 10));
>  			try_to_freeze();
>  			/* Back to the main loop */
>  			continue;
> @@ -641,6 +695,8 @@ int bdi_init(struct backing_dev_info *bd
>  	bdi->min_ratio = 0;
>  	bdi->max_ratio = 100;
>  	bdi->max_prop_frac = PROP_FRAC_BASE;
> +	bdi->dirty_writeback_interval = dirty_writeback_interval;
> +	bdi->dirty_expire_interval = dirty_expire_interval;
>  	spin_lock_init(&bdi->wb_lock);
>  	INIT_LIST_HEAD(&bdi->bdi_list);
>  	INIT_LIST_HEAD(&bdi->work_list);
> diff -uprN a/mm/page-writeback.c b/mm/page-writeback.c
> --- a/mm/page-writeback.c	2011-08-05 10:29:21.000000000 +0530
> +++ b/mm/page-writeback.c	2011-08-09 13:09:37.985919961 +0530
> @@ -92,6 +92,11 @@ unsigned long vm_dirty_bytes;
>  unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
> 
>  /*
> + * The interval between sync_supers thread writebacks
> + */
> +unsigned int sync_supers_interval = 5 * 100; /* centiseconds */
> +
> +/*
>   * The longest time for which data is allowed to remain dirty
>   */
>  unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
> @@ -686,8 +691,60 @@ void throttle_vm_writeout(gfp_t gfp_mask
>  int dirty_writeback_centisecs_handler(ctl_table *table, int write,
>  	void __user *buffer, size_t *length, loff_t *ppos)
>  {
> +	struct backing_dev_info *bdi;
> +
> +	proc_dointvec(table, write, buffer, length, ppos);
> +
> +	if (write) {
> +		/* Traverse all the BDIs registered to the BDI list and reset their
> +		 * bdi->dirty_writeback_interval to this value. */
> +	    spin_lock_bh(&bdi_lock);
> +		list_for_each_entry(bdi, &bdi_list, bdi_list)
> +			bdi->dirty_writeback_interval = dirty_writeback_interval;
> +	    spin_unlock_bh(&bdi_lock);
> +
> +		sync_supers_interval =
> +			shortest_dirty_writeback_interval = dirty_writeback_interval;
> +
> +	}
> +
> +	bdi_arm_supers_timer();
> +
> +	return 0;
> +}
> +
> +/*
> + * sysctl handler for /proc/sys/vm/sync_supers_centisecs
> + */
> +int sync_supers_centisecs_handler(ctl_table *table, int write,
> +	void __user *buffer, size_t *length, loff_t *ppos)
> +{
>  	proc_dointvec(table, write, buffer, length, ppos);
> +
>  	bdi_arm_supers_timer();
> +
> +	return 0;
> +}
> +
> +/*
> + * sysctl handler for /proc/sys/vm/dirty_expire_centisecs
> + */
> +int dirty_expire_centisecs_handler(ctl_table *table, int write,
> +	void __user *buffer, size_t *length, loff_t *ppos)
> +{
> +	struct backing_dev_info *bdi;
> +
> +	proc_dointvec_minmax(table, write, buffer, length, ppos);
> +
> +	if (write) {
> +		/* Traverse all the BDIs registered to the BDI list and reset their
> +		 * bdi->dirty_expire_interval to this value. */
> +	    spin_lock_bh(&bdi_lock);
> +		list_for_each_entry(bdi, &bdi_list, bdi_list)
> +			bdi->dirty_expire_interval = dirty_expire_interval;
> +	    spin_unlock_bh(&bdi_lock);
> +	}
> +
>  	return 0;
>  }
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-18  9:48   ` Wu Fengguang
  0 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-18  9:48 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Mel Gorman, KOSAKI Motohiro, linux-mm, linux-kernel, linux-fsdevel

Hi Kautuk,

Add CC to fsdevel and Mel and KOSAKI.

When submitting patches you can find the relevant mailing list and
developers to CC with this command under the kernel source tree:

        scripts/get_maintainer.pl YOUR-PATCH-FILE

On Thu, Aug 11, 2011 at 05:50:56PM +0530, Kautuk Consul wrote:
> Hi,
> 
> Currently the /proc/sys/vm/dirty_writeback_centisecs and
> /proc/sys/vm/dirty_expire_centisecs values are
> global to the system.
> All the BDI flush-* threads are controlled by these central values.

Yes.

> However, the user/admin might want to set different writeback speeds
> for different block devices based on
> their page write-back performance.

How can the above two sysctl values impact "writeback speeds"?
In particular, what's the "speed" you mean?

> For example, the user might want to write-back pages in smaller
> intervals to a block device which has a
> faster known writeback speed.

That's not a complete rationale. What does the user ultimately want by
setting a smaller interval? What would be the problems to the other
slow devices if the user does so by simply setting a small value
_globally_?

We need strong use cases for doing such user interface changes.
Would you detail the problem and the pains that can only (or best)
be addressed by this patch?

Thanks,
Fengguang

> This patch creates 3 new counters (in centisecs) for all the BDI
> threads that were controlled centrally by these
> 2 counters:
> i)   /sys/block/<block_dev>/bdi/dirty_writeback_interval,
> ii)  /sys/block/<block_dev>/bdi/dirty_expire_interval,
> iii) /proc/sys/vm/sync_supers_centisecs.
> 
> Although these new counters can be tuned individually, I have taken
> care that they be centrally reset by changes
> to the /proc/sys/vm/dirty_expire_centisecs and
> /proc/sys/vm/dirty_writeback_centisecs so that the earlier
> functionality is not broken by distributions using these central values.
> After resetting all values centrally, these values can be tuned
> individually without altering the central values.
> 
> Signed-off-by: Kautuk Consul <consul.kautuk@gmail.com>
> ---
> 
> diff -uprN a/fs/fs-writeback.c b/fs/fs-writeback.c
> --- a/fs/fs-writeback.c	2011-08-05 10:29:21.000000000 +0530
> +++ b/fs/fs-writeback.c	2011-08-09 09:15:37.093041675 +0530
> @@ -638,8 +638,8 @@ static inline bool over_bground_thresh(v
>   * just walks the superblock inode list, writing back any inodes which are
>   * older than a specific point in time.
>   *
> - * Try to run once per dirty_writeback_interval.  But if a writeback event
> - * takes longer than a dirty_writeback_interval interval, then leave a
> + * Try to run once per bdi->dirty_writeback_interval.  But if a writeback event
> + * takes longer than a bdi->dirty_writeback_interval interval, then leave a
>   * one-second gap.
>   *
>   * older_than_this takes precedence over nr_to_write.  So we'll only write back
> @@ -663,7 +663,7 @@ static long wb_writeback(struct bdi_writ
>  	if (wbc.for_kupdate) {
>  		wbc.older_than_this = &oldest_jif;
>  		oldest_jif = jiffies -
> -				msecs_to_jiffies(dirty_expire_interval * 10);
> +				msecs_to_jiffies(wb->bdi->dirty_expire_interval * 10);
>  	}
>  	if (!wbc.range_cyclic) {
>  		wbc.range_start = 0;
> @@ -811,15 +811,16 @@ static long wb_check_old_data_flush(stru
>  {
>  	unsigned long expired;
>  	long nr_pages;
> +	struct backing_dev_info *bdi = wb->bdi;
> 
>  	/*
>  	 * When set to zero, disable periodic writeback
>  	 */
> -	if (!dirty_writeback_interval)
> +	if (!bdi->dirty_writeback_interval)
>  		return 0;
> 
>  	expired = wb->last_old_flush +
> -			msecs_to_jiffies(dirty_writeback_interval * 10);
> +			msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
>  	if (time_before(jiffies, expired))
>  		return 0;
> 
> @@ -923,8 +924,8 @@ int bdi_writeback_thread(void *data)
>  			continue;
>  		}
> 
> -		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
> -			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
> +		if (wb_has_dirty_io(wb) && bdi->dirty_writeback_interval)
> +			schedule_timeout(msecs_to_jiffies(bdi->dirty_writeback_interval * 10));
>  		else {
>  			/*
>  			 * We have nothing to do, so can go sleep without any
> diff -uprN a/include/linux/backing-dev.h b/include/linux/backing-dev.h
> --- a/include/linux/backing-dev.h	2011-08-05 10:29:21.000000000 +0530
> +++ b/include/linux/backing-dev.h	2011-08-09 09:15:37.094041619 +0530
> @@ -76,6 +76,8 @@ struct backing_dev_info {
> 
>  	unsigned int min_ratio;
>  	unsigned int max_ratio, max_prop_frac;
> +	unsigned int dirty_writeback_interval;
> +	unsigned int dirty_expire_interval;
> 
>  	struct bdi_writeback wb;  /* default writeback info for this bdi */
>  	spinlock_t wb_lock;	  /* protects work_list */
> @@ -333,4 +335,5 @@ static inline int bdi_sched_wait(void *w
>  	return 0;
>  }
> 
> +extern unsigned int shortest_dirty_writeback_interval;
>  #endif		/* _LINUX_BACKING_DEV_H */
> diff -uprN a/include/linux/writeback.h b/include/linux/writeback.h
> --- a/include/linux/writeback.h	2011-08-05 10:29:21.000000000 +0530
> +++ b/include/linux/writeback.h	2011-08-09 10:09:23.581268260 +0530
> @@ -100,6 +100,7 @@ extern unsigned long dirty_background_by
>  extern int vm_dirty_ratio;
>  extern unsigned long vm_dirty_bytes;
>  extern unsigned int dirty_writeback_interval;
> +extern unsigned int sync_supers_interval;
>  extern unsigned int dirty_expire_interval;
>  extern int vm_highmem_is_dirtyable;
>  extern int block_dump;
> @@ -123,6 +124,10 @@ extern int dirty_bytes_handler(struct ct
>  struct ctl_table;
>  int dirty_writeback_centisecs_handler(struct ctl_table *, int,
>  				      void __user *, size_t *, loff_t *);
> +int sync_supers_centisecs_handler(struct ctl_table *, int,
> +				      void __user *, size_t *, loff_t *);
> +int dirty_expire_centisecs_handler(struct ctl_table *, int,
> +				      void __user *, size_t *, loff_t *);
> 
>  void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
>  unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
> diff -uprN a/kernel/sysctl.c b/kernel/sysctl.c
> --- a/kernel/sysctl.c	2011-08-05 10:29:21.000000000 +0530
> +++ b/kernel/sysctl.c	2011-08-09 12:39:43.453087554 +0530
> @@ -1076,12 +1076,19 @@ static struct ctl_table vm_table[] = {
>  		.mode		= 0644,
>  		.proc_handler	= dirty_writeback_centisecs_handler,
>  	},
> +    {
> +        .procname   = "sync_supers_centisecs",
> +        .data       = &sync_supers_interval,
> +        .maxlen     = sizeof(sync_supers_interval),
> +        .mode       = 0644,
> +        .proc_handler   = sync_supers_centisecs_handler,
> +    },
>  	{
>  		.procname	= "dirty_expire_centisecs",
>  		.data		= &dirty_expire_interval,
>  		.maxlen		= sizeof(dirty_expire_interval),
>  		.mode		= 0644,
> -		.proc_handler	= proc_dointvec_minmax,
> +		.proc_handler	= dirty_expire_centisecs_handler,
>  		.extra1		= &zero,
>  	},
>  	{
> diff -uprN a/mm/backing-dev.c b/mm/backing-dev.c
> --- a/mm/backing-dev.c	2011-08-05 10:29:21.000000000 +0530
> +++ b/mm/backing-dev.c	2011-08-09 12:08:06.287079027 +0530
> @@ -39,6 +39,10 @@ DEFINE_SPINLOCK(bdi_lock);
>  LIST_HEAD(bdi_list);
>  LIST_HEAD(bdi_pending_list);
> 
> +/* Same value as the dirty_writeback_interval as this is what our
> + * initial shortest_dirty_writeback_interval. */
> +unsigned int shortest_dirty_writeback_interval = 5 * 100;
> +
>  static struct task_struct *sync_supers_tsk;
>  static struct timer_list sync_supers_timer;
> 
> @@ -204,12 +208,50 @@ static ssize_t max_ratio_store(struct de
>  }
>  BDI_SHOW(max_ratio, bdi->max_ratio)
> 
> +static ssize_t dirty_writeback_interval_store(struct device *dev,
> +		struct device_attribute *attr, const char *buf, size_t count)
> +{
> +	struct backing_dev_info *bdi = dev_get_drvdata(dev);
> +	char *end;
> +	unsigned int interval;
> +	ssize_t ret = -EINVAL;
> +
> +	interval = simple_strtoul(buf, &end, 10);
> +	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
> +		bdi->dirty_writeback_interval = interval;
> +		shortest_dirty_writeback_interval =
> +						min(shortest_dirty_writeback_interval,interval);
> +		ret = count;
> +	}
> +	return ret;
> +}
> +BDI_SHOW(dirty_writeback_interval, bdi->dirty_writeback_interval)
> +
> +static ssize_t dirty_expire_interval_store (struct device *dev,
> +		struct device_attribute *attr, const char *buf, size_t count)
> +{
> +	struct backing_dev_info *bdi = dev_get_drvdata(dev);
> +	char *end;
> +	unsigned int interval;
> +	ssize_t ret = -EINVAL;
> +
> +	interval = simple_strtoul(buf, &end, 10);
> +	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
> +		bdi->dirty_expire_interval = interval;
> +		ret = count;
> +	}
> +	return ret;
> +}
> +BDI_SHOW(dirty_expire_interval, bdi->dirty_expire_interval)
> +
>  #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
> 
>  static struct device_attribute bdi_dev_attrs[] = {
>  	__ATTR_RW(read_ahead_kb),
>  	__ATTR_RW(min_ratio),
>  	__ATTR_RW(max_ratio),
> +	__ATTR_RW(dirty_writeback_interval),
> +	__ATTR_RW(dirty_expire_interval),
>  	__ATTR_NULL,
>  };
> 
> @@ -291,7 +333,7 @@ void bdi_arm_supers_timer(void)
>  	if (!dirty_writeback_interval)
>  		return;
> 
> -	next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
> +	next = msecs_to_jiffies(sync_supers_interval* 10) + jiffies;
>  	mod_timer(&sync_supers_timer, round_jiffies_up(next));
>  }
> 
> @@ -336,7 +378,7 @@ void bdi_wakeup_thread_delayed(struct ba
>  {
>  	unsigned long timeout;
> 
> -	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
> +	timeout = msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
>  	mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
>  }
> 
> @@ -348,7 +390,19 @@ static unsigned long bdi_longest_inactiv
>  {
>  	unsigned long interval;
> 
> -	interval = msecs_to_jiffies(dirty_writeback_interval * 10);
> +	interval = msecs_to_jiffies(shortest_dirty_writeback_interval * 10);
> +	return max(5UL * 60 * HZ, interval);
> +}
> +
> +/*
> + * Calculate the longest interval (jiffies) this bdi thread is allowed to be
> + * inactive.
> + */
> +static unsigned long bdi_longest_inactive_this(struct backing_dev_info *bdi)
> +{
> +	unsigned long interval;
> +
> +	interval = msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
>  	return max(5UL * 60 * HZ, interval);
>  }
> 
> @@ -422,7 +476,7 @@ static int bdi_forker_thread(void *ptr)
>  			 */
>  			if (bdi->wb.task && !have_dirty_io &&
>  			    time_after(jiffies, bdi->wb.last_active +
> -						bdi_longest_inactive())) {
> +						bdi_longest_inactive_this(bdi))) {
>  				task = bdi->wb.task;
>  				bdi->wb.task = NULL;
>  				spin_unlock(&bdi->wb_lock);
> @@ -469,7 +523,7 @@ static int bdi_forker_thread(void *ptr)
>  			break;
> 
>  		case NO_ACTION:
> -			if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
> +			if (!wb_has_dirty_io(me) || !me->bdi->dirty_writeback_interval)
>  				/*
>  				 * There are no dirty data. The only thing we
>  				 * should now care about is checking for
> @@ -479,7 +533,7 @@ static int bdi_forker_thread(void *ptr)
>  				 */
>  				schedule_timeout(bdi_longest_inactive());
>  			else
> -				schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
> +				schedule_timeout(msecs_to_jiffies(me->bdi->dirty_writeback_interval * 10));
>  			try_to_freeze();
>  			/* Back to the main loop */
>  			continue;
> @@ -641,6 +695,8 @@ int bdi_init(struct backing_dev_info *bd
>  	bdi->min_ratio = 0;
>  	bdi->max_ratio = 100;
>  	bdi->max_prop_frac = PROP_FRAC_BASE;
> +	bdi->dirty_writeback_interval = dirty_writeback_interval;
> +	bdi->dirty_expire_interval = dirty_expire_interval;
>  	spin_lock_init(&bdi->wb_lock);
>  	INIT_LIST_HEAD(&bdi->bdi_list);
>  	INIT_LIST_HEAD(&bdi->work_list);
> diff -uprN a/mm/page-writeback.c b/mm/page-writeback.c
> --- a/mm/page-writeback.c	2011-08-05 10:29:21.000000000 +0530
> +++ b/mm/page-writeback.c	2011-08-09 13:09:37.985919961 +0530
> @@ -92,6 +92,11 @@ unsigned long vm_dirty_bytes;
>  unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
> 
>  /*
> + * The interval between sync_supers thread writebacks
> + */
> +unsigned int sync_supers_interval = 5 * 100; /* centiseconds */
> +
> +/*
>   * The longest time for which data is allowed to remain dirty
>   */
>  unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
> @@ -686,8 +691,60 @@ void throttle_vm_writeout(gfp_t gfp_mask
>  int dirty_writeback_centisecs_handler(ctl_table *table, int write,
>  	void __user *buffer, size_t *length, loff_t *ppos)
>  {
> +	struct backing_dev_info *bdi;
> +
> +	proc_dointvec(table, write, buffer, length, ppos);
> +
> +	if (write) {
> +		/* Traverse all the BDIs registered to the BDI list and reset their
> +		 * bdi->dirty_writeback_interval to this value. */
> +	    spin_lock_bh(&bdi_lock);
> +		list_for_each_entry(bdi, &bdi_list, bdi_list)
> +			bdi->dirty_writeback_interval = dirty_writeback_interval;
> +	    spin_unlock_bh(&bdi_lock);
> +
> +		sync_supers_interval =
> +			shortest_dirty_writeback_interval = dirty_writeback_interval;
> +
> +	}
> +
> +	bdi_arm_supers_timer();
> +
> +	return 0;
> +}
> +
> +/*
> + * sysctl handler for /proc/sys/vm/sync_supers_centisecs
> + */
> +int sync_supers_centisecs_handler(ctl_table *table, int write,
> +	void __user *buffer, size_t *length, loff_t *ppos)
> +{
>  	proc_dointvec(table, write, buffer, length, ppos);
> +
>  	bdi_arm_supers_timer();
> +
> +	return 0;
> +}
> +
> +/*
> + * sysctl handler for /proc/sys/vm/dirty_expire_centisecs
> + */
> +int dirty_expire_centisecs_handler(ctl_table *table, int write,
> +	void __user *buffer, size_t *length, loff_t *ppos)
> +{
> +	struct backing_dev_info *bdi;
> +
> +	proc_dointvec_minmax(table, write, buffer, length, ppos);
> +
> +	if (write) {
> +		/* Traverse all the BDIs registered to the BDI list and reset their
> +		 * bdi->dirty_expire_interval to this value. */
> +	    spin_lock_bh(&bdi_lock);
> +		list_for_each_entry(bdi, &bdi_list, bdi_list)
> +			bdi->dirty_expire_interval = dirty_expire_interval;
> +	    spin_unlock_bh(&bdi_lock);
> +	}
> +
>  	return 0;
>  }
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-18  9:48   ` Wu Fengguang
@ 2011-08-18  9:51     ` Wu Fengguang
  -1 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-18  9:51 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Mel Gorman, KOSAKI Motohiro, linux-mm, linux-kernel,
	linux-fsdevel, Jan Kara, Dave Chinner

[correct email to Mel and add Jan/Dave]

> On Thu, Aug 11, 2011 at 05:50:56PM +0530, Kautuk Consul wrote:
> > Hi,
> > 
> > Currently the /proc/sys/vm/dirty_writeback_centisecs and
> > /proc/sys/vm/dirty_expire_centisecs values are
> > global to the system.
> > All the BDI flush-* threads are controlled by these central values.
> 
> Yes.
> 
> > However, the user/admin might want to set different writeback speeds
> > for different block devices based on
> > their page write-back performance.
> 
> How can the above two sysctl values impact "writeback speeds"?
> In particular, what's the "speed" you mean?
> 
> > For example, the user might want to write-back pages in smaller
> > intervals to a block device which has a
> > faster known writeback speed.
> 
> That's not a complete rationale. What does the user ultimately want by
> setting a smaller interval? What would be the problems to the other
> slow devices if the user does so by simply setting a small value
> _globally_?
> 
> We need strong use cases for doing such user interface changes.
> Would you detail the problem and the pains that can only (or best)
> be addressed by this patch?
> 
> Thanks,
> Fengguang
> 
> > This patch creates 3 new counters (in centisecs) for all the BDI
> > threads that were controlled centrally by these
> > 2 counters:
> > i)   /sys/block/<block_dev>/bdi/dirty_writeback_interval,
> > ii)  /sys/block/<block_dev>/bdi/dirty_expire_interval,
> > iii) /proc/sys/vm/sync_supers_centisecs.
> > 
> > Although these new counters can be tuned individually, I have taken
> > care that they be centrally reset by changes
> > to the /proc/sys/vm/dirty_expire_centisecs and
> > /proc/sys/vm/dirty_writeback_centisecs so that the earlier
> > functionality is not broken by distributions using these central values.
> > After resetting all values centrally, these values can be tuned
> > individually without altering the central values.
> > 
> > Signed-off-by: Kautuk Consul <consul.kautuk@gmail.com>
> > ---
> > 
> > diff -uprN a/fs/fs-writeback.c b/fs/fs-writeback.c
> > --- a/fs/fs-writeback.c	2011-08-05 10:29:21.000000000 +0530
> > +++ b/fs/fs-writeback.c	2011-08-09 09:15:37.093041675 +0530
> > @@ -638,8 +638,8 @@ static inline bool over_bground_thresh(v
> >   * just walks the superblock inode list, writing back any inodes which are
> >   * older than a specific point in time.
> >   *
> > - * Try to run once per dirty_writeback_interval.  But if a writeback event
> > - * takes longer than a dirty_writeback_interval interval, then leave a
> > + * Try to run once per bdi->dirty_writeback_interval.  But if a writeback event
> > + * takes longer than a bdi->dirty_writeback_interval interval, then leave a
> >   * one-second gap.
> >   *
> >   * older_than_this takes precedence over nr_to_write.  So we'll only write back
> > @@ -663,7 +663,7 @@ static long wb_writeback(struct bdi_writ
> >  	if (wbc.for_kupdate) {
> >  		wbc.older_than_this = &oldest_jif;
> >  		oldest_jif = jiffies -
> > -				msecs_to_jiffies(dirty_expire_interval * 10);
> > +				msecs_to_jiffies(wb->bdi->dirty_expire_interval * 10);
> >  	}
> >  	if (!wbc.range_cyclic) {
> >  		wbc.range_start = 0;
> > @@ -811,15 +811,16 @@ static long wb_check_old_data_flush(stru
> >  {
> >  	unsigned long expired;
> >  	long nr_pages;
> > +	struct backing_dev_info *bdi = wb->bdi;
> > 
> >  	/*
> >  	 * When set to zero, disable periodic writeback
> >  	 */
> > -	if (!dirty_writeback_interval)
> > +	if (!bdi->dirty_writeback_interval)
> >  		return 0;
> > 
> >  	expired = wb->last_old_flush +
> > -			msecs_to_jiffies(dirty_writeback_interval * 10);
> > +			msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
> >  	if (time_before(jiffies, expired))
> >  		return 0;
> > 
> > @@ -923,8 +924,8 @@ int bdi_writeback_thread(void *data)
> >  			continue;
> >  		}
> > 
> > -		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
> > -			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
> > +		if (wb_has_dirty_io(wb) && bdi->dirty_writeback_interval)
> > +			schedule_timeout(msecs_to_jiffies(bdi->dirty_writeback_interval * 10));
> >  		else {
> >  			/*
> >  			 * We have nothing to do, so can go sleep without any
> > diff -uprN a/include/linux/backing-dev.h b/include/linux/backing-dev.h
> > --- a/include/linux/backing-dev.h	2011-08-05 10:29:21.000000000 +0530
> > +++ b/include/linux/backing-dev.h	2011-08-09 09:15:37.094041619 +0530
> > @@ -76,6 +76,8 @@ struct backing_dev_info {
> > 
> >  	unsigned int min_ratio;
> >  	unsigned int max_ratio, max_prop_frac;
> > +	unsigned int dirty_writeback_interval;
> > +	unsigned int dirty_expire_interval;
> > 
> >  	struct bdi_writeback wb;  /* default writeback info for this bdi */
> >  	spinlock_t wb_lock;	  /* protects work_list */
> > @@ -333,4 +335,5 @@ static inline int bdi_sched_wait(void *w
> >  	return 0;
> >  }
> > 
> > +extern unsigned int shortest_dirty_writeback_interval;
> >  #endif		/* _LINUX_BACKING_DEV_H */
> > diff -uprN a/include/linux/writeback.h b/include/linux/writeback.h
> > --- a/include/linux/writeback.h	2011-08-05 10:29:21.000000000 +0530
> > +++ b/include/linux/writeback.h	2011-08-09 10:09:23.581268260 +0530
> > @@ -100,6 +100,7 @@ extern unsigned long dirty_background_by
> >  extern int vm_dirty_ratio;
> >  extern unsigned long vm_dirty_bytes;
> >  extern unsigned int dirty_writeback_interval;
> > +extern unsigned int sync_supers_interval;
> >  extern unsigned int dirty_expire_interval;
> >  extern int vm_highmem_is_dirtyable;
> >  extern int block_dump;
> > @@ -123,6 +124,10 @@ extern int dirty_bytes_handler(struct ct
> >  struct ctl_table;
> >  int dirty_writeback_centisecs_handler(struct ctl_table *, int,
> >  				      void __user *, size_t *, loff_t *);
> > +int sync_supers_centisecs_handler(struct ctl_table *, int,
> > +				      void __user *, size_t *, loff_t *);
> > +int dirty_expire_centisecs_handler(struct ctl_table *, int,
> > +				      void __user *, size_t *, loff_t *);
> > 
> >  void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
> >  unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
> > diff -uprN a/kernel/sysctl.c b/kernel/sysctl.c
> > --- a/kernel/sysctl.c	2011-08-05 10:29:21.000000000 +0530
> > +++ b/kernel/sysctl.c	2011-08-09 12:39:43.453087554 +0530
> > @@ -1076,12 +1076,19 @@ static struct ctl_table vm_table[] = {
> >  		.mode		= 0644,
> >  		.proc_handler	= dirty_writeback_centisecs_handler,
> >  	},
> > +    {
> > +        .procname   = "sync_supers_centisecs",
> > +        .data       = &sync_supers_interval,
> > +        .maxlen     = sizeof(sync_supers_interval),
> > +        .mode       = 0644,
> > +        .proc_handler   = sync_supers_centisecs_handler,
> > +    },
> >  	{
> >  		.procname	= "dirty_expire_centisecs",
> >  		.data		= &dirty_expire_interval,
> >  		.maxlen		= sizeof(dirty_expire_interval),
> >  		.mode		= 0644,
> > -		.proc_handler	= proc_dointvec_minmax,
> > +		.proc_handler	= dirty_expire_centisecs_handler,
> >  		.extra1		= &zero,
> >  	},
> >  	{
> > diff -uprN a/mm/backing-dev.c b/mm/backing-dev.c
> > --- a/mm/backing-dev.c	2011-08-05 10:29:21.000000000 +0530
> > +++ b/mm/backing-dev.c	2011-08-09 12:08:06.287079027 +0530
> > @@ -39,6 +39,10 @@ DEFINE_SPINLOCK(bdi_lock);
> >  LIST_HEAD(bdi_list);
> >  LIST_HEAD(bdi_pending_list);
> > 
> > +/* Same value as the dirty_writeback_interval as this is what our
> > + * initial shortest_dirty_writeback_interval. */
> > +unsigned int shortest_dirty_writeback_interval = 5 * 100;
> > +
> >  static struct task_struct *sync_supers_tsk;
> >  static struct timer_list sync_supers_timer;
> > 
> > @@ -204,12 +208,50 @@ static ssize_t max_ratio_store(struct de
> >  }
> >  BDI_SHOW(max_ratio, bdi->max_ratio)
> > 
> > +static ssize_t dirty_writeback_interval_store(struct device *dev,
> > +		struct device_attribute *attr, const char *buf, size_t count)
> > +{
> > +	struct backing_dev_info *bdi = dev_get_drvdata(dev);
> > +	char *end;
> > +	unsigned int interval;
> > +	ssize_t ret = -EINVAL;
> > +
> > +	interval = simple_strtoul(buf, &end, 10);
> > +	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
> > +		bdi->dirty_writeback_interval = interval;
> > +		shortest_dirty_writeback_interval =
> > +						min(shortest_dirty_writeback_interval,interval);
> > +		ret = count;
> > +	}
> > +	return ret;
> > +}
> > +BDI_SHOW(dirty_writeback_interval, bdi->dirty_writeback_interval)
> > +
> > +static ssize_t dirty_expire_interval_store (struct device *dev,
> > +		struct device_attribute *attr, const char *buf, size_t count)
> > +{
> > +	struct backing_dev_info *bdi = dev_get_drvdata(dev);
> > +	char *end;
> > +	unsigned int interval;
> > +	ssize_t ret = -EINVAL;
> > +
> > +	interval = simple_strtoul(buf, &end, 10);
> > +	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
> > +		bdi->dirty_expire_interval = interval;
> > +		ret = count;
> > +	}
> > +	return ret;
> > +}
> > +BDI_SHOW(dirty_expire_interval, bdi->dirty_expire_interval)
> > +
> >  #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
> > 
> >  static struct device_attribute bdi_dev_attrs[] = {
> >  	__ATTR_RW(read_ahead_kb),
> >  	__ATTR_RW(min_ratio),
> >  	__ATTR_RW(max_ratio),
> > +	__ATTR_RW(dirty_writeback_interval),
> > +	__ATTR_RW(dirty_expire_interval),
> >  	__ATTR_NULL,
> >  };
> > 
> > @@ -291,7 +333,7 @@ void bdi_arm_supers_timer(void)
> >  	if (!dirty_writeback_interval)
> >  		return;
> > 
> > -	next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
> > +	next = msecs_to_jiffies(sync_supers_interval* 10) + jiffies;
> >  	mod_timer(&sync_supers_timer, round_jiffies_up(next));
> >  }
> > 
> > @@ -336,7 +378,7 @@ void bdi_wakeup_thread_delayed(struct ba
> >  {
> >  	unsigned long timeout;
> > 
> > -	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
> > +	timeout = msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
> >  	mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
> >  }
> > 
> > @@ -348,7 +390,19 @@ static unsigned long bdi_longest_inactiv
> >  {
> >  	unsigned long interval;
> > 
> > -	interval = msecs_to_jiffies(dirty_writeback_interval * 10);
> > +	interval = msecs_to_jiffies(shortest_dirty_writeback_interval * 10);
> > +	return max(5UL * 60 * HZ, interval);
> > +}
> > +
> > +/*
> > + * Calculate the longest interval (jiffies) this bdi thread is allowed to be
> > + * inactive.
> > + */
> > +static unsigned long bdi_longest_inactive_this(struct backing_dev_info *bdi)
> > +{
> > +	unsigned long interval;
> > +
> > +	interval = msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
> >  	return max(5UL * 60 * HZ, interval);
> >  }
> > 
> > @@ -422,7 +476,7 @@ static int bdi_forker_thread(void *ptr)
> >  			 */
> >  			if (bdi->wb.task && !have_dirty_io &&
> >  			    time_after(jiffies, bdi->wb.last_active +
> > -						bdi_longest_inactive())) {
> > +						bdi_longest_inactive_this(bdi))) {
> >  				task = bdi->wb.task;
> >  				bdi->wb.task = NULL;
> >  				spin_unlock(&bdi->wb_lock);
> > @@ -469,7 +523,7 @@ static int bdi_forker_thread(void *ptr)
> >  			break;
> > 
> >  		case NO_ACTION:
> > -			if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
> > +			if (!wb_has_dirty_io(me) || !me->bdi->dirty_writeback_interval)
> >  				/*
> >  				 * There are no dirty data. The only thing we
> >  				 * should now care about is checking for
> > @@ -479,7 +533,7 @@ static int bdi_forker_thread(void *ptr)
> >  				 */
> >  				schedule_timeout(bdi_longest_inactive());
> >  			else
> > -				schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
> > +				schedule_timeout(msecs_to_jiffies(me->bdi->dirty_writeback_interval * 10));
> >  			try_to_freeze();
> >  			/* Back to the main loop */
> >  			continue;
> > @@ -641,6 +695,8 @@ int bdi_init(struct backing_dev_info *bd
> >  	bdi->min_ratio = 0;
> >  	bdi->max_ratio = 100;
> >  	bdi->max_prop_frac = PROP_FRAC_BASE;
> > +	bdi->dirty_writeback_interval = dirty_writeback_interval;
> > +	bdi->dirty_expire_interval = dirty_expire_interval;
> >  	spin_lock_init(&bdi->wb_lock);
> >  	INIT_LIST_HEAD(&bdi->bdi_list);
> >  	INIT_LIST_HEAD(&bdi->work_list);
> > diff -uprN a/mm/page-writeback.c b/mm/page-writeback.c
> > --- a/mm/page-writeback.c	2011-08-05 10:29:21.000000000 +0530
> > +++ b/mm/page-writeback.c	2011-08-09 13:09:37.985919961 +0530
> > @@ -92,6 +92,11 @@ unsigned long vm_dirty_bytes;
> >  unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
> > 
> >  /*
> > + * The interval between sync_supers thread writebacks
> > + */
> > +unsigned int sync_supers_interval = 5 * 100; /* centiseconds */
> > +
> > +/*
> >   * The longest time for which data is allowed to remain dirty
> >   */
> >  unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
> > @@ -686,8 +691,60 @@ void throttle_vm_writeout(gfp_t gfp_mask
> >  int dirty_writeback_centisecs_handler(ctl_table *table, int write,
> >  	void __user *buffer, size_t *length, loff_t *ppos)
> >  {
> > +	struct backing_dev_info *bdi;
> > +
> > +	proc_dointvec(table, write, buffer, length, ppos);
> > +
> > +	if (write) {
> > +		/* Traverse all the BDIs registered to the BDI list and reset their
> > +		 * bdi->dirty_writeback_interval to this value. */
> > +	    spin_lock_bh(&bdi_lock);
> > +		list_for_each_entry(bdi, &bdi_list, bdi_list)
> > +			bdi->dirty_writeback_interval = dirty_writeback_interval;
> > +	    spin_unlock_bh(&bdi_lock);
> > +
> > +		sync_supers_interval =
> > +			shortest_dirty_writeback_interval = dirty_writeback_interval;
> > +
> > +	}
> > +
> > +	bdi_arm_supers_timer();
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * sysctl handler for /proc/sys/vm/sync_supers_centisecs
> > + */
> > +int sync_supers_centisecs_handler(ctl_table *table, int write,
> > +	void __user *buffer, size_t *length, loff_t *ppos)
> > +{
> >  	proc_dointvec(table, write, buffer, length, ppos);
> > +
> >  	bdi_arm_supers_timer();
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * sysctl handler for /proc/sys/vm/dirty_expire_centisecs
> > + */
> > +int dirty_expire_centisecs_handler(ctl_table *table, int write,
> > +	void __user *buffer, size_t *length, loff_t *ppos)
> > +{
> > +	struct backing_dev_info *bdi;
> > +
> > +	proc_dointvec_minmax(table, write, buffer, length, ppos);
> > +
> > +	if (write) {
> > +		/* Traverse all the BDIs registered to the BDI list and reset their
> > +		 * bdi->dirty_expire_interval to this value. */
> > +	    spin_lock_bh(&bdi_lock);
> > +		list_for_each_entry(bdi, &bdi_list, bdi_list)
> > +			bdi->dirty_expire_interval = dirty_expire_interval;
> > +	    spin_unlock_bh(&bdi_lock);
> > +	}
> > +
> >  	return 0;
> >  }
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-18  9:51     ` Wu Fengguang
  0 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-18  9:51 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Mel Gorman, KOSAKI Motohiro, linux-mm, linux-kernel,
	linux-fsdevel, Jan Kara, Dave Chinner

[correct email to Mel and add Jan/Dave]

> On Thu, Aug 11, 2011 at 05:50:56PM +0530, Kautuk Consul wrote:
> > Hi,
> > 
> > Currently the /proc/sys/vm/dirty_writeback_centisecs and
> > /proc/sys/vm/dirty_expire_centisecs values are
> > global to the system.
> > All the BDI flush-* threads are controlled by these central values.
> 
> Yes.
> 
> > However, the user/admin might want to set different writeback speeds
> > for different block devices based on
> > their page write-back performance.
> 
> How can the above two sysctl values impact "writeback speeds"?
> In particular, what's the "speed" you mean?
> 
> > For example, the user might want to write-back pages in smaller
> > intervals to a block device which has a
> > faster known writeback speed.
> 
> That's not a complete rationale. What does the user ultimately want by
> setting a smaller interval? What would be the problems to the other
> slow devices if the user does so by simply setting a small value
> _globally_?
> 
> We need strong use cases for doing such user interface changes.
> Would you detail the problem and the pains that can only (or best)
> be addressed by this patch?
> 
> Thanks,
> Fengguang
> 
> > This patch creates 3 new counters (in centisecs) for all the BDI
> > threads that were controlled centrally by these
> > 2 counters:
> > i)   /sys/block/<block_dev>/bdi/dirty_writeback_interval,
> > ii)  /sys/block/<block_dev>/bdi/dirty_expire_interval,
> > iii) /proc/sys/vm/sync_supers_centisecs.
> > 
> > Although these new counters can be tuned individually, I have taken
> > care that they be centrally reset by changes
> > to the /proc/sys/vm/dirty_expire_centisecs and
> > /proc/sys/vm/dirty_writeback_centisecs so that the earlier
> > functionality is not broken by distributions using these central values.
> > After resetting all values centrally, these values can be tuned
> > individually without altering the central values.
> > 
> > Signed-off-by: Kautuk Consul <consul.kautuk@gmail.com>
> > ---
> > 
> > diff -uprN a/fs/fs-writeback.c b/fs/fs-writeback.c
> > --- a/fs/fs-writeback.c	2011-08-05 10:29:21.000000000 +0530
> > +++ b/fs/fs-writeback.c	2011-08-09 09:15:37.093041675 +0530
> > @@ -638,8 +638,8 @@ static inline bool over_bground_thresh(v
> >   * just walks the superblock inode list, writing back any inodes which are
> >   * older than a specific point in time.
> >   *
> > - * Try to run once per dirty_writeback_interval.  But if a writeback event
> > - * takes longer than a dirty_writeback_interval interval, then leave a
> > + * Try to run once per bdi->dirty_writeback_interval.  But if a writeback event
> > + * takes longer than a bdi->dirty_writeback_interval interval, then leave a
> >   * one-second gap.
> >   *
> >   * older_than_this takes precedence over nr_to_write.  So we'll only write back
> > @@ -663,7 +663,7 @@ static long wb_writeback(struct bdi_writ
> >  	if (wbc.for_kupdate) {
> >  		wbc.older_than_this = &oldest_jif;
> >  		oldest_jif = jiffies -
> > -				msecs_to_jiffies(dirty_expire_interval * 10);
> > +				msecs_to_jiffies(wb->bdi->dirty_expire_interval * 10);
> >  	}
> >  	if (!wbc.range_cyclic) {
> >  		wbc.range_start = 0;
> > @@ -811,15 +811,16 @@ static long wb_check_old_data_flush(stru
> >  {
> >  	unsigned long expired;
> >  	long nr_pages;
> > +	struct backing_dev_info *bdi = wb->bdi;
> > 
> >  	/*
> >  	 * When set to zero, disable periodic writeback
> >  	 */
> > -	if (!dirty_writeback_interval)
> > +	if (!bdi->dirty_writeback_interval)
> >  		return 0;
> > 
> >  	expired = wb->last_old_flush +
> > -			msecs_to_jiffies(dirty_writeback_interval * 10);
> > +			msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
> >  	if (time_before(jiffies, expired))
> >  		return 0;
> > 
> > @@ -923,8 +924,8 @@ int bdi_writeback_thread(void *data)
> >  			continue;
> >  		}
> > 
> > -		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
> > -			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
> > +		if (wb_has_dirty_io(wb) && bdi->dirty_writeback_interval)
> > +			schedule_timeout(msecs_to_jiffies(bdi->dirty_writeback_interval * 10));
> >  		else {
> >  			/*
> >  			 * We have nothing to do, so can go sleep without any
> > diff -uprN a/include/linux/backing-dev.h b/include/linux/backing-dev.h
> > --- a/include/linux/backing-dev.h	2011-08-05 10:29:21.000000000 +0530
> > +++ b/include/linux/backing-dev.h	2011-08-09 09:15:37.094041619 +0530
> > @@ -76,6 +76,8 @@ struct backing_dev_info {
> > 
> >  	unsigned int min_ratio;
> >  	unsigned int max_ratio, max_prop_frac;
> > +	unsigned int dirty_writeback_interval;
> > +	unsigned int dirty_expire_interval;
> > 
> >  	struct bdi_writeback wb;  /* default writeback info for this bdi */
> >  	spinlock_t wb_lock;	  /* protects work_list */
> > @@ -333,4 +335,5 @@ static inline int bdi_sched_wait(void *w
> >  	return 0;
> >  }
> > 
> > +extern unsigned int shortest_dirty_writeback_interval;
> >  #endif		/* _LINUX_BACKING_DEV_H */
> > diff -uprN a/include/linux/writeback.h b/include/linux/writeback.h
> > --- a/include/linux/writeback.h	2011-08-05 10:29:21.000000000 +0530
> > +++ b/include/linux/writeback.h	2011-08-09 10:09:23.581268260 +0530
> > @@ -100,6 +100,7 @@ extern unsigned long dirty_background_by
> >  extern int vm_dirty_ratio;
> >  extern unsigned long vm_dirty_bytes;
> >  extern unsigned int dirty_writeback_interval;
> > +extern unsigned int sync_supers_interval;
> >  extern unsigned int dirty_expire_interval;
> >  extern int vm_highmem_is_dirtyable;
> >  extern int block_dump;
> > @@ -123,6 +124,10 @@ extern int dirty_bytes_handler(struct ct
> >  struct ctl_table;
> >  int dirty_writeback_centisecs_handler(struct ctl_table *, int,
> >  				      void __user *, size_t *, loff_t *);
> > +int sync_supers_centisecs_handler(struct ctl_table *, int,
> > +				      void __user *, size_t *, loff_t *);
> > +int dirty_expire_centisecs_handler(struct ctl_table *, int,
> > +				      void __user *, size_t *, loff_t *);
> > 
> >  void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
> >  unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
> > diff -uprN a/kernel/sysctl.c b/kernel/sysctl.c
> > --- a/kernel/sysctl.c	2011-08-05 10:29:21.000000000 +0530
> > +++ b/kernel/sysctl.c	2011-08-09 12:39:43.453087554 +0530
> > @@ -1076,12 +1076,19 @@ static struct ctl_table vm_table[] = {
> >  		.mode		= 0644,
> >  		.proc_handler	= dirty_writeback_centisecs_handler,
> >  	},
> > +    {
> > +        .procname   = "sync_supers_centisecs",
> > +        .data       = &sync_supers_interval,
> > +        .maxlen     = sizeof(sync_supers_interval),
> > +        .mode       = 0644,
> > +        .proc_handler   = sync_supers_centisecs_handler,
> > +    },
> >  	{
> >  		.procname	= "dirty_expire_centisecs",
> >  		.data		= &dirty_expire_interval,
> >  		.maxlen		= sizeof(dirty_expire_interval),
> >  		.mode		= 0644,
> > -		.proc_handler	= proc_dointvec_minmax,
> > +		.proc_handler	= dirty_expire_centisecs_handler,
> >  		.extra1		= &zero,
> >  	},
> >  	{
> > diff -uprN a/mm/backing-dev.c b/mm/backing-dev.c
> > --- a/mm/backing-dev.c	2011-08-05 10:29:21.000000000 +0530
> > +++ b/mm/backing-dev.c	2011-08-09 12:08:06.287079027 +0530
> > @@ -39,6 +39,10 @@ DEFINE_SPINLOCK(bdi_lock);
> >  LIST_HEAD(bdi_list);
> >  LIST_HEAD(bdi_pending_list);
> > 
> > +/* Same value as the dirty_writeback_interval as this is what our
> > + * initial shortest_dirty_writeback_interval. */
> > +unsigned int shortest_dirty_writeback_interval = 5 * 100;
> > +
> >  static struct task_struct *sync_supers_tsk;
> >  static struct timer_list sync_supers_timer;
> > 
> > @@ -204,12 +208,50 @@ static ssize_t max_ratio_store(struct de
> >  }
> >  BDI_SHOW(max_ratio, bdi->max_ratio)
> > 
> > +static ssize_t dirty_writeback_interval_store(struct device *dev,
> > +		struct device_attribute *attr, const char *buf, size_t count)
> > +{
> > +	struct backing_dev_info *bdi = dev_get_drvdata(dev);
> > +	char *end;
> > +	unsigned int interval;
> > +	ssize_t ret = -EINVAL;
> > +
> > +	interval = simple_strtoul(buf, &end, 10);
> > +	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
> > +		bdi->dirty_writeback_interval = interval;
> > +		shortest_dirty_writeback_interval =
> > +						min(shortest_dirty_writeback_interval,interval);
> > +		ret = count;
> > +	}
> > +	return ret;
> > +}
> > +BDI_SHOW(dirty_writeback_interval, bdi->dirty_writeback_interval)
> > +
> > +static ssize_t dirty_expire_interval_store (struct device *dev,
> > +		struct device_attribute *attr, const char *buf, size_t count)
> > +{
> > +	struct backing_dev_info *bdi = dev_get_drvdata(dev);
> > +	char *end;
> > +	unsigned int interval;
> > +	ssize_t ret = -EINVAL;
> > +
> > +	interval = simple_strtoul(buf, &end, 10);
> > +	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
> > +		bdi->dirty_expire_interval = interval;
> > +		ret = count;
> > +	}
> > +	return ret;
> > +}
> > +BDI_SHOW(dirty_expire_interval, bdi->dirty_expire_interval)
> > +
> >  #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
> > 
> >  static struct device_attribute bdi_dev_attrs[] = {
> >  	__ATTR_RW(read_ahead_kb),
> >  	__ATTR_RW(min_ratio),
> >  	__ATTR_RW(max_ratio),
> > +	__ATTR_RW(dirty_writeback_interval),
> > +	__ATTR_RW(dirty_expire_interval),
> >  	__ATTR_NULL,
> >  };
> > 
> > @@ -291,7 +333,7 @@ void bdi_arm_supers_timer(void)
> >  	if (!dirty_writeback_interval)
> >  		return;
> > 
> > -	next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
> > +	next = msecs_to_jiffies(sync_supers_interval* 10) + jiffies;
> >  	mod_timer(&sync_supers_timer, round_jiffies_up(next));
> >  }
> > 
> > @@ -336,7 +378,7 @@ void bdi_wakeup_thread_delayed(struct ba
> >  {
> >  	unsigned long timeout;
> > 
> > -	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
> > +	timeout = msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
> >  	mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
> >  }
> > 
> > @@ -348,7 +390,19 @@ static unsigned long bdi_longest_inactiv
> >  {
> >  	unsigned long interval;
> > 
> > -	interval = msecs_to_jiffies(dirty_writeback_interval * 10);
> > +	interval = msecs_to_jiffies(shortest_dirty_writeback_interval * 10);
> > +	return max(5UL * 60 * HZ, interval);
> > +}
> > +
> > +/*
> > + * Calculate the longest interval (jiffies) this bdi thread is allowed to be
> > + * inactive.
> > + */
> > +static unsigned long bdi_longest_inactive_this(struct backing_dev_info *bdi)
> > +{
> > +	unsigned long interval;
> > +
> > +	interval = msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
> >  	return max(5UL * 60 * HZ, interval);
> >  }
> > 
> > @@ -422,7 +476,7 @@ static int bdi_forker_thread(void *ptr)
> >  			 */
> >  			if (bdi->wb.task && !have_dirty_io &&
> >  			    time_after(jiffies, bdi->wb.last_active +
> > -						bdi_longest_inactive())) {
> > +						bdi_longest_inactive_this(bdi))) {
> >  				task = bdi->wb.task;
> >  				bdi->wb.task = NULL;
> >  				spin_unlock(&bdi->wb_lock);
> > @@ -469,7 +523,7 @@ static int bdi_forker_thread(void *ptr)
> >  			break;
> > 
> >  		case NO_ACTION:
> > -			if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
> > +			if (!wb_has_dirty_io(me) || !me->bdi->dirty_writeback_interval)
> >  				/*
> >  				 * There are no dirty data. The only thing we
> >  				 * should now care about is checking for
> > @@ -479,7 +533,7 @@ static int bdi_forker_thread(void *ptr)
> >  				 */
> >  				schedule_timeout(bdi_longest_inactive());
> >  			else
> > -				schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
> > +				schedule_timeout(msecs_to_jiffies(me->bdi->dirty_writeback_interval * 10));
> >  			try_to_freeze();
> >  			/* Back to the main loop */
> >  			continue;
> > @@ -641,6 +695,8 @@ int bdi_init(struct backing_dev_info *bd
> >  	bdi->min_ratio = 0;
> >  	bdi->max_ratio = 100;
> >  	bdi->max_prop_frac = PROP_FRAC_BASE;
> > +	bdi->dirty_writeback_interval = dirty_writeback_interval;
> > +	bdi->dirty_expire_interval = dirty_expire_interval;
> >  	spin_lock_init(&bdi->wb_lock);
> >  	INIT_LIST_HEAD(&bdi->bdi_list);
> >  	INIT_LIST_HEAD(&bdi->work_list);
> > diff -uprN a/mm/page-writeback.c b/mm/page-writeback.c
> > --- a/mm/page-writeback.c	2011-08-05 10:29:21.000000000 +0530
> > +++ b/mm/page-writeback.c	2011-08-09 13:09:37.985919961 +0530
> > @@ -92,6 +92,11 @@ unsigned long vm_dirty_bytes;
> >  unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
> > 
> >  /*
> > + * The interval between sync_supers thread writebacks
> > + */
> > +unsigned int sync_supers_interval = 5 * 100; /* centiseconds */
> > +
> > +/*
> >   * The longest time for which data is allowed to remain dirty
> >   */
> >  unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
> > @@ -686,8 +691,60 @@ void throttle_vm_writeout(gfp_t gfp_mask
> >  int dirty_writeback_centisecs_handler(ctl_table *table, int write,
> >  	void __user *buffer, size_t *length, loff_t *ppos)
> >  {
> > +	struct backing_dev_info *bdi;
> > +
> > +	proc_dointvec(table, write, buffer, length, ppos);
> > +
> > +	if (write) {
> > +		/* Traverse all the BDIs registered to the BDI list and reset their
> > +		 * bdi->dirty_writeback_interval to this value. */
> > +	    spin_lock_bh(&bdi_lock);
> > +		list_for_each_entry(bdi, &bdi_list, bdi_list)
> > +			bdi->dirty_writeback_interval = dirty_writeback_interval;
> > +	    spin_unlock_bh(&bdi_lock);
> > +
> > +		sync_supers_interval =
> > +			shortest_dirty_writeback_interval = dirty_writeback_interval;
> > +
> > +	}
> > +
> > +	bdi_arm_supers_timer();
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * sysctl handler for /proc/sys/vm/sync_supers_centisecs
> > + */
> > +int sync_supers_centisecs_handler(ctl_table *table, int write,
> > +	void __user *buffer, size_t *length, loff_t *ppos)
> > +{
> >  	proc_dointvec(table, write, buffer, length, ppos);
> > +
> >  	bdi_arm_supers_timer();
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * sysctl handler for /proc/sys/vm/dirty_expire_centisecs
> > + */
> > +int dirty_expire_centisecs_handler(ctl_table *table, int write,
> > +	void __user *buffer, size_t *length, loff_t *ppos)
> > +{
> > +	struct backing_dev_info *bdi;
> > +
> > +	proc_dointvec_minmax(table, write, buffer, length, ppos);
> > +
> > +	if (write) {
> > +		/* Traverse all the BDIs registered to the BDI list and reset their
> > +		 * bdi->dirty_expire_interval to this value. */
> > +	    spin_lock_bh(&bdi_lock);
> > +		list_for_each_entry(bdi, &bdi_list, bdi_list)
> > +			bdi->dirty_expire_interval = dirty_expire_interval;
> > +	    spin_unlock_bh(&bdi_lock);
> > +	}
> > +
> >  	return 0;
> >  }
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-18  9:48   ` Wu Fengguang
@ 2011-08-18 11:28     ` Kautuk Consul
  -1 siblings, 0 replies; 47+ messages in thread
From: Kautuk Consul @ 2011-08-18 11:28 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Mel Gorman, KOSAKI Motohiro, linux-mm, linux-kernel, linux-fsdevel

Hi Wu,

Thanks for responding.

Please find my comments inline in your email below.

On Thu, Aug 18, 2011 at 3:18 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> Hi Kautuk,
>
> Add CC to fsdevel and Mel and KOSAKI.
>
> When submitting patches you can find the relevant mailing list and
> developers to CC with this command under the kernel source tree:
>
>        scripts/get_maintainer.pl YOUR-PATCH-FILE
>
> On Thu, Aug 11, 2011 at 05:50:56PM +0530, Kautuk Consul wrote:
>> Hi,
>>
>> Currently the /proc/sys/vm/dirty_writeback_centisecs and
>> /proc/sys/vm/dirty_expire_centisecs values are
>> global to the system.
>> All the BDI flush-* threads are controlled by these central values.
>
> Yes.
>
>> However, the user/admin might want to set different writeback speeds
>> for different block devices based on
>> their page write-back performance.
>
> How can the above two sysctl values impact "writeback speeds"?
> In particular, what's the "speed" you mean?
>

By writeback speed, I meant writeback interval, i.e. the maximum
interval after which the BDI
thread for a particular block device can wake up and try to sync pages
with disk.


>> For example, the user might want to write-back pages in smaller
>> intervals to a block device which has a
>> faster known writeback speed.
>
> That's not a complete rational. What does the user ultimately want by
> setting a smaller interval? What would be the problems to the other
> slow devices if the user does so by simply setting a small value
> _globally_?
>

I think that the user might want to set a smaller interval for faster block
devices so that the dirty pages are synced with that block device/disk sooner.
This will unset the dirty bit of the page-cache pages sooner, which
will increase the
possibility of those pages getting reclaimed quickly in high memory
usage scenarios.
For a system that writes to disk very frequently and runs a lot of
memory intensive user-mode
applications, this might be crucial for their performance as they
would possibly have to sleep
comparatively less during page allocation.
For example, a server handling a database needs frequent disk access
as well as
anonymous memory. In such a case it would be nice to keep the
write-back interval for a USB pen
drive BDI thread longer than that of a SATA/SCSI disk.

> We need strong use cases for doing such user interface changes.
> Would you detail the problem and the pains that can only (or best)
> be addressed by this patch?
>

Overall, I think that ever since there have been different BDI threads
for different block devices,
it seems quite rational to provide the user an option to set different
writeback intervals to different
block devices due to the reasons/examples I have mentioned above.

I do not fully understand the theory behind how your patches
control the dirty rate and estimate
the future bandwidth.
But, when I looked through them I did not see any place where the
writeback interval for a BDI was being
changed.
So, I felt that my patch was more like an additional feature for the
user rather than a conflict with your
writeback patches.

> Thanks,
> Fengguang
>
>> This patch creates 3 new counters (in centisecs) for all the BDI
>> threads that were controlled centrally by these
>> 2 counters:
>> i)   /sys/block/<block_dev>/bdi/dirty_writeback_interval,
>> ii)  /sys/block/<block_dev>/bdi/dirty_expire_interval,
>> iii) /proc/sys/vm/sync_supers_centisecs.
>>
>> Although these new counters can be tuned individually, I have taken
>> care that they be centrally reset by changes
>> to the /proc/sys/vm/dirty_expire_centisecs and
>> /proc/sys/vm/dirty_writeback_centisecs so that the earlier
>> functionality is not broken by distributions using these central values.
>> After resetting all values centrally, these values can be tuned
>> individually without altering the central values.
>>
>> Signed-off-by: Kautuk Consul <consul.kautuk@gmail.com>
>> ---
>>
>> diff -uprN a/fs/fs-writeback.c b/fs/fs-writeback.c
>> --- a/fs/fs-writeback.c       2011-08-05 10:29:21.000000000 +0530
>> +++ b/fs/fs-writeback.c       2011-08-09 09:15:37.093041675 +0530
>> @@ -638,8 +638,8 @@ static inline bool over_bground_thresh(v
>>   * just walks the superblock inode list, writing back any inodes which are
>>   * older than a specific point in time.
>>   *
>> - * Try to run once per dirty_writeback_interval.  But if a writeback event
>> - * takes longer than a dirty_writeback_interval interval, then leave a
>> + * Try to run once per bdi->dirty_writeback_interval.  But if a writeback event
>> + * takes longer than a bdi->dirty_writeback_interval interval, then leave a
>>   * one-second gap.
>>   *
>>   * older_than_this takes precedence over nr_to_write.  So we'll only write back
>> @@ -663,7 +663,7 @@ static long wb_writeback(struct bdi_writ
>>       if (wbc.for_kupdate) {
>>               wbc.older_than_this = &oldest_jif;
>>               oldest_jif = jiffies -
>> -                             msecs_to_jiffies(dirty_expire_interval * 10);
>> +                             msecs_to_jiffies(wb->bdi->dirty_expire_interval * 10);
>>       }
>>       if (!wbc.range_cyclic) {
>>               wbc.range_start = 0;
>> @@ -811,15 +811,16 @@ static long wb_check_old_data_flush(stru
>>  {
>>       unsigned long expired;
>>       long nr_pages;
>> +     struct backing_dev_info *bdi = wb->bdi;
>>
>>       /*
>>        * When set to zero, disable periodic writeback
>>        */
>> -     if (!dirty_writeback_interval)
>> +     if (!bdi->dirty_writeback_interval)
>>               return 0;
>>
>>       expired = wb->last_old_flush +
>> -                     msecs_to_jiffies(dirty_writeback_interval * 10);
>> +                     msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
>>       if (time_before(jiffies, expired))
>>               return 0;
>>
>> @@ -923,8 +924,8 @@ int bdi_writeback_thread(void *data)
>>                       continue;
>>               }
>>
>> -             if (wb_has_dirty_io(wb) && dirty_writeback_interval)
>> -                     schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
>> +             if (wb_has_dirty_io(wb) && bdi->dirty_writeback_interval)
>> +                     schedule_timeout(msecs_to_jiffies(bdi->dirty_writeback_interval * 10));
>>               else {
>>                       /*
>>                        * We have nothing to do, so can go sleep without any
>> diff -uprN a/include/linux/backing-dev.h b/include/linux/backing-dev.h
>> --- a/include/linux/backing-dev.h     2011-08-05 10:29:21.000000000 +0530
>> +++ b/include/linux/backing-dev.h     2011-08-09 09:15:37.094041619 +0530
>> @@ -76,6 +76,8 @@ struct backing_dev_info {
>>
>>       unsigned int min_ratio;
>>       unsigned int max_ratio, max_prop_frac;
>> +     unsigned int dirty_writeback_interval;
>> +     unsigned int dirty_expire_interval;
>>
>>       struct bdi_writeback wb;  /* default writeback info for this bdi */
>>       spinlock_t wb_lock;       /* protects work_list */
>> @@ -333,4 +335,5 @@ static inline int bdi_sched_wait(void *w
>>       return 0;
>>  }
>>
>> +extern unsigned int shortest_dirty_writeback_interval;
>>  #endif               /* _LINUX_BACKING_DEV_H */
>> diff -uprN a/include/linux/writeback.h b/include/linux/writeback.h
>> --- a/include/linux/writeback.h       2011-08-05 10:29:21.000000000 +0530
>> +++ b/include/linux/writeback.h       2011-08-09 10:09:23.581268260 +0530
>> @@ -100,6 +100,7 @@ extern unsigned long dirty_background_by
>>  extern int vm_dirty_ratio;
>>  extern unsigned long vm_dirty_bytes;
>>  extern unsigned int dirty_writeback_interval;
>> +extern unsigned int sync_supers_interval;
>>  extern unsigned int dirty_expire_interval;
>>  extern int vm_highmem_is_dirtyable;
>>  extern int block_dump;
>> @@ -123,6 +124,10 @@ extern int dirty_bytes_handler(struct ct
>>  struct ctl_table;
>>  int dirty_writeback_centisecs_handler(struct ctl_table *, int,
>>                                     void __user *, size_t *, loff_t *);
>> +int sync_supers_centisecs_handler(struct ctl_table *, int,
>> +                                   void __user *, size_t *, loff_t *);
>> +int dirty_expire_centisecs_handler(struct ctl_table *, int,
>> +                                   void __user *, size_t *, loff_t *);
>>
>>  void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
>>  unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
>> diff -uprN a/kernel/sysctl.c b/kernel/sysctl.c
>> --- a/kernel/sysctl.c 2011-08-05 10:29:21.000000000 +0530
>> +++ b/kernel/sysctl.c 2011-08-09 12:39:43.453087554 +0530
>> @@ -1076,12 +1076,19 @@ static struct ctl_table vm_table[] = {
>>               .mode           = 0644,
>>               .proc_handler   = dirty_writeback_centisecs_handler,
>>       },
>> +    {
>> +        .procname   = "sync_supers_centisecs",
>> +        .data       = &sync_supers_interval,
>> +        .maxlen     = sizeof(sync_supers_interval),
>> +        .mode       = 0644,
>> +        .proc_handler   = sync_supers_centisecs_handler,
>> +    },
>>       {
>>               .procname       = "dirty_expire_centisecs",
>>               .data           = &dirty_expire_interval,
>>               .maxlen         = sizeof(dirty_expire_interval),
>>               .mode           = 0644,
>> -             .proc_handler   = proc_dointvec_minmax,
>> +             .proc_handler   = dirty_expire_centisecs_handler,
>>               .extra1         = &zero,
>>       },
>>       {
>> diff -uprN a/mm/backing-dev.c b/mm/backing-dev.c
>> --- a/mm/backing-dev.c        2011-08-05 10:29:21.000000000 +0530
>> +++ b/mm/backing-dev.c        2011-08-09 12:08:06.287079027 +0530
>> @@ -39,6 +39,10 @@ DEFINE_SPINLOCK(bdi_lock);
>>  LIST_HEAD(bdi_list);
>>  LIST_HEAD(bdi_pending_list);
>>
>> +/* Same value as the dirty_writeback_interval as this is what our
>> + * initial shortest_dirty_writeback_interval. */
>> +unsigned int shortest_dirty_writeback_interval = 5 * 100;
>> +
>>  static struct task_struct *sync_supers_tsk;
>>  static struct timer_list sync_supers_timer;
>>
>> @@ -204,12 +208,50 @@ static ssize_t max_ratio_store(struct de
>>  }
>>  BDI_SHOW(max_ratio, bdi->max_ratio)
>>
>> +static ssize_t dirty_writeback_interval_store(struct device *dev,
>> +             struct device_attribute *attr, const char *buf, size_t count)
>> +{
>> +     struct backing_dev_info *bdi = dev_get_drvdata(dev);
>> +     char *end;
>> +     unsigned int interval;
>> +     ssize_t ret = -EINVAL;
>> +
>> +     interval = simple_strtoul(buf, &end, 10);
>> +     if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
>> +             bdi->dirty_writeback_interval = interval;
>> +             shortest_dirty_writeback_interval =
>> +                                             min(shortest_dirty_writeback_interval,interval);
>> +             ret = count;
>> +     }
>> +     return ret;
>> +}
>> +BDI_SHOW(dirty_writeback_interval, bdi->dirty_writeback_interval)
>> +
>> +static ssize_t dirty_expire_interval_store (struct device *dev,
>> +             struct device_attribute *attr, const char *buf, size_t count)
>> +{
>> +     struct backing_dev_info *bdi = dev_get_drvdata(dev);
>> +     char *end;
>> +     unsigned int interval;
>> +     ssize_t ret = -EINVAL;
>> +
>> +     interval = simple_strtoul(buf, &end, 10);
>> +     if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
>> +             bdi->dirty_expire_interval = interval;
>> +             ret = count;
>> +     }
>> +     return ret;
>> +}
>> +BDI_SHOW(dirty_expire_interval, bdi->dirty_expire_interval)
>> +
>>  #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
>>
>>  static struct device_attribute bdi_dev_attrs[] = {
>>       __ATTR_RW(read_ahead_kb),
>>       __ATTR_RW(min_ratio),
>>       __ATTR_RW(max_ratio),
>> +     __ATTR_RW(dirty_writeback_interval),
>> +     __ATTR_RW(dirty_expire_interval),
>>       __ATTR_NULL,
>>  };
>>
>> @@ -291,7 +333,7 @@ void bdi_arm_supers_timer(void)
>>       if (!dirty_writeback_interval)
>>               return;
>>
>> -     next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
>> +     next = msecs_to_jiffies(sync_supers_interval* 10) + jiffies;
>>       mod_timer(&sync_supers_timer, round_jiffies_up(next));
>>  }
>>
>> @@ -336,7 +378,7 @@ void bdi_wakeup_thread_delayed(struct ba
>>  {
>>       unsigned long timeout;
>>
>> -     timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
>> +     timeout = msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
>>       mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
>>  }
>>
>> @@ -348,7 +390,19 @@ static unsigned long bdi_longest_inactiv
>>  {
>>       unsigned long interval;
>>
>> -     interval = msecs_to_jiffies(dirty_writeback_interval * 10);
>> +     interval = msecs_to_jiffies(shortest_dirty_writeback_interval * 10);
>> +     return max(5UL * 60 * HZ, interval);
>> +}
>> +
>> +/*
>> + * Calculate the longest interval (jiffies) this bdi thread is allowed to be
>> + * inactive.
>> + */
>> +static unsigned long bdi_longest_inactive_this(struct backing_dev_info *bdi)
>> +{
>> +     unsigned long interval;
>> +
>> +     interval = msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
>>       return max(5UL * 60 * HZ, interval);
>>  }
>>
>> @@ -422,7 +476,7 @@ static int bdi_forker_thread(void *ptr)
>>                        */
>>                       if (bdi->wb.task && !have_dirty_io &&
>>                           time_after(jiffies, bdi->wb.last_active +
>> -                                             bdi_longest_inactive())) {
>> +                                             bdi_longest_inactive_this(bdi))) {
>>                               task = bdi->wb.task;
>>                               bdi->wb.task = NULL;
>>                               spin_unlock(&bdi->wb_lock);
>> @@ -469,7 +523,7 @@ static int bdi_forker_thread(void *ptr)
>>                       break;
>>
>>               case NO_ACTION:
>> -                     if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
>> +                     if (!wb_has_dirty_io(me) || !me->bdi->dirty_writeback_interval)
>>                               /*
>>                                * There are no dirty data. The only thing we
>>                                * should now care about is checking for
>> @@ -479,7 +533,7 @@ static int bdi_forker_thread(void *ptr)
>>                                */
>>                               schedule_timeout(bdi_longest_inactive());
>>                       else
>> -                             schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
>> +                             schedule_timeout(msecs_to_jiffies(me->bdi->dirty_writeback_interval * 10));
>>                       try_to_freeze();
>>                       /* Back to the main loop */
>>                       continue;
>> @@ -641,6 +695,8 @@ int bdi_init(struct backing_dev_info *bd
>>       bdi->min_ratio = 0;
>>       bdi->max_ratio = 100;
>>       bdi->max_prop_frac = PROP_FRAC_BASE;
>> +     bdi->dirty_writeback_interval = dirty_writeback_interval;
>> +     bdi->dirty_expire_interval = dirty_expire_interval;
>>       spin_lock_init(&bdi->wb_lock);
>>       INIT_LIST_HEAD(&bdi->bdi_list);
>>       INIT_LIST_HEAD(&bdi->work_list);
>> diff -uprN a/mm/page-writeback.c b/mm/page-writeback.c
>> --- a/mm/page-writeback.c     2011-08-05 10:29:21.000000000 +0530
>> +++ b/mm/page-writeback.c     2011-08-09 13:09:37.985919961 +0530
>> @@ -92,6 +92,11 @@ unsigned long vm_dirty_bytes;
>>  unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
>>
>>  /*
>> + * The interval between sync_supers thread writebacks
>> + */
>> +unsigned int sync_supers_interval = 5 * 100; /* centiseconds */
>> +
>> +/*
>>   * The longest time for which data is allowed to remain dirty
>>   */
>>  unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
>> @@ -686,8 +691,60 @@ void throttle_vm_writeout(gfp_t gfp_mask
>>  int dirty_writeback_centisecs_handler(ctl_table *table, int write,
>>       void __user *buffer, size_t *length, loff_t *ppos)
>>  {
>> +     struct backing_dev_info *bdi;
>> +
>> +     proc_dointvec(table, write, buffer, length, ppos);
>> +
>> +     if (write) {
>> +             /* Traverse all the BDIs registered to the BDI list and reset their
>> +              * bdi->dirty_writeback_interval to this value. */
>> +         spin_lock_bh(&bdi_lock);
>> +             list_for_each_entry(bdi, &bdi_list, bdi_list)
>> +                     bdi->dirty_writeback_interval = dirty_writeback_interval;
>> +         spin_unlock_bh(&bdi_lock);
>> +
>> +             sync_supers_interval =
>> +                     shortest_dirty_writeback_interval = dirty_writeback_interval;
>> +
>> +     }
>> +
>> +     bdi_arm_supers_timer();
>> +
>> +     return 0;
>> +}
>> +
>> +/*
>> + * sysctl handler for /proc/sys/vm/sync_supers_centisecs
>> + */
>> +int sync_supers_centisecs_handler(ctl_table *table, int write,
>> +     void __user *buffer, size_t *length, loff_t *ppos)
>> +{
>>       proc_dointvec(table, write, buffer, length, ppos);
>> +
>>       bdi_arm_supers_timer();
>> +
>> +     return 0;
>> +}
>> +
>> +/*
>> + * sysctl handler for /proc/sys/vm/dirty_expire_centisecs
>> + */
>> +int dirty_expire_centisecs_handler(ctl_table *table, int write,
>> +     void __user *buffer, size_t *length, loff_t *ppos)
>> +{
>> +     struct backing_dev_info *bdi;
>> +
>> +     proc_dointvec_minmax(table, write, buffer, length, ppos);
>> +
>> +     if (write) {
>> +             /* Traverse all the BDIs registered to the BDI list and reset their
>> +              * bdi->dirty_expire_interval to this value. */
>> +         spin_lock_bh(&bdi_lock);
>> +             list_for_each_entry(bdi, &bdi_list, bdi_list)
>> +                     bdi->dirty_expire_interval = dirty_expire_interval;
>> +         spin_unlock_bh(&bdi_lock);
>> +     }
>> +
>>       return 0;
>>  }
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-18 11:28     ` Kautuk Consul
  0 siblings, 0 replies; 47+ messages in thread
From: Kautuk Consul @ 2011-08-18 11:28 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Mel Gorman, KOSAKI Motohiro, linux-mm, linux-kernel, linux-fsdevel

Hi Wu,

Thanks for responding.

Please find my comments inline in your email below.

On Thu, Aug 18, 2011 at 3:18 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> Hi Kautuk,
>
> Add CC to fsdevel and Mel and KOSAKI.
>
> When submitting patches you can find the relevant mailing list and
> developers to CC with this command under the kernel source tree:
>
>        scripts/get_maintainer.pl YOUR-PATCH-FILE
>
> On Thu, Aug 11, 2011 at 05:50:56PM +0530, Kautuk Consul wrote:
>> Hi,
>>
>> Currently the /proc/sys/vm/dirty_writeback_centisecs and
>> /proc/sys/vm/dirty_expire_centisecs values are
>> global to the system.
>> All the BDI flush-* threads are controlled by these central values.
>
> Yes.
>
>> However, the user/admin might want to set different writeback speeds
>> for different block devices based on
>> their page write-back performance.
>
> How can the above two sysctl values impact "writeback speeds"?
> In particular, what's the "speed" you mean?
>

By writeback speed, I meant writeback interval, i.e. the maximum
interval after which the BDI
thread for a particular block device can wake up and try to sync pages
with disk.


>> For example, the user might want to write-back pages in smaller
>> intervals to a block device which has a
>> faster known writeback speed.
>
> That's not a complete rational. What does the user ultimately want by
> setting a smaller interval? What would be the problems to the other
> slow devices if the user does so by simply setting a small value
> _globally_?
>

I think that the user might want to set a smaller interval for faster block
devices so that the dirty pages are synced with that block device/disk sooner.
This will unset the dirty bit of the page-cache pages sooner, which
will increase the
possibility of those pages getting reclaimed quickly in high memory
usage scenarios.
For a system that writes to disk very frequently and runs a lot of
memory intensive user-mode
applications, this might be crucial for their performance as they
would possibly have to sleep
comparatively less during page allocation.
For example, a server handling a database needs frequent disk access
as well as
anonymous memory. In such a case it would be nice to keep the
write-back interval for a USB pen
drive BDI thread longer than that of a SATA/SCSI disk.

> We need strong use cases for doing such user interface changes.
> Would you detail the problem and the pains that can only (or best)
> be addressed by this patch?
>

Overall, I think that ever since there have been different BDI threads
for different block devices,
it seems quite rational to provide the user an option to set different
writeback intervals to different
block devices due to the reasons/examples I have mentioned above.

I do not fully understand the theory behind how your patches
control the dirty rate and estimate
the future bandwidth.
But, when I looked through them I did not see any place where the
writeback interval for a BDI was being
changed.
So, I felt that my patch was more like an additional feature for the
user rather than a conflict with your
writeback patches.

> Thanks,
> Fengguang
>
>> This patch creates 3 new counters (in centisecs) for all the BDI
>> threads that were controlled centrally by these
>> 2 counters:
>> i)   /sys/block/<block_dev>/bdi/dirty_writeback_interval,
>> ii)  /sys/block/<block_dev>/bdi/dirty_expire_interval,
>> iii) /proc/sys/vm/sync_supers_centisecs.
>>
>> Although these new counters can be tuned individually, I have taken
>> care that they be centrally reset by changes
>> to the /proc/sys/vm/dirty_expire_centisecs and
>> /proc/sys/vm/dirty_writeback_centisecs so that the earlier
>> functionality is not broken by distributions using these central values.
>> After resetting all values centrally, these values can be tuned
>> individually without altering the central values.
>>
>> Signed-off-by: Kautuk Consul <consul.kautuk@gmail.com>
>> ---
>>
>> diff -uprN a/fs/fs-writeback.c b/fs/fs-writeback.c
>> --- a/fs/fs-writeback.c       2011-08-05 10:29:21.000000000 +0530
>> +++ b/fs/fs-writeback.c       2011-08-09 09:15:37.093041675 +0530
>> @@ -638,8 +638,8 @@ static inline bool over_bground_thresh(v
>>   * just walks the superblock inode list, writing back any inodes which are
>>   * older than a specific point in time.
>>   *
>> - * Try to run once per dirty_writeback_interval.  But if a writeback event
>> - * takes longer than a dirty_writeback_interval interval, then leave a
>> + * Try to run once per bdi->dirty_writeback_interval.  But if a writeback event
>> + * takes longer than a bdi->dirty_writeback_interval interval, then leave a
>>   * one-second gap.
>>   *
>>   * older_than_this takes precedence over nr_to_write.  So we'll only write back
>> @@ -663,7 +663,7 @@ static long wb_writeback(struct bdi_writ
>>       if (wbc.for_kupdate) {
>>               wbc.older_than_this = &oldest_jif;
>>               oldest_jif = jiffies -
>> -                             msecs_to_jiffies(dirty_expire_interval * 10);
>> +                             msecs_to_jiffies(wb->bdi->dirty_expire_interval * 10);
>>       }
>>       if (!wbc.range_cyclic) {
>>               wbc.range_start = 0;
>> @@ -811,15 +811,16 @@ static long wb_check_old_data_flush(stru
>>  {
>>       unsigned long expired;
>>       long nr_pages;
>> +     struct backing_dev_info *bdi = wb->bdi;
>>
>>       /*
>>        * When set to zero, disable periodic writeback
>>        */
>> -     if (!dirty_writeback_interval)
>> +     if (!bdi->dirty_writeback_interval)
>>               return 0;
>>
>>       expired = wb->last_old_flush +
>> -                     msecs_to_jiffies(dirty_writeback_interval * 10);
>> +                     msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
>>       if (time_before(jiffies, expired))
>>               return 0;
>>
>> @@ -923,8 +924,8 @@ int bdi_writeback_thread(void *data)
>>                       continue;
>>               }
>>
>> -             if (wb_has_dirty_io(wb) && dirty_writeback_interval)
>> -                     schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
>> +             if (wb_has_dirty_io(wb) && bdi->dirty_writeback_interval)
>> +                     schedule_timeout(msecs_to_jiffies(bdi->dirty_writeback_interval * 10));
>>               else {
>>                       /*
>>                        * We have nothing to do, so can go sleep without any
>> diff -uprN a/include/linux/backing-dev.h b/include/linux/backing-dev.h
>> --- a/include/linux/backing-dev.h     2011-08-05 10:29:21.000000000 +0530
>> +++ b/include/linux/backing-dev.h     2011-08-09 09:15:37.094041619 +0530
>> @@ -76,6 +76,8 @@ struct backing_dev_info {
>>
>>       unsigned int min_ratio;
>>       unsigned int max_ratio, max_prop_frac;
>> +     unsigned int dirty_writeback_interval;
>> +     unsigned int dirty_expire_interval;
>>
>>       struct bdi_writeback wb;  /* default writeback info for this bdi */
>>       spinlock_t wb_lock;       /* protects work_list */
>> @@ -333,4 +335,5 @@ static inline int bdi_sched_wait(void *w
>>       return 0;
>>  }
>>
>> +extern unsigned int shortest_dirty_writeback_interval;
>>  #endif               /* _LINUX_BACKING_DEV_H */
>> diff -uprN a/include/linux/writeback.h b/include/linux/writeback.h
>> --- a/include/linux/writeback.h       2011-08-05 10:29:21.000000000 +0530
>> +++ b/include/linux/writeback.h       2011-08-09 10:09:23.581268260 +0530
>> @@ -100,6 +100,7 @@ extern unsigned long dirty_background_by
>>  extern int vm_dirty_ratio;
>>  extern unsigned long vm_dirty_bytes;
>>  extern unsigned int dirty_writeback_interval;
>> +extern unsigned int sync_supers_interval;
>>  extern unsigned int dirty_expire_interval;
>>  extern int vm_highmem_is_dirtyable;
>>  extern int block_dump;
>> @@ -123,6 +124,10 @@ extern int dirty_bytes_handler(struct ct
>>  struct ctl_table;
>>  int dirty_writeback_centisecs_handler(struct ctl_table *, int,
>>                                     void __user *, size_t *, loff_t *);
>> +int sync_supers_centisecs_handler(struct ctl_table *, int,
>> +                                   void __user *, size_t *, loff_t *);
>> +int dirty_expire_centisecs_handler(struct ctl_table *, int,
>> +                                   void __user *, size_t *, loff_t *);
>>
>>  void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
>>  unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
>> diff -uprN a/kernel/sysctl.c b/kernel/sysctl.c
>> --- a/kernel/sysctl.c 2011-08-05 10:29:21.000000000 +0530
>> +++ b/kernel/sysctl.c 2011-08-09 12:39:43.453087554 +0530
>> @@ -1076,12 +1076,19 @@ static struct ctl_table vm_table[] = {
>>               .mode           = 0644,
>>               .proc_handler   = dirty_writeback_centisecs_handler,
>>       },
>> +    {
>> +        .procname   = "sync_supers_centisecs",
>> +        .data       = &sync_supers_interval,
>> +        .maxlen     = sizeof(sync_supers_interval),
>> +        .mode       = 0644,
>> +        .proc_handler   = sync_supers_centisecs_handler,
>> +    },
>>       {
>>               .procname       = "dirty_expire_centisecs",
>>               .data           = &dirty_expire_interval,
>>               .maxlen         = sizeof(dirty_expire_interval),
>>               .mode           = 0644,
>> -             .proc_handler   = proc_dointvec_minmax,
>> +             .proc_handler   = dirty_expire_centisecs_handler,
>>               .extra1         = &zero,
>>       },
>>       {
>> diff -uprN a/mm/backing-dev.c b/mm/backing-dev.c
>> --- a/mm/backing-dev.c        2011-08-05 10:29:21.000000000 +0530
>> +++ b/mm/backing-dev.c        2011-08-09 12:08:06.287079027 +0530
>> @@ -39,6 +39,10 @@ DEFINE_SPINLOCK(bdi_lock);
>>  LIST_HEAD(bdi_list);
>>  LIST_HEAD(bdi_pending_list);
>>
>> +/* Same value as the dirty_writeback_interval as this is what our
>> + * initial shortest_dirty_writeback_interval. */
>> +unsigned int shortest_dirty_writeback_interval = 5 * 100;
>> +
>>  static struct task_struct *sync_supers_tsk;
>>  static struct timer_list sync_supers_timer;
>>
>> @@ -204,12 +208,50 @@ static ssize_t max_ratio_store(struct de
>>  }
>>  BDI_SHOW(max_ratio, bdi->max_ratio)
>>
>> +static ssize_t dirty_writeback_interval_store(struct device *dev,
>> +             struct device_attribute *attr, const char *buf, size_t count)
>> +{
>> +     struct backing_dev_info *bdi = dev_get_drvdata(dev);
>> +     char *end;
>> +     unsigned int interval;
>> +     ssize_t ret = -EINVAL;
>> +
>> +     interval = simple_strtoul(buf, &end, 10);
>> +     if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
>> +             bdi->dirty_writeback_interval = interval;
>> +             shortest_dirty_writeback_interval =
>> +                                             min(shortest_dirty_writeback_interval,interval);
>> +             ret = count;
>> +     }
>> +     return ret;
>> +}
>> +BDI_SHOW(dirty_writeback_interval, bdi->dirty_writeback_interval)
>> +
>> +static ssize_t dirty_expire_interval_store (struct device *dev,
>> +             struct device_attribute *attr, const char *buf, size_t count)
>> +{
>> +     struct backing_dev_info *bdi = dev_get_drvdata(dev);
>> +     char *end;
>> +     unsigned int interval;
>> +     ssize_t ret = -EINVAL;
>> +
>> +     interval = simple_strtoul(buf, &end, 10);
>> +     if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
>> +             bdi->dirty_expire_interval = interval;
>> +             ret = count;
>> +     }
>> +     return ret;
>> +}
>> +BDI_SHOW(dirty_expire_interval, bdi->dirty_expire_interval)
>> +
>>  #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
>>
>>  static struct device_attribute bdi_dev_attrs[] = {
>>       __ATTR_RW(read_ahead_kb),
>>       __ATTR_RW(min_ratio),
>>       __ATTR_RW(max_ratio),
>> +     __ATTR_RW(dirty_writeback_interval),
>> +     __ATTR_RW(dirty_expire_interval),
>>       __ATTR_NULL,
>>  };
>>
>> @@ -291,7 +333,7 @@ void bdi_arm_supers_timer(void)
>>       if (!dirty_writeback_interval)
>>               return;
>>
>> -     next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
>> +     next = msecs_to_jiffies(sync_supers_interval* 10) + jiffies;
>>       mod_timer(&sync_supers_timer, round_jiffies_up(next));
>>  }
>>
>> @@ -336,7 +378,7 @@ void bdi_wakeup_thread_delayed(struct ba
>>  {
>>       unsigned long timeout;
>>
>> -     timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
>> +     timeout = msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
>>       mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
>>  }
>>
>> @@ -348,7 +390,19 @@ static unsigned long bdi_longest_inactiv
>>  {
>>       unsigned long interval;
>>
>> -     interval = msecs_to_jiffies(dirty_writeback_interval * 10);
>> +     interval = msecs_to_jiffies(shortest_dirty_writeback_interval * 10);
>> +     return max(5UL * 60 * HZ, interval);
>> +}
>> +
>> +/*
>> + * Calculate the longest interval (jiffies) this bdi thread is allowed to be
>> + * inactive.
>> + */
>> +static unsigned long bdi_longest_inactive_this(struct backing_dev_info *bdi)
>> +{
>> +     unsigned long interval;
>> +
>> +     interval = msecs_to_jiffies(bdi->dirty_writeback_interval * 10);
>>       return max(5UL * 60 * HZ, interval);
>>  }
>>
>> @@ -422,7 +476,7 @@ static int bdi_forker_thread(void *ptr)
>>                        */
>>                       if (bdi->wb.task && !have_dirty_io &&
>>                           time_after(jiffies, bdi->wb.last_active +
>> -                                             bdi_longest_inactive())) {
>> +                                             bdi_longest_inactive_this(bdi))) {
>>                               task = bdi->wb.task;
>>                               bdi->wb.task = NULL;
>>                               spin_unlock(&bdi->wb_lock);
>> @@ -469,7 +523,7 @@ static int bdi_forker_thread(void *ptr)
>>                       break;
>>
>>               case NO_ACTION:
>> -                     if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
>> +                     if (!wb_has_dirty_io(me) || !me->bdi->dirty_writeback_interval)
>>                               /*
>>                                * There are no dirty data. The only thing we
>>                                * should now care about is checking for
>> @@ -479,7 +533,7 @@ static int bdi_forker_thread(void *ptr)
>>                                */
>>                               schedule_timeout(bdi_longest_inactive());
>>                       else
>> -                             schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
>> +                             schedule_timeout(msecs_to_jiffies(me->bdi->dirty_writeback_interval * 10));
>>                       try_to_freeze();
>>                       /* Back to the main loop */
>>                       continue;
>> @@ -641,6 +695,8 @@ int bdi_init(struct backing_dev_info *bd
>>       bdi->min_ratio = 0;
>>       bdi->max_ratio = 100;
>>       bdi->max_prop_frac = PROP_FRAC_BASE;
>> +     bdi->dirty_writeback_interval = dirty_writeback_interval;
>> +     bdi->dirty_expire_interval = dirty_expire_interval;
>>       spin_lock_init(&bdi->wb_lock);
>>       INIT_LIST_HEAD(&bdi->bdi_list);
>>       INIT_LIST_HEAD(&bdi->work_list);
>> diff -uprN a/mm/page-writeback.c b/mm/page-writeback.c
>> --- a/mm/page-writeback.c     2011-08-05 10:29:21.000000000 +0530
>> +++ b/mm/page-writeback.c     2011-08-09 13:09:37.985919961 +0530
>> @@ -92,6 +92,11 @@ unsigned long vm_dirty_bytes;
>>  unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
>>
>>  /*
>> + * The interval between sync_supers thread writebacks
>> + */
>> +unsigned int sync_supers_interval = 5 * 100; /* centiseconds */
>> +
>> +/*
>>   * The longest time for which data is allowed to remain dirty
>>   */
>>  unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
>> @@ -686,8 +691,60 @@ void throttle_vm_writeout(gfp_t gfp_mask
>>  int dirty_writeback_centisecs_handler(ctl_table *table, int write,
>>       void __user *buffer, size_t *length, loff_t *ppos)
>>  {
>> +     struct backing_dev_info *bdi;
>> +
>> +     proc_dointvec(table, write, buffer, length, ppos);
>> +
>> +     if (write) {
>> +             /* Traverse all the BDIs registered to the BDI list and reset their
>> +              * bdi->dirty_writeback_interval to this value. */
>> +         spin_lock_bh(&bdi_lock);
>> +             list_for_each_entry(bdi, &bdi_list, bdi_list)
>> +                     bdi->dirty_writeback_interval = dirty_writeback_interval;
>> +         spin_unlock_bh(&bdi_lock);
>> +
>> +             sync_supers_interval =
>> +                     shortest_dirty_writeback_interval = dirty_writeback_interval;
>> +
>> +     }
>> +
>> +     bdi_arm_supers_timer();
>> +
>> +     return 0;
>> +}
>> +
>> +/*
>> + * sysctl handler for /proc/sys/vm/sync_supers_centisecs
>> + */
>> +int sync_supers_centisecs_handler(ctl_table *table, int write,
>> +     void __user *buffer, size_t *length, loff_t *ppos)
>> +{
>>       proc_dointvec(table, write, buffer, length, ppos);
>> +
>>       bdi_arm_supers_timer();
>> +
>> +     return 0;
>> +}
>> +
>> +/*
>> + * sysctl handler for /proc/sys/vm/dirty_expire_centisecs
>> + */
>> +int dirty_expire_centisecs_handler(ctl_table *table, int write,
>> +     void __user *buffer, size_t *length, loff_t *ppos)
>> +{
>> +     struct backing_dev_info *bdi;
>> +
>> +     proc_dointvec_minmax(table, write, buffer, length, ppos);
>> +
>> +     if (write) {
>> +             /* Traverse all the BDIs registered to the BDI list and reset their
>> +              * bdi->dirty_expire_interval to this value. */
>> +         spin_lock_bh(&bdi_lock);
>> +             list_for_each_entry(bdi, &bdi_list, bdi_list)
>> +                     bdi->dirty_expire_interval = dirty_expire_interval;
>> +         spin_unlock_bh(&bdi_lock);
>> +     }
>> +
>>       return 0;
>>  }
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-18  9:48   ` Wu Fengguang
  (?)
@ 2011-08-18 12:14     ` Artem Bityutskiy
  -1 siblings, 0 replies; 47+ messages in thread
From: Artem Bityutskiy @ 2011-08-18 12:14 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Kautuk Consul, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel

On Thu, 2011-08-18 at 17:48 +0800, Wu Fengguang wrote:
> > For example, the user might want to write-back pages in smaller
> > intervals to a block device which has a
> > faster known writeback speed.
> 
> That's not a complete rational. What does the user ultimately want by
> setting a smaller interval? What would be the problems to the other
> slow devices if the user does so by simply setting a small value
> _globally_?
> 
> We need strong use cases for doing such user interface changes.
> Would you detail the problem and the pains that can only (or best)
> be addressed by this patch?

Here is a real use-case we had when developing the N900 phone. We had
internal flash and external microSD slot. Internal flash is soldered in
and cannot be removed by the user. MicroSD, in contrast, can be removed
by the user.

For the internal flash we wanted long intervals and relaxed limits to
gain better performance.

For MicroSD we wanted very short intervals and tough limits to make sure
that if the user suddenly removes his microSD (users do this all the
time) - we do not lose data.

The discussed capability would be very useful in that case, AFAICS.

IOW, this is not only about fast/slow devices and how quickly you want
to be able to sync the FS, this is also about data integrity guarantees.

-- 
Best Regards,
Artem Bityutskiy


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-18 12:14     ` Artem Bityutskiy
  0 siblings, 0 replies; 47+ messages in thread
From: Artem Bityutskiy @ 2011-08-18 12:14 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Kautuk Consul, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel

On Thu, 2011-08-18 at 17:48 +0800, Wu Fengguang wrote:
> > For example, the user might want to write-back pages in smaller
> > intervals to a block device which has a
> > faster known writeback speed.
> 
> That's not a complete rational. What does the user ultimately want by
> setting a smaller interval? What would be the problems to the other
> slow devices if the user does so by simply setting a small value
> _globally_?
> 
> We need strong use cases for doing such user interface changes.
> Would you detail the problem and the pains that can only (or best)
> be addressed by this patch?

Here is a real use-case we had when developing the N900 phone. We had
internal flash and external microSD slot. Internal flash is soldered in
and cannot be removed by the user. MicroSD, in contrast, can be removed
by the user.

For the internal flash we wanted long intervals and relaxed limits to
gain better performance.

For MicroSD we wanted very short intervals and tough limits to make sure
that if the user suddenly removes his microSD (users do this all the
time) - we do not lose data.

The discussed capability would be very useful in that case, AFAICS.

IOW, this is not only about fast/slow devices and how quickly you want
to be able to sync the FS, this is also about data integrity guarantees.

-- 
Best Regards,
Artem Bityutskiy

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-18 12:14     ` Artem Bityutskiy
  0 siblings, 0 replies; 47+ messages in thread
From: Artem Bityutskiy @ 2011-08-18 12:14 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Kautuk Consul, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel

On Thu, 2011-08-18 at 17:48 +0800, Wu Fengguang wrote:
> > For example, the user might want to write-back pages in smaller
> > intervals to a block device which has a
> > faster known writeback speed.
> 
> That's not a complete rational. What does the user ultimately want by
> setting a smaller interval? What would be the problems to the other
> slow devices if the user does so by simply setting a small value
> _globally_?
> 
> We need strong use cases for doing such user interface changes.
> Would you detail the problem and the pains that can only (or best)
> be addressed by this patch?

Here is a real use-case we had when developing the N900 phone. We had
internal flash and external microSD slot. Internal flash is soldered in
and cannot be removed by the user. MicroSD, in contrast, can be removed
by the user.

For the internal flash we wanted long intervals and relaxed limits to
gain better performance.

For MicroSD we wanted very short intervals and tough limits to make sure
that if the user suddenly removes his microSD (users do this all the
time) - we do not lose data.

The discussed capability would be very useful in that case, AFAICS.

IOW, this is not only about fast/slow devices and how quickly you want
to be able to sync the FS, this is also about data integrity guarantees.

-- 
Best Regards,
Artem Bityutskiy

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-18 12:14     ` Artem Bityutskiy
@ 2011-08-18 12:35       ` Wu Fengguang
  -1 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-18 12:35 UTC (permalink / raw)
  To: Artem Bityutskiy
  Cc: Kautuk Consul, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner

On Thu, Aug 18, 2011 at 08:14:57PM +0800, Artem Bityutskiy wrote:
> On Thu, 2011-08-18 at 17:48 +0800, Wu Fengguang wrote:
> > > For example, the user might want to write-back pages in smaller
> > > intervals to a block device which has a
> > > faster known writeback speed.
> > 
> > That's not a complete rational. What does the user ultimately want by
> > setting a smaller interval? What would be the problems to the other
> > slow devices if the user does so by simply setting a small value
> > _globally_?
> > 
> > We need strong use cases for doing such user interface changes.
> > Would you detail the problem and the pains that can only (or best)
> > be addressed by this patch?
> 
> Here is a real use-case we had when developing the N900 phone. We had
> internal flash and external microSD slot. Internal flash is soldered in
> and cannot be removed by the user. MicroSD, in contrast, can be removed
> by the user.
> 
> For the internal flash we wanted long intervals and relaxed limits to
> gain better performance.

Understood -- it's backed by the battery anyway.

Yeah it's a practical way. It might even optimize away some of the
writes if they are truncated some time later. It also allows possible
optimization of deferring the writes to user inactive periods.

However the ultimate optimization could be to prioritize READs over
WRITEs in the IO scheduler, so that async WRITEs have minimal impact
on normal operations. It's the only option for the MicroSD case,
anyway.

> For MicroSD we wanted very short intervals and tough limits to make sure
> that if the user suddenly removes his microSD (users do this all the
> time) - we do not lose data.

Pretty reasonable.

> The discussed capability would be very useful in that case, AFAICS.

Agreed.

> IOW, this is not only about fast/slow devices and how quickly you want
> to be able to sync the FS, this is also about data integrity guarantees.

In fact I don't think it would matter for fast/slow devices.  It's the
dirty_ratio/dirty_bytes interfaces that need improvement if we care
about too many pages being cached.

The intervals interfaces are intended for data integrity and nothing
more.

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-18 12:35       ` Wu Fengguang
  0 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-18 12:35 UTC (permalink / raw)
  To: Artem Bityutskiy
  Cc: Kautuk Consul, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner

On Thu, Aug 18, 2011 at 08:14:57PM +0800, Artem Bityutskiy wrote:
> On Thu, 2011-08-18 at 17:48 +0800, Wu Fengguang wrote:
> > > For example, the user might want to write-back pages in smaller
> > > intervals to a block device which has a
> > > faster known writeback speed.
> > 
> > That's not a complete rational. What does the user ultimately want by
> > setting a smaller interval? What would be the problems to the other
> > slow devices if the user does so by simply setting a small value
> > _globally_?
> > 
> > We need strong use cases for doing such user interface changes.
> > Would you detail the problem and the pains that can only (or best)
> > be addressed by this patch?
> 
> Here is a real use-case we had when developing the N900 phone. We had
> internal flash and external microSD slot. Internal flash is soldered in
> and cannot be removed by the user. MicroSD, in contrast, can be removed
> by the user.
> 
> For the internal flash we wanted long intervals and relaxed limits to
> gain better performance.

Understood -- it's backed by the battery anyway.

Yeah it's a practical way. It might even optimize away some of the
writes if they are truncated some time later. It also allows possible
optimization of deferring the writes to user inactive periods.

However the ultimate optimization could be to prioritize READs over
WRITEs in the IO scheduler, so that async WRITEs have minimal impact
on normal operations. It's the only option for the MicroSD case,
anyway.

> For MicroSD we wanted very short intervals and tough limits to make sure
> that if the user suddenly removes his microSD (users do this all the
> time) - we do not lose data.

Pretty reasonable.

> The discussed capability would be very useful in that case, AFAICS.

Agreed.

> IOW, this is not only about fast/slow devices and how quickly you want
> to be able to sync the FS, this is also about data integrity guarantees.

In fact I don't think it would matter for fast/slow devices.  It's the
dirty_ratio/dirty_bytes interfaces that need improvement if we care
about too many pages being cached.

The intervals interfaces are intended for data integrity and nothing
more.

Thanks,
Fengguang

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-18 11:28     ` Kautuk Consul
@ 2011-08-18 12:55       ` Wu Fengguang
  -1 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-18 12:55 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Mel Gorman, KOSAKI Motohiro, linux-mm, linux-kernel,
	linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen,
	Artem Bityutskiy

Hi Kautuk,

> >> However, the user/admin might want to set different writeback speeds
> >> for different block devices based on
> >> their page write-back performance.
> >
> > How can the above two sysctl values impact "writeback speeds"?
> > In particular, what's the "speed" you mean?
> >
> 
> By writeback speed, I meant writeback interval, i.e. the maximum
> interval after which the BDI
> thread for a particular block device can wake up and try to sync pages
> with disk.

OK.

> >> For example, the user might want to write-back pages in smaller
> >> intervals to a block device which has a
> >> faster known writeback speed.
> >
> > That's not a complete rational. What does the user ultimately want by
> > setting a smaller interval? What would be the problems to the other
> > slow devices if the user does so by simply setting a small value
> > _globally_?
> >
> 
> I think that the user might want to set a smaller interval for faster block
> devices so that the dirty pages are synced with that block device/disk sooner.
> This will unset the dirty bit of the page-cache pages sooner, which
> will increase the
> possibility of those pages getting reclaimed quickly in high memory
> usage scenarios.
> For a system that writes to disk very frequently and runs a lot of
> memory intensive user-mode
> applications, this might be crucial for their performance as they
> would possibly have to sleep
> comparitively lesser during page allocation.
> For example, an server handling a database needs frequent disk access
> as well as
> anonymous memory. In such a case it would be nice to keep the
> write-back interval for a USB pen
> drive BDI thread as more than that of a SATA/SCSI disk.

Nope. I'm afraid the above reasoning is totally wrong.

Firstly, a smaller interval is never a guarantee that the number of
dirty pages will not grow large. Think about a dd task that dirties
pages at 1GB/s. It's going to accumulate a huge number of dirty
pages before any "small" writeback interval has elapsed.

Secondly, according to your logic, it's actually the low speed device
that needs smaller intervals: if a dirtier task creates 100MB of
dirty pages in the same interval, it's the slow device that requires
a lot more time to clean those pages, and hence needs to start the
writeback earlier.

The conclusion is, the dirty_expire_centisecs and
dirty_writeback_centisecs interfaces are solely for data integrity
purpose.

We have dirty_ratio for controlling the maximum number of dirty pages
and dirty_background_ratio for controlling when to start writeback.

There does exist the problem that their default values (10%/20%) can be
too large for a system with 1TB memory and a 100MB/s disk, or 4GB memory
and a 10MB/s USB memory stick. In particular they will accumulate more
than 30 seconds' worth of data, which could break the user's assumption
about what dirty_expire_centisecs seems to promise.

Now that we have per-bdi write bandwidth estimation, that problem can
be fixed by somehow auto lowering the effective dirty (background) ratio.
I wonder if this is what you really want.  Greg had some concerns on
this issue, too.

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-18 12:55       ` Wu Fengguang
  0 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-18 12:55 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Mel Gorman, KOSAKI Motohiro, linux-mm, linux-kernel,
	linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen,
	Artem Bityutskiy

Hi Kautuk,

> >> However, the user/admin might want to set different writeback speeds
> >> for different block devices based on
> >> their page write-back performance.
> >
> > How can the above two sysctl values impact "writeback speeds"?
> > In particular, what's the "speed" you mean?
> >
> 
> By writeback speed, I meant writeback interval, i.e. the maximum
> interval after which the BDI
> thread for a particular block device can wake up and try to sync pages
> with disk.

OK.

> >> For example, the user might want to write-back pages in smaller
> >> intervals to a block device which has a
> >> faster known writeback speed.
> >
> > That's not a complete rational. What does the user ultimately want by
> > setting a smaller interval? What would be the problems to the other
> > slow devices if the user does so by simply setting a small value
> > _globally_?
> >
> 
> I think that the user might want to set a smaller interval for faster block
> devices so that the dirty pages are synced with that block device/disk sooner.
> This will unset the dirty bit of the page-cache pages sooner, which
> will increase the
> possibility of those pages getting reclaimed quickly in high memory
> usage scenarios.
> For a system that writes to disk very frequently and runs a lot of
> memory intensive user-mode
> applications, this might be crucial for their performance as they
> would possibly have to sleep
> comparitively lesser during page allocation.
> For example, an server handling a database needs frequent disk access
> as well as
> anonymous memory. In such a case it would be nice to keep the
> write-back interval for a USB pen
> drive BDI thread as more than that of a SATA/SCSI disk.

Nope. I'm afraid the above reasoning is totally wrong.

Firstly, a smaller interval is never a guarantee that the number of
dirty pages will not grow large. Think about a dd task that dirties
pages at 1GB/s. It's going to accumulate a huge number of dirty
pages before any "small" writeback interval has elapsed.

Secondly, according to your logic, it's actually the low speed device
that needs smaller intervals: if a dirtier task creates 100MB of
dirty pages in the same interval, it's the slow device that requires
a lot more time to clean those pages, and hence needs to start the
writeback earlier.

The conclusion is, the dirty_expire_centisecs and
dirty_writeback_centisecs interfaces are solely for data integrity
purpose.

We have dirty_ratio for controlling the maximum number of dirty pages
and dirty_background_ratio for controlling when to start writeback.

There does exist the problem that their default values (10%/20%) can be
too large for a system with 1TB memory and a 100MB/s disk, or 4GB memory
and a 10MB/s USB memory stick. In particular they will accumulate more
than 30 seconds' worth of data, which could break the user's assumption
about what dirty_expire_centisecs seems to promise.

Now that we have per-bdi write bandwidth estimation, that problem can
be fixed by somehow auto lowering the effective dirty (background) ratio.
I wonder if this is what you really want.  Greg had some concerns on
this issue, too.

Thanks,
Fengguang

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-18 12:14     ` Artem Bityutskiy
@ 2011-08-18 13:13       ` Wu Fengguang
  -1 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-18 13:13 UTC (permalink / raw)
  To: Artem Bityutskiy
  Cc: Kautuk Consul, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Hi Artem,

> Here is a real use-case we had when developing the N900 phone. We had
> internal flash and external microSD slot. Internal flash is soldered in
> and cannot be removed by the user. MicroSD, in contrast, can be removed
> by the user.
> 
> For the internal flash we wanted long intervals and relaxed limits to
> gain better performance.
> 
> For MicroSD we wanted very short intervals and tough limits to make sure
> that if the user suddenly removes his microSD (users do this all the
> time) - we do not lose data.

Thinking twice about it, I find that the different requirements for
internal flash/external microSD can also be met by the following scheme.

Introduce a per-bdi dirty_background_time (and optionally dirty_time)
as the counterpart of (and works in parallel to) global dirty[_background]_ratio,
however with unit "milliseconds worth of data".

The per-bdi dirty_background_time will be set low for external microSD
and high for internal flash. Then you get timely writeouts for microSD
and reasonably delayed writes for internal flash (controllable by the
global dirty_expire_centisecs).

The dirty_background_time will actually work more reliably than
dirty_expire_centisecs because it will be checked immediately after the
application dirties more pages. And the dirty_time could provide a
strong data integrity guarantee -- much stronger than
dirty_expire_centisecs -- if used.

Does that sound reasonable?

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-18 13:13       ` Wu Fengguang
  0 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-18 13:13 UTC (permalink / raw)
  To: Artem Bityutskiy
  Cc: Kautuk Consul, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Hi Artem,

> Here is a real use-case we had when developing the N900 phone. We had
> internal flash and external microSD slot. Internal flash is soldered in
> and cannot be removed by the user. MicroSD, in contrast, can be removed
> by the user.
> 
> For the internal flash we wanted long intervals and relaxed limits to
> gain better performance.
> 
> For MicroSD we wanted very short intervals and tough limits to make sure
> that if the user suddenly removes his microSD (users do this all the
> time) - we do not lose data.

Thinking twice about it, I find that the different requirements for
internal flash/external microSD can also be met by the following scheme.

Introduce a per-bdi dirty_background_time (and optionally dirty_time)
as the counterpart of (and works in parallel to) global dirty[_background]_ratio,
however with unit "milliseconds worth of data".

The per-bdi dirty_background_time will be set low for external microSD
and high for internal flash. Then you get timely writeouts for microSD
and reasonably delayed writes for internal flash (controllable by the
global dirty_expire_centisecs).

The dirty_background_time will actually work more reliably than
dirty_expire_centisecs because it will be checked immediately after the
application dirties more pages. And the dirty_time could provide a
strong data integrity guarantee -- much stronger than
dirty_expire_centisecs -- if used.

Does that sound reasonable?

Thanks,
Fengguang

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-18 12:35       ` Wu Fengguang
@ 2011-08-18 15:26         ` Kautuk Consul
  -1 siblings, 0 replies; 47+ messages in thread
From: Kautuk Consul @ 2011-08-18 15:26 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner

Please find my comments inline to the email below:

On Thu, Aug 18, 2011 at 6:05 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> On Thu, Aug 18, 2011 at 08:14:57PM +0800, Artem Bityutskiy wrote:
>> On Thu, 2011-08-18 at 17:48 +0800, Wu Fengguang wrote:
>> > > For example, the user might want to write-back pages in smaller
>> > > intervals to a block device which has a
>> > > faster known writeback speed.
>> >
>> > That's not a complete rational. What does the user ultimately want by
>> > setting a smaller interval? What would be the problems to the other
>> > slow devices if the user does so by simply setting a small value
>> > _globally_?
>> >
>> > We need strong use cases for doing such user interface changes.
>> > Would you detail the problem and the pains that can only (or best)
>> > be addressed by this patch?
>>
>> Here is a real use-case we had when developing the N900 phone. We had
>> internal flash and external microSD slot. Internal flash is soldered in
>> and cannot be removed by the user. MicroSD, in contrast, can be removed
>> by the user.

Yes, of course. I had forgotten this aspect.
In fact, I too work on embedded platforms and I have faced this
problem with removable USB
disks. Our embedded applications don't even tell the user when it
would be a good time to remove
the USB stick.
Hence we run into data integrity problems for our filesystems when
some writebacks have not been
completed before removal of the USB disk.
Thanks for mentioning this as this adds to a use-case for this feature.

>>
>> For the internal flash we wanted long intervals and relaxed limits to
>> gain better performance.
>
> Understand -- it's backed by the battery anyway.
>
> Yeah it's a practical way. It might even optimize away some of the
> writes if they are truncated some time later. It also allows possible
> optimization of deferring the writes to user inactive periods.
>
> However the ultimate optimization could be to prioritize READs over
> WRITEs in the IO scheduler, so that async WRITEs have minimal impact
> on normal operations. It's the only option for the MicroSD case,
> anyway.
>
>> For MicroSD we wanted very short intervals and tough limits to make sure
>> that if the user suddenly removes his microSD (users do this all the
>> time) - we do not lose data.
>
> Pretty reasonable.
>
>> The discussed capability would be very useful in that case, AFAICS.
>
> Agreed.
>
>> IOW, this is not only about fast/slow devices and how quickly you want
>> to be able to sync the FS, this is also about data integrity guarantees.
>
> In fact I never think it would matter for fast/slow devices.  It's the

As I mentioned, if there is a comparatively faster device, you might want to set
smaller intervals in which your pages are synced with disk for quicker
memory reclamation
purposes. This can be used on servers that run apps that have high
disk accesses as
well as need a lot of memory. As I explained before, in that case, the
direct reclamation
procedure will cause the usermode apps to sleep while trying to free
up pages by flushing
them to disk via the filesystem's writepage().

> dirty_ratio/dirty_bytes interfaces that ask for improvement if care
> about too many pages being cached.
>

The dirty_ratio/dirty_bytes interface is good as a spatial approach in
terms of number of pages
to actually write after each interval.
This still cannot solve the problem Artem is mentioning, because the
time at which removable disks
can be detached is indeterminable as the user can do this anytime he wants.
Whatever algorithm you use, you will eventually run into some
situation where the user detaches a
disk before the writeback can really happen.
I think it is up to the user/admin to determine how much write-back
interval is actually required for his/her
specific application.

> The intervals interfaces are intended for data integrity and nothing
> more.

Yes. That is correct, but do you feel that this data integrity is
possible in this age of removable
disks ?
That said, I would say that your patches are a very nice spatial
approach to a part of the solution.
Do you feel that combining a temporal approach along with your spatial
pattern analysis technique would
be the best way to ensure data integrity along with proper bandwidth
estimation for specific applications ?

>
> Thanks,
> Fengguang
>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-18 15:26         ` Kautuk Consul
  0 siblings, 0 replies; 47+ messages in thread
From: Kautuk Consul @ 2011-08-18 15:26 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner

Please find my comments inline to the email below:

On Thu, Aug 18, 2011 at 6:05 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> On Thu, Aug 18, 2011 at 08:14:57PM +0800, Artem Bityutskiy wrote:
>> On Thu, 2011-08-18 at 17:48 +0800, Wu Fengguang wrote:
>> > > For example, the user might want to write-back pages in smaller
>> > > intervals to a block device which has a
>> > > faster known writeback speed.
>> >
>> > That's not a complete rational. What does the user ultimately want by
>> > setting a smaller interval? What would be the problems to the other
>> > slow devices if the user does so by simply setting a small value
>> > _globally_?
>> >
>> > We need strong use cases for doing such user interface changes.
>> > Would you detail the problem and the pains that can only (or best)
>> > be addressed by this patch?
>>
>> Here is a real use-case we had when developing the N900 phone. We had
>> internal flash and external microSD slot. Internal flash is soldered in
>> and cannot be removed by the user. MicroSD, in contrast, can be removed
>> by the user.

Yes, of course. I forgot this aspect also.
In fact, I too work on embedded platforms and I have faced this
problem with removable USB
disks. Our embedded applications don't even tell the user when it
would be a good time to remove
the USB stick.
Hence we run into data integrity problems for our filesystems when
some writebacks have not been
completed before removal of the USB disk.
Thanks for mentioning this as this adds to a use-case for this feature.

>>
>> For the internal flash we wanted long intervals and relaxed limits to
>> gain better performance.
>
> Understand -- it's backed by the battery anyway.
>
> Yeah it's a practical way. It might even optimize away some of the
> writes if they are truncated some time later. It also allows possible
> optimization of deferring the writes to user inactive periods.
>
> However the ultimate optimization could be to prioritize READs over
> WRITEs in the IO scheduler, so that async WRITEs have minimal impact
> on normal operations. It's the only option for the MicroSD case,
> anyway.
>
>> For MicroSD we wanted very short intervals and tough limits to make sure
>> that if the user suddenly removes his microSD (users do this all the
>> time) - we do not lose data.
>
> Pretty reasonable.
>
>> The discussed capability would be very useful in that case, AFAICS.
>
> Agreed.
>
>> IOW, this is not only about fast/slow devices and how quickly you want
>> to be able to sync the FS, this is also about data integrity guarantees.
>
> In fact I never think it would matter for fast/slow devices.  It's the

As I mentioned, if there is a comparatively faster device, you might want to set
smaller intervals in which your pages are synced with disk for quicker
memory reclamation
purposes. This can be used on servers that run apps that have high
disk accesses as
well as need a lot of memory. As I explained before, in that case, the
direct reclamation
procedure will cause the usermode apps to sleep while trying to free
up pages by flushing
them to disk via the filesystem's writepage().

> dirty_ratio/dirty_bytes interfaces that ask for improvement if care
> about too many pages being cached.
>

The dirty_ratio/dirty_bytes interface is good as a spatial approach in
terms of number of pages
to actually write after each interval.
This still cannot solve the problem Artem is mentioning, because the
time at which removable disks
can be detached is indeterminable as the user can do this anytime he wants.
Whatever algorithm you use, you will eventually run into some
situation where the user detaches a
disk before the writeback can really happen.
I think it is up to the user/admin to determine how much write-back
interval is actually required for his/her
specific application.

> The intervals interfaces are intended for data integrity and nothing
> more.

Yes. That is correct, but do you feel that this data integrity is
possible in this age of removable
disks ?
That said, I would say that your patches are a very nice spatial
approach to a part of the solution.
Do you feel that combining a temporal approach along with your spatial
pattern analysis technique would
be the best way to ensure data integrity along with proper bandwidth
estimation for specific applications ?

>
> Thanks,
> Fengguang
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-18 13:13       ` Wu Fengguang
@ 2011-08-18 16:25         ` Kautuk Consul
  -1 siblings, 0 replies; 47+ messages in thread
From: Kautuk Consul @ 2011-08-18 16:25 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Hi Wu,

On Thu, Aug 18, 2011 at 6:43 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> Hi Artem,
>
>> Here is a real use-case we had when developing the N900 phone. We had
>> internal flash and external microSD slot. Internal flash is soldered in
>> and cannot be removed by the user. MicroSD, in contrast, can be removed
>> by the user.
>>
>> For the internal flash we wanted long intervals and relaxed limits to
>> gain better performance.
>>
>> For MicroSD we wanted very short intervals and tough limits to make sure
>> that if the user suddenly removes his microSD (users do this all the
>> time) - we do not lose data.
>
> Thinking twice about it, I find that the different requirements for
> internal flash/external microSD can also be solved by this scheme.
>
> Introduce a per-bdi dirty_background_time (and optionally dirty_time)
> as the counterpart of (and works in parallel to) global dirty[_background]_ratio,
> however with unit "milliseconds worth of data".
>
> The per-bdi dirty_background_time will be set low for external microSD
> and high for internal flash. Then you get timely writeouts for microSD
> and reasonably delayed writes for internal flash (controllable by the
> global dirty_expire_centisecs).
>
> The dirty_background_time will actually work more reliably than
> dirty_expire_centisecs because it will be checked immediately after the
> application dirties more pages. And the dirty_time could provide
> strong data integrity guarantee -- much stronger than
> dirty_expire_centisecs -- if used.
>
> Does that sound reasonable?
>
> Thanks,
> Fengguang
>

My understanding of your email is that you are agreeing in
principle that the temporal
aspect of this problem needs to be addressed along with your spatial
pattern analysis technique.

I feel a more generic solution to the problem is required because the
problem faced by Artem can appear
in a different situation for a different application.

I can re-implement my original patch in either centiseconds or
milliseconds as suggested by you.

Kindly advise if my understanding is correct.

Thanks,
Kautuk Consul.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-18 16:25         ` Kautuk Consul
  0 siblings, 0 replies; 47+ messages in thread
From: Kautuk Consul @ 2011-08-18 16:25 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Hi Wu,

On Thu, Aug 18, 2011 at 6:43 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> Hi Artem,
>
>> Here is a real use-case we had when developing the N900 phone. We had
>> internal flash and external microSD slot. Internal flash is soldered in
>> and cannot be removed by the user. MicroSD, in contrast, can be removed
>> by the user.
>>
>> For the internal flash we wanted long intervals and relaxed limits to
>> gain better performance.
>>
>> For MicroSD we wanted very short intervals and tough limits to make sure
>> that if the user suddenly removes his microSD (users do this all the
>> time) - we do not lose data.
>
> Thinking twice about it, I find that the different requirements for
> internal flash/external microSD can also be solved by this scheme.
>
> Introduce a per-bdi dirty_background_time (and optionally dirty_time)
> as the counterpart of (and works in parallel to) global dirty[_background]_ratio,
> however with unit "milliseconds worth of data".
>
> The per-bdi dirty_background_time will be set low for external microSD
> and high for internal flash. Then you get timely writeouts for microSD
> and reasonably delayed writes for internal flash (controllable by the
> global dirty_expire_centisecs).
>
> The dirty_background_time will actually work more reliably than
> dirty_expire_centisecs because it will be checked immediately after the
> application dirties more pages. And the dirty_time could provide
> strong data integrity guarantee -- much stronger than
> dirty_expire_centisecs -- if used.
>
> Does that sound reasonable?
>
> Thanks,
> Fengguang
>

My understanding of your email is that you are agreeing in
principle that the temporal
aspect of this problem needs to be addressed along with your spatial
pattern analysis technique.

I feel a more generic solution to the problem is required because the
problem faced by Artem can appear
in a different situation for a different application.

I can re-implement my original patch in either centiseconds or
milliseconds as suggested by you.

Kindly advise if my understanding is correct.

Thanks,
Kautuk Consul.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-18 15:26         ` Kautuk Consul
  (?)
@ 2011-08-19  2:17           ` Wu Fengguang
  -1 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-19  2:17 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner

On Thu, Aug 18, 2011 at 11:26:29PM +0800, Kautuk Consul wrote:
> Please find my comments inline to the email below:
> 
> On Thu, Aug 18, 2011 at 6:05 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> > On Thu, Aug 18, 2011 at 08:14:57PM +0800, Artem Bityutskiy wrote:
> >> On Thu, 2011-08-18 at 17:48 +0800, Wu Fengguang wrote:
> >> > > For example, the user might want to write-back pages in smaller
> >> > > intervals to a block device which has a
> >> > > faster known writeback speed.
> >> >
> >> > That's not a complete rational. What does the user ultimately want by
> >> > setting a smaller interval? What would be the problems to the other
> >> > slow devices if the user does so by simply setting a small value
> >> > _globally_?
> >> >
> >> > We need strong use cases for doing such user interface changes.
> >> > Would you detail the problem and the pains that can only (or best)
> >> > be addressed by this patch?
> >>
> >> Here is a real use-case we had when developing the N900 phone. We had
> >> internal flash and external microSD slot. Internal flash is soldered in
> >> and cannot be removed by the user. MicroSD, in contrast, can be removed
> >> by the user.
> 
> Yes, of course. I forgot this aspect also.
> In fact I, too work on embedded platforms and I have faced this
> problem with removable USB
> disks. Our embedded applications don't even tell the user when it
> would be a good time to remove
> the USB stick.
> Hence we run into data integrity problems for our filesystems when
> some writebacks have not been
> completed before removal of the USB disk.
> Thanks for mentioning this as this adds to a use-case for this feature.

For the removable USB disks, we can adopt a policy that sets
dirty_background_time = 0.

This will work better than hacking the dirty intervals. For one thing, it's
impractical to set the latter to tiny values so as to avoid excessive wakeups.

And the intervals interface is never a guarantee. dirty_expire_interval only
promises to _start_ writeback on the expired inodes in "best efforts" way.
Only the dirty_ratio interface guarantees to keep the number of pages under the
limit, hence limiting the most data that can be lost in hot removal events.

> >> For the internal flash we wanted long intervals and relaxed limits to
> >> gain better performance.
> >
> > Understand -- it's backed by the battery anyway.
> >
> > Yeah it's a practical way. It might even optimize away some of the
> > writes if they are truncated some time later. It also allows possible
> > optimization of deferring the writes to user inactive periods.
> >
> > However the ultimate optimization could be to prioritize READs over
> > WRITEs in the IO scheduler, so that async WRITEs have minimal impact
> > on normal operations. It's the only option for the MicroSD case,
> > anyway.
> >
> >> For MicroSD we wanted very short intervals and tough limits to make sure
> >> that if the user suddenly removes his microSD (users do this all the
> >> time) - we do not lose data.
> >
> > Pretty reasonable.
> >
> >> The discussed capability would be very useful in that case, AFAICS.
> >
> > Agreed.
> >
> >> IOW, this is not only about fast/slow devices and how quickly you want
> >> to be able to sync the FS, this is also about data integrity guarantees.
> >
> > In fact I never think it would matter for fast/slow devices.  It's the
> 
> As I mentioned, if there is a comparitively faster device, you might want to set
> smaller intervals in which your pages are synced with disk for quicker
> memory reclamation
> purposes. This can be used on servers that run apps that have high
> disk accesses as
> well as need a lot of memory. As I explained before, in that case, the
> direct reclamation
> procedure will cause the usermode apps to sleep while trying to free
> up pages by flushing
> them to disk via the filesystem's writepage().

Here you want to limit the number of dirty pages to reduce the
chances that page reclaim runs into them. Again, the right interface for
doing this job is dirty_ratio. Or if you need to do it per-bdi, it
will be some per-bdi dirty_time interface that works in parallel with
dirty_ratio, whatever smaller value will take effect.

> > dirty_ratio/dirty_bytes interfaces that ask for improvement if care
> > about too many pages being cached.
> >
> 
> The dirty_ratio/dirty_bytes interface is good as a spatial approach in
> terms of number of pages
> to actually write after each interval.
> This still cannot solve the problem Artem is mentioning, because the
> time at which removable disks
> can be detached is indeterminable as the user can do this anytime he wants.
> Whatever algorithm you use, you will eventually run into some
> situation where the user detaches a
> disk before the writeback can really happen.
> I think it is up to the user/admin to determine how much write-back
> interval is actually required for his/her
> specific application.

You seem to mis-understand how the dirty intervals sysctl values are
carried out and have rather high expectations for them...

What can be best done for removable disk, in terms of data integrity,
is to immediately start writeout IO for any newly dirtied pages. Which
can only be provided by the provisioned per-bdi dirty_background_time
interface (by setting it to 0 for USB disks).

> > The intervals interfaces are intended for data integrity and nothing
> > more.
> 
> Yes. That is correct, but do you feel that this data integrity is
> possible in this age of removable
> disks ?
> That said, I would say that your patches are a very nice spatial
> approach to a part of the solution.
> Do you feel that combining a temporal approach along with your spatial
> pattern analysis technique would
> be the best way to ensure data integrity along with proper bandwidth
> estimation for specific applications ?

There will be no spatial/temporal difference when
dirty_background_time=0, which is exactly what we can best do to
protect data for removable disks.

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-19  2:17           ` Wu Fengguang
  0 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-19  2:17 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner

On Thu, Aug 18, 2011 at 11:26:29PM +0800, Kautuk Consul wrote:
> Please find my comments inline to the email below:
> 
> On Thu, Aug 18, 2011 at 6:05 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> > On Thu, Aug 18, 2011 at 08:14:57PM +0800, Artem Bityutskiy wrote:
> >> On Thu, 2011-08-18 at 17:48 +0800, Wu Fengguang wrote:
> >> > > For example, the user might want to write-back pages in smaller
> >> > > intervals to a block device which has a
> >> > > faster known writeback speed.
> >> >
> >> > That's not a complete rational. What does the user ultimately want by
> >> > setting a smaller interval? What would be the problems to the other
> >> > slow devices if the user does so by simply setting a small value
> >> > _globally_?
> >> >
> >> > We need strong use cases for doing such user interface changes.
> >> > Would you detail the problem and the pains that can only (or best)
> >> > be addressed by this patch?
> >>
> >> Here is a real use-case we had when developing the N900 phone. We had
> >> internal flash and external microSD slot. Internal flash is soldered in
> >> and cannot be removed by the user. MicroSD, in contrast, can be removed
> >> by the user.
> 
> Yes, of course. I forgot this aspect also.
> In fact I, too work on embedded platforms and I have faced this
> problem with removable USB
> disks. Our embedded applications don't even tell the user when it
> would be a good time to remove
> the USB stick.
> Hence we run into data integrity problems for our filesystems when
> some writebacks have not been
> completed before removal of the USB disk.
> Thanks for mentioning this as this adds to a use-case for this feature.

For the removable USB disks, we can adopt a policy that sets
dirty_background_time = 0.

This will work better than hacking the dirty intervals. For one thing, it's
impractical to set the latter to tiny values so as to avoid excessive wakeups.

And the intervals interface is never a guarantee. dirty_expire_interval only
promises to _start_ writeback on the expired inodes in "best efforts" way.
Only the dirty_ratio interface guarantees to keep the number of pages under the
limit, hence limiting the most data that can be lost in hot removal events.

> >> For the internal flash we wanted long intervals and relaxed limits to
> >> gain better performance.
> >
> > Understand -- it's backed by the battery anyway.
> >
> > Yeah it's a practical way. It might even optimize away some of the
> > writes if they are truncated some time later. It also allows possible
> > optimization of deferring the writes to user inactive periods.
> >
> > However the ultimate optimization could be to prioritize READs over
> > WRITEs in the IO scheduler, so that async WRITEs have minimal impact
> > on normal operations. It's the only option for the MicroSD case,
> > anyway.
> >
> >> For MicroSD we wanted very short intervals and tough limits to make sure
> >> that if the user suddenly removes his microSD (users do this all the
> >> time) - we do not lose data.
> >
> > Pretty reasonable.
> >
> >> The discussed capability would be very useful in that case, AFAICS.
> >
> > Agreed.
> >
> >> IOW, this is not only about fast/slow devices and how quickly you want
> >> to be able to sync the FS, this is also about data integrity guarantees.
> >
> > In fact I never think it would matter for fast/slow devices.  It's the
> 
> As I mentioned, if there is a comparitively faster device, you might want to set
> smaller intervals in which your pages are synced with disk for quicker
> memory reclamation
> purposes. This can be used on servers that run apps that have high
> disk accesses as
> well as need a lot of memory. As I explained before, in that case, the
> direct reclamation
> procedure will cause the usermode apps to sleep while trying to free
> up pages by flushing
> them to disk via the filesystem's writepage().

Here you want to limit the number of dirty pages to reduce the
chances that page reclaim runs into them. Again, the right interface for
doing this job is dirty_ratio. Or if you need to do it per-bdi, it
will be some per-bdi dirty_time interface that works in parallel with
dirty_ratio, whatever smaller value will take effect.

> > dirty_ratio/dirty_bytes interfaces that ask for improvement if care
> > about too many pages being cached.
> >
> 
> The dirty_ratio/dirty_bytes interface is good as a spatial approach in
> terms of number of pages
> to actually write after each interval.
> This still cannot solve the problem Artem is mentioning, because the
> time at which removable disks
> can be detached is indeterminable as the user can do this anytime he wants.
> Whatever algorithm you use, you will eventually run into some
> situation where the user detaches a
> disk before the writeback can really happen.
> I think it is up to the user/admin to determine how much write-back
> interval is actually required for his/her
> specific application.

You seem to mis-understand how the dirty intervals sysctl values are
carried out and have rather high expectations for them...

What can be best done for removable disk, in terms of data integrity,
is to immediately start writeout IO for any newly dirtied pages. Which
can only be provided by the provisioned per-bdi dirty_background_time
interface (by setting it to 0 for USB disks).

> > The intervals interfaces are intended for data integrity and nothing
> > more.
> 
> Yes. That is correct, but do you feel that this data integrity is
> possible in this age of removable
> disks ?
> That said, I would say that your patches are a very nice spatial
> approach to a part of the solution.
> Do you feel that combining a temporal approach along with your spatial
> pattern analysis technique would
> be the best way to ensure data integrity along with proper bandwidth
> estimation for specific applications ?

There will be no spatial/temporal difference when
dirty_background_time=0, which is exactly what we can best do to
protect data for removable disks.

Thanks,
Fengguang

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-19  2:17           ` Wu Fengguang
  0 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-19  2:17 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner

On Thu, Aug 18, 2011 at 11:26:29PM +0800, Kautuk Consul wrote:
> Please find my comments inline to the email below:
> 
> On Thu, Aug 18, 2011 at 6:05 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> > On Thu, Aug 18, 2011 at 08:14:57PM +0800, Artem Bityutskiy wrote:
> >> On Thu, 2011-08-18 at 17:48 +0800, Wu Fengguang wrote:
> >> > > For example, the user might want to write-back pages in smaller
> >> > > intervals to a block device which has a
> >> > > faster known writeback speed.
> >> >
> >> > That's not a complete rational. What does the user ultimately want by
> >> > setting a smaller interval? What would be the problems to the other
> >> > slow devices if the user does so by simply setting a small value
> >> > _globally_?
> >> >
> >> > We need strong use cases for doing such user interface changes.
> >> > Would you detail the problem and the pains that can only (or best)
> >> > be addressed by this patch?
> >>
> >> Here is a real use-case we had when developing the N900 phone. We had
> >> internal flash and external microSD slot. Internal flash is soldered in
> >> and cannot be removed by the user. MicroSD, in contrast, can be removed
> >> by the user.
> 
> Yes, of course. I forgot this aspect also.
> In fact I, too work on embedded platforms and I have faced this
> problem with removable USB
> disks. Our embedded applications don't even tell the user when it
> would be a good time to remove
> the USB stick.
> Hence we run into data integrity problems for our filesystems when
> some writebacks have not been
> completed before removal of the USB disk.
> Thanks for mentioning this as this adds to a use-case for this feature.

For the removable USB disks, we can adopt a policy that sets
dirty_background_time = 0.

This will work better than hacking the dirty intervals. For one thing, it's
impractical to set the latter to tiny values so as to avoid excessive wakeups.

And the intervals interface is never a guarantee. dirty_expire_interval only
promises to _start_ writeback on the expired inodes in "best efforts" way.
Only the dirty_ratio interface guarantees to keep the number of pages under the
limit, hence limiting the most data that can be lost in hot removal events.

> >> For the internal flash we wanted long intervals and relaxed limits to
> >> gain better performance.
> >
> > Understand -- it's backed by the battery anyway.
> >
> > Yeah it's a practical way. It might even optimize away some of the
> > writes if they are truncated some time later. It also allows possible
> > optimization of deferring the writes to user inactive periods.
> >
> > However the ultimate optimization could be to prioritize READs over
> > WRITEs in the IO scheduler, so that async WRITEs have minimal impact
> > on normal operations. It's the only option for the MicroSD case,
> > anyway.
> >
> >> For MicroSD we wanted very short intervals and tough limits to make sure
> >> that if the user suddenly removes his microSD (users do this all the
> >> time) - we do not lose data.
> >
> > Pretty reasonable.
> >
> >> The discussed capability would be very useful in that case, AFAICS.
> >
> > Agreed.
> >
> >> IOW, this is not only about fast/slow devices and how quickly you want
> >> to be able to sync the FS, this is also about data integrity guarantees.
> >
> > > In fact I never think it would matter for fast/slow devices.  It's the
> 
> As I mentioned, if there is a comparitively faster device, you might want to set
> smaller intervals in which your pages are synced with disk for quicker
> memory reclamation
> purposes. This can be used on servers that run apps that have high
> disk accesses as
> well as need a lot of memory. As I explained before, in that case, the
> direct reclamation
> procedure will cause the usermode apps to sleep while trying to free
> up pages by flushing
> them to disk via the filesystem's writepage().

Here you want to limit the number of dirty pages to reduce the
chances that page reclaim runs into them. Again, the right interface for
doing this job is dirty_ratio. Or if you need to do it per-bdi, it
will be some per-bdi dirty_time interface that works in parallel with
dirty_ratio, whatever smaller value will take effect.

> > dirty_ratio/dirty_bytes interfaces that ask for improvement if care
> > about too many pages being cached.
> >
> 
> The dirty_ratio/dirty_bytes interface is good as a spatial approach in
> terms of number of pages
> to actually write after each interval.
> This still cannot solve the problem Artem is mentioning, because the
> time at which removable disks
> can be detached is indeterminable as the user can do this anytime he wants.
> Whatever algorithm you use, you will eventually run into some
> situation where the user detaches a
> disk before the writeback can really happen.
> I think it is up to the user/admin to determine how much write-back
> interval is actually required for his/her
> specific application.

You seem to mis-understand how the dirty intervals sysctl values are
carried out and have rather high expectations for them...

What can be best done for removable disk, in terms of data integrity,
is to immediately start writeout IO for any newly dirtied pages. Which
can only be provided by the provisioned per-bdi dirty_background_time
interface (by setting it to 0 for USB disks).

> > The intervals interfaces are intended for data integrity and nothing
> > more.
> 
> Yes. That is correct, but do you feel that this data integrity is
> possible in this age of removable
> disks ?
> That said, I would say that your patches are a very nice spatial
> approach to a part of the solution.
> Do you feel that combining a temporal approach along with your spatial
> pattern analysis technique would
> be the best way to ensure data integrity along with proper bandwidth
> estimation for specific applications ?

There will be no spatial/temporal difference when
dirty_background_time=0, which is exactly what we can best do to
protect data for removable disks.

Thanks,
Fengguang

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-18 16:25         ` Kautuk Consul
@ 2011-08-19  2:34           ` Wu Fengguang
  -1 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-19  2:34 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Hi Kautuk,

On Fri, Aug 19, 2011 at 12:25:58AM +0800, Kautuk Consul wrote:
> 
> Lines: 59
> 
> Hi Wu,
> 
> On Thu, Aug 18, 2011 at 6:43 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> > Hi Artem,
> >
> >> Here is a real use-case we had when developing the N900 phone. We had
> >> internal flash and external microSD slot. Internal flash is soldered in
> >> and cannot be removed by the user. MicroSD, in contrast, can be removed
> >> by the user.
> >>
> >> For the internal flash we wanted long intervals and relaxed limits to
> >> gain better performance.
> >>
> >> For MicroSD we wanted very short intervals and tough limits to make sure
> >> that if the user suddenly removes his microSD (users do this all the
> >> time) - we do not lose data.
> >
> > Thinking twice about it, I find that the different requirements for
> > interval flash/external microSD can also be solved by this scheme.
> >
> > Introduce a per-bdi dirty_background_time (and optionally dirty_time)
> > as the counterpart of (and works in parallel to) global dirty[_background]_ratio,
> > however with unit "milliseconds worth of data".
> >
> > The per-bdi dirty_background_time will be set low for external microSD
> > and high for internal flash. Then you get timely writeouts for microSD
> > and reasonably delayed writes for internal flash (controllable by the
> > global dirty_expire_centisecs).
> >
> > The dirty_background_time will actually work more reliable than
> > dirty_expire_centisecs because it will checked immediately after the
> > application dirties more pages. And the dirty_time could provide
> > strong data integrity guarantee -- much stronger than
> > dirty_expire_centisecs -- if used.
> >
> > Does that sound reasonable?
> >
> > Thanks,
> > Fengguang
> >
> 
> My understanding of your email appears that you are agreeing in
> principle that the temporal
> aspect of this problem needs to be addressed along with your spatial
> pattern analysis technique.

Yup.

> I feel a more generic solution to the problem is required because the
> problem faced by Artem can appear
> in a different situation for a different application.
> 
> I can re-implement my original patch in either centiseconds or
> milliseconds as suggested by you.

My concern with your patch is the possible conflicts and confusion
between the global and the per-bdi dirty_expire_centisecs. To maintain
compatibility you need to keep the global one. Then there is the hard
question of "what to do with the per-bdi values when the global value
is changed". Whatever policy you choose, there will be behaviors that
users do not expect.

I don't like such conflicting/inconsistent interfaces.

Given that we'll need to introduce the dirty_background_time interface
anyway, and it happens to be able to address the N900 internal/removable
storage problem (mostly), I'm more than glad to cancel the
dirty_expire_centisecs problem.

Or, do you have a better way out of the dirty_expire_centisecs dilemma?

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-19  2:34           ` Wu Fengguang
  0 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-19  2:34 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Hi Kautuk,

On Fri, Aug 19, 2011 at 12:25:58AM +0800, Kautuk Consul wrote:
> 
> Lines: 59
> 
> Hi Wu,
> 
> On Thu, Aug 18, 2011 at 6:43 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> > Hi Artem,
> >
> >> Here is a real use-case we had when developing the N900 phone. We had
> >> internal flash and external microSD slot. Internal flash is soldered in
> >> and cannot be removed by the user. MicroSD, in contrast, can be removed
> >> by the user.
> >>
> >> For the internal flash we wanted long intervals and relaxed limits to
> >> gain better performance.
> >>
> >> For MicroSD we wanted very short intervals and tough limits to make sure
> >> that if the user suddenly removes his microSD (users do this all the
> >> time) - we do not lose data.
> >
> > Thinking twice about it, I find that the different requirements for
> > interval flash/external microSD can also be solved by this scheme.
> >
> > Introduce a per-bdi dirty_background_time (and optionally dirty_time)
> > as the counterpart of (and works in parallel to) global dirty[_background]_ratio,
> > however with unit "milliseconds worth of data".
> >
> > The per-bdi dirty_background_time will be set low for external microSD
> > and high for internal flash. Then you get timely writeouts for microSD
> > and reasonably delayed writes for internal flash (controllable by the
> > global dirty_expire_centisecs).
> >
> > The dirty_background_time will actually work more reliable than
> > dirty_expire_centisecs because it will checked immediately after the
> > application dirties more pages. And the dirty_time could provide
> > strong data integrity guarantee -- much stronger than
> > dirty_expire_centisecs -- if used.
> >
> > Does that sound reasonable?
> >
> > Thanks,
> > Fengguang
> >
> 
> My understanding of your email appears that you are agreeing in
> principle that the temporal
> aspect of this problem needs to be addressed along with your spatial
> pattern analysis technique.

Yup.

> I feel a more generic solution to the problem is required because the
> problem faced by Artem can appear
> in a different situation for a different application.
> 
> I can re-implement my original patch in either centiseconds or
> milliseconds as suggested by you.

My concern with your patch is the possible conflicts and confusion
between the global and the per-bdi dirty_expire_centisecs. To maintain
compatibility you need to keep the global one. Then there is the hard
question of "what to do with the per-bdi values when the global value
is changed". Whatever policy you choose, there will be behaviors that
users do not expect.

I don't like such conflicting/inconsistent interfaces.

Given that we'll need to introduce the dirty_background_time interface
anyway, and it happens to be able to address the N900 internal/removable
storage problem (mostly), I'm more than glad to cancel the
dirty_expire_centisecs problem.

Or, do you have a better way out of the dirty_expire_centisecs dilemma?

Thanks,
Fengguang

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-19  2:34           ` Wu Fengguang
@ 2011-08-19  4:38             ` Kautuk Consul
  -1 siblings, 0 replies; 47+ messages in thread
From: Kautuk Consul @ 2011-08-19  4:38 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Hi Wu,

On Fri, Aug 19, 2011 at 8:04 AM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> Hi Kautuk,
>
> On Fri, Aug 19, 2011 at 12:25:58AM +0800, Kautuk Consul wrote:
>>
>> Lines: 59
>>
>> Hi Wu,
>>
>> On Thu, Aug 18, 2011 at 6:43 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
>> > Hi Artem,
>> >
>> >> Here is a real use-case we had when developing the N900 phone. We had
>> >> internal flash and external microSD slot. Internal flash is soldered in
>> >> and cannot be removed by the user. MicroSD, in contrast, can be removed
>> >> by the user.
>> >>
>> >> For the internal flash we wanted long intervals and relaxed limits to
>> >> gain better performance.
>> >>
>> >> For MicroSD we wanted very short intervals and tough limits to make sure
>> >> that if the user suddenly removes his microSD (users do this all the
>> >> time) - we do not lose data.
>> >
>> > Thinking twice about it, I find that the different requirements for
>> > interval flash/external microSD can also be solved by this scheme.
>> >
>> > Introduce a per-bdi dirty_background_time (and optionally dirty_time)
>> > as the counterpart of (and works in parallel to) global dirty[_background]_ratio,
>> > however with unit "milliseconds worth of data".
>> >
>> > The per-bdi dirty_background_time will be set low for external microSD
>> > and high for internal flash. Then you get timely writeouts for microSD
>> > and reasonably delayed writes for internal flash (controllable by the
>> > global dirty_expire_centisecs).
>> >
>> > The dirty_background_time will actually work more reliable than
>> > dirty_expire_centisecs because it will checked immediately after the
>> > application dirties more pages. And the dirty_time could provide
>> > strong data integrity guarantee -- much stronger than
>> > dirty_expire_centisecs -- if used.

The dirty_writeback_centisecs is the value we are also actually
interested in, and not just
dirty_expire_interval. This value is what is actually used to reset
the per-BDI timeout in the code.

>> >
>> > Does that sound reasonable?
>> >
>> > Thanks,
>> > Fengguang
>> >
>>
>> My understanding of your email appears that you are agreeing in
>> principle that the temporal
>> aspect of this problem needs to be addressed along with your spatial
>> pattern analysis technique.
>
> Yup.
>
>> I feel a more generic solution to the problem is required because the
>> problem faced by Artem can appear
>> in a different situation for a different application.
>>
>> I can re-implement my original patch in either centiseconds or
>> milliseconds as suggested by you.
>
> My concern on your patch is the possible conflicts and confusions
> between the global and the per-bdi dirty_expire_centisecs. To maintain
> compatibility you need to keep the global one. Then there is the hard

If you refer to my original email, I have addressed this as follows:
When the global value is set, then all the per-BDI dirty*_centisecs
are also reset
to the global value.
This is essential for retaining the functionality across Linux
distributions using
the global values.
This amounts to compatibility as the global values will take effect.
After that point, if the user/admin feels, he/she can adjust/tune the
per-BDI counters to
certain empirical value as per the specific application. This will not
alter the global values.

> question of "what to do with the per-bdi values when the global value
> is changed". Whatever policy you choose, there will be user unexpected
> behaviors.
>

How ?
Of course, if the user tuned some per-BDI values and then chose to
reset the global values
we need to reset the per-BDI interfaces also as that is what the
original functionality of those
counters is.
Worst case scenario : The timeout might not take effect immediately as
per the newer global value.
The first timeout might still happen as per the older per-BDI value.

Individual per-BDI tuning should be done after the global values have been set.
Worst case scenario: Again, the timeout might not take effect
immediately as per the newer
per-BDI value. This first timeout might still happen as per the older
global value that the per-BDI had
before its individual tuning.

Both of the above worst case scenarios can lead to unexpected
behaviours but for short intervals
and only in the first timeout.
i)  The above timeout scenario can also happen if you don't alter this
interface.
      The first timeout might be at the end of the older interval time
and only after that the new value will
       take effect in terms of intervals.
ii)   Since these values would be quite important for the overall
functionality of the device, I don't
expect that the globals and the individual values would be frequently set/reset.
iii)   Anyways, only an advanced user would try to tune these per-BDI
values and would take care of
the point at which she/he set/reset these values in the system.
Or, maybe we solve this by fiddling around with the timeout values to
modify/cancel the timer based
on the new value ?

Is there any other possible worst-case scenario I left out ?

> I don't like such conflicting/inconsistent interfaces.
>

Well, I believe that the inconsistency or the lack of functionality
existed earlier, when global
values were all that existed. When the logical decision of having
per-block device threads
came around most of the /proc/sys/vm/dirty_* functionality can and
needs to be split.
This decision is as natural as your decision to have per-BDI dirty
bandwidth estimation.
Again, this problem is due to the advent of removable disk devices and
needs to be addressed
at a per-BDI level.

> Given that we'll need to introduce the dirty_background_time interface
> anyway, and it happen to can address the N900 internal/removable storage
> problem (mostly), I'm more than glad to cancel the dirty_expire_centisecs
> problem.
>

I have the following doubts with respect to your dirty_background_time
interface suggestion:
i)   You say that you'll do this only for the N900 problem for solving
the unexpected disk removal
      problem.
      I believe you are ignoring the problem of rate of undirtying of
the block device pages for
      making reclamation of that block device's file-cache pages at a
sooner possible future time.
      I mentioned this in my earlier emails also.
ii)   Will you be changing the dirty_background_time dynamically with
your algorithm ?
      According to your description, I think not.
iii)  I cannot see how your implementation of dirty_background_time is
different from mine, except
      maybe for the first time interval taking effect properly.
      However, we can also think that the first time interval should
probably be honoured with the older
      value to make the transition from the old timer value to new
timer value smoother in terms of
      periodic writeback functionality.

> Or, do you have better way out of the dirty_expire_centisecs dilemma?
>

Maybe we can delete the global value entirely. However, as you
correctly mentioned above, this
will impact other tools and distributions that alter these global values.

You mentioned the close relationship between the dirty_background_time
and the global dirty
[_background]_ratio.
Do you mean to say that you intend to change the dirty_background_time
based on changes to
the dirty_background_ratio ?
Since the global dirty_background_ratio doesn't result in changes to
the dirty_writeback_centisecs
wouldn't this amount to a radical change in the existing relationship
of these configurable values ?

> Thanks,
> Fengguang
>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-19  4:38             ` Kautuk Consul
  0 siblings, 0 replies; 47+ messages in thread
From: Kautuk Consul @ 2011-08-19  4:38 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Hi Wu,

On Fri, Aug 19, 2011 at 8:04 AM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> Hi Kautuk,
>
> On Fri, Aug 19, 2011 at 12:25:58AM +0800, Kautuk Consul wrote:
>>
>> Lines: 59
>>
>> Hi Wu,
>>
>> On Thu, Aug 18, 2011 at 6:43 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
>> > Hi Artem,
>> >
>> >> Here is a real use-case we had when developing the N900 phone. We had
>> >> internal flash and external microSD slot. Internal flash is soldered in
>> >> and cannot be removed by the user. MicroSD, in contrast, can be removed
>> >> by the user.
>> >>
>> >> For the internal flash we wanted long intervals and relaxed limits to
>> >> gain better performance.
>> >>
>> >> For MicroSD we wanted very short intervals and tough limits to make sure
>> >> that if the user suddenly removes his microSD (users do this all the
>> >> time) - we do not lose data.
>> >
>> > Thinking twice about it, I find that the different requirements for
>> > interval flash/external microSD can also be solved by this scheme.
>> >
>> > Introduce a per-bdi dirty_background_time (and optionally dirty_time)
>> > as the counterpart of (and works in parallel to) global dirty[_background]_ratio,
>> > however with unit "milliseconds worth of data".
>> >
>> > The per-bdi dirty_background_time will be set low for external microSD
>> > and high for internal flash. Then you get timely writeouts for microSD
>> > and reasonably delayed writes for internal flash (controllable by the
>> > global dirty_expire_centisecs).
>> >
>> > The dirty_background_time will actually work more reliable than
>> > dirty_expire_centisecs because it will checked immediately after the
>> > application dirties more pages. And the dirty_time could provide
>> > strong data integrity guarantee -- much stronger than
>> > dirty_expire_centisecs -- if used.

The dirty_writeback_centisecs is the value we are also actually
interested in, and not just
dirty_expire_interval. This value is what is actually used to reset
the per-BDI timeout in the code.

>> >
>> > Does that sound reasonable?
>> >
>> > Thanks,
>> > Fengguang
>> >
>>
>> My understanding of your email appears that you are agreeing in
>> principle that the temporal
>> aspect of this problem needs to be addressed along with your spatial
>> pattern analysis technique.
>
> Yup.
>
>> I feel a more generic solution to the problem is required because the
>> problem faced by Artem can appear
>> in a different situation for a different application.
>>
>> I can re-implement my original patch in either centiseconds or
>> milliseconds as suggested by you.
>
> My concern on your patch is the possible conflicts and confusions
> between the global and the per-bdi dirty_expire_centisecs. To maintain
> compatibility you need to keep the global one. Then there is the hard

If you refer to my original email, I have addressed this as follows:
When the global value is set, then all the per-BDI dirty*_centisecs
are also reset
to the global value.
This is essential for retaining the functionality across Linux
distributions using
the global values.
This amounts to compatibility as the global values will take effect.
After that point, if the user/admin feels, he/she can adjust/tune the
per-BDI counters to
certain empirical value as per the specific application. This will not
alter the global values.

> question of "what to do with the per-bdi values when the global value
> is changed". Whatever policy you choose, there will be user unexpected
> behaviors.
>

How ?
Of course, if the user tuned some per-BDI values and then chose to
reset the global values
we need to reset the per-BDI interfaces also as that is what the
original functionality of those
counters is.
Worst case scenario : The timeout might not take effect immediately as
per the newer global value.
The first timeout might still happen as per the older per-BDI value.

Individual per-BDI tuning should be done after the global values have been set.
Worst case scenario: Again, the timeout might not take effect
immediately as per the newer
per-BDI value. This first timeout might still happen as per the older
global value that the per-BDI had
before its individual tuning.

Both of the above worst case scenarios can lead to unexpected
behaviours but for short intervals
and only in the first timeout.
i)  The above timeout scenario can also happen if you don't alter this
interface.
      The first timeout might be at the end of the older interval time
and only after that the new value will
       take effect in terms of intervals.
ii)   Since these values would be quite important for the overall
functionality of the device, I don't
expect that the globals and the individual values would be frequently set/reset.
iii)   Anyways, only an advanced user would try to tune these per-BDI
values and would take care of
the point at which she/he set/reset these values in the system.
Or, maybe we solve this by fiddling around with the timeout values to
modify/cancel the timer based
on the new value ?

Is there any other possible worst-case scenario I left out ?

> I don't like such conflicting/inconsistent interfaces.
>

Well, I believe that the inconsistency or the lack of functionality
existed earlier, when global
values were all that existed. When the logical decision of having
per-block device threads
came around most of the /proc/sys/vm/dirty_* functionality can and
needs to be split.
This decision is as natural as your decision to have per-BDI dirty
bandwidth estimation.
Again, this problem is due to the advent of removable disk devices and
needs to be addressed
at a per-BDI level.

> Given that we'll need to introduce the dirty_background_time interface
> anyway, and it happen to can address the N900 internal/removable storage
> problem (mostly), I'm more than glad to cancel the dirty_expire_centisecs
> problem.
>

I have the following doubts with respect to your dirty_background_time
interface suggestion:
i)   You say that you'll do this only for the N900 problem for solving
the unexpected disk removal
      problem.
      I believe you are ignoring the problem of rate of undirtying of
the block device pages for
      making reclamation of that block device's file-cache pages at a
sooner possible future time.
      I mentioned this in my earlier emails also.
ii)   Will you be changing the dirty_background_time dynamically with
your algorithm ?
      According to your description, I think not.
iii)  I cannot see how your implementation of dirty_background_time is
different from mine, except
      maybe for the first time interval taking effect properly.
      However, we can also think that the first time interval should
probably be honoured with the older
      value to make the transition from the old timer value to new
timer value smoother in terms of
      periodic writeback functionality.

> Or, do you have better way out of the dirty_expire_centisecs dilemma?
>

Maybe we can delete the global value entirely. However, as you
correctly mentioned above, this
will impact other tools and distributions that alter these global values.

You mentioned the close relationship between the dirty_background_time
and the global dirty
[_background]_ratio.
Do you mean to say that you intend to change the dirty_background_time
based on changes to
the dirty_background_ratio ?
Since the global dirty_background_ratio doesn't result in changes to
the dirty_writeback_centisecs
wouldn't this amount to a radical change in the existing relationship
of these configurable values ?

> Thanks,
> Fengguang
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-19  4:38             ` Kautuk Consul
@ 2011-08-19  5:28               ` Wu Fengguang
  -1 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-19  5:28 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

On Fri, Aug 19, 2011 at 12:38:36PM +0800, Kautuk Consul wrote:
> HI Wu,
> 
> On Fri, Aug 19, 2011 at 8:04 AM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> > Hi Kautuk,
> >
> > On Fri, Aug 19, 2011 at 12:25:58AM +0800, Kautuk Consul wrote:
> >>
> >> Lines: 59
> >>
> >> Hi Wu,
> >>
> >> On Thu, Aug 18, 2011 at 6:43 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> >> > Hi Artem,
> >> >
> >> >> Here is a real use-case we had when developing the N900 phone. We had
> >> >> internal flash and external microSD slot. Internal flash is soldered in
> >> >> and cannot be removed by the user. MicroSD, in contrast, can be removed
> >> >> by the user.
> >> >>
> >> >> For the internal flash we wanted long intervals and relaxed limits to
> >> >> gain better performance.
> >> >>
> >> >> For MicroSD we wanted very short intervals and tough limits to make sure
> >> >> that if the user suddenly removes his microSD (users do this all the
> >> >> time) - we do not lose data.
> >> >
> >> > Thinking twice about it, I find that the different requirements for
> >> > interval flash/external microSD can also be solved by this scheme.
> >> >
> >> > Introduce a per-bdi dirty_background_time (and optionally dirty_time)
> >> > as the counterpart of (and works in parallel to) global dirty[_background]_ratio,
> >> > however with unit "milliseconds worth of data".
> >> >
> >> > The per-bdi dirty_background_time will be set low for external microSD
> >> > and high for internal flash. Then you get timely writeouts for microSD
> >> > and reasonably delayed writes for internal flash (controllable by the
> >> > global dirty_expire_centisecs).
> >> >
> >> > The dirty_background_time will actually work more reliable than
> >> > dirty_expire_centisecs because it will checked immediately after the
> >> > application dirties more pages. And the dirty_time could provide
> >> > strong data integrity guarantee -- much stronger than
> >> > dirty_expire_centisecs -- if used.
> 
> The dirty_writeback_centisecs is the value we are also actually
> interested in, and not just
> dirty_expire_interval. This value is what is actually used to reset
> the per-BDI timeout in the code.

Yes. I assumed if one reduced dirty_expire_centisecs, he may well want
to reduce dirty_writeback_centisecs.

> >> >
> >> > Does that sound reasonable?
> >> >
> >> > Thanks,
> >> > Fengguang
> >> >
> >>
> >> My understanding of your email appears that you are agreeing in
> >> principle that the temporal
> >> aspect of this problem needs to be addressed along with your spatial
> >> pattern analysis technique.
> >
> > Yup.
> >
> >> I feel a more generic solution to the problem is required because the
> >> problem faced by Artem can appear
> >> in a different situation for a different application.
> >>
> >> I can re-implement my original patch in either centiseconds or
> >> milliseconds as suggested by you.
> >
> > My concern on your patch is the possible conflicts and confusions
> > between the global and the per-bdi dirty_expire_centisecs. To maintain
> > compatibility you need to keep the global one. Then there is the hard
> 
> If you refer to my original email, I have addressed this as follows:
> When the global value is set, then all the per-BDI dirty*_centisecs
> are also reset
> to the global value.
> This is essential for retaining the functionality across Linux
> distributions using
> the global values.
> This amounts to compatibility as the global values will take effect.
> After that point, if the user/admin feels, he/she can adjust/tune the
> per-BDI counters to
> certain empirical value as per the specific application. This will not
> alter the global values.

Such "resetting all" behavior could be disgusting. Some users without
the global view may be puzzled why their set value is lost.

A better scheme would be to use the bdi value if it's non-zero, and
fall back to the global value otherwise. This will reduce complexity
of the code as well as interface.

> > Given that we'll need to introduce the dirty_background_time interface
> > anyway, and it happen to can address the N900 internal/removable storage
> > problem (mostly), I'm more than glad to cancel the dirty_expire_centisecs
> > problem.
> >
> 
> I have following doubts with respect to your dirty_background_time
> interface suggestion:
> i)   You say that you'll do this only for the N900 problem for solving
> the unexpected disk removal
>       problem.
>       I believe you are ignoring the problem of rate of undirtying of
> the block device pages for
>       making reclamation of that block device's file-cache pages at a
> sooner possible future time.
>       I mentioned this in my earlier emails also.

I care about the dirty page reclaim problem a lot; however, this patch is
fundamentally not the right answer to that problem.

> ii)   Will you be changing the dirty_background_time dynamically with
> your algorithm ?
>       According to your description, I think not.

dirty_background_time will be some static value.

> iii)  I cannot see how your implementation of dirty_background_time is
> different from mine, except
>       maybe for the first time interval taking effect properly.

dirty_background_time will be the analog to dirty_background_ratio.

dirty_background_ratio/dirty_ratio and
dirty_writeback_centisecs/dirty_expire_centisecs are as different as
apples and oranges.

>       However, we can also think that the first time interval should
> probably be honoured with the older
>       value to make the transition from the old timer value to new
> timer value smoother in terms of
>       periodic writeback functionality.

There is no "interval" thing for dirty_background_time.
(I'll show you the implementation tomorrow.)

> > Or, do you have better way out of the dirty_expire_centisecs dilemma?
> >
> 
> Maybe we can delete the global value entirely. However as you
> correctly mentioned above, this
> will impact other tools distributions altering these global values.

Right. Deleting existing interfaces is NOT an option.

> You mentioned the close relationship between the dirty_background_time
> and the global dirty
> [_background]_ratio.
> Do you mean to say that you intend to change the dirty_background_time
> based on changes to
> the dirty_background_ratio ?
> Since the global dirty_background_ratio doesn't result in changes to
> the dirty_writeback_centisecs
> wouldn't this amount to a radical change in the existing relationship
> of these configurable values ?

dirty_background_time will be complementing dirty_background_ratio.
One will be adaptive to device bandwidth, the other to memory size.

The users typically don't want to accumulate many dirty pages to eat
up the memory, or take too much time to writeout.

So it's very natural to introduce dirty_background_time to fill the gap.

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-19  5:28               ` Wu Fengguang
  0 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-19  5:28 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

On Fri, Aug 19, 2011 at 12:38:36PM +0800, Kautuk Consul wrote:
> Hi Wu,
> 
> On Fri, Aug 19, 2011 at 8:04 AM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> > Hi Kautuk,
> >
> > On Fri, Aug 19, 2011 at 12:25:58AM +0800, Kautuk Consul wrote:
> >>
> >> Lines: 59
> >>
> >> Hi Wu,
> >>
> >> On Thu, Aug 18, 2011 at 6:43 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> >> > Hi Artem,
> >> >
> >> >> Here is a real use-case we had when developing the N900 phone. We had
> >> >> internal flash and external microSD slot. Internal flash is soldered in
> >> >> and cannot be removed by the user. MicroSD, in contrast, can be removed
> >> >> by the user.
> >> >>
> >> >> For the internal flash we wanted long intervals and relaxed limits to
> >> >> gain better performance.
> >> >>
> >> >> For MicroSD we wanted very short intervals and tough limits to make sure
> >> >> that if the user suddenly removes his microSD (users do this all the
> >> >> time) - we do not lose data.
> >> >
> >> > Thinking twice about it, I find that the different requirements for
> >> > internal flash/external microSD can also be solved by this scheme.
> >> >
> >> > Introduce a per-bdi dirty_background_time (and optionally dirty_time)
> >> > as the counterpart of (and works in parallel to) global dirty[_background]_ratio,
> >> > however with unit "milliseconds worth of data".
> >> >
> >> > The per-bdi dirty_background_time will be set low for external microSD
> >> > and high for internal flash. Then you get timely writeouts for microSD
> >> > and reasonably delayed writes for internal flash (controllable by the
> >> > global dirty_expire_centisecs).
> >> >
> >> > The dirty_background_time will actually work more reliably than
> >> > dirty_expire_centisecs because it will be checked immediately after the
> >> > application dirties more pages. And the dirty_time could provide
> >> > strong data integrity guarantee -- much stronger than
> >> > dirty_expire_centisecs -- if used.
> 
> The dirty_writeback_centisecs is the value we are also actually
> interested in, and not just
> dirty_expire_interval. This value is what is actually used to reset
> the per-BDI timeout in the code.

Yes. I assumed if one reduced dirty_expire_centisecs, he may well want
to reduce dirty_writeback_centisecs.

> >> >
> >> > Does that sound reasonable?
> >> >
> >> > Thanks,
> >> > Fengguang
> >> >
> >>
> >> My understanding of your email appears that you are agreeing in
> >> principle that the temporal
> >> aspect of this problem needs to be addressed along with your spatial
> >> pattern analysis technique.
> >
> > Yup.
> >
> >> I feel a more generic solution to the problem is required because the
> >> problem faced by Artem can appear
> >> in a different situation for a different application.
> >>
> >> I can re-implement my original patch in either centiseconds or
> >> milliseconds as suggested by you.
> >
> > My concern on your patch is the possible conflicts and confusions
> > between the global and the per-bdi dirty_expire_centisecs. To maintain
> > compatibility you need to keep the global one. Then there is the hard
> 
> If you refer to my original email, I have addressed this as follows:
> When the global value is set, then all the per-BDI dirty*_centisecs
> are also reset
> to the global value.
> This is essential for retaining the functionality across Linux
> distributions using
> the global values.
> This amounts to compatibility as the global values will take effect.
> After that point, if the user/admin feels, he/she can adjust/tune the
> per-BDI counters to
> certain empirical value as per the specific application. This will not
> alter the global values.

Such "resetting all" behavior could be disgusting. Some users without
the global view may be puzzled why their set value is lost.

A better scheme would be to use the bdi value if it's non-zero, and
fall back to the global value otherwise. This will reduce complexity
of the code as well as interface.

> > Given that we'll need to introduce the dirty_background_time interface
> > anyway, and it happens to address the N900 internal/removable storage
> > problem (mostly), I'm more than glad to cancel the dirty_expire_centisecs
> > problem.
> >
> 
> I have following doubts with respect to your dirty_background_time
> interface suggestion:
> i)   You say that you'll do this only for the N900 problem for solving
> the unexpected disk removal
>       problem.
>       I believe you are ignoring the problem of rate of undirtying of
> the block device pages for
>       making reclamation of that block device's file-cache pages at a
> sooner possible future time.
>       I mentioned this in my earlier emails also.

I care about the dirty page reclaim problem a lot; however, this patch is
fundamentally not the right answer to that problem.

> ii)   Will you be changing the dirty_background_time dynamically with
> your algorithm ?
>       According to your description, I think not.

dirty_background_time will be some static value.

> iii)  I cannot see how your implementation of dirty_background_time is
> different from mine, except
>       maybe for the first time interval taking effect properly.

dirty_background_time will be the analog to dirty_background_ratio.

dirty_background_ratio/dirty_ratio and
dirty_writeback_centisecs/dirty_expire_centisecs are as different as
apples and oranges.

>       However, we can also think that the first time interval should
> probably be honoured with the older
>       value to make the transition from the old timer value to new
> timer value smoother in terms of
>       periodic writeback functionality.

There is no "interval" thing for dirty_background_time.
(I'll show you the implementation tomorrow.)

> > Or, do you have better way out of the dirty_expire_centisecs dilemma?
> >
> 
> Maybe we can delete the global value entirely. However as you
> correctly mentioned above, this
> will impact other tools distributions altering these global values.

Right. Deleting existing interfaces is NOT an option.

> You mentioned the close relationship between the dirty_background_time
> and the global dirty
> [_background]_ratio.
> Do you mean to say that you intend to change the dirty_background_time
> based on changes to
> the dirty_background_ratio ?
> Since the global dirty_background_ratio doesn't result in changes to
> the dirty_writeback_centisecs
> wouldn't this amount to a radical change in the existing relationship
> of these configurable values ?

dirty_background_time will be complementing dirty_background_ratio.
One will be adaptive to device bandwidth, the other to memory size.

The users typically don't want to accumulate many dirty pages to eat
up the memory, or take too much time to writeout.

So it's very natural to introduce dirty_background_time to fill the gap.

Thanks,
Fengguang

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-19  5:28               ` Wu Fengguang
@ 2011-08-19  6:08                 ` Wu Fengguang
  -1 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-19  6:08 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Kautuk,

Here is a quick demo for bdi->dirty_background_time. Totally untested.

Thanks,
Fengguang

---
 fs/fs-writeback.c           |   16 +++++++++++-----
 include/linux/backing-dev.h |    1 +
 include/linux/writeback.h   |    1 +
 mm/backing-dev.c            |   23 +++++++++++++++++++++++
 mm/page-writeback.c         |    3 ++-
 5 files changed, 38 insertions(+), 6 deletions(-)

--- linux-next.orig/fs/fs-writeback.c	2011-08-19 13:59:41.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2011-08-19 14:00:36.000000000 +0800
@@ -653,14 +653,20 @@ long writeback_inodes_wb(struct bdi_writ
 	return nr_pages - work.nr_pages;
 }
 
-static inline bool over_bground_thresh(void)
+bool over_bground_thresh(struct backing_dev_info *bdi)
 {
 	unsigned long background_thresh, dirty_thresh;
 
 	global_dirty_limits(&background_thresh, &dirty_thresh);
 
-	return (global_page_state(NR_FILE_DIRTY) +
-		global_page_state(NR_UNSTABLE_NFS) > background_thresh);
+	if (global_page_state(NR_FILE_DIRTY) +
+	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
+		return true;
+
+	background_thresh = bdi->avg_write_bandwidth *
+					(u64)bdi->dirty_background_time / 1000;
+
+	return bdi_stat(bdi, BDI_RECLAIMABLE) > background_thresh;
 }
 
 /*
@@ -722,7 +728,7 @@ static long wb_writeback(struct bdi_writ
 		 * For background writeout, stop when we are below the
 		 * background dirty threshold
 		 */
-		if (work->for_background && !over_bground_thresh())
+		if (work->for_background && !over_bground_thresh(wb->bdi))
 			break;
 
 		if (work->for_kupdate) {
@@ -806,7 +812,7 @@ static unsigned long get_nr_dirty_pages(
 
 static long wb_check_background_flush(struct bdi_writeback *wb)
 {
-	if (over_bground_thresh()) {
+	if (over_bground_thresh(wb->bdi)) {
 
 		struct wb_writeback_work work = {
 			.nr_pages	= LONG_MAX,
--- linux-next.orig/include/linux/backing-dev.h	2011-08-19 13:59:41.000000000 +0800
+++ linux-next/include/linux/backing-dev.h	2011-08-19 14:00:07.000000000 +0800
@@ -91,6 +91,7 @@ struct backing_dev_info {
 
 	unsigned int min_ratio;
 	unsigned int max_ratio, max_prop_frac;
+	unsigned int dirty_background_time;
 
 	struct bdi_writeback wb;  /* default writeback info for this bdi */
 	spinlock_t wb_lock;	  /* protects work_list */
--- linux-next.orig/mm/backing-dev.c	2011-08-19 13:59:41.000000000 +0800
+++ linux-next/mm/backing-dev.c	2011-08-19 14:03:15.000000000 +0800
@@ -225,12 +225,33 @@ static ssize_t max_ratio_store(struct de
 }
 BDI_SHOW(max_ratio, bdi->max_ratio)
 
+static ssize_t dirty_background_time_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	char *end;
+	unsigned int ms;
+	ssize_t ret = -EINVAL;
+
+	ms = simple_strtoul(buf, &end, 10);
+	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
+		bdi->dirty_background_time = ms;
+		if (!ret)
+			ret = count;
+		if (over_bground_thresh(bdi))
+			bdi_start_background_writeback(bdi);
+	}
+	return ret;
+}
+BDI_SHOW(dirty_background_time, bdi->dirty_background_time)
+
 #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
 
 static struct device_attribute bdi_dev_attrs[] = {
 	__ATTR_RW(read_ahead_kb),
 	__ATTR_RW(min_ratio),
 	__ATTR_RW(max_ratio),
+	__ATTR_RW(dirty_background_time),
 	__ATTR_NULL,
 };
 
@@ -657,6 +678,8 @@ int bdi_init(struct backing_dev_info *bd
 	bdi->min_ratio = 0;
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = PROP_FRAC_BASE;
+	bdi->dirty_background_time = 10000;
+
 	spin_lock_init(&bdi->wb_lock);
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	INIT_LIST_HEAD(&bdi->work_list);
--- linux-next.orig/mm/page-writeback.c	2011-08-19 14:00:07.000000000 +0800
+++ linux-next/mm/page-writeback.c	2011-08-19 14:00:07.000000000 +0800
@@ -1163,7 +1163,8 @@ pause:
 	if (laptop_mode)
 		return;
 
-	if (nr_reclaimable > background_thresh)
+	if (nr_reclaimable > background_thresh ||
+	    over_bground_thresh(bdi))
 		bdi_start_background_writeback(bdi);
 }
 
--- linux-next.orig/include/linux/writeback.h	2011-08-19 14:00:41.000000000 +0800
+++ linux-next/include/linux/writeback.h	2011-08-19 14:01:19.000000000 +0800
@@ -132,6 +132,7 @@ extern int block_dump;
 extern int laptop_mode;
 
 extern unsigned long determine_dirtyable_memory(void);
+extern bool over_bground_thresh(struct backing_dev_info *bdi);
 
 extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-19  6:08                 ` Wu Fengguang
  0 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-19  6:08 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Kautuk,

Here is a quick demo for bdi->dirty_background_time. Totally untested.

Thanks,
Fengguang

---
 fs/fs-writeback.c           |   16 +++++++++++-----
 include/linux/backing-dev.h |    1 +
 include/linux/writeback.h   |    1 +
 mm/backing-dev.c            |   23 +++++++++++++++++++++++
 mm/page-writeback.c         |    3 ++-
 5 files changed, 38 insertions(+), 6 deletions(-)

--- linux-next.orig/fs/fs-writeback.c	2011-08-19 13:59:41.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2011-08-19 14:00:36.000000000 +0800
@@ -653,14 +653,20 @@ long writeback_inodes_wb(struct bdi_writ
 	return nr_pages - work.nr_pages;
 }
 
-static inline bool over_bground_thresh(void)
+bool over_bground_thresh(struct backing_dev_info *bdi)
 {
 	unsigned long background_thresh, dirty_thresh;
 
 	global_dirty_limits(&background_thresh, &dirty_thresh);
 
-	return (global_page_state(NR_FILE_DIRTY) +
-		global_page_state(NR_UNSTABLE_NFS) > background_thresh);
+	if (global_page_state(NR_FILE_DIRTY) +
+	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
+		return true;
+
+	background_thresh = bdi->avg_write_bandwidth *
+					(u64)bdi->dirty_background_time / 1000;
+
+	return bdi_stat(bdi, BDI_RECLAIMABLE) > background_thresh;
 }
 
 /*
@@ -722,7 +728,7 @@ static long wb_writeback(struct bdi_writ
 		 * For background writeout, stop when we are below the
 		 * background dirty threshold
 		 */
-		if (work->for_background && !over_bground_thresh())
+		if (work->for_background && !over_bground_thresh(wb->bdi))
 			break;
 
 		if (work->for_kupdate) {
@@ -806,7 +812,7 @@ static unsigned long get_nr_dirty_pages(
 
 static long wb_check_background_flush(struct bdi_writeback *wb)
 {
-	if (over_bground_thresh()) {
+	if (over_bground_thresh(wb->bdi)) {
 
 		struct wb_writeback_work work = {
 			.nr_pages	= LONG_MAX,
--- linux-next.orig/include/linux/backing-dev.h	2011-08-19 13:59:41.000000000 +0800
+++ linux-next/include/linux/backing-dev.h	2011-08-19 14:00:07.000000000 +0800
@@ -91,6 +91,7 @@ struct backing_dev_info {
 
 	unsigned int min_ratio;
 	unsigned int max_ratio, max_prop_frac;
+	unsigned int dirty_background_time;
 
 	struct bdi_writeback wb;  /* default writeback info for this bdi */
 	spinlock_t wb_lock;	  /* protects work_list */
--- linux-next.orig/mm/backing-dev.c	2011-08-19 13:59:41.000000000 +0800
+++ linux-next/mm/backing-dev.c	2011-08-19 14:03:15.000000000 +0800
@@ -225,12 +225,33 @@ static ssize_t max_ratio_store(struct de
 }
 BDI_SHOW(max_ratio, bdi->max_ratio)
 
+static ssize_t dirty_background_time_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	char *end;
+	unsigned int ms;
+	ssize_t ret = -EINVAL;
+
+	ms = simple_strtoul(buf, &end, 10);
+	if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
+		bdi->dirty_background_time = ms;
+		if (!ret)
+			ret = count;
+		if (over_bground_thresh(bdi))
+			bdi_start_background_writeback(bdi);
+	}
+	return ret;
+}
+BDI_SHOW(dirty_background_time, bdi->dirty_background_time)
+
 #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
 
 static struct device_attribute bdi_dev_attrs[] = {
 	__ATTR_RW(read_ahead_kb),
 	__ATTR_RW(min_ratio),
 	__ATTR_RW(max_ratio),
+	__ATTR_RW(dirty_background_time),
 	__ATTR_NULL,
 };
 
@@ -657,6 +678,8 @@ int bdi_init(struct backing_dev_info *bd
 	bdi->min_ratio = 0;
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = PROP_FRAC_BASE;
+	bdi->dirty_background_time = 10000;
+
 	spin_lock_init(&bdi->wb_lock);
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	INIT_LIST_HEAD(&bdi->work_list);
--- linux-next.orig/mm/page-writeback.c	2011-08-19 14:00:07.000000000 +0800
+++ linux-next/mm/page-writeback.c	2011-08-19 14:00:07.000000000 +0800
@@ -1163,7 +1163,8 @@ pause:
 	if (laptop_mode)
 		return;
 
-	if (nr_reclaimable > background_thresh)
+	if (nr_reclaimable > background_thresh ||
+	    over_bground_thresh(bdi))
 		bdi_start_background_writeback(bdi);
 }
 
--- linux-next.orig/include/linux/writeback.h	2011-08-19 14:00:41.000000000 +0800
+++ linux-next/include/linux/writeback.h	2011-08-19 14:01:19.000000000 +0800
@@ -132,6 +132,7 @@ extern int block_dump;
 extern int laptop_mode;
 
 extern unsigned long determine_dirtyable_memory(void);
+extern bool over_bground_thresh(struct backing_dev_info *bdi);
 
 extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-19  6:08                 ` Wu Fengguang
  (?)
@ 2011-08-19  7:00                   ` Kautuk Consul
  -1 siblings, 0 replies; 47+ messages in thread
From: Kautuk Consul @ 2011-08-19  7:00 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Hi Wu,

Yes. I think I do understand your approach.

Your aim is to always retain the per BDI timeout value.

You want to check for thresholds by mathematically adjusting the
background time too
into your over_bground_thresh() formula so that your understanding
holds true always and also
affects the page dirtying scenario I mentioned.
This definitely helps and refines this scenario in terms of flushing
out of the dirty pages.

Doubts:
i)   Your entire implementation seems to be dependent on someone
calling balance_dirty_pages()
     directly or indirectly. This function will call the
bdi_start_background_writeback() which wakes
     up the flusher thread.
     What about those page dirtying code paths which might not call
balance_dirty_pages ?
     Those paths then depend on the BDI thread periodically writing it
to disk and then we are again
     dependent on the writeback interval.
     Can we assume that the kernel will reliably call
balance_dirty_pages() whenever the pages
     are dirtied ? If that was true, then we would not need bdi
periodic writeback threads ever.

ii)  Even after your rigorous checking, the bdi_writeback_thread()
will still do a schedule_timeout()
     with the global value. Will your current solution then handle
Artem's disk removal scenario ?
     Else, you start using your value in the schedule_timeout() call
in the bdi_writeback_thread()
     function, which brings us back to the interval phenomenon I was
talking about.

Does this patch really help the user control exact time when the write
BIO is transferred from the
MM to the Block layer assuming balance_dirty_pages() is not called ?

Please correct me if I am wrong.

Thanks,
Kautuk.

On Fri, Aug 19, 2011 at 11:38 AM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> Kautuk,
>
> Here is a quick demo for bdi->dirty_background_time. Totally untested.
>
> Thanks,
> Fengguang
>
> ---
>  fs/fs-writeback.c           |   16 +++++++++++-----
>  include/linux/backing-dev.h |    1 +
>  include/linux/writeback.h   |    1 +
>  mm/backing-dev.c            |   23 +++++++++++++++++++++++
>  mm/page-writeback.c         |    3 ++-
>  5 files changed, 38 insertions(+), 6 deletions(-)
>
> --- linux-next.orig/fs/fs-writeback.c   2011-08-19 13:59:41.000000000 +0800
> +++ linux-next/fs/fs-writeback.c        2011-08-19 14:00:36.000000000 +0800
> @@ -653,14 +653,20 @@ long writeback_inodes_wb(struct bdi_writ
>        return nr_pages - work.nr_pages;
>  }
>
> -static inline bool over_bground_thresh(void)
> +bool over_bground_thresh(struct backing_dev_info *bdi)
>  {
>        unsigned long background_thresh, dirty_thresh;
>
>        global_dirty_limits(&background_thresh, &dirty_thresh);
>
> -       return (global_page_state(NR_FILE_DIRTY) +
> -               global_page_state(NR_UNSTABLE_NFS) > background_thresh);
> +       if (global_page_state(NR_FILE_DIRTY) +
> +           global_page_state(NR_UNSTABLE_NFS) > background_thresh)
> +               return true;
> +
> +       background_thresh = bdi->avg_write_bandwidth *
> +                                       (u64)bdi->dirty_background_time / 1000;
> +
> +       return bdi_stat(bdi, BDI_RECLAIMABLE) > background_thresh;
>  }
>
>  /*
> @@ -722,7 +728,7 @@ static long wb_writeback(struct bdi_writ
>                 * For background writeout, stop when we are below the
>                 * background dirty threshold
>                 */
> -               if (work->for_background && !over_bground_thresh())
> +               if (work->for_background && !over_bground_thresh(wb->bdi))
>                        break;
>
>                if (work->for_kupdate) {
> @@ -806,7 +812,7 @@ static unsigned long get_nr_dirty_pages(
>
>  static long wb_check_background_flush(struct bdi_writeback *wb)
>  {
> -       if (over_bground_thresh()) {
> +       if (over_bground_thresh(wb->bdi)) {
>
>                struct wb_writeback_work work = {
>                        .nr_pages       = LONG_MAX,
> --- linux-next.orig/include/linux/backing-dev.h 2011-08-19 13:59:41.000000000 +0800
> +++ linux-next/include/linux/backing-dev.h      2011-08-19 14:00:07.000000000 +0800
> @@ -91,6 +91,7 @@ struct backing_dev_info {
>
>        unsigned int min_ratio;
>        unsigned int max_ratio, max_prop_frac;
> +       unsigned int dirty_background_time;
>
>        struct bdi_writeback wb;  /* default writeback info for this bdi */
>        spinlock_t wb_lock;       /* protects work_list */
> --- linux-next.orig/mm/backing-dev.c    2011-08-19 13:59:41.000000000 +0800
> +++ linux-next/mm/backing-dev.c 2011-08-19 14:03:15.000000000 +0800
> @@ -225,12 +225,33 @@ static ssize_t max_ratio_store(struct de
>  }
>  BDI_SHOW(max_ratio, bdi->max_ratio)
>
> +static ssize_t dirty_background_time_store(struct device *dev,
> +               struct device_attribute *attr, const char *buf, size_t count)
> +{
> +       struct backing_dev_info *bdi = dev_get_drvdata(dev);
> +       char *end;
> +       unsigned int ms;
> +       ssize_t ret = -EINVAL;
> +
> +       ms = simple_strtoul(buf, &end, 10);
> +       if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
> +               bdi->dirty_background_time = ms;
> +               if (!ret)
> +                       ret = count;
> +               if (over_bground_thresh(bdi))
> +                       bdi_start_background_writeback(bdi);
> +       }
> +       return ret;
> +}
> +BDI_SHOW(dirty_background_time, bdi->dirty_background_time)
> +
>  #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
>
>  static struct device_attribute bdi_dev_attrs[] = {
>        __ATTR_RW(read_ahead_kb),
>        __ATTR_RW(min_ratio),
>        __ATTR_RW(max_ratio),
> +       __ATTR_RW(dirty_background_time),
>        __ATTR_NULL,
>  };
>
> @@ -657,6 +678,8 @@ int bdi_init(struct backing_dev_info *bd
>        bdi->min_ratio = 0;
>        bdi->max_ratio = 100;
>        bdi->max_prop_frac = PROP_FRAC_BASE;
> +       bdi->dirty_background_time = 10000;
> +
>        spin_lock_init(&bdi->wb_lock);
>        INIT_LIST_HEAD(&bdi->bdi_list);
>        INIT_LIST_HEAD(&bdi->work_list);
> --- linux-next.orig/mm/page-writeback.c 2011-08-19 14:00:07.000000000 +0800
> +++ linux-next/mm/page-writeback.c      2011-08-19 14:00:07.000000000 +0800
> @@ -1163,7 +1163,8 @@ pause:
>        if (laptop_mode)
>                return;
>
> -       if (nr_reclaimable > background_thresh)
> +       if (nr_reclaimable > background_thresh ||
> +           over_bground_thresh(bdi))
>                bdi_start_background_writeback(bdi);
>  }
>
> --- linux-next.orig/include/linux/writeback.h   2011-08-19 14:00:41.000000000 +0800
> +++ linux-next/include/linux/writeback.h        2011-08-19 14:01:19.000000000 +0800
> @@ -132,6 +132,7 @@ extern int block_dump;
>  extern int laptop_mode;
>
>  extern unsigned long determine_dirtyable_memory(void);
> +extern bool over_bground_thresh(struct backing_dev_info *bdi);
>
>  extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
>                void __user *buffer, size_t *lenp,
>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-19  7:00                   ` Kautuk Consul
  0 siblings, 0 replies; 47+ messages in thread
From: Kautuk Consul @ 2011-08-19  7:00 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Hi Wu,

Yes. I think I do understand your approach.

Your aim is to always retain the per BDI timeout value.

You want to check for thresholds by mathematically adjusting the
background time too
into your over_bground_thresh() formula so that your understanding
holds true always and also
affects the page dirtying scenario I mentioned.
This definitely helps and refines this scenario in terms of flushing
out of the dirty pages.

Doubts:
i)   Your entire implementation seems to be dependent on someone
calling balance_dirty_pages()
     directly or indirectly. This function will call the
bdi_start_background_writeback() which wakes
     up the flusher thread.
     What about those page dirtying code paths which might not call
balance_dirty_pages ?
     Those paths then depend on the BDI thread periodically writing it
to disk and then we are again
     dependent on the writeback interval.
     Can we assume that the kernel will reliably call
balance_dirty_pages() whenever the pages
     are dirtied ? If that was true, then we would not need bdi
periodic writeback threads ever.

ii)  Even after your rigorous checking, the bdi_writeback_thread()
will still do a schedule_timeout()
     with the global value. Will your current solution then handle
Artem's disk removal scenario ?
     Else, you start using your value in the schedule_timeout() call
in the bdi_writeback_thread()
     function, which brings us back to the interval phenomenon I was
talking about.

Does this patch really help the user control exact time when the write
BIO is transferred from the
MM to the Block layer assuming balance_dirty_pages() is not called ?

Please correct me if I am wrong.

Thanks,
Kautuk.

On Fri, Aug 19, 2011 at 11:38 AM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> Kautuk,
>
> Here is a quick demo for bdi->dirty_background_time. Totally untested.
>
> Thanks,
> Fengguang
>
> ---
>  fs/fs-writeback.c           |   16 +++++++++++-----
>  include/linux/backing-dev.h |    1 +
>  include/linux/writeback.h   |    1 +
>  mm/backing-dev.c            |   23 +++++++++++++++++++++++
>  mm/page-writeback.c         |    3 ++-
>  5 files changed, 38 insertions(+), 6 deletions(-)
>
> --- linux-next.orig/fs/fs-writeback.c   2011-08-19 13:59:41.000000000 +0800
> +++ linux-next/fs/fs-writeback.c        2011-08-19 14:00:36.000000000 +0800
> @@ -653,14 +653,20 @@ long writeback_inodes_wb(struct bdi_writ
>        return nr_pages - work.nr_pages;
>  }
>
> -static inline bool over_bground_thresh(void)
> +bool over_bground_thresh(struct backing_dev_info *bdi)
>  {
>        unsigned long background_thresh, dirty_thresh;
>
>        global_dirty_limits(&background_thresh, &dirty_thresh);
>
> -       return (global_page_state(NR_FILE_DIRTY) +
> -               global_page_state(NR_UNSTABLE_NFS) > background_thresh);
> +       if (global_page_state(NR_FILE_DIRTY) +
> +           global_page_state(NR_UNSTABLE_NFS) > background_thresh)
> +               return true;
> +
> +       background_thresh = bdi->avg_write_bandwidth *
> +                                       (u64)bdi->dirty_background_time / 1000;
> +
> +       return bdi_stat(bdi, BDI_RECLAIMABLE) > background_thresh;
>  }
>
>  /*
> @@ -722,7 +728,7 @@ static long wb_writeback(struct bdi_writ
>                 * For background writeout, stop when we are below the
>                 * background dirty threshold
>                 */
> -               if (work->for_background && !over_bground_thresh())
> +               if (work->for_background && !over_bground_thresh(wb->bdi))
>                        break;
>
>                if (work->for_kupdate) {
> @@ -806,7 +812,7 @@ static unsigned long get_nr_dirty_pages(
>
>  static long wb_check_background_flush(struct bdi_writeback *wb)
>  {
> -       if (over_bground_thresh()) {
> +       if (over_bground_thresh(wb->bdi)) {
>
>                struct wb_writeback_work work = {
>                        .nr_pages       = LONG_MAX,
> --- linux-next.orig/include/linux/backing-dev.h 2011-08-19 13:59:41.000000000 +0800
> +++ linux-next/include/linux/backing-dev.h      2011-08-19 14:00:07.000000000 +0800
> @@ -91,6 +91,7 @@ struct backing_dev_info {
>
>        unsigned int min_ratio;
>        unsigned int max_ratio, max_prop_frac;
> +       unsigned int dirty_background_time;
>
>        struct bdi_writeback wb;  /* default writeback info for this bdi */
>        spinlock_t wb_lock;       /* protects work_list */
> --- linux-next.orig/mm/backing-dev.c    2011-08-19 13:59:41.000000000 +0800
> +++ linux-next/mm/backing-dev.c 2011-08-19 14:03:15.000000000 +0800
> @@ -225,12 +225,33 @@ static ssize_t max_ratio_store(struct de
>  }
>  BDI_SHOW(max_ratio, bdi->max_ratio)
>
> +static ssize_t dirty_background_time_store(struct device *dev,
> +               struct device_attribute *attr, const char *buf, size_t count)
> +{
> +       struct backing_dev_info *bdi = dev_get_drvdata(dev);
> +       char *end;
> +       unsigned int ms;
> +       ssize_t ret = -EINVAL;
> +
> +       ms = simple_strtoul(buf, &end, 10);
> +       if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
> +               bdi->dirty_background_time = ms;
> +               if (!ret)
> +                       ret = count;
> +               if (over_bground_thresh(bdi))
> +                       bdi_start_background_writeback(bdi);
> +       }
> +       return ret;
> +}
> +BDI_SHOW(dirty_background_time, bdi->dirty_background_time)
> +
>  #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
>
>  static struct device_attribute bdi_dev_attrs[] = {
>        __ATTR_RW(read_ahead_kb),
>        __ATTR_RW(min_ratio),
>        __ATTR_RW(max_ratio),
> +       __ATTR_RW(dirty_background_time),
>        __ATTR_NULL,
>  };
>
> @@ -657,6 +678,8 @@ int bdi_init(struct backing_dev_info *bd
>        bdi->min_ratio = 0;
>        bdi->max_ratio = 100;
>        bdi->max_prop_frac = PROP_FRAC_BASE;
> +       bdi->dirty_background_time = 10000;
> +
>        spin_lock_init(&bdi->wb_lock);
>        INIT_LIST_HEAD(&bdi->bdi_list);
>        INIT_LIST_HEAD(&bdi->work_list);
> --- linux-next.orig/mm/page-writeback.c 2011-08-19 14:00:07.000000000 +0800
> +++ linux-next/mm/page-writeback.c      2011-08-19 14:00:07.000000000 +0800
> @@ -1163,7 +1163,8 @@ pause:
>        if (laptop_mode)
>                return;
>
> -       if (nr_reclaimable > background_thresh)
> +       if (nr_reclaimable > background_thresh ||
> +           over_bground_thresh(bdi))
>                bdi_start_background_writeback(bdi);
>  }
>
> --- linux-next.orig/include/linux/writeback.h   2011-08-19 14:00:41.000000000 +0800
> +++ linux-next/include/linux/writeback.h        2011-08-19 14:01:19.000000000 +0800
> @@ -132,6 +132,7 @@ extern int block_dump;
>  extern int laptop_mode;
>
>  extern unsigned long determine_dirtyable_memory(void);
> +extern bool over_bground_thresh(struct backing_dev_info *bdi);
>
>  extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
>                void __user *buffer, size_t *lenp,
>
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-19  7:00                   ` Kautuk Consul
  0 siblings, 0 replies; 47+ messages in thread
From: Kautuk Consul @ 2011-08-19  7:00 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Hi Wu,

Yes. I think I do understand your approach.

Your aim is to always retain the per BDI timeout value.

You want to check for thresholds by mathematically factoring the
background time
into your over_bground_thresh() formula so that your understanding
always holds true and also
affects the page dirtying scenario I mentioned.
This definitely helps and refines this scenario in terms of flushing
out of the dirty pages.

Doubts:
i)   Your entire implementation seems to be dependent on someone
calling balance_dirty_pages()
     directly or indirectly. This function will call the
bdi_start_background_writeback() which wakes
     up the flusher thread.
     What about those page dirtying code paths which might not call
balance_dirty_pages ?
     Those paths then depend on the BDI thread periodically writing it
to disk and then we are again
     dependent on the writeback interval.
     Can we assume that the kernel will reliably call
balance_dirty_pages() whenever the pages
     are dirtied ? If that was true, then we would not need bdi
periodic writeback threads ever.

ii)  Even after your rigorous checking, the bdi_writeback_thread()
will still do a schedule_timeout()
     with the global value. Will your current solution then handle
Artem's disk removal scenario ?
     Else, you start using your value in the schedule_timeout() call
in the bdi_writeback_thread()
     function, which brings us back to the interval phenomenon I was
talking about.

Does this patch really help the user control exact time when the write
BIO is transferred from the
MM to the Block layer assuming balance_dirty_pages() is not called ?

Please correct me if I am wrong.

Thanks,
Kautuk.

On Fri, Aug 19, 2011 at 11:38 AM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> Kautuk,
>
> Here is a quick demo for bdi->dirty_background_time. Totally untested.
>
> Thanks,
> Fengguang
>
> ---
>  fs/fs-writeback.c           |   16 +++++++++++-----
>  include/linux/backing-dev.h |    1 +
>  include/linux/writeback.h   |    1 +
>  mm/backing-dev.c            |   23 +++++++++++++++++++++++
>  mm/page-writeback.c         |    3 ++-
>  5 files changed, 38 insertions(+), 6 deletions(-)
>
> --- linux-next.orig/fs/fs-writeback.c   2011-08-19 13:59:41.000000000 +0800
> +++ linux-next/fs/fs-writeback.c        2011-08-19 14:00:36.000000000 +0800
> @@ -653,14 +653,20 @@ long writeback_inodes_wb(struct bdi_writ
>        return nr_pages - work.nr_pages;
>  }
>
> -static inline bool over_bground_thresh(void)
> +bool over_bground_thresh(struct backing_dev_info *bdi)
>  {
>        unsigned long background_thresh, dirty_thresh;
>
>        global_dirty_limits(&background_thresh, &dirty_thresh);
>
> -       return (global_page_state(NR_FILE_DIRTY) +
> -               global_page_state(NR_UNSTABLE_NFS) > background_thresh);
> +       if (global_page_state(NR_FILE_DIRTY) +
> +           global_page_state(NR_UNSTABLE_NFS) > background_thresh)
> +               return true;
> +
> +       background_thresh = bdi->avg_write_bandwidth *
> +                                       (u64)bdi->dirty_background_time / 1000;
> +
> +       return bdi_stat(bdi, BDI_RECLAIMABLE) > background_thresh;
>  }
>
>  /*
> @@ -722,7 +728,7 @@ static long wb_writeback(struct bdi_writ
>                 * For background writeout, stop when we are below the
>                 * background dirty threshold
>                 */
> -               if (work->for_background && !over_bground_thresh())
> +               if (work->for_background && !over_bground_thresh(wb->bdi))
>                        break;
>
>                if (work->for_kupdate) {
> @@ -806,7 +812,7 @@ static unsigned long get_nr_dirty_pages(
>
>  static long wb_check_background_flush(struct bdi_writeback *wb)
>  {
> -       if (over_bground_thresh()) {
> +       if (over_bground_thresh(wb->bdi)) {
>
>                struct wb_writeback_work work = {
>                        .nr_pages       = LONG_MAX,
> --- linux-next.orig/include/linux/backing-dev.h 2011-08-19 13:59:41.000000000 +0800
> +++ linux-next/include/linux/backing-dev.h      2011-08-19 14:00:07.000000000 +0800
> @@ -91,6 +91,7 @@ struct backing_dev_info {
>
>        unsigned int min_ratio;
>        unsigned int max_ratio, max_prop_frac;
> +       unsigned int dirty_background_time;
>
>        struct bdi_writeback wb;  /* default writeback info for this bdi */
>        spinlock_t wb_lock;       /* protects work_list */
> --- linux-next.orig/mm/backing-dev.c    2011-08-19 13:59:41.000000000 +0800
> +++ linux-next/mm/backing-dev.c 2011-08-19 14:03:15.000000000 +0800
> @@ -225,12 +225,33 @@ static ssize_t max_ratio_store(struct de
>  }
>  BDI_SHOW(max_ratio, bdi->max_ratio)
>
> +static ssize_t dirty_background_time_store(struct device *dev,
> +               struct device_attribute *attr, const char *buf, size_t count)
> +{
> +       struct backing_dev_info *bdi = dev_get_drvdata(dev);
> +       char *end;
> +       unsigned int ms;
> +       ssize_t ret = -EINVAL;
> +
> +       ms = simple_strtoul(buf, &end, 10);
> +       if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
> +               bdi->dirty_background_time = ms;
> +               if (!ret)
> +                       ret = count;
> +               if (over_bground_thresh(bdi))
> +                       bdi_start_background_writeback(bdi);
> +       }
> +       return ret;
> +}
> +BDI_SHOW(dirty_background_time, bdi->dirty_background_time)
> +
>  #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
>
>  static struct device_attribute bdi_dev_attrs[] = {
>        __ATTR_RW(read_ahead_kb),
>        __ATTR_RW(min_ratio),
>        __ATTR_RW(max_ratio),
> +       __ATTR_RW(dirty_background_time),
>        __ATTR_NULL,
>  };
>
> @@ -657,6 +678,8 @@ int bdi_init(struct backing_dev_info *bd
>        bdi->min_ratio = 0;
>        bdi->max_ratio = 100;
>        bdi->max_prop_frac = PROP_FRAC_BASE;
> +       bdi->dirty_background_time = 10000;
> +
>        spin_lock_init(&bdi->wb_lock);
>        INIT_LIST_HEAD(&bdi->bdi_list);
>        INIT_LIST_HEAD(&bdi->work_list);
> --- linux-next.orig/mm/page-writeback.c 2011-08-19 14:00:07.000000000 +0800
> +++ linux-next/mm/page-writeback.c      2011-08-19 14:00:07.000000000 +0800
> @@ -1163,7 +1163,8 @@ pause:
>        if (laptop_mode)
>                return;
>
> -       if (nr_reclaimable > background_thresh)
> +       if (nr_reclaimable > background_thresh ||
> +           over_bground_thresh(bdi))
>                bdi_start_background_writeback(bdi);
>  }
>
> --- linux-next.orig/include/linux/writeback.h   2011-08-19 14:00:41.000000000 +0800
> +++ linux-next/include/linux/writeback.h        2011-08-19 14:01:19.000000000 +0800
> @@ -132,6 +132,7 @@ extern int block_dump;
>  extern int laptop_mode;
>
>  extern unsigned long determine_dirtyable_memory(void);
> +extern bool over_bground_thresh(struct backing_dev_info *bdi);
>
>  extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
>                void __user *buffer, size_t *lenp,
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-18 13:13       ` Wu Fengguang
  (?)
@ 2011-08-19 11:55         ` Artem Bityutskiy
  -1 siblings, 0 replies; 47+ messages in thread
From: Artem Bityutskiy @ 2011-08-19 11:55 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Kautuk Consul, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

On Thu, 2011-08-18 at 21:13 +0800, Wu Fengguang wrote:
> Thinking twice about it, I find that the different requirements for
> internal flash/external microSD can also be solved by this scheme.
> 
> Introduce a per-bdi dirty_background_time (and optionally dirty_time)
> as the counterpart of (and works in parallel to) global dirty[_background]_ratio,
> however with unit "milliseconds worth of data".
> 
> The per-bdi dirty_background_time will be set low for external microSD
> and high for internal flash. Then you get timely writeouts for microSD
> and reasonably delayed writes for internal flash (controllable by the
> global dirty_expire_centisecs).
> 
> The dirty_background_time will actually work more reliably than
> dirty_expire_centisecs because it will be checked immediately after the
> application dirties more pages. And the dirty_time could provide
> a strong data integrity guarantee -- much stronger than
> dirty_expire_centisecs -- if used.
> 
> Does that sound reasonable?

Yes, this would probably work. But note, we do not have this problem
anymore, I was just talking about the past experience, so I cannot
validate any possible patch.

Thanks.

-- 
Best Regards,
Artem Bityutskiy


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-19 11:55         ` Artem Bityutskiy
  0 siblings, 0 replies; 47+ messages in thread
From: Artem Bityutskiy @ 2011-08-19 11:55 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Kautuk Consul, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

On Thu, 2011-08-18 at 21:13 +0800, Wu Fengguang wrote:
> Thinking twice about it, I find that the different requirements for
> internal flash/external microSD can also be solved by this scheme.
> 
> Introduce a per-bdi dirty_background_time (and optionally dirty_time)
> as the counterpart of (and works in parallel to) global dirty[_background]_ratio,
> however with unit "milliseconds worth of data".
> 
> The per-bdi dirty_background_time will be set low for external microSD
> and high for internal flash. Then you get timely writeouts for microSD
> and reasonably delayed writes for internal flash (controllable by the
> global dirty_expire_centisecs).
> 
> The dirty_background_time will actually work more reliably than
> dirty_expire_centisecs because it will be checked immediately after the
> application dirties more pages. And the dirty_time could provide
> a strong data integrity guarantee -- much stronger than
> dirty_expire_centisecs -- if used.
> 
> Does that sound reasonable?

Yes, this would probably work. But note, we do not have this problem
anymore, I was just talking about the past experience, so I cannot
validate any possible patch.

Thanks.

-- 
Best Regards,
Artem Bityutskiy

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-19 11:55         ` Artem Bityutskiy
  0 siblings, 0 replies; 47+ messages in thread
From: Artem Bityutskiy @ 2011-08-19 11:55 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Kautuk Consul, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

On Thu, 2011-08-18 at 21:13 +0800, Wu Fengguang wrote:
> Thinking twice about it, I find that the different requirements for
> internal flash/external microSD can also be solved by this scheme.
> 
> Introduce a per-bdi dirty_background_time (and optionally dirty_time)
> as the counterpart of (and works in parallel to) global dirty[_background]_ratio,
> however with unit "milliseconds worth of data".
> 
> The per-bdi dirty_background_time will be set low for external microSD
> and high for internal flash. Then you get timely writeouts for microSD
> and reasonably delayed writes for internal flash (controllable by the
> global dirty_expire_centisecs).
> 
> The dirty_background_time will actually work more reliably than
> dirty_expire_centisecs because it will be checked immediately after the
> application dirties more pages. And the dirty_time could provide
> a strong data integrity guarantee -- much stronger than
> dirty_expire_centisecs -- if used.
> 
> Does that sound reasonable?

Yes, this would probably work. But note, we do not have this problem
anymore, I was just talking about the past experience, so I cannot
validate any possible patch.

Thanks.

-- 
Best Regards,
Artem Bityutskiy

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-19  7:00                   ` Kautuk Consul
@ 2011-08-19 14:24                     ` Wu Fengguang
  -1 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-19 14:24 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Hi Kautuk,

On Fri, Aug 19, 2011 at 03:00:30PM +0800, Kautuk Consul wrote:
> Hi Wu,
> 
> Yes. I think I do understand your approach.
> 
> Your aim is to always retain the per BDI timeout value.
> 
> You want to check for thresholds by mathematically factoring the
> background time
> into your over_bground_thresh() formula so that your understanding
> always holds true and also
> affects the page dirtying scenario I mentioned.
> This definitely helps and refines this scenario in terms of flushing
> out of the dirty pages.

Thanks.

> Doubts:
> i)   Your entire implementation seems to be dependent on someone
> calling balance_dirty_pages()
>      directly or indirectly. This function will call the
> bdi_start_background_writeback() which wakes
>      up the flusher thread.
>      What about those page dirtying code paths which might not call
> balance_dirty_pages ?
>      Those paths then depend on the BDI thread periodically writing it
> to disk and then we are again
>      dependent on the writeback interval.
>      Can we assume that the kernel will reliably call
> balance_dirty_pages() whenever the pages
>      are dirtied ? If that was true, then we would not need bdi
> periodic writeback threads ever.

Yes. The kernel needs a way to limit the total number of dirty pages at
any given time and to keep them under dirty_ratio/dirty_bytes.

balance_dirty_pages() is such a central place to throttle the dirty
pages. Whatever code paths generate dirty pages are required to call
into balance_dirty_pages_ratelimited_nr() which will in turn call
balance_dirty_pages().

So, the values specified by dirty_ratio/dirty_bytes will be effectively
enforced by balance_dirty_pages(). In contrast, the value
specified by dirty_expire_centisecs is merely a parameter used by
wb_writeback() to select the eligible inodes to do writeout. The 30s
dirty expire time is never a guarantee that all inodes/pages dirtied
before 30s will be timely written to disk. It's better interpreted in
the opposite way: when under the dirty_background_ratio threshold and
hence background writeout does not kick in, dirty inodes younger than
30s won't be written to disk by the flusher.

> ii)  Even after your rigorous checking, the bdi_writeback_thread()
> will still do a schedule_timeout()
>      with the global value. Will your current solution then handle
> Artem's disk removal scenario ?
>      Else, you start using your value in the schedule_timeout() call
> in the bdi_writeback_thread()
>      function, which brings us back to the interval phenomenon I was
> talking about.

wb_writeback() will keep running as long as over_bground_thresh().

The flusher will keep writing as long as there are more works, since
there is a

                if (!list_empty(&bdi->work_list))
                        continue;

before the schedule_timeout() call.

And the flusher thread will always be woken up in a timely manner from
balance_dirty_pages().

So schedule_timeout() won't block in the way at all.

> Does this patch really help the user control exact time when the write
> BIO is transferred from the
> MM to the Block layer assuming balance_dirty_pages() is not called ?

It would be a serious bug if balance_dirty_pages() is somehow not
called. But note that balance_dirty_pages() is designed to be called
on every N pages to reduce overheads.

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-19 14:24                     ` Wu Fengguang
  0 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-19 14:24 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Hi Kautuk,

On Fri, Aug 19, 2011 at 03:00:30PM +0800, Kautuk Consul wrote:
> Hi Wu,
> 
> Yes. I think I do understand your approach.
> 
> Your aim is to always retain the per BDI timeout value.
> 
> You want to check for thresholds by mathematically factoring the
> background time
> into your over_bground_thresh() formula so that your understanding
> always holds true and also
> affects the page dirtying scenario I mentioned.
> This definitely helps and refines this scenario in terms of flushing
> out of the dirty pages.

Thanks.

> Doubts:
> i)   Your entire implementation seems to be dependent on someone
> calling balance_dirty_pages()
>      directly or indirectly. This function will call the
> bdi_start_background_writeback() which wakes
>      up the flusher thread.
>      What about those page dirtying code paths which might not call
> balance_dirty_pages ?
>      Those paths then depend on the BDI thread periodically writing it
> to disk and then we are again
>      dependent on the writeback interval.
>      Can we assume that the kernel will reliably call
> balance_dirty_pages() whenever the pages
>      are dirtied ? If that was true, then we would not need bdi
> periodic writeback threads ever.

Yes. The kernel needs a way to limit the total number of dirty pages at
any given time and to keep them under dirty_ratio/dirty_bytes.

balance_dirty_pages() is such a central place to throttle the dirty
pages. Whatever code paths generate dirty pages are required to call
into balance_dirty_pages_ratelimited_nr() which will in turn call
balance_dirty_pages().

So, the values specified by dirty_ratio/dirty_bytes will be effectively
enforced by balance_dirty_pages(). In contrast, the value
specified by dirty_expire_centisecs is merely a parameter used by
wb_writeback() to select the eligible inodes to do writeout. The 30s
dirty expire time is never a guarantee that all inodes/pages dirtied
before 30s will be timely written to disk. It's better interpreted in
the opposite way: when under the dirty_background_ratio threshold and
hence background writeout does not kick in, dirty inodes younger than
30s won't be written to disk by the flusher.

> ii)  Even after your rigorous checking, the bdi_writeback_thread()
> will still do a schedule_timeout()
>      with the global value. Will your current solution then handle
> Artem's disk removal scenario ?
>      Else, you start using your value in the schedule_timeout() call
> in the bdi_writeback_thread()
>      function, which brings us back to the interval phenomenon I was
> talking about.

wb_writeback() will keep running as long as over_bground_thresh().

The flusher will keep writing as long as there are more works, since
there is a

                if (!list_empty(&bdi->work_list))
                        continue;

before the schedule_timeout() call.

And the flusher thread will always be woken up in a timely manner from
balance_dirty_pages().

So schedule_timeout() won't block in the way at all.

> Does this patch really help the user control exact time when the write
> BIO is transferred from the
> MM to the Block layer assuming balance_dirty_pages() is not called ?

It would be a serious bug if balance_dirty_pages() is somehow not
called. But note that balance_dirty_pages() is designed to be called
on every N pages to reduce overheads.

Thanks,
Fengguang

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-19 11:55         ` Artem Bityutskiy
@ 2011-08-19 14:27           ` Wu Fengguang
  -1 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-19 14:27 UTC (permalink / raw)
  To: Artem Bityutskiy
  Cc: Kautuk Consul, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

On Fri, Aug 19, 2011 at 07:55:43PM +0800, Artem Bityutskiy wrote:
> On Thu, 2011-08-18 at 21:13 +0800, Wu Fengguang wrote:
> > Thinking twice about it, I find that the different requirements for
> > internal flash/external microSD can also be solved by this scheme.
> > 
> > Introduce a per-bdi dirty_background_time (and optionally dirty_time)
> > as the counterpart of (and works in parallel to) global dirty[_background]_ratio,
> > however with unit "milliseconds worth of data".
> > 
> > The per-bdi dirty_background_time will be set low for external microSD
> > and high for internal flash. Then you get timely writeouts for microSD
> > and reasonably delayed writes for internal flash (controllable by the
> > global dirty_expire_centisecs).
> > 
> > The dirty_background_time will actually work more reliably than
> > dirty_expire_centisecs because it will be checked immediately after the
> > application dirties more pages. And the dirty_time could provide
> > a strong data integrity guarantee -- much stronger than
> > dirty_expire_centisecs -- if used.
> > 
> > Does that sound reasonable?
> 
> Yes, this would probably work. But note, we do not have this problem
> anymore, I was just talking about the past experience, so I cannot
> validate any possible patch.

OK, thanks for the information. What do you mean by "not have this
problem any more"? Did you work around it in other ways, such as
a sync mount (which seems rather inefficient, though)?

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-19 14:27           ` Wu Fengguang
  0 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-19 14:27 UTC (permalink / raw)
  To: Artem Bityutskiy
  Cc: Kautuk Consul, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

On Fri, Aug 19, 2011 at 07:55:43PM +0800, Artem Bityutskiy wrote:
> On Thu, 2011-08-18 at 21:13 +0800, Wu Fengguang wrote:
> > Thinking twice about it, I find that the different requirements for
> > internal flash/external microSD can also be solved by this scheme.
> > 
> > Introduce a per-bdi dirty_background_time (and optionally dirty_time)
> > as the counterpart of (and works in parallel to) global dirty[_background]_ratio,
> > however with unit "milliseconds worth of data".
> > 
> > The per-bdi dirty_background_time will be set low for external microSD
> > and high for internal flash. Then you get timely writeouts for microSD
> > and reasonably delayed writes for internal flash (controllable by the
> > global dirty_expire_centisecs).
> > 
> > The dirty_background_time will actually work more reliably than
> > dirty_expire_centisecs because it will be checked immediately after the
> > application dirties more pages. And the dirty_time could provide
> > a strong data integrity guarantee -- much stronger than
> > dirty_expire_centisecs -- if used.
> > 
> > Does that sound reasonable?
> 
> Yes, this would probably work. But note, we do not have this problem
> anymore, I was just talking about the past experience, so I cannot
> validate any possible patch.

OK, thanks for the information. What do you mean by "not have this
problem any more"? Did you work around it in other ways, such as
a sync mount (which seems rather inefficient, though)?

Thanks,
Fengguang

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-19 14:24                     ` Wu Fengguang
@ 2011-08-19 17:20                       ` Kautuk Consul
  -1 siblings, 0 replies; 47+ messages in thread
From: Kautuk Consul @ 2011-08-19 17:20 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Hi Wu,

You're right, the BDI threads should be woken up reliably by the
balance_dirty_pages() and balance_dirty_pages()
needs to be called from all code that is responsible for dirtying the pages.
Sorry, I was not fully aware of the balance_dirty_pages() functionality
and the way it was being called in its entirety, or I would
have spotted this.

Thanks for adding the dirty_background_time into your
over_bground_thresh() formula.

Now that you seem to have included the time into the threshold, I can
relate to your patch better
as a solution for the problems I earlier mentioned.

Thanks again,
Kautuk.


On Fri, Aug 19, 2011 at 7:54 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> Hi Kautuk,
>
> On Fri, Aug 19, 2011 at 03:00:30PM +0800, Kautuk Consul wrote:
>> Hi Wu,
>>
>> Yes. I think I do understand your approach.
>>
>> Your aim is to always retain the per BDI timeout value.
>>
>> You want to check for thresholds by mathematically factoring the
>> background time
>> into your over_bground_thresh() formula so that your understanding
>> always holds true and also
>> affects the page dirtying scenario I mentioned.
>> This definitely helps and refines this scenario in terms of flushing
>> out of the dirty pages.
>
> Thanks.
>
>> Doubts:
>> i)   Your entire implementation seems to be dependent on someone
>> calling balance_dirty_pages()
>>      directly or indirectly. This function will call the
>> bdi_start_background_writeback() which wakes
>>      up the flusher thread.
>>      What about those page dirtying code paths which might not call
>> balance_dirty_pages ?
>>      Those paths then depend on the BDI thread periodically writing it
>> to disk and then we are again
>>      dependent on the writeback interval.
>>      Can we assume that the kernel will reliably call
>> balance_dirty_pages() whenever the pages
>>      are dirtied ? If that was true, then we would not need bdi
>> periodic writeback threads ever.
>
> Yes. The kernel needs a way to limit the total number of dirty pages at
> any given time and to keep them under dirty_ratio/dirty_bytes.
>
> balance_dirty_pages() is such a central place to throttle the dirty
> pages. Whatever code paths generate dirty pages are required to call
> into balance_dirty_pages_ratelimited_nr() which will in turn call
> balance_dirty_pages().
>
> So, the values specified by dirty_ratio/dirty_bytes will be effectively
> enforced by balance_dirty_pages(). In contrast, the value
> specified by dirty_expire_centisecs is merely a parameter used by
> wb_writeback() to select the eligible inodes to do writeout. The 30s
> dirty expire time is never a guarantee that all inodes/pages dirtied
> before 30s will be timely written to disk. It's better interpreted in
> the opposite way: when under the dirty_background_ratio threshold and
> hence background writeout does not kick in, dirty inodes younger than
> 30s won't be written to disk by the flusher.
>
>> ii)  Even after your rigorous checking, the bdi_writeback_thread()
>> will still do a schedule_timeout()
>>      with the global value. Will your current solution then handle
>> Artem's disk removal scenario ?
>>      Else, you start using your value in the schedule_timeout() call
>> in the bdi_writeback_thread()
>>      function, which brings us back to the interval phenomenon I was
>> talking about.
>
> wb_writeback() will keep running as long as over_bground_thresh().
>
> The flusher will keep writing as long as there are more works, since
> there is a
>
>                if (!list_empty(&bdi->work_list))
>                        continue;
>
> before the schedule_timeout() call.
>
> And the flusher thread will always be woken up in a timely manner from
> balance_dirty_pages().
>
> So schedule_timeout() won't block in the way at all.
>
>> Does this patch really help the user control exact time when the write
>> BIO is transferred from the
>> MM to the Block layer assuming balance_dirty_pages() is not called ?
>
> It would be a serious bug if balance_dirty_pages() is somehow not
> called. But note that balance_dirty_pages() is designed to be called
> on every N pages to reduce overheads.
>
> Thanks,
> Fengguang
>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-19 17:20                       ` Kautuk Consul
  0 siblings, 0 replies; 47+ messages in thread
From: Kautuk Consul @ 2011-08-19 17:20 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Hi Wu,

You're right, the BDI threads should be woken up reliably by the
balance_dirty_pages() and balance_dirty_pages()
needs to be called from all code that is responsible for dirtying the pages.
Sorry, I was not too aware of the balance_dirty_pages() functionality
and the way it was being called in entirety or I would
have spotted this.

Thanks for adding the dirty_background_time into your
over_bground_thresh() formula.

Now that you seem to have included the time into the threshold, I can
relate to your patch better
as a solution for the problems I earlier mentioned.

Thanks again,
Kautuk.


On Fri, Aug 19, 2011 at 7:54 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> Hi Kautuk,
>
> On Fri, Aug 19, 2011 at 03:00:30PM +0800, Kautuk Consul wrote:
>> Hi Wu,
>>
>> Yes. I think I do understand your approach.
>>
>> Your aim is to always retain the per BDI timeout value.
>>
>> You want to check for threshholds by mathematically adjusting the
>> background time too
>> into your over_bground_thresh() formula so that your understanding
>> holds true always and also
>> affects the page dirtying scenario I mentioned.
>> This definitely helps and refines this scenario in terms of flushing
>> out of the dirty pages.
>
> Thanks.
>
>> Doubts:
>> i)   Your entire implementation seems to be dependent on someone
>> calling balance_dirty_pages()
>>      directly or indirectly. This function will call the
>> bdi_start_background_writeback() which wakes
>>      up the flusher thread.
>>      What about those page dirtying code paths which might not call
>> balance_dirty_pages ?
>>      Those paths then depend on the BDI thread periodically writing it
>> to disk and then we are again
>>      dependent on the writeback interval.
>>      Can we assume that the kernel will reliably call
>> balance_dirty_pages() whenever the pages
>>      are dirtied ? If that was true, then we would not need bdi
>> periodic writeback threads ever.
>
> Yes. The kernel need a way to limit the total number of dirty pages at
> any given time and to keep them under dirty_ratio/dirty_bytes.
>
> balance_dirty_pages() is such a central place to throttle the dirty
> pages. Whatever code path generating dirty pages are required to call
> into balance_dirty_pages_ratelimited_nr() which will in turn call
> balance_dirty_pages().
>
> So, the values specified by dirty_ratio/dirty_bytes will be executed
> effectively by balance_dirty_pages(). In contrast, the values
> specified by dirty_expire_centisecs is merely a parameter used by
> wb_writeback() to select the eligible inodes to do writeout. The 30s
> dirty expire time is never a guarantee that all inodes/pages dirtied
> before 30s will be timely written to disk. It's better interpreted in
> the opposite way: when under the dirty_background_ratio threshold and
> hence background writeout does not kick in, dirty inodes younger than
> 30s won't be written to disk by the flusher.
>
>> ii)  Even after your rigorous checking, the bdi_writeback_thread()
>> will still do a schedule_timeout()
>>      with the global value. Will your current solution then handle
>> Artem's disk removal scenario ?
>>      Else, you start using your value in the schedule_timeout() call
>> in the bdi_writeback_thread()
>>      function, which brings us back to the interval phenomenon I was
>> talking about.
>
> wb_writeback() will keep running as long as over_bground_thresh().
>
> The flusher will keep writing as long as there are more works, since
> there is a
>
>                if (!list_empty(&bdi->work_list))
>                        continue;
>
> before the schedule_timeout() call.
>
> And the flusher thread will always be woke up timely from
> balance_dirty_pages().
>
> So schedule_timeout() won't block in the way at all.
>
>> Does this patch really help the user control exact time when the write
>> BIO is transferred from the
>> MM to the Block layer assuming balance_dirty_pages() is not called ?
>
> It would be a serious bug if balance_dirty_pages() is somehow not
> called. But note that balance_dirty_pages() is designed to be called
> on every N pages to reduce overheads.
>
> Thanks,
> Fengguang
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
  2011-08-19 17:20                       ` Kautuk Consul
  (?)
@ 2011-08-21 14:11                         ` Wu Fengguang
  -1 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-21 14:11 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Hi Kautuk,

On Sat, Aug 20, 2011 at 01:20:52AM +0800, Kautuk Consul wrote:
> Hi Wu,
> 
> You're right, the BDI threads should be woken up reliably by the
> balance_dirty_pages() and balance_dirty_pages()
> needs to be called from all code that is responsible for dirtying the pages.
> Sorry, I was not too aware of the balance_dirty_pages() functionality
> and the way it was being called in entirety or I would
> have spotted this.

That's fine. One have to get to know the code bit by bit :)

> Thanks for adding the dirty_background_time into your
> over_bground_thresh() formula.
> 
> Now that you seem to have included the time into the threshold, I can
> relate to your patch better
> as a solution for the problems I earlier mentioned.

Great, thank you.

Thanks,
Fengguang

> On Fri, Aug 19, 2011 at 7:54 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> > Hi Kautuk,
> >
> > On Fri, Aug 19, 2011 at 03:00:30PM +0800, Kautuk Consul wrote:
> >> Hi Wu,
> >>
> >> Yes. I think I do understand your approach.
> >>
> >> Your aim is to always retain the per BDI timeout value.
> >>
> >> You want to check for threshholds by mathematically adjusting the
> >> background time too
> >> into your over_bground_thresh() formula so that your understanding
> >> holds true always and also
> >> affects the page dirtying scenario I mentioned.
> >> This definitely helps and refines this scenario in terms of flushing
> >> out of the dirty pages.
> >
> > Thanks.
> >
> >> Doubts:
> >> i)   Your entire implementation seems to be dependent on someone
> >> calling balance_dirty_pages()
> >>      directly or indirectly. This function will call the
> >> bdi_start_background_writeback() which wakes
> >>      up the flusher thread.
> >>      What about those page dirtying code paths which might not call
> >> balance_dirty_pages ?
> >>      Those paths then depend on the BDI thread periodically writing it
> >> to disk and then we are again
> >>      dependent on the writeback interval.
> >>      Can we assume that the kernel will reliably call
> >> balance_dirty_pages() whenever the pages
> >>      are dirtied ? If that was true, then we would not need bdi
> >> periodic writeback threads ever.
> >
> > Yes. The kernel need a way to limit the total number of dirty pages at
> > any given time and to keep them under dirty_ratio/dirty_bytes.
> >
> > balance_dirty_pages() is such a central place to throttle the dirty
> > pages. Whatever code path generating dirty pages are required to call
> > into balance_dirty_pages_ratelimited_nr() which will in turn call
> > balance_dirty_pages().
> >
> > So, the values specified by dirty_ratio/dirty_bytes will be executed
> > effectively by balance_dirty_pages(). In contrast, the values
> > specified by dirty_expire_centisecs is merely a parameter used by
> > wb_writeback() to select the eligible inodes to do writeout. The 30s
> > dirty expire time is never a guarantee that all inodes/pages dirtied
> > before 30s will be timely written to disk. It's better interpreted in
> > the opposite way: when under the dirty_background_ratio threshold and
> > hence background writeout does not kick in, dirty inodes younger than
> > 30s won't be written to disk by the flusher.
> >
> >> ii)  Even after your rigorous checking, the bdi_writeback_thread()
> >> will still do a schedule_timeout()
> >>      with the global value. Will your current solution then handle
> >> Artem's disk removal scenario ?
> >>      Else, you start using your value in the schedule_timeout() call
> >> in the bdi_writeback_thread()
> >>      function, which brings us back to the interval phenomenon I was
> >> talking about.
> >
> > wb_writeback() will keep running as long as over_bground_thresh().
> >
> > The flusher will keep writing as long as there are more works, since
> > there is a
> >
> >                if (!list_empty(&bdi->work_list))
> >                        continue;
> >
> > before the schedule_timeout() call.
> >
> > And the flusher thread will always be woke up timely from
> > balance_dirty_pages().
> >
> > So schedule_timeout() won't block in the way at all.
> >
> >> Does this patch really help the user control exact time when the write
> >> BIO is transferred from the
> >> MM to the Block layer assuming balance_dirty_pages() is not called ?
> >
> > It would be a serious bug if balance_dirty_pages() is somehow not
> > called. But note that balance_dirty_pages() is designed to be called
> > on every N pages to reduce overheads.
> >
> > Thanks,
> > Fengguang
> >

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-21 14:11                         ` Wu Fengguang
  0 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-21 14:11 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Hi Kautuk,

On Sat, Aug 20, 2011 at 01:20:52AM +0800, Kautuk Consul wrote:
> Hi Wu,
> 
> You're right, the BDI threads should be woken up reliably by the
> balance_dirty_pages() and balance_dirty_pages()
> needs to be called from all code that is responsible for dirtying the pages.
> Sorry, I was not too aware of the balance_dirty_pages() functionality
> and the way it was being called in entirety or I would
> have spotted this.

That's fine. One have to get to know the code bit by bit :)

> Thanks for adding the dirty_background_time into your
> over_bground_thresh() formula.
> 
> Now that you seem to have included the time into the threshold, I can
> relate to your patch better
> as a solution for the problems I earlier mentioned.

Great, thank you.

Thanks,
Fengguang

> On Fri, Aug 19, 2011 at 7:54 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> > Hi Kautuk,
> >
> > On Fri, Aug 19, 2011 at 03:00:30PM +0800, Kautuk Consul wrote:
> >> Hi Wu,
> >>
> >> Yes. I think I do understand your approach.
> >>
> >> Your aim is to always retain the per BDI timeout value.
> >>
> >> You want to check for threshholds by mathematically adjusting the
> >> background time too
> >> into your over_bground_thresh() formula so that your understanding
> >> holds true always and also
> >> affects the page dirtying scenario I mentioned.
> >> This definitely helps and refines this scenario in terms of flushing
> >> out of the dirty pages.
> >
> > Thanks.
> >
> >> Doubts:
> >> i)   Your entire implementation seems to be dependent on someone
> >> calling balance_dirty_pages()
> >>      directly or indirectly. This function will call the
> >> bdi_start_background_writeback() which wakes
> >>      up the flusher thread.
> >>      What about those page dirtying code paths which might not call
> >> balance_dirty_pages ?
> >>      Those paths then depend on the BDI thread periodically writing it
> >> to disk and then we are again
> >>      dependent on the writeback interval.
> >>      Can we assume that the kernel will reliably call
> >> balance_dirty_pages() whenever the pages
> >>      are dirtied ? If that was true, then we would not need bdi
> >> periodic writeback threads ever.
> >
> > Yes. The kernel need a way to limit the total number of dirty pages at
> > any given time and to keep them under dirty_ratio/dirty_bytes.
> >
> > balance_dirty_pages() is such a central place to throttle the dirty
> > pages. Whatever code path generating dirty pages are required to call
> > into balance_dirty_pages_ratelimited_nr() which will in turn call
> > balance_dirty_pages().
> >
> > So, the values specified by dirty_ratio/dirty_bytes will be executed
> > effectively by balance_dirty_pages(). In contrast, the values
> > specified by dirty_expire_centisecs is merely a parameter used by
> > wb_writeback() to select the eligible inodes to do writeout. The 30s
> > dirty expire time is never a guarantee that all inodes/pages dirtied
> > before 30s will be timely written to disk. It's better interpreted in
> > the opposite way: when under the dirty_background_ratio threshold and
> > hence background writeout does not kick in, dirty inodes younger than
> > 30s won't be written to disk by the flusher.
> >
> >> ii)  Even after your rigorous checking, the bdi_writeback_thread()
> >> will still do a schedule_timeout()
> >>      with the global value. Will your current solution then handle
> >> Artem's disk removal scenario ?
> >>      Else, you start using your value in the schedule_timeout() call
> >> in the bdi_writeback_thread()
> >>      function, which brings us back to the interval phenomenon I was
> >> talking about.
> >
> > wb_writeback() will keep running as long as over_bground_thresh().
> >
> > The flusher will keep writing as long as there are more works, since
> > there is a
> >
> >                if (!list_empty(&bdi->work_list))
> >                        continue;
> >
> > before the schedule_timeout() call.
> >
> > And the flusher thread will always be woke up timely from
> > balance_dirty_pages().
> >
> > So schedule_timeout() won't block in the way at all.
> >
> >> Does this patch really help the user control exact time when the write
> >> BIO is transferred from the
> >> MM to the Block layer assuming balance_dirty_pages() is not called ?
> >
> > It would be a serious bug if balance_dirty_pages() is somehow not
> > called. But note that balance_dirty_pages() is designed to be called
> > on every N pages to reduce overheads.
> >
> > Thanks,
> > Fengguang
> >
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval.
@ 2011-08-21 14:11                         ` Wu Fengguang
  0 siblings, 0 replies; 47+ messages in thread
From: Wu Fengguang @ 2011-08-21 14:11 UTC (permalink / raw)
  To: Kautuk Consul
  Cc: Artem Bityutskiy, Mel Gorman, KOSAKI Motohiro, linux-mm,
	linux-kernel, linux-fsdevel, Jan Kara, Dave Chinner, Greg Thelen

Hi Kautuk,

On Sat, Aug 20, 2011 at 01:20:52AM +0800, Kautuk Consul wrote:
> Hi Wu,
> 
> You're right, the BDI threads should be woken up reliably by the
> balance_dirty_pages() and balance_dirty_pages()
> needs to be called from all code that is responsible for dirtying the pages.
> Sorry, I was not too aware of the balance_dirty_pages() functionality
> and the way it was being called in entirety or I would
> have spotted this.

That's fine. One have to get to know the code bit by bit :)

> Thanks for adding the dirty_background_time into your
> over_bground_thresh() formula.
> 
> Now that you seem to have included the time into the threshold, I can
> relate to your patch better
> as a solution for the problems I earlier mentioned.

Great, thank you.

Thanks,
Fengguang

> On Fri, Aug 19, 2011 at 7:54 PM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> > Hi Kautuk,
> >
> > On Fri, Aug 19, 2011 at 03:00:30PM +0800, Kautuk Consul wrote:
> >> Hi Wu,
> >>
> >> Yes. I think I do understand your approach.
> >>
> >> Your aim is to always retain the per BDI timeout value.
> >>
> >> You want to check for threshholds by mathematically adjusting the
> >> background time too
> >> into your over_bground_thresh() formula so that your understanding
> >> holds true always and also
> >> affects the page dirtying scenario I mentioned.
> >> This definitely helps and refines this scenario in terms of flushing
> >> out of the dirty pages.
> >
> > Thanks.
> >
> >> Doubts:
> > >> i)   Your entire implementation seems to be dependent on someone
> > >> calling balance_dirty_pages()
> > >>      directly or indirectly. This function will call the
> > >> bdi_start_background_writeback() which wakes
> > >>      up the flusher thread.
> > >>      What about those page dirtying code paths which might not call
> > >> balance_dirty_pages ?
> > >>      Those paths then depend on the BDI thread periodically writing it
> > >> to disk and then we are again
> > >>      dependent on the writeback interval.
> > >>      Can we assume that the kernel will reliably call
> > >> balance_dirty_pages() whenever the pages
> > >>      are dirtied ? If that was true, then we would not need bdi
> >> periodic writeback threads ever.
> >
> > Yes. The kernel need a way to limit the total number of dirty pages at
> > any given time and to keep them under dirty_ratio/dirty_bytes.
> >
> > balance_dirty_pages() is such a central place to throttle the dirty
> > pages. Whatever code path generating dirty pages are required to call
> > into balance_dirty_pages_ratelimited_nr() which will in turn call
> > balance_dirty_pages().
> >
> > So, the values specified by dirty_ratio/dirty_bytes will be executed
> > effectively by balance_dirty_pages(). In contrast, the values
> > specified by dirty_expire_centisecs is merely a parameter used by
> > wb_writeback() to select the eligible inodes to do writeout. The 30s
> > dirty expire time is never a guarantee that all inodes/pages dirtied
> > before 30s will be timely written to disk. It's better interpreted in
> > the opposite way: when under the dirty_background_ratio threshold and
> > hence background writeout does not kick in, dirty inodes younger than
> > 30s won't be written to disk by the flusher.
> >
> > >> ii)  Even after your rigorous checking, the bdi_writeback_thread()
> > >> will still do a schedule_timeout()
> > >>      with the global value. Will your current solution then handle
> > >> Artem's disk removal scenario ?
> > >>      Else, you start using your value in the schedule_timeout() call
> > >> in the bdi_writeback_thread()
> > >>      function, which brings us back to the interval phenomenon I was
> >> talking about.
> >
> > wb_writeback() will keep running as long as over_bground_thresh().
> >
> > The flusher will keep writing as long as there are more works, since
> > there is a
> >
> > >                if (!list_empty(&bdi->work_list))
> > >                        continue;
> >
> > before the schedule_timeout() call.
> >
> > And the flusher thread will always be woke up timely from
> > balance_dirty_pages().
> >
> > So schedule_timeout() won't block in the way at all.
> >
> >> Does this patch really help the user control exact time when the write
> >> BIO is transferred from the
> >> MM to the Block layer assuming balance_dirty_pages() is not called ?
> >
> > It would be a serious bug if balance_dirty_pages() is somehow not
> > called. But note that balance_dirty_pages() is designed to be called
> > on every N pages to reduce overheads.
> >
> > Thanks,
> > Fengguang
> >

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

end of thread, other threads:[~2011-08-21 14:11 UTC | newest]

Thread overview: 47+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-08-11 12:20 [PATCH] writeback: Per-block device bdi->dirty_writeback_interval and bdi->dirty_expire_interval Kautuk Consul
2011-08-11 12:20 ` Kautuk Consul
2011-08-18  9:48 ` Wu Fengguang
2011-08-18  9:48   ` Wu Fengguang
2011-08-18  9:51   ` Wu Fengguang
2011-08-18  9:51     ` Wu Fengguang
2011-08-18 11:28   ` Kautuk Consul
2011-08-18 11:28     ` Kautuk Consul
2011-08-18 12:55     ` Wu Fengguang
2011-08-18 12:55       ` Wu Fengguang
2011-08-18 12:14   ` Artem Bityutskiy
2011-08-18 12:14     ` Artem Bityutskiy
2011-08-18 12:14     ` Artem Bityutskiy
2011-08-18 12:35     ` Wu Fengguang
2011-08-18 12:35       ` Wu Fengguang
2011-08-18 15:26       ` Kautuk Consul
2011-08-18 15:26         ` Kautuk Consul
2011-08-19  2:17         ` Wu Fengguang
2011-08-19  2:17           ` Wu Fengguang
2011-08-19  2:17           ` Wu Fengguang
2011-08-18 13:13     ` Wu Fengguang
2011-08-18 13:13       ` Wu Fengguang
2011-08-18 16:25       ` Kautuk Consul
2011-08-18 16:25         ` Kautuk Consul
2011-08-19  2:34         ` Wu Fengguang
2011-08-19  2:34           ` Wu Fengguang
2011-08-19  4:38           ` Kautuk Consul
2011-08-19  4:38             ` Kautuk Consul
2011-08-19  5:28             ` Wu Fengguang
2011-08-19  5:28               ` Wu Fengguang
2011-08-19  6:08               ` Wu Fengguang
2011-08-19  6:08                 ` Wu Fengguang
2011-08-19  7:00                 ` Kautuk Consul
2011-08-19  7:00                   ` Kautuk Consul
2011-08-19  7:00                   ` Kautuk Consul
2011-08-19 14:24                   ` Wu Fengguang
2011-08-19 14:24                     ` Wu Fengguang
2011-08-19 17:20                     ` Kautuk Consul
2011-08-19 17:20                       ` Kautuk Consul
2011-08-21 14:11                       ` Wu Fengguang
2011-08-21 14:11                         ` Wu Fengguang
2011-08-21 14:11                         ` Wu Fengguang
2011-08-19 11:55       ` Artem Bityutskiy
2011-08-19 11:55         ` Artem Bityutskiy
2011-08-19 11:55         ` Artem Bityutskiy
2011-08-19 14:27         ` Wu Fengguang
2011-08-19 14:27           ` Wu Fengguang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.