* [PATCH 0/2 v2] Flexible proportions for BDIs
From: Jan Kara @ 2012-05-03 22:39 UTC
  To: linux-mm; +Cc: Wu Fengguang, peterz


  Hello,

  this is the second iteration of my patches for flexible proportions. Since
the previous submission, I've converted the BDI proportion calculations to
use flexible proportions, so now we can test the proportions in the kernel.
Fengguang, can you give them a run in your JBOD setup? You might try to tweak
VM_COMPLETIONS_PERIOD_LEN if things are fluctuating too much... I'm not yet
completely decided on how to set that constant. Thanks!

								Honza


* [PATCH 1/2] lib: Proportions with flexible period
From: Jan Kara @ 2012-05-03 22:39 UTC
  To: linux-mm; +Cc: Wu Fengguang, peterz, Jan Kara

Implement code for computing proportions of events of different types (like
the code in lib/proportions.c) but allowing periods to have different lengths.
This allows us to have aging periods of fixed wallclock time, which gives
better proportion estimates given the hugely varying throughput of different
devices. The previous scheme of measuring the aging period by the number of
events has the problem that a reasonable period length for a system with a
low-end USB stick is not a reasonable period length for a system with a
high-end storage array, resulting in either too slow or too fluctuating
proportion updates.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/flex_proportions.h |   91 +++++++++++++++
 lib/Makefile                     |    2 +-
 lib/flex_proportions.c           |  238 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 330 insertions(+), 1 deletions(-)
 create mode 100644 include/linux/flex_proportions.h
 create mode 100644 lib/flex_proportions.c
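
For illustration (not part of the patch): a minimal single-threaded
user-space sketch of the aging scheme described above. All names are
invented for the example; the real code below additionally handles SMP
with percpu counters, a seqcount, and locking.

#include <stdio.h>
#include <limits.h>

struct global_prop { unsigned long events; unsigned int period; };
struct local_prop  { unsigned long events; unsigned int period; };

/* Lazily age a local counter so it reflects the current global period. */
static void reflect_period(struct global_prop *g, struct local_prop *l)
{
	unsigned int shift = g->period - l->period;

	if (shift) {
		l->events = shift < sizeof(long) * CHAR_BIT ?
					l->events >> shift : 0;
		l->period = g->period;
	}
}

/* Count one event of the given type: n_j++; d++. */
static void count_event(struct global_prop *g, struct local_prop *l)
{
	reflect_period(g, l);
	l->events++;
	g->events++;
}

/* Declare a new period: halve the denominator, bump the period number. */
static void new_period(struct global_prop *g)
{
	g->events -= g->events >> 1;
	g->period++;
}

int main(void)
{
	struct global_prop g = { .events = 1 };	/* 1 avoids a 0 denominator */
	struct local_prop a = { 0 }, b = { 0 };
	int i;

	for (i = 0; i < 100; i++)
		count_event(&g, &a);		/* period 0: only 'a' */
	new_period(&g);
	for (i = 0; i < 100; i++)
		count_event(&g, &b);		/* period 1: only 'b' */

	reflect_period(&g, &a);
	printf("a: %lu/%lu, b: %lu/%lu\n",	/* a: 50/151, b: 100/151 */
	       a.events, g.events, b.events, g.events);
	return 0;
}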

diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h
new file mode 100644
index 0000000..0c3c63f
--- /dev/null
+++ b/include/linux/flex_proportions.h
@@ -0,0 +1,91 @@
+/*
+ * Floating proportions with flexible aging period
+ *
+ *  Copyright (C) 2011, SUSE, Jan Kara <jack@suse.cz>
+ */
+
+#ifndef _LINUX_FLEX_PROPORTIONS_H
+#define _LINUX_FLEX_PROPORTIONS_H
+
+#include <linux/percpu_counter.h>
+#include <linux/spinlock.h>
+#include <linux/seqlock.h>
+
+/*
+ * ---- Global proportion definitions ----
+ */
+struct fprop_global {
+	/* Number of events in the current period */
+	struct percpu_counter events;
+	/* Current period */
+	unsigned int period;
+	/* Synchronization with period transitions */
+	seqcount_t sequence;
+};
+
+int fprop_global_init(struct fprop_global *p);
+void fprop_global_destroy(struct fprop_global *p);
+void fprop_new_period(struct fprop_global *p);
+
+/*
+ *  ---- SINGLE ----
+ */
+struct fprop_local_single {
+	/* the local events counter */
+	unsigned long events;
+	/* Period in which we last updated events */
+	unsigned int period;
+	raw_spinlock_t lock;	/* Protect period and numerator */
+};
+
+#define INIT_FPROP_LOCAL_SINGLE(name)			\
+{	.lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock),	\
+}
+
+int fprop_local_init_single(struct fprop_local_single *pl);
+void fprop_local_destroy_single(struct fprop_local_single *pl);
+void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl);
+void fprop_fraction_single(struct fprop_global *p,
+	struct fprop_local_single *pl, unsigned long *numerator,
+	unsigned long *denominator);
+
+static inline
+void fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__fprop_inc_single(p, pl);
+	local_irq_restore(flags);
+}
+
+/*
+ * ---- PERCPU ----
+ */
+struct fprop_local_percpu {
+	/* the local events counter */
+	struct percpu_counter events;
+	/* Period in which we last updated events */
+	unsigned int period;
+	raw_spinlock_t lock;	/* Protect period and numerator */
+};
+
+int fprop_local_init_percpu(struct fprop_local_percpu *pl);
+void fprop_local_destroy_percpu(struct fprop_local_percpu *pl);
+void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl);
+void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl,
+			    int max_frac);
+void fprop_fraction_percpu(struct fprop_global *p,
+	struct fprop_local_percpu *pl, unsigned long *numerator,
+	unsigned long *denominator);
+
+static inline
+void fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__fprop_inc_percpu(p, pl);
+	local_irq_restore(flags);
+}
+
+#endif
diff --git a/lib/Makefile b/lib/Makefile
index 18515f0..e144536 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -11,7 +11,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
 	 idr.o int_sqrt.o extable.o prio_tree.o \
 	 sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
-	 proportions.o prio_heap.o ratelimit.o show_mem.o \
+	 proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o
 
 lib-$(CONFIG_MMU) += ioremap.o
diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c
new file mode 100644
index 0000000..d3a2468
--- /dev/null
+++ b/lib/flex_proportions.c
@@ -0,0 +1,238 @@
+/*
+ *  Floating proportions with flexible aging period
+ *
+ *   Copyright (C) 2011, SUSE, Jan Kara <jack@suse.cz>
+ *
+ * The goal of this code is: Given different types of events, measure the
+ * proportion of each type over time. The proportions are measured with an
+ * exponentially decaying history to give smooth transitions. The formula
+ * expressing the proportion of events of type 'j' is:
+ *
+ *   p_{j} = (\Sum_{i>=0} x_{i,j}/2^{i+1})/(\Sum_{i>=0} x_i/2^{i+1})
+ *
+ * where x_{i,j} is the number of events of type j in the i-th last time
+ * period and x_i is the total number of events in the i-th last time period.
+ *
+ * Note that p_{j}'s are normalised, i.e.
+ *
+ *   \Sum_{j} p_{j} = 1.
+ *
+ * This formula can be straightforwardly computed by maintaining a denominator
+ * (let's call it 'd') and for each event type its numerator (let's call it
+ * 'n_j'). When an event of type 'j' happens, we simply need to do:
+ *   n_j++; d++;
+ *
+ * When a new period is declared, we could do:
+ *   d /= 2
+ *   for each j
+ *     n_j /= 2
+ *
+ * To avoid iterating over all event types, we instead shift the numerator of
+ * event j lazily when someone asks for the proportion of event j or when an
+ * event of type j occurs. This is trivially implemented by remembering the
+ * last period in which the proportion of type j was updated.
+ */
+#include <linux/flex_proportions.h>
+
+int fprop_global_init(struct fprop_global *p)
+{
+	int err;
+
+	p->period = 0;
+	/* Use 1 to avoid dealing with periods with 0 events... */
+	err = percpu_counter_init(&p->events, 1);
+	if (err)
+		return err;
+	seqcount_init(&p->sequence);
+	return 0;
+}
+
+void fprop_global_destroy(struct fprop_global *p)
+{
+	percpu_counter_destroy(&p->events);
+}
+
+/*
+ * Declare a new period. It is up to the caller to make sure two period
+ * transitions cannot happen in parallel.
+ */
+void fprop_new_period(struct fprop_global *p)
+{
+	u64 events = percpu_counter_sum(&p->events);
+
+	/*
+	 * Don't do anything if there are no events.
+	 */
+	if (events <= 1)
+		return;
+	write_seqcount_begin(&p->sequence);
+	/* We use addition to avoid losing events happening between sum and set. */
+	percpu_counter_add(&p->events, -(events >> 1));
+	p->period++;
+	write_seqcount_end(&p->sequence);
+}
+
+/*
+ * ---- SINGLE ----
+ */
+
+int fprop_local_init_single(struct fprop_local_single *pl)
+{
+	pl->events = 0;
+	pl->period = 0;
+	raw_spin_lock_init(&pl->lock);
+	return 0;
+}
+
+void fprop_local_destroy_single(struct fprop_local_single *pl)
+{
+}
+
+static void fprop_reflect_period_single(struct fprop_global *p,
+					struct fprop_local_single *pl)
+{
+	unsigned int period = p->period;
+	unsigned long flags;
+
+	/* Fast path - period didn't change */
+	if (pl->period == period)
+		return;
+	raw_spin_lock_irqsave(&pl->lock, flags);
+	/* Someone updated pl->period while we were spinning? */
+	if (pl->period >= period) {
+		raw_spin_unlock_irqrestore(&pl->lock, flags);
+		return;
+	}
+	/* Aging zeroed our fraction? */
+	if (period - pl->period < BITS_PER_LONG)
+		pl->events >>= period - pl->period;
+	else
+		pl->events = 0;
+	pl->period = period;
+	raw_spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/* Event of type pl happened */
+void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl)
+{
+	fprop_reflect_period_single(p, pl);
+	pl->events++;
+	percpu_counter_add(&p->events, 1);
+}
+
+/* Return fraction of events of type pl */
+void fprop_fraction_single(struct fprop_global *p,
+			   struct fprop_local_single *pl,
+			   unsigned long *numerator, unsigned long *denominator)
+{
+	unsigned int seq;
+	s64 den;
+
+	do {
+		seq = read_seqcount_begin(&p->sequence);
+		fprop_reflect_period_single(p, pl);
+		*numerator = pl->events;
+		den = percpu_counter_read(&p->events);
+		if (den <= 0)
+			den = percpu_counter_sum(&p->events);
+		*denominator = den;
+	} while (read_seqcount_retry(&p->sequence, seq));
+}
+
+/*
+ * ---- PERCPU ----
+ */
+#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids)))
+
+int fprop_local_init_percpu(struct fprop_local_percpu *pl)
+{
+	int err;
+
+	err = percpu_counter_init(&pl->events, 0);
+	if (err)
+		return err;
+	pl->period = 0;
+	raw_spin_lock_init(&pl->lock);
+	return 0;
+}
+
+void fprop_local_destroy_percpu(struct fprop_local_percpu *pl)
+{
+	percpu_counter_destroy(&pl->events);
+}
+
+static void fprop_reflect_period_percpu(struct fprop_global *p,
+					struct fprop_local_percpu *pl)
+{
+	unsigned int period = p->period;
+	unsigned long flags;
+
+	/* Fast path - period didn't change */
+	if (pl->period == period)
+		return;
+	raw_spin_lock_irqsave(&pl->lock, flags);
+	/* Someone updated pl->period while we were spinning? */
+	if (pl->period >= period) {
+		raw_spin_unlock_irqrestore(&pl->lock, flags);
+		return;
+	}
+	/* Aging zeroed our fraction? */
+	if (period - pl->period < BITS_PER_LONG) {
+		s64 val = percpu_counter_read(&pl->events);
+
+		if (val < (nr_cpu_ids * PROP_BATCH))
+			val = percpu_counter_sum(&pl->events);
+
+		__percpu_counter_add(&pl->events,
+			-val + (val >> (period-pl->period)), PROP_BATCH);
+	} else
+		percpu_counter_set(&pl->events, 0);
+	pl->period = period;
+	raw_spin_unlock_irqrestore(&pl->lock, flags);
+}
+
+/* Event of type pl happened */
+void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
+{
+	fprop_reflect_period_percpu(p, pl);
+	__percpu_counter_add(&pl->events, 1, PROP_BATCH);
+	percpu_counter_add(&p->events, 1);
+}
+
+void fprop_fraction_percpu(struct fprop_global *p,
+			   struct fprop_local_percpu *pl,
+			   unsigned long *numerator, unsigned long *denominator)
+{
+	unsigned int seq;
+	s64 den;
+
+	do {
+		seq = read_seqcount_begin(&p->sequence);
+		fprop_reflect_period_percpu(p, pl);
+		*numerator = percpu_counter_read_positive(&pl->events);
+		den = percpu_counter_read(&p->events);
+		if (den <= 0)
+			den = percpu_counter_sum(&p->events);
+		*denominator = den;
+	} while (read_seqcount_retry(&p->sequence, seq));
+}
+
+/*
+ * Like __fprop_inc_percpu() except that the event is counted only if the
+ * given type has a fraction smaller than @max_frac/100.
+ */
+void __fprop_inc_percpu_max(struct fprop_global *p,
+			    struct fprop_local_percpu *pl, int max_frac)
+{
+	if (unlikely(max_frac < 100)) {
+		unsigned long numerator, denominator;
+
+		fprop_fraction_percpu(p, pl, &numerator, &denominator);
+		if (numerator > ((long long)denominator) * max_frac / 100)
+			return;
+	} else
+		fprop_reflect_period_percpu(p, pl);
+	__percpu_counter_add(&pl->events, 1, PROP_BATCH);
+	percpu_counter_add(&p->events, 1);
+}
+
-- 
1.7.1
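
As a usage illustration (again not part of the patch; the event types and
the aging hook are invented for the example), this is the calling
convention the API above is designed for:

#include <linux/flex_proportions.h>

static struct fprop_global ctl;
static struct fprop_local_percpu reads, writes;

static int example_init(void)
{
	int err;

	err = fprop_global_init(&ctl);
	if (err)
		return err;
	err = fprop_local_init_percpu(&reads);
	if (err)
		goto out_global;
	err = fprop_local_init_percpu(&writes);
	if (err)
		goto out_reads;
	return 0;
out_reads:
	fprop_local_destroy_percpu(&reads);
out_global:
	fprop_global_destroy(&ctl);
	return err;
}

/* Count one event; the non-__ variant disables interrupts itself. */
static void example_read_completed(void)
{
	fprop_inc_percpu(&ctl, &reads);
}

/* Call on a fixed wallclock schedule (e.g. from a timer) to age history. */
static void example_age(void)
{
	fprop_new_period(&ctl);
}

/* What fraction (in per mille) of recent events were reads? */
static unsigned long example_read_permille(void)
{
	unsigned long num, den;

	fprop_fraction_percpu(&ctl, &reads, &num, &den);
	return num * 1000 / den;	/* denominator stays >= 1: it starts at 1 */
}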


* [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions
From: Jan Kara @ 2012-05-03 22:39 UTC
  To: linux-mm; +Cc: Wu Fengguang, peterz, Jan Kara

Convert the calculations of the proportion of writeback each bdi does to the
new flexible proportion code. That allows us to use an aging period of fixed
wallclock time, which gives better proportion estimates given the hugely
varying throughput of different devices.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/backing-dev.h |    6 ++--
 mm/backing-dev.c            |    5 +--
 mm/page-writeback.c         |   80 ++++++++++++++++++++----------------------
 3 files changed, 43 insertions(+), 48 deletions(-)

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index b1038bd..64a3617 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -10,7 +10,7 @@
 
 #include <linux/percpu_counter.h>
 #include <linux/log2.h>
-#include <linux/proportions.h>
+#include <linux/flex_proportions.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
@@ -89,11 +89,11 @@ struct backing_dev_info {
 	unsigned long dirty_ratelimit;
 	unsigned long balanced_dirty_ratelimit;
 
-	struct prop_local_percpu completions;
+	struct fprop_local_percpu completions;
 	int dirty_exceeded;
 
 	unsigned int min_ratio;
-	unsigned int max_ratio, max_prop_frac;
+	unsigned int max_ratio;
 
 	struct bdi_writeback wb;  /* default writeback info for this bdi */
 	spinlock_t wb_lock;	  /* protects work_list */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index dd8e2aa..f3a2608 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -677,7 +677,6 @@ int bdi_init(struct backing_dev_info *bdi)
 
 	bdi->min_ratio = 0;
 	bdi->max_ratio = 100;
-	bdi->max_prop_frac = PROP_FRAC_BASE;
 	spin_lock_init(&bdi->wb_lock);
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	INIT_LIST_HEAD(&bdi->work_list);
@@ -700,7 +699,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->write_bandwidth = INIT_BW;
 	bdi->avg_write_bandwidth = INIT_BW;
 
-	err = prop_local_init_percpu(&bdi->completions);
+	err = fprop_local_init_percpu(&bdi->completions);
 
 	if (err) {
 err:
@@ -744,7 +743,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
 		percpu_counter_destroy(&bdi->bdi_stat[i]);
 
-	prop_local_destroy_percpu(&bdi->completions);
+	fprop_local_destroy_percpu(&bdi->completions);
 }
 EXPORT_SYMBOL(bdi_destroy);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 26adea8..c8e59da 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h> /* __set_page_dirty_buffers */
 #include <linux/pagevec.h>
+#include <linux/workqueue.h>
 #include <trace/events/writeback.h>
 
 /*
@@ -135,7 +136,18 @@ unsigned long global_dirty_limit;
  * measured in page writeback completions.
  *
  */
-static struct prop_descriptor vm_completions;
+static struct fprop_global vm_completions;
+
+static void vm_completions_period(struct work_struct *work);
+/* Work for aging of vm_completions */
+static DECLARE_DEFERRED_WORK(vm_completions_period_work, vm_completions_period);
+
+/*
+ * Length of the period for aging writeout fractions of bdis. This is an
+ * arbitrarily chosen number. The longer the period, the more slowly the
+ * fractions will reflect changes in the current writeout rate.
+ */
+#define VM_COMPLETIONS_PERIOD_LEN (HZ/2)
 
 /*
  * Work out the current dirty-memory clamping and background writeout
@@ -322,34 +334,6 @@ bool zone_dirty_ok(struct zone *zone)
 	       zone_page_state(zone, NR_WRITEBACK) <= limit;
 }
 
-/*
- * couple the period to the dirty_ratio:
- *
- *   period/2 ~ roundup_pow_of_two(dirty limit)
- */
-static int calc_period_shift(void)
-{
-	unsigned long dirty_total;
-
-	if (vm_dirty_bytes)
-		dirty_total = vm_dirty_bytes / PAGE_SIZE;
-	else
-		dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
-				100;
-	return 2 + ilog2(dirty_total - 1);
-}
-
-/*
- * update the period when the dirty threshold changes.
- */
-static void update_completion_period(void)
-{
-	int shift = calc_period_shift();
-	prop_change_shift(&vm_completions, shift);
-
-	writeback_set_ratelimit();
-}
-
 int dirty_background_ratio_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
@@ -383,7 +367,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
 
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
-		update_completion_period();
+		writeback_set_ratelimit();
 		vm_dirty_bytes = 0;
 	}
 	return ret;
@@ -398,7 +382,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
 
 	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
-		update_completion_period();
+		writeback_set_ratelimit();
 		vm_dirty_ratio = 0;
 	}
 	return ret;
@@ -411,8 +395,8 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
 static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
 {
 	__inc_bdi_stat(bdi, BDI_WRITTEN);
-	__prop_inc_percpu_max(&vm_completions, &bdi->completions,
-			      bdi->max_prop_frac);
+	__fprop_inc_percpu_max(&vm_completions, &bdi->completions,
+			       bdi->max_ratio);
 }
 
 void bdi_writeout_inc(struct backing_dev_info *bdi)
@@ -431,10 +415,18 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc);
 static void bdi_writeout_fraction(struct backing_dev_info *bdi,
 		long *numerator, long *denominator)
 {
-	prop_fraction_percpu(&vm_completions, &bdi->completions,
+	fprop_fraction_percpu(&vm_completions, &bdi->completions,
 				numerator, denominator);
 }
 
+
+static void vm_completions_period(struct work_struct *work)
+{
+	fprop_new_period(&vm_completions);
+	schedule_delayed_work(&vm_completions_period_work,
+			      VM_COMPLETIONS_PERIOD_LEN);
+}
+
 /*
  * bdi_min_ratio keeps the sum of the minimum dirty shares of all
  * registered backing devices, which, for obvious reasons, can not
@@ -471,12 +463,10 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
 		return -EINVAL;
 
 	spin_lock_bh(&bdi_lock);
-	if (bdi->min_ratio > max_ratio) {
+	if (bdi->min_ratio > max_ratio)
 		ret = -EINVAL;
-	} else {
+	else
 		bdi->max_ratio = max_ratio;
-		bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
-	}
 	spin_unlock_bh(&bdi_lock);
 
 	return ret;
@@ -1605,14 +1595,20 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
  */
 void __init page_writeback_init(void)
 {
-	int shift;
-
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
 
-	shift = calc_period_shift();
-	prop_descriptor_init(&vm_completions, shift);
+	fprop_global_init(&vm_completions);
+}
+
+/* This must be called only after workqueues are initialized */
+static int __init completions_period_init(void)
+{
+	schedule_delayed_work(&vm_completions_period_work,
+			      VM_COMPLETIONS_PERIOD_LEN);
+	return 0;
 }
+postcore_initcall(completions_period_init);
 
 /**
  * tag_pages_for_writeback - tag pages to be written by write_cache_pages
-- 
1.7.1


* Re: [PATCH 0/2 v2] Flexible proportions for BDIs
From: Fengguang Wu @ 2012-05-07 14:43 UTC
  To: Jan Kara; +Cc: linux-mm, peterz

[-- Attachment #1: Type: text/plain, Size: 6404 bytes --]

On Fri, May 04, 2012 at 12:39:18AM +0200, Jan Kara wrote:
> 
>   Hello,
> 
>   this is the second iteration of my patches for flexible proportions. Since
> the previous submission, I've converted the BDI proportion calculations to
> use flexible proportions, so now we can test the proportions in the kernel.
> Fengguang, can you give them a run in your JBOD setup? You might try to tweak
> VM_COMPLETIONS_PERIOD_LEN if things are fluctuating too much... I'm not yet
> completely decided on how to set that constant. Thanks!

Kara, I've got some results and it's working great. Overall performance
remains good. The default VM_COMPLETIONS_PERIOD_LEN = 0.5s is obviously
too small, so I tried increasing it to 3s and then 8s. Results for xfs
(which has most fluctuating IO completions and ditto for bdi_setpoint)
are attached. The XFS result of vanilla 3.3 is also attached. The
graphs are all for case bay/JBOD-2HDD-thresh=1000M/xfs-10dd.

Look at the gray "bdi setpoint" lines. The
VM_COMPLETIONS_PERIOD_LEN=8s kernel is able to achieve roughly the
same stable bdi_setpoint as the vanilla kernel, while being able to
adapt to the balanced bdi_setpoint much faster (actually now the
bdi_setpoint is immediately close to the balanced value when
balance_dirty_pages() starts throttling, while the vanilla kernel
takes about 20 seconds for bdi_setpoint to ramp up).

               3.4.0-rc2          3.4.0-rc2-prop8+
------------------------  ------------------------
                  195.86        +0.0%       195.87  bay/JBOD-2HDD-thresh=1000M/btrfs-10dd-1-3.4.0-rc2
                  196.68        +0.1%       196.81  bay/JBOD-2HDD-thresh=1000M/btrfs-1dd-1-3.4.0-rc2
                  187.39        -1.1%       185.28  bay/JBOD-2HDD-thresh=1000M/ext4-10dd-1-3.4.0-rc2
                  191.94        +0.7%       193.27  bay/JBOD-2HDD-thresh=1000M/ext4-1dd-1-3.4.0-rc2
                  193.01        -0.1%       192.76  bay/JBOD-2HDD-thresh=1000M/xfs-10dd-1-3.4.0-rc2
                  197.28        -0.1%       197.07  bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.4.0-rc2
                  197.09        -0.2%       196.74  bay/JBOD-2HDD-thresh=100M/btrfs-1dd-1-3.4.0-rc2
                  160.16        -1.2%       158.21  bay/JBOD-2HDD-thresh=100M/ext4-10dd-1-3.4.0-rc2
                  192.10        -0.1%       191.97  bay/JBOD-2HDD-thresh=100M/ext4-1dd-1-3.4.0-rc2
                  163.35        +0.3%       163.79  bay/JBOD-2HDD-thresh=100M/xfs-10dd-1-3.4.0-rc2
                  194.90        +0.0%       194.99  bay/JBOD-2HDD-thresh=100M/xfs-1dd-1-3.4.0-rc2
                  191.10        +0.2%       191.53  bay/RAID0-2HDD-thresh=1000M/btrfs-1dd-1-3.4.0-rc2
                  183.38        +2.6%       188.12  bay/RAID0-2HDD-thresh=1000M/ext4-1dd-1-3.4.0-rc2
                  196.02        -0.0%       195.99  bay/RAID0-2HDD-thresh=1000M/xfs-1dd-1-3.4.0-rc2
                  170.18        +0.3%       170.70  bay/RAID0-2HDD-thresh=100M/btrfs-1dd-1-3.4.0-rc2
                  180.79        +1.6%       183.72  bay/RAID0-2HDD-thresh=100M/ext4-1dd-1-3.4.0-rc2
                  189.00        -0.2%       188.68  bay/RAID0-2HDD-thresh=100M/xfs-1dd-1-3.4.0-rc2
                   97.57        +0.0%        97.61  bay/RAID1-2HDD-thresh=1000M/btrfs-1dd-1-3.4.0-rc2
                   96.69        +0.8%        97.50  bay/RAID1-2HDD-thresh=1000M/ext4-1dd-1-3.4.0-rc2
                   96.99        +0.9%        97.86  bay/RAID1-2HDD-thresh=1000M/xfs-1dd-1-3.4.0-rc2
                   97.53        +0.2%        97.71  bay/RAID1-2HDD-thresh=100M/btrfs-1dd-1-3.4.0-rc2
                   95.85        -0.0%        95.80  bay/RAID1-2HDD-thresh=100M/ext4-1dd-1-3.4.0-rc2
                   97.11        +0.1%        97.23  bay/RAID1-2HDD-thresh=100M/xfs-1dd-1-3.4.0-rc2
                   99.38        -0.0%        99.37  bay/thresh=1000M/btrfs-1dd-1-3.4.0-rc2
                   98.50        +0.2%        98.65  bay/thresh=1000M/ext4-1dd-1-3.4.0-rc2
                   97.54        -0.0%        97.50  bay/thresh=1000M/xfs-10dd-1-3.4.0-rc2
                   99.67        -0.1%        99.56  bay/thresh=1000M/xfs-1dd-1-3.4.0-rc2
                   99.34        -0.0%        99.33  bay/thresh=100M/btrfs-1dd-1-3.4.0-rc2
                   97.56        -0.1%        97.44  bay/thresh=100M/ext4-1dd-1-3.4.0-rc2
                   87.62        +0.2%        87.83  bay/thresh=100M/xfs-10dd-1-3.4.0-rc2
                   99.18        +0.0%        99.21  bay/thresh=100M/xfs-1dd-1-3.4.0-rc2
                   86.99        -0.8%        86.33  bay/thresh=10M/btrfs-1dd-1-3.4.0-rc2
                   91.65        -0.2%        91.42  bay/thresh=10M/ext4-1dd-1-3.4.0-rc2
                   69.54        +0.2%        69.72  bay/thresh=10M/xfs-10dd-1-3.4.0-rc2
                   91.73        +0.4%        92.10  bay/thresh=10M/xfs-1dd-1-3.4.0-rc2
                    3.31        -2.7%         3.22  bay/thresh=1M/btrfs-1dd-1-3.4.0-rc2
                   84.17        +0.7%        84.76  bay/thresh=1M/ext4-1dd-1-3.4.0-rc2
                   65.87        -2.2%        64.44  bay/thresh=1M/xfs-10dd-1-3.4.0-rc2
                   75.02        -0.6%        74.59  bay/thresh=1M/xfs-1dd-1-3.4.0-rc2
                 5109.05        +0.1%      5114.70  TOTAL write_bw

              3723146.77        +0.0%   3723782.19  TOTAL io_wkB_s
                12136.02        -0.3%     12096.63  TOTAL io_w_s
                 6246.22        +0.5%      6280.40  TOTAL io_wrqm_s
                    7.86        -5.6%         7.42  TOTAL io_rkB_s
                    2.00        -5.0%         1.90  TOTAL io_r_s
                    0.02       -38.5%         0.01  TOTAL io_rrqm_s
                35432.39        -0.0%     35417.66  TOTAL io_avgrq_sz
                 2804.24        +0.9%      2830.82  TOTAL io_avgqu_sz
                14045.59        +1.2%     14220.95  TOTAL io_await
                  150.05       +28.4%       192.65  TOTAL io_svctm
                14048.25        +1.3%     14223.99  TOTAL io_util
                   97.79        +0.4%        98.20  TOTAL cpu_user
                    0.00                      0.00  TOTAL cpu_nice
                 1150.46        -0.8%      1141.17  TOTAL cpu_system
                 1311.93        +0.5%      1319.01  TOTAL cpu_iowait
                    0.00                      0.00  TOTAL cpu_steal
                 1439.81        +0.1%      1441.63  TOTAL cpu_idle

Thanks,
Fengguang

[-- Attachment #2: balance_dirty_pages-pages+.png --]
[-- Type: image/png, Size: 1066658 bytes --]

[-- Attachment #3: balance_dirty_pages-pages+.png --]
[-- Type: image/png, Size: 800273 bytes --]

[-- Attachment #4: balance_dirty_pages-pages+.png --]
[-- Type: image/png, Size: 534200 bytes --]

[-- Attachment #5: balance_dirty_pages-pages+.png --]
[-- Type: image/png, Size: 602403 bytes --]


* Re: [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions
From: Fengguang Wu @ 2012-05-07 14:47 UTC
  To: Jan Kara; +Cc: linux-mm, peterz

On Fri, May 04, 2012 at 12:39:20AM +0200, Jan Kara wrote:
> Convert the calculations of the proportion of writeback each bdi does to the
> new flexible proportion code. That allows us to use an aging period of fixed
> wallclock time, which gives better proportion estimates given the hugely
> varying throughput of different devices.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  include/linux/backing-dev.h |    6 ++--
>  mm/backing-dev.c            |    5 +--
>  mm/page-writeback.c         |   80 ++++++++++++++++++++----------------------
>  3 files changed, 43 insertions(+), 48 deletions(-)

> +static void vm_completions_period(struct work_struct *work);
> +/* Work for aging of vm_completions */
> +static DECLARE_DEFERRED_WORK(vm_completions_period_work, vm_completions_period);

> +
> +static void vm_completions_period(struct work_struct *work)
> +{
> +	fprop_new_period(&vm_completions);
> +	schedule_delayed_work(&vm_completions_period_work,
> +			      VM_COMPLETIONS_PERIOD_LEN);
> +}
> +

Is it possible to optimize away the periodic work when there are no
disk writes?

Thanks,
Fengguang


* Re: [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions
From: Peter Zijlstra @ 2012-05-07 15:21 UTC
  To: Fengguang Wu; +Cc: Jan Kara, linux-mm

On Mon, 2012-05-07 at 22:47 +0800, Fengguang Wu wrote:
> On Fri, May 04, 2012 at 12:39:20AM +0200, Jan Kara wrote:
> > Convert the calculations of the proportion of writeback each bdi does to the
> > new flexible proportion code. That allows us to use an aging period of fixed
> > wallclock time, which gives better proportion estimates given the hugely
> > varying throughput of different devices.
> > 
> > Signed-off-by: Jan Kara <jack@suse.cz>
> > ---
> >  include/linux/backing-dev.h |    6 ++--
> >  mm/backing-dev.c            |    5 +--
> >  mm/page-writeback.c         |   80 ++++++++++++++++++++----------------------
> >  3 files changed, 43 insertions(+), 48 deletions(-)
> 
> > +static void vm_completions_period(struct work_struct *work);
> > +/* Work for aging of vm_completions */
> > +static DECLARE_DEFERRED_WORK(vm_completions_period_work, vm_completions_period);
> 
> > +
> > +static void vm_completions_period(struct work_struct *work)
> > +{
> > +	fprop_new_period(&vm_completions);
> > +	schedule_delayed_work(&vm_completions_period_work,
> > +			      VM_COMPLETIONS_PERIOD_LEN);
> > +}
> > +
> 
> Is it possible to optimize away the periodic work when there are no
> disk writes?

That should really be a timer; nothing in there requires scheduling, so
the entire addition of the workqueue muck is pure overhead.

You could keep a second period counter that tracks the last observed
period; whenever the period and last_observed_period are further apart
than BITS_PER_LONG, you can stop the timer.

You'll have to restart it when updating last_observed_period.
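
For concreteness, a rough sketch of that scheme (hypothetical names, not
code from this thread; races between the timer and the event path are
ignored here):

/* setup_timer(&writeout_period_timer, writeout_period, 0) at init time */
static struct timer_list writeout_period_timer;
static unsigned int last_observed_period;

static void writeout_period(unsigned long data)
{
	fprop_new_period(&vm_completions);
	/*
	 * Once the period has run BITS_PER_LONG ahead of the last period
	 * in which an event was seen, every remembered fraction has aged
	 * to zero, so there is nothing left to decay: let the timer die.
	 */
	if (vm_completions.period - last_observed_period < BITS_PER_LONG)
		mod_timer(&writeout_period_timer,
			  jiffies + VM_COMPLETIONS_PERIOD_LEN);
}

static void note_writeout_event(void)
{
	/* Restart the aging when events start flowing again. */
	last_observed_period = vm_completions.period;
	if (!timer_pending(&writeout_period_timer))
		mod_timer(&writeout_period_timer,
			  jiffies + VM_COMPLETIONS_PERIOD_LEN);
}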




* Re: [PATCH 0/2 v2] Flexible proportions for BDIs
From: Jan Kara @ 2012-05-09 11:37 UTC
  To: Fengguang Wu; +Cc: Jan Kara, linux-mm, peterz

  Hello,

On Mon 07-05-12 22:43:44, Wu Fengguang wrote:
> On Fri, May 04, 2012 at 12:39:18AM +0200, Jan Kara wrote:
> >   this is the second iteration of my patches for flexible proportions. Since
> > the previous submission, I've converted the BDI proportion calculations to
> > use flexible proportions, so now we can test the proportions in the kernel.
> > Fengguang, can you give them a run in your JBOD setup? You might try to tweak
> > VM_COMPLETIONS_PERIOD_LEN if things are fluctuating too much... I'm not yet
> > completely decided on how to set that constant. Thanks!
> 
> Kara, I've got some results and it's working great. Overall performance
> remains good. The default VM_COMPLETIONS_PERIOD_LEN = 0.5s is obviously
> too small, so I tried increasing it to 3s and then 8s. Results for xfs
> (which has most fluctuating IO completions and ditto for bdi_setpoint)
> are attached. The XFS result of vanilla 3.3 is also attached. The
> graphs are all for case bay/JBOD-2HDD-thresh=1000M/xfs-10dd.
  Thanks for testing! I agree that the 0.5s period is probably on the low end.
OTOH 8s seems a bit too much. Consider two bdi's with vastly different
speeds - say their throughput ratio is 1:32 (e.g. a USB stick and a
RAID-backed storage array). When you write to the fast storage, then stop and
start writing to the USB stick, it will take 5 periods for the bdi writeout
ratio to become 1:1 and another 4-5 periods to get close to the real current
situation, which is no IO to the storage and 100% IO to the USB stick. So with
an 8s period this gives a total transition time of ~80s, which seems like too
much to me.
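
(To spell out the decay arithmetic: an idle bdi's remembered events are
halved each period, so after k periods it keeps a weight proportional to
x/2^k. Starting from a 32:1 ratio, log2(32) = 5 periods bring the idle
fast bdi's remembered weight down to roughly one period's worth of USB
stick events - ratio ~1:1 - and ~5 more push its fraction below 1/32,
i.e. ~10 periods total, which at 8s per period is the ~80s above.)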
 
> Look at the gray "bdi setpoint" lines. The
> VM_COMPLETIONS_PERIOD_LEN=8s kernel is able to achieve roughly the
> same stable bdi_setpoint as the vanilla kernel, while being able to
> > adapt to the balanced bdi_setpoint much faster (actually now the
> bdi_setpoint is immediately close to the balanced value when
> balance_dirty_pages() starts throttling, while the vanilla kernel
> > takes about 20 seconds for bdi_setpoint to ramp up).
  Which graph is from which kernel? All four graphs have the same name so
I'm not sure...

  The faster (almost immediate) initial adaptation to bdi's writeout fraction
is mostly an effect of better normalization with my patches. Although it is
pleasant, it happens just at the moment when there is a small number of
periods with non-zero number of events. So more important for practice is
in my opinion to compare transition of computed fractions when workload
changes (i.e. we start writing to one bdi while writing to another bdi or
so).

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR


* Re: [PATCH 2/2] block: Convert BDI proportion calculations to flexible proportions
From: Jan Kara @ 2012-05-09 11:38 UTC
  To: Peter Zijlstra; +Cc: Fengguang Wu, Jan Kara, linux-mm

On Mon 07-05-12 17:21:07, Peter Zijlstra wrote:
> On Mon, 2012-05-07 at 22:47 +0800, Fengguang Wu wrote:
> > On Fri, May 04, 2012 at 12:39:20AM +0200, Jan Kara wrote:
> > > Convert the calculations of the proportion of writeback each bdi does to the
> > > new flexible proportion code. That allows us to use an aging period of fixed
> > > wallclock time, which gives better proportion estimates given the hugely
> > > varying throughput of different devices.
> > > 
> > > Signed-off-by: Jan Kara <jack@suse.cz>
> > > ---
> > >  include/linux/backing-dev.h |    6 ++--
> > >  mm/backing-dev.c            |    5 +--
> > >  mm/page-writeback.c         |   80 ++++++++++++++++++++----------------------
> > >  3 files changed, 43 insertions(+), 48 deletions(-)
> > 
> > > +static void vm_completions_period(struct work_struct *work);
> > > +/* Work for aging of vm_completions */
> > > +static DECLARE_DEFERRED_WORK(vm_completions_period_work, vm_completions_period);
> > 
> > > +
> > > +static void vm_completions_period(struct work_struct *work)
> > > +{
> > > +	fprop_new_period(&vm_completions);
> > > +	schedule_delayed_work(&vm_completions_period_work,
> > > +			      VM_COMPLETIONS_PERIOD_LEN);
> > > +}
> > > +
> > 
> > Is it possible to optimize away the periodic work when there are no
> > disk writes?
> 
> That should really be a timer; nothing in there requires scheduling, so
> the entire addition of the workqueue muck is pure overhead.
> 
> You could keep a second period counter that tracks the last observed
> period; whenever the period and last_observed_period are further apart
> than BITS_PER_LONG, you can stop the timer.
> 
> You'll have to restart it when updating last_observed_period.
  Good points. I'll improve this in the next version.

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR


* Re: [PATCH 0/2 v2] Flexible proportions for BDIs
From: Fengguang Wu @ 2012-05-10  7:31 UTC
  To: Jan Kara; +Cc: linux-mm, peterz

On Wed, May 09, 2012 at 01:37:20PM +0200, Jan Kara wrote:
>   Hello,
> 
> On Mon 07-05-12 22:43:44, Wu Fengguang wrote:
> > On Fri, May 04, 2012 at 12:39:18AM +0200, Jan Kara wrote:
> > >   this is the second iteration of my patches for flexible proportions. Since
> > > the previous submission, I've converted the BDI proportion calculations to
> > > use flexible proportions, so now we can test the proportions in the kernel.
> > > Fengguang, can you give them a run in your JBOD setup? You might try to tweak
> > > VM_COMPLETIONS_PERIOD_LEN if things are fluctuating too much... I'm not yet
> > > completely decided on how to set that constant. Thanks!
> > 
> > Kara, I've got some results and it's working great. Overall performance
> > remains good. The default VM_COMPLETIONS_PERIOD_LEN = 0.5s is obviously
> > too small, so I tried increasing it to 3s and then 8s. Results for xfs
> > (which has most fluctuating IO completions and ditto for bdi_setpoint)
> > are attached. The XFS result of vanilla 3.3 is also attached. The
> > graphs are all for case bay/JBOD-2HDD-thresh=1000M/xfs-10dd.
>   Thanks for testing! I agree that the 0.5s period is probably on the low end.
> OTOH 8s seems a bit too much. Consider two bdi's with vastly different
> speeds - say their throughput ratio is 1:32 (e.g. a USB stick and a
> RAID-backed storage array). When you write to the fast storage, then stop and
> start writing to the USB stick, it will take 5 periods for the bdi writeout
> ratio to become 1:1 and another 4-5 periods to get close to the real current
> situation, which is no IO to the storage and 100% IO to the USB stick. So with
> an 8s period this gives a total transition time of ~80s, which seems like too
> much to me.

OK, got it.

> > Look at the gray "bdi setpoint" lines. The
> > VM_COMPLETIONS_PERIOD_LEN=8s kernel is able to achieve roughly the
> > same stable bdi_setpoint as the vanilla kernel, while being able to
> > adapt to the balanced bdi_setpoint much faster (actually now the
> > bdi_setpoint is immediately close to the balanced value when
> > balance_dirty_pages() starts throttling, while the vanilla kernel
> > takes about 20 seconds for bdi_setpoint to ramp up).
>   Which graph is from which kernel? All four graphs have the same name so
> I'm not sure...

They are for test cases:

0.5s period
        bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.4.0-rc2-prop+/balance_dirty_pages-pages+.png
3s period
        bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.4.0-rc2-prop3+/balance_dirty_pages-pages+.png
8s period
        bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.4.0-rc2-prop8+/balance_dirty_pages-pages+.png
vanilla
        bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.3.0/balance_dirty_pages-pages+.png

>   The faster (almost immediate) initial adaptation to bdi's writeout fraction
> is mostly an effect of better normalization with my patches. Although it is
> pleasant, it happens just at the moment when there is a small number of
> periods with non-zero number of events. So more important for practice is
> in my opinion to compare transition of computed fractions when workload
> changes (i.e. we start writing to one bdi while writing to another bdi or
> so).

OK. I'll test this scheme and report back.

        loop {
                dd to disk 1 for 30s
                dd to disk 2 for 30s
        }

Thanks,
Fengguang


* Re: [PATCH 0/2 v2] Flexible proportions for BDIs
From: Fengguang Wu @ 2012-05-11 14:51 UTC
  To: Jan Kara; +Cc: linux-mm, peterz

[-- Attachment #1: Type: text/plain, Size: 2191 bytes --]

> > > Look at the gray "bdi setpoint" lines. The
> > > VM_COMPLETIONS_PERIOD_LEN=8s kernel is able to achieve roughly the
> > > same stable bdi_setpoint as the vanilla kernel, while being able to
> > > adapt to the balanced bdi_setpoint much faster (actually now the
> > > bdi_setpoint is immediately close to the balanced value when
> > > balance_dirty_pages() starts throttling, while the vanilla kernel
> > > takes about 20 seconds for bdi_setpoint to ramp up).
> >   Which graph is from which kernel? All four graphs have the same name so
> > I'm not sure...
> 
> They are for test cases:
> 
> 0.5s period
>         bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.4.0-rc2-prop+/balance_dirty_pages-pages+.png
> 3s period
>         bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.4.0-rc2-prop3+/balance_dirty_pages-pages+.png
> 8s period
>         bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.4.0-rc2-prop8+/balance_dirty_pages-pages+.png
> vanilla
>         bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.3.0/balance_dirty_pages-pages+.png
> 
> >   The faster (almost immediate) initial adaptation to bdi's writeout fraction
> > is mostly an effect of better normalization with my patches. Although it is
> > pleasant, it happens just at the moment when there is a small number of
> > periods with non-zero number of events. So more important for practice is
> > in my opinion to compare transition of computed fractions when workload
> > changes (i.e. we start writing to one bdi while writing to another bdi or
> > so).
> 
> OK. I'll test this scheme and report back.
> 
>         loop {
>                 dd to disk 1 for 30s
>                 dd to disk 2 for 30s
>         }

Here are the new results. For simplicity I run the dd dirtiers
continuously, and start another dd reader to knock down the write
bandwidth from time to time:

         loop {
                 dd from disk 1 for 30s
                 dd from disk 2 for 30s
         }

The first attached iostat graph shows the resulting read/write
bandwidth for one of the two disks.

The following graphs are for
        - 3s period
        - 8s period
        - vanilla
in order. The test case is (xfs-1dd, mem=2GB, 2 disks JBOD).

Thanks,
Fengguang

[-- Attachment #2: iostat-bw.png --]
[-- Type: image/png, Size: 131686 bytes --]

[-- Attachment #3: balance_dirty_pages-pages.png --]
[-- Type: image/png, Size: 283309 bytes --]

[-- Attachment #4: balance_dirty_pages-pages.png --]
[-- Type: image/png, Size: 266833 bytes --]

[-- Attachment #5: balance_dirty_pages-pages.png --]
[-- Type: image/png, Size: 255150 bytes --]


* Re: [PATCH 0/2 v2] Flexible proportions for BDIs
From: Fengguang Wu @ 2012-05-13  3:29 UTC
  To: Jan Kara; +Cc: linux-mm, peterz

[-- Attachment #1: Type: text/plain, Size: 3117 bytes --]

On Fri, May 11, 2012 at 10:51:14PM +0800, Fengguang Wu wrote:
> > > > Look at the gray "bdi setpoint" lines. The
> > > > VM_COMPLETIONS_PERIOD_LEN=8s kernel is able to achieve roughly the
> > > > same stable bdi_setpoint as the vanilla kernel, while being able to
> > > > adapt to the balanced bdi_setpoint much faster (actually now the
> > > > bdi_setpoint is immediately close to the balanced value when
> > > > balance_dirty_pages() starts throttling, while the vanilla kernel
> > > > takes about 20 seconds for bdi_setpoint to ramp up).
> > >   Which graph is from which kernel? All four graphs have the same name so
> > > I'm not sure...
> > 
> > They are for test cases:
> > 
> > 0.5s period
> >         bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.4.0-rc2-prop+/balance_dirty_pages-pages+.png
> > 3s period
> >         bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.4.0-rc2-prop3+/balance_dirty_pages-pages+.png
> > 8s period
> >         bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.4.0-rc2-prop8+/balance_dirty_pages-pages+.png
> > vanilla
> >         bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.3.0/balance_dirty_pages-pages+.png
> > 
> > >   The faster (almost immediate) initial adaptation to bdi's writeout fraction
> > > is mostly an effect of better normalization with my patches. Although it is
> > > pleasant, it happens just at the moment when there is a small number of
> > > periods with non-zero number of events. So more important for practice is
> > > in my opinion to compare transition of computed fractions when workload
> > > changes (i.e. we start writing to one bdi while writing to another bdi or
> > > so).
> > 
> > OK. I'll test this scheme and report back.
> > 
> >         loop {
> >                 dd to disk 1 for 30s
> >                 dd to disk 2 for 30s
> >         }
> 
> Here are the new results. For simplicity I run the dd dirtiers
> continuously, and start another dd reader to knock down the write
> bandwidth from time to time:
> 
>          loop {
>                  dd from disk 1 for 30s
>                  dd from disk 2 for 30s
>          }
> 
> The first attached iostat graph shows the resulting read/write
> bandwidth for one of the two disks.
> 
> The following graphs are for
>         - 3s period
>         - 8s period
>         - vanilla
> in order. The test case is (xfs-1dd, mem=2GB, 2 disks JBOD).

Here are more results for another test box with mem=256G running 4
SSDs. This time I run 8 dd readers to better disturb the writes.

The first 3 graphs are for cases:

lkp-nex04/alternant_read_8/xfs-10dd-2-3.4.0-rc5-prop3+
lkp-nex04/alternant_read_8/xfs-10dd-2-3.4.0-rc5-prop8+
lkp-nex04/alternant_read_8/xfs-10dd-2-3.3.0

The last graph shows how the write bandwidth is squeezed by reads over time:

lkp-nex04/alternant_read_8/xfs-10dd-2-3.4.0-rc5-prop8+/iostat-bw.png

The observations for this box are

- the 3s and 8s periods result in roughly the same adaptation speed

- the patch makes a really *big* difference in systems with big
  memory:bandwidth ratio. It's sweet! In comparison, the vanilla
  kernel adapts to new write bandwidth so much slower.

Thanks,
Fengguang

[-- Attachment #2: balance_dirty_pages-pages.png --]
[-- Type: image/png, Size: 103745 bytes --]

[-- Attachment #3: balance_dirty_pages-pages.png --]
[-- Type: image/png, Size: 109200 bytes --]

[-- Attachment #4: balance_dirty_pages-pages.png --]
[-- Type: image/png, Size: 87825 bytes --]

[-- Attachment #5: iostat-bw.png --]
[-- Type: image/png, Size: 49078 bytes --]


* Re: [PATCH 0/2 v2] Flexible proportions for BDIs
From: Jan Kara @ 2012-05-14 21:12 UTC
  To: Fengguang Wu; +Cc: Jan Kara, linux-mm, peterz

On Fri 11-05-12 22:51:14, Wu Fengguang wrote:
> > > > Look at the gray "bdi setpoint" lines. The
> > > > VM_COMPLETIONS_PERIOD_LEN=8s kernel is able to achieve roughly the
> > > > same stable bdi_setpoint as the vanilla kernel, while being able to
> > > > adapt to the balanced bdi_setpoint much faster (actually now the
> > > > bdi_setpoint is immediately close to the balanced value when
> > > > balance_dirty_pages() starts throttling, while the vanilla kernel
> > > > takes about 20 seconds for bdi_setpoint to ramp up).
> > >   Which graph is from which kernel? All four graphs have the same name so
> > > I'm not sure...
> > 
> > They are for test cases:
> > 
> > 0.5s period
> >         bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.4.0-rc2-prop+/balance_dirty_pages-pages+.png
> > 3s period
> >         bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.4.0-rc2-prop3+/balance_dirty_pages-pages+.png
> > 8s period
> >         bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.4.0-rc2-prop8+/balance_dirty_pages-pages+.png
> > vanilla
> >         bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.3.0/balance_dirty_pages-pages+.png
> > 
> > >   The faster (almost immediate) initial adaptation to bdi's writeout fraction
> > > is mostly an effect of better normalization with my patches. Although it is
> > > pleasant, it happens just at the moment when there is a small number of
> > > periods with non-zero number of events. So more important for practice is
> > > in my opinion to compare transition of computed fractions when workload
> > > changes (i.e. we start writing to one bdi while writing to another bdi or
> > > so).
> > 
> > OK. I'll test this scheme and report back.
> > 
> >         loop {
> >                 dd to disk 1 for 30s
> >                 dd to disk 2 for 30s
> >         }
> 
> Here are the new results. For simplicity I run the dd dirtiers
> continuously, and start another dd reader to knock down the write
> bandwidth from time to time:
> 
>          loop {
>                  dd from disk 1 for 30s
>                  dd from disk 2 for 30s
>          }
> 
> The first attached iostat graph shows the resulting read/write
> bandwidth for one of the two disks.
> 
> The following graphs are for
>         - 3s period
>         - 8s period
>         - vanilla
> in order. The test case is (xfs-1dd, mem=2GB, 2 disks JBOD).
  Thanks for the test! So here the 3s period adapts to changed throughput
fairly quickly, similarly to the vanilla kernel, while the 8s period takes a
bit more time. Random variations in the computed proportions for the 3s
period are about the same as for the vanilla kernel and in a reasonable range
I'd say. For the 8s period the variations are even smaller, as expected.

So all in all I'd say the 3s period did fine here, although it did not offer
much benefit over the previous algorithm. The 8s period was a bit too slow to
adapt.

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR


* Re: [PATCH 0/2 v2] Flexible proportions for BDIs
From: Jan Kara @ 2012-05-14 21:28 UTC
  To: Fengguang Wu; +Cc: Jan Kara, linux-mm, peterz

On Sun 13-05-12 11:29:52, Wu Fengguang wrote:
> On Fri, May 11, 2012 at 10:51:14PM +0800, Fengguang Wu wrote:
> > > > > Look at the gray "bdi setpoint" lines. The
> > > > > VM_COMPLETIONS_PERIOD_LEN=8s kernel is able to achieve roughly the
> > > > > same stable bdi_setpoint as the vanilla kernel, while being able to
> > > > > adapt to the balanced bdi_setpoint much faster (actually now the
> > > > > bdi_setpoint is immediately close to the balanced value when
> > > > > balance_dirty_pages() starts throttling, while the vanilla kernel
> > > > > takes about 20 seconds for bdi_setpoint to ramp up).
> > > >   Which graph is from which kernel? All four graphs have the same name so
> > > > I'm not sure...
> > > 
> > > They are for test cases:
> > > 
> > > 0.5s period
> > >         bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.4.0-rc2-prop+/balance_dirty_pages-pages+.png
> > > 3s period
> > >         bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.4.0-rc2-prop3+/balance_dirty_pages-pages+.png
> > > 8s period
> > >         bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.4.0-rc2-prop8+/balance_dirty_pages-pages+.png
> > > vanilla
> > >         bay/JBOD-2HDD-thresh=1000M/xfs-1dd-1-3.3.0/balance_dirty_pages-pages+.png
> > > 
> > > >   The faster (almost immediate) initial adaptation to bdi's writeout fraction
> > > > is mostly an effect of better normalization with my patches. Although it is
> > > > pleasant, it happens just at the moment when there is a small number of
> > > > periods with non-zero number of events. So more important for practice is
> > > > in my opinion to compare transition of computed fractions when workload
> > > > changes (i.e. we start writing to one bdi while writing to another bdi or
> > > > so).
> > > 
> > > OK. I'll test this scheme and report back.
> > > 
> > >         loop {
> > >                 dd to disk 1 for 30s
> > >                 dd to disk 2 for 30s
> > >         }
> > 
> > Here are the new results. For simplicity I run the dd dirtiers
> > continuously, and start another dd reader to knock down the write
> > bandwidth from time to time:
> > 
> >          loop {
> >                  dd from disk 1 for 30s
> >                  dd from disk 2 for 30s
> >          }
> > 
> > The first attached iostat graph shows the resulting read/write
> > bandwidth for one of the two disks.
> > 
> > The following graphs are for
> >         - 3s period
> >         - 8s period
> >         - vanilla
> > in order. The test case is (xfs-1dd, mem=2GB, 2 disks JBOD).
> 
> Here are more results for another test box with mem=256G running 4
> SSDs. This time I run 8 dd readers to better disturb the writes.
> 
> The first 3 graphs are for cases:
> 
> lkp-nex04/alternant_read_8/xfs-10dd-2-3.4.0-rc5-prop3+
> lkp-nex04/alternant_read_8/xfs-10dd-2-3.4.0-rc5-prop8+
> lkp-nex04/alternant_read_8/xfs-10dd-2-3.3.0
> 
> The last graph shows how the write bandwidth is squeezed by reads over time:
> 
> lkp-nex04/alternant_read_8/xfs-10dd-2-3.4.0-rc5-prop8+/iostat-bw.png
> 
> The observations for this box are
> 
> > - the 3s and 8s periods result in roughly the same adaptation speed
> 
> - the patch makes a really *big* difference in systems with big
>   memory:bandwidth ratio. It's sweet! In comparison, the vanilla
>   kernel adapts to new write bandwidth so much slower.
  Yes, in this configuration the benefit of the new algorithm can be clearly
seen. Together with the results of the previous test I'd say the 3s period is
the best candidate.

  I was just thinking whether the period shouldn't be somehow set
automatically, because I'm not convinced 3s will be right for everybody...
Maybe something based on how big the fluctuations we observe in the
completion rate are. But it would be tricky given that the load itself
changes as well. So for now we'll have to live with a hardwired period I
guess.

  Thanks for the tests Fengguang! So is anybody against merging this?

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR


* Re: [PATCH 0/2 v2] Flexible proportions for BDIs
From: Peter Zijlstra @ 2012-05-15 11:12 UTC
  To: Jan Kara; +Cc: Fengguang Wu, linux-mm

On Mon, 2012-05-14 at 23:28 +0200, Jan Kara wrote:
> So is anybody against merging this?

I'd like to see the timer disable itself stuff first.. other than that,
no.


* Re: [PATCH 0/2 v2] Flexible proportions for BDIs
From: Fengguang Wu @ 2012-05-15 13:15 UTC
  To: Jan Kara; +Cc: linux-mm, peterz

Hi Jan,

> > The observations for this box are
> > 
> > - the 3s and 8s periods result in roughly the same adaption speed
> > 
> > - the patch makes a really *big* difference in systems with big
> >   memory:bandwidth ratio. It's sweet! In comparison, the vanilla
> >   kernel adapts to new write bandwidth so much slower.
>   Yes, in this configuration the benefit of the new algorithm can be clearly
> seen. Together with the results of the previous test I'd say the 3s period is
> the best candidate.
 
Agreed. I'm fine with the fixed 3s period. 

>   I was just thinking whether the period shouldn't be somehow set
> automatically, because I'm not convinced 3s will be right for everybody...
> Maybe something based on how big the fluctuations we observe in the
> completion rate are. But it would be tricky given that the load itself
> changes as well. So for now we'll have to live with a hardwired period I
> guess.

Yeah, simple fixed periods should be good enough.

>   Thanks for the tests Fengguang! So is anybody against merging this?

No problem for me once Peter's concern is addressed.

Thanks!

Fengguang


* Re: [PATCH 0/2 v2] Flexible proportions for BDIs
From: Jan Kara @ 2012-05-15 15:14 UTC
  To: Peter Zijlstra; +Cc: Jan Kara, Fengguang Wu, linux-mm

On Tue 15-05-12 13:12:12, Peter Zijlstra wrote:
> On Mon, 2012-05-14 at 23:28 +0200, Jan Kara wrote:
> > So is anybody against merging this?
> 
> I'd like to see the timer disable itself stuff first.. other than that,
> no.
  Ah, my fault. I have the code written already but forgot to send it...
I'll post v3 in a minute.

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

