* [PATCH 1/5] bcache: don't write back data if reading it failed
@ 2017-09-26 19:24 Michael Lyle
  2017-09-26 19:24 ` [PATCH 2/5] bcache: implement PI controller for writeback rate Michael Lyle
                   ` (4 more replies)
  0 siblings, 5 replies; 65+ messages in thread
From: Michael Lyle @ 2017-09-26 19:24 UTC (permalink / raw)
  To: linux-bcache; +Cc: i, Michael Lyle

If an IO operation fails, and we didn't successfully read data from the
cache, don't writeback invalid/partial data to the backing disk.

Signed-off-by: Michael Lyle <mlyle@lyle.org>
---
 drivers/md/bcache/writeback.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index e663ca082183..eea49bf36401 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -179,13 +179,20 @@ static void write_dirty(struct closure *cl)
 	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
 	struct keybuf_key *w = io->bio.bi_private;
 
-	dirty_init(w);
-	bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
-	io->bio.bi_iter.bi_sector = KEY_START(&w->key);
-	bio_set_dev(&io->bio, io->dc->bdev);
-	io->bio.bi_end_io	= dirty_endio;
+	/* IO errors are signalled using the dirty bit on the key.
+	 * If we failed to read, we should not attempt to write to the
+	 * backing device.  Instead, immediately go to write_dirty_finish
+	 * to clean up.
+	 */
+	if (KEY_DIRTY(&w->key)) {
+		dirty_init(w);
+		bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
+		io->bio.bi_iter.bi_sector = KEY_START(&w->key);
+		bio_set_dev(&io->bio, io->dc->bdev);
+		io->bio.bi_end_io	= dirty_endio;
 
-	closure_bio_submit(&io->bio, cl);
+		closure_bio_submit(&io->bio, cl);
+	}
 
 	continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
 }
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 65+ messages in thread
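
For readers following the error path: when the read from the cache fails, the
completion handler clears the dirty bit on the key (SET_KEY_DIRTY() in
dirty_endio(), as discussed later in this thread), and the patched
write_dirty() then skips the backing-device write.  A minimal standalone
sketch of that flow, using hypothetical simplified types rather than the real
bcache structures and closures:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical, simplified stand-in for bcache's keybuf_key; not the
 * real structure. */
struct fake_key {
        bool dirty;             /* KEY_DIRTY(): still needs writeback */
};

/* Models the read-completion path: a failed read clears the dirty bit,
 * which is how the error is signalled to write_dirty(). */
static void read_endio(struct fake_key *k, int error)
{
        if (error)
                k->dirty = false;
}

/* Models the patched write_dirty(): only touch the backing device if
 * the read actually produced valid data (key is still dirty). */
static void write_dirty(struct fake_key *k)
{
        if (k->dirty)
                printf("submitting write to backing device\n");
        else
                printf("read failed; skipping write, cleaning up\n");
}

int main(void)
{
        struct fake_key ok = { .dirty = true };
        struct fake_key bad = { .dirty = true };

        read_endio(&ok, 0);     /* successful read */
        read_endio(&bad, -5);   /* EIO-style failure */

        write_dirty(&ok);       /* written back */
        write_dirty(&bad);      /* skipped */
        return 0;
}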

* [PATCH 2/5] bcache: implement PI controller for writeback rate
  2017-09-26 19:24 [PATCH 1/5] bcache: don't write back data if reading it failed Michael Lyle
@ 2017-09-26 19:24 ` Michael Lyle
  2017-09-27  8:35   ` Coly Li
  2017-09-26 19:24 ` [PATCH 3/5] bcache: smooth writeback rate control Michael Lyle
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 65+ messages in thread
From: Michael Lyle @ 2017-09-26 19:24 UTC (permalink / raw)
  To: linux-bcache; +Cc: i, Michael Lyle

bcache uses a control system to attempt to keep the amount of dirty data
in cache at a user-configured level, while not responding excessively to
transients and variations in write rate.  Previously, the system was a
PD controller; but the output from it was integrated, turning the
Proportional term into an Integral term, and turning the Derivative term
into a crude Proportional term.  Performance of the controller has been
uneven in production, and it has tended to respond slowly, oscillate,
and overshoot.

This patch set replaces the current control system with an explicit PI
controller and tuning that should be correct for most hardware.  By
default, it attempts to write at a rate that would retire 1/40th of the
current excess blocks per second.  An integral term in turn works to
remove steady state errors.

IMO, this yields benefits in simplicity (removing weighted average
filtering, etc) and system performance.

Another small change is that a tunable parameter is introduced to allow the
user to specify a minimum rate at which dirty blocks are retired.

There is a slight difference from earlier versions of the patch in
integral handling to prevent excessive negative integral windup.

Signed-off-by: Michael Lyle <mlyle@lyle.org>
---
 drivers/md/bcache/bcache.h    |  9 ++---
 drivers/md/bcache/sysfs.c     | 18 +++++----
 drivers/md/bcache/writeback.c | 89 ++++++++++++++++++++++++-------------------
 3 files changed, 64 insertions(+), 52 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 2ed9bd231d84..eb83be693d60 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -265,9 +265,6 @@ struct bcache_device {
 	atomic_t		*stripe_sectors_dirty;
 	unsigned long		*full_dirty_stripes;
 
-	unsigned long		sectors_dirty_last;
-	long			sectors_dirty_derivative;
-
 	struct bio_set		*bio_split;
 
 	unsigned		data_csum:1;
@@ -362,12 +359,14 @@ struct cached_dev {
 
 	uint64_t		writeback_rate_target;
 	int64_t			writeback_rate_proportional;
-	int64_t			writeback_rate_derivative;
+	int64_t			writeback_rate_integral;
+	int64_t			writeback_rate_integral_scaled;
 	int64_t			writeback_rate_change;
 
 	unsigned		writeback_rate_update_seconds;
-	unsigned		writeback_rate_d_term;
+	unsigned		writeback_rate_i_term_inverse;
 	unsigned		writeback_rate_p_term_inverse;
+	unsigned		writeback_rate_minimum;
 };
 
 enum alloc_reserve {
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 104c57cd666c..eb493814759c 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -81,8 +81,9 @@ rw_attribute(writeback_delay);
 rw_attribute(writeback_rate);
 
 rw_attribute(writeback_rate_update_seconds);
-rw_attribute(writeback_rate_d_term);
+rw_attribute(writeback_rate_i_term_inverse);
 rw_attribute(writeback_rate_p_term_inverse);
+rw_attribute(writeback_rate_minimum);
 read_attribute(writeback_rate_debug);
 
 read_attribute(stripe_size);
@@ -130,15 +131,16 @@ SHOW(__bch_cached_dev)
 	sysfs_hprint(writeback_rate,	dc->writeback_rate.rate << 9);
 
 	var_print(writeback_rate_update_seconds);
-	var_print(writeback_rate_d_term);
+	var_print(writeback_rate_i_term_inverse);
 	var_print(writeback_rate_p_term_inverse);
+	var_print(writeback_rate_minimum);
 
 	if (attr == &sysfs_writeback_rate_debug) {
 		char rate[20];
 		char dirty[20];
 		char target[20];
 		char proportional[20];
-		char derivative[20];
+		char integral[20];
 		char change[20];
 		s64 next_io;
 
@@ -146,7 +148,7 @@ SHOW(__bch_cached_dev)
 		bch_hprint(dirty,	bcache_dev_sectors_dirty(&dc->disk) << 9);
 		bch_hprint(target,	dc->writeback_rate_target << 9);
 		bch_hprint(proportional,dc->writeback_rate_proportional << 9);
-		bch_hprint(derivative,	dc->writeback_rate_derivative << 9);
+		bch_hprint(integral,	dc->writeback_rate_integral_scaled << 9);
 		bch_hprint(change,	dc->writeback_rate_change << 9);
 
 		next_io = div64_s64(dc->writeback_rate.next - local_clock(),
@@ -157,11 +159,11 @@ SHOW(__bch_cached_dev)
 			       "dirty:\t\t%s\n"
 			       "target:\t\t%s\n"
 			       "proportional:\t%s\n"
-			       "derivative:\t%s\n"
+			       "integral:\t%s\n"
 			       "change:\t\t%s/sec\n"
 			       "next io:\t%llims\n",
 			       rate, dirty, target, proportional,
-			       derivative, change, next_io);
+			       integral, change, next_io);
 	}
 
 	sysfs_hprint(dirty_data,
@@ -213,7 +215,7 @@ STORE(__cached_dev)
 			    dc->writeback_rate.rate, 1, INT_MAX);
 
 	d_strtoul_nonzero(writeback_rate_update_seconds);
-	d_strtoul(writeback_rate_d_term);
+	d_strtoul(writeback_rate_i_term_inverse);
 	d_strtoul_nonzero(writeback_rate_p_term_inverse);
 
 	d_strtoi_h(sequential_cutoff);
@@ -319,7 +321,7 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_writeback_percent,
 	&sysfs_writeback_rate,
 	&sysfs_writeback_rate_update_seconds,
-	&sysfs_writeback_rate_d_term,
+	&sysfs_writeback_rate_i_term_inverse,
 	&sysfs_writeback_rate_p_term_inverse,
 	&sysfs_writeback_rate_debug,
 	&sysfs_dirty_data,
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index eea49bf36401..5a8f5655151b 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -25,48 +25,60 @@ static void __update_writeback_rate(struct cached_dev *dc)
 				bcache_flash_devs_sectors_dirty(c);
 	uint64_t cache_dirty_target =
 		div_u64(cache_sectors * dc->writeback_percent, 100);
-
 	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
 				   c->cached_dev_sectors);
 
-	/* PD controller */
-
+	/* PI controller:
+	 * Figures out the amount that should be written per second.
+	 *
+	 * First, the error (number of sectors that are dirty beyond our
+	 * target) is calculated.  The error is accumulated (numerically
+	 * integrated).
+	 *
+	 * Then, the proportional value and integral value are scaled
+	 * based on configured values.  These are stored as inverses to
+	 * avoid fixed point math and to make configuration easy-- e.g.
+	 * the default value of 40 for writeback_rate_p_term_inverse
+	 * attempts to write at a rate that would retire all the dirty
+	 * blocks in 40 seconds.
+	 *
+	 * The writeback_rate_i_inverse value of 10000 means that 1/10000th
+	 * of the error is accumulated in the integral term per second.
+	 * This acts as a slow, long-term average that is not subject to
+	 * variations in usage like the p term.
+	 */
 	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
-	int64_t derivative = dirty - dc->disk.sectors_dirty_last;
-	int64_t proportional = dirty - target;
-	int64_t change;
-
-	dc->disk.sectors_dirty_last = dirty;
-
-	/* Scale to sectors per second */
-
-	proportional *= dc->writeback_rate_update_seconds;
-	proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);
-
-	derivative = div_s64(derivative, dc->writeback_rate_update_seconds);
-
-	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
-			      (dc->writeback_rate_d_term /
-			       dc->writeback_rate_update_seconds) ?: 1, 0);
-
-	derivative *= dc->writeback_rate_d_term;
-	derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);
-
-	change = proportional + derivative;
+	int64_t error = dirty - target;
+	int64_t proportional_scaled =
+		div_s64(error, dc->writeback_rate_p_term_inverse);
+	int64_t integral_scaled, new_rate;
+
+	if ((error < 0 && dc->writeback_rate_integral > 0) ||
+	    (error > 0 && time_before64(local_clock(),
+			 dc->writeback_rate.next + NSEC_PER_MSEC))) {
+		/* Only decrease the integral term if it's more than
+		 * zero.  Only increase the integral term if the device
+		 * is keeping up.  (Don't wind up the integral
+		 * ineffectively in either case).
+		 *
+		 * It's necessary to scale this by
+		 * writeback_rate_update_seconds to keep the integral
+		 * term dimensioned properly.
+		 */
+		dc->writeback_rate_integral += error *
+			dc->writeback_rate_update_seconds;
+	}
 
-	/* Don't increase writeback rate if the device isn't keeping up */
-	if (change > 0 &&
-	    time_after64(local_clock(),
-			 dc->writeback_rate.next + NSEC_PER_MSEC))
-		change = 0;
+	integral_scaled = div_s64(dc->writeback_rate_integral,
+			dc->writeback_rate_i_term_inverse);
 
-	dc->writeback_rate.rate =
-		clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
-			1, NSEC_PER_MSEC);
+	new_rate = clamp_t(int64_t, (proportional_scaled + integral_scaled),
+			dc->writeback_rate_minimum, NSEC_PER_MSEC);
 
-	dc->writeback_rate_proportional = proportional;
-	dc->writeback_rate_derivative = derivative;
-	dc->writeback_rate_change = change;
+	dc->writeback_rate_proportional = proportional_scaled;
+	dc->writeback_rate_integral_scaled = integral_scaled;
+	dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
+	dc->writeback_rate.rate = new_rate;
 	dc->writeback_rate_target = target;
 }
 
@@ -498,8 +510,6 @@ void bch_sectors_dirty_init(struct bcache_device *d)
 
 	bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
 			   sectors_dirty_init_fn, 0);
-
-	d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
 }
 
 void bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -513,10 +523,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_percent		= 10;
 	dc->writeback_delay		= 30;
 	dc->writeback_rate.rate		= 1024;
+	dc->writeback_rate_minimum	= 1;
 
 	dc->writeback_rate_update_seconds = 5;
-	dc->writeback_rate_d_term	= 30;
-	dc->writeback_rate_p_term_inverse = 6000;
+	dc->writeback_rate_p_term_inverse = 40;
+	dc->writeback_rate_i_term_inverse = 10000;
 
 	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
 }
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 65+ messages in thread
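
To put concrete numbers on the controller described above: with the default
p_term_inverse of 40 and i_term_inverse of 10000, an excess of 400,000 dirty
sectors asks for 10,000 sectors/s from the proportional term alone, while the
integral term grows slowly on top of it.  A small user-space sketch of the
same arithmetic (values are illustrative; the kernel code uses
div_s64()/clamp_t() on the struct cached_dev fields shown in the diff):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC   1000000LL

static int64_t clamp64(int64_t v, int64_t lo, int64_t hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

/* Mirrors the arithmetic of __update_writeback_rate() with the new
 * defaults: p_term_inverse = 40, i_term_inverse = 10000, a 5 second
 * update interval and a minimum rate of 1 sector/s (8 after patch 3).
 * The error is held constant here just to show the two terms. */
int main(void)
{
        int64_t p_inverse = 40, i_inverse = 10000, update_seconds = 5;
        int64_t minimum = 1, integral = 0;
        int64_t error = 400000;         /* sectors dirty beyond target */

        for (int step = 0; step < 3; step++) {
                /* accumulate error (assumes the device is keeping up) */
                integral += error * update_seconds;

                int64_t p = error / p_inverse;     /* 1/40th per second */
                int64_t i = integral / i_inverse;  /* slow correction */
                int64_t rate = clamp64(p + i, minimum, NSEC_PER_MSEC);

                printf("step %d: p=%lld i=%lld rate=%lld sectors/s\n",
                       step, (long long)p, (long long)i, (long long)rate);
        }
        return 0;
}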

* [PATCH 3/5] bcache: smooth writeback rate control
  2017-09-26 19:24 [PATCH 1/5] bcache: don't write back data if reading it failed Michael Lyle
  2017-09-26 19:24 ` [PATCH 2/5] bcache: implement PI controller for writeback rate Michael Lyle
@ 2017-09-26 19:24 ` Michael Lyle
  2017-09-27  8:44   ` Coly Li
  2017-09-26 19:24 ` [PATCH 4/5] bcache: writeback: collapse contiguous IO better Michael Lyle
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 65+ messages in thread
From: Michael Lyle @ 2017-09-26 19:24 UTC (permalink / raw)
  To: linux-bcache; +Cc: i, Michael Lyle

This works in conjunction with the new PI controller.  Currently, in
real-world workloads, the rate controller attempts to write back 1
sector per second.  In practice, these minimum-rate writebacks are
between 4k and 60k in test scenarios, since bcache aggregates and
attempts to do contiguous writes and because filesystems on top of
bcache typically write 4k or more.

Previously, bcache guaranteed a writeback at least once per second.
This meant that the actual writeback rate would exceed the configured
amount by a factor of 8-120 or more.

This patch adjusts the writeback thread to be willing to sleep up to 2.5 seconds, and to
target writing 4k/second.  On the smallest writes, it will sleep 1
second like before, but many times it will sleep longer and load the
backing device less.  This keeps the loading on the cache and backing
device related to writeback more consistent when writing back at low
rates.

Signed-off-by: Michael Lyle <mlyle@lyle.org>
---
 drivers/md/bcache/util.c      | 10 ++++++++--
 drivers/md/bcache/writeback.c |  2 +-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 176d3c2ef5f5..4dbe37e82877 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -232,8 +232,14 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
 
 	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
 
-	if (time_before64(now + NSEC_PER_SEC, d->next))
-		d->next = now + NSEC_PER_SEC;
+	/* Bound the time.  Don't let us fall further than 2 seconds behind
+	 * (this prevents unnecessary backlog that would make it impossible
+	 * to catch up).  If we're ahead of the desired writeback rate,
+	 * don't let us sleep more than 2.5 seconds (so we can notice/respond
+	 * if the control system tells us to speed up!).
+	 */
+	if (time_before64(now + NSEC_PER_SEC * 5 / 2, d->next))
+		d->next = now + NSEC_PER_SEC * 5 / 2;
 
 	if (time_after64(now - NSEC_PER_SEC * 2, d->next))
 		d->next = now - NSEC_PER_SEC * 2;
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 5a8f5655151b..0b7c89813635 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -523,7 +523,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_percent		= 10;
 	dc->writeback_delay		= 30;
 	dc->writeback_rate.rate		= 1024;
-	dc->writeback_rate_minimum	= 1;
+	dc->writeback_rate_minimum	= 8;
 
 	dc->writeback_rate_update_seconds = 5;
 	dc->writeback_rate_p_term_inverse = 40;
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 65+ messages in thread
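
A quick sanity check of the new bounds: at the minimum rate of 8 sectors/s
(4k/s), a single 8-sector completion pushes d->next out by a full second, and
any larger burst is capped so the thread never sleeps more than 2.5 seconds
nor falls more than 2 seconds behind.  A rough user-space sketch of that
clamping (times in nanoseconds; the helper name here is illustrative, not the
kernel API):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC    1000000000ULL

/* Sketch of the bounding in bch_next_delay(): "done" sectors at "rate"
 * sectors/s advance the next-allowed time, which is then clamped to
 * [now - 2s, now + 2.5s].  Returns how long to sleep, in ns. */
static uint64_t next_delay(uint64_t *next, uint64_t now,
                           uint64_t done, uint64_t rate)
{
        *next += done * NSEC_PER_SEC / rate;

        if (*next > now + NSEC_PER_SEC * 5 / 2) /* never sleep > 2.5s */
                *next = now + NSEC_PER_SEC * 5 / 2;
        if (*next < now - NSEC_PER_SEC * 2)     /* never lag > 2s */
                *next = now - NSEC_PER_SEC * 2;

        return *next > now ? *next - now : 0;
}

int main(void)
{
        uint64_t now = 100 * NSEC_PER_SEC, next = now;

        /* 8 sectors (4k) written back at 8 sectors/s: ~1s sleep */
        printf("%.2fs\n", next_delay(&next, now, 8, 8) / 1e9);
        /* a 64-sector burst before time advances: capped at 2.5s */
        printf("%.2fs\n", next_delay(&next, now, 64, 8) / 1e9);
        return 0;
}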

* [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-09-26 19:24 [PATCH 1/5] bcache: don't write back data if reading it failed Michael Lyle
  2017-09-26 19:24 ` [PATCH 2/5] bcache: implement PI controller for writeback rate Michael Lyle
  2017-09-26 19:24 ` [PATCH 3/5] bcache: smooth writeback rate control Michael Lyle
@ 2017-09-26 19:24 ` Michael Lyle
  2017-09-26 19:24 ` [PATCH 5/5] bcache: writeback: properly order backing device IO Michael Lyle
  2017-09-27  8:22 ` [PATCH 1/5] bcache: don't write back data if reading it failed Coly Li
  4 siblings, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-09-26 19:24 UTC (permalink / raw)
  To: linux-bcache; +Cc: i, Michael Lyle

Previously, there was some logic that attempted to immediately issue
writeback of backing-contiguous blocks when the writeback rate was
fast.

The previous logic did not have any limits on the aggregate size it
would issue, nor on the number of keys it would combine at once.  It
would also discard the chance to do a contiguous write when the
writeback rate was low-- e.g. at "background" writeback of target
rate = 8, it would not combine two adjacent 4k writes and would
instead seek the disk twice.

This patch imposes limits and explicitly understands the size of
contiguous I/O during issue.  It also will combine contiguous I/O
in all circumstances, not just when writeback is requested to be
relatively fast.

It is a win on its own, but also lays the groundwork for skipping writes to
short keys to make the I/O more sequential/contiguous.

Signed-off-by: Michael Lyle <mlyle@lyle.org>
---
 drivers/md/bcache/bcache.h    |   6 --
 drivers/md/bcache/writeback.c | 131 ++++++++++++++++++++++++++++++------------
 2 files changed, 93 insertions(+), 44 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index eb83be693d60..da803a3b1981 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -321,12 +321,6 @@ struct cached_dev {
 	struct bch_ratelimit	writeback_rate;
 	struct delayed_work	writeback_rate_update;
 
-	/*
-	 * Internal to the writeback code, so read_dirty() can keep track of
-	 * where it's at.
-	 */
-	sector_t		last_read;
-
 	/* Limit number of writeback bios in flight */
 	struct semaphore	in_flight;
 	struct task_struct	*writeback_thread;
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 0b7c89813635..cf29c44605b9 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -229,10 +229,26 @@ static void read_dirty_submit(struct closure *cl)
 	continue_at(cl, write_dirty, io->dc->writeback_write_wq);
 }
 
+static inline bool keys_contiguous(struct cached_dev *dc,
+		struct keybuf_key *first, struct keybuf_key *second)
+{
+	if (PTR_CACHE(dc->disk.c, &second->key, 0)->bdev !=
+			PTR_CACHE(dc->disk.c, &first->key, 0)->bdev)
+		return false;
+
+	if (PTR_OFFSET(&second->key, 0) !=
+			(PTR_OFFSET(&first->key, 0) + KEY_SIZE(&first->key)))
+		return false;
+
+	return true;
+}
+
 static void read_dirty(struct cached_dev *dc)
 {
 	unsigned delay = 0;
-	struct keybuf_key *w;
+	struct keybuf_key *next, *keys[5], *w;
+	size_t size;
+	int nk, i;
 	struct dirty_io *io;
 	struct closure cl;
 
@@ -243,45 +259,84 @@ static void read_dirty(struct cached_dev *dc)
 	 * mempools.
 	 */
 
-	while (!kthread_should_stop()) {
-
-		w = bch_keybuf_next(&dc->writeback_keys);
-		if (!w)
-			break;
-
-		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
-
-		if (KEY_START(&w->key) != dc->last_read ||
-		    jiffies_to_msecs(delay) > 50)
-			while (!kthread_should_stop() && delay)
-				delay = schedule_timeout_interruptible(delay);
-
-		dc->last_read	= KEY_OFFSET(&w->key);
-
-		io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
-			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
-			     GFP_KERNEL);
-		if (!io)
-			goto err;
-
-		w->private	= io;
-		io->dc		= dc;
-
-		dirty_init(w);
-		bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
-		io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
-		bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
-		io->bio.bi_end_io	= read_dirty_endio;
-
-		if (bio_alloc_pages(&io->bio, GFP_KERNEL))
-			goto err_free;
-
-		trace_bcache_writeback(&w->key);
+	next = bch_keybuf_next(&dc->writeback_keys);
+
+	while (!kthread_should_stop() && next) {
+		size = 0;
+		nk = 0;
+
+		do {
+			BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));
+
+			/* Don't combine too many operations, even if they
+			 * are all small.
+			 */
+			if (nk >= 5)
+				break;
+
+			/* If the current operation is very large, don't
+			 * further combine operations.
+			 */
+			if (size > 5000)
+				break;
+
+			/* Operations are only eligible to be combined
+			 * if they are contiguous.
+			 *
+			 * TODO: add a heuristic willing to fire a
+			 * certain amount of non-contiguous IO per pass,
+			 * so that we can benefit from backing device
+			 * command queueing.
+			 */
+			if (nk != 0 && !keys_contiguous(dc, keys[nk-1], next))
+				break;
+
+			size += KEY_SIZE(&next->key);
+			keys[nk++] = next;
+		} while ((next = bch_keybuf_next(&dc->writeback_keys)));
+
+		/* Now we have gathered a set of 1..5 keys to write back. */
+
+		for (i = 0; i < nk; i++) {
+			w = keys[i];
+
+			io = kzalloc(sizeof(struct dirty_io) +
+				     sizeof(struct bio_vec) *
+				     DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+				     GFP_KERNEL);
+			if (!io)
+				goto err;
+
+			w->private	= io;
+			io->dc		= dc;
+
+			dirty_init(w);
+			bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
+			io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
+			bio_set_dev(&io->bio,
+				    PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
+			io->bio.bi_end_io	= read_dirty_endio;
+
+			if (bio_alloc_pages(&io->bio, GFP_KERNEL))
+				goto err_free;
+
+			trace_bcache_writeback(&w->key);
+
+			down(&dc->in_flight);
+
+			/* We've acquired a semaphore for the maximum
+			 * simultaneous number of writebacks; from here
+			 * everything happens asynchronously.
+			 */
+			closure_call(&io->cl, read_dirty_submit, NULL, &cl);
+		}
 
-		down(&dc->in_flight);
-		closure_call(&io->cl, read_dirty_submit, NULL, &cl);
+		delay = writeback_delay(dc, size);
 
-		delay = writeback_delay(dc, KEY_SIZE(&w->key));
+		while (!kthread_should_stop() && delay) {
+			schedule_timeout_interruptible(delay);
+			delay = writeback_delay(dc, 0);
+		}
 	}
 
 	if (0) {
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 65+ messages in thread
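
The gathering loop above stops at 5 keys or once roughly 5000 sectors have
been collected, and it only combines keys that live on the same cache device
with the second starting exactly where the first ends.  A tiny standalone
illustration of that rule, using a hypothetical simplified key type:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical, simplified stand-in for a bcache extent key. */
struct ext {
        int             dev;    /* cache device holding the data */
        uint64_t        offset; /* start sector on that device */
        uint64_t        size;   /* length in sectors */
};

/* Same rule as keys_contiguous(): same device, and the second extent
 * must begin exactly where the first one ends. */
static bool contiguous(const struct ext *a, const struct ext *b)
{
        return a->dev == b->dev && b->offset == a->offset + a->size;
}

int main(void)
{
        struct ext keys[] = {
                { 0, 1000, 8 }, { 0, 1008, 8 }, { 0, 1024, 8 },
        };
        size_t nk = 1, size = keys[0].size;

        /* mimic read_dirty(): keep combining while the keys stay
         * contiguous and the batch is under the 5-key/5000-sector caps */
        while (nk < 3 && nk < 5 && size <= 5000 &&
               contiguous(&keys[nk - 1], &keys[nk])) {
                size += keys[nk].size;
                nk++;
        }
        printf("combined %zu keys, %zu sectors\n", nk, size);
        return 0;
}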

* [PATCH 5/5] bcache: writeback: properly order backing device IO
  2017-09-26 19:24 [PATCH 1/5] bcache: don't write back data if reading it failed Michael Lyle
                   ` (2 preceding siblings ...)
  2017-09-26 19:24 ` [PATCH 4/5] bcache: writeback: collapse contiguous IO better Michael Lyle
@ 2017-09-26 19:24 ` Michael Lyle
  2017-09-27  8:22 ` [PATCH 1/5] bcache: don't write back data if reading it failed Coly Li
  4 siblings, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-09-26 19:24 UTC (permalink / raw)
  To: linux-bcache; +Cc: i, Michael Lyle

Writeback keys are presently iterated and dispatched for writeback in
order of the logical block address on the backing device.  Multiple keys
may be read from the cache device in parallel and then written back
(especially when the I/O is contiguous).

However-- there was no guarantee with the existing code that the writes
would be issued in LBA order, as the reads from the cache device are
often re-ordered.  In turn, when writing back quickly, the backing disk
often has to seek backwards-- this slows writeback and increases
utilization.

This patch introduces an ordering mechanism that guarantees that the
original order of issue is maintained for the write portion of the I/O.
Performance for writeback is significantly improved when there are
multiple contiguous keys or high writeback rates.

Signed-off-by: Michael Lyle <mlyle@lyle.org>
---
 drivers/md/bcache/bcache.h    |  7 +++++++
 drivers/md/bcache/writeback.c | 28 ++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index da803a3b1981..6ae6efda6c57 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -328,6 +328,13 @@ struct cached_dev {
 
 	struct keybuf		writeback_keys;
 
+	/* Order the write-half of writeback operations strongly in dispatch
+	 * order.  (Maintain LBA order; don't allow reads completing out of
+	 * order to re-order the writes...)
+	 */
+	struct closure_waitlist writeback_ordering_wait;
+	atomic_t		writeback_sequence_next;
+
 	/* For tracking sequential IO */
 #define RECENT_IO_BITS	7
 #define RECENT_IO	(1 << RECENT_IO_BITS)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index cf29c44605b9..3ca775fbae7f 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -112,6 +112,7 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
 struct dirty_io {
 	struct closure		cl;
 	struct cached_dev	*dc;
+	uint16_t		sequence;
 	struct bio		bio;
 };
 
@@ -190,6 +191,26 @@ static void write_dirty(struct closure *cl)
 {
 	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
 	struct keybuf_key *w = io->bio.bi_private;
+	struct cached_dev *dc = io->dc;
+
+	uint16_t next_sequence;
+
+	if (atomic_read(&dc->writeback_sequence_next) != io->sequence) {
+		/* Not our turn to write; wait for a write to complete */
+		closure_wait(&dc->writeback_ordering_wait, cl);
+
+		if (atomic_read(&dc->writeback_sequence_next) == io->sequence) {
+			/* Edge case-- it happened in indeterminate order
+			 * relative to when we were added to wait list..
+			 */
+			closure_wake_up(&dc->writeback_ordering_wait);
+		}
+
+		continue_at(cl, write_dirty, io->dc->writeback_write_wq);
+		return;
+	}
+
+	next_sequence = io->sequence + 1;
 
 	/* IO errors are signalled using the dirty bit on the key.
 	 * If we failed to read, we should not attempt to write to the
@@ -206,6 +227,9 @@ static void write_dirty(struct closure *cl)
 		closure_bio_submit(&io->bio, cl);
 	}
 
+	atomic_set(&dc->writeback_sequence_next, next_sequence);
+	closure_wake_up(&dc->writeback_ordering_wait);
+
 	continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
 }
 
@@ -251,7 +275,10 @@ static void read_dirty(struct cached_dev *dc)
 	int nk, i;
 	struct dirty_io *io;
 	struct closure cl;
+	uint16_t sequence = 0;
 
+	BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list));
+	atomic_set(&dc->writeback_sequence_next, sequence);
 	closure_init_stack(&cl);
 
 	/*
@@ -309,6 +336,7 @@ static void read_dirty(struct cached_dev *dc)
 
 			w->private	= io;
 			io->dc		= dc;
+			io->sequence    = sequence++;
 
 			dirty_init(w);
 			bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 65+ messages in thread
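
The ordering mechanism is essentially a ticket scheme: each dirty_io is
stamped with a sequence number when its read is dispatched, and write_dirty()
only proceeds when writeback_sequence_next matches, otherwise it parks on
writeback_ordering_wait and is woken once the preceding write advances the
counter.  A rough single-threaded sketch of the same idea (plain C with a
simulated out-of-order read completion; not the kernel closure machinery):

#include <stdbool.h>
#include <stdio.h>

#define NR      5

/* Reads for sequences 0..4 are dispatched in LBA order but complete
 * out of order, as reads from the cache device often do. */
static const int completion_order[NR] = { 2, 0, 1, 4, 3 };

int main(void)
{
        bool read_done[NR] = { false }; /* read finished, write pending */
        int next = 0;                   /* writeback_sequence_next */

        for (int c = 0; c < NR; c++) {
                read_done[completion_order[c]] = true;

                /* Like write_dirty(): only the I/O whose sequence equals
                 * "next" may issue its write; the rest wait.  Each write
                 * that completes unblocks the next one in dispatch order. */
                while (next < NR && read_done[next])
                        printf("issuing write for sequence %d\n", next++);
        }
        return 0;
}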

* Re: [PATCH 1/5] bcache: don't write back data if reading it failed
  2017-09-26 19:24 [PATCH 1/5] bcache: don't write back data if reading it failed Michael Lyle
                   ` (3 preceding siblings ...)
  2017-09-26 19:24 ` [PATCH 5/5] bcache: writeback: properly order backing device IO Michael Lyle
@ 2017-09-27  8:22 ` Coly Li
  2017-09-27  8:28   ` Michael Lyle
  4 siblings, 1 reply; 65+ messages in thread
From: Coly Li @ 2017-09-27  8:22 UTC (permalink / raw)
  To: Michael Lyle; +Cc: linux-bcache

On 2017/9/27 3:24 AM, Michael Lyle wrote:
> If an IO operation fails, and we didn't successfully read data from the
> cache, don't writeback invalid/partial data to the backing disk.
> 
> Signed-off-by: Michael Lyle <mlyle@lyle.org>
> ---
>  drivers/md/bcache/writeback.c | 19 +++++++++++++------
>  1 file changed, 13 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> index e663ca082183..eea49bf36401 100644
> --- a/drivers/md/bcache/writeback.c
> +++ b/drivers/md/bcache/writeback.c
> @@ -179,13 +179,20 @@ static void write_dirty(struct closure *cl)
>  	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
>  	struct keybuf_key *w = io->bio.bi_private;
>  
> -	dirty_init(w);
> -	bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
> -	io->bio.bi_iter.bi_sector = KEY_START(&w->key);
> -	bio_set_dev(&io->bio, io->dc->bdev);
> -	io->bio.bi_end_io	= dirty_endio;
> +	/* IO errors are signalled using the dirty bit on the key.
> +	 * If we failed to read, we should not attempt to write to the
> +	 * backing device.  Instead, immediately go to write_dirty_finish
> +	 * to clean up.
> +	 */
> +	if (KEY_DIRTY(&w->key)) {
Maybe it should be "if (!KEY_DIRTY(&w->key))" ? In dirty_endio(),
SET_KEY_DIRTY() is called only when bio->bi_status is not 0.


> +		dirty_init(w);
> +		bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
> +		io->bio.bi_iter.bi_sector = KEY_START(&w->key);
> +		bio_set_dev(&io->bio, io->dc->bdev);
> +		io->bio.bi_end_io	= dirty_endio;
>  
> -	closure_bio_submit(&io->bio, cl);
> +		closure_bio_submit(&io->bio, cl);
> +	}
>  
>  	continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
>  }
> 


-- 
Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 1/5] bcache: don't write back data if reading it failed
  2017-09-27  8:22 ` [PATCH 1/5] bcache: don't write back data if reading it failed Coly Li
@ 2017-09-27  8:28   ` Michael Lyle
  2017-09-27  8:40     ` Coly Li
  0 siblings, 1 reply; 65+ messages in thread
From: Michael Lyle @ 2017-09-27  8:28 UTC (permalink / raw)
  To: Coly Li; +Cc: linux-bcache

SET_KEY_DIRTY sets it to false when bi_status is not 0.

We do the write if KEY_DIRTY is true, else it is skipped.

Mike

On Wed, Sep 27, 2017 at 1:22 AM, Coly Li <i@coly.li> wrote:
> On 2017/9/27 3:24 AM, Michael Lyle wrote:
>> If an IO operation fails, and we didn't successfully read data from the
>> cache, don't writeback invalid/partial data to the backing disk.
>>
>> Signed-off-by: Michael Lyle <mlyle@lyle.org>
>> ---
>>  drivers/md/bcache/writeback.c | 19 +++++++++++++------
>>  1 file changed, 13 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
>> index e663ca082183..eea49bf36401 100644
>> --- a/drivers/md/bcache/writeback.c
>> +++ b/drivers/md/bcache/writeback.c
>> @@ -179,13 +179,20 @@ static void write_dirty(struct closure *cl)
>>       struct dirty_io *io = container_of(cl, struct dirty_io, cl);
>>       struct keybuf_key *w = io->bio.bi_private;
>>
>> -     dirty_init(w);
>> -     bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
>> -     io->bio.bi_iter.bi_sector = KEY_START(&w->key);
>> -     bio_set_dev(&io->bio, io->dc->bdev);
>> -     io->bio.bi_end_io       = dirty_endio;
>> +     /* IO errors are signalled using the dirty bit on the key.
>> +      * If we failed to read, we should not attempt to write to the
>> +      * backing device.  Instead, immediately go to write_dirty_finish
>> +      * to clean up.
>> +      */
>> +     if (KEY_DIRTY(&w->key)) {
> Maybe it should be "if (!KEY_DIRTY(&w->key))" ? In dirty_endio(),
> SET_KEY_DIRTY() is called only when bio->bi_status is not 0.
>
>
>> +             dirty_init(w);
>> +             bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
>> +             io->bio.bi_iter.bi_sector = KEY_START(&w->key);
>> +             bio_set_dev(&io->bio, io->dc->bdev);
>> +             io->bio.bi_end_io       = dirty_endio;
>>
>> -     closure_bio_submit(&io->bio, cl);
>> +             closure_bio_submit(&io->bio, cl);
>> +     }
>>
>>       continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
>>  }
>>
>
>
> --
> Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 2/5] bcache: implement PI controller for writeback rate
  2017-09-26 19:24 ` [PATCH 2/5] bcache: implement PI controller for writeback rate Michael Lyle
@ 2017-09-27  8:35   ` Coly Li
  2017-09-27  8:40     ` Michael Lyle
  0 siblings, 1 reply; 65+ messages in thread
From: Coly Li @ 2017-09-27  8:35 UTC (permalink / raw)
  To: Michael Lyle, linux-bcache

On 2017/9/27 3:24 AM, Michael Lyle wrote:
> bcache uses a control system to attempt to keep the amount of dirty data
> in cache at a user-configured level, while not responding excessively to
> transients and variations in write rate.  Previously, the system was a
> PD controller; but the output from it was integrated, turning the
> Proportional term into an Integral term, and turning the Derivative term
> into a crude Proportional term.  Performance of the controller has been
> uneven in production, and it has tended to respond slowly, oscillate,
> and overshoot.
> 
> This patch set replaces the current control system with an explicit PI
> controller and tuning that should be correct for most hardware.  By
> default, it attempts to write at a rate that would retire 1/40th of the
> current excess blocks per second.  An integral term in turn works to
> remove steady state errors.
> 
> IMO, this yields benefits in simplicity (removing weighted average
> filtering, etc) and system performance.
> 
> Another small change is a tunable parameter is introduced to allow the
> user to specify a minimum rate at which dirty blocks are retired.
> 
> There is a slight difference from earlier versions of the patch in
> integral handling to prevent excessive negative integral windup.
> 
> Signed-off-by: Michael Lyle <mlyle@lyle.org>

Reviewed-by: Coly Li <colyli@suse.de>

Last comment: could you please use the current code comment style in
bcache, like this,

/*
 * The allocation code needs gc_mark in struct bucket to be correct, but
 * it's not while a gc is in progress. Protected by bucket_lock.
 */

Or like this,

/*
 * For any bio we don't skip we subtract the number of sectors from
 * rescale; when it hits 0 we rescale all the bucket priorities.
 */

The rest looks OK to me. Thanks for your effort to implement a simpler
PI controller.

Coly Li

> ---
>  drivers/md/bcache/bcache.h    |  9 ++---
>  drivers/md/bcache/sysfs.c     | 18 +++++----
>  drivers/md/bcache/writeback.c | 89 ++++++++++++++++++++++++-------------------
>  3 files changed, 64 insertions(+), 52 deletions(-)
> 
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index 2ed9bd231d84..eb83be693d60 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -265,9 +265,6 @@ struct bcache_device {
>  	atomic_t		*stripe_sectors_dirty;
>  	unsigned long		*full_dirty_stripes;
>  
> -	unsigned long		sectors_dirty_last;
> -	long			sectors_dirty_derivative;
> -
>  	struct bio_set		*bio_split;
>  
>  	unsigned		data_csum:1;
> @@ -362,12 +359,14 @@ struct cached_dev {
>  
>  	uint64_t		writeback_rate_target;
>  	int64_t			writeback_rate_proportional;
> -	int64_t			writeback_rate_derivative;
> +	int64_t			writeback_rate_integral;
> +	int64_t			writeback_rate_integral_scaled;
>  	int64_t			writeback_rate_change;
>  
>  	unsigned		writeback_rate_update_seconds;
> -	unsigned		writeback_rate_d_term;
> +	unsigned		writeback_rate_i_term_inverse;
>  	unsigned		writeback_rate_p_term_inverse;
> +	unsigned		writeback_rate_minimum;
>  };
>  
>  enum alloc_reserve {
> diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
> index 104c57cd666c..eb493814759c 100644
> --- a/drivers/md/bcache/sysfs.c
> +++ b/drivers/md/bcache/sysfs.c
> @@ -81,8 +81,9 @@ rw_attribute(writeback_delay);
>  rw_attribute(writeback_rate);
>  
>  rw_attribute(writeback_rate_update_seconds);
> -rw_attribute(writeback_rate_d_term);
> +rw_attribute(writeback_rate_i_term_inverse);
>  rw_attribute(writeback_rate_p_term_inverse);
> +rw_attribute(writeback_rate_minimum);
>  read_attribute(writeback_rate_debug);
>  
>  read_attribute(stripe_size);
> @@ -130,15 +131,16 @@ SHOW(__bch_cached_dev)
>  	sysfs_hprint(writeback_rate,	dc->writeback_rate.rate << 9);
>  
>  	var_print(writeback_rate_update_seconds);
> -	var_print(writeback_rate_d_term);
> +	var_print(writeback_rate_i_term_inverse);
>  	var_print(writeback_rate_p_term_inverse);
> +	var_print(writeback_rate_minimum);
>  
>  	if (attr == &sysfs_writeback_rate_debug) {
>  		char rate[20];
>  		char dirty[20];
>  		char target[20];
>  		char proportional[20];
> -		char derivative[20];
> +		char integral[20];
>  		char change[20];
>  		s64 next_io;
>  
> @@ -146,7 +148,7 @@ SHOW(__bch_cached_dev)
>  		bch_hprint(dirty,	bcache_dev_sectors_dirty(&dc->disk) << 9);
>  		bch_hprint(target,	dc->writeback_rate_target << 9);
>  		bch_hprint(proportional,dc->writeback_rate_proportional << 9);
> -		bch_hprint(derivative,	dc->writeback_rate_derivative << 9);
> +		bch_hprint(integral,	dc->writeback_rate_integral_scaled << 9);
>  		bch_hprint(change,	dc->writeback_rate_change << 9);
>  
>  		next_io = div64_s64(dc->writeback_rate.next - local_clock(),
> @@ -157,11 +159,11 @@ SHOW(__bch_cached_dev)
>  			       "dirty:\t\t%s\n"
>  			       "target:\t\t%s\n"
>  			       "proportional:\t%s\n"
> -			       "derivative:\t%s\n"
> +			       "integral:\t%s\n"
>  			       "change:\t\t%s/sec\n"
>  			       "next io:\t%llims\n",
>  			       rate, dirty, target, proportional,
> -			       derivative, change, next_io);
> +			       integral, change, next_io);
>  	}
>  
>  	sysfs_hprint(dirty_data,
> @@ -213,7 +215,7 @@ STORE(__cached_dev)
>  			    dc->writeback_rate.rate, 1, INT_MAX);
>  
>  	d_strtoul_nonzero(writeback_rate_update_seconds);
> -	d_strtoul(writeback_rate_d_term);
> +	d_strtoul(writeback_rate_i_term_inverse);
>  	d_strtoul_nonzero(writeback_rate_p_term_inverse);
>  
>  	d_strtoi_h(sequential_cutoff);
> @@ -319,7 +321,7 @@ static struct attribute *bch_cached_dev_files[] = {
>  	&sysfs_writeback_percent,
>  	&sysfs_writeback_rate,
>  	&sysfs_writeback_rate_update_seconds,
> -	&sysfs_writeback_rate_d_term,
> +	&sysfs_writeback_rate_i_term_inverse,
>  	&sysfs_writeback_rate_p_term_inverse,
>  	&sysfs_writeback_rate_debug,
>  	&sysfs_dirty_data,
> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> index eea49bf36401..5a8f5655151b 100644
> --- a/drivers/md/bcache/writeback.c
> +++ b/drivers/md/bcache/writeback.c
> @@ -25,48 +25,60 @@ static void __update_writeback_rate(struct cached_dev *dc)
>  				bcache_flash_devs_sectors_dirty(c);
>  	uint64_t cache_dirty_target =
>  		div_u64(cache_sectors * dc->writeback_percent, 100);
> -
>  	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
>  				   c->cached_dev_sectors);
>  
> -	/* PD controller */
> -
> +	/* PI controller:
> +	 * Figures out the amount that should be written per second.
> +	 *
> +	 * First, the error (number of sectors that are dirty beyond our
> +	 * target) is calculated.  The error is accumulated (numerically
> +	 * integrated).
> +	 *
> +	 * Then, the proportional value and integral value are scaled
> +	 * based on configured values.  These are stored as inverses to
> +	 * avoid fixed point math and to make configuration easy-- e.g.
> +	 * the default value of 40 for writeback_rate_p_term_inverse
> +	 * attempts to write at a rate that would retire all the dirty
> +	 * blocks in 40 seconds.
> +	 *
> +	 * The writeback_rate_i_inverse value of 10000 means that 1/10000th
> +	 * of the error is accumulated in the integral term per second.
> +	 * This acts as a slow, long-term average that is not subject to
> +	 * variations in usage like the p term.
> +	 */
>  	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
> -	int64_t derivative = dirty - dc->disk.sectors_dirty_last;
> -	int64_t proportional = dirty - target;
> -	int64_t change;
> -
> -	dc->disk.sectors_dirty_last = dirty;
> -
> -	/* Scale to sectors per second */
> -
> -	proportional *= dc->writeback_rate_update_seconds;
> -	proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);
> -
> -	derivative = div_s64(derivative, dc->writeback_rate_update_seconds);
> -
> -	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
> -			      (dc->writeback_rate_d_term /
> -			       dc->writeback_rate_update_seconds) ?: 1, 0);
> -
> -	derivative *= dc->writeback_rate_d_term;
> -	derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);
> -
> -	change = proportional + derivative;
> +	int64_t error = dirty - target;
> +	int64_t proportional_scaled =
> +		div_s64(error, dc->writeback_rate_p_term_inverse);
> +	int64_t integral_scaled, new_rate;
> +
> +	if ((error < 0 && dc->writeback_rate_integral > 0) ||
> +	    (error > 0 && time_before64(local_clock(),
> +			 dc->writeback_rate.next + NSEC_PER_MSEC))) {
> +		/* Only decrease the integral term if it's more than
> +		 * zero.  Only increase the integral term if the device
> +		 * is keeping up.  (Don't wind up the integral
> +		 * ineffectively in either case).
> +		 *
> +		 * It's necessary to scale this by
> +		 * writeback_rate_update_seconds to keep the integral
> +		 * term dimensioned properly.
> +		 */
> +		dc->writeback_rate_integral += error *
> +			dc->writeback_rate_update_seconds;
> +	}
>  
> -	/* Don't increase writeback rate if the device isn't keeping up */
> -	if (change > 0 &&
> -	    time_after64(local_clock(),
> -			 dc->writeback_rate.next + NSEC_PER_MSEC))
> -		change = 0;
> +	integral_scaled = div_s64(dc->writeback_rate_integral,
> +			dc->writeback_rate_i_term_inverse);
>  
> -	dc->writeback_rate.rate =
> -		clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
> -			1, NSEC_PER_MSEC);
> +	new_rate = clamp_t(int64_t, (proportional_scaled + integral_scaled),
> +			dc->writeback_rate_minimum, NSEC_PER_MSEC);
>  
> -	dc->writeback_rate_proportional = proportional;
> -	dc->writeback_rate_derivative = derivative;
> -	dc->writeback_rate_change = change;
> +	dc->writeback_rate_proportional = proportional_scaled;
> +	dc->writeback_rate_integral_scaled = integral_scaled;
> +	dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
> +	dc->writeback_rate.rate = new_rate;
>  	dc->writeback_rate_target = target;
>  }
>  
> @@ -498,8 +510,6 @@ void bch_sectors_dirty_init(struct bcache_device *d)
>  
>  	bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
>  			   sectors_dirty_init_fn, 0);
> -
> -	d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
>  }
>  
>  void bch_cached_dev_writeback_init(struct cached_dev *dc)
> @@ -513,10 +523,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
>  	dc->writeback_percent		= 10;
>  	dc->writeback_delay		= 30;
>  	dc->writeback_rate.rate		= 1024;
> +	dc->writeback_rate_minimum	= 1;
>  
>  	dc->writeback_rate_update_seconds = 5;
> -	dc->writeback_rate_d_term	= 30;
> -	dc->writeback_rate_p_term_inverse = 6000;
> +	dc->writeback_rate_p_term_inverse = 40;
> +	dc->writeback_rate_i_term_inverse = 10000;
>  
>  	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
>  }
> 

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 1/5] bcache: don't write back data if reading it failed
  2017-09-27  8:28   ` Michael Lyle
@ 2017-09-27  8:40     ` Coly Li
  0 siblings, 0 replies; 65+ messages in thread
From: Coly Li @ 2017-09-27  8:40 UTC (permalink / raw)
  To: Michael Lyle; +Cc: linux-bcache

On 2017/9/27 4:28 PM, Michael Lyle wrote:
> SET_KEY_DIRTY sets it to false when bi_status is not 0.
> 
> We do the write if KEY_DIRTY is true, else it is skipped.

I see your point, yes you are right.

Reviewed-by: Coly Li <colyli@suse.de>

Thanks!


> On Wed, Sep 27, 2017 at 1:22 AM, Coly Li <i@coly.li> wrote:
> On 2017/9/27 3:24 AM, Michael Lyle wrote:
>>> If an IO operation fails, and we didn't successfully read data from the
>>> cache, don't writeback invalid/partial data to the backing disk.
>>>
>>> Signed-off-by: Michael Lyle <mlyle@lyle.org>
>>> ---
>>>  drivers/md/bcache/writeback.c | 19 +++++++++++++------
>>>  1 file changed, 13 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
>>> index e663ca082183..eea49bf36401 100644
>>> --- a/drivers/md/bcache/writeback.c
>>> +++ b/drivers/md/bcache/writeback.c
>>> @@ -179,13 +179,20 @@ static void write_dirty(struct closure *cl)
>>>       struct dirty_io *io = container_of(cl, struct dirty_io, cl);
>>>       struct keybuf_key *w = io->bio.bi_private;
>>>
>>> -     dirty_init(w);
>>> -     bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
>>> -     io->bio.bi_iter.bi_sector = KEY_START(&w->key);
>>> -     bio_set_dev(&io->bio, io->dc->bdev);
>>> -     io->bio.bi_end_io       = dirty_endio;
>>> +     /* IO errors are signalled using the dirty bit on the key.
>>> +      * If we failed to read, we should not attempt to write to the
>>> +      * backing device.  Instead, immediately go to write_dirty_finish
>>> +      * to clean up.
>>> +      */
>>> +     if (KEY_DIRTY(&w->key)) {
>> Maybe it should be "if (!KEY_DIRTY(&w->key))" ? In dirty_endio(),
>> SET_KEY_DIRTY() is called only when bio->bi_status is not 0.
>>
>>
>>> +             dirty_init(w);
>>> +             bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
>>> +             io->bio.bi_iter.bi_sector = KEY_START(&w->key);
>>> +             bio_set_dev(&io->bio, io->dc->bdev);
>>> +             io->bio.bi_end_io       = dirty_endio;
>>>
>>> -     closure_bio_submit(&io->bio, cl);
>>> +             closure_bio_submit(&io->bio, cl);
>>> +     }
>>>
>>>       continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
>>>  }
>>>
>>
>>
>> --
>> Coly Li


-- 
Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 2/5] bcache: implement PI controller for writeback rate
  2017-09-27  8:35   ` Coly Li
@ 2017-09-27  8:40     ` Michael Lyle
  0 siblings, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-09-27  8:40 UTC (permalink / raw)
  To: Coly Li; +Cc: linux-bcache

Coly--

Excellent, thank you for all your help.  I will send out another
version of these patches after a little more time for review.  I have
made the change you've suggested.

Mike

On Wed, Sep 27, 2017 at 1:35 AM, Coly Li <i@coly.li> wrote:
> On 2017/9/27 3:24 AM, Michael Lyle wrote:
>> bcache uses a control system to attempt to keep the amount of dirty data
>> in cache at a user-configured level, while not responding excessively to
>> transients and variations in write rate.  Previously, the system was a
>> PD controller; but the output from it was integrated, turning the
>> Proportional term into an Integral term, and turning the Derivative term
>> into a crude Proportional term.  Performance of the controller has been
>> uneven in production, and it has tended to respond slowly, oscillate,
>> and overshoot.
>>
>> This patch set replaces the current control system with an explicit PI
>> controller and tuning that should be correct for most hardware.  By
>> default, it attempts to write at a rate that would retire 1/40th of the
>> current excess blocks per second.  An integral term in turn works to
>> remove steady state errors.
>>
>> IMO, this yields benefits in simplicity (removing weighted average
>> filtering, etc) and system performance.
>>
>> Another small change is a tunable parameter is introduced to allow the
>> user to specify a minimum rate at which dirty blocks are retired.
>>
>> There is a slight difference from earlier versions of the patch in
>> integral handling to prevent excessive negative integral windup.
>>
>> Signed-off-by: Michael Lyle <mlyle@lyle.org>
>
> Reviewed-by: Coly Li <colyli@suse.de>
>
> Last comment, could you please to use current code comments style in
> bcache, like this,
>
> /*
>  * The allocation code needs gc_mark in struct bucket to be correct, but
>  * it's not while a gc is in progress. Protected by bucket_lock.
>  */
>
> Or like this,
>
> /*
>  * For any bio we don't skip we subtract the number of sectors from
>  * rescale; when it hits 0 we rescale all the bucket priorities.
>  */
>
> Rested part are OK to me. Thanks for your effort to implement a simpler
> PI controller.
>
> Coly Li
>
>> ---
>>  drivers/md/bcache/bcache.h    |  9 ++---
>>  drivers/md/bcache/sysfs.c     | 18 +++++----
>>  drivers/md/bcache/writeback.c | 89 ++++++++++++++++++++++++-------------------
>>  3 files changed, 64 insertions(+), 52 deletions(-)
>>
>> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
>> index 2ed9bd231d84..eb83be693d60 100644
>> --- a/drivers/md/bcache/bcache.h
>> +++ b/drivers/md/bcache/bcache.h
>> @@ -265,9 +265,6 @@ struct bcache_device {
>>       atomic_t                *stripe_sectors_dirty;
>>       unsigned long           *full_dirty_stripes;
>>
>> -     unsigned long           sectors_dirty_last;
>> -     long                    sectors_dirty_derivative;
>> -
>>       struct bio_set          *bio_split;
>>
>>       unsigned                data_csum:1;
>> @@ -362,12 +359,14 @@ struct cached_dev {
>>
>>       uint64_t                writeback_rate_target;
>>       int64_t                 writeback_rate_proportional;
>> -     int64_t                 writeback_rate_derivative;
>> +     int64_t                 writeback_rate_integral;
>> +     int64_t                 writeback_rate_integral_scaled;
>>       int64_t                 writeback_rate_change;
>>
>>       unsigned                writeback_rate_update_seconds;
>> -     unsigned                writeback_rate_d_term;
>> +     unsigned                writeback_rate_i_term_inverse;
>>       unsigned                writeback_rate_p_term_inverse;
>> +     unsigned                writeback_rate_minimum;
>>  };
>>
>>  enum alloc_reserve {
>> diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
>> index 104c57cd666c..eb493814759c 100644
>> --- a/drivers/md/bcache/sysfs.c
>> +++ b/drivers/md/bcache/sysfs.c
>> @@ -81,8 +81,9 @@ rw_attribute(writeback_delay);
>>  rw_attribute(writeback_rate);
>>
>>  rw_attribute(writeback_rate_update_seconds);
>> -rw_attribute(writeback_rate_d_term);
>> +rw_attribute(writeback_rate_i_term_inverse);
>>  rw_attribute(writeback_rate_p_term_inverse);
>> +rw_attribute(writeback_rate_minimum);
>>  read_attribute(writeback_rate_debug);
>>
>>  read_attribute(stripe_size);
>> @@ -130,15 +131,16 @@ SHOW(__bch_cached_dev)
>>       sysfs_hprint(writeback_rate,    dc->writeback_rate.rate << 9);
>>
>>       var_print(writeback_rate_update_seconds);
>> -     var_print(writeback_rate_d_term);
>> +     var_print(writeback_rate_i_term_inverse);
>>       var_print(writeback_rate_p_term_inverse);
>> +     var_print(writeback_rate_minimum);
>>
>>       if (attr == &sysfs_writeback_rate_debug) {
>>               char rate[20];
>>               char dirty[20];
>>               char target[20];
>>               char proportional[20];
>> -             char derivative[20];
>> +             char integral[20];
>>               char change[20];
>>               s64 next_io;
>>
>> @@ -146,7 +148,7 @@ SHOW(__bch_cached_dev)
>>               bch_hprint(dirty,       bcache_dev_sectors_dirty(&dc->disk) << 9);
>>               bch_hprint(target,      dc->writeback_rate_target << 9);
>>               bch_hprint(proportional,dc->writeback_rate_proportional << 9);
>> -             bch_hprint(derivative,  dc->writeback_rate_derivative << 9);
>> +             bch_hprint(integral,    dc->writeback_rate_integral_scaled << 9);
>>               bch_hprint(change,      dc->writeback_rate_change << 9);
>>
>>               next_io = div64_s64(dc->writeback_rate.next - local_clock(),
>> @@ -157,11 +159,11 @@ SHOW(__bch_cached_dev)
>>                              "dirty:\t\t%s\n"
>>                              "target:\t\t%s\n"
>>                              "proportional:\t%s\n"
>> -                            "derivative:\t%s\n"
>> +                            "integral:\t%s\n"
>>                              "change:\t\t%s/sec\n"
>>                              "next io:\t%llims\n",
>>                              rate, dirty, target, proportional,
>> -                            derivative, change, next_io);
>> +                            integral, change, next_io);
>>       }
>>
>>       sysfs_hprint(dirty_data,
>> @@ -213,7 +215,7 @@ STORE(__cached_dev)
>>                           dc->writeback_rate.rate, 1, INT_MAX);
>>
>>       d_strtoul_nonzero(writeback_rate_update_seconds);
>> -     d_strtoul(writeback_rate_d_term);
>> +     d_strtoul(writeback_rate_i_term_inverse);
>>       d_strtoul_nonzero(writeback_rate_p_term_inverse);
>>
>>       d_strtoi_h(sequential_cutoff);
>> @@ -319,7 +321,7 @@ static struct attribute *bch_cached_dev_files[] = {
>>       &sysfs_writeback_percent,
>>       &sysfs_writeback_rate,
>>       &sysfs_writeback_rate_update_seconds,
>> -     &sysfs_writeback_rate_d_term,
>> +     &sysfs_writeback_rate_i_term_inverse,
>>       &sysfs_writeback_rate_p_term_inverse,
>>       &sysfs_writeback_rate_debug,
>>       &sysfs_dirty_data,
>> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
>> index eea49bf36401..5a8f5655151b 100644
>> --- a/drivers/md/bcache/writeback.c
>> +++ b/drivers/md/bcache/writeback.c
>> @@ -25,48 +25,60 @@ static void __update_writeback_rate(struct cached_dev *dc)
>>                               bcache_flash_devs_sectors_dirty(c);
>>       uint64_t cache_dirty_target =
>>               div_u64(cache_sectors * dc->writeback_percent, 100);
>> -
>>       int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
>>                                  c->cached_dev_sectors);
>>
>> -     /* PD controller */
>> -
>> +     /* PI controller:
>> +      * Figures out the amount that should be written per second.
>> +      *
>> +      * First, the error (number of sectors that are dirty beyond our
>> +      * target) is calculated.  The error is accumulated (numerically
>> +      * integrated).
>> +      *
>> +      * Then, the proportional value and integral value are scaled
>> +      * based on configured values.  These are stored as inverses to
>> +      * avoid fixed point math and to make configuration easy-- e.g.
>> +      * the default value of 40 for writeback_rate_p_term_inverse
>> +      * attempts to write at a rate that would retire all the dirty
>> +      * blocks in 40 seconds.
>> +      *
>> +      * The writeback_rate_i_inverse value of 10000 means that 1/10000th
>> +      * of the error is accumulated in the integral term per second.
>> +      * This acts as a slow, long-term average that is not subject to
>> +      * variations in usage like the p term.
>> +      */
>>       int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
>> -     int64_t derivative = dirty - dc->disk.sectors_dirty_last;
>> -     int64_t proportional = dirty - target;
>> -     int64_t change;
>> -
>> -     dc->disk.sectors_dirty_last = dirty;
>> -
>> -     /* Scale to sectors per second */
>> -
>> -     proportional *= dc->writeback_rate_update_seconds;
>> -     proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);
>> -
>> -     derivative = div_s64(derivative, dc->writeback_rate_update_seconds);
>> -
>> -     derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
>> -                           (dc->writeback_rate_d_term /
>> -                            dc->writeback_rate_update_seconds) ?: 1, 0);
>> -
>> -     derivative *= dc->writeback_rate_d_term;
>> -     derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);
>> -
>> -     change = proportional + derivative;
>> +     int64_t error = dirty - target;
>> +     int64_t proportional_scaled =
>> +             div_s64(error, dc->writeback_rate_p_term_inverse);
>> +     int64_t integral_scaled, new_rate;
>> +
>> +     if ((error < 0 && dc->writeback_rate_integral > 0) ||
>> +         (error > 0 && time_before64(local_clock(),
>> +                      dc->writeback_rate.next + NSEC_PER_MSEC))) {
>> +             /* Only decrease the integral term if it's more than
>> +              * zero.  Only increase the integral term if the device
>> +              * is keeping up.  (Don't wind up the integral
>> +              * ineffectively in either case).
>> +              *
>> +              * It's necessary to scale this by
>> +              * writeback_rate_update_seconds to keep the integral
>> +              * term dimensioned properly.
>> +              */
>> +             dc->writeback_rate_integral += error *
>> +                     dc->writeback_rate_update_seconds;
>> +     }
>>
>> -     /* Don't increase writeback rate if the device isn't keeping up */
>> -     if (change > 0 &&
>> -         time_after64(local_clock(),
>> -                      dc->writeback_rate.next + NSEC_PER_MSEC))
>> -             change = 0;
>> +     integral_scaled = div_s64(dc->writeback_rate_integral,
>> +                     dc->writeback_rate_i_term_inverse);
>>
>> -     dc->writeback_rate.rate =
>> -             clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
>> -                     1, NSEC_PER_MSEC);
>> +     new_rate = clamp_t(int64_t, (proportional_scaled + integral_scaled),
>> +                     dc->writeback_rate_minimum, NSEC_PER_MSEC);
>>
>> -     dc->writeback_rate_proportional = proportional;
>> -     dc->writeback_rate_derivative = derivative;
>> -     dc->writeback_rate_change = change;
>> +     dc->writeback_rate_proportional = proportional_scaled;
>> +     dc->writeback_rate_integral_scaled = integral_scaled;
>> +     dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
>> +     dc->writeback_rate.rate = new_rate;
>>       dc->writeback_rate_target = target;
>>  }
>>
>> @@ -498,8 +510,6 @@ void bch_sectors_dirty_init(struct bcache_device *d)
>>
>>       bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
>>                          sectors_dirty_init_fn, 0);
>> -
>> -     d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
>>  }
>>
>>  void bch_cached_dev_writeback_init(struct cached_dev *dc)
>> @@ -513,10 +523,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
>>       dc->writeback_percent           = 10;
>>       dc->writeback_delay             = 30;
>>       dc->writeback_rate.rate         = 1024;
>> +     dc->writeback_rate_minimum      = 1;
>>
>>       dc->writeback_rate_update_seconds = 5;
>> -     dc->writeback_rate_d_term       = 30;
>> -     dc->writeback_rate_p_term_inverse = 6000;
>> +     dc->writeback_rate_p_term_inverse = 40;
>> +     dc->writeback_rate_i_term_inverse = 10000;
>>
>>       INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
>>  }
>>
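
For illustration, a minimal userspace sketch of the PI step in the hunk
above (field names shortened, div_s64/clamp_t replaced by plain C, and
the "device is keeping up" check reduced to a flag -- an approximation
of the quoted code, not the kernel implementation):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC 1000000LL

struct wb_state {
	int64_t integral;        /* accumulated error (sectors * seconds) */
	int64_t rate;            /* output: sectors per second */
	int64_t rate_minimum;    /* writeback_rate_minimum */
	int64_t p_term_inverse;  /* 40 */
	int64_t i_term_inverse;  /* 10000 */
	int64_t update_seconds;  /* 5 */
};

static void update_rate(struct wb_state *s, int64_t dirty, int64_t target,
			int keeping_up)
{
	int64_t error = dirty - target;
	int64_t proportional = error / s->p_term_inverse;
	int64_t integral_scaled, new_rate;

	/* Anti-windup, as in the hunk above: only unwind a positive
	 * integral, and only wind it up while the device keeps up. */
	if ((error < 0 && s->integral > 0) || (error > 0 && keeping_up))
		s->integral += error * s->update_seconds;

	integral_scaled = s->integral / s->i_term_inverse;

	new_rate = proportional + integral_scaled;
	if (new_rate < s->rate_minimum)      /* same clamp as the patch */
		new_rate = s->rate_minimum;
	if (new_rate > NSEC_PER_MSEC)
		new_rate = NSEC_PER_MSEC;
	s->rate = new_rate;
}

int main(void)
{
	/* Defaults from the hunk: p=1/40, i=1/10000, 5s updates, min 1. */
	struct wb_state s = { 0, 1024, 1, 40, 10000, 5 };
	int64_t target = 1000000, dirty = 2000000;   /* sectors */

	for (int i = 0; i < 8; i++) {
		update_rate(&s, dirty, target, 1);
		dirty -= s.rate * s.update_seconds;  /* pretend it was written */
		printf("dirty=%lld rate=%lld\n",
		       (long long)dirty, (long long)s.rate);
	}
	return 0;
}

The proportional term retires a fraction of the excess per second,
while the slowly accumulated integral term removes steady-state error.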

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 3/5] bcache: smooth writeback rate control
  2017-09-26 19:24 ` [PATCH 3/5] bcache: smooth writeback rate control Michael Lyle
@ 2017-09-27  8:44   ` Coly Li
  0 siblings, 0 replies; 65+ messages in thread
From: Coly Li @ 2017-09-27  8:44 UTC (permalink / raw)
  To: Michael Lyle, linux-bcache

On 2017/9/27 上午3:24, Michael Lyle wrote:
> This works in conjunction with the new PI controller.  Currently, in
> real-world workloads, the rate controller attempts to write back 1
> sector per second.  In practice, these minimum-rate writebacks are
> between 4k and 60k in test scenarios, since bcache aggregates and
> attempts to do contiguous writes and because filesystems on top of
> bcache typically write 4k or more.
> 
> Previously, bcache used to guarantee to write at least once per second.
> This means that the actual writeback rate would exceed the configured
> amount by a factor of 8-120 or more.
> 
> This patch adjusts to be willing to sleep up to 2.5 seconds, and to
> target writing 4k/second.  On the smallest writes, it will sleep 1
> second like before, but many times it will sleep longer and load the
> backing device less.  This keeps the loading on the cache and backing
> device related to writeback more consistent when writing back at low
> rates.
> 
> Signed-off-by: Michael Lyle <mlyle@lyle.org>

Reviewed-by: Coly Li <colyli@suse.de>

Added to for-test. Thanks.

> ---
>  drivers/md/bcache/util.c      | 10 ++++++++--
>  drivers/md/bcache/writeback.c |  2 +-
>  2 files changed, 9 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
> index 176d3c2ef5f5..4dbe37e82877 100644
> --- a/drivers/md/bcache/util.c
> +++ b/drivers/md/bcache/util.c
> @@ -232,8 +232,14 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
>  
>  	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
>  
> -	if (time_before64(now + NSEC_PER_SEC, d->next))
> -		d->next = now + NSEC_PER_SEC;
> +	/* Bound the time.  Don't let us fall further than 2 seconds behind
> +	 * (this prevents unnecessary backlog that would make it impossible
> +	 * to catch up).  If we're ahead of the desired writeback rate,
> +	 * don't let us sleep more than 2.5 seconds (so we can notice/respond
> +	 * if the control system tells us to speed up!).
> +	 */
> +	if (time_before64(now + NSEC_PER_SEC * 5 / 2, d->next))
> +		d->next = now + NSEC_PER_SEC * 5 / 2;
>  
>  	if (time_after64(now - NSEC_PER_SEC * 2, d->next))
>  		d->next = now - NSEC_PER_SEC * 2;
> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> index 5a8f5655151b..0b7c89813635 100644
> --- a/drivers/md/bcache/writeback.c
> +++ b/drivers/md/bcache/writeback.c
> @@ -523,7 +523,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
>  	dc->writeback_percent		= 10;
>  	dc->writeback_delay		= 30;
>  	dc->writeback_rate.rate		= 1024;
> -	dc->writeback_rate_minimum	= 1;
> +	dc->writeback_rate_minimum	= 8;
>  
>  	dc->writeback_rate_update_seconds = 5;
>  	dc->writeback_rate_p_term_inverse = 40;
> 
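
For illustration, a small userspace sketch of the bounded delay this
hunk produces (a simplified stand-in for bch_next_delay(), not the
kernel code): each completed write pushes the "next" timestamp forward
by done/rate seconds, and the timestamp is kept within roughly
(now - 2s, now + 2.5s).

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

static uint64_t next_delay_ns(uint64_t *d_next, uint64_t now,
			      uint64_t done_sectors, uint64_t rate)
{
	*d_next += done_sectors * NSEC_PER_SEC / rate;

	if (*d_next > now + NSEC_PER_SEC * 5 / 2)  /* at most 2.5s ahead */
		*d_next = now + NSEC_PER_SEC * 5 / 2;
	if (*d_next + NSEC_PER_SEC * 2 < now)      /* at most 2s behind  */
		*d_next = now - NSEC_PER_SEC * 2;

	return *d_next > now ? *d_next - now : 0;
}

int main(void)
{
	uint64_t now = 0, next = 0;

	/* At writeback_rate_minimum = 8 sectors/s, an 8-sector (4k) write
	 * earns a 1 second sleep, a 16-sector write earns 2 seconds, and
	 * anything larger is capped at 2.5 seconds. */
	printf("%llu ms\n",
	       (unsigned long long)(next_delay_ns(&next, now, 8, 8) / 1000000));
	next = now;
	printf("%llu ms\n",
	       (unsigned long long)(next_delay_ns(&next, now, 16, 8) / 1000000));
	next = now;
	printf("%llu ms\n",
	       (unsigned long long)(next_delay_ns(&next, now, 64, 8) / 1000000));
	return 0;
}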


-- 
Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-09 18:58                                                               ` Coly Li
@ 2017-10-10  0:00                                                                 ` Michael Lyle
  0 siblings, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-10-10  0:00 UTC (permalink / raw)
  To: Coly Li
  Cc: Hannes Reinecke, Junhui Tang, linux-bcache, linux-block, Kent Overstreet

Hi Coly--

We talked a bunch on IRC, so I'd just like to answer a couple of the
things again here for the benefit of everyone else / further
discussion.

On Mon, Oct 9, 2017 at 11:58 AM, Coly Li <i@coly.li> wrote:
> I do observe the good side of your patches: when the I/O blocksize is
> <=256kB, I see a great advantage in writeback performance with bios
> reordered. Especially when the blocksize is 8K, writeback performance
> is at least 3x (without your patch the writeback is too slow and I
> gave up after hours).
>
> The performance regression happens when the fio blocksize increases to
> 512K and the dirty data increases to 900GB. And when the fio blocksize
> increases to 1MB and the dirty data on the cache increases to 900GB,
> the writeback performance regression becomes easy to recognize.
>
> An interesting behavior I observed is that, for large blocksizes and
> large dirty data, writeback performance without the bio reorder
> patches is much higher than with them. An example is,
> http://blog.coly.li/wp-content/uploads/2017/10/writeback_throughput_on_linear_900_1800G_cache_half.png
>
> For the first 15 minutes, bcache without bio reorder performs much
> better than with it; 15 minutes later, all the writeback rates
> decrease to a similar level. That is to say, most of the performance
> regression happens at the beginning, when writeback starts.

I have looked at this and I believe I understand why.  Patch 4 changes
the issuance of sequential I/O.  It limits how much writeback will be
issued at a time, as opposed to being willing to issue any amount in
the previous code:

+                       if (size >= MAX_WRITESIZE_IN_PASS)
+                               break;

MAX_WRITESIZE_IN_PASS corresponds to 2500 KBytes, so it will hit the
limit after two 1MB blocks are issued and then go back to delaying.

The reason I put these lines here is to prevent issuing a set of
writebacks so large that they tie up the backing disk for a very long
time and hurt interactive performance.  We could make it tunable,
though I'm hesitant to have too many tunables, as that will make
testing and verifying all this difficult.
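
For illustration, a sketch of the cap described here (the
size >= MAX_WRITESIZE_IN_PASS test is quoted from the patch and
5000 sectors corresponds to the ~2500 KBytes mentioned above; the
surrounding loop is simplified and is not the actual patch 4 code):

#include <stdio.h>

#define MAX_WRITESIZE_IN_PASS 5000	/* sectors, ~2500 KBytes */

struct dirty_key { unsigned sectors; int contiguous_with_prev; };

/* Issue contiguous dirty keys back to back, but stop once roughly
 * MAX_WRITESIZE_IN_PASS sectors have been sent, so a single pass
 * cannot tie up the backing disk for too long. */
static unsigned issue_one_pass(const struct dirty_key *keys, unsigned nkeys)
{
	unsigned i, size = 0;

	for (i = 0; i < nkeys; i++) {
		if (i && !keys[i].contiguous_with_prev)
			break;			/* only combine contiguous extents */
		size += keys[i].sectors;	/* issue_write(&keys[i]); */
		if (size >= MAX_WRITESIZE_IN_PASS)
			break;			/* cap this pass, then go delay */
	}
	return size;				/* caller sleeps size/rate */
}

int main(void)
{
	struct dirty_key keys[] = {
		{ 2048, 0 }, { 2048, 1 }, { 2048, 1 }, { 2048, 0 },
	};

	/* Three 1MB (2048-sector) contiguous keys: the third one crosses
	 * the cap, so 6144 sectors are issued and the pass ends. */
	printf("issued %u sectors this pass\n", issue_one_pass(keys, 4));
	return 0;
}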

> All the tests are under ideal conditions: no writeback happens while
> fio writes dirty data onto the cache device. In more generic
> situations, with fewer LBA-contiguous dirty blocks on the cache, I
> guess the writeback regression might be more obvious.

I think it is because of this conditional, which applies only to
LBA-contiguous extents.  We could remove the conditional, but it's there
to try to preserve interactive/frontend performance during writeback.

> When dirty blocks are not LBA contiguous on the cache device, I don't
> worry about small dirty block sizes, because you explained clearly in
> previous emails that the worst case is performance falling back to the
> numbers without the bio reorder patch. But large blocksizes and large
> dirty data are the common case when bcache is used for distributed
> computing/storage systems like Ceph or Hadoop (multiple hard drives
> attached to a large SSD cache, normally with object file sizes from
> 4MB to 128MB).

I don't think an SSD cache will help the big-block portion of workloads
like this much, as they're not access-time constrained but instead
bandwidth constrained.  That's why the default configuration of bcache
has >=4MB sequential stuff bypass the cache and go directly to the
spinning disks.

Even if the SSD has twice the bandwidth of the spinning disk array,
everything written back needs to be written to the SSD and then later
read for writeback, so it's no quicker at steady state.

[snip]
> I raise a green flag to your bio reorder patches :-) I do hope I could
> have seen such performance data the first time :P
>
> *One last question*: could you please consider an option to
> enable/disable the bio reorder code in sysfs? It can be enabled by
> default. When people care about writeback of a large amount of dirty
> data, they can choose to disable the bio reorder policy.

I don't think it's the bio reorder code itself, but instead a change
in how we delay for writeback.  I wrote the new stuff to try and be a
better compromise between writeback rate and interactive performance.

My biggest concern is that both the old code and the new code seem to
be writing back at only about 25-33% of the speed that your disks
should be capable of in this case.  I think that's a bigger issue than
the small difference in performance in this specific case.

Mike

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-06 18:36                                                             ` Michael Lyle
@ 2017-10-09 18:58                                                               ` Coly Li
  2017-10-10  0:00                                                                 ` Michael Lyle
  0 siblings, 1 reply; 65+ messages in thread
From: Coly Li @ 2017-10-09 18:58 UTC (permalink / raw)
  To: Michael Lyle
  Cc: Hannes Reinecke, Junhui Tang, linux-bcache, linux-block, Kent Overstreet

On 2017/10/7 上午2:36, Michael Lyle wrote:
> Sorry I missed this question:
> 
>> Is it the time from when writeback starts until dirty data reaches
>> the dirty target, or the time from when writeback starts until dirty
>> data reaches 0?
> 
> Not quite either.  I monitor the machine with zabbix; it's the time
> until the backing disk reaches its background rate of activity / when
> writeback hits its minimum rate (writeback with the PI controller
> writes a little past the target because of the integral term).
> 
> Viewed one way: 5/80 is just a few percent of difference (6%).  But:
> I'm hopeful that further improvement can be made along this patch
> series, and in any event it's 5 minutes earlier that I/O will have an
> unencumbered response time after a pulse of load.

Hi Mike,

Finally, finally I have finished all my tests in the ideal situation
you suggested, which makes dirty blocks more contiguous.

And it has become clear why we had such a big difference in opinion: we
just looked at different parts of the same cow; you looked at the tail,
I looked at the head.

Here are the configurations I covered:
fio blocksize: 8kB, 16kB, 32kB, 64kB, 128kB, 256kB, 512kB, 1024kB
dirty data size: 110GB, 500GB, 700GB
cache device size: 220GB, 1TB, 1.5TB, 1.8TB
cached device size: 1.8TB, 4TB, 7.2TB

(md linear is used to combine multiple hard drives into a large device,
so a large bio won't be split unless it crosses a hard drive boundary)

I didn't test all the above combinations; my test cases are very
limited (they still took me several days), but the most important ones
are covered.

I use the following items to measure writeback performance:
- decreased dirty data amount (a.k.a throughput)
- writeback write requests per-second
- writeback write request merge numbers per-second

It turns out that, in the ideal situation, performance with the bio
reorder patches drops when: 1) the write I/O size increases, or 2) the
amount of dirty data increases.

I do observe the good side of your patches: when the I/O blocksize is
<=256kB, I see a great advantage in writeback performance with bios
reordered. Especially when the blocksize is 8K, writeback performance
is at least 3x (without your patch the writeback is too slow and I
gave up after hours).

The performance regression happens when the fio blocksize increases to
512K and the dirty data increases to 900GB. And when the fio blocksize
increases to 1MB and the dirty data on the cache increases to 900GB,
the writeback performance regression becomes easy to recognize.

An interesting behavior I observed is that, for large blocksizes and
large dirty data, writeback performance without the bio reorder
patches is much higher than with them. An example is,
http://blog.coly.li/wp-content/uploads/2017/10/writeback_throughput_on_linear_900_1800G_cache_half.png

For the first 15 minutes, bcache without bio reorder performs much
better than with it; 15 minutes later, all the writeback rates
decrease to a similar level. That is to say, most of the performance
regression happens at the beginning, when writeback starts.

All the tests are under ideal conditions: no writeback happens while
fio writes dirty data onto the cache device. In more generic
situations, with fewer LBA-contiguous dirty blocks on the cache, I
guess the writeback regression might be more obvious.

When dirty blocks are not LBA contiguous on the cache device, I don't
worry about small dirty block sizes, because you explained clearly in
previous emails that the worst case is performance falling back to the
numbers without the bio reorder patch. But large blocksizes and large
dirty data are the common case when bcache is used for distributed
computing/storage systems like Ceph or Hadoop (multiple hard drives
attached to a large SSD cache, normally with object file sizes from
4MB to 128MB).

Here is the command line I used to initialize the bcache device:
make-bcache -B /dev/md0 -C /dev/nvme1n1p1
echo /dev/nvme1n1p1 > /sys/fs/bcache/register
echo /dev/md0 > /sys/fs/bcache/register
sleep 1
echo 0 > /sys/block/bcache0/bcache/cache/congested_read_threshold_us
echo 0 > /sys/block/bcache0/bcache/cache/congested_write_threshold_us
echo writeback > /sys/block/bcache0/bcache/cache_mode
echo 0 > /sys/block/bcache0/bcache/writeback_running

After fio writes enough dirty data onto the cache device, I write 1
into writeback_running.

Here is the fio job file I used,
[global]
direct=1
thread=1
ioengine=libaio

[job]
filename=/dev/bcache0
readwrite=randwrite
numjobs=1
blocksize=<test block size>
iodepth=256
size=<dirty data amount>

I see the writeback performance advantage in the ideal situation; it
is desirable :-) But I also worry about the performance regression for
large dirty block sizes and large amounts of dirty data.

I raise a green flag to your bio reorder patches :-) I do hope I could
have seen such performance data the first time :P

*One last question*: could you please consider an option to
enable/disable the bio reorder code in sysfs? It can be enabled by
default. When people care about writeback of a large amount of dirty
data, they can choose to disable the bio reorder policy.

I hope we can reach an agreement and move this patch forward.

Thanks for your patience, and for continuously following up on the
discussion.

-- 
Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-06 18:09                                                           ` Coly Li
  2017-10-06 18:23                                                             ` Michael Lyle
  2017-10-06 18:36                                                             ` Michael Lyle
@ 2017-10-09  5:59                                                             ` Hannes Reinecke
  2 siblings, 0 replies; 65+ messages in thread
From: Hannes Reinecke @ 2017-10-09  5:59 UTC (permalink / raw)
  To: Coly Li, Michael Lyle
  Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

On 10/06/2017 08:09 PM, Coly Li wrote:
> Hi Mike,
> 
> On 2017/10/7 上午1:36, Michael Lyle wrote:
>> It's 240GB (half of a Samsung 850) in front of a 1TB 5400RPM disk.
>>
> 
> Copied.
> 
>> The size isn't critical.  1/8 is chosen to exceed 10% (default
>> writeback dirty data thresh), it might need to be 1/6 on really big
>> environments.  It needs to be big enough that it takes more than 100
>> seconds to write back, but less than 40% of the cache[1].
>>
> 
> OK, I will do a similar thing.
> 
>> I am also running this on "mountain", which is my virtualization
>> server.  It has 2x Samsung 950 PRO 512GB NVMe disks (MD raid1) in
>> front of 2x Seagate Ironwolf 6TB (MD raid1) drives.  I can't run
>> benchmarks on it, but it runs nightly builds and is otherwise lightly
>> loaded in the middle of night.  It does writeback for about 75 minutes
>> on average after a set of nightly builds of my repositories (last 5
>> days 75, 79, 77, 72, 76); without this patch set it was more like 80
>> (77, 80, 74, 85, 84, 76, 82, 84).  It has 128GB of RAM and doesn't do
> 
> Is it the time from when writeback starts until dirty data reaches
> the dirty target, or the time from when writeback starts until dirty
> data reaches 0?
> 
>> many reads-- most of its hottest working set hits the page cache--
>> bcache serves to accelerate the random writes its workloads generate.
>> This is the workload that I sought to improve with the writeback
>> changes.
>>
>> When you ran the tests, did you compare 1-5 to 1-3?  Because
> 
> Yes, we did the same thing. I only add/remove the last 2 patches
> about bio reorder; the first 3 patches are always applied in my
> testing kernels.
> 
>> otherwise, another uncontrolled factor is that the writeback tuning is
>> drastically different with patch 2.  (On most systems patch 2 makes
>> writeback less aggressive overall by default).
> 
> I observe this behavior :-) I always use the new PI controller in my
> tests; it works fine.
> 
>> [1]: If you don't think this is fair--- this would match how I've been
>> suggesting enterprise users set up bcache for various workloads-- a
>> writeback cache isn't going to be very effective at lowering write
>> workload if the size of the most active write working set doesn't fit
>> nicely in the cache.  so e.g. if you deploy under a SQL database the
>> cache should be double the size of the indices to get a real write
>> aggregation benefit; if you deploy under an environment doing builds
>> the cache should be double the size of the amount written during
>> builds; if you deploy under an environment doing rapid VM provisioning
>> the cache should be double the size of the images blitted per
>> provisioning cycle, etc.
>>
> 
> If I use a 1.8T hard disk as the cached device and a 1TB SSD as the
> cache device, and set fio to write 500G of dirty data in total, is
> this configuration close to the working set and cache size you
> suggested?
> 
Well, for a realistic scenario you should aim for about one order of
magnitude difference between the cache and the cached device.
So for your setup you'd need about a 4TB cached device, or use only
part of the caching device (say around 500G).

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-06 18:09                                                           ` Coly Li
  2017-10-06 18:23                                                             ` Michael Lyle
@ 2017-10-06 18:36                                                             ` Michael Lyle
  2017-10-09 18:58                                                               ` Coly Li
  2017-10-09  5:59                                                             ` Hannes Reinecke
  2 siblings, 1 reply; 65+ messages in thread
From: Michael Lyle @ 2017-10-06 18:36 UTC (permalink / raw)
  To: Coly Li
  Cc: Hannes Reinecke, Junhui Tang, linux-bcache, linux-block, Kent Overstreet

Sorry I missed this question:

> Is it the time from when writeback starts until dirty data reaches
> the dirty target, or the time from when writeback starts until dirty
> data reaches 0?

Not quite either.  I monitor the machine with zabbix; it's the time
until the backing disk reaches its background rate of activity / when
writeback hits its minimum rate (writeback with the PI controller
writes a little past the target because of the integral term).

Viewed one way: 5/80 is just a few percent of difference (6%).  But:
I'm hopeful that further improvement can be made along this patch
series, and in any event it's 5 minutes earlier that I/O will have an
unencumbered response time after a pulse of load.

Mike

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-06 18:09                                                           ` Coly Li
@ 2017-10-06 18:23                                                             ` Michael Lyle
  2017-10-06 18:36                                                             ` Michael Lyle
  2017-10-09  5:59                                                             ` Hannes Reinecke
  2 siblings, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-10-06 18:23 UTC (permalink / raw)
  To: Coly Li
  Cc: Hannes Reinecke, Junhui Tang, linux-bcache, linux-block, Kent Overstreet

On Fri, Oct 6, 2017 at 11:09 AM, Coly Li <i@coly.li> wrote:
> If I use a 1.8T hard disk as the cached device and a 1TB SSD as the
> cache device, and set fio to write 500G of dirty data in total, is
> this configuration close to the working set and cache size you
> suggested?

I think it's quicker and easier (and meaningful) to write 125-200G of
small blocks. (1/8 to 1/5)

Once you exceed 70% (CUTOFF_WRITEBACK_SYNC) you will get a **lot of
holes** because of write data skipping the cache.  And please note--
something I don't quite understand-- there is some kind of "write
amplification" happening somewhere.  When I write 30G, I get 38G of
dirty data; I wouldn't expect filesystem creation and metadata updates
to do this much.  At some point I need to trace and understand why
this is happening.
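
For illustration, a tiny sketch of the cutoff behavior described above
(the 70% CUTOFF_WRITEBACK_SYNC figure is the one mentioned in this
message; the decision function is a simplification, not bcache's actual
writeback-versus-bypass logic):

#include <stdbool.h>
#include <stdio.h>

#define CUTOFF_WRITEBACK_SYNC 70	/* percent, per the message above */

/* Above the cutoff, new writes bypass the cache and go straight to the
 * backing device, leaving "holes" between the dirty extents already in
 * the cache, so later writeback sees less contiguous data. */
static bool cache_new_write(unsigned dirty_percent)
{
	return dirty_percent < CUTOFF_WRITEBACK_SYNC;
}

int main(void)
{
	for (unsigned pct = 50; pct <= 90; pct += 10)
		printf("%u%% dirty -> %s\n", pct,
		       cache_new_write(pct) ? "cached (dirty)"
					    : "bypass to backing");
	return 0;
}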

500G is probably still OK (1/2), but there starts to be a random
factor the more you write:

Basically, the more you write, the less-contiguous the data gets,
because writeback may have scanned through the volume once and written
out extents that are now "holes" in the middle of the data.  It'll
still be mostly contiguous but the effect is random.  (This is an
effect of the test scenario, not any of the code change).

Mike

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-06 17:36                                                         ` Michael Lyle
@ 2017-10-06 18:09                                                           ` Coly Li
  2017-10-06 18:23                                                             ` Michael Lyle
                                                                               ` (2 more replies)
  0 siblings, 3 replies; 65+ messages in thread
From: Coly Li @ 2017-10-06 18:09 UTC (permalink / raw)
  To: Michael Lyle
  Cc: Hannes Reinecke, Junhui Tang, linux-bcache, linux-block, Kent Overstreet

Hi Mike,

On 2017/10/7 上午1:36, Michael Lyle wrote:
> It's 240GB (half of a Samsung 850) in front of a 1TB 5400RPM disk.
> 

Copied.

> The size isn't critical.  1/8 is chosen to exceed 10% (default
> writeback dirty data thresh), it might need to be 1/6 on really big
> environments.  It needs to be big enough that it takes more than 100
> seconds to write back, but less than 40% of the cache[1].
>

OK, I will do a similar thing.

> I am also running this on "mountain", which is my virtualization
> server.  It has 2x Samsung 950 PRO 512GB NVMe disks (MD raid1) in
> front of 2x Seagate Ironwolf 6TB (MD raid1) drives.  I can't run
> benchmarks on it, but it runs nightly builds and is otherwise lightly
> loaded in the middle of night.  It does writeback for about 75 minutes
> on average after a set of nightly builds of my repositories (last 5
> days 75, 79, 77, 72, 76); without this patch set it was more like 80
> (77, 80, 74, 85, 84, 76, 82, 84).  It has 128GB of RAM and doesn't do

Is it the time from when writeback starts until dirty data reaches the
dirty target, or the time from when writeback starts until dirty data
reaches 0?

> many reads-- most of its hottest working set hits the page cache--
> bcache serves to accelerate the random writes its workloads generate.
> This is the workload that I sought to improve with the writeback
> changes.
> 
> When you ran the tests, did you compare 1-5 to 1-3?  Because

Yes, we did the same thing. I only add/remove the last 2 patches about
bio reorder; the first 3 patches are always applied in my testing
kernels.

> otherwise, another uncontrolled factor is that the writeback tuning is
> drastically different with patch 2.  (On most systems patch 2 makes
> writeback less aggressive overall by default).

I observe this behavior :-) I always use the new PI controller in my
tests; it works fine.

> [1]: If you don't think this is fair--- this would match how I've been
> suggesting enterprise users set up bcache for various workloads-- a
> writeback cache isn't going to be very effective at lowering write
> workload if the size of the most active write working set doesn't fit
> nicely in the cache.  so e.g. if you deploy under a SQL database the
> cache should be double the size of the indices to get a real write
> aggregation benefit; if you deploy under an environment doing builds
> the cache should be double the size of the amount written during
> builds; if you deploy under an environment doing rapid VM provisioning
> the cache should be double the size of the images blitted per
> provisioning cycle, etc.
> 

If I use a 1.8T hard disk as the cached device and a 1TB SSD as the
cache device, and set fio to write 500G of dirty data in total, is this
configuration close to the working set and cache size you suggested?

[snip]

Thanks.


-- 
Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-06 12:20                                                 ` Coly Li
@ 2017-10-06 17:53                                                   ` Michael Lyle
  0 siblings, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-10-06 17:53 UTC (permalink / raw)
  To: Coly Li; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

Just one more note:

On Fri, Oct 6, 2017 at 5:20 AM, Coly Li <i@coly.li> wrote:
> [snip]
> And when we talked about patch 5/5, you mentioned 1MB writes:
> "- When writeback rate is medium, it does I/O more efficiently.  e.g.
> if the current writeback rate is 10MB/sec, and there are two
> contiguous 1MB segments, they would not presently be combined.  A 1MB
> write would occur, then we would increase the delay counter by 100ms,
> and then the next write would wait; this new code would issue 2 1MB
> writes one after the other, and then sleep 200ms.  On a disk that does
> 150MB/sec sequential, and has a 7ms seek time, this uses the disk for
> 13ms + 7ms, compared to the old code that does 13ms + 7ms * 2.  This
> is the difference between using 10% of the disk's I/O throughput and
> 13% of the disk's throughput to do the same work."
>
> Then I assumed the bio reorder patches should work well for write
> sizes from 4KB to 1MB. Also I thought "hmm, if the write size is
> smaller, there will be less chance for dirty blocks to be contiguous
> on the cached device", so I chose 512KB.

Please note those two conversations were talking about different
controversial stuff in response to different questions from you.
There's the ordering change, which is expected to improve peak
writeback rate for small blocks.  The test scenarios we're running
right now are mostly trying to measure that.  There needs to be
relatively small I/O to see a difference-- that is, backing disk
access time needs to dominate to make a difference.

The other is the change in I/O aggregation behavior when writeback
rate is not maximum.  That's what I was discussing with the 1MB I/O
example.    Unfortunately it is difficult to craft a test scenario to
test this one, but the argument of why it is better is pretty simple:

-  It's much better to aggregate a few contiguous I/Os and issue them
together than to issue them some time apart because of rate limiting.
The new code will aggregate a limited number of contiguous writes even
if it doesn't have to, to meet rate.  That is, it's much better for
disk utilization to issue 5 256k writes that are sequential to each
other at the same time, and then delay 250ms instead of one at a time,
50ms apart.  In the first case, it might be a 7ms access + 13ms of
writing and then 230ms of sleeping (8% disk time used); in the second
case, if user I/O is seeking the disk away from where we're writing,
it'll be 5 groups of: 7ms access + 2.5ms writing and then 40.5ms of
sleeping (24% disk time used).
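
As a back-of-the-envelope illustration of this argument, here is a
small helper that compares disk utilization when contiguous chunks are
issued together versus one per delay interval (the seek time, bandwidth
and rate figures are the ones used in this thread; the utilization
formula is a simplification).  Applied to the 1MB example quoted at the
top of this message it prints roughly 10% versus 14%, in line with the
figures quoted there; the 256k case above can be tried by changing the
arguments:

#include <stdio.h>

/* Fraction of the backing disk's time consumed when writing `chunks`
 * contiguous chunks of `chunk_mb` at writeback rate `rate_mb_s`.
 * `grouped` issues them back to back (one seek); otherwise each chunk
 * pays its own seek because user I/O moves the head in between. */
static double disk_busy_fraction(int chunks, double chunk_mb,
				 double seek_ms, double bw_mb_s,
				 double rate_mb_s, int grouped)
{
	double total_mb = chunks * chunk_mb;
	double window_ms = total_mb / rate_mb_s * 1000.0;  /* rate budget */
	double write_ms = total_mb / bw_mb_s * 1000.0;
	double seeks = grouped ? 1 : chunks;

	return (write_ms + seeks * seek_ms) / window_ms;
}

int main(void)
{
	/* Two contiguous 1MB extents, 150MB/s sequential, 7ms seek,
	 * 10MB/s writeback rate. */
	printf("grouped:  %.1f%%\n",
	       100.0 * disk_busy_fraction(2, 1.0, 7.0, 150.0, 10.0, 1));
	printf("separate: %.1f%%\n",
	       100.0 * disk_busy_fraction(2, 1.0, 7.0, 150.0, 10.0, 0));
	return 0;
}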

Mike

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-06 12:37                                                       ` Coly Li
@ 2017-10-06 17:36                                                         ` Michael Lyle
  2017-10-06 18:09                                                           ` Coly Li
  0 siblings, 1 reply; 65+ messages in thread
From: Michael Lyle @ 2017-10-06 17:36 UTC (permalink / raw)
  To: Coly Li
  Cc: Hannes Reinecke, Junhui Tang, linux-bcache, linux-block, Kent Overstreet

It's 240GB (half of a Samsung 850) in front of a 1TB 5400RPM disk.

The size isn't critical.  1/8 is chosen to exceed 10% (default
writeback dirty data thresh), it might need to be 1/6 on really big
environments.  It needs to be big enough that it takes more than 100
seconds to write back, but less than 40% of the cache[1].
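
For illustration, a quick check of those sizing constraints for the
240GB cache mentioned here; the ~100MB/s backing write speed is an
assumed figure for a 5400RPM disk, not a measured one:

#include <stdio.h>

int main(void)
{
	double cache_gb = 240.0;
	double working_set_gb = cache_gb / 8;	/* the 1/8 rule above */
	double backing_mb_per_s = 100.0;	/* assumption */

	double dirty_thresh_gb = cache_gb * 0.10;	/* writeback_percent */
	double upper_bound_gb = cache_gb * 0.40;
	double writeback_s = working_set_gb * 1024 / backing_mb_per_s;

	printf("working set        : %.0f GB\n", working_set_gb);
	printf("exceeds 10%% thresh : %s (threshold %.0f GB)\n",
	       working_set_gb > dirty_thresh_gb ? "yes" : "no",
	       dirty_thresh_gb);
	printf("below 40%% of cache : %s (limit %.0f GB)\n",
	       working_set_gb < upper_bound_gb ? "yes" : "no",
	       upper_bound_gb);
	printf("time to write back : ~%.0f s (> 100 s: %s)\n",
	       writeback_s, writeback_s > 100 ? "yes" : "no");
	return 0;
}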

I am also running this on "mountain", which is my virtualization
server.  It has 2x Samsung 950 PRO 512GB NVMe disks (MD raid1) in
front of 2x Seagate Ironwolf 6TB (MD raid1) drives.  I can't run
benchmarks on it, but it runs nightly builds and is otherwise lightly
loaded in the middle of night.  It does writeback for about 75 minutes
on average after a set of nightly builds of my repositories (last 5
days 75, 79, 77, 72, 76); without this patch set it was more like 80
(77, 80, 74, 85, 84, 76, 82, 84).  It has 128GB of RAM and doesn't do
many reads-- most of its hottest working set hits the page cache--
bcache serves to accelerate the random writes its workloads generate.
This is the workload that I sought to improve with the writeback
changes.

When you ran the tests, did you compare 1-5 to 1-3?  Because
otherwise, another uncontrolled factor is that the writeback tuning is
drastically different with patch 2.  (On most systems patch 2 makes
writeback less aggressive overall by default).

Mike

[1]: If you don't think this is fair--- this would match how I've been
suggesting enterprise users set up bcache for various workloads-- a
writeback cache isn't going to be very effective at lowering write
workload if the size of the most active write working set doesn't fit
nicely in the cache.  so e.g. if you deploy under a SQL database the
cache should be double the size of the indices to get a real write
aggregation benefit; if you deploy under an environment doing builds
the cache should be double the size of the amount written during
builds; if you deploy under an environment doing rapid VM provisioning
the cache should be double the size of the images blitted per
provisioning cycle, etc.

Basically if this criterion isn't met, the writeback cache doesn't do
very much to improve performance.  I think the test scenarios you ran
didn't satisfy this :/

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-06 11:57                                                       ` Michael Lyle
  (?)
@ 2017-10-06 12:37                                                       ` Coly Li
  2017-10-06 17:36                                                         ` Michael Lyle
  -1 siblings, 1 reply; 65+ messages in thread
From: Coly Li @ 2017-10-06 12:37 UTC (permalink / raw)
  To: Michael Lyle
  Cc: Hannes Reinecke, Junhui Tang, linux-bcache, linux-block, Kent Overstreet

On 2017/10/6 下午7:57, Michael Lyle wrote:
> OK, here's some data:  http://jar.lyle.org/~mlyle/writeback/
> 
> The complete test script is there to automate running writeback
> scenarios--- NOTE DONT RUN WITHOUT EDITING THE DEVICES FOR YOUR
> HARDWARE.
> 
> Only one run each way, but they take 8-9 minutes to run, we can easily
> get more ;)  I compared patches 1-3 (which are uncontroversial) to
> 1-5.
> 
> Concerns I've heard:
> 
> - The new patches will contend for I/O bandwidth with front-end writes:
> 
> No:
>  3 PATCHES: write: io=29703MB, bw=83191KB/s, iops=10398, runt=365618msec
> vs
>  5 PATCHES: write: io=29746MB, bw=86177KB/s, iops=10771, runt=353461msec
> 
> It may actually be slightly better-- 3% or so.
> 
> - The new patches will not improve writeback rate.
> 
> No:
> 
> 3 PATCHES: the active period of the test was 366+100=466 seconds, and
> at the end there was 33.4G dirty.
> 5 PATCHES: the active period of the test was 353+100=453 seconds, and
> at the end there was 32.7G dirty.
> 
> This is a moderate improvement.
> 
> - The IO scheduler can combine the writes anyways so this type of
> patch will not increase write queue merge.
> 
> No:
> 
> Average wrqm/s is 1525.4 in the 3 PATCHES dataset; average wrqm/s is
> 1643.7 in the 5 PATCHES dataset.
> 
> During the last 100 seconds, when ONLY WRITEBACK is occurring, wrqm is
> 1398.0 in 3 PATCHES, and 1811.6 with 5 PATCHES.
> 
> - Front-end latency will suffer:
> 
> No:
> 
> The datasets look the same to my eye.  By far the worst thing is the
> occasional 1000ms+ times the bcache goes to sleep in both scenarios,
> contending for the writeback lock (not affected by these patches, but
> an item for future work if I ever get to move on to a new topic).
> 
> Conclusion:
> 
> These patches provide a small but significant improvement in writeback
> rates, that can be seen with careful testing that produces actual
> sequential writeback.  They lay the groundwork for further
> improvements, through the use of plugging the block layer and to allow
> accelerated writeback when the device is idle.
> 
[snip]

Hi Mike,

Thank you for the detailed information!

In your test, dirty data occupies 1/8 of the cache device's space. Can
I know the exact sizes of the cache device and cached device? Then I
will set up a similar configuration on my machine and try to reproduce
the test.

-- 
Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-06 10:42                                                 ` Michael Lyle
                                                                   ` (2 preceding siblings ...)
  (?)
@ 2017-10-06 12:20                                                 ` Coly Li
  2017-10-06 17:53                                                   ` Michael Lyle
  -1 siblings, 1 reply; 65+ messages in thread
From: Coly Li @ 2017-10-06 12:20 UTC (permalink / raw)
  To: Michael Lyle; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

On 2017/10/6 下午6:42, Michael Lyle wrote:
> Coly--
> 
> Holy crap, I'm not surprised you don't see a difference if you're
> writing with 512K size!  The potential benefit from merging is much
> less, and the odds of missing a merge is much smaller.  512KB is 5ms
> sequential by itself on a 100MB/sec disk--- lots more time to wait to
> get the next chunks in order, and even if you fail to merge the
> potential benefit is much less-- if the difference is mostly
> rotational latency from failing to merge then we're talking 5ms vs
> 5+2ms.
> 

Hi Mike,

This is how wars happened LOL :-)

> Do you even understand what you are trying to test?

When I read patch 4/5, I saw you mentioned 4KB writes:
"e.g. at "background" writeback of target rate = 8, it would not combine
two adjacent 4k writes and would instead seek the disk twice."

And when we talked about patch 5/5, you mentioned 1MB writes:
"- When writeback rate is medium, it does I/O more efficiently.  e.g.
if the current writeback rate is 10MB/sec, and there are two
contiguous 1MB segments, they would not presently be combined.  A 1MB
write would occur, then we would increase the delay counter by 100ms,
and then the next write would wait; this new code would issue 2 1MB
writes one after the other, and then sleep 200ms.  On a disk that does
150MB/sec sequential, and has a 7ms seek time, this uses the disk for
13ms + 7ms, compared to the old code that does 13ms + 7ms * 2.  This
is the difference between using 10% of the disk's I/O throughput and
13% of the disk's throughput to do the same work."

Then I assumed the bio reorder patches should work well for write
sizes from 4KB to 1MB. Also I thought "hmm, if the write size is
smaller, there will be less chance for dirty blocks to be contiguous
on the cached device", so I chose 512KB.

Here is my command line to set up the bcache:

make-bcache -B <cached device> -C <cache device>
echo <cache device> > /sys/fs/bcache/register
echo <cached device> > /sys/fs/bcache/register
sleep 1
echo 0 > /sys/block/bcache0/bcache/cache/congested_read_threshold_us
echo 0 > /sys/block/bcache0/bcache/cache/congested_write_threshold_us
echo writeback > /sys/block/bcache0/bcache/cache_mode
echo 0 > /sys/block/bcache0/bcache/writeback_running

Now that writeback is disabled, I use fio to write dirty data onto the
cache device. The following is the fio job file:
[global]
direct=1
thread=1
ioengine=libaio

[job]
filename=/dev/bcache0
readwrite=randwrite
numjobs=8
;blocksize=64k
blocksize=512k
;blocksize=1M
iodepth=128
size=3000G
time_based=1
;runtime=10m
ramp_time=4
gtod_reduce=1
randrepeat=1

ramp_time, gtod_reduce and randrepeat are what I copied from your fio
example.

Then I watch the amount of dirty data; when it increases to a target
number (for example, half full), I kill the fio process.

Then I start writeback by
echo 1 > /sys/block/bcache0/bcache/writeback_running

and immediately run 2 bash scripts to collect performance data:
1) writeback_rate.sh
  while [ 1 ];do
        cat /sys/block/bcache0/bcache/writeback_rate_debug
        echo -e "\n\n"
        sleep 60
  done
2) iostat command line
  iostat -x 1 <cache device> <cached device> <more disks compose md
device> | tee iostat.log

The writeback rate debug information is collected every minute, and
the iostat information every second. They are all dedicated disks for
testing, just raw disks without any file system.

Thanks.

Coly


> On Fri, Oct 6, 2017 at 3:36 AM, Coly Li <i@coly.li> wrote:
>> On 2017/10/6 下午5:20, Michael Lyle wrote:
>>> Coly--
>>>
>>> I did not say the result from the changes will be random.
>>>
>>> I said the result from your test will be random, because where the
>>> writeback position is making non-contiguous holes in the data is
>>> nondeterministic-- it depends where it is on the disk at the instant
>>> that writeback begins.  There is a high degree of dispersion in the
>>> test scenario you are running that is likely to exceed the differences
>>> from my patch.
>>
>> Hi Mike,
>>
>> I did the test quite carefully. Here is how I ran the test,
>> - disable writeback by echo 0 to writeback_running.
>> - write random data into cache to full or half size, then stop the I/O
>> immediately.
>> - echo 1 to writeback_running to start writeback
>> - and record performance data at once
>>
>> It might be random position where the writeback starts, but there should
>> not be too much difference of statistical number of the continuous
>> blocks (on cached device). Because fio just send random 512KB blocks
>> onto cache device, the statistical number of contiguous blocks depends
>> on cache device vs. cached device size, and how full the cache device is
>> occupied.
>>
>> Indeed, I repeated some tests more than once (except the md raid5 and md
>> raid0 configurations), the results are quite stable when I see the data
>> charts, no big difference.
>>
>> If you feel the performance result I provided is problematic, it would
>> be better to let the data talk. You need to show your performance test
>> number to prove that the bio reorder patches are helpful for general
>> workloads, or at least helpful to many typical workloads.
>>
>> Let the data talk.
>>
>> Thanks.
>>
>> --
>> Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-06 11:09                                                     ` Michael Lyle
@ 2017-10-06 11:57                                                       ` Michael Lyle
  -1 siblings, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-10-06 11:57 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Coly Li, Junhui Tang, linux-bcache, linux-block, Kent Overstreet

OK, here's some data:  http://jar.lyle.org/~mlyle/writeback/

The complete test script is there to automate running writeback
scenarios--- NOTE DONT RUN WITHOUT EDITING THE DEVICES FOR YOUR
HARDWARE.

Only one run each way, but they take 8-9 minutes to run, we can easily
get more ;)  I compared patches 1-3 (which are uncontroversial) to
1-5.

Concerns I've heard:

- The new patches will contend for I/O bandwidth with front-end writes:

No:
 3 PATCHES: write: io=29703MB, bw=83191KB/s, iops=10398, runt=365618msec
vs
 5 PATCHES: write: io=29746MB, bw=86177KB/s, iops=10771, runt=353461msec

It may actually be slightly better-- 3% or so.

- The new patches will not improve writeback rate.

No:

3 PATCHES: the active period of the test was 366+100=466 seconds, and
at the end there was 33.4G dirty.
5 PATCHES: the active period of the test was 353+100=453 seconds, and
at the end there was 32.7G dirty.

This is a moderate improvement.

- The IO scheduler can combine the writes anyways so this type of
patch will not increase write queue merge.

No:

Average wrqm/s is 1525.4 in the 3 PATCHES dataset; average wrqm/s is
1643.7 in the 5 PATCHES dataset.

During the last 100 seconds, when ONLY WRITEBACK is occurring, wrqm is
1398.0 in 3 PATCHES, and 1811.6 with 5 PATCHES.

- Front-end latency will suffer:

No:

The datasets look the same to my eye.  By far the worst thing is the
occasional 1000ms+ times the bcache goes to sleep in both scenarios,
contending for the writeback lock (not affected by these patches, but
an item for future work if I ever get to move on to a new topic).

Conclusion:

These patches provide a small but significant improvement in writeback
rates, that can be seen with careful testing that produces actual
sequential writeback.  They lay the groundwork for further
improvements, through the use of plugging the block layer and to allow
accelerated writeback when the device is idle.

Mike

On Fri, Oct 6, 2017 at 4:09 AM, Michael Lyle <mlyle@lyle.org> wrote:
> Hannes--
>
> Thanks for your input.
>
> Assuming there's contiguous data to writeback, the dataset size is
> immaterial; writeback gathers 500 extents from a btree, and writes
> back up to 64 of them at a time.  With 8k extents, the amount of data
> the writeback code is juggling at a time is about 4 megabytes at
> maximum.
>
> Optimizing writeback only does something significant when the chunks
> to write back are relatively small, and when there's actual extents
> next to each other to write back.
>
> If there's big chunks, the spinning disk takes a long time to write
> each one; and that time allows both the drive itself with native
> command queueing and the IO scheduler lots of time to combine the
> write.  Not to mention that even if there is a small delay due to
> non-sequential / short-seek the difference in performance is minimal,
> because 512k extents tie up the disk for a long time.
>
> Also, I think the test scenario doesn't really have any adjacent
> extents to writeback, which doesn't help.
>
> I will forward performance data and complete scripts to run a
> reasonable scenario.
>
> Mike
>
> On Fri, Oct 6, 2017 at 4:00 AM, Hannes Reinecke <hare@suse.de> wrote:
>> On 10/06/2017 12:42 PM, Michael Lyle wrote:
>>> Coly--
>>>
>>> Holy crap, I'm not surprised you don't see a difference if you're
>>> writing with 512K size!  The potential benefit from merging is much
>>> less, and the odds of missing a merge is much smaller.  512KB is 5ms
>>> sequential by itself on a 100MB/sec disk--- lots more time to wait to
>>> get the next chunks in order, and even if you fail to merge the
>>> potential benefit is much less-- if the difference is mostly
>>> rotational latency from failing to merge then we're talking 5ms vs
>>> 5+2ms.
>>>
>>> Do you even understand what you are trying to test?
>>>
>>> Mike
>>>
>>> On Fri, Oct 6, 2017 at 3:36 AM, Coly Li <i@coly.li> wrote:
>>>> On 2017/10/6 下午5:20, Michael Lyle wrote:
>>>>> Coly--
>>>>>
>>>>> I did not say the result from the changes will be random.
>>>>>
>>>>> I said the result from your test will be random, because where the
>>>>> writeback position is making non-contiguous holes in the data is
>>>>> nondeterministic-- it depends where it is on the disk at the instant
>>>>> that writeback begins.  There is a high degree of dispersion in the
>>>>> test scenario you are running that is likely to exceed the differences
>>>>> from my patch.
>>>>
>>>> Hi Mike,
>>>>
>>>> I did the test quite carefully. Here is how I ran the test,
>>>> - disable writeback by echo 0 to writeback_running.
>>>> - write random data into cache to full or half size, then stop the I/O
>>>> immediately.
>>>> - echo 1 to writeback_running to start writeback
>>>> - and record performance data at once
>>>>
>>>> It might be random position where the writeback starts, but there should
>>>> not be too much difference of statistical number of the continuous
>>>> blocks (on cached device). Because fio just send random 512KB blocks
>>>> onto cache device, the statistical number of contiguous blocks depends
>>>> on cache device vs. cached device size, and how full the cache device is
>>>> occupied.
>>>>
>>>> Indeed, I repeated some tests more than once (except the md raid5 and md
>>>> raid0 configurations), the results are quite stable when I see the data
>>>> charts, no big difference.
>>>>
>>>> If you feel the performance result I provided is problematic, it would
>>>> be better to let the data talk. You need to show your performance test
>>>> number to prove that the bio reorder patches are helpful for general
>>>> workloads, or at least helpful to many typical workloads.
>>>>
>>>> Let the data talk.
>>>>
>>
>> I think it would be easier for everyone concerned if Coly could attach
>> the fio script / cmdline and the bcache setup here.
>> There still is a chance that both are correct, as different hardware
>> setups are being used.
>> We've seen this many times trying to establish workable performance
>> regression metrics for I/O; depending on the hardware one set of
>> optimisations fail to deliver the expected benefit on other platforms.
>> Just look at the discussion we're having currently with Ming Lei on the
>> SCSI mailing list trying to improve sequential I/O performance.
>>
>> But please try to calm down everyone. It's not that Coly is deliberately
>> blocking your patches, it's just that he doesn't see the performance
>> benefit on his side.
>> Might be that he's using the wrong parameters, but then that should be
>> clarified once the fio script is posted.
>>
>> At the same time I don't think that the size of the dataset is
>> immaterial. Larger datasets take up more space, and inevitably add more
>> overhead just for looking up the data in memory. Plus Coly has some
>> quite high-powered NVMe for the caching device, which will affect
>> writeback patterns, too.
>>
>> Cheers,
>>
>> Hannes
>> --
>> Dr. Hannes Reinecke                Teamlead Storage & Networking
>> hare@suse.de                                   +49 911 74053 688
>> SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
>> GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
>> HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
@ 2017-10-06 11:57                                                       ` Michael Lyle
  0 siblings, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-10-06 11:57 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Coly Li, Junhui Tang, linux-bcache, linux-block, Kent Overstreet

OK, here's some data:  http://jar.lyle.org/~mlyle/writeback/

The complete test script is there to automate running writeback
scenarios--- NOTE DONT RUN WITHOUT EDITING THE DEVICES FOR YOUR
HARDWARE.

Only one run each way, but they take 8-9 minutes to run, we can easily
get more ;)  I compared patches 1-3 (which are uncontroversial) to
1-5.

Concerns I've heard:

- The new patches will contend for I/O bandwidth with front-end writes:

No:
 3 PATCHES: write: io=29703MB, bw=83191KB/s, iops=10398, runt=365618msec
vs
 5 PATCHES: write: io=29746MB, bw=86177KB/s, iops=10771, runt=353461msec

It may actually be slightly better-- 3% or so.

- The new patches will not improve writeback rate.

No:

3 PATCHES: the active period of the test was 366+100=466 seconds, and
at the end there was 33.4G dirty.
5 PATCHES: the active period of the test was 353+100=453 seconds, and
at the end there was 32.7G dirty.

This is a moderate improvement.

- The IO scheduler can combine the writes anyways so this type of
patch will not increase write queue merge.

No:

Average wrqm/s is 1525.4 in the 3 PATCHES dataset; average wrqm/s is
1643.7 in the 5 PATCHES dataset.

During the last 100 seconds, when ONLY WRITEBACK is occurring, wrqm is
1398.0 in 3 PATCHES, and 1811.6 with 5 PATCHES.

- Front-end latency will suffer:

No:

The datasets look the same to my eye.  By far the worst thing is the
occasional 1000ms+ times the bcache goes to sleep in both scenarios,
contending for the writeback lock (not affected by these patches, but
an item for future work if I ever get to move on to a new topic).

Conclusion:

These patches provide a small but significant improvement in writeback
rates, that can be seen with careful testing that produces actual
sequential writeback.  They lay the groundwork for further
improvements, through the use of plugging the block layer and to allow
accelerated writeback when the device is idle.

Mike

On Fri, Oct 6, 2017 at 4:09 AM, Michael Lyle <mlyle@lyle.org> wrote:
> Hannes--
>
> Thanks for your input.
>
> Assuming there's contiguous data to writeback, the dataset size is
> immaterial; writeback gathers 500 extents from a btree, and writes
> back up to 64 of them at a time.  With 8k extents, the amount of data
> the writeback code is juggling at a time is about 4 megabytes at
> maximum.
>
> Optimizing writeback only does something significant when the chunks
> to write back are relatively small, and when there's actual extents
> next to each other to write back.
>
> If there's big chunks, the spinning disk takes a long time to write
> each one; and that time allows both the drive itself with native
> command queueing and the IO scheduler lots of time to combine the
> write.  Not to mention that even if there is a small delay due to
> non-sequential / short-seek the difference in performance is minimal,
> because 512k extents tie up the disk for a long time.
>
> Also, I think the test scenario doesn't really have any adjacent
> extents to writeback, which doesn't help.
>
> I will forward performance data and complete scripts to run a
> reasonable scenario.
>
> Mike
>
> On Fri, Oct 6, 2017 at 4:00 AM, Hannes Reinecke <hare@suse.de> wrote:
>> On 10/06/2017 12:42 PM, Michael Lyle wrote:
>>> Coly--
>>>
>>> Holy crap, I'm not surprised you don't see a difference if you're
>>> writing with 512K size!  The potential benefit from merging is much
>>> less, and the odds of missing a merge is much smaller.  512KB is 5ms
>>> sequential by itself on a 100MB/sec disk--- lots more time to wait to
>>> get the next chunks in order, and even if you fail to merge the
>>> potential benefit is much less-- if the difference is mostly
>>> rotational latency from failing to merge then we're talking 5ms vs
>>> 5+2ms.
>>>
>>> Do you even understand what you are trying to test?
>>>
>>> Mike
>>>
>>> On Fri, Oct 6, 2017 at 3:36 AM, Coly Li <i@coly.li> wrote:
>>>> On 2017/10/6 下午5:20, Michael Lyle wrote:
>>>>> Coly--
>>>>>
>>>>> I did not say the result from the changes will be random.
>>>>>
>>>>> I said the result from your test will be random, because where the
>>>>> writeback position is making non-contiguous holes in the data is
>>>>> nondeterministic-- it depends where it is on the disk at the instant
>>>>> that writeback begins.  There is a high degree of dispersion in the
>>>>> test scenario you are running that is likely to exceed the differences
>>>>> from my patch.
>>>>
>>>> Hi Mike,
>>>>
>>>> I did the test quite carefully. Here is how I ran the test,
>>>> - disable writeback by echo 0 to writeback_running.
>>>> - write random data into cache to full or half size, then stop the I/O
>>>> immediately.
>>>> - echo 1 to writeback_running to start writeback
>>>> - and record performance data at once
>>>>
>>>> It might be random position where the writeback starts, but there should
>>>> not be too much difference of statistical number of the continuous
>>>> blocks (on cached device). Because fio just send random 512KB blocks
>>>> onto cache device, the statistical number of contiguous blocks depends
>>>> on cache device vs. cached device size, and how full the cache device is
>>>> occupied.
>>>>
>>>> Indeed, I repeated some tests more than once (except the md raid5 and md
>>>> raid0 configurations), the results are quite stable when I see the data
>>>> charts, no big difference.
>>>>
>>>> If you feel the performance result I provided is problematic, it would
>>>> be better to let the data talk. You need to show your performance test
>>>> number to prove that the bio reorder patches are helpful for general
>>>> workloads, or at least helpful to many typical workloads.
>>>>
>>>> Let the data talk.
>>>>
>>
>> I think it would be easier for everyone concerned if Coly could attach
>> the fio script / cmdline and the bcache setup here.
>> There still is a chance that both are correct, as different hardware
>> setups are being used.
>> We've seen this many times trying to establish workable performance
>> regression metrics for I/O; depending on the hardware one set of
>> optimisations fail to deliver the expected benefit on other platforms.
>> Just look at the discussion we're having currently with Ming Lei on the
>> SCSI mailing list trying to improve sequential I/O performance.
>>
>> But please try to calm down everyone. It's not that Coly is deliberately
>> blocking your patches, it's just that he doesn't see the performance
>> benefit on his side.
>> Might be that he's using the wrong parameters, but then that should be
>> clarified once the fio script is posted.
>>
>> At the same time I don't think that the size of the dataset is
>> immaterial. Larger datasets take up more space, and inevitably add more
>> overhead just for looking up the data in memory. Plus Coly has some
>> quite high-powered NVMe for the caching device, which will affect
>> writeback patterns, too.
>>
>> Cheers,
>>
>> Hannes
>> --
>> Dr. Hannes Reinecke                Teamlead Storage & Networking
>> hare@suse.de                                   +49 911 74053 688
>> SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
>> GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
>> HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
@ 2017-10-06 11:09                                                     ` Michael Lyle
  0 siblings, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-10-06 11:09 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Coly Li, Junhui Tang, linux-bcache, linux-block, Kent Overstreet

Hannes--

Thanks for your input.

Assuming there's contiguous data to writeback, the dataset size is
immaterial; writeback gathers 500 extents from a btree, and writes
back up to 64 of them at a time.  With 8k extents, the amount of data
the writeback code is juggling at a time is about 4 megabytes at
maximum.
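
Rough arithmetic to make that concrete (illustrative standalone C, using the
figures above rather than the actual kernel constants):

#include <stdio.h>

/* Illustrative figures from the discussion above, not kernel constants. */
#define EXTENTS_GATHERED  500          /* dirty extents gathered per pass    */
#define MAX_IN_FLIGHT      64          /* writeback bios outstanding at once */
#define EXTENT_SIZE   (8 * 1024)       /* 8k extents in this example         */

int main(void)
{
        long batch_kb    = (long)EXTENTS_GATHERED * EXTENT_SIZE / 1024;
        long inflight_kb = (long)MAX_IN_FLIGHT * EXTENT_SIZE / 1024;

        /* ~4000 KB juggled per gathered batch, ~512 KB actually in flight */
        printf("batch: %ld KB, in flight: %ld KB\n", batch_kb, inflight_kb);
        return 0;
}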

Optimizing writeback only does something significant when the chunks
to write back are relatively small, and when there are actual extents
next to each other to write back.

If there are big chunks, the spinning disk takes a long time to write
each one, and that time gives both the drive itself (with native
command queueing) and the IO scheduler plenty of time to combine the
writes.  Not to mention that even if there is a small delay due to a
non-sequential / short seek, the difference in performance is minimal,
because 512k extents tie up the disk for a long time.

Also, I think the test scenario doesn't really have any adjacent
extents to writeback, which doesn't help.

I will forward performance data and complete scripts to run a
reasonable scenario.

Mike

On Fri, Oct 6, 2017 at 4:00 AM, Hannes Reinecke <hare@suse.de> wrote:
> On 10/06/2017 12:42 PM, Michael Lyle wrote:
>> Coly--
>>
>> Holy crap, I'm not surprised you don't see a difference if you're
>> writing with 512K size!  The potential benefit from merging is much
>> less, and the odds of missing a merge is much smaller.  512KB is 5ms
>> sequential by itself on a 100MB/sec disk--- lots more time to wait to
>> get the next chunks in order, and even if you fail to merge the
>> potential benefit is much less-- if the difference is mostly
>> rotational latency from failing to merge then we're talking 5ms vs
>> 5+2ms.
>>
>> Do you even understand what you are trying to test?
>>
>> Mike
>>
>> On Fri, Oct 6, 2017 at 3:36 AM, Coly Li <i@coly.li> wrote:
>>> On 2017/10/6 下午5:20, Michael Lyle wrote:
>>>> Coly--
>>>>
>>>> I did not say the result from the changes will be random.
>>>>
>>>> I said the result from your test will be random, because where the
>>>> writeback position is making non-contiguous holes in the data is
>>>> nondeterministic-- it depends where it is on the disk at the instant
>>>> that writeback begins.  There is a high degree of dispersion in the
>>>> test scenario you are running that is likely to exceed the differences
>>>> from my patch.
>>>
>>> Hi Mike,
>>>
>>> I did the test quite carefully. Here is how I ran the test,
>>> - disable writeback by echo 0 to writeback_running.
>>> - write random data into cache to full or half size, then stop the I/O
>>> immediately.
>>> - echo 1 to writeback_running to start writeback
>>> - and record performance data at once
>>>
>>> The position where writeback starts might be random, but there should
>>> not be too much difference in the statistical number of contiguous
>>> blocks (on the cached device). Because fio just sends random 512KB blocks
>>> onto the cache device, the statistical number of contiguous blocks depends
>>> on the cache device vs. cached device size, and on how full the cache
>>> device is.
>>>
>>> Indeed, I repeated some tests more than once (except the md raid5 and md
>>> raid0 configurations), the results are quite stable when I look at the
>>> data charts, no big difference.
>>>
>>> If you feel the performance result I provided is problematic, it would
>>> be better to let the data talk. You need to show your performance test
>>> number to prove that the bio reorder patches are helpful for general
>>> workloads, or at least helpful to many typical workloads.
>>>
>>> Let the data talk.
>>>
>
> I think it would be easier for everyone concerned if Coly could attach
> the fio script / cmdline and the bcache setup here.
> There still is a chance that both are correct, as different hardware
> setups are being used.
> We've seen this many times trying to establish workable performance
> regression metrics for I/O; depending on the hardware one set of
> optimisations fails to deliver the expected benefit on other platforms.
> Just look at the discussion we're having currently with Ming Lei on the
> SCSI mailing list trying to improve sequential I/O performance.
>
> But please try to calm down everyone. It's not that Coly is deliberately
> blocking your patches, it's just that he doesn't see the performance
> benefit on his side.
> Might be that he's using the wrong parameters, but then that should be
> clarified once the fio script is posted.
>
> At the same time I don't think that the size of the dataset is
> immaterial. Larger datasets take up more space, and inevitably add more
> overhead just for looking up the data in memory. Plus Coly has some
> quite high-powered NVMe for the caching device, which will affect
> writeback patterns, too.
>
> Cheers,
>
> Hannes
> --
> Dr. Hannes Reinecke                Teamlead Storage & Networking
> hare@suse.de                                   +49 911 74053 688
> SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
> GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
> HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-06 10:42                                                 ` Michael Lyle
  (?)
  (?)
@ 2017-10-06 11:00                                                 ` Hannes Reinecke
  2017-10-06 11:09                                                     ` Michael Lyle
  -1 siblings, 1 reply; 65+ messages in thread
From: Hannes Reinecke @ 2017-10-06 11:00 UTC (permalink / raw)
  To: Michael Lyle, Coly Li
  Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

On 10/06/2017 12:42 PM, Michael Lyle wrote:
> Coly--
> 
> Holy crap, I'm not surprised you don't see a difference if you're
> writing with 512K size!  The potential benefit from merging is much
> less, and the odds of missing a merge is much smaller.  512KB is 5ms
> sequential by itself on a 100MB/sec disk--- lots more time to wait to
> get the next chunks in order, and even if you fail to merge the
> potential benefit is much less-- if the difference is mostly
> rotational latency from failing to merge then we're talking 5ms vs
> 5+2ms.
> 
> Do you even understand what you are trying to test?
> 
> Mike
> 
> On Fri, Oct 6, 2017 at 3:36 AM, Coly Li <i@coly.li> wrote:
>> On 2017/10/6 下午5:20, Michael Lyle wrote:
>>> Coly--
>>>
>>> I did not say the result from the changes will be random.
>>>
>>> I said the result from your test will be random, because where the
>>> writeback position is making non-contiguous holes in the data is
>>> nondeterministic-- it depends where it is on the disk at the instant
>>> that writeback begins.  There is a high degree of dispersion in the
>>> test scenario you are running that is likely to exceed the differences
>>> from my patch.
>>
>> Hi Mike,
>>
>> I did the test quite carefully. Here is how I ran the test,
>> - disable writeback by echo 0 to writeback_running.
>> - write random data into cache to full or half size, then stop the I/O
>> immediately.
>> - echo 1 to writeback_running to start writeback
>> - and record performance data at once
>>
>> The position where writeback starts might be random, but there should
>> not be too much difference in the statistical number of contiguous
>> blocks (on the cached device). Because fio just sends random 512KB blocks
>> onto the cache device, the statistical number of contiguous blocks depends
>> on the cache device vs. cached device size, and on how full the cache
>> device is.
>>
>> Indeed, I repeated some tests more than once (except the md raid5 and md
>> raid0 configurations), the results are quite stable when I look at the
>> data charts, no big difference.
>>
>> If you feel the performance result I provided is problematic, it would
>> be better to let the data talk. You need to show your performance test
>> number to prove that the bio reorder patches are helpful for general
>> workloads, or at least helpful to many typical workloads.
>>
>> Let the data talk.
>>

I think it would be easier for everyone concerned if Coly could attach
the fio script / cmdline and the bcache setup here.
There still is a chance that both are correct, as different hardware
setups are being used.
We've seen this many times trying to establish workable performance
regression metrics for I/O; depending on the hardware one set of
optimisations fails to deliver the expected benefit on other platforms.
Just look at the discussion we're having currently with Ming Lei on the
SCSI mailing list trying to improve sequential I/O performance.

But please try to calm down everyone. It's not that Coly is deliberately
blocking your patches, it's just that he doesn't see the performance
benefit on his side.
Might be that he's using the wrong parameters, but then that should be
clarified once the fio script is posted.

At the same time I don't think that the size of the dataset is
immaterial. Larger datasets take up more space, and inevitably add more
overhead just for looking up the data in memory. Plus Coly has some
quite high-powered NVMe for the caching device, which will affect
writeback patterns, too.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
@ 2017-10-06 10:56                                                   ` Michael Lyle
  0 siblings, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-10-06 10:56 UTC (permalink / raw)
  To: Coly Li; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

I will write a test bench and send results soon.

Just please note-- you've crafted a test where there's not likely to
be sequential data to writeback, and chosen a block size where there
is limited difference between sequential and nonsequential writeback.
Not surprisingly, you don't see a real difference with code that is
trying to optimize sequential writeback :P

Mike

On Fri, Oct 6, 2017 at 3:42 AM, Michael Lyle <mlyle@lyle.org> wrote:
> Coly--
>
> Holy crap, I'm not surprised you don't see a difference if you're
> writing with 512K size!  The potential benefit from merging is much
> less, and the odds of missing a merge is much smaller.  512KB is 5ms
> sequential by itself on a 100MB/sec disk--- lots more time to wait to
> get the next chunks in order, and even if you fail to merge the
> potential benefit is much less-- if the difference is mostly
> rotational latency from failing to merge then we're talking 5ms vs
> 5+2ms.
>
> Do you even understand what you are trying to test?
>
> Mike
>
> On Fri, Oct 6, 2017 at 3:36 AM, Coly Li <i@coly.li> wrote:
>> On 2017/10/6 下午5:20, Michael Lyle wrote:
>>> Coly--
>>>
>>> I did not say the result from the changes will be random.
>>>
>>> I said the result from your test will be random, because where the
>>> writeback position is making non-contiguous holes in the data is
>>> nondeterministic-- it depends where it is on the disk at the instant
>>> that writeback begins.  There is a high degree of dispersion in the
>>> test scenario you are running that is likely to exceed the differences
>>> from my patch.
>>
>> Hi Mike,
>>
>> I did the test quite carefully. Here is how I ran the test,
>> - disable writeback by echo 0 to writeback_running.
>> - write random data into cache to full or half size, then stop the I/O
>> immediately.
>> - echo 1 to writeback_running to start writeback
>> - and record performance data at once
>>
>> The position where writeback starts might be random, but there should
>> not be too much difference in the statistical number of contiguous
>> blocks (on the cached device). Because fio just sends random 512KB blocks
>> onto the cache device, the statistical number of contiguous blocks depends
>> on the cache device vs. cached device size, and on how full the cache
>> device is.
>>
>> Indeed, I repeated some tests more than once (except the md raid5 and md
>> raid0 configurations), the results are quite stable when I look at the
>> data charts, no big difference.
>>
>> If you feel the performance result I provided is problematic, it would
>> be better to let the data talk. You need to show your performance test
>> number to prove that the bio reorder patches are helpful for general
>> workloads, or at least helpful to many typical workloads.
>>
>> Let the data talk.
>>
>> Thanks.
>>
>> --
>> Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
@ 2017-10-06 10:42                                                 ` Michael Lyle
  0 siblings, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-10-06 10:42 UTC (permalink / raw)
  To: Coly Li; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

Coly--

Holy crap, I'm not surprised you don't see a difference if you're
writing with 512K size!  The potential benefit from merging is much
less, and the odds of missing a merge are much smaller.  512KB is 5ms
sequential by itself on a 100MB/sec disk--- lots more time to wait to
get the next chunks in order, and even if you fail to merge the
potential benefit is much less-- if the difference is mostly
rotational latency from failing to merge then we're talking 5ms vs
5+2ms.
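
Spelling the arithmetic out (a standalone sketch; the 100MB/sec throughput
and the ~2ms cost of a missed merge are the rough figures assumed above):

#include <stdio.h>

int main(void)
{
        const double disk_mb_s = 100.0;  /* assumed sequential throughput          */
        const double miss_ms   = 2.0;    /* rough extra latency for a missed merge */
        const double big_kb    = 512.0;  /* block size used in the test            */
        const double small_kb  = 8.0;    /* extent size where merging matters      */

        double big_ms   = big_kb   / (disk_mb_s * 1024.0) * 1000.0;  /* ~5 ms    */
        double small_ms = small_kb / (disk_mb_s * 1024.0) * 1000.0;  /* ~0.08 ms */

        printf("512KB chunk: %.1f ms, +%.0f%% if a merge is missed\n",
               big_ms, 100.0 * miss_ms / big_ms);
        printf("8KB extent:  %.2f ms, +%.0f%% if a merge is missed\n",
               small_ms, 100.0 * miss_ms / small_ms);
        return 0;
}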

Do you even understand what you are trying to test?

Mike

On Fri, Oct 6, 2017 at 3:36 AM, Coly Li <i@coly.li> wrote:
> On 2017/10/6 下午5:20, Michael Lyle wrote:
>> Coly--
>>
>> I did not say the result from the changes will be random.
>>
>> I said the result from your test will be random, because where the
>> writeback position is making non-contiguous holes in the data is
>> nondeterministic-- it depends where it is on the disk at the instant
>> that writeback begins.  There is a high degree of dispersion in the
>> test scenario you are running that is likely to exceed the differences
>> from my patch.
>
> Hi Mike,
>
> I did the test quite carefully. Here is how I ran the test,
> - disable writeback by echo 0 to writeback_running.
> - write random data into cache to full or half size, then stop the I/O
> immediately.
> - echo 1 to writeback_running to start writeback
> - and record performance data at once
>
> The position where writeback starts might be random, but there should
> not be too much difference in the statistical number of contiguous
> blocks (on the cached device). Because fio just sends random 512KB blocks
> onto the cache device, the statistical number of contiguous blocks depends
> on the cache device vs. cached device size, and on how full the cache
> device is.
>
> Indeed, I repeated some tests more than once (except the md raid5 and md
> raid0 configurations), the results are quite stable when I look at the
> data charts, no big difference.
>
> If you feel the performance result I provided is problematic, it would
> be better to let the data talk. You need to show your performance test
> number to prove that the bio reorder patches are helpful for general
> workloads, or at least helpful to many typical workloads.
>
> Let the data talk.
>
> Thanks.
>
> --
> Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-06  9:20                                           ` Michael Lyle
@ 2017-10-06 10:36                                             ` Coly Li
  2017-10-06 10:42                                                 ` Michael Lyle
  0 siblings, 1 reply; 65+ messages in thread
From: Coly Li @ 2017-10-06 10:36 UTC (permalink / raw)
  To: Michael Lyle; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

On 2017/10/6 下午5:20, Michael Lyle wrote:
> Coly--
> 
> I did not say the result from the changes will be random.
> 
> I said the result from your test will be random, because where the
> writeback position is making non-contiguous holes in the data is
> nondeterministic-- it depends where it is on the disk at the instant
> that writeback begins.  There is a high degree of dispersion in the
> test scenario you are running that is likely to exceed the differences
> from my patch.

Hi Mike,

I did the test quite carefully. Here is how I ran the test,
- disable writeback by echo 0 to writeback_running.
- write random data into cache to full or half size, then stop the I/O
immediately.
- echo 1 to writeback_running to start writeback
- and record performance data at once

The position where writeback starts might be random, but there should
not be too much difference in the statistical number of contiguous
blocks (on the cached device). Because fio just sends random 512KB blocks
onto the cache device, the statistical number of contiguous blocks depends
on the cache device vs. cached device size, and on how full the cache
device is.

Indeed, I repeated some tests more than once (except the md raid5 and md
raid0 configurations), the results are quite stable when I look at the
data charts, no big difference.

If you feel the performance result I provided is problematic, it would
be better to let the data talk. You need to show your performance test
number to prove that the bio reorder patches are helpful for general
workloads, or at least helpful to many typical workloads.

Let the data talk.

Thanks.

-- 
Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-06  8:27                                         ` Coly Li
@ 2017-10-06  9:20                                           ` Michael Lyle
  2017-10-06 10:36                                             ` Coly Li
  0 siblings, 1 reply; 65+ messages in thread
From: Michael Lyle @ 2017-10-06  9:20 UTC (permalink / raw)
  To: Coly Li; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

Coly--

I did not say the result from the changes will be random.

I said the result from your test will be random, because where the
writeback position is making non-contiguous holes in the data is
nondeterministic-- it depends where it is on the disk at the instant
that writeback begins.  There is a high degree of dispersion in the
test scenario you are running that is likely to exceed the differences
from my patch.

Mike

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-05 22:59                                       ` Michael Lyle
@ 2017-10-06  8:27                                         ` Coly Li
  2017-10-06  9:20                                           ` Michael Lyle
  0 siblings, 1 reply; 65+ messages in thread
From: Coly Li @ 2017-10-06  8:27 UTC (permalink / raw)
  To: Michael Lyle; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

On 2017/10/6 上午6:59, Michael Lyle wrote:
> I think one of the problems here is that there is no consistent
> requirements or figure of merit for performance.

Hi Mike,

I agree with you. Yes, that is exactly it. Performance optimization is
always specific to a particular workload; there is almost no way to make
everyone happy. To me, most of the time it is a trade-off: if I want to
have this, I know there must be something I have to pay or lose.

The decision depends on which kind of workload people care about, and
obviously we cannot reach an agreement so far. For our discussion, it is
not just about right or wrong, it is mainly about making a choice.

> You've argued against changes because of A) perceived impact to
> front-end I/O latency, B) writeback rate at idle, C) peak
> instantaneous writeback rate, D) time to writeback the entire amount.
> These are all, to some extent, contradictory.
> 
> I was trying to make changes in writeback to improve **efficiency**--
> the amount of writeback that is done per the amount of time the
> backing device is tied up.  We could have added tunables for A/B/C to
> pick any operating point along there.  In addition, the changes I was trying
> to make were **on the path to further improvement**-- submitting
> sequential I/O together with plugging is known to improve merge.
> 
> Users on #bcache IRC on oftc.net constantly talk about writeback
> performance and writeback use cases-- it's probably the most common
> topic of discussion about bcache.  These changes would have improved
> that and laid the groundwork for further improvement.

I assume that if the bio reorder patches work better in the above tests,
then maybe they will perform better with more complicated workloads as
well. This assumption might not be correct, but at least I can have
reproducible performance numbers to make a decision.

This is not the first time I have encountered a discussion like the one
we are having. My rule is: let the data talk to each other.

You explain your idea very well; it is clear to me. But if you don't
have reproducible performance data to support the idea, and you tell
people the performance optimization result might be random, it will be
quite hard for people to understand it and make a decision about the
trade-off.

> I've retired, but a year ago I was running a bcache environment with
> about 40 computers running development environments  and continuous
> integration test rigs; we would provision and tear down hundreds of
> VMs per day; machines had 256-512GB of RAM, 1TB SATA RAID1 cache
> volumes and RAID10 sets 6-8 drives wide.  The use case was enterprise
> DB-- so I think I know a thing or two about enterprise use cases ;)
> 
> Writeback did obviously bad things in that environment, which is why I
> wanted to fix it.  I also see obviously bad behavior on my small home
> test rig of RAID1 480GB NVMe + RAID1 6TB disks.

Aha, now I see why I enjoyed reading your patch (commit
9276717b9e29 ("bcache: fix bch_hprint crash and improve output")) so
much; you are a very experienced engineer. Your bio reorder patches work
correctly as you described; the problem is on my side: I won't support
this idea because it may hurt performance of the workloads I care about.

It is rare to meet experienced people like you. I do hope you will stay
here and continue to help us with patch review and code improvement.

Thanks.

-- 
Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-05 17:53                                     ` Michael Lyle
  2017-10-05 18:07                                       ` Coly Li
@ 2017-10-05 22:59                                       ` Michael Lyle
  2017-10-06  8:27                                         ` Coly Li
  1 sibling, 1 reply; 65+ messages in thread
From: Michael Lyle @ 2017-10-05 22:59 UTC (permalink / raw)
  To: Coly Li; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

I think one of the problems here is that there is no consistent
requirements or figure of merit for performance.

You've argued against changes because of A) perceived impact to
front-end I/O latency, B) writeback rate at idle, C) peak
instantaneous writeback rate, D) time to writeback the entire amount.
These are all, to some extent, contradictory.

I was trying to make changes in writeback to improve **efficiency**--
the amount of writeback that is done per the amount of time the
backing device is tied up.  We could have added tunables for A/B/C to
pick any operating point along there.  In addition, the changes I was trying
to make were **on the path to further improvement**-- submitting
sequential I/O together with plugging is known to improve merge.
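
As a rough sketch of that shape (not the actual patch; struct wb_io and the
calling convention are placeholders for however the dirty extents end up
being represented):

#include <linux/blkdev.h>
#include <linux/bio.h>

/* Stand-in for one prepared writeback bio; placeholder, not a bcache type. */
struct wb_io {
        struct bio bio;
};

/* Issue a run of LBA-ordered writeback bios under one plug so the block
 * layer gets a chance to merge adjacent ones before they reach the device.
 */
static void wb_issue_run(struct wb_io *ios[], int n)
{
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);
        for (i = 0; i < n; i++)
                submit_bio(&ios[i]->bio);
        blk_finish_plug(&plug);  /* unplug: merged requests are dispatched together */
}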

Users on #bcache IRC on oftc.net constantly talk about writeback
performance and writeback use cases-- it's probably the most common
topic of discussion about bcache.  These changes would have improved
that and laid the groundwork for further improvement.

I've retired, but a year ago I was running a bcache environment with
about 40 computers running development environments  and continuous
integration test rigs; we would provision and tear down hundreds of
VMs per day; machines had 256-512GB of RAM, 1TB SATA RAID1 cache
volumes and RAID10 sets 6-8 drives wide.  The use case was enterprise
DB-- so I think I know a thing or two about enterprise use cases ;)

Writeback did obviously bad things in that environment, which is why I
wanted to fix it.  I also see obviously bad behavior on my small home
test rig of RAID1 480GB NVMe + RAID1 6TB disks.

Mike

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-05 17:53                                     ` Michael Lyle
@ 2017-10-05 18:07                                       ` Coly Li
  2017-10-05 22:59                                       ` Michael Lyle
  1 sibling, 0 replies; 65+ messages in thread
From: Coly Li @ 2017-10-05 18:07 UTC (permalink / raw)
  To: Michael Lyle; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

On 2017/10/6 上午1:53, Michael Lyle wrote:
> On Thu, Oct 5, 2017 at 10:38 AM, Coly Li <i@coly.li> wrote:
>> [snip]
>>   In this test, without bio reorder patches, writeback throughput is
>> much faster, you may see the write request number and request merge
>> number are also much faster then bio reorder patches. After around 10
>> minutes later, there is no obvious performance difference with/without
>> the bio reorder patches. Therefore in this test, with bio reorder
>> patches, I observe a worse writeback performance result.
>>
>> The above tests tell me, to get a better writeback performance with bio
>> reorder patches, a specific situation is required (many contiguous dirty
>> data on cache device), this situation can only happen in some specific
>> of work loads. In general writeback situations, reordering bios by
>> waiting does not have significant performance advantage, and even
>> performance regression is observed.
> 
> I mean, did you not notice that one of the changes is to limit the
> amount of write-issue-merge from the current behavior?  It's not
> exactly surprising that the initial/peak rate is a bit lower
> initially...
> 
> Your test methodology is flawed-- I tried telling you this from the
> beginning-- and I'm not convinced at all you understand the point of
> the patches.  But hey, there's not really any other reviewers on
> bcache stuff, so I guess my stuff is blocked forever. ;)

Patches 1, 2 and 3 are in my for-next tree and will get more testing.

The new PI controller is simpler and easier to understand, thanks to your
comprehensive commit log and code comments, and it works as expected as a
flow controller. We should have it in 4.15, and I will try to make that
happen.
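
For anyone following along, the control law has roughly this shape (a toy
userspace sketch with illustrative gains and scaling, not the actual patch):

#include <stdio.h>

#define P_INVERSE 40      /* proportional: retire ~error/P_INVERSE per second */
#define I_INVERSE 10000   /* integral: slowly removes steady-state error      */
#define RATE_MIN  8       /* stands in for the minimum-rate tunable           */

static long long integral;

static long long update_rate(long long dirty, long long target)
{
        long long error = dirty - target;    /* positive when above target */
        long long rate;

        integral += error;                   /* assumes a fixed update interval */
        rate = error / P_INVERSE + integral / I_INVERSE;
        return rate > RATE_MIN ? rate : RATE_MIN;
}

int main(void)
{
        long long dirty = 1000000, target = 500000;
        int t;

        /* crude simulation: each tick, retire what the controller asks for */
        for (t = 0; t < 10; t++) {
                long long rate = update_rate(dirty, target);
                printf("t=%2d dirty=%lld rate=%lld\n", t, dirty, rate);
                dirty -= rate;
                if (dirty < target)
                        dirty = target;
        }
        return 0;
}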

Thank you for the great job :-)

-- 
Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-05 17:38                                   ` Coly Li
@ 2017-10-05 17:53                                     ` Michael Lyle
  2017-10-05 18:07                                       ` Coly Li
  2017-10-05 22:59                                       ` Michael Lyle
  0 siblings, 2 replies; 65+ messages in thread
From: Michael Lyle @ 2017-10-05 17:53 UTC (permalink / raw)
  To: Coly Li; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

On Thu, Oct 5, 2017 at 10:38 AM, Coly Li <i@coly.li> wrote:
> [snip]
>   In this test, without bio reorder patches, writeback throughput is
> much faster, you may see the write request number and request merge
> number are also much higher than with the bio reorder patches. After around 10
> minutes, there is no obvious performance difference with/without
> the bio reorder patches. Therefore in this test, with bio reorder
> patches, I observe a worse writeback performance result.
>
> The above tests tell me, to get a better writeback performance with bio
> reorder patches, a specific situation is required (many contiguous dirty
> data on cache device), and this situation can only happen with some
> specific workloads. In general writeback situations, reordering bios by
> waiting does not have significant performance advantage, and even
> performance regression is observed.

I mean, did you not notice that one of the changes is to limit the
amount of write-issue-merge from the current behavior?  It's not
exactly surprising that the initial/peak rate is a bit lower
initially...

Your test methodology is flawed-- I tried telling you this from the
beginning-- and I'm not convinced at all you understand the point of
the patches.  But hey, there aren't really any other reviewers on
bcache stuff, so I guess my stuff is blocked forever. ;)

Mike

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-04 23:54                                   ` Michael Lyle
  (?)
@ 2017-10-05 17:38                                   ` Coly Li
  2017-10-05 17:53                                     ` Michael Lyle
  -1 siblings, 1 reply; 65+ messages in thread
From: Coly Li @ 2017-10-05 17:38 UTC (permalink / raw)
  To: Michael Lyle; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

On 2017/10/5 上午7:54, Michael Lyle wrote:
> Coly---
> 
> Thanks for running these tests.
> 

Hi Mike,

You provided very detailed information for the PI controller patch, which
helped me understand it better. In return, I spent several days testing
your bio reorder patches; you deserve it :-)


> The change is expected to improve performance when the application
> writes to many adjacent blocks, out of order.  Many database workloads
> are like this.  My VM provisioning / installation / cleanup workloads
> have a lot of this, too.
> 

When you talk about an example of performance improvement, it is much
easier to understand with performance numbers, like what I did for you.
We need to see real data more than talk.

Maybe your above example is good for a single VM, or for database records
inserted into a single table. With multiple VMs installing or starting, or
multiple inserts into multiple databases or multiple tables, I don't know
whether your bio reorder patches still perform better.


> I believe that it really has nothing to do with how full the cache
> device is, or whether things are contiguous on the cache device.  It
> has to do with what proportion of the data is contiguous on the
> **backing device**.

Let me put it in a clearer way: for a given cache device and cached
device size, more dirty data on the cache device means a higher
probability that the dirty data is contiguous on the cached device. This
is another, workload-independent way to look at contiguity of dirty
blocks, because a randwrite fio job does not generate the working data
set you specified in the following example.
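
Before getting to that example, a crude way to put a number on this (a
standalone sketch that assumes dirty blocks land roughly uniformly at
random over the cached device, which is what a randwrite fio job
approximates):

#include <stdio.h>

/* If K of the N backing-device blocks are dirty and placement is roughly
 * uniform, the chance that the block after a dirty block is also dirty is
 * about K/N, so contiguity scales with how much of the *backing* device is
 * dirty, not with cache occupancy alone.
 */
int main(void)
{
        const double cache_to_backing = 0.5;   /* e.g. 1.8T cache over 3.6T backing */
        double cache_fill;

        for (cache_fill = 0.1; cache_fill <= 0.91; cache_fill += 0.2) {
                double backing_dirty = cache_fill * cache_to_backing;
                printf("cache %3.0f%% dirty -> ~%4.1f%% chance the adjacent block is dirty\n",
                       cache_fill * 100.0, backing_dirty * 100.0);
        }
        return 0;
}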

> 
> To get the best example of this from fio, the working set size, for
> write, needs to be less than half the size of the cache (otherwise,
> previous writebacks make "holes" in the middle of the data that will
> make things from being contiguous), but large enough to trigger
> writeback.  It may help in other circumstances, but the performance
> measurement will be much more random (it effectively depends on where
> the writeback cursor is in its cycle).
> 

Yes, I agree. But I am not able to understand a performance optimization
whose result is random. What I care about is: is your expected working
data set a common case for bcache usage? Will it help to improve
writeback performance for most bcache usage?

The current writeback percentage is in [0, 40%] with 10% as the default,
so your reorder patches might perform better when dirty data occupies
10%~50% of the cache device space. In my testing, the writeback rate
stays at the maximum number (488.2M/sec on my machine) and drops to the
minimum number (4.0k/sec with the PI controller) within 2 minutes once
the dirty amount gets close to the dirty target. I sampled the content of
the writeback_rate_debug file every 1 minute; here is the data:

rate:           488.2M/sec
dirty:          273.9G
target:         357.6G
proportional:   -2.0G
integral:       2.8G
change:         0.0k/sec
next io:        -1213ms



rate:           264.5M/sec
dirty:          271.8G
target:         357.6G
proportional:   -2.1G
integral:       2.3G
change:         -48.1M/sec
next io:        -2205ms



rate:           4.0k/sec
dirty:          270.7G
target:         357.6G
proportional:   -2.1G
integral:       1.8G
change:         0.0k/sec
next io:        1756ms

The writeback rate changes from the maximum number to the minimum number
in 2 minutes, then the writeback rate stays at 4.0k/sec, and from the
benchmark data there is little performance difference with/without the
bio reorder patches. When the cache device size is 232G, it took 278
minutes for the dirty data to decrease from full to the target number.
Before the 2 minute window, the writeback rate is always the maximum
number (the delay in read_dirty() is always 0); after the 2 minute
window, the writeback rate is always the minimum number. Therefore the
ideal rate for your patches 4,5 may only happen during the 2 minute
window. It does exist, but it is far from enough as an optimization.

> I'm not surprised that peak writeback rate on very fast backing
> devices will probably be a little less-- we only try to writeback 64
> things at a time; and NCQ queue depths are 32 or so-- so across a
> parallel RAID0 installation the drives will not have their queues
> filled entirely already.  Waiting for blocks to be in order will make
> the queues even less full.  However, they'll be provided with I/O in
> LBA order, so presumably the IO utilization and latency will be better
> during this.  Plugging will magnify these effects-- it will write back
> faster when there's contiguous data and utilize the devices more
> efficiently,
> 

You need real performance data to support your opinion.

> We could add a tunable for the writeback semaphore size if it's
> desired to have more than 64 things in flight-- of course, we can't
> have too many because dirty extents are currently selected from a pool
> of maximum 500.

Maybe increasing the in_flight semaphore helps, but this is not what I
was concerned about from the beginning. A typical cache device size
should be around 20% of the backing cached device size, or maybe less.
My concern is: in such a configuration, are there enough contiguous dirty
blocks to show a performance advantage from reordering them with a small
delay before issuing them?

If this assumption does not hold, an optimization for such a situation
does not help much for real workloads. This is why I ask you to show real
performance data, not only explain that the reordering idea is good in
"theory".

I do not agree with the current reordering patches because I have the
following benchmark results; they tell me patches 4,5 do not have a
performance advantage in many cases, and in some there is even a
performance regression ...

1) When dirty data on cache device is full
1.1) cache device: 1TB NVMe SSD
     cached device: 1.8TB hard disk
- existing dirty data on cache device

http://blog.coly.li/wp-content/uploads/2017/10/existing_dirty_data_on_cache_single_disk_1T_full_cache.png
- writeback request merge number on hard disk

http://blog.coly.li/wp-content/uploads/2017/10/write_request_merge_single_disk_1T_full_cache.png
- writeback request numbers on hard disk

http://blog.coly.li/wp-content/uploads/2017/10/writeback_request_numbers_on_single_disk_1T_full_cache.png
- writeback throughput on hard disk

http://blog.coly.li/wp-content/uploads/2017/10/writeback_throughput_sampling_single_disk_1T_full_cache.png
  The above results are the best cases I observed, and they are good :-)
This is a laptop-like configuration: I can see that with bio reordering,
more writeback requests are issued and merged, and it is even 2x faster
than the current bcache code.

1.2) cache device: 232G NVMe SSD
     cached device: 232G SATA SSD
- existing dirty data on cache device

http://blog.coly.li/wp-content/uploads/2017/10/existing_dirty_data_on_cache_single_SATA_SSD_and_232G_full_cache.png
- writeback request merge number on SATA SSD

http://blog.coly.li/wp-content/uploads/2017/10/writeback_request_merge_numbers_on_SATA_SSD_232G_full_cache.png
- writeback request numbers on SATA SSD

http://blog.coly.li/wp-content/uploads/2017/10/writeback_request_numbers_on_SATA_SSD_232G_full_cache.png
- writeback throughput on SATA SSD

http://blog.coly.li/wp-content/uploads/2017/10/writeback_throughput_on_SATA_SSD_232G_full_cache.png
You may say that in this configuration, if the backing device is fast
enough, there is almost no difference with or without the bio reordering
patches. (I still don't know why, with the bio reordering patches, the
writeback rate decreases faster than with the current bcache code when
the dirty percentage gets close to the dirty target.)

1.3) cache device: 1T NVMe SSD
     cached device: 1T md raid5 composed by 4 hard disks
- existing dirty data on cache device

http://blog.coly.li/wp-content/uploads/2017/10/existing_dirty_data_on_SSD_raid5_as_backing_and_1T_cache_full.png
- writeback throughput on md raid5

http://blog.coly.li/wp-content/uploads/2017/10/writeback_throughput_sampling_raid5_1T_cache_full.png
  The testing is incomplete. It is very slow: dirty data decreases by
150MB in 2 hours, still far from the dirty target. It should take more
than 8 hours to reach the dirty target, so I only recorded the first 25%
of the data and gave up. It seems at first the bio reorder patches are a
little slower, but around 55 minutes later they start to be faster than
the current bcache code, and when I stopped the test at 133 minutes, the
bio reorder patches had 50MB less dirty data on the cache device. At
least the bio reorder patches are not bad :-) But completing such a test
needs at least 16 hours, so I gave up.
  I also observed similar performance behavior on an md raid0 composed of
4 hard disks, and gave up after 2+ hours too.

2) when dirty data on cache device is close to dirty target
2.1) cache device: 3.4TB NVMe SSD
     cached device: 7.2TB md raid0 by 4 hard disks
- read dirty requests on NVMe SSD

http://blog.coly.li/wp-content/uploads/2017/10/read_dirty_requests_on_SSD_small_data_set.png
- read dirty throughput on NVMe SSD

http://blog.coly.li/wp-content/uploads/2017/10/read_dirty_throughput_on_SSD_small_set.png
  I mentioned these performance numbers in a previous email: when dirty
data gets close to the dirty target, the writeback rate drops from the
maximum number to the minimum number in 2 minutes, and then there is
almost no performance difference with/without the bio reorder patches. It
is interesting that without the bio reordering patches, in my test the
read dirty requests are even faster. That only happens for several
minutes, so it is not a big issue.

3) when dirty data on cache device occupies 50% cache space
3.1) cache device: 1.8TB NVMe SSD
     cached device: 3.6TB md linear device composed by 2 hard disks
     dirty data occupies 900G on cache before writeback starts
- existing dirty data on cache device

- writeback request merge number on hard disks

http://blog.coly.li/wp-content/uploads/2017/10/writeback_request_merge_number_on_linear_900_1800G_cache_half.png
- writeback request number on hard disks

http://blog.coly.li/wp-content/uploads/2017/10/writeback_request_number_on_linear_900_1800G_cache_half.png
- writeback throughput on hard disks

http://blog.coly.li/wp-content/uploads/2017/10/writeback_throughput_on_linear_900_1800G_cache_half.png
  In this test, without the bio reorder patches, writeback throughput is
much faster; you may see the write request number and request merge
number are also much higher than with the bio reorder patches. After
around 10 minutes, there is no obvious performance difference with or
without the bio reorder patches. Therefore in this test, with the bio
reorder patches, I observe a worse writeback performance result.

The above tests tell me that to get better writeback performance with the
bio reorder patches, a specific situation is required (a lot of
contiguous dirty data on the cache device), and this situation can only
happen with some specific workloads. In general writeback situations,
reordering bios by waiting does not have a significant performance
advantage, and even a performance regression is observed.

Maybe I am wrong, but you need to provide more positive performance
numbers for more generic workloads as evidence in further discussion.

Thanks.

-- 
Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-04 18:43                                 ` Coly Li
@ 2017-10-04 23:54                                   ` Michael Lyle
  -1 siblings, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-10-04 23:54 UTC (permalink / raw)
  To: Coly Li; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

Coly---

Thanks for running these tests.

The change is expected to improve performance when the application
writes to many adjacent blocks, out of order.  Many database workloads
are like this.  My VM provisioning / installation / cleanup workloads
have a lot of this, too.

I believe that it really has nothing to do with how full the cache
device is, or whether things are contiguous on the cache device.  It
has to do with what proportion of the data is contiguous on the
**backing device**.

To get the best example of this from fio, the working set size, for
write, needs to be less than half the size of the cache (otherwise,
previous writebacks make "holes" in the middle of the data that will
keep things from being contiguous), but large enough to trigger
writeback.  It may help in other circumstances, but the performance
measurement will be much more random (it effectively depends on where
the writeback cursor is in its cycle).

I'm not surprised that peak writeback rate on very fast backing
devices will probably be a little less-- we only try to writeback 64
things at a time; and NCQ queue depths are 32 or so-- so across a
parallel RAID0 installation the drives will not have their queues
filled entirely already.  Waiting for blocks to be in order will make
the queues even less full.  However, they'll be provided with I/O in
LBA order, so presumably the IO utilization and latency will be better
during this.  Plugging will magnify these effects-- it will write back
faster when there's contiguous data and utilize the devices more
efficiently.

We could add a tunable for the writeback semaphore size if it's
desired to have more than 64 things in flight-- of course, we can't
have too many because dirty extents are currently selected from a pool
of maximum 500.
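
If we did add such a tunable, the shape would be roughly the sketch below
(wb_state and parallelism are made-up names here; a real change would also
need a sysfs attribute and sane bounds):

#include <linux/semaphore.h>

/* Hypothetical per-device writeback state; not existing bcache fields. */
struct wb_state {
        struct semaphore in_flight;
        unsigned int     parallelism;   /* 0 means "use the old default" */
};

static void wb_init_in_flight(struct wb_state *wb)
{
        unsigned int limit = wb->parallelism ? wb->parallelism : 64;

        /* keep well under the ~500-extent gathering pool mentioned above */
        if (limit > 256)
                limit = 256;
        sema_init(&wb->in_flight, limit);
}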

Mike

On Wed, Oct 4, 2017 at 11:43 AM, Coly Li <i@coly.li> wrote:
> On 2017/10/2 上午1:34, Michael Lyle wrote:
>> On Sun, Oct 1, 2017 at 10:23 AM, Coly Li <i@coly.li> wrote:
>>> Hi Mike,
>>>
>>> Your data set is too small. Normally bcache users I talk with, they use
>>> bcache for distributed storage cluster or commercial data base, their
>>> cache device is large and fast. It is possible we see different I/O
>>> behaviors because we use different configurations.
>>
>> A small dataset is sufficient to tell whether the I/O subsystem is
>> successfully aggregating sequential writes or not.  :P  It doesn't
>> matter whether the test is 10 minutes or 10 hours...  The writeback
>> stuff walks the data in order.  :P
>
> Hi Mike,
>
> I have tested your patches 4,5 all these days; it turns out that your
> patches work better when dirty data is full on the cache device. And I
> can say they work perfectly when dirty data is full on the cache device
> and the backing cached device is a single spinning hard disk.
>
> It is not because your data set is full; it is because when your data
> set is small, the cache can be close to a full state, and then there is
> a better chance of having adjacent dirty data blocks to write back.
>
> In the best case for your patches 4,5 (dirty data full on the cache
> device and the cached device a single spinning hard disk), writeback
> performance can be 2x faster when there is no front-end I/O. See one of
> the performance data charts (the lower the better):
> http://blog.coly.li/wp-content/uploads/2017/10/existing_dirty_data_on_cache_single_disk_1T_full_cache.png
>
>
>
> When the backing cached device gets faster and faster, your patches 4,5
> show less and less advantage.
>
> For the same backing cached device size, as the cache device gets smaller
> and smaller, your patches 4,5 show less and less advantage.
>
> And in the following configuration I find the current bcache code
> performs better (not by too much) than your patch 4,5 reorder method,
> - cached device: an md linear device composed of 2x1.8T hard disks
> - cache device: a 1800G NVMe SSD
> - fio rand write blocksize 512K
> - dirty data occupies 50% of the cache device space (900G of 1800G)
> One of the performance data charts can be found here,
> http://blog.coly.li/wp-content/uploads/2017/10/existing_dirty_data_on_ssd_900_1800G_cache_half.png
>
>>
>> ***We are measuring whether the cache and I/O scheduler can correctly
>> order up-to-64-outstanding writebacks from a chunk of 500 dirty
>> extents-- we do not need to do 12 hours of writes first to measure
>> this.***
>>
>> It's important that there be actual contiguous data, though, or the
>> difference will be less significant.  If you write too much, there
>> will be a lot more holes in the data from writeback during the test
>> and from writes bypassing the cache.
>>
>
> I see, your patches do perform better when dirty data is contiguous on
> the SSD. But we should know how likely it is that this assumption holds
> in the real world, especially since in some cases your patches make
> writeback performance slower than the current bcache code does.
>
> To test your patches, the following backing devices are used,
> - md raid5 device composed by 4 hard disks
> - md linear device composed by 2 hard disks
> - md raid0 devices composed by 4 hard disks
> - single 250G SATA SSD
> - single 1.8T hard disk
>
> And the following cache devices are used,
> - 3.8T NVMe SSD
> - 1.8T NVMe SSD partition
> - 232G NVMe SSD partition
>
>> Having all the data to writeback be sequential is an
>> artificial/synthetic condition that allows the difference to be
>> measured more easily.  It's about a 2x difference under these
>> conditions in my test environment.  I expect with real data that is
>> not purely sequential it's more like a few percent.
>
> From my test, it seems in the following situation your patches4,5 works
> better,
> 1)   backing cached device is slow
> 2.1) higher percentage of (cache_device_size/cached_device_size), this
> means dirty data on cache device has more probability to be contiguous.
> or 2.2) dirty data on cache device is almost full
>
> This is what I observe in these days testing. I will continue to try
> tomorrow to see in which percentage of dirty data on cache device when
> current bcache code performs worse than your patch. So far I see when
> cache is 50% full, and cache device is half size of cached device, your
> patch4,5 won't have performance advantage, in my testing environment.
>
> All performance comparison png files are too big, once I finish the
> final benchmark, I will combine them into a pdf file and share a link.
>
> Thanks.
>
> --
> Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
@ 2017-10-04 23:54                                   ` Michael Lyle
  0 siblings, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-10-04 23:54 UTC (permalink / raw)
  To: Coly Li; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

Coly---

Thanks for running these tests.

The change is expected to improve performance when the application
writes to many adjacent blocks, out of order.  Many database workloads
are like this.  My VM provisioning / installation / cleanup workloads
have a lot of this, too.

I believe that it really has nothing to do with how full the cache
device is, or whether things are contiguous on the cache device.  It
has to do with what proportion of the data is contiguous on the
**backing device**.
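
(Illustration -- a minimal sketch, not the actual patch: two dirty keys are
"contiguous on the backing device" when the first extent ends exactly where
the second one starts.  KEY_START()/KEY_OFFSET() are bcache's existing bkey
accessors; the helper name wb_keys_contiguous() is made up for this example.)

    /* true if 'second' starts on the backing device where 'first' ends */
    static bool wb_keys_contiguous(struct keybuf_key *first,
                                   struct keybuf_key *second)
    {
            /* a key covers [KEY_START, KEY_OFFSET) on the backing device */
            return KEY_OFFSET(&first->key) == KEY_START(&second->key);
    }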

To get the best example of this from fio, the working set size, for
write, needs to be less than half the size of the cache (otherwise,
previous writebacks make "holes" in the middle of the data that will
keep things from being contiguous), but large enough to trigger
writeback.  It may help in other circumstances, but the performance
measurement will be much more random (it effectively depends on where
the writeback cursor is in its cycle).

I'm not surprised that the peak writeback rate on very fast backing
devices will probably be a little less-- we only try to write back 64
things at a time, and NCQ queue depths are 32 or so-- so across a
parallel RAID0 installation the drives will not have their queues
filled entirely already.  Waiting for blocks to be in order will make
the queues even less full.  However, they'll be provided with I/O in
LBA order, so presumably the IO utilization and latency will be better
during this.  Plugging will magnify these effects-- it will write back
faster when there's contiguous data and utilize the devices more
efficiently.

We could add a tunable for the writeback semaphore size if it's
desired to have more than 64 things in flight-- of course, we can't
have too many because dirty extents are currently selected from a pool
of maximum 500.
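
(As a rough illustration of that in-flight bound -- a minimal sketch under
assumed names, not the bcache source: a counting semaphore initialized to 64
is taken before each dirty extent is read from the cache and released when
its backing-device write completes, so at most 64 writebacks are outstanding
at once.  Making the limit tunable would mean replacing the constant with a
sysfs-controlled value.)

    #include <linux/semaphore.h>

    static struct semaphore wb_in_flight;

    static void wb_init(void)
    {
            sema_init(&wb_in_flight, 64);   /* at most 64 writebacks in flight */
    }

    /* called before issuing the read of one dirty extent from the cache */
    static void wb_throttle(void)
    {
            down(&wb_in_flight);            /* blocks once 64 are outstanding */
    }

    /* called when the corresponding backing-device write has completed */
    static void wb_done(void)
    {
            up(&wb_in_flight);              /* let the next extent proceed */
    }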

Mike

On Wed, Oct 4, 2017 at 11:43 AM, Coly Li <i@coly.li> wrote:
> On 2017/10/2 上午1:34, Michael Lyle wrote:
>> On Sun, Oct 1, 2017 at 10:23 AM, Coly Li <i@coly.li> wrote:
>>> Hi Mike,
>>>
>>> Your data set is too small. Normally bcache users I talk with, they use
>>> bcache for distributed storage cluster or commercial data base, their
>>> catch device is large and fast. It is possible we see different I/O
>>> behaviors because we use different configurations.
>>
>> A small dataset is sufficient to tell whether the I/O subsystem is
>> successfully aggregating sequential writes or not.  :P  It doesn't
>> matter whether the test is 10 minutes or 10 hours...  The writeback
>> stuff walks the data in order.  :P
>
> Hi Mike,
>
> I test your patch4,5 all these days, it turns out that your patch works
> better when dirty data is full on cache device. And I can say it works
> perfectly when dirty data is full on cache device and backing cached
> device is a single spinning hard disk.
>
> It is not because your data set is full, it is because when your data
> set is small, the cache can be close to full state, then there is more
> possibility to have adjacent dirty data blocks to writeback.
>
> In the best case of your patch4,5, dirty data full on cache device and
> cached device is a single spinning hard disk, writeback performance can
> be 2x faster when there is no front end I/O. See one of the performmance
> data (the lower the better)
> http://blog.coly.li/wp-content/uploads/2017/10/existing_dirty_data_on_cache_single_disk_1T_full_cache.png
>
>
>
> When the backing cached device gets faster and faster, your patch4,5
> performs less and less advantage.
>
> For same backing cached device size, when cache device gets smaller and
> smaller, your patch4,5 performs less and less advantage.
>
> And in the following configuration I find current bcache code performs
> better (not too much) then your patch4,5 reorder method,
> - cached device: A md linear device combined by 2x1.8T hard disks
> - cache device: A 1800G NVMe SSD
> - fio rand write blocksize 512K
> - dirty data occupies 50% space of cache device (900G from 1800G)
> One of the performance data can be found here,
> http://blog.coly.li/wp-content/uploads/2017/10/existing_dirty_data_on_ssd_900_1800G_cache_half.png
>
>>
>> ***We are measuring whether the cache and I/O scheduler can correctly
>> order up-to-64-outstanding writebacks from a chunk of 500 dirty
>> extents-- we do not need to do 12 hours of writes first to measure
>> this.***
>>
>> It's important that there be actual contiguous data, though, or the
>> difference will be less significant.  If you write too much, there
>> will be a lot more holes in the data from writeback during the test
>> and from writes bypassing the cache.
>>
>
> I see,  your patches do perform better when dirty data are contiguous on
> SSD. But we should know how much the probability in real world this
> assumption can be real. Especially in some cases, your patches make
> writeback performance slower than current bcache code does.
>
> To test your patches, the following backing devices are used,
> - md raid5 device composed by 4 hard disks
> - md linear device composed by 2 hard disks
> - md raid0 devices composed by 4 hard disks
> - single 250G SATA SSD
> - single 1.8T hard disk
>
> And the following cache devices are used,
> - 3.8T NVMe SSD
> - 1.8T NVMe SSD partition
> - 232G NVMe SSD partition
>
>> Having all the data to writeback be sequential is an
>> artificial/synthetic condition that allows the difference to be
>> measured more easily.  It's about a 2x difference under these
>> conditions in my test environment.  I expect with real data that is
>> not purely sequential it's more like a few percent.
>
> From my test, it seems in the following situation your patches4,5 works
> better,
> 1)   backing cached device is slow
> 2.1) higher percentage of (cache_device_size/cached_device_size), this
> means dirty data on cache device has more probability to be contiguous.
> or 2.2) dirty data on cache device is almost full
>
> This is what I observe in these days testing. I will continue to try
> tomorrow to see in which percentage of dirty data on cache device when
> current bcache code performs worse than your patch. So far I see when
> cache is 50% full, and cache device is half size of cached device, your
> patch4,5 won't have performance advantage, in my testing environment.
>
> All performance comparison png files are too big, once I finish the
> final benchmark, I will combine them into a pdf file and share a link.
>
> Thanks.
>
> --
> Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
@ 2017-10-04 18:43                                 ` Coly Li
  0 siblings, 0 replies; 65+ messages in thread
From: Coly Li @ 2017-10-04 18:43 UTC (permalink / raw)
  To: Michael Lyle; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

On 2017/10/2 上午1:34, Michael Lyle wrote:
> On Sun, Oct 1, 2017 at 10:23 AM, Coly Li <i@coly.li> wrote:
>> Hi Mike,
>>
>> Your data set is too small. Normally bcache users I talk with, they use
>> bcache for distributed storage cluster or commercial data base, their
>> catch device is large and fast. It is possible we see different I/O
>> behaviors because we use different configurations.
> 
> A small dataset is sufficient to tell whether the I/O subsystem is
> successfully aggregating sequential writes or not.  :P  It doesn't
> matter whether the test is 10 minutes or 10 hours...  The writeback
> stuff walks the data in order.  :P

Hi Mike,

I have been testing your patches 4,5 these days, and it turns out they work
better when the cache device is full of dirty data. And I can say they work
perfectly when the cache device is full of dirty data and the backing cached
device is a single spinning hard disk.

It is not because your data set is full; it is because when your data
set is small, the cache can be close to full, so there is a higher
chance of having adjacent dirty data blocks to write back.

In the best case for your patches 4,5 (dirty data filling the cache device
and the cached device being a single spinning hard disk), writeback can be
2x faster when there is no front-end I/O. See one of the performance data
sets (the lower the better):
http://blog.coly.li/wp-content/uploads/2017/10/existing_dirty_data_on_cache_single_disk_1T_full_cache.png



As the backing cached device gets faster and faster, your patches 4,5
show less and less advantage.

For the same backing cached device size, as the cache device gets smaller
and smaller, your patches 4,5 show less and less advantage.

And in the following configuration I find the current bcache code performs
better (though not by much) than your patches 4,5 reordering method:
- cached device: an md linear device combined from 2x1.8T hard disks
- cache device: A 1800G NVMe SSD
- fio rand write blocksize 512K
- dirty data occupies 50% space of cache device (900G from 1800G)
One of the performance data sets can be found here:
http://blog.coly.li/wp-content/uploads/2017/10/existing_dirty_data_on_ssd_900_1800G_cache_half.png

> 
> ***We are measuring whether the cache and I/O scheduler can correctly
> order up-to-64-outstanding writebacks from a chunk of 500 dirty
> extents-- we do not need to do 12 hours of writes first to measure
> this.***
> 
> It's important that there be actual contiguous data, though, or the
> difference will be less significant.  If you write too much, there
> will be a lot more holes in the data from writeback during the test
> and from writes bypassing the cache.
>

I see, your patches do perform better when dirty data is contiguous on the
SSD. But we should know how likely this assumption is to hold in the real
world, especially since in some cases your patches make writeback
performance slower than the current bcache code does.

To test your patches, the following backing devices are used:
- md raid5 device composed of 4 hard disks
- md linear device composed of 2 hard disks
- md raid0 device composed of 4 hard disks
- single 250G SATA SSD
- single 1.8T hard disk

And the following cache devices are used:
- 3.8T NVMe SSD
- 1.8T NVMe SSD partition
- 232G NVMe SSD partition

> Having all the data to writeback be sequential is an
> artificial/synthetic condition that allows the difference to be
> measured more easily.  It's about a 2x difference under these
> conditions in my test environment.  I expect with real data that is
> not purely sequential it's more like a few percent.

From my test, it seems your patches 4,5 work better in the following
situations:
1)   the backing cached device is slow
2.1) a higher (cache_device_size/cached_device_size) ratio, which means
dirty data on the cache device is more likely to be contiguous,
or 2.2) dirty data on the cache device is almost full

This is what I have observed in these days of testing. I will continue
tomorrow to see at what percentage of dirty data on the cache device the
current bcache code starts to perform worse than your patch. So far I see
that when the cache is 50% full, and the cache device is half the size of
the cached device, your patches 4,5 have no performance advantage in my
testing environment.

The performance comparison PNG files are all too big; once I finish the
final benchmark, I will combine them into a PDF file and share a link.

Thanks.

-- 
Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-01 17:23                           ` Coly Li
@ 2017-10-01 17:34                             ` Michael Lyle
  2017-10-04 18:43                                 ` Coly Li
  0 siblings, 1 reply; 65+ messages in thread
From: Michael Lyle @ 2017-10-01 17:34 UTC (permalink / raw)
  To: Coly Li; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

On Sun, Oct 1, 2017 at 10:23 AM, Coly Li <i@coly.li> wrote:
> Hi Mike,
>
> Your data set is too small. Normally bcache users I talk with, they use
> bcache for distributed storage cluster or commercial data base, their
> catch device is large and fast. It is possible we see different I/O
> behaviors because we use different configurations.

A small dataset is sufficient to tell whether the I/O subsystem is
successfully aggregating sequential writes or not.  :P  It doesn't
matter whether the test is 10 minutes or 10 hours...  The writeback
stuff walks the data in order.  :P

***We are measuring whether the cache and I/O scheduler can correctly
order up-to-64-outstanding writebacks from a chunk of 500 dirty
extents-- we do not need to do 12 hours of writes first to measure
this.***

It's important that there be actual contiguous data, though, or the
difference will be less significant.  If you write too much, there
will be a lot more holes in the data from writeback during the test
and from writes bypassing the cache.

Having all the data to writeback be sequential is an
artificial/synthetic condition that allows the difference to be
measured more easily.  It's about a 2x difference under these
conditions in my test environment.  I expect with real data that is
not purely sequential it's more like a few percent.

Mike

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-10-01 16:56                           ` Michael Lyle
  (?)
@ 2017-10-01 17:23                           ` Coly Li
  2017-10-01 17:34                             ` Michael Lyle
  -1 siblings, 1 reply; 65+ messages in thread
From: Coly Li @ 2017-10-01 17:23 UTC (permalink / raw)
  To: Michael Lyle; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

On 2017/10/2 上午12:56, Michael Lyle wrote:
> That's strange-- are you doing the same test scenario?  How much
> random I/O did you ask for?
> 
> My tests took 6-7 minutes to do the 30G of 8k not-repeating I/Os in a
> 30G file (about 9k IOPs for me-- it's actually significantly faster
> but then starves every few seconds-- not new with these patches)..
> your cache device if 3.8T, so to have a similar 12-13% of the cache
> you'd need to do 15x as much (90 mins if you're the same speed--- but
> your I/O subsystem is also much faster...)
> 
> If you're doing more like 3.8T of writes--  note that's not the same
> test.  (It will result in less contiguous stuff in the cache and it
> will be less repeatable / more volatile).

Hi Mike,

Your data set is too small. Normally the bcache users I talk with use
bcache for distributed storage clusters or commercial databases; their
cache device is large and fast. It is possible we see different I/O
behaviors because we use different configurations.

I use a 3.8T cache, and test two conditions:
1) dirty data is around 2.1TB; stop front-end writes and observe
background writeback.
2) dirty data is around the dirty target (305GB in my test); then
stop front-end writes and observe background writeback.

It took a lot of time to get the dirty data ready, and I then recorded the
writeback rate and iostat output for hours. At the very beginning I can
see the write merge count is high (even more than 110 wrqm/s as the peak
value on a single disk), but a few minutes later there is almost no write
merging.

When there is write merging, bcache both with and without your patches 4,5
works well. But when there is no write merging, bcache both with and
without your patches 4,5 works badly. Even when writeback_rate_debug
displays rate: 488.2M/sec, the real write throughput is 10MB/sec in total,
that is 2~3MB/sec on each hard disk; obviously the bottleneck is not the
hard disks.

Right now I am collecting the last group of data, for bcache without your
patches 4,5 and with 2.1TB of dirty data, to observe how background
writeback works.

The progress is slower than I expected; tomorrow morning I will get the
data. I hope to have a benchmark analysis to share in 2 days.

I will update the results then, assuming I didn't do anything wrong during
my performance testing.

Coly Li

> On Sat, Sep 30, 2017 at 9:51 PM, Coly Li <i@coly.li> wrote:
>> On 2017/10/1 上午6:49, Michael Lyle wrote:
>>> One final attempt to resend, because gmail has been giving me trouble
>>> sending plain text mail.
>>>
>>> Two instances of this.  Tested as above, with a big set of random I/Os
>>> that ultimately cover every block in a file (e.g. allowing sequential
>>> writeback).
>>>
>>> With the 5 patches, samsung 940 SSD cache + crummy 5400 RPM USB hard drive:
>>>
>>> Typical seconds look like:
>>>
>>> Reading 38232K from cache in 4809 IO.  38232/4809=7.95k per cache device IO.
>>>
>>> Writing 38112k to cache in 400 I/O = 95.28k -- or we are combining
>>> about 11.9 extents to a contiguous writeback.  Tracing, there are
>>> still contiguous things that are not getting merged well, but it's OK.
>>> (I'm hoping plugging makes this better).
>>>
>>> sda            4809.00     38232.00       446.00      38232        446
>>> sdb             400.00         0.00     38112.00          0      38112
>>>
>>> Without the 5 patches, a typical second--
>>>
>>> sda            2509.00     19968.00       316.00      19968        316
>>> sdb             502.00         0.00     19648.00          0      38112
>>>
>>> or we are combining about 4.9 extents to a contiguous writeback, and
>>> writing back at about half the rate.  All of these numbers are +/- 10%
>>> and obtained by eyeballing and grabbing representative seconds.
>>>
>>> Mike


-- 
Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
@ 2017-10-01 16:56                           ` Michael Lyle
  0 siblings, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-10-01 16:56 UTC (permalink / raw)
  To: Coly Li; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

That's strange-- are you doing the same test scenario?  How much
random I/O did you ask for?

My tests took 6-7 minutes to do the 30G of 8k not-repeating I/Os in a
30G file (about 9k IOPS for me-- it's actually significantly faster
but then starves every few seconds-- not new with these patches).
Your cache device is 3.8T, so to have a similar 12-13% of the cache
you'd need to do 15x as much (90 mins if you're the same speed-- but
your I/O subsystem is also much faster...)

If you're doing more like 3.8T of writes--  note that's not the same
test.  (It will result in less contiguous stuff in the cache and it
will be less repeatable / more volatile).

Mike

On Sat, Sep 30, 2017 at 9:51 PM, Coly Li <i@coly.li> wrote:
> On 2017/10/1 上午6:49, Michael Lyle wrote:
>> One final attempt to resend, because gmail has been giving me trouble
>> sending plain text mail.
>>
>> Two instances of this.  Tested as above, with a big set of random I/Os
>> that ultimately cover every block in a file (e.g. allowing sequential
>> writeback).
>>
>> With the 5 patches, samsung 940 SSD cache + crummy 5400 RPM USB hard drive:
>>
>> Typical seconds look like:
>>
>> Reading 38232K from cache in 4809 IO.  38232/4809=7.95k per cache device IO.
>>
>> Writing 38112k to cache in 400 I/O = 95.28k -- or we are combining
>> about 11.9 extents to a contiguous writeback.  Tracing, there are
>> still contiguous things that are not getting merged well, but it's OK.
>> (I'm hoping plugging makes this better).
>>
>> sda            4809.00     38232.00       446.00      38232        446
>> sdb             400.00         0.00     38112.00          0      38112
>>
>> Without the 5 patches, a typical second--
>>
>> sda            2509.00     19968.00       316.00      19968        316
>> sdb             502.00         0.00     19648.00          0      38112
>>
>> or we are combining about 4.9 extents to a contiguous writeback, and
>> writing back at about half the rate.  All of these numbers are +/- 10%
>> and obtained by eyeballing and grabbing representative seconds.
>>
>> Mike
>>
>> On Sat, Sep 30, 2017 at 2:02 AM, Michael Lyle <mlyle@lyle.org> wrote:
>>>
>>> Resent because I can't seem to get gmail to not send HTML mail.  And off to sleep.
>>>
>>>
>>> On Sat, Sep 30, 2017 at 1:57 AM, Michael Lyle <mlyle@lyle.org> wrote:
>>>> Two instances of this.
>>>>
>>>> With the 5 patches, samsung 940 SSD cache + crummy 5400 RPM USB hard drive:
>>>>
>>>> Typical seconds look like:
>>>>
>>>> Reading 38232K from cache in 4809 IO.  38232/4809=7.95k per cache device IO.
>>>>
>>>> Writing 38112k to cache in 400 I/O = 95.28k -- or we are combining about
>>>> 11.9 extents to a contiguous writeback.  Tracing, there are still contiguous
>>>> things that are not getting merged well, but it's OK.  (I'm hoping plugging
>>>> makes this better).
>>>>
>>>> sda            4809.00     38232.00       446.00      38232        446
>>>> sdb             400.00         0.00     38112.00          0      38112
>>>>
>>>> Without the 5 patches, a typical second--
>>>>
>>>> sda            2509.00     19968.00       316.00      19968        316
>>>> sdb             502.00         0.00     19648.00          0      38112
>>>>
>>>> or we are combining about 4.9 extents to a contiguous writeback, and writing
>>>> back at about half the rate.
>
> Hi Mike,
>
> Get it. Now I am testing your patches (all 5 patches). It has been 12+
> hours, should be 12+ hours more. The backing device is a raid0 composed
> by 4x1.8T 2.5inch harddisk, cache device is a 3.8T NVMe SSD. Block size
> is 512K.
>
> Hope it works as you expected on my server.
>
> --
> Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-09-30 22:49                     ` Michael Lyle
@ 2017-10-01  4:51                       ` Coly Li
  2017-10-01 16:56                           ` Michael Lyle
  0 siblings, 1 reply; 65+ messages in thread
From: Coly Li @ 2017-10-01  4:51 UTC (permalink / raw)
  To: Michael Lyle; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

On 2017/10/1 上午6:49, Michael Lyle wrote:
> One final attempt to resend, because gmail has been giving me trouble
> sending plain text mail.
> 
> Two instances of this.  Tested as above, with a big set of random I/Os
> that ultimately cover every block in a file (e.g. allowing sequential
> writeback).
> 
> With the 5 patches, samsung 940 SSD cache + crummy 5400 RPM USB hard drive:
> 
> Typical seconds look like:
> 
> Reading 38232K from cache in 4809 IO.  38232/4809=7.95k per cache device IO.
> 
> Writing 38112k to cache in 400 I/O = 95.28k -- or we are combining
> about 11.9 extents to a contiguous writeback.  Tracing, there are
> still contiguous things that are not getting merged well, but it's OK.
> (I'm hoping plugging makes this better).
> 
> sda            4809.00     38232.00       446.00      38232        446
> sdb             400.00         0.00     38112.00          0      38112
> 
> Without the 5 patches, a typical second--
> 
> sda            2509.00     19968.00       316.00      19968        316
> sdb             502.00         0.00     19648.00          0      38112
> 
> or we are combining about 4.9 extents to a contiguous writeback, and
> writing back at about half the rate.  All of these numbers are +/- 10%
> and obtained by eyeballing and grabbing representative seconds.
> 
> Mike
> 
> On Sat, Sep 30, 2017 at 2:02 AM, Michael Lyle <mlyle@lyle.org> wrote:
>>
>> Resent because I can't seem to get gmail to not send HTML mail.  And off to sleep.
>>
>>
>> On Sat, Sep 30, 2017 at 1:57 AM, Michael Lyle <mlyle@lyle.org> wrote:
>>> Two instances of this.
>>>
>>> With the 5 patches, samsung 940 SSD cache + crummy 5400 RPM USB hard drive:
>>>
>>> Typical seconds look like:
>>>
>>> Reading 38232K from cache in 4809 IO.  38232/4809=7.95k per cache device IO.
>>>
>>> Writing 38112k to cache in 400 I/O = 95.28k -- or we are combining about
>>> 11.9 extents to a contiguous writeback.  Tracing, there are still contiguous
>>> things that are not getting merged well, but it's OK.  (I'm hoping plugging
>>> makes this better).
>>>
>>> sda            4809.00     38232.00       446.00      38232        446
>>> sdb             400.00         0.00     38112.00          0      38112
>>>
>>> Without the 5 patches, a typical second--
>>>
>>> sda            2509.00     19968.00       316.00      19968        316
>>> sdb             502.00         0.00     19648.00          0      38112
>>>
>>> or we are combining about 4.9 extents to a contiguous writeback, and writing
>>> back at about half the rate.

Hi Mike,

Got it. Now I am testing your patches (all 5 patches). It has been 12+
hours, and should take 12+ hours more. The backing device is a raid0
composed of 4x1.8T 2.5-inch hard disks, and the cache device is a 3.8T
NVMe SSD. Block size is 512K.

Hope it works as you expected on my server.

-- 
Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
       [not found]                   ` <CAJ+L6qcyq-E4MrWNfB9kGA8DMD_U1HMxJii-=-qPfv0LeRL45w@mail.gmail.com>
@ 2017-09-30 22:49                     ` Michael Lyle
  2017-10-01  4:51                       ` Coly Li
  0 siblings, 1 reply; 65+ messages in thread
From: Michael Lyle @ 2017-09-30 22:49 UTC (permalink / raw)
  To: Coly Li; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

One final attempt to resend, because gmail has been giving me trouble
sending plain text mail.

Two instances of this.  Tested as above, with a big set of random I/Os
that ultimately cover every block in a file (e.g. allowing sequential
writeback).

With the 5 patches, samsung 940 SSD cache + crummy 5400 RPM USB hard drive:

Typical seconds look like:

Reading 38232K from cache in 4809 IO.  38232/4809=7.95k per cache device IO.

Writing 38112k to cache in 400 I/O = 95.28k -- or we are combining
about 11.9 extents to a contiguous writeback.  Tracing, there are
still contiguous things that are not getting merged well, but it's OK.
(I'm hoping plugging makes this better).

sda            4809.00     38232.00       446.00      38232        446
sdb             400.00         0.00     38112.00          0      38112

Without the 5 patches, a typical second--

sda            2509.00     19968.00       316.00      19968        316
sdb             502.00         0.00     19648.00          0      38112

or we are combining about 4.9 extents to a contiguous writeback, and
writing back at about half the rate.  All of these numbers are +/- 10%
and obtained by eyeballing and grabbing representative seconds.

Mike

On Sat, Sep 30, 2017 at 2:02 AM, Michael Lyle <mlyle@lyle.org> wrote:
>
> Resent because I can't seem to get gmail to not send HTML mail.  And off to sleep.
>
>
> On Sat, Sep 30, 2017 at 1:57 AM, Michael Lyle <mlyle@lyle.org> wrote:
> > Two instances of this.
> >
> > With the 5 patches, samsung 940 SSD cache + crummy 5400 RPM USB hard drive:
> >
> > Typical seconds look like:
> >
> > Reading 38232K from cache in 4809 IO.  38232/4809=7.95k per cache device IO.
> >
> > Writing 38112k to cache in 400 I/O = 95.28k -- or we are combining about
> > 11.9 extents to a contiguous writeback.  Tracing, there are still contiguous
> > things that are not getting merged well, but it's OK.  (I'm hoping plugging
> > makes this better).
> >
> > sda            4809.00     38232.00       446.00      38232        446
> > sdb             400.00         0.00     38112.00          0      38112
> >
> > Without the 5 patches, a typical second--
> >
> > sda            2509.00     19968.00       316.00      19968        316
> > sdb             502.00         0.00     19648.00          0      38112
> >
> > or we are combining about 4.9 extents to a contiguous writeback, and writing
> > back at about half the rate.
> >
> > Anyways, cya, I'm unsubscribed.
> >
> > Mike
>

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-09-30  8:23           ` Michael Lyle
@ 2017-09-30  8:31             ` Michael Lyle
       [not found]               ` <CAJ+L6qcU+Db5TP1Q2J-V8angdzeW9DFGwc7KQqc4di9CSxusLg@mail.gmail.com>
  0 siblings, 1 reply; 65+ messages in thread
From: Michael Lyle @ 2017-09-30  8:31 UTC (permalink / raw)
  To: Coly Li; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

Just one more note---

IO merging is not happening properly right now.

It's easy to get a case together where basically all the writeback is
sequential.  E.g. if your writeback dirty data target is 15GB, do
something like:

$ sync; fio --randrepeat=1 --ioengine=libaio --direct=1 --gtod_reduce=1 \
      --name=test --filename=test --bs=8k --iodepth=256 --size=30G \
      --readwrite=randwrite --ramp_time=4

This creates a situation where bcache has lots of 8K segments and
they're almost all sequential.

Wait for the test to complete, then watch iostat 1.  You'll see that
the writeback KB written / tps is just a little more than 8, despite
the spinning disks doing IO at maximum rate.  You can feel the disk
and tell it is seeking tons.  The IOs are not being merged properly.

Mike

On Sat, Sep 30, 2017 at 1:23 AM, Michael Lyle <mlyle@lyle.org> wrote:
> On Sat, Sep 30, 2017 at 1:03 AM, Coly Li <i@coly.li> wrote:
>>>> If writeback_rate is not minimum value, it means there are front end
>>>> write requests existing.
>>>
>>> This is wrong.  Else we'd never make it back to the target.
>>>
>>
>> Of cause we can :-) When dirty data is far beyond dirty percent,
>> writeback rate is quite high, in that case dc->last_read takes effect.
>> But this is not the situation which your patch handles better.
>
> ... and dirty data can be beyond dirty percent without front-end write
> requests happening, right?  dirty data being significantly beyond
> dirty percent means that we fell far behind at some point in the past,
> because the spinning disk couldn't keep up with the writeback rate...
>
> I really don't even understand what you're saying.  Front-end write
> exists existing that will make their way to the backing disk will only
> happen if writeback is bypassed or if they're judged to be sequential.
> The point of writeback is to eat these front-end write requests.
>
>> You are right. But when writeback rate is high, dc->last_read can solve
>> the seeking problem as your patch does.
>
> Sure, but it doesn't lay the ground work for plugging..
>
>> Do you mean write I/O cannot be issued within slice_idle, or they are
>> issued within slice_idle but underlying elevator does not merge the
>> continuous request ?
>
> I mean this:  imagine the writeback rate is high with the current
> code, and the writeback code issues read requests for backing
> locations 1,2,3,4,5,6, which are noncontiguous on the SSD.
>
> 1, 3, 4, 5 are read quickly and the underlying elevator merges 3, 4, and 5.
>
> 2 & 6 arrive 2ms later.
>
> What happens?
>
>> I meant the dirty_io list in your future patch, not current 5 bios
>> patch. Hmm, maybe you can ignore my noise here, I just jumped off the topic.
>
> Keeping the set of dirty_io objects that are being issued in a list so
> that it's easy to iterate contiguous ones doesn't change the
> commitment behavior.
>
>>>> And plug list will be unplugged automatically as default, when context
>>>> switching happens. If you will performance read I/Os to the btrees, a
>>>> context switch is probably to happen, then you won't keep a large bio
>>>> lists ...
>>>
>>> Thankfully we'll have 5 things to fire immediately after each other,
>>> so we don't need to worry about automatic unplug.
>>>
>>
>> Forgive me, again I meant dirty_io list stuffs, not this patch...
>
> The idea is this: the group of up-to-5 IOs that the current patch
> gathers, are put in a list.  When all 5 have been read, then the
> device is plugged, and the writes are issued together, and then the
> device is unplugged.  Kent suggested this to me on IRC.
>
>> Could you please show exact benchmark result ? Then it will be easier to
>> change my mind. I only know if writeback I/O is not continuously issued
>> within slice_idle, mostly it is because read dirty delayed in line 238
>> of read_dirty(),
>> 235     if (KEY_START(&w->key) != dc->last_read ||
>> 236         jiffies_to_msecs(delay) > 50)
>> 237             while (!kthread_should_stop() && delay)
>> 238                     delay = schedule_timeout_interruptible(delay);
>>
>> If writeback rate is high and writeback I/O is continuous, read I/O will
>> be issued without delay. Then writeback I/O will be issued by
>> write_dirty() in very short time.
>
> What this code says is, you're willing to get up to 50ms "ahead" on
> the writeback for contiguous I/O.
>
> This is bad, because: A) there is no bounds on how much you are
> willing to aggregate.  If the writeback rate is high, 50ms could be a
> lot of data, and a lot of IOPs to the cache device.  The new code
> bounds this.  B) Even if a single I/O could put you 50ms ahead, it
> still could be advantageous to do multiple writes.
>
>> Therefore without a real performance comparison, I cannot image why
>> adjacent write requests cannot merged in elevator... Unless I do the
>> test myself.
>
> See the above example with the lack of write-ordering-enforcement.
>
> Mike
>
>>
>>
>> --
>> Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-09-30  8:03         ` Coly Li
@ 2017-09-30  8:23           ` Michael Lyle
  2017-09-30  8:31             ` Michael Lyle
  0 siblings, 1 reply; 65+ messages in thread
From: Michael Lyle @ 2017-09-30  8:23 UTC (permalink / raw)
  To: Coly Li; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

On Sat, Sep 30, 2017 at 1:03 AM, Coly Li <i@coly.li> wrote:
>>> If writeback_rate is not minimum value, it means there are front end
>>> write requests existing.
>>
>> This is wrong.  Else we'd never make it back to the target.
>>
>
> Of cause we can :-) When dirty data is far beyond dirty percent,
> writeback rate is quite high, in that case dc->last_read takes effect.
> But this is not the situation which your patch handles better.

... and dirty data can be beyond dirty percent without front-end write
requests happening, right?  dirty data being significantly beyond
dirty percent means that we fell far behind at some point in the past,
because the spinning disk couldn't keep up with the writeback rate...

I really don't even understand what you're saying.  Front-end writes
that make their way to the backing disk will only happen if writeback
is bypassed or if they're judged to be sequential.  The point of
writeback is to eat these front-end write requests.

> You are right. But when writeback rate is high, dc->last_read can solve
> the seeking problem as your patch does.

Sure, but it doesn't lay the groundwork for plugging...

> Do you mean write I/O cannot be issued within slice_idle, or they are
> issued within slice_idle but underlying elevator does not merge the
> continuous request ?

I mean this:  imagine the writeback rate is high with the current
code, and the writeback code issues read requests for backing
locations 1,2,3,4,5,6, which are noncontiguous on the SSD.

1, 3, 4, 5 are read quickly and the underlying elevator merges 3, 4, and 5.

2 & 6 arrive 2ms later.

What happens?

> I meant the dirty_io list in your future patch, not current 5 bios
> patch. Hmm, maybe you can ignore my noise here, I just jumped off the topic.

Keeping the set of dirty_io objects that are being issued in a list so
that it's easy to iterate contiguous ones doesn't change the
commitment behavior.

>>> And plug list will be unplugged automatically as default, when context
>>> switching happens. If you will performance read I/Os to the btrees, a
>>> context switch is probably to happen, then you won't keep a large bio
>>> lists ...
>>
>> Thankfully we'll have 5 things to fire immediately after each other,
>> so we don't need to worry about automatic unplug.
>>
>
> Forgive me, again I meant dirty_io list stuffs, not this patch...

The idea is this: the group of up-to-5 IOs that the current patch
gathers, are put in a list.  When all 5 have been read, then the
device is plugged, and the writes are issued together, and then the
device is unplugged.  Kent suggested this to me on IRC.
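
(A minimal sketch of that plug/issue/unplug batching, under assumed names --
not the actual patch: blk_start_plug()/blk_finish_plug() are the block
layer's real plugging calls, while struct wb_batch_io and wb_issue_batch()
are stand-ins for the dirty_io bookkeeping described above.)

    #include <linux/blkdev.h>
    #include <linux/list.h>

    /* one completed cache read whose backing-device write is still pending */
    struct wb_batch_io {
            struct list_head list;          /* kept in backing-LBA order */
            struct bio *write_bio;          /* already aimed at the backing device */
    };

    /* issue a batch of up to 5 contiguous writebacks back-to-back */
    static void wb_issue_batch(struct list_head *batch)
    {
            struct wb_batch_io *io;
            struct blk_plug plug;

            blk_start_plug(&plug);
            list_for_each_entry(io, batch, list)
                    submit_bio(io->write_bio);
            blk_finish_plug(&plug);         /* unplug: adjacent bios can merge */
    }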

> Could you please show exact benchmark result ? Then it will be easier to
> change my mind. I only know if writeback I/O is not continuously issued
> within slice_idle, mostly it is because read dirty delayed in line 238
> of read_dirty(),
> 235     if (KEY_START(&w->key) != dc->last_read ||
> 236         jiffies_to_msecs(delay) > 50)
> 237             while (!kthread_should_stop() && delay)
> 238                     delay = schedule_timeout_interruptible(delay);
>
> If writeback rate is high and writeback I/O is continuous, read I/O will
> be issued without delay. Then writeback I/O will be issued by
> write_dirty() in very short time.

What this code says is, you're willing to get up to 50ms "ahead" on
the writeback for contiguous I/O.

This is bad, because: A) there is no bound on how much you are
willing to aggregate.  If the writeback rate is high, 50ms could be a
lot of data, and a lot of IOPS to the cache device.  The new code
bounds this.  B) Even if a single I/O could put you 50ms ahead, it
could still be advantageous to do multiple writes.

> Therefore without a real performance comparison, I cannot image why
> adjacent write requests cannot merged in elevator... Unless I do the
> test myself.

See the above example with the lack of write-ordering-enforcement.

Mike

>
>
> --
> Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-09-30  7:13         ` Michael Lyle
  (?)
  (?)
@ 2017-09-30  8:03         ` Coly Li
  2017-09-30  8:23           ` Michael Lyle
  -1 siblings, 1 reply; 65+ messages in thread
From: Coly Li @ 2017-09-30  8:03 UTC (permalink / raw)
  To: Michael Lyle; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

On 2017/9/30 下午3:13, Michael Lyle wrote:
> Coly--
> 
> 
> On Fri, Sep 29, 2017 at 11:58 PM, Coly Li <i@coly.li> wrote:
>> On 2017/9/30 上午11:17, Michael Lyle wrote:
>> [snip]
>>
>> If writeback_rate is not minimum value, it means there are front end
>> write requests existing.
> 
> This is wrong.  Else we'd never make it back to the target.
> 

Of course we can :-) When dirty data is far beyond the dirty percent, the
writeback rate is quite high, and in that case dc->last_read takes effect.
But this is not the situation which your patch handles better.


>> In this case, backend writeback I/O should nice
>> I/O throughput to front end I/O. Otherwise, application will observe
>> increased I/O latency, especially when dirty percentage is not very
>> high. For enterprise workload, this change hurts performance.
> 
> No, utilizing less of disk throughput for a given writeback rate
> improves workload latency.  :P  The maximum that will be aggregated in
> this way is constrained more strongly than the old code...
> 

Could you please show exact latency comparison data? Imagining that from
the code is too hard for me ...


>> An desired behavior for low latency enterprise workload is, when dirty
>> percentage is low, once there is front end I/O, backend writeback should
>> be at minimum rate. This patch will introduce unstable and unpredictable
>> I/O latency.
> 
> Nope.  It lowers disk utilization overall, and the amount of disk
> bandwidth any individual request chunk can use is explicitly bounded
> (unlike before, where it was not).
> 

It will be great if we can see performance data here.

>> Unless there is performance bottleneck of writeback seeking, at least
>> enterprise users will focus more on front end I/O latency ....
> 
> The less writeback seeks the disk, the more the real workload can seek
> this disk!  And if you're at high writeback rates, the vast majority
> of the time the disk is seeking!
> 

You are right. But when the writeback rate is high, dc->last_read can
solve the seeking problem just as your patch does.


>> This method is helpful only when writeback I/Os is not issued
>> continuously, other wise if they are issued within slice_idle,
>> underlying elevator will reorder or merge the I/Os in larger request.
> We have a subsystem in place to rate-limit and make sure that they are
> not issued continuously!  If you want to preserve good latency for
> userspace, you want to keep the system in the regime where that
> control system is effective!
> 


Do you mean write I/Os cannot be issued within slice_idle, or that they are
issued within slice_idle but the underlying elevator does not merge the
contiguous requests?


>> Hmm, if you move the dirty IO from btree into dirty_io list, then
>> perform I/O, there is risk that once machine is power down during
>> writeback there might be dirty data lost. If you continuously issue
>> dirty I/O and remove them from btree at same time, that means you will
>> introduce more latency to front end I/O...
> 
> No... this doesn't change the consistency properties.  I am just
> saying-- if we have (up to 5 contiguous things that we're going to
> write back), wait till they're all read; plug; dispatch writes ;
> unplug.
> 

I meant the dirty_io list in your future patch, not the current 5-bios
patch. Hmm, maybe you can ignore my noise here; I just jumped off topic.


>> And plug list will be unplugged automatically as default, when context
>> switching happens. If you will performance read I/Os to the btrees, a
>> context switch is probably to happen, then you won't keep a large bio
>> lists ...
> 
> Thankfully we'll have 5 things to fire immediately after each other,
> so we don't need to worry about automatic unplug.
> 

Forgive me, again I meant the dirty_io list stuff, not this patch...


>> IMHO when writeback rate is low, especially when backing hard disk is
>> not bottleneck, group continuous I/Os in bcache code does not help too
>> much for writeback performance. The only benefit is less I/O issued when
>> front end I/O is low or idle, but not most of users care about it,
>> especially enterprise users.
> 
> I disagree!
> 
>>> I believe patch 4 is useful on its own, but I have this and other
>>> pieces of development that depend upon it.
>>
>> Current bcache code works well in most of writeback loads, I just worry
>> that implementing an elevator in bcache writeback logic is a big
>> investment with a little return.
> 
> Bcache already implements a (one-way) elevator!  Bcache invests
> **significant effort** already to do writebacks in LBA order (to
> effectively short-stroke the disk), but because of different arrival
> times of the read request they can end up reordered.  This is bad.
> This is a bug.

Could you please show exact benchmark results? Then it will be easier to
change my mind. I only know that if writeback I/O is not issued continuously
within slice_idle, it is mostly because the dirty read is delayed at line 238
of read_dirty():
235	if (KEY_START(&w->key) != dc->last_read ||
236	    jiffies_to_msecs(delay) > 50)
237		while (!kthread_should_stop() && delay)
238			delay = schedule_timeout_interruptible(delay);

If the writeback rate is high and writeback I/O is continuous, read I/O
will be issued without delay. Then writeback I/O will be issued by
write_dirty() in a very short time.

Therefore, without a real performance comparison, I cannot imagine why
adjacent write requests cannot be merged in the elevator... unless I do the
test myself.


-- 
Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
@ 2017-09-30  7:33           ` Michael Lyle
  0 siblings, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-09-30  7:33 UTC (permalink / raw)
  To: Coly Li; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

Actually-- I give up.

You've initially bounced every single one of my patches, even the ones
that fix crash & data corruption bugs.

I spend 10x as much time fighting with you as writing stuff for
bcache, and basically every time it's turned out that you're wrong.

I will go do something else where it is just not so much work to get
something done.

Mike

On Sat, Sep 30, 2017 at 12:13 AM, Michael Lyle <mlyle@lyle.org> wrote:
> Coly--
>
>
> On Fri, Sep 29, 2017 at 11:58 PM, Coly Li <i@coly.li> wrote:
>> On 2017/9/30 上午11:17, Michael Lyle wrote:
>> [snip]
>>
>> If writeback_rate is not at the minimum value, it means front end
>> write requests are present.
>
> This is wrong.  Else we'd never make it back to the target.
>
>> In this case, backend writeback I/O should yield
>> I/O throughput to front end I/O. Otherwise, applications will observe
>> increased I/O latency, especially when the dirty percentage is not very
>> high. For enterprise workloads, this change hurts performance.
>
> No, utilizing less of disk throughput for a given writeback rate
> improves workload latency.  :P  The maximum that will be aggregated in
> this way is constrained more strongly than the old code...
>
>> A desired behavior for low-latency enterprise workloads is: when the dirty
>> percentage is low, once there is front end I/O, backend writeback should
>> run at the minimum rate. This patch will introduce unstable and
>> unpredictable I/O latency.
>
> Nope.  It lowers disk utilization overall, and the amount of disk
> bandwidth any individual request chunk can use is explicitly bounded
> (unlike before, where it was not).
>
>> Unless there is a performance bottleneck in writeback seeking, at least
>> enterprise users will focus more on front end I/O latency ....
>
> The less writeback seeks the disk, the more the real workload can seek
> this disk!  And if you're at high writeback rates, the vast majority
> of the time the disk is seeking!
>
>> This method is helpful only when writeback I/Os are not issued
>> continuously; otherwise, if they are issued within slice_idle, the
>> underlying elevator will reorder or merge the I/Os into larger requests.
>
> We have a subsystem in place to rate-limit and make sure that they are
> not issued continuously!  If you want to preserve good latency for
> userspace, you want to keep the system in the regime where that
> control system is effective!
>
>> Hmm, if you move the dirty IO from the btree into a dirty_io list, then
>> perform I/O, there is a risk that if the machine powers down during
>> writeback, dirty data might be lost. If you continuously issue dirty I/O
>> and remove it from the btree at the same time, that means you will
>> introduce more latency to front end I/O...
>
> No... this doesn't change the consistency properties.  I am just
> saying-- if we have (up to 5 contiguous things that we're going to
> write back), wait till they're all read; plug; dispatch writes ;
> unplug.
>
>> And the plug list will be unplugged automatically by default when a
>> context switch happens. If you perform read I/Os on the btrees, a
>> context switch is likely to happen, so you won't keep a large bio
>> list ...
>
> Thankfully we'll have 5 things to fire immediately after each other,
> so we don't need to worry about automatic unplug.
>
>> IMHO when the writeback rate is low, especially when the backing hard
>> disk is not the bottleneck, grouping contiguous I/Os in bcache code does
>> not help writeback performance much. The only benefit is less I/O issued
>> when front end I/O is low or idle, but most users do not care about
>> that, especially enterprise users.
>
> I disagree!
>
>>> I believe patch 4 is useful on its own, but I have this and other
>>> pieces of development that depend upon it.
>>
>> Current bcache code works well in most writeback loads; I just worry
>> that implementing an elevator in bcache writeback logic is a big
>> investment with little return.
>
> Bcache already implements a (one-way) elevator!  Bcache invests
> **significant effort** already to do writebacks in LBA order (to
> effectively short-stroke the disk), but because of different arrival
> times of the read request they can end up reordered.  This is bad.
> This is a bug.
>
> Mike
>
>>
>> --
>> Coly Li
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
@ 2017-09-30  7:13         ` Michael Lyle
  0 siblings, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-09-30  7:13 UTC (permalink / raw)
  To: Coly Li; +Cc: Junhui Tang, linux-bcache, linux-block, Kent Overstreet

Coly--


On Fri, Sep 29, 2017 at 11:58 PM, Coly Li <i@coly.li> wrote:
> On 2017/9/30 上午11:17, Michael Lyle wrote:
> [snip]
>
> If writeback_rate is not at the minimum value, it means front end
> write requests are present.

This is wrong.  Else we'd never make it back to the target.

> In this case, backend writeback I/O should yield
> I/O throughput to front end I/O. Otherwise, applications will observe
> increased I/O latency, especially when the dirty percentage is not very
> high. For enterprise workloads, this change hurts performance.

No, utilizing less of disk throughput for a given writeback rate
improves workload latency.  :P  The maximum that will be aggregated in
this way is constrained more strongly than the old code...

> A desired behavior for low-latency enterprise workloads is: when the dirty
> percentage is low, once there is front end I/O, backend writeback should
> run at the minimum rate. This patch will introduce unstable and
> unpredictable I/O latency.

Nope.  It lowers disk utilization overall, and the amount of disk
bandwidth any individual request chunk can use is explicitly bounded
(unlike before, where it was not).

> Unless there is a performance bottleneck in writeback seeking, at least
> enterprise users will focus more on front end I/O latency ....

The less writeback seeks the disk, the more the real workload can seek
this disk!  And if you're at high writeback rates, the vast majority
of the time the disk is seeking!

> This method is helpful only when writeback I/Os are not issued
> continuously; otherwise, if they are issued within slice_idle, the
> underlying elevator will reorder or merge the I/Os into larger requests.

We have a subsystem in place to rate-limit and make sure that they are
not issued continuously!  If you want to preserve good latency for
userspace, you want to keep the system in the regime where that
control system is effective!
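
[Aside: one piece of the rate limiting referred to here is the per-pass
delay in patch 4 -- after each batch of contiguous keys is dispatched,
read_dirty() sleeps according to writeback_delay() before gathering more
keys.  Excerpted from the hunk quoted later in this thread:

        delay = writeback_delay(dc, size);

        while (!kthread_should_stop() && delay) {
                schedule_timeout_interruptible(delay);
                delay = writeback_delay(dc, 0);
        }
]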

> Hmm, if you move the dirty IO from the btree into a dirty_io list, then
> perform I/O, there is a risk that if the machine powers down during
> writeback, dirty data might be lost. If you continuously issue dirty I/O
> and remove it from the btree at the same time, that means you will
> introduce more latency to front end I/O...

No... this doesn't change the consistency properties.  I am just
saying-- if we have (up to 5 contiguous things that we're going to
write back), wait till they're all read; plug; dispatch writes ;
unplug.
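
[Aside: a minimal sketch of the plug/dispatch/unplug pattern described
above, using the generic block-layer plugging API.  The "ios[]" and
"nr_writes" bookkeeping is illustrative only, not code from the posted
patches:

        struct blk_plug plug;
        int i;

        /* all reads for the contiguous group have completed by now */
        blk_start_plug(&plug);
        for (i = 0; i < nr_writes; i++)
                submit_bio(&ios[i]->bio);  /* queued write bios, back to back */
        blk_finish_plug(&plug);            /* block layer merges and issues them */
]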

> And the plug list will be unplugged automatically by default when a
> context switch happens. If you perform read I/Os on the btrees, a
> context switch is likely to happen, so you won't keep a large bio
> list ...

Thankfully we'll have 5 things to fire immediately after each other,
so we don't need to worry about automatic unplug.

> IMHO when the writeback rate is low, especially when the backing hard
> disk is not the bottleneck, grouping contiguous I/Os in bcache code does
> not help writeback performance much. The only benefit is less I/O issued
> when front end I/O is low or idle, but most users do not care about
> that, especially enterprise users.

I disagree!

>> I believe patch 4 is useful on its own, but I have this and other
>> pieces of development that depend upon it.
>
> Current bcache code works well in most writeback loads; I just worry
> that implementing an elevator in bcache writeback logic is a big
> investment with little return.

Bcache already implements a (one-way) elevator!  Bcache invests
**significant effort** already to do writebacks in LBA order (to
effectively short-stroke the disk), but because of different arrival
times of the read request they can end up reordered.  This is bad.
This is a bug.

Mike

>
> --
> Coly Li
> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-09-30  3:17   ` Michael Lyle
@ 2017-09-30  6:58     ` Coly Li
  2017-09-30  7:13         ` Michael Lyle
  0 siblings, 1 reply; 65+ messages in thread
From: Coly Li @ 2017-09-30  6:58 UTC (permalink / raw)
  To: Michael Lyle; +Cc: Junhui Tang, linux-bcache, linux-block

On 2017/9/30 上午11:17, Michael Lyle wrote:
> Coly--
> 
> What you say is correct-- it has a few changes from current behavior.
> 
> - When writeback rate is low, it is more willing to do contiguous
> I/Os.  This provides an opportunity for the IO scheduler to combine
> operations together.  The cost of doing 5 contiguous I/Os and 1 I/O is
> usually about the same on spinning disks, because most of the cost is
> seeking and rotational latency-- the actual sequential I/O bandwidth
> is very high.  This is a benefit.

Hi Mike,

Yes I can see it.


> - When writeback rate is medium, it does I/O more efficiently.  e.g.
> if the current writeback rate is 10MB/sec, and there are two
> contiguous 1MB segments, they would not presently be combined.  A 1MB
> write would occur, then we would increase the delay counter by 100ms,
> and then the next write would wait; this new code would issue 2 1MB
> writes one after the other, and then sleep 200ms.  On a disk that does
> 150MB/sec sequential, and has a 7ms seek time, this uses the disk for
> 13ms + 7ms, compared to the old code that does 13ms + 7ms * 2.  This
> is the difference between using 10% of the disk's I/O throughput and
> 13% of the disk's throughput to do the same work.


If writeback_rate is not at the minimum value, it means front end write
requests are present. In this case, backend writeback I/O should yield
I/O throughput to front end I/O. Otherwise, applications will observe
increased I/O latency, especially when the dirty percentage is not very
high. For enterprise workloads, this change hurts performance.

A desired behavior for low-latency enterprise workloads is: when the
dirty percentage is low, once there is front end I/O, backend writeback
should run at the minimum rate. This patch will introduce unstable and
unpredictable I/O latency.

Unless there is a performance bottleneck in writeback seeking, at least
enterprise users will focus more on front end I/O latency ....

> - When writeback rate is very high (e.g. can't be obtained), there is
> not much difference currently, BUT:
> 
> Patch 5 is very important.  Right now, if there are many writebacks
> happening at once, the cached blocks can be read in any order.  This
> means that if we want to writeback blocks 1,2,3,4,5 we could actually
> end up issuing the write I/Os to the backing device as 3,1,4,2,5, with
> delays between them.  This is likely to make the disk seek a lot.
> Patch 5 provides an ordering property to ensure that the writes get
> issued in LBA order to the backing device.

This method is helpful only when writeback I/Os are not issued
continuously; otherwise, if they are issued within slice_idle, the
underlying elevator will reorder or merge the I/Os into larger requests.


> 
> ***The next step in this line of development (patch 6 ;) is to link
> groups of contiguous I/Os into a list in the dirty_io structure.  To
> know whether the "next I/Os" will be contiguous, we need to scan ahead
> like the new code in patch 4 does.  Then, in turn, we can plug the
> block device, and issue the contiguous writes together.  This allows
> us to guarantee that the I/Os will be properly merged and optimized by
> the underlying block IO scheduler.   Even with patch 5, currently the
> I/Os end up imperfectly combined, and the block layer ends up issuing
> writes 1, then 2,3, then 4,5.  This is great that things are combined
> some, but it could be combined into one big request.***  To get this
> benefit, it requires something like what was done in patch 4.
> 

Hmm, if you move the dirty IO from the btree into a dirty_io list, then
perform I/O, there is a risk that if the machine powers down during
writeback, dirty data might be lost. If you continuously issue dirty I/O
and remove it from the btree at the same time, that means you will
introduce more latency to front end I/O...

And the plug list will be unplugged automatically by default when a
context switch happens. If you perform read I/Os on the btrees, a
context switch is likely to happen, so you won't keep a large bio
list ...

IMHO when the writeback rate is low, especially when the backing hard
disk is not the bottleneck, grouping contiguous I/Os in bcache code does
not help writeback performance much. The only benefit is less I/O issued
when front end I/O is low or idle, but most users do not care about
that, especially enterprise users.


> I believe patch 4 is useful on its own, but I have this and other
> pieces of development that depend upon it.

Current bcache code works well in most writeback loads; I just worry
that implementing an elevator in bcache writeback logic is a big
investment with little return.

-- 
Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-09-30  2:25 ` Coly Li
@ 2017-09-30  3:17   ` Michael Lyle
  2017-09-30  6:58     ` Coly Li
  0 siblings, 1 reply; 65+ messages in thread
From: Michael Lyle @ 2017-09-30  3:17 UTC (permalink / raw)
  To: Coly Li; +Cc: Junhui Tang, linux-bcache, linux-block

Coly--

What you say is correct-- it has a few changes from current behavior.

- When writeback rate is low, it is more willing to do contiguous
I/Os.  This provides an opportunity for the IO scheduler to combine
operations together.  The cost of doing 5 contiguous I/Os and 1 I/O is
usually about the same on spinning disks, because most of the cost is
seeking and rotational latency-- the actual sequential I/O bandwidth
is very high.  This is a benefit.
- When writeback rate is medium, it does I/O more efficiently.  e.g.
if the current writeback rate is 10MB/sec, and there are two
contiguous 1MB segments, they would not presently be combined.  A 1MB
write would occur, then we would increase the delay counter by 100ms,
and then the next write would wait; this new code would issue 2 1MB
writes one after the other, and then sleep 200ms.  On a disk that does
150MB/sec sequential, and has a 7ms seek time, this uses the disk for
13ms + 7ms, compared to the old code that does 13ms + 7ms * 2.  This
is the difference between using 10% of the disk's I/O throughput and
13% of the disk's throughput to do the same work.
- When writeback rate is very high (e.g. can't be obtained), there is
not much difference currently, BUT:

Patch 5 is very important.  Right now, if there are many writebacks
happening at once, the cached blocks can be read in any order.  This
means that if we want to writeback blocks 1,2,3,4,5 we could actually
end up issuing the write I/Os to the backing device as 3,1,4,2,5, with
delays between them.  This is likely to make the disk seek a lot.
Patch 5 provides an ordering property to ensure that the writes get
issued in LBA order to the backing device.
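
[Aside: a quick back-of-the-envelope check of the 10 MB/sec example in
the second bullet above, written as a standalone user-space snippet; the
150 MB/sec and 7 ms figures are the assumptions stated in the mail:

        #include <stdio.h>

        int main(void)
        {
                double xfer_ms   = 13.0;   /* ~2 MB at 150 MB/sec sequential  */
                double seek_ms   = 7.0;    /* one seek per dispatch           */
                double window_ms = 200.0;  /* 2 MB at a 10 MB/sec writeback rate */

                /* old code: two separate dispatches, so two seeks */
                printf("old: %.1f%%\n", 100.0 * (xfer_ms + 2 * seek_ms) / window_ms);
                /* new code: one combined dispatch, one seek */
                printf("new: %.1f%%\n", 100.0 * (xfer_ms + 1 * seek_ms) / window_ms);
                return 0;
        }

which prints roughly 13.5% vs 10.0%, matching the ~13% vs ~10% contrast
described above.]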

***The next step in this line of development (patch 6 ;) is to link
groups of contiguous I/Os into a list in the dirty_io structure.  To
know whether the "next I/Os" will be contiguous, we need to scan ahead
like the new code in patch 4 does.  Then, in turn, we can plug the
block device, and issue the contiguous writes together.  This allows
us to guarantee that the I/Os will be properly merged and optimized by
the underlying block IO scheduler.   Even with patch 5, currently the
I/Os end up imperfectly combined, and the block layer ends up issuing
writes 1, then 2,3, then 4,5.  This is great that things are combined
some, but it could be combined into one big request.***  To get this
benefit, it requires something like what was done in patch 4.

I believe patch 4 is useful on its own, but I have this and other
pieces of development that depend upon it.

Thanks,

Mike

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-09-27  7:32 [PATCH 4/5] bcache: writeback: collapse contiguous IO better tang.junhui
  2017-09-27  7:47 ` Michael Lyle
@ 2017-09-30  2:25 ` Coly Li
  2017-09-30  3:17   ` Michael Lyle
  1 sibling, 1 reply; 65+ messages in thread
From: Coly Li @ 2017-09-30  2:25 UTC (permalink / raw)
  To: tang.junhui, mlyle; +Cc: linux-bcache, linux-block

On 2017/9/27 下午3:32, tang.junhui@zte.com.cn wrote:
> From: Tang Junhui <tang.junhui@zte.com.cn>
> 
> Hello Mike:
> 
> For the second question, I think this modification is somewhat complex;
> can't we do something simpler to resolve it? I remember there were some
> patches trying to avoid a too-small writeback rate. Coly, is there any
> progress now?
> 
Junhui,

That patch works well, but until I solve the latency of calculating
dirty stripe numbers, I won't push it upstream.

This patch does not conflict with my max-writeback-rate-when-idle patch;
this patch tries to fetch more dirty keys from the cache device which
are contiguous on the cached device, assuming they can be written back
to the cached device contiguously.

For the above purpose, if writeback_rate is high, dc->last_read works
just fine. But when dc->writeback_rate is low, e.g. 8, even if
KEY_START(&w->key) == dc->last_read, the contiguous key will only be
submitted in the next delay cycle. I feel Michael wants to make larger
writeback I/Os and delay more, so the backing cached device may be woken
up less often.
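
[Aside: for reference, this is the current (pre-patch) logic in
read_dirty(), taken from the lines patch 4 removes (quoted later in the
thread).  When the writeback rate is low the computed delay exceeds
50 ms, so even a key contiguous with dc->last_read waits out the delay
before it is submitted:

        if (KEY_START(&w->key) != dc->last_read ||
            jiffies_to_msecs(delay) > 50)
                while (!kthread_should_stop() && delay)
                        delay = schedule_timeout_interruptible(delay);
]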

This policy only works better than the current dc->last_read approach
when writeback_rate is low, that is to say, when front write I/O is low
or absent. I hesitate over whether it is worth modifying the general
writeback logic for it.


> -------
> Tang Junhui
>                        
>> Ah-- re #1 -- I was investigating earlier why not as much was combined
>> as I thought should be when idle.  This is surely a factor.  Thanks
>> for the catch-- KEY_OFFSET is correct.  I will fix and retest.
>>
>> (Under heavy load, the correct thing still happens, but not under
>> light or intermediate load.)
>>
>> About #2-- I wanted to attain a bounded amount of "combining" of
>> operations.  If we have 5 4k extents in a row to dispatch, it seems
>> really wasteful to issue them as 5 IOs 60ms apart, which the existing
>> code would be willing to do-- I'd rather do a 20k write IO (basically
>> the same cost as a 4k write IO) and then sleep 300ms.  It is dependent
>> on the elevator/IO scheduler merging the requests.  At the same time,
>> I'd rather not combine into a really large request.
>>
>> It would be really neat to blk_plug the backing device during the
>> write issuance, but that requires further work.
>>
>> Thanks
>>
>> Mike
>>
>> On Tue, Sep 26, 2017 at 11:51 PM,  <tang.junhui@zte.com.cn> wrote:
>>> From: Tang Junhui <tang.junhui@zte.com.cn>
>>>
>>> Hello Lyle:
>>>
>>> Two questions:
>>> 1) In keys_contiguous(), you judge I/O contiguity on the cache device,
>>> but not on the backing device. I think you should judge it by the backing
>>> device (remove PTR_CACHE() and use KEY_OFFSET() instead of PTR_OFFSET()?).
>>>
>>> 2) I did not see you combine small contiguous I/Os into a big I/O, so I
>>> think it is useful when the writeback rate is low by avoiding single I/O
>>> writes, but it makes no difference at a high writeback rate, since
>>> previously the I/Os were also written asynchronously.
>>>
>>> -----------
>>> Tang Junhui
>>>
>>>> Previously, there was some logic that attempted to immediately issue
>>>> writeback of backing-contiguous blocks when the writeback rate was
>>>> fast.
>>>>
>>>> The previous logic did not have any limits on the aggregate size it
>>>> would issue, nor the number of keys it would combine at once.  It
>>>> would also discard the chance to do a contiguous write when the
>>>> writeback rate was low-- e.g. at "background" writeback of target
>>>> rate = 8, it would not combine two adjacent 4k writes and would
>>>> instead seek the disk twice.
>>>>
>>>> This patch imposes limits and explicitly understands the size of
>>>> contiguous I/O during issue.  It also will combine contiguous I/O
>>>> in all circumstances, not just when writeback is requested to be
>>>> relatively fast.
>>>>
>>>> It is a win on its own, but also lays the groundwork for skip writes to
>>>> short keys to make the I/O more sequential/contiguous.
>>>>
>>>> Signed-off-by: Michael Lyle <mlyle@lyle.org>
[snip code]

-- 
Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-09-29  4:22   ` Coly Li
@ 2017-09-29  4:27     ` Michael Lyle
  0 siblings, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-09-29  4:27 UTC (permalink / raw)
  To: Coly Li; +Cc: Junhui Tang, linux-bcache, linux-block

Ah.  I will copy linux-block in the future, then.  Thanks! -mpl

On Thu, Sep 28, 2017 at 9:22 PM, Coly Li <i@coly.li> wrote:
> Oh, it is because Christoph Hellwig suggested Cc'ing linux-block when
> posting bcache patches to the linux-bcache list, so that the patches may
> have more eyes on them.
>
> Thanks.
>
> Coly Li
>

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-09-29  4:15 ` Michael Lyle
  2017-09-29  4:22   ` Coly Li
@ 2017-09-29  4:26   ` Michael Lyle
  1 sibling, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-09-29  4:26 UTC (permalink / raw)
  To: Junhui Tang; +Cc: Coly Li, linux-bcache, linux-block

P.S. another concern with the current design is that for every backing
device, there's a separate writeback thread that can want to scan for
dirty blocks with very poor locking (starving other writes).  With
several of these threads, there's no guarantee that they won't all
stack up and want to do this scan in sequence, starving userspace I/O
for an extended time.  I believe I've seen this in test.

I don't think we want to take any more shortcuts that push us towards
requiring one thread per backing device.  It's one comparison per
I/O-- on the same cache line-- to make sure the backing device is the
same.

Mike

On Thu, Sep 28, 2017 at 9:15 PM, Michael Lyle <mlyle@lyle.org> wrote:
> It's presently guaranteed, but it sure seems like a good idea to
> confirm that the keys are actually contiguous on the same volume.
> What's the harm with checking-- it prevents the code from being
> delicate if things change.  If anything, this should get factored to
> util.h with the check in place.
>
> Note that the current design guarantees reasonable loading on each
> individual backing disk, but not necessarily on the cache--- I want to
> take steps to fix this soon, because having multiple writeback threads
> each willing to fire 64 I/Os at a time can stress even a very fast
> caching volume.
>
> Also, why do you keep copying linux-block for every bit of feedback on
> this code?  It seems like we probably don't want to clutter that list
> with this discussion.
>
> Mike
>
> On Thu, Sep 28, 2017 at 8:37 PM,  <tang.junhui@zte.com.cn> wrote:
>> From: Tang Junhui <tang.junhui@zte.com.cn>
>>
>> Hello Mike:
>>
>>> +     if (KEY_INODE(&second->key) != KEY_INODE(&first->key))
>>> +      return false;
>> Please remove this redundant code; all the keys in dc->writeback_keys
>> have the same KEY_INODE.  It is guaranteed by refill_dirty().
>>
>> Regards,
>> Tang Junhui
>>
>>> Previously, there was some logic that attempted to immediately issue
>>> writeback of backing-contiguous blocks when the writeback rate was
>>> fast.
>>>
>>> The previous logic did not have any limits on the aggregate size it
>>> would issue, nor the number of keys it would combine at once.  It
>>> would also discard the chance to do a contiguous write when the
>>> writeback rate was low-- e.g. at "background" writeback of target
>>> rate = 8, it would not combine two adjacent 4k writes and would
>>> instead seek the disk twice.
>>>
>>> This patch imposes limits and explicitly understands the size of
>>> contiguous I/O during issue.  It also will combine contiguous I/O
>>> in all circumstances, not just when writeback is requested to be
>>> relatively fast.
>>>
>>> It is a win on its own, but also lays the groundwork for skip writes to
>>> short keys to make the I/O more sequential/contiguous.  It also gets
>>> ready to start using blk_*_plug, and to allow issuing of non-contig
>>> I/O in parallel if requested by the user (to make use of disk
>>> throughput benefits available from higher queue depths).
>>>
>>> This patch fixes a previous version where the contiguous information
>>> was not calculated properly.
>>>
>>> Signed-off-by: Michael Lyle <mlyle@lyle.org>
>>> ---
>>>  drivers/md/bcache/bcache.h    |   6 --
>>>  drivers/md/bcache/writeback.c | 133 ++++++++++++++++++++++++++++++------------
>>>  drivers/md/bcache/writeback.h |   3 +
>>>  3 files changed, 98 insertions(+), 44 deletions(-)
>>>
>>> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
>>> index eb83be693d60..da803a3b1981 100644
>>> --- a/drivers/md/bcache/bcache.h
>>> +++ b/drivers/md/bcache/bcache.h
>>> @@ -321,12 +321,6 @@ struct cached_dev {
>>>                struct bch_ratelimit            writeback_rate;
>>>                struct delayed_work             writeback_rate_update;
>>>
>>> -              /*
>>> -               * Internal to the writeback code, so read_dirty() can keep track of
>>> -               * where it's at.
>>> -               */
>>> -              sector_t                                last_read;
>>> -
>>>                /* Limit number of writeback bios in flight */
>>>                struct semaphore                in_flight;
>>>                struct task_struct              *writeback_thread;
>>> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
>>> index 8deb721c355e..13c2142ea82f 100644
>>> --- a/drivers/md/bcache/writeback.c
>>> +++ b/drivers/md/bcache/writeback.c
>>> @@ -232,10 +232,25 @@ static void read_dirty_submit(struct closure *cl)
>>>                continue_at(cl, write_dirty, io->dc->writeback_write_wq);
>>>  }
>>>
>>> +static inline bool keys_contiguous(struct cached_dev *dc,
>>> +                              struct keybuf_key *first, struct keybuf_key *second)
>>> +{
>>> +              if (KEY_INODE(&second->key) != KEY_INODE(&first->key))
>>> +                              return false;
>>> +
>>> +              if (KEY_OFFSET(&second->key) !=
>>> +                                              KEY_OFFSET(&first->key) + KEY_SIZE(&first->key))
>>> +                              return false;
>>> +
>>> +              return true;
>>> +}
>>> +
>>>  static void read_dirty(struct cached_dev *dc)
>>>  {
>>>                unsigned delay = 0;
>>> -              struct keybuf_key *w;
>>> +              struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w;
>>> +              size_t size;
>>> +              int nk, i;
>>>                struct dirty_io *io;
>>>                struct closure cl;
>>>
>>> @@ -246,45 +261,87 @@ static void read_dirty(struct cached_dev *dc)
>>>                 * mempools.
>>>                 */
>>>
>>> -              while (!kthread_should_stop()) {
>>> -
>>> -                              w = bch_keybuf_next(&dc->writeback_keys);
>>> -                              if (!w)
>>> -                                              break;
>>> -
>>> -                              BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
>>> -
>>> -                              if (KEY_START(&w->key) != dc->last_read ||
>>> -                                  jiffies_to_msecs(delay) > 50)
>>> -                                              while (!kthread_should_stop() && delay)
>>> -                                                              delay = schedule_timeout_interruptible(delay);
>>> -
>>> -                              dc->last_read           = KEY_OFFSET(&w->key);
>>> -
>>> -                              io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
>>> -                                                   * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
>>> -                                                   GFP_KERNEL);
>>> -                              if (!io)
>>> -                                              goto err;
>>> -
>>> -                              w->private              = io;
>>> -                              io->dc                          = dc;
>>> -
>>> -                              dirty_init(w);
>>> -                              bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
>>> -                              io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
>>> -                              bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
>>> -                              io->bio.bi_end_io               = read_dirty_endio;
>>> -
>>> -                              if (bio_alloc_pages(&io->bio, GFP_KERNEL))
>>> -                                              goto err_free;
>>> -
>>> -                              trace_bcache_writeback(&w->key);
>>> +              next = bch_keybuf_next(&dc->writeback_keys);
>>> +
>>> +              while (!kthread_should_stop() && next) {
>>> +                              size = 0;
>>> +                              nk = 0;
>>> +
>>> +                              do {
>>> +                                              BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));
>>> +
>>> +                                              /*
>>> +                                               * Don't combine too many operations, even if they
>>> +                                               * are all small.
>>> +                                               */
>>> +                                              if (nk >= MAX_WRITEBACKS_IN_PASS)
>>> +                                                              break;
>>> +
>>> +                                              /*
>>> +                                               * If the current operation is very large, don't
>>> +                                               * further combine operations.
>>> +                                               */
>>> +                                              if (size >= MAX_WRITESIZE_IN_PASS)
>>> +                                                              break;
>>> +
>>> +                                              /*
>>> +                                               * Operations are only eligible to be combined
>>> +                                               * if they are contiguous.
>>> +                                               *
>>> +                                               * TODO: add a heuristic willing to fire a
>>> +                                               * certain amount of non-contiguous IO per pass,
>>> +                                               * so that we can benefit from backing device
>>> +                                               * command queueing.
>>> +                                               */
>>> +                                              if (nk != 0 && !keys_contiguous(dc, keys[nk-1], next))
>>> +                                                              break;
>>> +
>>> +                                              size += KEY_SIZE(&next->key);
>>> +                                              keys[nk++] = next;
>>> +                              } while ((next = bch_keybuf_next(&dc->writeback_keys)));
>>> +
>>> +                              /* Now we have gathered a set of 1..5 keys to write back. */
>>> +
>>> +                              for (i = 0; i < nk; i++) {
>>> +                                              w = keys[i];
>>> +
>>> +                                              io = kzalloc(sizeof(struct dirty_io) +
>>> +                                                                   sizeof(struct bio_vec) *
>>> +                                                                   DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
>>> +                                                                   GFP_KERNEL);
>>> +                                              if (!io)
>>> +                                                              goto err;
>>> +
>>> +                                              w->private              = io;
>>> +                                              io->dc                          = dc;
>>> +
>>> +                                              dirty_init(w);
>>> +                                              bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
>>> +                                              io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
>>> +                                              bio_set_dev(&io->bio,
>>> +                                                                  PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
>>> +                                              io->bio.bi_end_io               = read_dirty_endio;
>>> +
>>> +                                              if (bio_alloc_pages(&io->bio, GFP_KERNEL))
>>> +                                                              goto err_free;
>>> +
>>> +                                              trace_bcache_writeback(&w->key);
>>> +
>>> +                                              down(&dc->in_flight);
>>> +
>>> +                                              /* We've acquired a semaphore for the maximum
>>> +                                               * simultaneous number of writebacks; from here
>>> +                                               * everything happens asynchronously.
>>> +                                               */
>>> +                                              closure_call(&io->cl, read_dirty_submit, NULL, &cl);
>>> +                              }
>>>
>>> -                              down(&dc->in_flight);
>>> -                              closure_call(&io->cl, read_dirty_submit, NULL, &cl);
>>> +                              delay = writeback_delay(dc, size);
>>>
>>> -                              delay = writeback_delay(dc, KEY_SIZE(&w->key));
>>> +                              while (!kthread_should_stop() && delay) {
>>> +                                              schedule_timeout_interruptible(delay);
>>> +                                              delay = writeback_delay(dc, 0);
>>> +                              }
>>>                }
>>>
>>>                if (0) {
>>> diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
>>> index e35421d20d2e..efee2be88df9 100644
>>> --- a/drivers/md/bcache/writeback.h
>>> +++ b/drivers/md/bcache/writeback.h
>>> @@ -4,6 +4,9 @@
>>>  #define CUTOFF_WRITEBACK              40
>>>  #define CUTOFF_WRITEBACK_SYNC                 70
>>>
>>> +#define MAX_WRITEBACKS_IN_PASS  5
>>> +#define MAX_WRITESIZE_IN_PASS   5000          /* *512b */
>>> +
>>>  static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
>>>  {
>>>                uint64_t i, ret = 0;
>>> --
>>> 2.11.0
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-09-29  4:15 ` Michael Lyle
@ 2017-09-29  4:22   ` Coly Li
  2017-09-29  4:27     ` Michael Lyle
  2017-09-29  4:26   ` Michael Lyle
  1 sibling, 1 reply; 65+ messages in thread
From: Coly Li @ 2017-09-29  4:22 UTC (permalink / raw)
  To: Michael Lyle; +Cc: Junhui Tang, linux-bcache, linux-block

On 2017/9/29 下午12:15, Michael Lyle wrote:
[snip]
> Also, why do you keep copying linux-block for every bit of feedback on
> this code?  It seems like we probably don't want to clutter that list
> with this discussion.

Oh, it is because Christoph Hellwig suggested Cc'ing linux-block when
posting bcache patches to the linux-bcache list, so that the patches may
have more eyes on them.

Thanks.

Coly Li

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-09-29  3:37 tang.junhui
@ 2017-09-29  4:15 ` Michael Lyle
  2017-09-29  4:22   ` Coly Li
  2017-09-29  4:26   ` Michael Lyle
  0 siblings, 2 replies; 65+ messages in thread
From: Michael Lyle @ 2017-09-29  4:15 UTC (permalink / raw)
  To: Junhui Tang; +Cc: Coly Li, linux-bcache, linux-block

It's presently guaranteed, but it sure seems like a good idea to
confirm that the keys are actually contiguous on the same volume.
What's the harm with checking-- it prevents the code from being
delicate if things change.  If anything, this should get factored to
util.h with the check in place.
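
[Aside: one possible shape of the util.h refactor suggested here,
restating the keys_contiguous() check from the patch quoted below as a
shared inline helper; the name and const qualifiers are illustrative:

        static inline bool bch_keys_contiguous(const struct bkey *first,
                                               const struct bkey *second)
        {
                /* same backing device, and second starts where first ends */
                return KEY_INODE(second) == KEY_INODE(first) &&
                       KEY_OFFSET(second) == KEY_OFFSET(first) +
                                             KEY_SIZE(first);
        }
]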

Note that the current design guarantees reasonable loading on each
individual backing disk, but not necessarily on the cache--- I want to
take steps to fix this soon, because having multiple writeback threads
each willing to fire 64 I/Os at a time can stress even a very fast
caching volume.

Also, why do you keep copying linux-block for every bit of feedback on
this code?  It seems like we probably don't want to clutter that list
with this discussion.

Mike

On Thu, Sep 28, 2017 at 8:37 PM,  <tang.junhui@zte.com.cn> wrote:
> From: Tang Junhui <tang.junhui@zte.com.cn>
>
> Hello Mike:
>
>> +     if (KEY_INODE(&second->key) != KEY_INODE(&first->key))
>> +      return false;
> Please remove this redundant code; all the keys in dc->writeback_keys
> have the same KEY_INODE.  It is guaranteed by refill_dirty().
>
> Regards,
> Tang Junhui
>
>> Previously, there was some logic that attempted to immediately issue
>> writeback of backing-contiguous blocks when the writeback rate was
>> fast.
>>
>> The previous logic did not have any limits on the aggregate size it
>> would issue, nor the number of keys it would combine at once.  It
>> would also discard the chance to do a contiguous write when the
>> writeback rate was low-- e.g. at "background" writeback of target
>> rate = 8, it would not combine two adjacent 4k writes and would
>> instead seek the disk twice.
>>
>> This patch imposes limits and explicitly understands the size of
>> contiguous I/O during issue.  It also will combine contiguous I/O
>> in all circumstances, not just when writeback is requested to be
>> relatively fast.
>>
>> It is a win on its own, but also lays the groundwork for skip writes to
>> short keys to make the I/O more sequential/contiguous.  It also gets
>> ready to start using blk_*_plug, and to allow issuing of non-contig
>> I/O in parallel if requested by the user (to make use of disk
>> throughput benefits available from higher queue depths).
>>
>> This patch fixes a previous version where the contiguous information
>> was not calculated properly.
>>
>> Signed-off-by: Michael Lyle <mlyle@lyle.org>
>> ---
>>  drivers/md/bcache/bcache.h    |   6 --
>>  drivers/md/bcache/writeback.c | 133 ++++++++++++++++++++++++++++++------------
>>  drivers/md/bcache/writeback.h |   3 +
>>  3 files changed, 98 insertions(+), 44 deletions(-)
>>
>> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
>> index eb83be693d60..da803a3b1981 100644
>> --- a/drivers/md/bcache/bcache.h
>> +++ b/drivers/md/bcache/bcache.h
>> @@ -321,12 +321,6 @@ struct cached_dev {
>>                struct bch_ratelimit            writeback_rate;
>>                struct delayed_work             writeback_rate_update;
>>
>> -              /*
>> -               * Internal to the writeback code, so read_dirty() can keep track of
>> -               * where it's at.
>> -               */
>> -              sector_t                                last_read;
>> -
>>                /* Limit number of writeback bios in flight */
>>                struct semaphore                in_flight;
>>                struct task_struct              *writeback_thread;
>> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
>> index 8deb721c355e..13c2142ea82f 100644
>> --- a/drivers/md/bcache/writeback.c
>> +++ b/drivers/md/bcache/writeback.c
>> @@ -232,10 +232,25 @@ static void read_dirty_submit(struct closure *cl)
>>                continue_at(cl, write_dirty, io->dc->writeback_write_wq);
>>  }
>>
>> +static inline bool keys_contiguous(struct cached_dev *dc,
>> +                              struct keybuf_key *first, struct keybuf_key *second)
>> +{
>> +              if (KEY_INODE(&second->key) != KEY_INODE(&first->key))
>> +                              return false;
>> +
>> +              if (KEY_OFFSET(&second->key) !=
>> +                                              KEY_OFFSET(&first->key) + KEY_SIZE(&first->key))
>> +                              return false;
>> +
>> +              return true;
>> +}
>> +
>>  static void read_dirty(struct cached_dev *dc)
>>  {
>>                unsigned delay = 0;
>> -              struct keybuf_key *w;
>> +              struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w;
>> +              size_t size;
>> +              int nk, i;
>>                struct dirty_io *io;
>>                struct closure cl;
>>
>> @@ -246,45 +261,87 @@ static void read_dirty(struct cached_dev *dc)
>>                 * mempools.
>>                 */
>>
>> -              while (!kthread_should_stop()) {
>> -
>> -                              w = bch_keybuf_next(&dc->writeback_keys);
>> -                              if (!w)
>> -                                              break;
>> -
>> -                              BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
>> -
>> -                              if (KEY_START(&w->key) != dc->last_read ||
>> -                                  jiffies_to_msecs(delay) > 50)
>> -                                              while (!kthread_should_stop() && delay)
>> -                                                              delay = schedule_timeout_interruptible(delay);
>> -
>> -                              dc->last_read           = KEY_OFFSET(&w->key);
>> -
>> -                              io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
>> -                                                   * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
>> -                                                   GFP_KERNEL);
>> -                              if (!io)
>> -                                              goto err;
>> -
>> -                              w->private              = io;
>> -                              io->dc                          = dc;
>> -
>> -                              dirty_init(w);
>> -                              bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
>> -                              io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
>> -                              bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
>> -                              io->bio.bi_end_io               = read_dirty_endio;
>> -
>> -                              if (bio_alloc_pages(&io->bio, GFP_KERNEL))
>> -                                              goto err_free;
>> -
>> -                              trace_bcache_writeback(&w->key);
>> +              next = bch_keybuf_next(&dc->writeback_keys);
>> +
>> +              while (!kthread_should_stop() && next) {
>> +                              size = 0;
>> +                              nk = 0;
>> +
>> +                              do {
>> +                                              BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));
>> +
>> +                                              /*
>> +                                               * Don't combine too many operations, even if they
>> +                                               * are all small.
>> +                                               */
>> +                                              if (nk >= MAX_WRITEBACKS_IN_PASS)
>> +                                                              break;
>> +
>> +                                              /*
>> +                                               * If the current operation is very large, don't
>> +                                               * further combine operations.
>> +                                               */
>> +                                              if (size >= MAX_WRITESIZE_IN_PASS)
>> +                                                              break;
>> +
>> +                                              /*
>> +                                               * Operations are only eligible to be combined
>> +                                               * if they are contiguous.
>> +                                               *
>> +                                               * TODO: add a heuristic willing to fire a
>> +                                               * certain amount of non-contiguous IO per pass,
>> +                                               * so that we can benefit from backing device
>> +                                               * command queueing.
>> +                                               */
>> +                                              if (nk != 0 && !keys_contiguous(dc, keys[nk-1], next))
>> +                                                              break;
>> +
>> +                                              size += KEY_SIZE(&next->key);
>> +                                              keys[nk++] = next;
>> +                              } while ((next = bch_keybuf_next(&dc->writeback_keys)));
>> +
>> +                              /* Now we have gathered a set of 1..5 keys to write back. */
>> +
>> +                              for (i = 0; i < nk; i++) {
>> +                                              w = keys[i];
>> +
>> +                                              io = kzalloc(sizeof(struct dirty_io) +
>> +                                                                   sizeof(struct bio_vec) *
>> +                                                                   DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
>> +                                                                   GFP_KERNEL);
>> +                                              if (!io)
>> +                                                              goto err;
>> +
>> +                                              w->private              = io;
>> +                                              io->dc                          = dc;
>> +
>> +                                              dirty_init(w);
>> +                                              bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
>> +                                              io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
>> +                                              bio_set_dev(&io->bio,
>> +                                                                  PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
>> +                                              io->bio.bi_end_io               = read_dirty_endio;
>> +
>> +                                              if (bio_alloc_pages(&io->bio, GFP_KERNEL))
>> +                                                              goto err_free;
>> +
>> +                                              trace_bcache_writeback(&w->key);
>> +
>> +                                              down(&dc->in_flight);
>> +
>> +                                              /* We've acquired a semaphore for the maximum
>> +                                               * simultaneous number of writebacks; from here
>> +                                               * everything happens asynchronously.
>> +                                               */
>> +                                              closure_call(&io->cl, read_dirty_submit, NULL, &cl);
>> +                              }
>>
>> -                              down(&dc->in_flight);
>> -                              closure_call(&io->cl, read_dirty_submit, NULL, &cl);
>> +                              delay = writeback_delay(dc, size);
>>
>> -                              delay = writeback_delay(dc, KEY_SIZE(&w->key));
>> +                              while (!kthread_should_stop() && delay) {
>> +                                              schedule_timeout_interruptible(delay);
>> +                                              delay = writeback_delay(dc, 0);
>> +                              }
>>                }
>>
>>                if (0) {
>> diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
>> index e35421d20d2e..efee2be88df9 100644
>> --- a/drivers/md/bcache/writeback.h
>> +++ b/drivers/md/bcache/writeback.h
>> @@ -4,6 +4,9 @@
>>  #define CUTOFF_WRITEBACK              40
>>  #define CUTOFF_WRITEBACK_SYNC                 70
>>
>> +#define MAX_WRITEBACKS_IN_PASS  5
>> +#define MAX_WRITESIZE_IN_PASS   5000          /* *512b */
>> +
>>  static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
>>  {
>>                uint64_t i, ret = 0;
>> --
>> 2.11.0
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
@ 2017-09-29  3:37 tang.junhui
  2017-09-29  4:15 ` Michael Lyle
  0 siblings, 1 reply; 65+ messages in thread
From: tang.junhui @ 2017-09-29  3:37 UTC (permalink / raw)
  To: i, mlyle; +Cc: linux-bcache, linux-block, Tang Junhui

From: Tang Junhui <tang.junhui@zte.com.cn>

Hello Mike:

> +	if (KEY_INODE(&second->key) != KEY_INODE(&first->key))
> +	 return false;
Please remove this redundant code; all the keys in dc->writeback_keys
have the same KEY_INODE.  It is guaranteed by refill_dirty().

Regards,
Tang Junhui

> Previously, there was some logic that attempted to immediately issue
> writeback of backing-contiguous blocks when the writeback rate was
> fast.
> 
> The previous logic did not have any limits on the aggregate size it
> would issue, nor the number of keys it would combine at once.  It
> would also discard the chance to do a contiguous write when the
> writeback rate was low-- e.g. at "background" writeback of target
> rate = 8, it would not combine two adjacent 4k writes and would
> instead seek the disk twice.
> 
> This patch imposes limits and explicitly understands the size of
> contiguous I/O during issue.  It also will combine contiguous I/O
> in all circumstances, not just when writeback is requested to be
> relatively fast.
> 
> It is a win on its own, but also lays the groundwork for skip writes to
> short keys to make the I/O more sequential/contiguous.  It also gets
> ready to start using blk_*_plug, and to allow issuing of non-contig
> I/O in parallel if requested by the user (to make use of disk
> throughput benefits available from higher queue depths).
> 
> This patch fixes a previous version where the contiguous information
> was not calculated properly.
> 
> Signed-off-by: Michael Lyle <mlyle@lyle.org>
> ---
>  drivers/md/bcache/bcache.h    |   6 --
>  drivers/md/bcache/writeback.c | 133 ++++++++++++++++++++++++++++++------------
>  drivers/md/bcache/writeback.h |   3 +
>  3 files changed, 98 insertions(+), 44 deletions(-)
> 
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index eb83be693d60..da803a3b1981 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -321,12 +321,6 @@ struct cached_dev {
>  		 struct bch_ratelimit		 writeback_rate;
>  		 struct delayed_work		 writeback_rate_update;
>  
> -		 /*
> -		  * Internal to the writeback code, so read_dirty() can keep track of
> -		  * where it's at.
> -		  */
> -		 sector_t		 		 last_read;
> -
>  		 /* Limit number of writeback bios in flight */
>  		 struct semaphore		 in_flight;
>  		 struct task_struct		 *writeback_thread;
> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> index 8deb721c355e..13c2142ea82f 100644
> --- a/drivers/md/bcache/writeback.c
> +++ b/drivers/md/bcache/writeback.c
> @@ -232,10 +232,25 @@ static void read_dirty_submit(struct closure *cl)
>  		 continue_at(cl, write_dirty, io->dc->writeback_write_wq);
>  }
>  
> +static inline bool keys_contiguous(struct cached_dev *dc,
> +		 		 struct keybuf_key *first, struct keybuf_key *second)
> +{
> +		 if (KEY_INODE(&second->key) != KEY_INODE(&first->key))
> +		 		 return false;
> +
> +		 if (KEY_OFFSET(&second->key) !=
> +		 		 		 KEY_OFFSET(&first->key) + KEY_SIZE(&first->key))
> +		 		 return false;
> +
> +		 return true;
> +}
> +
>  static void read_dirty(struct cached_dev *dc)
>  {
>  		 unsigned delay = 0;
> -		 struct keybuf_key *w;
> +		 struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w;
> +		 size_t size;
> +		 int nk, i;
>  		 struct dirty_io *io;
>  		 struct closure cl;
>  
> @@ -246,45 +261,87 @@ static void read_dirty(struct cached_dev *dc)
>  		  * mempools.
>  		  */
>  
> -		 while (!kthread_should_stop()) {
> -
> -		 		 w = bch_keybuf_next(&dc->writeback_keys);
> -		 		 if (!w)
> -		 		 		 break;
> -
> -		 		 BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
> -
> -		 		 if (KEY_START(&w->key) != dc->last_read ||
> -		 		     jiffies_to_msecs(delay) > 50)
> -		 		 		 while (!kthread_should_stop() && delay)
> -		 		 		 		 delay = schedule_timeout_interruptible(delay);
> -
> -		 		 dc->last_read		 = KEY_OFFSET(&w->key);
> -
> -		 		 io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
> -		 		 		      * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
> -		 		 		      GFP_KERNEL);
> -		 		 if (!io)
> -		 		 		 goto err;
> -
> -		 		 w->private		 = io;
> -		 		 io->dc		 		 = dc;
> -
> -		 		 dirty_init(w);
> -		 		 bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
> -		 		 io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
> -		 		 bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
> -		 		 io->bio.bi_end_io		 = read_dirty_endio;
> -
> -		 		 if (bio_alloc_pages(&io->bio, GFP_KERNEL))
> -		 		 		 goto err_free;
> -
> -		 		 trace_bcache_writeback(&w->key);
> +		 next = bch_keybuf_next(&dc->writeback_keys);
> +
> +		 while (!kthread_should_stop() && next) {
> +		 		 size = 0;
> +		 		 nk = 0;
> +
> +		 		 do {
> +		 		 		 BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));
> +
> +		 		 		 /*
> +		 		 		  * Don't combine too many operations, even if they
> +		 		 		  * are all small.
> +		 		 		  */
> +		 		 		 if (nk >= MAX_WRITEBACKS_IN_PASS)
> +		 		 		 		 break;
> +
> +		 		 		 /*
> +		 		 		  * If the current operation is very large, don't
> +		 		 		  * further combine operations.
> +		 		 		  */
> +		 		 		 if (size >= MAX_WRITESIZE_IN_PASS)
> +		 		 		 		 break;
> +
> +		 		 		 /*
> +		 		 		  * Operations are only eligible to be combined
> +		 		 		  * if they are contiguous.
> +		 		 		  *
> +		 		 		  * TODO: add a heuristic willing to fire a
> +		 		 		  * certain amount of non-contiguous IO per pass,
> +		 		 		  * so that we can benefit from backing device
> +		 		 		  * command queueing.
> +		 		 		  */
> +		 		 		 if (nk != 0 && !keys_contiguous(dc, keys[nk-1], next))
> +		 		 		 		 break;
> +
> +		 		 		 size += KEY_SIZE(&next->key);
> +		 		 		 keys[nk++] = next;
> +		 		 } while ((next = bch_keybuf_next(&dc->writeback_keys)));
> +
> +		 		 /* Now we have gathered a set of 1..5 keys to write back. */
> +
> +		 		 for (i = 0; i < nk; i++) {
> +		 		 		 w = keys[i];
> +
> +		 		 		 io = kzalloc(sizeof(struct dirty_io) +
> +		 		 		 		      sizeof(struct bio_vec) *
> +		 		 		 		      DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
> +		 		 		 		      GFP_KERNEL);
> +		 		 		 if (!io)
> +		 		 		 		 goto err;
> +
> +		 		 		 w->private		 = io;
> +		 		 		 io->dc		 		 = dc;
> +
> +		 		 		 dirty_init(w);
> +		 		 		 bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
> +		 		 		 io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
> +		 		 		 bio_set_dev(&io->bio,
> +		 		 		 		     PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
> +		 		 		 io->bio.bi_end_io		 = read_dirty_endio;
> +
> +		 		 		 if (bio_alloc_pages(&io->bio, GFP_KERNEL))
> +		 		 		 		 goto err_free;
> +
> +		 		 		 trace_bcache_writeback(&w->key);
> +
> +		 		 		 down(&dc->in_flight);
> +
> +		 		 		 /* We've acquired a semaphore for the maximum
> +		 		 		  * simultaneous number of writebacks; from here
> +		 		 		  * everything happens asynchronously.
> +		 		 		  */
> +		 		 		 closure_call(&io->cl, read_dirty_submit, NULL, &cl);
> +		 		 }
>  
> -		 		 down(&dc->in_flight);
> -		 		 closure_call(&io->cl, read_dirty_submit, NULL, &cl);
> +		 		 delay = writeback_delay(dc, size);
>  
> -		 		 delay = writeback_delay(dc, KEY_SIZE(&w->key));
> +		 		 while (!kthread_should_stop() && delay) {
> +		 		 		 schedule_timeout_interruptible(delay);
> +		 		 		 delay = writeback_delay(dc, 0);
> +		 		 }
>  		 }
>  
>  		 if (0) {
> diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
> index e35421d20d2e..efee2be88df9 100644
> --- a/drivers/md/bcache/writeback.h
> +++ b/drivers/md/bcache/writeback.h
> @@ -4,6 +4,9 @@
>  #define CUTOFF_WRITEBACK		 40
>  #define CUTOFF_WRITEBACK_SYNC		 70
>  
> +#define MAX_WRITEBACKS_IN_PASS  5
> +#define MAX_WRITESIZE_IN_PASS   5000		 /* *512b */
> +
>  static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
>  {
>  		 uint64_t i, ret = 0;
> -- 
> 2.11.0
> 

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-09-27  7:47 ` Michael Lyle
@ 2017-09-27  7:58   ` Michael Lyle
  0 siblings, 0 replies; 65+ messages in thread
From: Michael Lyle @ 2017-09-27  7:58 UTC (permalink / raw)
  To: Junhui Tang; +Cc: Coly Li, linux-bcache, linux-block

I just fixed the problem you pointed out, and ran a test where I did

sync;fio --randrepeat=1 --ioengine=libaio --direct=1 --gtod_reduce=1
--name=test --filename=test --bs=8k --iodepth=256 --size=25G
--readwrite=randwrite --ramp_time=4

to generate a lot of 8k extents.

After letting things quiet down, writeback looks like this:

$ iostat 1 | grep sdb
sdb               3.00         0.00        40.00          0         40
sdb               0.00         0.00         0.00          0          0
sdb               0.00         0.00         0.00          0          0
sdb               2.00         0.00        16.00          0         16
sdb               0.00         0.00         0.00          0          0
sdb               3.00         0.00        40.00          0         40
sdb               0.00         0.00         0.00          0          0
sdb               0.00         0.00         0.00          0          0
sdb               3.00         0.00        40.00          0         40

So this shows the writeback nicely grouping contiguous extents into less
frequent, mostly-40k chunks.

Mike

On Wed, Sep 27, 2017 at 12:47 AM, Michael Lyle <mlyle@lyle.org> wrote:
> Tang--
>
> This is a first step towards further stuff I want to do--
>
> 1. I want to allow blk_plug-- but to do that, a request needs to know
> there are subsequent contiguous requests after it when issuing the
> write.  The new structure allows that.
> 2. I want to allow tuning to issue multiple requests and control IO
> priorities for them, so that we can make use of queue depth on backing
> devices.  The new code structure allows for inserting heuristics to do
> that very easily.  When 4 operations are issued at a time, latency
> doesn't suffer very much and throughput can be 30-40% higher.
> 3. There's a small change to the delay calculation that will allow the
> actual inner delay controller to be improved to do lower rates for
> laptops and allow backing disk spindown.
>
> But I need more testing and more time to complete those, and there's
> already a benefit with the current structure.
>
> Mike
>
> On Wed, Sep 27, 2017 at 12:32 AM,  <tang.junhui@zte.com.cn> wrote:
>> From: Tang Junhui <tang.junhui@zte.com.cn>
>>
>> Hello Mike:
>>
>> For the second question, I think this modification is somewhat complex;
>> can't we do something simpler to resolve it? I remember there were some
>> patches trying to avoid too small a writeback rate. Coly, is there any
>> progress now?
>>
>> -------
>> Tang Junhui
>>
>>> Ah-- re #1 -- I was investigating earlier why not as much was combined
>>> as I thought should be when idle.  This is surely a factor.  Thanks
>>> for the catch-- KEY_OFFSET is correct.  I will fix and retest.
>>>
>>> (Under heavy load, the correct thing still happens, but not under
>>> light or intermediate load.)
>>>
>>> About #2-- I wanted to attain a bounded amount of "combining" of
>>> operations.  If we have 5 4k extents in a row to dispatch, it seems
>>> really wasteful to issue them as 5 IOs 60ms apart, which the existing
>>> code would be willing to do-- I'd rather do a 20k write IO (basically
>>> the same cost as a 4k write IO) and then sleep 300ms.  It is dependent
>>> on the elevator/IO scheduler merging the requests.  At the same time,
>>> I'd rather not combine a really large request.
>>>
>>> It would be really neat to blk_plug the backing device during the
>>> write issuance, but that requires further work.
>>>
>>> Thanks
>>>
>>> Mike
>>>
>>> On Tue, Sep 26, 2017 at 11:51 PM,  <tang.junhui@zte.com.cn> wrote:
>>> > From: Tang Junhui <tang.junhui@zte.com.cn>
>>> >
>>> > Hello Lyle:
>>> >
>>> > Two questions:
>>> > 1) In keys_contiguous(), you judge whether the I/O is contiguous on the
>>> > cache device, but not on the backing device. I think you should judge it
>>> > by the backing device (remove PTR_CACHE() and use KEY_OFFSET() instead of
>>> > PTR_OFFSET()?).
>>> >
>>> > 2) I did not see you combine small contiguous I/Os into a bigger I/O, so
>>> > I think it is useful when the writeback rate is low, by avoiding
>>> > single-I/O writes, but it makes no difference at a high writeback rate,
>>> > since the I/Os were already being written asynchronously before.
>>> >
>>> > -----------
>>> > Tang Junhui
>>> >
>>> >> Previously, there was some logic that attempted to immediately issue
>>> >> writeback of backing-contiguous blocks when the writeback rate was
>>> >> fast.
>>> >>
>>> >> The previous logic did not have any limits on the aggregate size it
>>> >> would issue, nor the number of keys it would combine at once.  It
>>> >> would also discard the chance to do a contiguous write when the
>>> >> writeback rate was low-- e.g. at "background" writeback of target
>>> >> rate = 8, it would not combine two adjacent 4k writes and would
>>> >> instead seek the disk twice.
>>> >>
>>> >> This patch imposes limits and explicitly understands the size of
>>> >> contiguous I/O during issue.  It also will combine contiguous I/O
>>> >> in all circumstances, not just when writeback is requested to be
>>> >> relatively fast.
>>> >>
>>> >> It is a win on its own, but also lays the groundwork for skip writes to
>>> >> short keys to make the I/O more sequential/contiguous.
>>> >>
>>> >> Signed-off-by: Michael Lyle <mlyle@lyle.org>
>>> >> ---
>>> >>  drivers/md/bcache/bcache.h    |   6 --
>>> >>  drivers/md/bcache/writeback.c | 131 ++++++++++++++++++++++++++++++------------
>>> >>  2 files changed, 93 insertions(+), 44 deletions(-)
>>> >>
>>> >> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
>>> >> index eb83be693d60..da803a3b1981 100644
>>> >> --- a/drivers/md/bcache/bcache.h
>>> >> +++ b/drivers/md/bcache/bcache.h
>>> >> @@ -321,12 +321,6 @@ struct cached_dev {
>>> >>                struct bch_ratelimit            writeback_rate;
>>> >>                struct delayed_work             writeback_rate_update;
>>> >>
>>> >> -              /*
>>> >> -               * Internal to the writeback code, so read_dirty() can keep track of
>>> >> -               * where it's at.
>>> >> -               */
>>> >> -              sector_t                                last_read;
>>> >> -
>>> >>                /* Limit number of writeback bios in flight */
>>> >>                struct semaphore                in_flight;
>>> >>                struct task_struct              *writeback_thread;
>>> >> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
>>> >> index 0b7c89813635..cf29c44605b9 100644
>>> >> --- a/drivers/md/bcache/writeback.c
>>> >> +++ b/drivers/md/bcache/writeback.c
>>> >> @@ -229,10 +229,26 @@ static void read_dirty_submit(struct closure *cl)
>>> >>                continue_at(cl, write_dirty, io->dc->writeback_write_wq);
>>> >>  }
>>> >>
>>> >> +static inline bool keys_contiguous(struct cached_dev *dc,
>>> >> +                              struct keybuf_key *first, struct keybuf_key *second)
>>> >> +{
>>> >> +              if (PTR_CACHE(dc->disk.c, &second->key, 0)->bdev !=
>>> >> +                                              PTR_CACHE(dc->disk.c, &first->key, 0)->bdev)
>>> >> +                              return false;
>>> >> +
>>> >> +              if (PTR_OFFSET(&second->key, 0) !=
>>> >> +                                              (PTR_OFFSET(&first->key, 0) + KEY_SIZE(&first->key)))
>>> >> +                              return false;
>>> >> +
>>> >> +              return true;
>>> >> +}
>>> >> +
>>> >>  static void read_dirty(struct cached_dev *dc)
>>> >>  {
>>> >>                unsigned delay = 0;
>>> >> -              struct keybuf_key *w;
>>> >> +              struct keybuf_key *next, *keys[5], *w;
>>> >> +              size_t size;
>>> >> +              int nk, i;
>>> >>                struct dirty_io *io;
>>> >>                struct closure cl;
>>> >>
>>> >> @@ -243,45 +259,84 @@ static void read_dirty(struct cached_dev *dc)
>>> >>                 * mempools.
>>> >>                 */
>>> >>
>>> >> -              while (!kthread_should_stop()) {
>>> >> -
>>> >> -                              w = bch_keybuf_next(&dc->writeback_keys);
>>> >> -                              if (!w)
>>> >> -                                              break;
>>> >> -
>>> >> -                              BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
>>> >> -
>>> >> -                              if (KEY_START(&w->key) != dc->last_read ||
>>> >> -                                  jiffies_to_msecs(delay) > 50)
>>> >> -                                              while (!kthread_should_stop() && delay)
>>> >> -                                                              delay = schedule_timeout_interruptible(delay);
>>> >> -
>>> >> -                              dc->last_read           = KEY_OFFSET(&w->key);
>>> >> -
>>> >> -                              io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
>>> >> -                                                   * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
>>> >> -                                                   GFP_KERNEL);
>>> >> -                              if (!io)
>>> >> -                                              goto err;
>>> >> -
>>> >> -                              w->private              = io;
>>> >> -                              io->dc                          = dc;
>>> >> -
>>> >> -                              dirty_init(w);
>>> >> -                              bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
>>> >> -                              io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
>>> >> -                              bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
>>> >> -                              io->bio.bi_end_io               = read_dirty_endio;
>>> >> -
>>> >> -                              if (bio_alloc_pages(&io->bio, GFP_KERNEL))
>>> >> -                                              goto err_free;
>>> >> -
>>> >> -                              trace_bcache_writeback(&w->key);
>>> >> +              next = bch_keybuf_next(&dc->writeback_keys);
>>> >> +
>>> >> +              while (!kthread_should_stop() && next) {
>>> >> +                              size = 0;
>>> >> +                              nk = 0;
>>> >> +
>>> >> +                              do {
>>> >> +                                              BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));
>>> >> +
>>> >> +                                              /* Don't combine too many operations, even if they
>>> >> +                                               * are all small.
>>> >> +                                               */
>>> >> +                                              if (nk >= 5)
>>> >> +                                                              break;
>>> >> +
>>> >> +                                              /* If the current operation is very large, don't
>>> >> +                                               * further combine operations.
>>> >> +                                               */
>>> >> +                                              if (size > 5000)
>>> >> +                                                              break;
>>> >> +
>>> >> +                                              /* Operations are only eligible to be combined
>>> >> +                                               * if they are contiguous.
>>> >> +                                               *
>>> >> +                                               * TODO: add a heuristic willing to fire a
>>> >> +                                               * certain amount of non-contiguous IO per pass,
>>> >> +                                               * so that we can benefit from backing device
>>> >> +                                               * command queueing.
>>> >> +                                               */
>>> >> +                                              if (nk != 0 && !keys_contiguous(dc, keys[nk-1], next))
>>> >> +                                                              break;
>>> >> +
>>> >> +                                              size += KEY_SIZE(&next->key);
>>> >> +                                              keys[nk++] = next;
>>> >> +                              } while ((next = bch_keybuf_next(&dc->writeback_keys)));
>>> >> +
>>> >> +                              /* Now we have gathered a set of 1..5 keys to write back. */
>>> >> +
>>> >> +                              for (i = 0; i < nk; i++) {
>>> >> +                                              w = keys[i];
>>> >> +
>>> >> +                                              io = kzalloc(sizeof(struct dirty_io) +
>>> >> +                                                                   sizeof(struct bio_vec) *
>>> >> +                                                                   DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
>>> >> +                                                                   GFP_KERNEL);
>>> >> +                                              if (!io)
>>> >> +                                                              goto err;
>>> >> +
>>> >> +                                              w->private              = io;
>>> >> +                                              io->dc                          = dc;
>>> >> +
>>> >> +                                              dirty_init(w);
>>> >> +                                              bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
>>> >> +                                              io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
>>> >> +                                              bio_set_dev(&io->bio,
>>> >> +                                                                  PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
>>> >> +                                              io->bio.bi_end_io               = read_dirty_endio;
>>> >> +
>>> >> +                                              if (bio_alloc_pages(&io->bio, GFP_KERNEL))
>>> >> +                                                              goto err_free;
>>> >> +
>>> >> +                                              trace_bcache_writeback(&w->key);
>>> >> +
>>> >> +                                              down(&dc->in_flight);
>>> >> +
>>> >> +                                              /* We've acquired a semaphore for the maximum
>>> >> +                                               * simultaneous number of writebacks; from here
>>> >> +                                               * everything happens asynchronously.
>>> >> +                                               */
>>> >> +                                              closure_call(&io->cl, read_dirty_submit, NULL, &cl);
>>> >> +                              }
>>> >>
>>> >> -                              down(&dc->in_flight);
>>> >> -                              closure_call(&io->cl, read_dirty_submit, NULL, &cl);
>>> >> +                              delay = writeback_delay(dc, size);
>>> >>
>>> >> -                              delay = writeback_delay(dc, KEY_SIZE(&w->key));
>>> >> +                              while (!kthread_should_stop() && delay) {
>>> >> +                                              schedule_timeout_interruptible(delay);
>>> >> +                                              delay = writeback_delay(dc, 0);
>>> >> +                              }
>>> >>                }
>>> >>
>>> >>                if (0) {
>>> >> --
>>> > --

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
  2017-09-27  7:32 [PATCH 4/5] bcache: writeback: collapse contiguous IO better tang.junhui
@ 2017-09-27  7:47 ` Michael Lyle
  2017-09-27  7:58   ` Michael Lyle
  2017-09-30  2:25 ` Coly Li
  1 sibling, 1 reply; 65+ messages in thread
From: Michael Lyle @ 2017-09-27  7:47 UTC (permalink / raw)
  To: Junhui Tang; +Cc: Coly Li, linux-bcache, linux-block

Tang--

This is a first step towards further stuff I want to do--

1. I want to allow blk_plug-- but to do that, a request needs to know
there are subsequent contiguous requests after it when issuing the
write.  The new structure allows that.
2. I want to allow tuning to issue multiple requests and control IO
priorities for them, so that we can make use of queue depth on backing
devices.  The new code structure allows for inserting heuristics to do
that very easily.  When 4 operations are issued at a time, latency
doesn't suffer very much and throughput can be 30-40% higher.
3. There's a small change to the delay calculation that will allow the
actual inner delay controller to be improved to do lower rates for
laptops and allow backing disk spindown.
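
Purely to illustrate the mechanics of (1) above (a sketch, not a statement of
where the plug will finally sit in the writeback path), plugging around a
batch of submissions looks roughly like this:

	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nk; i++)
		issue_one_writeback(keys[i]);	/* hypothetical helper doing the
						 * per-key bio setup and submit */
	blk_finish_plug(&plug);

blk_start_plug()/blk_finish_plug() let the block layer hold the queued
requests and merge adjacent ones until the plug is released, which is the
kind of batching the new keys[] structure makes possible.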

But I need more testing and more time to complete those, and there's
already a benefit with the current structure.

Mike

On Wed, Sep 27, 2017 at 12:32 AM,  <tang.junhui@zte.com.cn> wrote:
> From: Tang Junhui <tang.junhui@zte.com.cn>
>
> Hello Mike:
>
> For the second question, I think this modification is somewhat complex;
> can't we do something simpler to resolve it? I remember there were some
> patches trying to avoid too small a writeback rate. Coly, is there any
> progress now?
>
> -------
> Tang Junhui
>
>> Ah-- re #1 -- I was investigating earlier why not as much was combined
>> as I thought should be when idle.  This is surely a factor.  Thanks
>> for the catch-- KEY_OFFSET is correct.  I will fix and retest.
>>
>> (Under heavy load, the correct thing still happens, but not under
>> light or intermediate load.)
>>
>> About #2-- I wanted to attain a bounded amount of "combining" of
>> operations.  If we have 5 4k extents in a row to dispatch, it seems
>> really wasteful to issue them as 5 IOs 60ms apart, which the existing
>> code would be willing to do-- I'd rather do a 20k write IO (basically
>> the same cost as a 4k write IO) and then sleep 300ms.  It is dependent
>> on the elevator/IO scheduler merging the requests.  At the same time,
>> I'd rather not combine a really large request.
>>
>> It would be really neat to blk_plug the backing device during the
>> write issuance, but that requires further work.
>>
>> Thanks
>>
>> Mike
>>
>> On Tue, Sep 26, 2017 at 11:51 PM,  <tang.junhui@zte.com.cn> wrote:
>> > From: Tang Junhui <tang.junhui@zte.com.cn>
>> >
>> > Hello Lyle:
>> >
>> > Two questions:
>> > 1) In keys_contiguous(), you judge whether the I/O is contiguous on the
>> > cache device, but not on the backing device. I think you should judge it
>> > by the backing device (remove PTR_CACHE() and use KEY_OFFSET() instead of
>> > PTR_OFFSET()?).
>> >
>> > 2) I did not see you combine small contiguous I/Os into a bigger I/O, so
>> > I think it is useful when the writeback rate is low, by avoiding
>> > single-I/O writes, but it makes no difference at a high writeback rate,
>> > since the I/Os were already being written asynchronously before.
>> >
>> > -----------
>> > Tang Junhui
>> >
>> >> Previously, there was some logic that attempted to immediately issue
>> >> writeback of backing-contiguous blocks when the writeback rate was
>> >> fast.
>> >>
>> >> The previous logic did not have any limits on the aggregate size it
>> >> would issue, nor the number of keys it would combine at once.  It
>> >> would also discard the chance to do a contiguous write when the
>> >> writeback rate was low-- e.g. at "background" writeback of target
>> >> rate = 8, it would not combine two adjacent 4k writes and would
>> >> instead seek the disk twice.
>> >>
>> >> This patch imposes limits and explicitly understands the size of
>> >> contiguous I/O during issue.  It also will combine contiguous I/O
>> >> in all circumstances, not just when writeback is requested to be
>> >> relatively fast.
>> >>
>> >> It is a win on its own, but also lays the groundwork for skip writes to
>> >> short keys to make the I/O more sequential/contiguous.
>> >>
>> >> Signed-off-by: Michael Lyle <mlyle@lyle.org>
>> >> ---
>> >>  drivers/md/bcache/bcache.h    |   6 --
>> >>  drivers/md/bcache/writeback.c | 131 ++++++++++++++++++++++++++++++------------
>> >>  2 files changed, 93 insertions(+), 44 deletions(-)
>> >>
>> >> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
>> >> index eb83be693d60..da803a3b1981 100644
>> >> --- a/drivers/md/bcache/bcache.h
>> >> +++ b/drivers/md/bcache/bcache.h
>> >> @@ -321,12 +321,6 @@ struct cached_dev {
>> >>                struct bch_ratelimit            writeback_rate;
>> >>                struct delayed_work             writeback_rate_update;
>> >>
>> >> -              /*
>> >> -               * Internal to the writeback code, so read_dirty() can keep track of
>> >> -               * where it's at.
>> >> -               */
>> >> -              sector_t                                last_read;
>> >> -
>> >>                /* Limit number of writeback bios in flight */
>> >>                struct semaphore                in_flight;
>> >>                struct task_struct              *writeback_thread;
>> >> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
>> >> index 0b7c89813635..cf29c44605b9 100644
>> >> --- a/drivers/md/bcache/writeback.c
>> >> +++ b/drivers/md/bcache/writeback.c
>> >> @@ -229,10 +229,26 @@ static void read_dirty_submit(struct closure *cl)
>> >>                continue_at(cl, write_dirty, io->dc->writeback_write_wq);
>> >>  }
>> >>
>> >> +static inline bool keys_contiguous(struct cached_dev *dc,
>> >> +                              struct keybuf_key *first, struct keybuf_key *second)
>> >> +{
>> >> +              if (PTR_CACHE(dc->disk.c, &second->key, 0)->bdev !=
>> >> +                                              PTR_CACHE(dc->disk.c, &first->key, 0)->bdev)
>> >> +                              return false;
>> >> +
>> >> +              if (PTR_OFFSET(&second->key, 0) !=
>> >> +                                              (PTR_OFFSET(&first->key, 0) + KEY_SIZE(&first->key)))
>> >> +                              return false;
>> >> +
>> >> +              return true;
>> >> +}
>> >> +
>> >>  static void read_dirty(struct cached_dev *dc)
>> >>  {
>> >>                unsigned delay = 0;
>> >> -              struct keybuf_key *w;
>> >> +              struct keybuf_key *next, *keys[5], *w;
>> >> +              size_t size;
>> >> +              int nk, i;
>> >>                struct dirty_io *io;
>> >>                struct closure cl;
>> >>
>> >> @@ -243,45 +259,84 @@ static void read_dirty(struct cached_dev *dc)
>> >>                 * mempools.
>> >>                 */
>> >>
>> >> -              while (!kthread_should_stop()) {
>> >> -
>> >> -                              w = bch_keybuf_next(&dc->writeback_keys);
>> >> -                              if (!w)
>> >> -                                              break;
>> >> -
>> >> -                              BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
>> >> -
>> >> -                              if (KEY_START(&w->key) != dc->last_read ||
>> >> -                                  jiffies_to_msecs(delay) > 50)
>> >> -                                              while (!kthread_should_stop() && delay)
>> >> -                                                              delay = schedule_timeout_interruptible(delay);
>> >> -
>> >> -                              dc->last_read           = KEY_OFFSET(&w->key);
>> >> -
>> >> -                              io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
>> >> -                                                   * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
>> >> -                                                   GFP_KERNEL);
>> >> -                              if (!io)
>> >> -                                              goto err;
>> >> -
>> >> -                              w->private              = io;
>> >> -                              io->dc                          = dc;
>> >> -
>> >> -                              dirty_init(w);
>> >> -                              bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
>> >> -                              io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
>> >> -                              bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
>> >> -                              io->bio.bi_end_io               = read_dirty_endio;
>> >> -
>> >> -                              if (bio_alloc_pages(&io->bio, GFP_KERNEL))
>> >> -                                              goto err_free;
>> >> -
>> >> -                              trace_bcache_writeback(&w->key);
>> >> +              next = bch_keybuf_next(&dc->writeback_keys);
>> >> +
>> >> +              while (!kthread_should_stop() && next) {
>> >> +                              size = 0;
>> >> +                              nk = 0;
>> >> +
>> >> +                              do {
>> >> +                                              BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));
>> >> +
>> >> +                                              /* Don't combine too many operations, even if they
>> >> +                                               * are all small.
>> >> +                                               */
>> >> +                                              if (nk >= 5)
>> >> +                                                              break;
>> >> +
>> >> +                                              /* If the current operation is very large, don't
>> >> +                                               * further combine operations.
>> >> +                                               */
>> >> +                                              if (size > 5000)
>> >> +                                                              break;
>> >> +
>> >> +                                              /* Operations are only eligible to be combined
>> >> +                                               * if they are contiguous.
>> >> +                                               *
>> >> +                                               * TODO: add a heuristic willing to fire a
>> >> +                                               * certain amount of non-contiguous IO per pass,
>> >> +                                               * so that we can benefit from backing device
>> >> +                                               * command queueing.
>> >> +                                               */
>> >> +                                              if (nk != 0 && !keys_contiguous(dc, keys[nk-1], next))
>> >> +                                                              break;
>> >> +
>> >> +                                              size += KEY_SIZE(&next->key);
>> >> +                                              keys[nk++] = next;
>> >> +                              } while ((next = bch_keybuf_next(&dc->writeback_keys)));
>> >> +
>> >> +                              /* Now we have gathered a set of 1..5 keys to write back. */
>> >> +
>> >> +                              for (i = 0; i < nk; i++) {
>> >> +                                              w = keys[i];
>> >> +
>> >> +                                              io = kzalloc(sizeof(struct dirty_io) +
>> >> +                                                                   sizeof(struct bio_vec) *
>> >> +                                                                   DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
>> >> +                                                                   GFP_KERNEL);
>> >> +                                              if (!io)
>> >> +                                                              goto err;
>> >> +
>> >> +                                              w->private              = io;
>> >> +                                              io->dc                          = dc;
>> >> +
>> >> +                                              dirty_init(w);
>> >> +                                              bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
>> >> +                                              io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
>> >> +                                              bio_set_dev(&io->bio,
>> >> +                                                                  PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
>> >> +                                              io->bio.bi_end_io               = read_dirty_endio;
>> >> +
>> >> +                                              if (bio_alloc_pages(&io->bio, GFP_KERNEL))
>> >> +                                                              goto err_free;
>> >> +
>> >> +                                              trace_bcache_writeback(&w->key);
>> >> +
>> >> +                                              down(&dc->in_flight);
>> >> +
>> >> +                                              /* We've acquired a semaphore for the maximum
>> >> +                                               * simultaneous number of writebacks; from here
>> >> +                                               * everything happens asynchronously.
>> >> +                                               */
>> >> +                                              closure_call(&io->cl, read_dirty_submit, NULL, &cl);
>> >> +                              }
>> >>
>> >> -                              down(&dc->in_flight);
>> >> -                              closure_call(&io->cl, read_dirty_submit, NULL, &cl);
>> >> +                              delay = writeback_delay(dc, size);
>> >>
>> >> -                              delay = writeback_delay(dc, KEY_SIZE(&w->key));
>> >> +                              while (!kthread_should_stop() && delay) {
>> >> +                                              schedule_timeout_interruptible(delay);
>> >> +                                              delay = writeback_delay(dc, 0);
>> >> +                              }
>> >>                }
>> >>
>> >>                if (0) {
>> >> --
>> > --

^ permalink raw reply	[flat|nested] 65+ messages in thread

* Re: [PATCH 4/5] bcache: writeback: collapse contiguous IO better
@ 2017-09-27  7:32 tang.junhui
  2017-09-27  7:47 ` Michael Lyle
  2017-09-30  2:25 ` Coly Li
  0 siblings, 2 replies; 65+ messages in thread
From: tang.junhui @ 2017-09-27  7:32 UTC (permalink / raw)
  To: i, mlyle; +Cc: linux-bcache, linux-block, Tang Junhui

From: Tang Junhui <tang.junhui@zte.com.cn>

Hello Mike:

For the second question, I think this modification is somewhat complex;
can't we do something simpler to resolve it? I remember there were some
patches trying to avoid too small a writeback rate. Coly, is there any
progress now?
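
(For concreteness, the simplest form of such a patch is just a floor on the
computed rate; a sketch, where writeback_rate_minimum is a hypothetical
tunable and the field names are assumptions:

	if (dc->writeback_rate.rate < dc->writeback_rate_minimum)
		dc->writeback_rate.rate = dc->writeback_rate_minimum;

so that background writeback never drops to effectively zero.)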

-------
Tang Junhui
                       
> Ah-- re #1 -- I was investigating earlier why not as much was combined
> as I thought should be when idle.  This is surely a factor.  Thanks
> for the catch-- KEY_OFFSET is correct.  I will fix and retest.
> 
> (Under heavy load, the correct thing still happens, but not under
> light or intermediate load.)
> 
> About #2-- I wanted to attain a bounded amount of "combining" of
> operations.  If we have 5 4k extents in a row to dispatch, it seems
> really wasteful to issue them as 5 IOs 60ms apart, which the existing
> code would be willing to do-- I'd rather do a 20k write IO (basically
> the same cost as a 4k write IO) and then sleep 300ms.  It is dependent
> on the elevator/IO scheduler merging the requests.  At the same time,
> I'd rather not combine a really large request.
> 
> It would be really neat to blk_plug the backing device during the
> write issuance, but that requires further work.
> 
> Thanks
> 
> Mike
> 
> On Tue, Sep 26, 2017 at 11:51 PM,  <tang.junhui@zte.com.cn> wrote:
> > From: Tang Junhui <tang.junhui@zte.com.cn>
> >
> > Hello Lyle:
> >
> > Two questions:
> > 1) In keys_contiguous(), you judge whether the I/O is contiguous on the
> > cache device, but not on the backing device. I think you should judge it
> > by the backing device (remove PTR_CACHE() and use KEY_OFFSET() instead of
> > PTR_OFFSET()?).
> >
> > 2) I did not see you combine small contiguous I/Os into a bigger I/O, so
> > I think it is useful when the writeback rate is low, by avoiding
> > single-I/O writes, but it makes no difference at a high writeback rate,
> > since the I/Os were already being written asynchronously before.
> >
> > -----------
> > Tang Junhui
> >
> >> Previously, there was some logic that attempted to immediately issue
> >> writeback of backing-contiguous blocks when the writeback rate was
> >> fast.
> >>
> >> The previous logic did not have any limits on the aggregate size it
> >> would issue, nor the number of keys it would combine at once.  It
> >> would also discard the chance to do a contiguous write when the
> >> writeback rate was low-- e.g. at "background" writeback of target
> >> rate = 8, it would not combine two adjacent 4k writes and would
> >> instead seek the disk twice.
> >>
> >> This patch imposes limits and explicitly understands the size of
> >> contiguous I/O during issue.  It also will combine contiguous I/O
> >> in all circumstances, not just when writeback is requested to be
> >> relatively fast.
> >>
> >> It is a win on its own, but also lays the groundwork for skip writes to
> >> short keys to make the I/O more sequential/contiguous.
> >>
> >> Signed-off-by: Michael Lyle <mlyle@lyle.org>
> >> ---
> >>  drivers/md/bcache/bcache.h    |   6 --
> >>  drivers/md/bcache/writeback.c | 131 ++++++++++++++++++++++++++++++------------
> >>  2 files changed, 93 insertions(+), 44 deletions(-)
> >>
> >> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> >> index eb83be693d60..da803a3b1981 100644
> >> --- a/drivers/md/bcache/bcache.h
> >> +++ b/drivers/md/bcache/bcache.h
> >> @@ -321,12 +321,6 @@ struct cached_dev {
> >>                struct bch_ratelimit            writeback_rate;
> >>                struct delayed_work             writeback_rate_update;
> >>
> >> -              /*
> >> -               * Internal to the writeback code, so read_dirty() can keep track of
> >> -               * where it's at.
> >> -               */
> >> -              sector_t                                last_read;
> >> -
> >>                /* Limit number of writeback bios in flight */
> >>                struct semaphore                in_flight;
> >>                struct task_struct              *writeback_thread;
> >> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> >> index 0b7c89813635..cf29c44605b9 100644
> >> --- a/drivers/md/bcache/writeback.c
> >> +++ b/drivers/md/bcache/writeback.c
> >> @@ -229,10 +229,26 @@ static void read_dirty_submit(struct closure *cl)
> >>                continue_at(cl, write_dirty, io->dc->writeback_write_wq);
> >>  }
> >>
> >> +static inline bool keys_contiguous(struct cached_dev *dc,
> >> +                              struct keybuf_key *first, struct keybuf_key *second)
> >> +{
> >> +              if (PTR_CACHE(dc->disk.c, &second->key, 0)->bdev !=
> >> +                                              PTR_CACHE(dc->disk.c, &first->key, 0)->bdev)
> >> +                              return false;
> >> +
> >> +              if (PTR_OFFSET(&second->key, 0) !=
> >> +                                              (PTR_OFFSET(&first->key, 0) + KEY_SIZE(&first->key)))
> >> +                              return false;
> >> +
> >> +              return true;
> >> +}
> >> +
> >>  static void read_dirty(struct cached_dev *dc)
> >>  {
> >>                unsigned delay = 0;
> >> -              struct keybuf_key *w;
> >> +              struct keybuf_key *next, *keys[5], *w;
> >> +              size_t size;
> >> +              int nk, i;
> >>                struct dirty_io *io;
> >>                struct closure cl;
> >>
> >> @@ -243,45 +259,84 @@ static void read_dirty(struct cached_dev *dc)
> >>                 * mempools.
> >>                 */
> >>
> >> -              while (!kthread_should_stop()) {
> >> -
> >> -                              w = bch_keybuf_next(&dc->writeback_keys);
> >> -                              if (!w)
> >> -                                              break;
> >> -
> >> -                              BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
> >> -
> >> -                              if (KEY_START(&w->key) != dc->last_read ||
> >> -                                  jiffies_to_msecs(delay) > 50)
> >> -                                              while (!kthread_should_stop() && delay)
> >> -                                                              delay = schedule_timeout_interruptible(delay);
> >> -
> >> -                              dc->last_read           = KEY_OFFSET(&w->key);
> >> -
> >> -                              io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
> >> -                                                   * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
> >> -                                                   GFP_KERNEL);
> >> -                              if (!io)
> >> -                                              goto err;
> >> -
> >> -                              w->private              = io;
> >> -                              io->dc                          = dc;
> >> -
> >> -                              dirty_init(w);
> >> -                              bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
> >> -                              io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
> >> -                              bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
> >> -                              io->bio.bi_end_io               = read_dirty_endio;
> >> -
> >> -                              if (bio_alloc_pages(&io->bio, GFP_KERNEL))
> >> -                                              goto err_free;
> >> -
> >> -                              trace_bcache_writeback(&w->key);
> >> +              next = bch_keybuf_next(&dc->writeback_keys);
> >> +
> >> +              while (!kthread_should_stop() && next) {
> >> +                              size = 0;
> >> +                              nk = 0;
> >> +
> >> +                              do {
> >> +                                              BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));
> >> +
> >> +                                              /* Don't combine too many operations, even if they
> >> +                                               * are all small.
> >> +                                               */
> >> +                                              if (nk >= 5)
> >> +                                                              break;
> >> +
> >> +                                              /* If the current operation is very large, don't
> >> +                                               * further combine operations.
> >> +                                               */
> >> +                                              if (size > 5000)
> >> +                                                              break;
> >> +
> >> +                                              /* Operations are only eligible to be combined
> >> +                                               * if they are contiguous.
> >> +                                               *
> >> +                                               * TODO: add a heuristic willing to fire a
> >> +                                               * certain amount of non-contiguous IO per pass,
> >> +                                               * so that we can benefit from backing device
> >> +                                               * command queueing.
> >> +                                               */
> >> +                                              if (nk != 0 && !keys_contiguous(dc, keys[nk-1], next))
> >> +                                                              break;
> >> +
> >> +                                              size += KEY_SIZE(&next->key);
> >> +                                              keys[nk++] = next;
> >> +                              } while ((next = bch_keybuf_next(&dc->writeback_keys)));
> >> +
> >> +                              /* Now we have gathered a set of 1..5 keys to write back. */
> >> +
> >> +                              for (i = 0; i < nk; i++) {
> >> +                                              w = keys[i];
> >> +
> >> +                                              io = kzalloc(sizeof(struct dirty_io) +
> >> +                                                                   sizeof(struct bio_vec) *
> >> +                                                                   DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
> >> +                                                                   GFP_KERNEL);
> >> +                                              if (!io)
> >> +                                                              goto err;
> >> +
> >> +                                              w->private              = io;
> >> +                                              io->dc                          = dc;
> >> +
> >> +                                              dirty_init(w);
> >> +                                              bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
> >> +                                              io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
> >> +                                              bio_set_dev(&io->bio,
> >> +                                                                  PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
> >> +                                              io->bio.bi_end_io               = read_dirty_endio;
> >> +
> >> +                                              if (bio_alloc_pages(&io->bio, GFP_KERNEL))
> >> +                                                              goto err_free;
> >> +
> >> +                                              trace_bcache_writeback(&w->key);
> >> +
> >> +                                              down(&dc->in_flight);
> >> +
> >> +                                              /* We've acquired a semaphore for the maximum
> >> +                                               * simultaneous number of writebacks; from here
> >> +                                               * everything happens asynchronously.
> >> +                                               */
> >> +                                              closure_call(&io->cl, read_dirty_submit, NULL, &cl);
> >> +                              }
> >>
> >> -                              down(&dc->in_flight);
> >> -                              closure_call(&io->cl, read_dirty_submit, NULL, &cl);
> >> +                              delay = writeback_delay(dc, size);
> >>
> >> -                              delay = writeback_delay(dc, KEY_SIZE(&w->key));
> >> +                              while (!kthread_should_stop() && delay) {
> >> +                                              schedule_timeout_interruptible(delay);
> >> +                                              delay = writeback_delay(dc, 0);
> >> +                              }
> >>                }
> >>
> >>                if (0) {
> >> --
> > --

^ permalink raw reply	[flat|nested] 65+ messages in thread

end of thread, other threads:[~2017-10-10  0:00 UTC | newest]

Thread overview: 65+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-26 19:24 [PATCH 1/5] bcache: don't write back data if reading it failed Michael Lyle
2017-09-26 19:24 ` [PATCH 2/5] bcache: implement PI controller for writeback rate Michael Lyle
2017-09-27  8:35   ` Coly Li
2017-09-27  8:40     ` Michael Lyle
2017-09-26 19:24 ` [PATCH 3/5] bcache: smooth writeback rate control Michael Lyle
2017-09-27  8:44   ` Coly Li
2017-09-26 19:24 ` [PATCH 4/5] bcache: writeback: collapse contiguous IO better Michael Lyle
2017-09-26 19:24 ` [PATCH 5/5] bcache: writeback: properly order backing device IO Michael Lyle
2017-09-27  8:22 ` [PATCH 1/5] bcache: don't write back data if reading it failed Coly Li
2017-09-27  8:28   ` Michael Lyle
2017-09-27  8:40     ` Coly Li
2017-09-27  7:32 [PATCH 4/5] bcache: writeback: collapse contiguous IO better tang.junhui
2017-09-27  7:47 ` Michael Lyle
2017-09-27  7:58   ` Michael Lyle
2017-09-30  2:25 ` Coly Li
2017-09-30  3:17   ` Michael Lyle
2017-09-30  6:58     ` Coly Li
2017-09-30  7:13       ` Michael Lyle
2017-09-30  7:13         ` Michael Lyle
2017-09-30  7:33         ` Michael Lyle
2017-09-30  7:33           ` Michael Lyle
2017-09-30  8:03         ` Coly Li
2017-09-30  8:23           ` Michael Lyle
2017-09-30  8:31             ` Michael Lyle
     [not found]               ` <CAJ+L6qcU+Db5TP1Q2J-V8angdzeW9DFGwc7KQqc4di9CSxusLg@mail.gmail.com>
     [not found]                 ` <CAJ+L6qdu4OSRh7Qdkk-5XBgd4W_N29Y6-wVLf-jFAMKEhQrTbQ@mail.gmail.com>
     [not found]                   ` <CAJ+L6qcyq-E4MrWNfB9kGA8DMD_U1HMxJii-=-qPfv0LeRL45w@mail.gmail.com>
2017-09-30 22:49                     ` Michael Lyle
2017-10-01  4:51                       ` Coly Li
2017-10-01 16:56                         ` Michael Lyle
2017-10-01 16:56                           ` Michael Lyle
2017-10-01 17:23                           ` Coly Li
2017-10-01 17:34                             ` Michael Lyle
2017-10-04 18:43                               ` Coly Li
2017-10-04 18:43                                 ` Coly Li
2017-10-04 23:54                                 ` Michael Lyle
2017-10-04 23:54                                   ` Michael Lyle
2017-10-05 17:38                                   ` Coly Li
2017-10-05 17:53                                     ` Michael Lyle
2017-10-05 18:07                                       ` Coly Li
2017-10-05 22:59                                       ` Michael Lyle
2017-10-06  8:27                                         ` Coly Li
2017-10-06  9:20                                           ` Michael Lyle
2017-10-06 10:36                                             ` Coly Li
2017-10-06 10:42                                               ` Michael Lyle
2017-10-06 10:42                                                 ` Michael Lyle
2017-10-06 10:56                                                 ` Michael Lyle
2017-10-06 10:56                                                   ` Michael Lyle
2017-10-06 11:00                                                 ` Hannes Reinecke
2017-10-06 11:09                                                   ` Michael Lyle
2017-10-06 11:09                                                     ` Michael Lyle
2017-10-06 11:57                                                     ` Michael Lyle
2017-10-06 11:57                                                       ` Michael Lyle
2017-10-06 12:37                                                       ` Coly Li
2017-10-06 17:36                                                         ` Michael Lyle
2017-10-06 18:09                                                           ` Coly Li
2017-10-06 18:23                                                             ` Michael Lyle
2017-10-06 18:36                                                             ` Michael Lyle
2017-10-09 18:58                                                               ` Coly Li
2017-10-10  0:00                                                                 ` Michael Lyle
2017-10-09  5:59                                                             ` Hannes Reinecke
2017-10-06 12:20                                                 ` Coly Li
2017-10-06 17:53                                                   ` Michael Lyle
2017-09-29  3:37 tang.junhui
2017-09-29  4:15 ` Michael Lyle
2017-09-29  4:22   ` Coly Li
2017-09-29  4:27     ` Michael Lyle
2017-09-29  4:26   ` Michael Lyle
